示例#1
0
def credit_profiler(ds: PandasDataset) -> ExpectationSuite:
    """Attach basic consistency checks to *ds* and return the resulting suite.

    Both checks use ``mostly=0.99`` so that up to 1% of rows may violate
    the bound without failing validation (allows some outliers).
    """
    # Table of (column, bounds) pairs; each becomes a range expectation.
    range_checks = [
        {"column": "credit_card_due", "min_value": 0},
        {"column": "missed_payments_1y", "min_value": 0, "max_value": 5},
    ]
    for check in range_checks:
        ds.expect_column_values_to_be_between(mostly=0.99, **check)

    return ds.get_expectation_suite()
示例#2
0
def profiler_with_unrealistic_expectations(
        dataset: PandasDataset) -> ExpectationSuite:
    """Return a suite whose expectations are derived from deliberately
    corrupted data, so it is expected to fail on realistic inputs.

    Note: the *dataset* argument is intentionally ignored — the suite is
    built from a hand-crafted single-row frame instead.
    """
    # Build a one-row frame of corrupted values first.
    corrupted = PandasDataset(pd.DataFrame({
        "current_balance": [-100],
        "avg_passenger_count": [0],
    }))
    corrupted.expect_column_max_to_be_between("current_balance", -1000, -100)
    corrupted.expect_column_values_to_be_in_set("avg_passenger_count",
                                                value_set={0})

    # this should pass
    corrupted.expect_column_min_to_be_between("avg_passenger_count", 0, 1000)

    return corrupted.get_expectation_suite()
示例#3
0
def test_validation_with_ge(feast_client: Client, kafka_server):
    """End-to-end check that rows failing Great Expectations validation are
    dropped during stream ingestion and come back as nulls on retrieval.

    Flow: register schema -> fit expectations on clean train data ->
    attach a validation UDF -> stream dirty test data -> assert invalid
    rows were nulled out in the online store.
    """
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name,
                                          "validation_ge")
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    # Derive the expectation suite from clean training data.
    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client,
                     feature_table,
                     udf,
                     validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    # Wait for the Spark job to come up, then for its Kafka consumer.
    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    # Validate the (intentionally dirty) test data locally to learn which
    # row positions the streaming job is expected to reject.
    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list({
        idx
        for check in validation_result.results
        for idx in check.result["unexpected_index_list"]
    })

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_ge:num", "validation_ge:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        # Always tear the streaming job down, even on ingestion failure.
        job.cancel()

    # Invalid rows should surface as nulls. Assign through a single
    # `DataFrame.iloc` step instead of chained `df["col"].iloc[...] = ...`,
    # which raises SettingWithCopyWarning and silently does nothing under
    # pandas copy-on-write.
    test_data["num"] = test_data["num"].astype(np.float64)
    test_data.iloc[invalid_idx, test_data.columns.get_loc("num")] = np.nan
    test_data.iloc[invalid_idx, test_data.columns.get_loc("set")] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_ge:num", "validation_ge:set"]],
        test_data[["key", "num", "set"]].rename(columns={
            "num": "validation_ge:num",
            "set": "validation_ge:set"
        }),
    )
示例#4
0
def test_validation_reports_metrics(feast_client: Client, kafka_server,
                                    statsd_server: StatsDServer):
    """End-to-end check that per-expectation failure counts from streaming
    validation are reported to statsd as `feast_feature_validation_check_failed`
    metrics tagged with check name, feature table, and project."""
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name,
                                          "validation_ge_metrics")
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    # Derive the expectation suite from clean training data.
    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client,
                     feature_table,
                     udf,
                     validation_window_secs=10)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    # Wait for the Spark job to come up, then for its Kafka consumer.
    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    # Validate the (intentionally dirty) test data locally to compute the
    # expected failure count per check. NOTE(review): results[0]/results[1]
    # rely on suite order matching the two expect_* calls above.
    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    unexpected_counts = {
        "expect_column_values_to_be_between_num_0_100":
        validation_result.results[0].result["unexpected_count"],
        "expect_column_values_to_be_in_set_set":
        validation_result.results[1].result["unexpected_count"],
    }
    # Distinct row indices that failed at least one check.
    invalid_idx = list({
        idx
        for check in validation_result.results
        for idx in check.result["unexpected_index_list"]
    })

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=[
                "validation_ge_metrics:num", "validation_ge_metrics:set"
            ],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        # Always tear the streaming job down, even on ingestion failure.
        job.cancel()

    # Each expected metric key is "<metric>#<tag>,<tag>,..." as emitted by
    # the validation UDF; values are the locally computed failure counts.
    expected_metrics = [(
        f"feast_feature_validation_check_failed#check:{check_name},"
        f"feature_table:{feature_table.name},project:{feast_client.project}",
        value,
    ) for check_name, value in unexpected_counts.items()]
    # Metrics arrive asynchronously; poll until every one matches.
    wait_retry_backoff(
        lambda: (
            None,
            all(
                statsd_server.metrics.get(m) == v
                for m, v in expected_metrics),
        ),
        timeout_secs=30,
        timeout_msg="Expected metrics were not received: " +
        str(expected_metrics) + "\n"
        "Actual received metrics" + str(statsd_server.metrics),
    )
示例#5
0
def test_validation_with_ge(feast_client: Client, kafka_server):
    """End-to-end check that rows failing Great Expectations validation are
    dropped during stream ingestion and come back as nulls on retrieval.

    Flow: register entity/table inline -> fit expectations on clean train
    data -> attach a validation UDF -> stream dirty test data -> assert
    invalid rows were nulled out in the online store.
    """
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="validation_test",
        entities=["key"],
        features=[
            Feature("num", ValueType.INT64),
            Feature("set", ValueType.STRING)
        ],
        # Batch source is required by the API but unused here.
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    # Derive the expectation suite from clean training data.
    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations)
    apply_validation(feast_client,
                     feature_table,
                     udf,
                     validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    # Wait for the Spark job to come up, then for its Kafka consumer.
    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 120)

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 120)

    # Validate the (intentionally dirty) test data locally to learn which
    # row positions the streaming job is expected to reject.
    test_data = generate_test_data()
    ge_ds = PandasDataset(test_data)
    validation_result = ge_ds.validate(expectations, result_format="COMPLETE")
    invalid_idx = list({
        idx
        for check in validation_result.results
        for idx in check.result["unexpected_index_list"]
    })

    entity_rows = [{"key": key} for key in test_data["key"].tolist()]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_test:num", "validation_test:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        # Always tear the streaming job down, even on ingestion failure.
        job.cancel()

    # Invalid rows should surface as nulls. Assign through a single
    # `DataFrame.iloc` step instead of chained `df["col"].iloc[...] = ...`,
    # which raises SettingWithCopyWarning and silently does nothing under
    # pandas copy-on-write.
    test_data["num"] = test_data["num"].astype(np.float64)
    test_data.iloc[invalid_idx, test_data.columns.get_loc("num")] = np.nan
    test_data.iloc[invalid_idx, test_data.columns.get_loc("set")] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_test:num", "validation_test:set"]],
        test_data[["key", "num", "set"]].rename(columns={
            "num": "validation_test:num",
            "set": "validation_test:set"
        }),
    )