Пример #1
0
def ensure_stream_ingestion_jobs(client: feast.Client, all_projects: bool):
    """Ensures all required stream ingestion jobs are running and cleans up the unnecessary jobs.

    More concretely, it will determine
    - which stream ingestion jobs are running
    - which stream ingestion jobs should be running
    And it'll do 2 kinds of operations
    - Cancel all running jobs that should not be running
    - Start all non-existent jobs that should be running

    Running and expected jobs are matched by job hash. A cancellation that
    fails with ``FailedPrecondition`` is logged as a warning and does not stop
    the remaining work; any other exception propagates to the caller.

    Args:
        client (feast.Client): Client used to list projects and jobs, look up
                               feature tables and start ingestion jobs.
        all_projects (bool): If true, runs the check for all projects.
                             Otherwise only checks the client's current project.
    """

    projects = client.list_projects() if all_projects else [client.project]

    expected_job_hash_to_table_refs = _get_expected_job_hash_to_table_refs(
        client, projects)

    expected_job_hashes = set(expected_job_hash_to_table_refs)

    # Only non-terminated stream ingestion jobs are considered. If two running
    # jobs report the same hash, the one listed last is the one kept here.
    jobs_by_hash: Dict[str, StreamIngestionJob] = {
        job.get_hash(): job
        for job in client.list_jobs(include_terminated=False)
        if isinstance(job, StreamIngestionJob)
    }

    existing_job_hashes = set(jobs_by_hash)

    job_hashes_to_cancel = existing_job_hashes - expected_job_hashes
    job_hashes_to_start = expected_job_hashes - existing_job_hashes

    logging.debug(
        f"existing_job_hashes = {sorted(existing_job_hashes)} expected_job_hashes = {sorted(expected_job_hashes)}"
    )

    for job_hash in job_hashes_to_cancel:
        job = jobs_by_hash[job_hash]
        logging.info(
            f"Cancelling a stream ingestion job with job_hash={job_hash} job_id={job.get_id()} status={job.get_status()}"
        )
        try:
            job.cancel()
        except FailedPrecondition as exc:
            # Best effort: keep reconciling the remaining jobs.
            logging.warning(f"Job canceling failed with exception {exc}")

    for job_hash in job_hashes_to_start:
        # Any job that we wish to start should be among expected table refs map
        project, table_name = expected_job_hash_to_table_refs[job_hash]
        logging.info(
            f"Starting a stream ingestion job for project={project}, table_name={table_name} with job_hash={job_hash}"
        )
        feature_table = client.get_feature_table(name=table_name,
                                                 project=project)
        client.start_stream_to_online_ingestion(feature_table, [],
                                                project=project)
Пример #2
0
def start_job(feast_client: Client, feature_table: FeatureTable, pytestconfig):
    """Start a stream-to-online ingestion job unless jobs are scheduled externally.

    Args:
        feast_client: Client used to launch the ingestion job.
        feature_table: Feature table whose stream source should be ingested.
        pytestconfig: Pytest config; its ``scheduled_streaming_job`` option
            decides whether a job is started here at all.

    Returns:
        The started job once it reports ``SparkJobStatus.IN_PROGRESS``, or
        ``None`` when the ``scheduled_streaming_job`` option is set.
    """
    jobs_are_scheduled = pytestconfig.getoption("scheduled_streaming_job")
    if jobs_are_scheduled:
        return None

    ingestion_job = feast_client.start_stream_to_online_ingestion(feature_table)

    def _job_in_progress():
        # wait_retry_backoff is handed a (result, done) pair; only the flag
        # matters for this wait.
        return None, ingestion_job.get_status() == SparkJobStatus.IN_PROGRESS

    wait_retry_backoff(_job_in_progress, 120)
    return ingestion_job
Пример #3
0
def test_streaming_ingestion(feast_client: Client, local_staging_path: str,
                             kafka_server):
    """Check that Avro records published to Kafka become available online.

    The test registers an entity and a feature table backed by a Kafka stream
    source, starts a stream-to-online ingestion job, publishes generated
    records to the topic and polls the online store until every requested
    entity row has a value. The retrieved frame is then compared with the
    generated data.

    Args:
        feast_client: Client used to register objects, start the job and read
            online features.
        local_staging_path: Base path used for the batch source file URL.
        kafka_server: Indexable fixture; element 0 and element 1 are joined as
            ``host:port`` to form the broker address.
    """
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    # Unique topic per run so that runs do not see each other's records.
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            "event_timestamp",
            "event_timestamp",
            kafka_broker,
            AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    # Everything that happens after the job has been started is guarded by
    # try/finally, so the job is cancelled even if a start-up wait fails.
    try:
        wait_retry_backoff(
            lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 60)

        wait_retry_backoff(
            lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 60)

        original = generate_data()[[
            "s2id", "unique_drivers", "event_timestamp"
        ]]
        for record in original.to_dict("records"):
            # Attach an explicit UTC timezone to the timestamp before sending.
            record["event_timestamp"] = (
                record["event_timestamp"].to_pydatetime().replace(
                    tzinfo=pytz.utc))

            send_avro_record_to_kafka(
                topic_name,
                record,
                bootstrap_servers=kafka_broker,
                avro_schema_json=avro_schema(),
            )

        def get_online_features():
            # Returns (frame, done): done is True once no feature value is
            # missing for any requested entity row.
            features = feast_client.get_online_features(
                ["drivers_stream:unique_drivers"],
                entity_rows=[{
                    "s2id": s2_id
                } for s2_id in original["s2id"].tolist()],
            ).to_dict()
            df = pd.DataFrame.from_dict(features)
            return df, not df["drivers_stream:unique_drivers"].isna().any()

        ingested = wait_retry_backoff(get_online_features, 60)
    finally:
        job.cancel()

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        original[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers_stream:unique_drivers"}),
    )
Пример #4
0
def test_streaming_ingestion(feast_client: Client, local_staging_path: str,
                             kafka_server, pytestconfig):
    """Check that Avro records published to Kafka become available online.

    When the ``scheduled_streaming_job`` pytest option is not set, the test
    starts the ingestion job itself and cancels it at the end. When the option
    is set, no job is started here and the feature table is deleted at the end
    instead.

    Args:
        feast_client: Client used to register objects, start the job and read
            online features.
        local_staging_path: Base path used for the batch source file URL.
        kafka_server: Indexable fixture; element 0 and element 1 are joined as
            ``host:port`` to form the broker address.
        pytestconfig: Pytest config providing ``scheduled_streaming_job``.
    """
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    # Unique topic per run so that runs do not see each other's records.
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    if not pytestconfig.getoption("scheduled_streaming_job"):
        job = feast_client.start_stream_to_online_ingestion(feature_table)
    else:
        job = None

    # All checks and waits run under try/finally so the cleanup below happens
    # even if the job never becomes ready or no consumer shows up.
    try:
        if job is not None:
            assert job.get_feature_table() == feature_table.name
            wait_retry_backoff(
                lambda: (None,
                         job.get_status() == SparkJobStatus.IN_PROGRESS),
                120)

        wait_retry_backoff(
            lambda: (None, check_consumer_exist(kafka_broker, topic_name)),
            300)

        test_data = generate_data()[[
            "s2id", "unique_drivers", "event_timestamp"
        ]]

        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=[{
                "s2id": s2_id
            } for s2_id in test_data["s2id"].tolist()],
            feature_names=["drivers_stream:unique_drivers"],
        )
    finally:
        if job is not None:
            job.cancel()
        else:
            feast_client.delete_feature_table(feature_table.name)

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        test_data[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers_stream:unique_drivers"}),
    )
Пример #5
0
def test_validation_with_ge(feast_client: Client, kafka_server):
    """Check streaming ingestion against a Great Expectations suite.

    An expectation suite is built on training data and applied to the feature
    table as a validation UDF. Test data is then ingested through Kafka. Rows
    that the suite reports as unexpected are excluded from the expected
    ingested count and are expected to come back with empty feature values.

    Args:
        feast_client: Client used to register objects, start the job and read
            online features.
        kafka_server: Indexable fixture; element 0 and element 1 are joined as
            ``host:port`` to form the broker address.
    """
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    # Unique topic per run so that runs do not see each other's records.
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name,
                                          "validation_ge")
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client,
                     feature_table,
                     udf,
                     validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    # Everything that happens after the job has been started is guarded by
    # try/finally, so the job is cancelled even if a start-up wait fails.
    try:
        wait_retry_backoff(
            lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS),
            120)

        wait_retry_backoff(
            lambda: (None, check_consumer_exist(kafka_broker, topic_name)),
            120)

        test_data = generate_test_data()
        ge_ds = PandasDataset(test_data)
        validation_result = ge_ds.validate(expectations,
                                           result_format="COMPLETE")
        # Union of the rows reported as unexpected by any check.
        invalid_idx = list({
            idx
            for check in validation_result.results
            for idx in check.result["unexpected_index_list"]
        })

        entity_rows = [{"key": key} for key in test_data["key"].tolist()]

        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_ge:num", "validation_ge:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    # Build the expected frame: invalid rows carry no feature values.
    # Cast to float first so the column can hold NaN.
    test_data["num"] = test_data["num"].astype(np.float64)
    # Assign through the DataFrame in a single indexing step. The chained form
    # `test_data["num"].iloc[...] = ...` may write to a temporary copy and is
    # a no-op under pandas Copy-on-Write.
    # NOTE(review): the indices are used positionally, as before; this assumes
    # test_data has a default RangeIndex - confirm.
    test_data.iloc[invalid_idx, test_data.columns.get_loc("num")] = np.nan
    test_data.iloc[invalid_idx, test_data.columns.get_loc("set")] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_ge:num", "validation_ge:set"]],
        test_data[["key", "num", "set"]].rename(columns={
            "num": "validation_ge:num",
            "set": "validation_ge:set"
        }),
    )
Пример #6
0
def test_validation_reports_metrics(feast_client: Client, kafka_server,
                                    statsd_server: StatsDServer):
    """Check that failed validation checks are reported as StatsD metrics.

    An expectation suite is built on training data and applied to the feature
    table as a validation UDF. After test data has been ingested and the job
    cancelled, the test waits until ``statsd_server.metrics`` holds, for each
    check, a ``feast_feature_validation_check_failed`` metric whose value
    equals the unexpected count computed locally.

    Args:
        feast_client: Client used to register objects, start the job and read
            online features.
        kafka_server: Indexable fixture; element 0 and element 1 are joined as
            ``host:port`` to form the broker address.
        statsd_server: Fixture whose ``metrics`` mapping is polled for the
            expected metric values.
    """
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    # Unique topic per run so that runs do not see each other's records.
    topic_name = f"avro-{uuid.uuid4()}"

    entity, feature_table = create_schema(kafka_broker, topic_name,
                                          "validation_ge_metrics")
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations, feature_table)
    apply_validation(feast_client,
                     feature_table,
                     udf,
                     validation_window_secs=10)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    # Everything that happens after the job has been started is guarded by
    # try/finally, so the job is cancelled even if a start-up wait fails.
    try:
        wait_retry_backoff(
            lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS),
            120)

        wait_retry_backoff(
            lambda: (None, check_consumer_exist(kafka_broker, topic_name)),
            120)

        test_data = generate_test_data()
        ge_ds = PandasDataset(test_data)
        validation_result = ge_ds.validate(expectations,
                                           result_format="COMPLETE")
        # NOTE(review): assumes results come back in the order the
        # expectations were declared (between-check first, in-set second) -
        # confirm.
        unexpected_counts = {
            "expect_column_values_to_be_between_num_0_100":
            validation_result.results[0].result["unexpected_count"],
            "expect_column_values_to_be_in_set_set":
            validation_result.results[1].result["unexpected_count"],
        }
        # Union of the rows reported as unexpected by any check.
        invalid_idx = list({
            idx
            for check in validation_result.results
            for idx in check.result["unexpected_index_list"]
        })

        entity_rows = [{"key": key} for key in test_data["key"].tolist()]

        ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=[
                "validation_ge_metrics:num", "validation_ge_metrics:set"
            ],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    expected_metrics = [(
        f"feast_feature_validation_check_failed#check:{check_name},"
        f"feature_table:{feature_table.name},project:{feast_client.project}",
        value,
    ) for check_name, value in unexpected_counts.items()]
    # NOTE: timeout_msg is built before the wait starts, so the "actual"
    # metrics in it are a snapshot taken at call time, not at timeout.
    wait_retry_backoff(
        lambda: (
            None,
            all(
                statsd_server.metrics.get(m) == v
                for m, v in expected_metrics),
        ),
        timeout_secs=30,
        timeout_msg="Expected metrics were not received: " +
        str(expected_metrics) + "\n"
        "Actual received metrics: " + str(statsd_server.metrics),
    )
Пример #7
0
def test_validation_with_ge(feast_client: Client, kafka_server):
    """Check streaming ingestion against a Great Expectations suite.

    An expectation suite is built on training data and applied to the feature
    table as a validation UDF. Test data is then ingested through Kafka. Rows
    that the suite reports as unexpected are excluded from the expected
    ingested count and are expected to come back with empty feature values.

    Args:
        feast_client: Client used to register objects, start the job and read
            online features.
        kafka_server: Indexable fixture; element 0 and element 1 are joined as
            ``host:port`` to form the broker address.
    """
    entity = Entity(name="key", description="Key", value_type=ValueType.INT64)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    # Unique topic per run so that runs do not see each other's records.
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="validation_test",
        entities=["key"],
        features=[
            Feature("num", ValueType.INT64),
            Feature("set", ValueType.STRING)
        ],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url="/dev/null",
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )
    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    train_data = generate_train_data()
    ge_ds = PandasDataset(train_data)
    ge_ds.expect_column_values_to_be_between("num", 0, 100)
    ge_ds.expect_column_values_to_be_in_set("set", ["a", "b", "c"])
    expectations = ge_ds.get_expectation_suite()

    udf = create_validation_udf("testUDF", expectations)
    apply_validation(feast_client,
                     feature_table,
                     udf,
                     validation_window_secs=1)

    job = feast_client.start_stream_to_online_ingestion(feature_table)

    # Everything that happens after the job has been started is guarded by
    # try/finally, so the job is cancelled even if a start-up wait fails.
    try:
        wait_retry_backoff(
            lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS),
            120)

        wait_retry_backoff(
            lambda: (None, check_consumer_exist(kafka_broker, topic_name)),
            120)

        test_data = generate_test_data()
        ge_ds = PandasDataset(test_data)
        validation_result = ge_ds.validate(expectations,
                                           result_format="COMPLETE")
        # Union of the rows reported as unexpected by any check.
        invalid_idx = list({
            idx
            for check in validation_result.results
            for idx in check.result["unexpected_index_list"]
        })

        entity_rows = [{"key": key} for key in test_data["key"].tolist()]

        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=entity_rows,
            feature_names=["validation_test:num", "validation_test:set"],
            expected_ingested_count=test_data.shape[0] - len(invalid_idx),
        )
    finally:
        job.cancel()

    # Build the expected frame: invalid rows carry no feature values.
    # Cast to float first so the column can hold NaN.
    test_data["num"] = test_data["num"].astype(np.float64)
    # Assign through the DataFrame in a single indexing step. The chained form
    # `test_data["num"].iloc[...] = ...` may write to a temporary copy and is
    # a no-op under pandas Copy-on-Write.
    # NOTE(review): the indices are used positionally, as before; this assumes
    # test_data has a default RangeIndex - confirm.
    test_data.iloc[invalid_idx, test_data.columns.get_loc("num")] = np.nan
    test_data.iloc[invalid_idx, test_data.columns.get_loc("set")] = None

    pd.testing.assert_frame_equal(
        ingested[["key", "validation_test:num", "validation_test:set"]],
        test_data[["key", "num", "set"]].rename(columns={
            "num": "validation_test:num",
            "set": "validation_test:set"
        }),
    )