Example #1
def ensure_stream_ingestion_jobs(client: feast_spark.Client,
                                 all_projects: bool):
    """Ensures all required stream ingestion jobs are running and cleans up the unnecessary jobs.

    More concretely, it will determine
    - which stream ingestion jobs are running
    - which stream ingestion jobs should be running
    And it'll do 2 kinds of operations
    - Cancel all running jobs that should not be running
    - Start all non-existent jobs that should be running

    Args:
        all_projects (bool): If true, runs the check for all project.
                             Otherwise only checks the client's current project.
    """

    projects = (client.feature_store.list_projects()
                if all_projects else [client.feature_store.project])

    expected_job_hash_to_table_refs = _get_expected_job_hash_to_table_refs(
        client, projects)

    expected_job_hashes = set(expected_job_hash_to_table_refs.keys())

    jobs_by_hash: Dict[str, StreamIngestionJob] = {}
    for job in client.list_jobs(include_terminated=False):
        if isinstance(job, StreamIngestionJob):
            jobs_by_hash[job.get_hash()] = job

    existing_job_hashes = set(jobs_by_hash.keys())

    job_hashes_to_cancel = existing_job_hashes - expected_job_hashes
    job_hashes_to_start = expected_job_hashes - existing_job_hashes

    logging.debug(
        f"existing_job_hashes = {sorted(list(existing_job_hashes))} expected_job_hashes = {sorted(list(expected_job_hashes))}"
    )

    for job_hash in job_hashes_to_cancel:
        job = jobs_by_hash[job_hash]
        logging.info(
            f"Cancelling a stream ingestion job with job_hash={job_hash} job_id={job.get_id()} status={job.get_status()}"
        )
        try:
            job.cancel()
        except FailedPrecondition as exc:
            logging.warning(f"Job canceling failed with exception {exc}")

    for job_hash in job_hashes_to_start:
        # Any job that we wish to start should be in the expected table refs map
        project, table_name = expected_job_hash_to_table_refs[job_hash]
        logging.info(
            f"Starting a stream ingestion job for project={project}, table_name={table_name} with job_hash={job_hash}"
        )
        feature_table = client.feature_store.get_feature_table(name=table_name,
                                                               project=project)
        client.start_stream_to_online_ingestion(feature_table, [],
                                                project=project)
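
The reconciliation above boils down to two set differences over job hashes. A minimal, standalone sketch of that idea, using plain dictionaries in place of the Feast clients and jobs (all names and values below are illustrative):

expected = {"hash-a": ("project_1", "driver_stats"),
            "hash-b": ("project_1", "customer_stats")}  # jobs that should run
running = {"hash-b": "job-123", "hash-c": "job-456"}    # jobs currently running

# Running but no longer wanted -> cancel; wanted but not running -> start.
hashes_to_cancel = set(running) - set(expected)
hashes_to_start = set(expected) - set(running)

for job_hash in hashes_to_cancel:
    print(f"would cancel job {running[job_hash]} (hash={job_hash})")
for job_hash in hashes_to_start:
    project, table_name = expected[job_hash]
    print(f"would start ingestion for {project}/{table_name} (hash={job_hash})")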
Example #2
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(name="long_entity_name" * 10,
                    description="S2id",
                    value_type=ValueType.INT64)

    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10",
        entities=[entity.name],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data().rename(columns={"s2id": entity.name})
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() +
        timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)
    all_job_ids = [
        job.get_id() for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{
            entity.name: key
        } for key in data_sample[entity.name].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[[entity.name, f"{feature_table.name}:unique_drivers"]],
        data_sample[[entity.name, "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"}),
    )
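
wait_retry_backoff is used throughout these tests with a callable that returns a (result, is_done) tuple. A simplified stand-in that assumes the same contract (poll_until and its parameters are hypothetical helpers, not part of Feast):

import time

def poll_until(predicate, timeout_secs=180, interval_secs=2):
    # Call `predicate` until it reports (result, True) or the timeout expires,
    # mirroring how the test waits for SparkJobStatus.COMPLETED above.
    deadline = time.monotonic() + timeout_secs
    while time.monotonic() < deadline:
        result, done = predicate()
        if done:
            return result
        time.sleep(interval_secs)
    raise TimeoutError(f"condition not met within {timeout_secs}s")

# Usage mirroring the test:
# poll_until(lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)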
def ingest_and_verify(
    feast_client: Client,
    feast_spark_client: SparkClient,
    feature_table: FeatureTable,
    original: pd.DataFrame,
):
    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        original.event_timestamp.min().to_pydatetime(),
        original.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )
    assert job.get_feature_table() == feature_table.name

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180
    )

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{"s2id": s2_id} for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", f"{feature_table.name}:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"}
        ),
    )
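
The final comparison relies on online features coming back keyed as "<table>:<feature>", so the original frame's column is renamed to match before pandas compares the two. A self-contained sketch of just that step (the table name and data are made up):

import pandas as pd

table = "driver_stats"  # hypothetical table name
original = pd.DataFrame({"s2id": [1, 2], "unique_drivers": [10, 20]})
retrieved = pd.DataFrame({"s2id": [1, 2], f"{table}:unique_drivers": [10, 20]})

# Rename the raw column to the "<table>:<feature>" form before comparing.
pd.testing.assert_frame_equal(
    retrieved[["s2id", f"{table}:unique_drivers"]],
    original.rename(columns={"unique_drivers": f"{table}:unique_drivers"}),
)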
Example #4
def start_job_service() -> None:
    """
    Start Feast Job Service
    """

    log_fmt = "%(asctime)s %(levelname)s %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    feast_client = FeastClient()
    client = Client(feast_client)

    if client.config.getboolean(opt.JOB_SERVICE_ENABLE_CONTROL_LOOP):
        # Start the control loop thread only if it's enabled from configs
        thread = threading.Thread(target=start_control_loop, daemon=True)
        thread.start()

    server = grpc.server(ThreadPoolExecutor(),
                         interceptors=(LoggingInterceptor(), ))
    JobService_pb2_grpc.add_JobServiceServicer_to_server(
        JobServiceServicer(client), server)
    LegacyJobService_pb2_grpc.add_JobServiceServicer_to_server(
        JobServiceServicer(client), server)
    add_HealthServicer_to_server(HealthServicerImpl(), server)
    server.add_insecure_port("[::]:6568")
    server.start()
    logging.info("Feast Job Service is listening on port :6568")
    server.wait_for_termination()
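
Stripped of the Feast-specific servicers, the gRPC wiring above is a handful of calls. A minimal sketch with no handlers registered (in the real service the generated add_*Servicer_to_server helpers are called before start()):

from concurrent.futures import ThreadPoolExecutor

import grpc

def serve(port: int = 6568) -> None:
    server = grpc.server(ThreadPoolExecutor())
    # Register servicers here before starting, e.g. the JobService and
    # health-check implementations used above.
    server.add_insecure_port(f"[::]:{port}")
    server.start()
    server.wait_for_termination()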
Example #5
def start_job_service() -> None:
    """
    Start Feast Job Service
    """
    feast_client = FeastClient()
    client = Client(feast_client)

    if client.config.getboolean(opt.JOB_SERVICE_ENABLE_CONTROL_LOOP):
        # Start the control loop thread only if it's enabled from configs
        thread = threading.Thread(target=start_control_loop, daemon=True)
        thread.start()

    metricServerThread = threading.Thread(
        target=start_prometheus_serving,
        daemon=True,
        args=[client.config.getint(opt.JOB_SERVICE_PROMETHEUS_METRIC_PORT)],
    )
    metricServerThread.start()

    server = grpc.server(ThreadPoolExecutor(),
                         interceptors=(LoggingInterceptor(), ))
    JobService_pb2_grpc.add_JobServiceServicer_to_server(
        JobServiceServicer(client), server)
    LegacyJobService_pb2_grpc.add_JobServiceServicer_to_server(
        JobServiceServicer(client), server)
    add_HealthServicer_to_server(HealthServicerImpl(), server)
    server.add_insecure_port("[::]:6568")
    server.start()
    logger.info("Feast Job Service is listening on port :6568")
    server.wait_for_termination()
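
The main addition over the previous variant is the Prometheus metrics server, which runs in a daemon thread so it neither blocks the gRPC server nor keeps the process alive on shutdown. A generic sketch of that pattern (serve_metrics is a placeholder for start_prometheus_serving, and the port is illustrative):

import threading
import time

def serve_metrics(port: int) -> None:
    # Placeholder: a real implementation would expose Prometheus metrics on
    # `port` and block, much like start_prometheus_serving above.
    while True:
        time.sleep(60)

metrics_thread = threading.Thread(target=serve_metrics, daemon=True, args=[9090])
metrics_thread.start()
# daemon=True: the thread dies with the main thread instead of keeping the
# process alive after the gRPC server terminates.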
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10really11really12long13name",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data()
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() +
        timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)
    all_job_ids = [
        job.get_id() for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids
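
Both ingestion tests derive the job's time window from the sample data: from the earliest event timestamp to one second past the latest, so the newest rows are covered (presumably because the upper bound is treated as exclusive). A standalone sketch of that derivation with made-up data:

from datetime import timedelta

import pandas as pd

data_sample = pd.DataFrame(
    {"event_timestamp": pd.to_datetime(["2021-01-01 00:00:00",
                                        "2021-01-01 00:05:00"])}
)

# Window passed to start_offline_to_online_ingestion in the tests above.
start = data_sample.event_timestamp.min().to_pydatetime()
end = data_sample.event_timestamp.max().to_pydatetime() + timedelta(seconds=1)
print(start, end)  # 2021-01-01 00:00:00 2021-01-01 00:05:01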
Example #7
def start_job(
    feast_spark_client: SparkClient, feature_table: FeatureTable, pytestconfig
):
    if pytestconfig.getoption("scheduled_streaming_job"):
        return

    job = feast_spark_client.start_stream_to_online_ingestion(feature_table)
    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 180
    )
    return job
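
The early return relies on a custom pytest command-line flag read through pytestconfig.getoption. A sketch of the conftest.py registration such a flag would need (the exact wiring here is an assumption; only the option name comes from the snippet):

# conftest.py (sketch)
def pytest_addoption(parser):
    parser.addoption(
        "--scheduled-streaming-job",
        action="store_true",
        default=False,
        help="Assume streaming ingestion jobs are started by an external "
             "scheduler instead of the test itself",
    )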
Example #8
def test_schedule_batch_ingestion_jobs(pytestconfig, feast_client: Client,
                                       feast_spark_client: SparkClient):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    feature_table = FeatureTable(
        name=f"schedule_{str(uuid.uuid4())}".replace("-", "_"),
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "0 0 * * *")
    config.load_incluster_config()
    k8s_api = client.CustomObjectsApi()

    def get_scheduled_spark_application():
        job_hash = hashlib.md5(
            f"{feast_client.project}-{feature_table.name}".encode()).hexdigest()
        resource_name = f"feast-{job_hash}"

        return k8s_api.get_namespaced_custom_object(
            group="sparkoperator.k8s.io",
            version="v1beta2",
            namespace=pytestconfig.getoption("k8s_namespace"),
            plural="scheduledsparkapplications",
            name=resource_name,
        )

    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "0 0 * * *"
    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "1 0 * * *")
    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "1 0 * * *"

    feast_spark_client.unschedule_offline_to_online_ingestion(feature_table)
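
The ScheduledSparkApplication resource name is derived deterministically from the project and table name, which is what lets the test look it up. A standalone sketch of that derivation (project and table values are illustrative):

import hashlib

project, table_name = "default", "drivers"
job_hash = hashlib.md5(f"{project}-{table_name}".encode()).hexdigest()
resource_name = f"feast-{job_hash}"
print(resource_name)  # feast-<32-char hex digest>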
Example #9
def start_control_loop() -> None:
    """Starts control loop that continuously ensures that correct jobs are being run.

    Currently this affects only the stream ingestion jobs. Please refer to
    ensure_stream_ingestion_jobs for full documentation on how the check works.

    """
    logger.info(
        "Feast Job Service is starting a control loop in a background thread, "
        "which will ensure that stream ingestion jobs are successfully running."
    )
    try:
        feature_store = FeastClient()
        client = Client(feature_store)
        while True:
            ensure_stream_ingestion_jobs(client, all_projects=True)
            time.sleep(1)
    except Exception:
        traceback.print_exc()
    finally:
        # Send interrupt signal to the main thread to kill the server if control loop fails
        os.kill(os.getpid(), signal.SIGINT)
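
The shape of the loop is worth noting: any exception in the background thread ends with a SIGINT to the whole process, so the job service never keeps serving without its control loop. A generic sketch of that pattern (control_loop and step are illustrative names):

import os
import signal
import time
import traceback

def control_loop(step, interval_secs: float = 1.0) -> None:
    try:
        while True:
            step()
            time.sleep(interval_secs)
    except Exception:
        traceback.print_exc()
    finally:
        # If the loop dies for any reason, take the whole process down with it.
        os.kill(os.getpid(), signal.SIGINT)

# threading.Thread(target=control_loop, args=[lambda: None], daemon=True).start()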
Example #10
def ensure_stream_ingestion_jobs(client: Client, all_projects: bool):
    """Ensures all required stream ingestion jobs are running and cleans up the unnecessary jobs.

    More concretely, it will determine
    - which stream ingestion jobs are running
    - which stream ingestion jobs should be running
    And it'll do 2 kinds of operations
    - Cancel all running jobs that should not be running
    - Start all non-existent jobs that should be running

    Args:
        all_projects (bool): If true, runs the check for all project.
                             Otherwise only checks the client's current project.
    """

    projects = (client.feature_store.list_projects()
                if all_projects else [client.feature_store.project])
    if client.config.exists(opt.WHITELISTED_PROJECTS):
        whitelisted_projects = client.config.get(opt.WHITELISTED_PROJECTS)
        if whitelisted_projects:
            whitelisted_projects = whitelisted_projects.split(",")
            projects = [
                project for project in projects
                if project in whitelisted_projects
            ]

    expected_job_hash_to_tables = _get_expected_job_hash_to_tables(
        client, projects)

    expected_job_hashes = set(expected_job_hash_to_tables.keys())

    jobs_by_hash: Dict[str, StreamIngestionJob] = {}
    # When retrying of failed jobs is enabled, terminated jobs are excluded here,
    # so the control loop behaves as if no job exists and spawns a new one.
    retry_failed_jobs = client.config.getboolean(
        opt.JOB_SERVICE_RETRY_FAILED_JOBS)
    for job in client.list_jobs(include_terminated=not retry_failed_jobs):
        if (isinstance(job, StreamIngestionJob)
                and job.get_status() != SparkJobStatus.COMPLETED):
            jobs_by_hash[job.get_hash()] = job

    existing_job_hashes = set(jobs_by_hash.keys())

    job_hashes_to_cancel = existing_job_hashes - expected_job_hashes
    job_hashes_to_start = expected_job_hashes - existing_job_hashes

    logger.debug(f"existing_job_hashes = {sorted(list(existing_job_hashes))} "
                 f"expected_job_hashes = {sorted(list(expected_job_hashes))}")

    for job_hash in job_hashes_to_start:
        # Any job that we wish to start should be in the expected job-hash-to-tables map
        project, feature_table = expected_job_hash_to_tables[job_hash]
        logger.warning(
            f"Starting a stream ingestion job for project={project}, "
            f"table_name={feature_table.name} with job_hash={job_hash}")
        client.start_stream_to_online_ingestion(feature_table, [],
                                                project=project)

        # Pause between submissions so the scheduler does not see a peak load
        time.sleep(client.config.getint(opt.JOB_SERVICE_PAUSE_BETWEEN_JOBS))

    for job_hash in job_hashes_to_cancel:
        job = jobs_by_hash[job_hash]
        if job.get_status() != SparkJobStatus.IN_PROGRESS:
            logger.warning(
                f"Can't cancel job with job_hash={job_hash} job_id={job.get_id()} status={job.get_status()}"
            )
            continue

        logger.warning(
            f"Cancelling a stream ingestion job with job_hash={job_hash} job_id={job.get_id()} status={job.get_status()}"
        )
        try:
            job.cancel()
        except FailedPrecondition as exc:
            logger.error(f"Job canceling failed with exception {exc}")
Example #11
def test_streaming_ingestion(
    feast_client: Client,
    feast_spark_client: SparkClient,
    local_staging_path: str,
    kafka_server,
    pytestconfig,
):
    entity = Entity(name="s2id", description="S2id", value_type=ValueType.INT64,)
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    if not pytestconfig.getoption("scheduled_streaming_job"):
        job = feast_spark_client.start_stream_to_online_ingestion(feature_table)
        assert job.get_feature_table() == feature_table.name
        wait_retry_backoff(
            lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 180
        )
    else:
        job = None

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 300
    )

    test_data = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=[{"s2id": s2_id} for s2_id in test_data["s2id"].tolist()],
            feature_names=["drivers_stream:unique_drivers"],
        )
    finally:
        if job:
            job.cancel()
        else:
            feast_client.delete_feature_table(feature_table.name)

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        test_data[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": "drivers_stream:unique_drivers"}
        ),
    )
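
The try/finally block is what guarantees cleanup: whether or not the ingest-and-retrieve step succeeds, the streaming job started by this test is cancelled, and when the job is externally scheduled the feature table is deleted instead so that job stops too. A generic sketch of that discipline (run_checks, job, and client are placeholders):

def run_with_cleanup(run_checks, job, client, table_name: str):
    try:
        return run_checks()
    finally:
        if job:
            # Job started by the test itself: stop it directly.
            job.cancel()
        else:
            # Externally scheduled job: removing the table stops ingestion.
            client.delete_feature_table(table_name)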
Example #12
def test_historical_features(
    feast_client: Client,
    feast_spark_client: SparkClient,
    tfrecord_feast_client: Client,
    batch_source: Union[BigQuerySource, FileSource],
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # remove microseconds because job.get_start_time() does not contain microseconds
    job_submission_time = datetime.utcnow().replace(microsecond=0)
    job = feast_spark_client.get_historical_features(feature_refs, customers_df)
    assert job.get_start_time() >= job_submission_time
    assert job.get_start_time() <= job_submission_time + timedelta(hours=1)

    output_dir = job.get_output_file_uri()

    # will both be None if not using Azure blob storage
    account_name, account_key = _get_azure_creds(feast_client)

    joined_df = read_parquet(
        output_dir, azure_account_name=account_name, azure_account_key=account_key
    )

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(drop=True),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )

    job = feast_spark_client.get_historical_features(feature_refs, customers_df)
    job.get_output_file_uri()
    assert job.get_status() == SparkJobStatus.COMPLETED
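
The start-time assertion brackets job.get_start_time() between the (second-precision) submission time and one hour later. A self-contained sketch of that check, with a stand-in value for the job's reported start time:

from datetime import datetime, timedelta

job_submission_time = datetime.utcnow().replace(microsecond=0)
job_start_time = job_submission_time + timedelta(seconds=5)  # stand-in value

assert job_submission_time <= job_start_time
assert job_start_time <= job_submission_time + timedelta(hours=1)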