def ensure_stream_ingestion_jobs(client: feast_spark.Client, all_projects: bool):
    """Ensures all required stream ingestion jobs are running and cleans up the unnecessary jobs.

    More concretely, it will determine
    - which stream ingestion jobs are running
    - which stream ingestion jobs should be running

    And it'll do 2 kinds of operations
    - Cancel all running jobs that should not be running
    - Start all non-existent jobs that should be running

    Args:
        all_projects (bool): If true, runs the check for all projects.
            Otherwise only checks the client's current project.
    """
    projects = (
        client.feature_store.list_projects()
        if all_projects
        else [client.feature_store.project]
    )

    expected_job_hash_to_table_refs = _get_expected_job_hash_to_table_refs(
        client, projects
    )
    expected_job_hashes = set(expected_job_hash_to_table_refs.keys())

    jobs_by_hash: Dict[str, StreamIngestionJob] = {}
    for job in client.list_jobs(include_terminated=False):
        if isinstance(job, StreamIngestionJob):
            jobs_by_hash[job.get_hash()] = job

    existing_job_hashes = set(jobs_by_hash.keys())

    job_hashes_to_cancel = existing_job_hashes - expected_job_hashes
    job_hashes_to_start = expected_job_hashes - existing_job_hashes

    logging.debug(
        f"existing_job_hashes = {sorted(list(existing_job_hashes))} "
        f"expected_job_hashes = {sorted(list(expected_job_hashes))}"
    )

    for job_hash in job_hashes_to_cancel:
        job = jobs_by_hash[job_hash]
        logging.info(
            f"Cancelling a stream ingestion job with job_hash={job_hash} "
            f"job_id={job.get_id()} status={job.get_status()}"
        )
        try:
            job.cancel()
        except FailedPrecondition as exc:
            logging.warning(f"Job canceling failed with exception {exc}")

    for job_hash in job_hashes_to_start:
        # Any job that we wish to start should be in the expected table refs map
        project, table_name = expected_job_hash_to_table_refs[job_hash]
        logging.info(
            f"Starting a stream ingestion job for project={project}, "
            f"table_name={table_name} with job_hash={job_hash}"
        )
        feature_table = client.feature_store.get_feature_table(
            name=table_name, project=project
        )
        client.start_stream_to_online_ingestion(feature_table, [], project=project)

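# Illustrative sketch (not the actual feast_spark implementation): the helper
# _get_expected_job_hash_to_table_refs used above is assumed to build a
# {job_hash: (project, table_name)} map for every feature table that has a
# stream source. In the real helper the hash must be derived from the same
# ingestion-job parameters that StreamIngestionJob.get_hash() reports, or the
# reconciliation above could never match; the md5 over project/table below is
# only a stand-in for that scheme.
import hashlib
from typing import Dict, List, Tuple


def _get_expected_job_hash_to_table_refs_sketch(
    client: feast_spark.Client, projects: List[str]
) -> Dict[str, Tuple[str, str]]:
    expected: Dict[str, Tuple[str, str]] = {}
    for project in projects:
        for table in client.feature_store.list_feature_tables(project=project):
            if table.stream_source is None:
                # Only tables backed by a stream source need a streaming job
                continue
            job_hash = hashlib.md5(f"{project}:{table.name}".encode()).hexdigest()
            expected[job_hash] = (project, table.name)
    return expected
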
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="long_entity_name" * 10,
        description="S2id",
        value_type=ValueType.INT64,
    )
    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10",
        entities=[entity.name],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data().rename(columns={"s2id": entity.name})
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )
    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180
    )

    all_job_ids = [
        job.get_id()
        for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{entity.name: key} for key in data_sample[entity.name].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[[entity.name, f"{feature_table.name}:unique_drivers"]],
        data_sample[[entity.name, "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"}
        ),
    )

def ingest_and_verify(
    feast_client: Client,
    feast_spark_client: SparkClient,
    feature_table: FeatureTable,
    original: pd.DataFrame,
):
    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        original.event_timestamp.min().to_pydatetime(),
        original.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )
    assert job.get_feature_table() == feature_table.name

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180
    )

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{"s2id": s2_id} for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", f"{feature_table.name}:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"}
        ),
    )

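# wait_retry_backoff comes from Feast's helpers; the sketch below only
# illustrates the contract the tests above rely on: retry_fn returns a
# (result, is_done) tuple and timeout_secs is an overall deadline in seconds.
# The real implementation also applies exponential backoff and richer error
# reporting, which this simplified version omits.
import time
from typing import Any, Callable, Tuple


def wait_retry_backoff_sketch(
    retry_fn: Callable[[], Tuple[Any, bool]],
    timeout_secs: int,
    delay_secs: float = 2.0,
) -> Any:
    deadline = time.time() + timeout_secs
    while True:
        result, is_done = retry_fn()
        if is_done:
            return result
        if time.time() > deadline:
            raise TimeoutError(f"Condition not met within {timeout_secs} seconds")
        time.sleep(delay_secs)
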
def start_job_service() -> None:
    """
    Start Feast Job Service
    """
    log_fmt = "%(asctime)s %(levelname)s %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    feast_client = FeastClient()
    client = Client(feast_client)

    if client.config.getboolean(opt.JOB_SERVICE_ENABLE_CONTROL_LOOP):
        # Start the control loop thread only if it's enabled from configs
        thread = threading.Thread(target=start_control_loop, daemon=True)
        thread.start()

    server = grpc.server(ThreadPoolExecutor(), interceptors=(LoggingInterceptor(),))
    JobService_pb2_grpc.add_JobServiceServicer_to_server(
        JobServiceServicer(client), server
    )
    LegacyJobService_pb2_grpc.add_JobServiceServicer_to_server(
        JobServiceServicer(client), server
    )
    add_HealthServicer_to_server(HealthServicerImpl(), server)
    server.add_insecure_port("[::]:6568")
    server.start()
    logging.info("Feast Job Service is listening on port :6568")
    server.wait_for_termination()

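# LoggingInterceptor and HealthServicerImpl are referenced above but defined
# elsewhere in the job service module. The sketches below are assumptions
# about their shape, built only on public grpc / grpc_health APIs: a server
# interceptor that logs each incoming method, and a health servicer that
# always reports SERVING.
import logging

import grpc
from grpc_health.v1 import health_pb2, health_pb2_grpc


class LoggingInterceptorSketch(grpc.ServerInterceptor):
    def intercept_service(self, continuation, handler_call_details):
        # Log the fully qualified gRPC method name, then continue the handler chain
        logging.info("Received call to %s", handler_call_details.method)
        return continuation(handler_call_details)


class HealthServicerSketch(health_pb2_grpc.HealthServicer):
    def Check(self, request, context):
        # Report SERVING unconditionally; a real implementation could also
        # inspect the control loop thread or server state here.
        return health_pb2.HealthCheckResponse(
            status=health_pb2.HealthCheckResponse.SERVING
        )
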
def start_job_service() -> None:
    """
    Start Feast Job Service
    """
    feast_client = FeastClient()
    client = Client(feast_client)

    if client.config.getboolean(opt.JOB_SERVICE_ENABLE_CONTROL_LOOP):
        # Start the control loop thread only if it's enabled from configs
        thread = threading.Thread(target=start_control_loop, daemon=True)
        thread.start()

    metric_server_thread = threading.Thread(
        target=start_prometheus_serving,
        daemon=True,
        args=[client.config.getint(opt.JOB_SERVICE_PROMETHEUS_METRIC_PORT)],
    )
    metric_server_thread.start()

    server = grpc.server(ThreadPoolExecutor(), interceptors=(LoggingInterceptor(),))
    JobService_pb2_grpc.add_JobServiceServicer_to_server(
        JobServiceServicer(client), server
    )
    LegacyJobService_pb2_grpc.add_JobServiceServicer_to_server(
        JobServiceServicer(client), server
    )
    add_HealthServicer_to_server(HealthServicerImpl(), server)
    server.add_insecure_port("[::]:6568")
    server.start()
    logger.info("Feast Job Service is listening on port :6568")
    server.wait_for_termination()

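# start_prometheus_serving is referenced above but not shown. The sketch below
# is an assumption: it exposes Prometheus metrics over HTTP on the configured
# port using prometheus_client's WSGI app and blocks, which is consistent with
# being run as a daemon thread target; the real function may differ.
from wsgiref.simple_server import make_server

from prometheus_client import make_wsgi_app


def start_prometheus_serving_sketch(port: int) -> None:
    # Serve the Prometheus metrics WSGI app; serve_forever() keeps the
    # metrics thread alive for the lifetime of the process.
    httpd = make_server("", port, make_wsgi_app())
    httpd.serve_forever()
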
def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10really11really12long13name",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data()
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )
    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180
    )

    all_job_ids = [
        job.get_id()
        for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids

def start_job(
    feast_spark_client: SparkClient, feature_table: FeatureTable, pytestconfig
):
    if pytestconfig.getoption("scheduled_streaming_job"):
        return

    job = feast_spark_client.start_stream_to_online_ingestion(feature_table)
    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 180
    )
    return job

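# The fixtures and tests here read pytestconfig.getoption("scheduled_streaming_job").
# The conftest.py hook below is a hypothetical sketch of how such an option
# could be registered; the actual flag name and default in the test suite's
# conftest may differ.
def pytest_addoption(parser):
    parser.addoption(
        "--scheduled_streaming_job",
        action="store_true",
        default=False,
        help="Assume stream ingestion jobs are scheduled externally "
        "instead of being started by the tests",
    )
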
def test_schedule_batch_ingestion_jobs(
    pytestconfig, feast_client: Client, feast_spark_client: SparkClient
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    batch_source = FileSource(
        file_format=ParquetFormat(),
        file_url="gs://example/feast/*",
        event_timestamp_column="datetime_col",
        created_timestamp_column="timestamp",
        date_partition_column="datetime",
    )
    feature_table = FeatureTable(
        name=f"schedule_{str(uuid.uuid4())}".replace("-", "_"),
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )
    feast_client.apply(entity)
    feast_client.apply(feature_table)

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "0 0 * * *"
    )
    config.load_incluster_config()
    k8s_api = client.CustomObjectsApi()

    def get_scheduled_spark_application():
        job_hash = hashlib.md5(
            f"{feast_client.project}-{feature_table.name}".encode()
        ).hexdigest()
        resource_name = f"feast-{job_hash}"

        return k8s_api.get_namespaced_custom_object(
            group="sparkoperator.k8s.io",
            version="v1beta2",
            namespace=pytestconfig.getoption("k8s_namespace"),
            plural="scheduledsparkapplications",
            name=resource_name,
        )

    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "0 0 * * *"

    feast_spark_client.schedule_offline_to_online_ingestion(
        feature_table, 1, "1 0 * * *"
    )
    response = get_scheduled_spark_application()
    assert response["spec"]["schedule"] == "1 0 * * *"

    feast_spark_client.unschedule_offline_to_online_ingestion(feature_table)

def start_control_loop() -> None:
    """Starts a control loop that continuously ensures that the correct jobs are being run.

    Currently this affects only the stream ingestion jobs. Please refer to
    ensure_stream_ingestion_jobs for full documentation on how the check works.
    """
    logger.info(
        "Feast Job Service is starting a control loop in a background thread, "
        "which will ensure that stream ingestion jobs are successfully running."
    )
    try:
        feature_store = FeastClient()
        client = Client(feature_store)
        while True:
            ensure_stream_ingestion_jobs(client, all_projects=True)
            time.sleep(1)
    except Exception:
        traceback.print_exc()
    finally:
        # Send interrupt signal to the main thread to kill the server if the control loop fails
        os.kill(os.getpid(), signal.SIGINT)

def ensure_stream_ingestion_jobs(client: Client, all_projects: bool):
    """Ensures all required stream ingestion jobs are running and cleans up the unnecessary jobs.

    More concretely, it will determine
    - which stream ingestion jobs are running
    - which stream ingestion jobs should be running

    And it'll do 2 kinds of operations
    - Cancel all running jobs that should not be running
    - Start all non-existent jobs that should be running

    Args:
        all_projects (bool): If true, runs the check for all projects.
            Otherwise only checks the client's current project.
    """
    projects = (
        client.feature_store.list_projects()
        if all_projects
        else [client.feature_store.project]
    )

    if client.config.exists(opt.WHITELISTED_PROJECTS):
        whitelisted_projects = client.config.get(opt.WHITELISTED_PROJECTS)
        if whitelisted_projects:
            whitelisted_projects = whitelisted_projects.split(",")
            projects = [
                project for project in projects if project in whitelisted_projects
            ]

    expected_job_hash_to_tables = _get_expected_job_hash_to_tables(client, projects)
    expected_job_hashes = set(expected_job_hash_to_tables.keys())

    jobs_by_hash: Dict[str, StreamIngestionJob] = {}
    # When we want to retry failed jobs, we shouldn't include terminated jobs here;
    # the control loop will then behave as if no job exists and will spawn a new one.
    for job in client.list_jobs(
        include_terminated=not client.config.getboolean(
            opt.JOB_SERVICE_RETRY_FAILED_JOBS
        )
    ):
        if (
            isinstance(job, StreamIngestionJob)
            and job.get_status() != SparkJobStatus.COMPLETED
        ):
            jobs_by_hash[job.get_hash()] = job

    existing_job_hashes = set(jobs_by_hash.keys())

    job_hashes_to_cancel = existing_job_hashes - expected_job_hashes
    job_hashes_to_start = expected_job_hashes - existing_job_hashes

    logger.debug(
        f"existing_job_hashes = {sorted(list(existing_job_hashes))} "
        f"expected_job_hashes = {sorted(list(expected_job_hashes))}"
    )

    for job_hash in job_hashes_to_start:
        # Any job that we wish to start should be in the expected tables map
        project, feature_table = expected_job_hash_to_tables[job_hash]
        logger.warning(
            f"Starting a stream ingestion job for project={project}, "
            f"table_name={feature_table.name} with job_hash={job_hash}"
        )
        client.start_stream_to_online_ingestion(feature_table, [], project=project)
        # Pause between submissions so the scheduler isn't hit with a peak load
        time.sleep(client.config.getint(opt.JOB_SERVICE_PAUSE_BETWEEN_JOBS))

    for job_hash in job_hashes_to_cancel:
        job = jobs_by_hash[job_hash]
        if job.get_status() != SparkJobStatus.IN_PROGRESS:
            logger.warning(
                f"Can't cancel job with job_hash={job_hash} "
                f"job_id={job.get_id()} status={job.get_status()}"
            )
            continue

        logger.warning(
            f"Cancelling a stream ingestion job with job_hash={job_hash} "
            f"job_id={job.get_id()} status={job.get_status()}"
        )
        try:
            job.cancel()
        except FailedPrecondition as exc:
            logger.error(f"Job canceling failed with exception {exc}")

def test_streaming_ingestion(
    feast_client: Client,
    feast_spark_client: SparkClient,
    local_staging_path: str,
    kafka_server,
    pytestconfig,
):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )
    kafka_broker = f"{kafka_server[0]}:{kafka_server[1]}"
    topic_name = f"avro-{uuid.uuid4()}"

    feature_table = FeatureTable(
        name="drivers_stream",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            event_timestamp_column="event_timestamp",
            created_timestamp_column="event_timestamp",
            file_format=ParquetFormat(),
            file_url=os.path.join(local_staging_path, "batch-storage"),
        ),
        stream_source=KafkaSource(
            event_timestamp_column="event_timestamp",
            bootstrap_servers=kafka_broker,
            message_format=AvroFormat(avro_schema()),
            topic=topic_name,
        ),
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    if not pytestconfig.getoption("scheduled_streaming_job"):
        job = feast_spark_client.start_stream_to_online_ingestion(feature_table)
        assert job.get_feature_table() == feature_table.name
        wait_retry_backoff(
            lambda: (None, job.get_status() == SparkJobStatus.IN_PROGRESS), 180
        )
    else:
        job = None

    wait_retry_backoff(
        lambda: (None, check_consumer_exist(kafka_broker, topic_name)), 300
    )

    test_data = generate_data()[["s2id", "unique_drivers", "event_timestamp"]]

    try:
        ingested = ingest_and_retrieve(
            feast_client,
            test_data,
            avro_schema_json=avro_schema(),
            topic_name=topic_name,
            kafka_broker=kafka_broker,
            entity_rows=[{"s2id": s2_id} for s2_id in test_data["s2id"].tolist()],
            feature_names=["drivers_stream:unique_drivers"],
        )
    finally:
        if job:
            job.cancel()
        else:
            feast_client.delete_feature_table(feature_table.name)

    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers_stream:unique_drivers"]],
        test_data[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": "drivers_stream:unique_drivers"}
        ),
    )

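# check_consumer_exist is used above to wait until the streaming job has
# actually subscribed to the Kafka topic. The sketch below is an assumption
# based on kafka-python's admin client (the real helper may use a different
# client or check): it looks for any consumer group with committed offsets on
# the topic.
from kafka import KafkaAdminClient


def check_consumer_exist_sketch(bootstrap_servers: str, topic_name: str) -> bool:
    admin = KafkaAdminClient(bootstrap_servers=bootstrap_servers)
    try:
        for group_id, _protocol_type in admin.list_consumer_groups():
            offsets = admin.list_consumer_group_offsets(group_id)
            if any(tp.topic == topic_name for tp in offsets):
                return True
        return False
    finally:
        admin.close()
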
def test_historical_features(
    feast_client: Client,
    feast_spark_client: SparkClient,
    tfrecord_feast_client: Client,
    batch_source: Union[BigQuerySource, FileSource],
):
    customer_entity = Entity(
        name="user_id", description="Customer", value_type=ValueType.INT64
    )
    feast_client.apply(customer_entity)

    max_age = Duration()
    max_age.FromSeconds(2 * 86400)

    transactions_feature_table = FeatureTable(
        name="transactions",
        entities=["user_id"],
        features=[
            Feature("daily_transactions", ValueType.DOUBLE),
            Feature("total_transactions", ValueType.DOUBLE),
        ],
        batch_source=batch_source,
        max_age=max_age,
    )

    feast_client.apply(transactions_feature_table)

    transactions_df, customers_df = generate_data()
    feast_client.ingest(transactions_feature_table, transactions_df)

    feature_refs = ["transactions:daily_transactions"]

    # remove microseconds because job.get_start_time() does not contain microseconds
    job_submission_time = datetime.utcnow().replace(microsecond=0)
    job = feast_spark_client.get_historical_features(feature_refs, customers_df)
    assert job.get_start_time() >= job_submission_time
    assert job.get_start_time() <= job_submission_time + timedelta(hours=1)

    output_dir = job.get_output_file_uri()

    # will both be None if not using Azure blob storage
    account_name, account_key = _get_azure_creds(feast_client)

    joined_df = read_parquet(
        output_dir, azure_account_name=account_name, azure_account_key=account_key
    )

    expected_joined_df = pd.DataFrame(
        {
            "event_timestamp": customers_df.event_timestamp.tolist(),
            "user_id": customers_df.user_id.tolist(),
            "transactions__daily_transactions": transactions_df.daily_transactions.tolist()
            + [None] * transactions_df.shape[0],
        }
    )

    assert_frame_equal(
        joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
        expected_joined_df.sort_values(by=["user_id", "event_timestamp"]).reset_index(
            drop=True
        ),
    )

    job = feast_spark_client.get_historical_features(feature_refs, customers_df)
    job.get_output_file_uri()
    assert job.get_status() == SparkJobStatus.COMPLETED