def offline_to_online_ingestion(
    self, ingestion_job_params: BatchIngestionJobParameters
) -> BatchIngestionJob:
    """
    Submit a batch ingestion job to a Spark cluster.

    Raises:
        SparkJobFailure: The spark job submission failed, encountered error
            during execution, or timeout.

    Returns:
        BatchIngestionJob: wrapper around the remote job that can be used to
        check when the job has completed.
    """
    # Stage the job's main file in the data lake so the cluster can fetch it.
    uploaded_main_file = self._datalake.upload_file(
        ingestion_job_params.get_main_file_path()
    )

    submitted = _submit_job(
        self._api,
        ingestion_job_params.get_project() + "_offline_to_online_ingestion",
        uploaded_main_file,
        main_class=ingestion_job_params.get_class_name(),
        arguments=ingestion_job_params.get_arguments(),
        # The uploaded file doubles as the job's only reference file.
        reference_files=[uploaded_main_file],
        tags=_prepare_job_tags(ingestion_job_params, OFFLINE_TO_ONLINE_JOB_TYPE),
        configuration=None,
    )
    return cast(BatchIngestionJob, self._job_from_job_info(submitted))
def offline_to_online_ingestion(
    self, ingestion_job_params: BatchIngestionJobParameters
) -> BatchIngestionJob:
    """
    Submit a batch ingestion job to a Spark cluster.

    Raises:
        SparkJobFailure: The spark job submission failed, encountered error
            during execution, or timeout.

    Returns:
        BatchIngestionJob: wrapper around the remote job that can be used to
        check when the job has completed.
    """
    # Stage the ingestion jar on S3 so the EMR cluster can pull it.
    staged_jar = _upload_jar(
        self._staging_location, ingestion_job_params.get_main_file_path()
    )

    # Build the EMR step that syncs offline data into the online store.
    ingestion_step = _sync_offline_to_online_step(
        staged_jar,
        ingestion_job_params.get_feature_table_name(),
        args=ingestion_job_params.get_arguments(),
    )

    emr_job_ref = self._submit_emr_job(ingestion_step)
    return EmrBatchIngestionJob(
        self._emr_client(),
        emr_job_ref,
        ingestion_job_params.get_feature_table_name(),
    )
def offline_to_online_ingestion(
    self, ingestion_job_params: BatchIngestionJobParameters
) -> BatchIngestionJob:
    """
    Launch a batch ingestion job as a local standalone Spark process.

    Returns:
        BatchIngestionJob: wrapper around the spawned spark-submit process,
        registered in the global job cache for later lookup.
    """
    new_job_id = str(uuid.uuid4())
    spark_ui_port = _find_free_port()

    # spark_submit starts the process immediately; the job object wraps it.
    submit_process = self.spark_submit(ingestion_job_params, spark_ui_port)
    batch_job = StandaloneClusterBatchIngestionJob(
        new_job_id,
        ingestion_job_params.get_name(),
        submit_process,
        spark_ui_port,
        ingestion_job_params.get_feature_table_name(),
    )

    global_job_cache.add_job(batch_job)
    return batch_job
def start_offline_to_online_ingestion(
    client: "Client",
    project: str,
    feature_table: FeatureTable,
    start: datetime,
    end: datetime,
) -> BatchIngestionJob:
    """
    Launch a batch ingestion job that loads data from *feature_table*'s batch
    source into the online store for the given time range.

    Args:
        client: Feast client providing the launcher and Spark configuration.
        project: Feast project the feature table belongs to.
        feature_table: Feature table whose batch source should be ingested.
        start: Start of the ingestion time range.
        end: End of the ingestion time range.

    Returns:
        BatchIngestionJob: wrapper around the remote job that can be used to
        check when the job has completed.
    """
    launcher = resolve_launcher(client.config)

    # Read the statsd toggle once; the previous `enabled and value` form
    # evaluated STATSD_ENABLED twice and passed the boolean False (rather
    # than None) as host/port when statsd was disabled.
    statsd_enabled = client.config.getboolean(opt.STATSD_ENABLED)

    return launcher.offline_to_online_ingestion(
        BatchIngestionJobParameters(
            jar=client.config.get(opt.SPARK_INGESTION_JAR),
            source=_source_to_argument(feature_table.batch_source, client.config),
            feature_table=_feature_table_to_argument(client, project, feature_table),
            start=start,
            end=end,
            redis_host=client.config.get(opt.REDIS_HOST),
            redis_port=client.config.getint(opt.REDIS_PORT),
            redis_ssl=client.config.getboolean(opt.REDIS_SSL),
            statsd_host=(
                client.config.get(opt.STATSD_HOST) if statsd_enabled else None
            ),
            statsd_port=(
                client.config.getint(opt.STATSD_PORT) if statsd_enabled else None
            ),
            deadletter_path=client.config.get(opt.DEADLETTER_PATH),
            stencil_url=client.config.get(opt.STENCIL_URL),
        )
    )
def offline_to_online_ingestion(
    self, ingestion_job_params: BatchIngestionJobParameters
) -> BatchIngestionJob:
    """
    Submit a batch ingestion job to a Spark cluster.

    Raises:
        SparkJobFailure: The spark job submission failed, encountered error
            during execution, or timeout.

    Returns:
        BatchIngestionJob: wrapper around the remote job that can be used to
        check when the job has completed.
    """
    # Stage the ingestion jar so the cluster can fetch it.
    staged_jar_path = self._upload_jar(ingestion_job_params.get_main_file_path())
    new_job_id = _generate_job_id()

    # Labels identifying the feature table; the name is truncated/hashed to
    # satisfy label-length limits while remaining matchable.
    feature_table_labels = {
        LABEL_FEATURE_TABLE: _truncate_label(
            ingestion_job_params.get_feature_table_name()
        ),
        LABEL_FEATURE_TABLE_HASH: _generate_table_hash(
            ingestion_job_params.get_feature_table_name()
        ),
    }

    job_resource = _prepare_job_resource(
        job_template=self._resource_template,
        job_id=new_job_id,
        job_type=OFFLINE_TO_ONLINE_JOB_TYPE,
        main_application_file=staged_jar_path,
        main_class=ingestion_job_params.get_class_name(),
        packages=[BQ_SPARK_PACKAGE],
        jars=[],
        extra_metadata={},
        azure_credentials=self._get_azure_credentials(),
        arguments=ingestion_job_params.get_arguments(),
        namespace=self._namespace,
        extra_labels=feature_table_labels,
    )

    submitted = _submit_job(
        api=self._api,
        resource=job_resource,
        namespace=self._namespace,
    )
    return cast(BatchIngestionJob, self._job_from_job_info(submitted))