def start_stream_to_online_ingestion(
    self, ingestion_job_params: StreamIngestionJobParameters
) -> StreamIngestionJob:
    """
    Starts a stream ingestion job on a Spark cluster.

    Returns:
        StreamIngestionJob: wrapper around remote job that can be used to check on the job.
    """
    jar_s3_path = _upload_jar(
        self._staging_location, ingestion_job_params.get_main_file_path()
    )
    extra_jar_paths: List[str] = []
    for extra_jar in ingestion_job_params.get_extra_jar_paths():
        if extra_jar.startswith("s3://"):
            extra_jar_paths.append(extra_jar)
        else:
            extra_jar_paths.append(_upload_jar(self._staging_location, extra_jar))

    job_hash = ingestion_job_params.get_job_hash()

    step = _stream_ingestion_step(
        jar_s3_path,
        extra_jar_paths,
        ingestion_job_params.get_feature_table_name(),
        args=ingestion_job_params.get_arguments(),
        job_hash=job_hash,
    )

    job_ref = self._submit_emr_job(step)

    return EmrStreamIngestionJob(self._emr_client(), job_ref, job_hash)
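
# _upload_jar is referenced above but not shown. A minimal sketch, assuming
# the helper copies a local jar into the S3 staging location and returns the
# resulting s3:// URI (the name, signature, and bucket/key handling here are
# assumptions, not the actual implementation):
import os
from urllib.parse import urlparse

import boto3


def _upload_jar(staging_location: str, local_path: str) -> str:
    parsed = urlparse(staging_location)  # e.g. s3://bucket/artifacts
    key = os.path.join(parsed.path.lstrip("/"), os.path.basename(local_path))
    boto3.client("s3").upload_file(local_path, parsed.netloc, key)
    return f"s3://{parsed.netloc}/{key}"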

def start_stream_to_online_ingestion(
    self, ingestion_job_params: StreamIngestionJobParameters
) -> StreamIngestionJob:
    # Standalone (local Spark) variant: the job runs as a local spark-submit
    # process and is tracked in an in-process job cache.
    job_id = str(uuid.uuid4())
    ui_port = _find_free_port()
    job = StandaloneClusterStreamingIngestionJob(
        job_id,
        ingestion_job_params.get_name(),
        self.spark_submit(ingestion_job_params, ui_port),
        ui_port,
        ingestion_job_params.get_job_hash(),
    )
    global_job_cache.add_job(job)
    return job
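
# _find_free_port is not defined in this snippet; a sketch of the common
# pattern such a helper follows, asking the OS for an ephemeral port (the
# implementation below is an assumption):
import socket


def _find_free_port() -> int:
    # Bind to port 0 so the OS assigns an unused port, then release it.
    # The port can in principle be taken again before Spark binds its UI.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("", 0))
        return sock.getsockname()[1]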

def start_stream_to_online_ingestion(
    self, ingestion_job_params: StreamIngestionJobParameters
) -> StreamIngestionJob:
    # Dataproc variant: dataproc_submit returns the submitted job along with
    # callbacks to refresh its status and to cancel it.
    job, refresh_fn, cancel_fn = self.dataproc_submit(ingestion_job_params)
    job_hash = ingestion_job_params.get_job_hash()
    return DataprocStreamingIngestionJob(job, refresh_fn, cancel_fn, job_hash)

def start_stream_to_online_ingestion(
    self, ingestion_job_params: StreamIngestionJobParameters
) -> StreamIngestionJob:
    """
    Starts a stream ingestion job on a Spark cluster.

    Raises:
        SparkJobFailure: The Spark job submission failed, encountered an
            error during execution, or timed out.

    Returns:
        StreamIngestionJob: wrapper around remote job.
    """
    jar_s3_path = self._upload_jar(ingestion_job_params.get_main_file_path())
    extra_jar_paths: List[str] = []
    for extra_jar in ingestion_job_params.get_extra_jar_paths():
        extra_jar_paths.append(self._upload_jar(extra_jar))

    job_hash = ingestion_job_params.get_job_hash()
    job_id = _generate_job_id()

    resource = _prepare_job_resource(
        job_template=self._resource_template,
        job_id=job_id,
        job_type=STREAM_TO_ONLINE_JOB_TYPE,
        main_application_file=jar_s3_path,
        main_class=ingestion_job_params.get_class_name(),
        packages=[BQ_SPARK_PACKAGE],
        jars=extra_jar_paths,
        extra_metadata={METADATA_JOBHASH: job_hash},
        arguments=ingestion_job_params.get_arguments(),
        namespace=self._namespace,
    )

    job_info = _submit_job(
        api=self._api,
        resource=resource,
        namespace=self._namespace,
    )

    return cast(StreamIngestionJob, self._job_from_job_info(job_info))
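
# _generate_job_id is referenced but not shown. Kubernetes object names must
# be DNS-1123 compliant (lowercase alphanumerics and "-", at most 63 chars),
# so a short random suffix on a fixed prefix is the usual approach; the
# prefix and suffix length below are assumptions:
import uuid


def _generate_job_id() -> str:
    return "feast-" + uuid.uuid4().hex[:16]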

def start_stream_to_online_ingestion(
    self, ingestion_job_params: StreamIngestionJobParameters
) -> StreamIngestionJob:
    # Standalone variant without a job hash; a module-level dict serves as
    # the job cache.
    job_id = str(uuid.uuid4())
    ui_port = _find_free_port()
    job = StandaloneClusterStreamingIngestionJob(
        job_id,
        ingestion_job_params.get_name(),
        self.spark_submit(ingestion_job_params, ui_port),
        ui_port,
    )
    JOB_CACHE[job_id] = job
    return job

def start_stream_to_online_ingestion(
    self, ingestion_job_params: StreamIngestionJobParameters
) -> StreamIngestionJob:
    # Dataproc variant that also records the GCP project and region on the
    # returned job wrapper.
    job, refresh_fn, cancel_fn = self.dataproc_submit(ingestion_job_params, {})
    job_hash = ingestion_job_params.get_job_hash()
    return DataprocStreamingIngestionJob(
        job=job,
        refresh_fn=refresh_fn,
        cancel_fn=cancel_fn,
        project=self.project_id,
        region=self.region,
        job_hash=job_hash,
    )
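
# The refresh_fn/cancel_fn callbacks returned by dataproc_submit are not
# shown. A hedged sketch of what they could wrap, using the public
# google-cloud-dataproc JobControllerClient API (the factory function and
# its wiring are assumptions):
from google.cloud import dataproc_v1


def _make_refresh_and_cancel(
    job_client: dataproc_v1.JobControllerClient,
    project_id: str,
    region: str,
    job_id: str,
):
    def refresh_fn():
        # Re-fetch the job so callers observe its current status.
        return job_client.get_job(project_id=project_id, region=region, job_id=job_id)

    def cancel_fn():
        job_client.cancel_job(project_id=project_id, region=region, job_id=job_id)

    return refresh_fn, cancel_fn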

def get_stream_to_online_ingestion_params(
    client: "Client", project: str, feature_table: FeatureTable, extra_jars: List[str]
) -> StreamIngestionJobParameters:
    return StreamIngestionJobParameters(
        jar=client._config.get(opt.SPARK_INGESTION_JAR),
        extra_jars=extra_jars,
        source=_source_to_argument(feature_table.stream_source, client._config),
        feature_table=_feature_table_to_argument(client, project, feature_table),
        redis_host=client._config.get(opt.REDIS_HOST),
        redis_port=client._config.getint(opt.REDIS_PORT),
        redis_ssl=client._config.getboolean(opt.REDIS_SSL),
        statsd_host=(
            client._config.getboolean(opt.STATSD_ENABLED)
            and client._config.get(opt.STATSD_HOST)
        ),
        statsd_port=(
            client._config.getboolean(opt.STATSD_ENABLED)
            and client._config.getint(opt.STATSD_PORT)
        ),
        deadletter_path=client._config.get(opt.DEADLETTER_PATH),
        stencil_url=client._config.get(opt.STENCIL_URL),
    )
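
# The "getboolean(...) and get(...)" pattern above short-circuits: when
# STATSD_ENABLED is false the expression evaluates to False (metrics off),
# otherwise it evaluates to the configured host/port. Illustration:
statsd_enabled = False
statsd_host = statsd_enabled and "statsd.internal"  # -> False
statsd_enabled = True
statsd_host = statsd_enabled and "statsd.internal"  # -> "statsd.internal"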

def start_stream_to_online_ingestion(
    feature_table: FeatureTable, extra_jars: List[str], client: "Client"
) -> StreamIngestionJob:
    # Client-level entry point: resolves the configured launcher, downloads
    # the ingestion jar locally, and delegates submission to the launcher.
    launcher = resolve_launcher(client._config)
    local_jar_path = _download_jar(client._config.get(CONFIG_SPARK_INGESTION_JOB_JAR))
    return launcher.start_stream_to_online_ingestion(
        StreamIngestionJobParameters(
            jar=local_jar_path,
            extra_jars=extra_jars,
            source=_source_to_argument(feature_table.stream_source),
            feature_table=_feature_table_to_argument(client, feature_table),
            redis_host=client._config.get(CONFIG_REDIS_HOST),
            redis_port=client._config.getint(CONFIG_REDIS_PORT),
            redis_ssl=client._config.getboolean(CONFIG_REDIS_SSL),
        )
    )
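
# _download_jar is not shown; a minimal sketch assuming it fetches the
# ingestion jar from an http(s) URL into a temporary file and returns the
# local path (the real helper may support more URL schemes):
import shutil
import tempfile
import urllib.request


def _download_jar(remote_jar: str) -> str:
    with tempfile.NamedTemporaryFile(suffix=".jar", delete=False) as local_file:
        with urllib.request.urlopen(remote_jar) as response:
            shutil.copyfileobj(response, local_file)
        return local_file.name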

def start_stream_to_online_ingestion(
    client: "Client", project: str, feature_table: FeatureTable, extra_jars: List[str]
) -> StreamIngestionJob:
    # Client-level entry point that passes the jar location through as-is and
    # forwards statsd/deadletter/stencil settings from the client config.
    launcher = resolve_launcher(client._config)
    return launcher.start_stream_to_online_ingestion(
        StreamIngestionJobParameters(
            jar=client._config.get(CONFIG_SPARK_INGESTION_JOB_JAR),
            extra_jars=extra_jars,
            source=_source_to_argument(feature_table.stream_source),
            feature_table=_feature_table_to_argument(client, project, feature_table),
            redis_host=client._config.get(CONFIG_REDIS_HOST),
            redis_port=client._config.getint(CONFIG_REDIS_PORT),
            redis_ssl=client._config.getboolean(CONFIG_REDIS_SSL),
            statsd_host=(
                client._config.getboolean(CONFIG_STATSD_ENABLED)
                and client._config.get(CONFIG_STATSD_HOST)
            ),
            statsd_port=(
                client._config.getboolean(CONFIG_STATSD_ENABLED)
                and client._config.getint(CONFIG_STATSD_PORT)
            ),
            deadletter_path=client._config.get(CONFIG_DEADLETTER_PATH),
            stencil_url=client._config.get(CONFIG_STENCIL_URL),
        )
    )
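
# Hedged usage sketch: how a caller might start streaming ingestion for a
# registered feature table and inspect the returned job. Client construction,
# get_feature_table, and the job's get_id/get_status accessors are assumed
# from the surrounding SDK:
client = Client()  # reads feast config (launcher, redis, jar location, ...)
driver_trips = client.get_feature_table("driver_trips", project="default")
job = start_stream_to_online_ingestion(client, "default", driver_trips, extra_jars=[])
print(job.get_id(), job.get_status())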