def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    """Submits a historical feature retrieval job to an EMR cluster."""
    with open(job_params.get_main_file_path()) as f:
        pyspark_script = f.read()

    # Stage the PySpark driver script on S3 so EMR can fetch it.
    pyspark_script_path = _s3_upload(
        BytesIO(pyspark_script.encode("utf8")),
        local_path="historical_retrieval.py",
        remote_path_prefix=self._staging_location,
        remote_path_suffix=".py",
    )

    step = _historical_retrieval_step(
        pyspark_script_path, args=job_params.get_arguments()
    )

    job_ref = self._submit_emr_job(step)

    # Append a random suffix so concurrent jobs don't collide on the output path.
    return EmrRetrievalJob(
        self._emr_client(),
        job_ref,
        os.path.join(job_params.get_destination_path(), _random_string(8)),
    )
def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    """Submits a historical feature retrieval job to an EMR cluster."""
    with open(job_params.get_main_file_path()) as f:
        pyspark_script = f.read()

    # Stage the PySpark driver script via the pluggable staging client and
    # flatten the returned URL components into a plain URI string.
    pyspark_script_path = urlunparse(
        get_staging_client("s3").upload_fileobj(
            BytesIO(pyspark_script.encode("utf8")),
            local_path="historical_retrieval.py",
            remote_path_prefix=self._staging_location,
            remote_path_suffix=".py",
        )
    )

    # Unlike the variant above, the output URI is passed to the EMR step
    # itself rather than being derived with a random suffix afterwards.
    step = _historical_retrieval_step(
        pyspark_script_path,
        args=job_params.get_arguments(),
        output_file_uri=job_params.get_destination_path(),
    )

    job_ref = self._submit_emr_job(step)

    return EmrRetrievalJob(
        self._emr_client(), job_ref, job_params.get_destination_path()
    )
def historical_feature_retrieval( self, job_params: RetrievalJobParameters) -> RetrievalJob: """ Submits a historical feature retrieval job to a Spark cluster. Raises: SparkJobFailure: The spark job submission failed, encountered error during execution, or timeout. Returns: RetrievalJob: wrapper around remote job that returns file uri to the result file. """ with open(job_params.get_main_file_path()) as f: pyspark_script = f.read() pyspark_script_path = urlunparse( self._get_staging_client().upload_fileobj( BytesIO(pyspark_script.encode("utf8")), local_path="historical_retrieval.py", remote_path_prefix=self._staging_location, remote_path_suffix=".py", )) job_id = _generate_job_id() resource = _prepare_job_resource( job_template=self._resource_template, job_id=job_id, job_type=HISTORICAL_RETRIEVAL_JOB_TYPE, main_application_file=pyspark_script_path, main_class=None, packages=[], jars=[], extra_metadata={ METADATA_OUTPUT_URI: job_params.get_destination_path() }, arguments=job_params.get_arguments(), namespace=self._namespace, ) job_info = _submit_job( api=self._api, resource=resource, namespace=self._namespace, ) return cast(RetrievalJob, self._job_from_job_info(job_info))