def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    """
    Submits a historical feature retrieval job to a Spark cluster.

    Raises:
        SparkJobFailure: The spark job submission failed, encountered error
            during execution, or timeout.

    Returns:
        RetrievalJob: wrapper around remote job that returns file uri to the result file.
    """
    with open(job_params.get_main_file_path()) as f:
        pyspark_script = f.read()

    # Stage the retrieval script to S3 so the EMR step can reference it.
    pyspark_script_path = urlunparse(
        get_staging_client("s3").upload_fileobj(
            BytesIO(pyspark_script.encode("utf8")),
            local_path="historical_retrieval.py",
            remote_path_prefix=self._staging_location,
            remote_path_suffix=".py",
        )
    )

    step = _historical_retrieval_step(
        pyspark_script_path,
        args=job_params.get_arguments(),
        output_file_uri=job_params.get_destination_path(),
        packages=job_params.get_extra_packages(),
    )

    job_ref = self._submit_emr_job(step)

    return EmrRetrievalJob(
        self._emr_client(),
        job_ref,
        job_params.get_destination_path(),
    )
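# --- Illustrative sketch: what _historical_retrieval_step might build ---
# The helper's real body is not shown above, so this is only a plausible
# reconstruction following the standard EMR pattern of running spark-submit
# through command-runner.jar; the exact step shape and the handling of
# output_file_uri are assumptions.
from typing import Dict, List, Optional


def _historical_retrieval_step(
    pyspark_script_path: str,
    args: List[str],
    output_file_uri: str,
    packages: Optional[List[str]] = None,
) -> Dict:
    # output_file_uri is presumably recorded in the step's metadata/tags in
    # the real helper so the returned job wrapper can locate the result file
    # (assumption); it is accepted here only to mirror the call site.
    submit_args = ["spark-submit", "--deploy-mode", "cluster"]
    if packages:
        submit_args += ["--packages", ",".join(packages)]
    return {
        "Name": "Historical Retrieval",
        "ActionOnFailure": "CONTINUE",  # keep the cluster alive if the step fails
        "HadoopJarStep": {
            "Jar": "command-runner.jar",
            "Args": submit_args + [pyspark_script_path] + args,
        },
    }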
def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    """
    Submits a historical feature retrieval job to a Spark cluster.

    Raises:
        SparkJobFailure: The spark job submission failed, encountered error
            during execution, or timeout.

    Returns:
        RetrievalJob: wrapper around remote job that returns file uri to the result file.
    """
    main_file = self._datalake.upload_file(job_params.get_main_file_path())
    job_info = _submit_job(
        self._api,
        "Historical-Retrieval",
        main_file,
        arguments=job_params.get_arguments(),
        tags={
            LABEL_JOBTYPE: HISTORICAL_RETRIEVAL_JOB_TYPE,
            METADATA_OUTPUT_URI: job_params.get_destination_path(),
        },
    )
    return cast(RetrievalJob, self._job_from_job_info(job_info))
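# --- Purely hypothetical sketch of the _submit_job helper used above ---
# The real client API behind self._api is not shown, so every name here
# (JobInfo, submit_spark_batch_job) is an assumption, included only to make
# the flow of main_file / arguments / tags concrete.
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class JobInfo:
    job_id: str
    tags: Dict[str, str] = field(default_factory=dict)


def _submit_job(
    api, job_name: str, main_file: str, arguments: List[str], tags: Dict[str, str]
) -> JobInfo:
    # Submit a Spark batch job through the cluster API and keep the metadata
    # tags (job type, output URI) alongside the job id so the launcher can
    # later reconstruct a RetrievalJob wrapper from this JobInfo.
    batch = api.submit_spark_batch_job(
        name=job_name, file=main_file, arguments=arguments, tags=tags
    )
    return JobInfo(job_id=str(batch.id), tags=tags)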
def historical_feature_retrieval(
    self, job_params: RetrievalJobParameters
) -> RetrievalJob:
    """
    Submits a historical feature retrieval job to a Spark cluster.

    Raises:
        SparkJobFailure: The spark job submission failed, encountered error
            during execution, or timeout.

    Returns:
        RetrievalJob: wrapper around remote job that returns file uri to the result file.
    """
    with open(job_params.get_main_file_path()) as f:
        pyspark_script = f.read()

    pyspark_script_path = urlunparse(
        self._staging_client.upload_fileobj(
            BytesIO(pyspark_script.encode("utf8")),
            local_path="historical_retrieval.py",
            remote_path_prefix=self._staging_location,
            remote_path_suffix=".py",
        )
    )

    job_id = _generate_job_id()

    resource = _prepare_job_resource(
        job_template=self._historical_retrieval_template,
        job_id=job_id,
        job_type=HISTORICAL_RETRIEVAL_JOB_TYPE,
        main_application_file=pyspark_script_path,
        main_class=None,
        packages=[],
        jars=[],
        extra_metadata={METADATA_OUTPUT_URI: job_params.get_destination_path()},
        azure_credentials=self._get_azure_credentials(),
        arguments=job_params.get_arguments(),
        namespace=self._namespace,
    )

    job_info = _submit_job(
        api=self._api,
        resource=resource,
        namespace=self._namespace,
    )

    return cast(RetrievalJob, self._job_from_job_info(job_info))
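# --- Usage sketch ---
# A minimal, illustrative example of driving any of the three launchers
# above. The construction of `launcher` and `job_params` and the
# `get_output_file_uri` blocking call are assumptions based on the
# signatures and docstrings shown here, not a confirmed client API.
def run_historical_retrieval(launcher, job_params: RetrievalJobParameters) -> str:
    # Submit the job; every launcher returns a RetrievalJob wrapper around
    # the remote Spark job.
    job = launcher.historical_feature_retrieval(job_params)
    # Hypothetical blocking call: wait for the remote job to finish and
    # return the URI of the staged result file, raising SparkJobFailure on
    # error or timeout.
    return job.get_output_file_uri(timeout_sec=3600)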