Example #1
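These snippets appear to be drawn from Feast's Spark job launchers (EMR, Dataproc, standalone cluster, and Kubernetes) and the helpers that start retrieval jobs: each submits a PySpark job for historical feature retrieval and wraps the handle in a RetrievalJob pointing at the destination path.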
    def historical_feature_retrieval(
            self, job_params: RetrievalJobParameters) -> RetrievalJob:

        # Stage the PySpark retrieval script to S3 so the EMR step can reference it.
        with open(job_params.get_main_file_path()) as f:
            pyspark_script = f.read()

        pyspark_script_path = urlunparse(
            get_staging_client("s3").upload_fileobj(
                BytesIO(pyspark_script.encode("utf8")),
                local_path="historical_retrieval.py",
                remote_path_prefix=self._staging_location,
                remote_path_suffix=".py",
            ))

        # Build the EMR step definition for the retrieval job and submit it to the cluster.
        step = _historical_retrieval_step(
            pyspark_script_path,
            args=job_params.get_arguments(),
            output_file_uri=job_params.get_destination_path(),
        )

        job_ref = self._submit_emr_job(step)

        return EmrRetrievalJob(
            self._emr_client(),
            job_ref,
            job_params.get_destination_path(),
        )
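For context, a caller would typically block on the returned job until the result files are written. A minimal caller-side sketch, assuming the EMR launcher above is available as launcher, that job_params was built elsewhere (see the RetrievalJobParameters examples further down), and that RetrievalJob exposes a blocking get_output_file_uri() accessor as suggested by the docstring in Example #6:

# Caller-side sketch; `launcher` and `job_params` are assumed to exist already.
job = launcher.historical_feature_retrieval(job_params)

# Assumed to block until the Spark job finishes and to raise SparkJobFailure
# on error or timeout, per the docstring in Example #6.
output_uri = job.get_output_file_uri(timeout_sec=1800)
print(f"historical features written to {output_uri}")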
Example #2
    def historical_feature_retrieval(
        self, job_params: RetrievalJobParameters
    ) -> RetrievalJob:

        with open(job_params.get_main_file_path()) as f:
            pyspark_script = f.read()

        pyspark_script_path = _s3_upload(
            BytesIO(pyspark_script.encode("utf8")),
            local_path="historical_retrieval.py",
            remote_path_prefix=self._staging_location,
            remote_path_suffix=".py",
        )

        step = _historical_retrieval_step(
            pyspark_script_path, args=job_params.get_arguments()
        )

        job_ref = self._submit_emr_job(step)

        return EmrRetrievalJob(
            self._emr_client(),
            job_ref,
            os.path.join(job_params.get_destination_path(), _random_string(8)),
        )
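This variant appends a random suffix (_random_string(8)) to the destination path, presumably so that repeated retrievals against the same base path do not overwrite one another's output.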
Example #3
    def historical_feature_retrieval(
            self, job_params: RetrievalJobParameters) -> RetrievalJob:
        job, refresh_fn, cancel_fn = self.dataproc_submit(
            job_params,
            {"dev.feast.outputuri": job_params.get_destination_path()})
        return DataprocRetrievalJob(job, refresh_fn, cancel_fn,
                                    job_params.get_destination_path())
Example #4
    def historical_feature_retrieval(
            self, job_params: RetrievalJobParameters) -> RetrievalJob:
        job_id = str(uuid.uuid4())
        return StandaloneClusterRetrievalJob(
            job_id,
            job_params.get_name(),
            self.spark_submit(job_params),
            job_params.get_destination_path(),
        )
Example #5
    def historical_feature_retrieval(
            self, job_params: RetrievalJobParameters) -> RetrievalJob:
        job, refresh_fn, cancel_fn = self.dataproc_submit(
            job_params,
            {"dev.feast.outputuri": job_params.get_destination_path()})
        return DataprocRetrievalJob(
            job=job,
            refresh_fn=refresh_fn,
            cancel_fn=cancel_fn,
            project=self.project_id,
            region=self.region,
            output_file_uri=job_params.get_destination_path(),
        )
Example #6
    def historical_feature_retrieval(
            self, job_params: RetrievalJobParameters) -> RetrievalJob:
        """
        Submits a historical feature retrieval job to a Spark cluster.

        Raises:
            SparkJobFailure: The Spark job submission failed, encountered an error
                during execution, or timed out.

        Returns:
            RetrievalJob: Wrapper around the remote job that exposes the file URI of the result.
        """

        with open(job_params.get_main_file_path()) as f:
            pyspark_script = f.read()

        pyspark_script_path = urlunparse(
            self._get_staging_client().upload_fileobj(
                BytesIO(pyspark_script.encode("utf8")),
                local_path="historical_retrieval.py",
                remote_path_prefix=self._staging_location,
                remote_path_suffix=".py",
            ))

        job_id = _generate_job_id()

        # Build the job resource from the template, recording the output URI as metadata.
        resource = _prepare_job_resource(
            job_template=self._resource_template,
            job_id=job_id,
            job_type=HISTORICAL_RETRIEVAL_JOB_TYPE,
            main_application_file=pyspark_script_path,
            main_class=None,
            packages=[],
            jars=[],
            extra_metadata={
                METADATA_OUTPUT_URI: job_params.get_destination_path()
            },
            arguments=job_params.get_arguments(),
            namespace=self._namespace,
        )

        job_info = _submit_job(
            api=self._api,
            resource=resource,
            namespace=self._namespace,
        )

        return cast(RetrievalJob, self._job_from_job_info(job_info))
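Note that the destination path is recorded as resource metadata (METADATA_OUTPUT_URI), presumably so the RetrievalJob reconstructed from job_info can report the output location without re-parsing the job arguments.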
Example #7
def start_historical_feature_retrieval_job(
    client: "Client",
    project: str,
    entity_source: Union[FileSource, BigQuerySource],
    feature_tables: List[FeatureTable],
    output_format: str,
    output_path: str,
) -> RetrievalJob:
    launcher = resolve_launcher(client._config)
    feature_sources = [
        _source_to_argument(
            replace_bq_table_with_joined_view(feature_table, entity_source),
            client._config,
        )
        for feature_table in feature_tables
    ]

    return launcher.historical_feature_retrieval(
        RetrievalJobParameters(
            entity_source=_source_to_argument(entity_source, client._config),
            feature_tables_sources=feature_sources,
            feature_tables=[
                _feature_table_to_argument(client, project, feature_table)
                for feature_table in feature_tables
            ],
            destination={"format": output_format, "path": output_path},
        )
    )
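A rough caller-side sketch of this entry point; the client variable, feature table name, and paths are illustrative placeholders rather than part of the original listing:

# Hypothetical wiring; names and paths are placeholders.
job = start_historical_feature_retrieval_job(
    client=feast_client,                     # an already-configured feast.Client
    project="default",
    entity_source=entity_source,             # FileSource/BigQuerySource holding the entity rows
    feature_tables=[customer_transactions],  # FeatureTable objects registered with the client
    output_format="parquet",
    output_path="s3://bucket/historical-output/",  # placeholder destination
)
output_uri = job.get_output_file_uri()       # assumed blocking accessor, as in the sketch after Example #1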
Example #8
def start_historical_feature_retrieval_job(
    client: "Client",
    project: str,
    entity_source: Union[FileSource, BigQuerySource],
    feature_tables: List[FeatureTable],
    output_format: str,
    output_path: str,
) -> RetrievalJob:
    launcher = resolve_launcher(client._config)
    feature_sources = [
        _source_to_argument(
            replace_bq_table_with_joined_view(feature_table, entity_source),
            client._config,
        ) for feature_table in feature_tables
    ]

    extra_packages = []
    if output_format == "tfrecord":
        extra_packages.append(
            "com.linkedin.sparktfrecord:spark-tfrecord_2.12:0.3.0")

    return launcher.historical_feature_retrieval(
        RetrievalJobParameters(
            entity_source=_source_to_argument(entity_source, client._config),
            feature_tables_sources=feature_sources,
            feature_tables=[
                _feature_table_to_argument(client, project, feature_table)
                for feature_table in feature_tables
            ],
            destination={
                "format": output_format,
                "path": output_path
            },
            extra_packages=extra_packages,
        ))
Example #9
def start_historical_feature_retrieval_job(
    client: "Client",
    entity_source: Union[FileSource, BigQuerySource],
    feature_tables: List[FeatureTable],
    output_format: str,
    output_path: str,
) -> RetrievalJob:
    launcher = resolve_launcher(client._config)
    return launcher.historical_feature_retrieval(
        RetrievalJobParameters(
            entity_source=_source_to_argument(entity_source),
            feature_tables_sources=[
                _source_to_argument(feature_table.batch_source)
                for feature_table in feature_tables
            ],
            feature_tables=[
                _feature_table_to_argument(client, feature_table)
                for feature_table in feature_tables
            ],
            destination={
                "format": output_format,
                "path": output_path
            },
            extra_options=client._config.get(CONFIG_SPARK_EXTRA_OPTIONS),
        ))
Example #10
def new_retrieval_job_params(
    entity_source_uri: str,
    feature_source_uri: str,
    destination_uri: str,
    output_format: str,
) -> RetrievalJobParameters:
    entity_source = {
        "file": {
            "format": {
                "json_class": "ParquetFormat"
            },
            "path": entity_source_uri,
            "event_timestamp_column": "event_timestamp",
        }
    }

    feature_tables_sources = [{
        "file": {
            "format": {
                "json_class": "ParquetFormat"
            },
            "path": feature_source_uri,
            "event_timestamp_column": "event_timestamp",
            "created_timestamp_column": "created_timestamp",
        }
    }]

    feature_tables = [{
        "name": "customer_transactions",
        "entities": [{"name": "customer", "type": "int64"}],
        "features": [{"name": "total_transactions", "type": "double"}],
    }]

    destination = {"format": output_format, "path": destination_uri}

    return RetrievalJobParameters(
        feature_tables=feature_tables,
        feature_tables_sources=feature_tables_sources,
        entity_source=entity_source,
        destination=destination,
        extra_packages=[
            "com.linkedin.sparktfrecord:spark-tfrecord_2.12:0.3.0"
        ],
    )
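A minimal sketch of how this helper might be wired to one of the launchers above; the launcher instance and the URIs are illustrative assumptions. Note that the hard-coded spark-tfrecord package matches the dependency Example #8 adds conditionally when output_format is "tfrecord":

# Hypothetical wiring; `launcher` is one of the EMR/Dataproc/standalone/Kubernetes
# launchers shown earlier, and the URIs are placeholders.
params = new_retrieval_job_params(
    entity_source_uri="s3://bucket/entities/",
    feature_source_uri="s3://bucket/customer_transactions/",
    destination_uri="s3://bucket/retrieval-output/",
    output_format="tfrecord",
)
job = launcher.historical_feature_retrieval(params)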
Example #11
def new_retrieval_job_params(
    entity_source_uri: str, feature_source_uri: str, destination_uri: str
) -> RetrievalJobParameters:
    entity_source = {
        "file": {
            "format": "parquet",
            "path": entity_source_uri,
            "event_timestamp_column": "event_timestamp",
        }
    }

    feature_tables_sources = [
        {
            "file": {
                "format": "parquet",
                "path": feature_source_uri,
                "event_timestamp_column": "event_timestamp",
                "created_timestamp_column": "created_timestamp",
            }
        }
    ]

    feature_tables = [
        {
            "name": "customer_transactions",
            "entities": [{"name": "customer", "type": "int32"}],
        }
    ]

    destination = {"format": "parquet", "path": destination_uri}

    return RetrievalJobParameters(
        feature_tables=feature_tables,
        feature_tables_sources=feature_tables_sources,
        entity_source=entity_source,
        destination=destination,
    )
Example #12
    def historical_feature_retrieval(
            self, job_params: RetrievalJobParameters) -> RetrievalJob:
        operation = self.dataproc_submit(job_params)
        cancel_fn = partial(self.dataproc_cancel, operation.metadata.job_id)
        return DataprocRetrievalJob(operation, cancel_fn,
                                    job_params.get_destination_path())
Example #13
    def historical_feature_retrieval(
            self, job_params: RetrievalJobParameters) -> RetrievalJob:
        return DataprocRetrievalJob(self.dataproc_submit(job_params),
                                    job_params.get_destination_path())