Example no. 1
def test_export_source_to_staging_location_s3_wildcard_as_source_should_pass(
    avro_data_path,
):
    s3_client = boto3.client("s3")
    s3_client.create_bucket(Bucket=BUCKET)
    with open(avro_data_path, "rb") as data:
        s3_client.upload_fileobj(data, BUCKET, f"{FOLDER_NAME}/file1.avro")
    with open(avro_data_path, "rb") as data:
        s3_client.upload_fileobj(data, BUCKET, f"{FOLDER_NAME}/file2.avro")
    sources = export_source_to_staging_location(f"{S3_LOCATION}/*", None)
    assert sources == [f"{S3_LOCATION}/file1.avro", f"{S3_LOCATION}/file2.avro"]
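The create_bucket and upload calls above only make sense against a mocked S3 endpoint, and the snippet depends on module-level constants and fixtures that are not shown. Below is a minimal sketch of what that surrounding setup might look like, assuming moto is used to fake S3; the constant values, fixture name, and DataFrame contents are illustrative assumptions, not the project's actual definitions.

# Hypothetical test-module setup (assumed; not part of the original example).
import pandas as pd
import pytest
from moto import mock_s3

BUCKET = "test-bucket"                        # assumed value
FOLDER_NAME = "entity-rows"                   # assumed value
S3_LOCATION = f"s3://{BUCKET}/{FOLDER_NAME}"
FILE_NAME = "part-0000.avro"                  # assumed value
LOCAL_FILE = "file:///tmp/staging"            # assumed value
TEST_DATA_FRAME = pd.DataFrame(
    {"driver_id": [1001, 1002], "rating": [4.3, 5.0]}  # illustrative contents
)


@pytest.fixture(autouse=True)
def fake_s3():
    # Route every boto3 S3 call in these tests to moto's in-memory backend.
    with mock_s3():
        yield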
Example no. 2
def test_export_source_to_staging_location_dataframe_to_s3_should_pass(get_file_name):
    s3_client = boto3.client("s3")
    s3_client.create_bucket(Bucket=BUCKET)
    source = export_source_to_staging_location(TEST_DATA_FRAME, S3_LOCATION)
    file_obj = tempfile.TemporaryFile()
    uri = urlparse(source[0])
    s3_client.download_fileobj(uri.hostname, uri.path[1:], file_obj)
    file_obj.seek(0)
    avro_reader = fastavro.reader(file_obj)
    retrieved_df = pd.DataFrame.from_records([r for r in avro_reader])
    assert_frame_equal(retrieved_df, TEST_DATA_FRAME, check_like=True)
    assert get_file_name.call_count == 1
Example no. 3
    def get_batch_features(
        self,
        feature_refs: List[str],
        entity_rows: Union[pd.DataFrame, str],
        default_project: str = None,
    ) -> RetrievalJob:
        """
        Retrieves historical features from a Feast Serving deployment.

        Args:
            feature_refs (List[str]):
                List of feature references that will be returned for each entity.
                Each feature reference should have the following format
                "project/feature:version".

            entity_rows (Union[pd.DataFrame, str]):
                Pandas dataframe containing entities and a 'datetime' column.
                Each entity in a feature set must be present as a column in this
                dataframe. The datetime column must contain timestamps in
                datetime64 format.
            default_project: Default project where feature values will be found.

        Returns:
            feast.job.RetrievalJob:
                Returns a retrieval job object that can be used to monitor
                retrieval progress asynchronously and to materialize the
                results.

        Examples:
            >>> from feast import Client
            >>> from datetime import datetime
            >>> import pandas as pd
            >>>
            >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566")
            >>> feature_refs = ["my_project/bookings_7d:1", "booking_14d"]
            >>> entity_rows = pd.DataFrame(
            >>>         {
            >>>            "datetime": [datetime.now() for _ in range(3)],
            >>>            "customer": [1001, 1002, 1003],
            >>>         }
            >>>     )
            >>> feature_retrieval_job = feast_client.get_batch_features(
            >>>     feature_refs, entity_rows, default_project="my_project")
            >>> df = feature_retrieval_job.to_dataframe()
            >>> print(df)
        """

        self._connect_serving()

        feature_references = _build_feature_references(
            feature_refs=feature_refs, default_project=default_project)

        # Retrieve serving information to determine store type and
        # staging location
        serving_info = self._serving_service_stub.GetFeastServingInfo(
            GetFeastServingInfoRequest(),
            timeout=self._config.getint(
                CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
        )  # type: GetFeastServingInfoResponse

        if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH:
            raise Exception(
                f'You are connected to a store "{self._serving_url}" which '
                f"does not support batch retrieval ")

        if isinstance(entity_rows, pd.DataFrame):
            # Pandas DataFrame detected

            # Remove timezone from datetime column
            if isinstance(entity_rows["datetime"].dtype,
                          pd.core.dtypes.dtypes.DatetimeTZDtype):
                entity_rows["datetime"] = pd.DatetimeIndex(
                    entity_rows["datetime"]).tz_localize(None)
        elif isinstance(entity_rows, str):
            # String based source
            if not entity_rows.endswith((".avro", "*")):
                raise Exception(
                    f"Only .avro and wildcard paths are accepted as entity_rows"
                )
        else:
            raise Exception(f"Only pandas.DataFrame and str types are allowed"
                            f" as entity_rows, but got {type(entity_rows)}.")

        # Export and upload entity row DataFrame to staging location
        # provided by Feast
        staged_files = export_source_to_staging_location(
            entity_rows, serving_info.job_staging_location)  # type: List[str]

        request = GetBatchFeaturesRequest(
            features=feature_references,
            dataset_source=DatasetSource(file_source=DatasetSource.FileSource(
                file_uris=staged_files,
                data_format=DataFormat.DATA_FORMAT_AVRO)),
        )

        # Retrieve Feast Job object to manage life cycle of retrieval
        response = self._serving_service_stub.GetBatchFeatures(request)
        return RetrievalJob(response.job, self._serving_service_stub)
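The docstring example above covers the DataFrame form of entity_rows. The str branch also accepts a single Avro file or a wildcard path over already-staged Avro files; a hedged usage sketch of that form follows, with the serving/core URLs, bucket path, and feature reference all placeholders rather than values from the source.

# Sketch of the file-based entity_rows form (paths and refs are placeholders).
from feast import Client

client = Client(core_url="localhost:6565", serving_url="localhost:6566")

# entity_rows may point at staged Avro files instead of a DataFrame:
# either a single ".avro" file or a wildcard over several of them.
job = client.get_batch_features(
    feature_refs=["my_project/bookings_7d:1"],
    entity_rows="gs://my-bucket/entity_rows/*",
    default_project="my_project",
)
df = job.to_dataframe()  # materialize the retrieved features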
Example no. 4
def test_export_source_to_staging_location_s3_file_as_source_should_pass():
    source = export_source_to_staging_location(S3_LOCATION, None)
    assert source == [S3_LOCATION]
Example no. 5
def test_export_source_to_staging_location_local_file_should_pass(
        get_file_name):
    source = export_source_to_staging_location(TEST_DATA_FRAME, LOCAL_FILE)
    assert source == [f"{LOCAL_FILE}/{FILE_NAME}"]
    assert get_file_name.call_count == 1
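Examples no. 2 and no. 5 assert on get_file_name.call_count, which implies get_file_name is a fixture that patches the generated Avro file name to a fixed value so the staged path can be compared exactly. A plausible sketch of such a fixture is shown below; the patch target and return value are assumptions about where the helper lives, not the project's actual code.

# Hypothetical fixture; the patch target is an assumed module path.
from unittest.mock import patch

import pytest


@pytest.fixture
def get_file_name():
    # Pin the randomly generated Avro file name to FILE_NAME so tests can
    # assert the exact staged path and how many times a name was requested.
    with patch("feast.loaders.file.get_file_name", return_value=FILE_NAME) as mocked:
        yield mocked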