Example #1
# Imports assumed for this snippet (a Feast 0.9-era SDK layout):
from datetime import timedelta

import pandas as pd
from feast import Client, FeatureTable
from feast.pyspark.abc import SparkJobStatus
from feast.wait import wait_retry_backoff


def ingest_and_verify(feast_client: Client, feature_table: FeatureTable,
                      original: pd.DataFrame):
    job = feast_client.start_offline_to_online_ingestion(
        feature_table,
        original.event_timestamp.min().to_pydatetime(),
        original.event_timestamp.max().to_pydatetime() + timedelta(seconds=1),
    )
    assert job.get_feature_table() == feature_table.name

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{
            "s2id": s2_id
        } for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", f"{feature_table.name}:unique_drivers"]],
        original[["s2id", "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"
                     }),
    )
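ingest_and_verify is a helper rather than a standalone test. A minimal sketch of a caller follows, assuming a hypothetical generate_data() fixture that returns a DataFrame with s2id, unique_drivers, and event_timestamp columns (the same shape the later examples use):

# Hypothetical caller; client, driver_table, and generate_data are
# assumed fixtures/helpers, not defined in the snippet above.
def test_offline_to_online_roundtrip(client, driver_table):
    sample = generate_data()
    client.ingest(driver_table, sample)  # write to offline (batch) storage
    ingest_and_verify(client, driver_table, sample)  # sync online and verify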
Example #2
# Additional imports assumed beyond Example #1:
from typing import Union

from feast import BigQuerySource, Entity, Feature, FileSource, ValueType
from feast_spark import Client as SparkClient


def test_list_jobs_long_table_name(
    feast_client: Client,
    feast_spark_client: SparkClient,
    batch_source: Union[BigQuerySource, FileSource],
):
    entity = Entity(name="long_entity_name" * 10,
                    description="S2id",
                    value_type=ValueType.INT64)

    feature_table = FeatureTable(
        name="just1a2featuretable3with4a5really6really7really8really9really10",
        entities=[entity.name],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=batch_source,
    )

    feast_client.apply(entity)
    feast_client.apply(feature_table)

    data_sample = generate_data().rename(columns={"s2id": entity.name})
    feast_client.ingest(feature_table, data_sample)

    job = feast_spark_client.start_offline_to_online_ingestion(
        feature_table,
        data_sample.event_timestamp.min().to_pydatetime(),
        data_sample.event_timestamp.max().to_pydatetime() +
        timedelta(seconds=1),
    )

    wait_retry_backoff(
        lambda: (None, job.get_status() == SparkJobStatus.COMPLETED), 180)
    all_job_ids = [
        job.get_id() for job in feast_spark_client.list_jobs(
            include_terminated=True,
            project=feast_client.project,
            table_name=feature_table.name,
        )
    ]
    assert job.get_id() in all_job_ids

    features = feast_client.get_online_features(
        [f"{feature_table.name}:unique_drivers"],
        entity_rows=[{
            entity.name: key
        } for key in data_sample[entity.name].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[[entity.name, f"{feature_table.name}:unique_drivers"]],
        data_sample[[entity.name, "unique_drivers"]].rename(
            columns={"unique_drivers": f"{feature_table.name}:unique_drivers"
                     }),
    )
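Both tests above poll the Spark job through wait_retry_backoff, whose retry function must return a (result, is_done) tuple; that is why these lambdas return (None, ...) when only completion matters, while Example #3 below returns the job status as the result. A minimal sketch of that contract, assuming a fixed polling interval (the real Feast helper backs off between attempts):

import time


def wait_retry_backoff_sketch(retry_fn, timeout_secs, interval_secs=2):
    # Poll retry_fn until it reports completion or the timeout expires.
    deadline = time.monotonic() + timeout_secs
    while True:
        result, is_done = retry_fn()  # (result, is_done) contract
        if is_done:
            return result
        if time.monotonic() >= deadline:
            raise TimeoutError("retry_fn did not complete in time")
        time.sleep(interval_secs)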
Example #3
# Additional imports assumed beyond the examples above:
import os
from datetime import datetime

from feast.data_format import ParquetFormat


def test_offline_ingestion(feast_client: Client, staging_path: str):
    entity = Entity(
        name="s2id",
        description="S2id",
        value_type=ValueType.INT64,
    )

    feature_table = FeatureTable(
        name="drivers",
        entities=["s2id"],
        features=[Feature("unique_drivers", ValueType.INT64)],
        batch_source=FileSource(
            "event_timestamp",  # event timestamp column
            "event_timestamp",  # created timestamp column
            ParquetFormat(),
            os.path.join(staging_path, "batch-storage"),
        ),
    )

    feast_client.apply_entity(entity)
    feast_client.apply_feature_table(feature_table)

    original = generate_data()
    feast_client.ingest(feature_table,
                        original)  # write to batch (offline) storage

    job = feast_client.start_offline_to_online_ingestion(
        feature_table, datetime.today(),
        datetime.today() + timedelta(days=1))

    status = wait_retry_backoff(
        lambda:
        (job.get_status(), job.get_status() != SparkJobStatus.IN_PROGRESS),
        300)

    assert status == SparkJobStatus.COMPLETED

    features = feast_client.get_online_features(
        ["drivers:unique_drivers"],
        entity_rows=[{
            "s2id": s2_id
        } for s2_id in original["s2id"].tolist()],
    ).to_dict()

    ingested = pd.DataFrame.from_dict(features)
    pd.testing.assert_frame_equal(
        ingested[["s2id", "drivers:unique_drivers"]],
        original[[
            "s2id", "unique_drivers"
        ]].rename(columns={"unique_drivers": "drivers:unique_drivers"}),
    )
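Examples #2 and #3 both call a generate_data() helper that is not shown. A plausible reconstruction follows, inferred only from the columns the tests read (s2id, unique_drivers, event_timestamp); the row count and value ranges here are arbitrary assumptions:

import numpy as np
import pandas as pd


def generate_data(rows: int = 10) -> pd.DataFrame:
    # Hypothetical helper: one row per s2 cell, with a driver count and an
    # event timestamp that falls inside the ingestion windows used above.
    return pd.DataFrame({
        "s2id": np.arange(rows, dtype=np.int64),
        "unique_drivers": np.random.randint(1, 1000, size=rows),
        "event_timestamp": pd.Timestamp.now().floor("s"),
    })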
Example #4
# Imports assumed for this snippet (a KFServing 0.5-era SDK):
import logging
from typing import Dict, List

import kfserving
from feast import Client


class DriverTransformer(kfserving.KFModel):
    """A transformer that handles the data pre- and post-processing for the
    driver ranking task and returns a KFServing-compatible response.

    Args:
        kfserving (class object): The KFModel class from the KFServing
            module is passed here.
    """
    def __init__(self, name: str,
                 predictor_host: str,
                 feast_serving_url: str,
                 entity_ids: List[str],
                 feature_refs: List[str]):
        """Initialize the model name, predictor host, Feast serving URL,
           entity IDs, and feature references

        Args:
            name (str): Name of the model.
            predictor_host (str): The host in which the predictor runs.
            feast_serving_url (str): The Feast serving URL, in the form
            of <host_name:port>
            entity_ids (List[str]): The entity IDs for which to retrieve
            features from the Feast feature store
            feature_refs (List[str]): The feature references for the
            features to be retrieved
        """
        super().__init__(name)
        self.predictor_host = predictor_host
        self.client = Client(serving_url=feast_serving_url)
        self.entity_ids = entity_ids
        self.feature_refs = feature_refs

        logging.info("Model name = %s", name)
        logging.info("Predictor host = %s", predictor_host)
        logging.info("Feast serving URL = %s", feast_serving_url)
        logging.info("Entity ids = %s", entity_ids)
        logging.info("Feature refs = %s", feature_refs)

        self.timeout = 100

    def buildEntityRow(self, instance) -> Dict:
        """Build an entity row and return it as a dict.

        Args:
            instance (list): entity id attributes to identify a unique entity

        Returns:
            Dict: Returns the entity id attributes as an entity row

        """
        # Pair each configured entity id name with its value in the instance.
        entity_row = dict(zip(self.entity_ids, instance))
        return entity_row

    def buildPredictRequest(self, inputs, features) -> Dict:
        """Build the predict request for all entitys and return it as a dict.

        Args:
            inputs (Dict): entity ids from KFServing http request
            features (Dict): entity features extracted from the feature store

        Returns:
            Dict: Returns the entity ids with features

        """
        request_data = []
        for i in range(len(inputs['instances'])):
            # Feature values first, in feature_refs order for this instance...
            entity_req = [
                features[self.feature_refs[j]][i]
                for j in range(len(self.feature_refs))
            ]
            # ...followed by the raw entity id values from the request.
            for j in range(len(self.entity_ids)):
                entity_req.append(inputs['instances'][i][j])
            request_data.append(entity_req)

        return {'instances': request_data}

    def preprocess(self, inputs: Dict) -> Dict:
        """Pre-process the driver input data.

        Args:
            inputs (Dict): The KFServing HTTP request.

        Returns:
            Dict: The request input enriched with online features.
        """

        entity_rows = [self.buildEntityRow(instance) for instance in inputs['instances']]
        features = self.client.get_online_features(feature_refs=self.feature_refs, entity_rows=entity_rows).to_dict()

        outputs = self.buildPredictRequest(inputs, features)

        logging.info("The input for model predict is %s", outputs)

        return outputs

    def postprocess(self, inputs: List) -> List:
        """Post-process the driver ranking output data. Here we simply pass
        the raw rankings through.

        Args:
            inputs (List): The raw rankings returned by the predictor.

        Returns:
            List: The rankings unchanged; a custom post-processing step could
            transform them into a different list here.
        """
        logging.info("The output from model predict is %s", inputs)

        return inputs
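To make the request and response shapes concrete, here is a wiring sketch for the transformer; every name, host, entity id, and feature reference below is a placeholder, not a value from a real deployment:

# Illustrative only; hosts, entity ids, and feature refs are placeholders.
transformer = DriverTransformer(
    name="driver-ranking",
    predictor_host="driver-ranking-predictor.default.svc.cluster.local",
    feast_serving_url="feast-serving.feast.svc.cluster.local:6566",
    entity_ids=["driver_id"],
    feature_refs=["driver_statistics:acc_rate",
                  "driver_statistics:avg_daily_trips"],
)

# A KFServing payload with one entity id value per instance; preprocess()
# looks up both features for each driver_id and hands the predictor rows of
# the form [acc_rate, avg_daily_trips, driver_id].
payload = {"instances": [[1001], [1002]]}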