Exemplo n.º 1
0
    def ingest(
        self,
        feature_set: Union[str, FeatureSet],
        source: Union[pd.DataFrame, str],
        chunk_size: int = 10000,
        version: int = None,
        force_update: bool = False,
        max_workers: int = max(CPU_COUNT - 1, 1),
        disable_progress_bar: bool = False,
        timeout: int = KAFKA_CHUNK_PRODUCTION_TIMEOUT,
    ) -> None:
        """
        Loads feature data into Feast for a specific feature set.

        Args:
            feature_set (typing.Union[str, feast.feature_set.FeatureSet]):
                Feature set object or the string name of the feature set
                (without a version).

            source (typing.Union[pd.DataFrame, str]):
                Either a file path or Pandas Dataframe to ingest into Feast
                Files that are currently supported:
                    * parquet
                    * csv
                    * json

            chunk_size (int):
                Amount of rows to load and ingest at a time.

            version (int):
                Feature set version.

            force_update (bool):
                Automatically update feature set based on source data prior to
                ingesting. This will also register changes to Feast.

            max_workers (int):
                Number of worker processes to use to encode values.

            disable_progress_bar (bool):
                Disable printing of progress statistics.

            timeout (int):
                Timeout in seconds to wait for completion.

        Returns:
            None:
                None
        """

        if isinstance(feature_set, FeatureSet):
            name = feature_set.name
            if version is None:
                version = feature_set.version
        elif isinstance(feature_set, str):
            name = feature_set
        else:
            raise Exception(f"Feature set name must be provided")

        # Read table and get row count
        dir_path, dest_path = _read_table_from_source(source, chunk_size,
                                                      max_workers)

        pq_file = pq.ParquetFile(dest_path)

        row_count = pq_file.metadata.num_rows

        # Update the feature set based on PyArrow table of first row group
        if force_update:
            feature_set.infer_fields_from_pa(
                table=pq_file.read_row_group(0),
                discard_unused_fields=True,
                replace_existing_features=True,
            )
            self.apply(feature_set)
        current_time = time.time()

        print("Waiting for feature set to be ready for ingestion...")
        while True:
            if timeout is not None and time.time() - current_time >= timeout:
                raise TimeoutError(
                    "Timed out waiting for feature set to be ready")
            feature_set = self.get_feature_set(name, version)
            if (feature_set is not None
                    and feature_set.status == FeatureSetStatus.STATUS_READY):
                break
            time.sleep(3)

        if timeout is not None:
            timeout = timeout - int(time.time() - current_time)

        try:
            # Kafka configs
            brokers = feature_set.get_kafka_source_brokers()
            topic = feature_set.get_kafka_source_topic()
            producer = get_producer(brokers, row_count, disable_progress_bar)

            # Loop optimization declarations
            produce = producer.produce
            flush = producer.flush

            # Transform and push data to Kafka
            if feature_set.source.source_type == "Kafka":
                for chunk in get_feature_row_chunks(
                        file=dest_path,
                        row_groups=list(range(pq_file.num_row_groups)),
                        fs=feature_set,
                        max_workers=max_workers,
                ):

                    # Push FeatureRow one chunk at a time to kafka
                    for serialized_row in chunk:
                        produce(topic=topic, value=serialized_row)

                    # Force a flush after each chunk
                    flush(timeout=timeout)

                    # Remove chunk from memory
                    del chunk

            else:
                raise Exception(
                    f"Could not determine source type for feature set "
                    f'"{feature_set.name}" with source type '
                    f'"{feature_set.source.source_type}"')

            # Print ingestion statistics
            producer.print_results()
        finally:
            # Remove parquet file(s) that were created earlier
            print("Removing temporary file(s)...")
            shutil.rmtree(dir_path)

        return None
Exemplo n.º 2
0
    def ingest(
        self,
        feature_set: Union[str, FeatureSet],
        source: Union[pd.DataFrame, str],
        chunk_size: int = 10000,
        max_workers: int = max(CPU_COUNT - 1, 1),
        disable_progress_bar: bool = False,
        timeout: int = KAFKA_CHUNK_PRODUCTION_TIMEOUT,
    ) -> str:
        """
        Loads feature data into Feast for a specific feature set.

        Args:
            feature_set (typing.Union[str, feast.feature_set.FeatureSet]):
                Feature set object or the string name of the feature set

            source (typing.Union[pd.DataFrame, str]):
                Either a file path or Pandas Dataframe to ingest into Feast
                Files that are currently supported:
                    * parquet
                    * csv
                    * json

            chunk_size (int):
                Amount of rows to load and ingest at a time.

            max_workers (int):
                Number of worker processes to use to encode values.

            disable_progress_bar (bool):
                Disable printing of progress statistics.

            timeout (int):
                Timeout in seconds to wait for completion.

        Returns:
            str:
                ingestion id for this dataset

        Examples:
            >>> from feast import Client
            >>>
            >>> client = Client(core_url="localhost:6565")
            >>> fs_df = pd.DataFrame(
            >>>         {
            >>>            "datetime": [pd.datetime.now()],
            >>>            "driver": [1001],
            >>>            "rating": [4.3],
            >>>         }
            >>>     )
            >>> client.set_project("project1")
            >>> client.ingest("driver", fs_df)
            >>>
            >>> driver_fs = client.get_feature_set(name="driver", project="project1")
            >>> client.ingest(driver_fs, fs_df)
        """

        if isinstance(feature_set, FeatureSet):
            name = feature_set.name
            project = feature_set.project
        elif isinstance(feature_set, str):
            if self.project is not None:
                project = self.project
            else:
                project = "default"
            name = feature_set
        else:
            raise Exception("Feature set name must be provided")

        # Read table and get row count
        dir_path, dest_path = _read_table_from_source(source, chunk_size,
                                                      max_workers)

        pq_file = pq.ParquetFile(dest_path)

        row_count = pq_file.metadata.num_rows

        current_time = time.time()

        print("Waiting for feature set to be ready for ingestion...")
        while True:
            if timeout is not None and time.time() - current_time >= timeout:
                raise TimeoutError(
                    "Timed out waiting for feature set to be ready")
            fetched_feature_set: Optional[FeatureSet] = self.get_feature_set(
                name, project)
            if (fetched_feature_set is not None and fetched_feature_set.status
                    == FeatureSetStatus.STATUS_READY):
                feature_set = fetched_feature_set
                break
            time.sleep(3)

        if timeout is not None:
            timeout = timeout - int(time.time() - current_time)

        try:
            # Kafka configs
            brokers = feature_set.get_kafka_source_brokers()
            topic = feature_set.get_kafka_source_topic()
            producer = get_producer(brokers, row_count, disable_progress_bar)

            # Loop optimization declarations
            produce = producer.produce
            flush = producer.flush
            ingestion_id = _generate_ingestion_id(feature_set)

            # Transform and push data to Kafka
            if feature_set.source.source_type == "Kafka":
                for chunk in get_feature_row_chunks(
                        file=dest_path,
                        row_groups=list(range(pq_file.num_row_groups)),
                        fs=feature_set,
                        ingestion_id=ingestion_id,
                        max_workers=max_workers,
                ):

                    # Push FeatureRow one chunk at a time to kafka
                    for serialized_row in chunk:
                        produce(topic=topic, value=serialized_row)

                    # Force a flush after each chunk
                    flush(timeout=timeout)

                    # Remove chunk from memory
                    del chunk

            else:
                raise Exception(
                    f"Could not determine source type for feature set "
                    f'"{feature_set.name}" with source type '
                    f'"{feature_set.source.source_type}"')

            # Print ingestion statistics
            producer.print_results()
        finally:
            # Remove parquet file(s) that were created earlier
            print("Removing temporary file(s)...")
            shutil.rmtree(dir_path)

        return ingestion_id