Exemplo n.º 1
0
class CudfKafkaClient:
    def __init__(self, kafka_configs):
        """
        Base class for clients that communicate with a Kafka broker.

        Establishes the underlying KafkaDatasource connection that is
        used to consume messages from Kafka into cudf DataFrames. This
        class is not meant to be instantiated directly.

        Parameters
        ----------
        kafka_configs : dict,
            Key/Value pairs of librdkafka configuration values. The
            full list of valid options can be found at
            https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
        """

        self.kafka_configs = kafka_configs

        # librdkafka expects both keys and values as bytes.
        encoded_confs = {}
        for conf_key, conf_value in self.kafka_configs.items():
            encoded_confs[str.encode(conf_key)] = str.encode(conf_value)
        self.kafka_confs = encoded_confs

        self.kafka_meta_client = KafkaDatasource(self.kafka_confs)

    def list_topics(self, specific_topic=None):
        """
        List the topics known to the connected Kafka broker.

        Parameters
        ----------
        specific_topic : str,
            When given, metadata is retrieved only for this topic.
            Otherwise metadata for every topic on the broker is
            retrieved.
        """

        if specific_topic is None:
            topic_bytes = b""
        else:
            topic_bytes = specific_topic.encode()
        return self.kafka_meta_client.list_topics(topic_bytes)

    def unsubscribe(self):
        """
        Stop all active consumption and drop the consumer's
        topic/partition subscriptions.
        """

        self.kafka_meta_client.unsubscribe()

    def close(self, timeout=10000):
        """
        Close the underlying socket connection to Kafka and release
        system resources.
        """

        self.kafka_meta_client.close(timeout)
Exemplo n.º 2
0
    def __init__(self, kafka_configs):

        """
        Base class for clients that communicate with a Kafka broker.

        Establishes the underlying KafkaDatasource connection that is
        used to consume messages from Kafka into cudf DataFrames. This
        class is not meant to be instantiated directly.

        Parameters
        ----------
        kafka_configs : dict,
            Key/Value pairs of librdkafka configuration values. The
            full list of valid options can be found at
            https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
        """

        # Keep the raw configuration around and hand it directly to the
        # datasource; this variant performs no key/value encoding.
        self.kafka_configs = kafka_configs
        self.kafka_meta_client = KafkaDatasource(kafka_configs)
Exemplo n.º 3
0
class CudfKafkaClient:
    def __init__(self, kafka_configs):
        """
        Base class for any client interacting with a Kafka broker.

        Creates the underlying KafkaDatasource connection used to read
        data from Kafka and build cudf DataFrames. Do not instantiate
        this class directly.

        Parameters
        ----------
        kafka_configs : dict,
            Key/Value pairs of librdkafka configuration values. The
            full list of valid options can be found at
            https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
        """

        self.kafka_configs = kafka_configs

        # librdkafka expects both keys and values as bytes.
        self.kafka_confs = {
            str.encode(conf_key): str.encode(conf_value)
            for conf_key, conf_value in self.kafka_configs.items()
        }

        self.kafka_meta_client = KafkaDatasource(self.kafka_confs)

    def unsubscribe(self):
        """
        Stop all active consumption and drop the consumer's
        topic/partition subscriptions.
        """

        self.kafka_meta_client.unsubscribe()

    def close(self, timeout=10000):
        """
        Close the underlying socket connection to Kafka and release
        system resources.
        """

        self.kafka_meta_client.close(timeout)
Exemplo n.º 4
0
    def read_gdf(
        self,
        topic=None,
        partition=0,
        lines=True,
        start=0,
        end=0,
        batch_timeout=10000,
        delimiter="\n",
        message_format="json",
    ):

        """
        Read messages from the underlying KafkaDatasource connection and create
        a cudf Dataframe

        Parameters
        ----------
        topic : str,
            Name of the Kafka topic that the messages
            should be read from
        partition : int,
            Partition number on the specified topic that
            should be read from
        lines : {{ True, False }}, default True,
            Whether messages should be treated as individual lines
        start : int, default 0,
            The beginning offset that should be used when
            reading a range of messages
        end : int, default 0,
            The last offset that will be read when
            reading a range of messages
        batch_timeout : int, default 10000,
            Amount of time to wait on the
            reading of the messages from Kafka in Milliseconds
        delimiter : str, default "\n",
            If lines=True this is the delimiter that
            will be placed between all messages that are read from Kafka
        message_format : {{ 'avro', 'csv', 'json', 'orc', 'parquet' }},
        default 'json',
            Format of the messages that will be read from Kafka.
            This dictates which underlying cudf reader will be invoked to
            create the Dataframe.

        Returns
        -------
        DataFrame

        Raises
        ------
        ValueError
            If ``topic`` is not specified.
        KeyError
            If ``message_format`` is not one of the supported formats.
        """

        if topic is None:
            raise ValueError(
                "ERROR: You must specify the topic "
                "that you want to consume from"
            )

        cudf_readers = {
            "json": cudf.io.read_json,
            "csv": cudf.io.read_csv,
            "orc": cudf.io.read_orc,
            "avro": cudf.io.read_avro,
            "parquet": cudf.io.read_parquet,
        }

        # Resolve the reader before opening the datasource so that an
        # invalid message_format cannot leak an open Kafka connection.
        reader = cudf_readers[message_format]

        kafka_datasource = KafkaDatasource(
            self.kafka_configs,
            topic.encode(),
            partition,
            start,
            end,
            batch_timeout,
            delimiter.encode(),
        )

        try:
            # BUG FIX: pass the caller-supplied ``lines`` value through;
            # the previous code always forced lines=True, silently
            # ignoring the parameter.
            result = reader(kafka_datasource, engine="cudf", lines=lines)
        finally:
            # Close up the cudf datasource instance even if the reader
            # raised, so the consumer and socket are always released.
            # TODO: Ideally the C++ destructor should handle the
            # unsubscribe and closing the socket connection.
            kafka_datasource.unsubscribe()
            kafka_datasource.close(batch_timeout)

        if result is None:
            # empty Dataframe
            return cudf.DataFrame()
        if isinstance(result, cudf.DataFrame):
            return result
        return cudf.DataFrame._from_data(result)