def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should throw a RuntimeError
    """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.subscribe(['test'])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unsubscribe()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.poll()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.consume()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assign([TopicPartition('test', 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unassign()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assignment()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.commit()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.committed([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.position([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.seek([TopicPartition("test", 0, 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        lo, hi = c.get_watermark_offsets(TopicPartition("test", 0))
    assert 'Consumer closed' == str(ex.value)
def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should throw a RuntimeError
    """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.subscribe(['test'])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.unsubscribe()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.poll()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.consume()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.assign([TopicPartition('test', 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.unassign()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.assignment()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.commit()
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.committed([TopicPartition("test", 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.position([TopicPartition("test", 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.seek([TopicPartition("test", 0, 0)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        lo, hi = c.get_watermark_offsets(TopicPartition("test", 0))
    assert ex.match('Consumer closed')
Example #3
class KafkaQueryConsumer:
    """
    Wraps Kafka library consumer methods which query the
    broker for metadata and poll for single messages.
    It is a thin wrapper but allows a fake to be used
    in unit tests.
    """
    def __init__(self, broker: str):
        # Set "enable.auto.commit" to False, as we do not need to report to the
        # Kafka broker how far we got (this is usually done in case of a
        # crash, but we simply restart the process and go and find the last
        # run_start message).
        #
        # Set "queued.min.messages" to 1 as we will consume backwards through
        # the partition one message at a time; we do not want to retrieve
        # multiple messages in the forward direction each time we step
        # backwards by 1 offset
        conf = {
            "bootstrap.servers": broker,
            "group.id": "consumer_group_name",
            "auto.offset.reset": "latest",
            "enable.auto.commit": False,
            "queued.min.messages": 1
        }
        self._consumer = Consumer(**conf)

    def get_topic_partitions(self, topic: str, offset: int = -1):
        metadata = self._consumer.list_topics(topic)
        return [
            TopicPartition(topic, partition[1].id, offset=offset)
            for partition in metadata.topics[topic].partitions.items()
        ]

    def seek(self, partition: TopicPartition):
        """
        Set offset in partition, the consumer will seek to that offset
        """
        self._consumer.seek(partition)

    def poll(self, timeout=2.):
        """
        Poll for a message from Kafka
        """
        return self._consumer.poll(timeout=timeout)

    def get_watermark_offsets(self,
                              partition: TopicPartition) -> Tuple[int, int]:
        """
        Get the offset of the first and last available
        message in the given partition
        """
        return self._consumer.get_watermark_offsets(partition, cached=False)

    def assign(self, partitions: List[TopicPartition]):
        self._consumer.assign(partitions)

    def offsets_for_times(self, partitions: List[TopicPartition]):
        return self._consumer.offsets_for_times(partitions)
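
# A minimal sketch (not part of the original module) showing how this wrapper can
# step backwards through a partition one message at a time, which is what the
# "queued.min.messages" comment in __init__ describes. The helper name and the
# `predicate` callable are illustrative assumptions.
def find_last_matching_message(consumer: KafkaQueryConsumer, topic: str, predicate):
    partition = consumer.get_topic_partitions(topic)[0]
    low, high = consumer.get_watermark_offsets(partition)
    partition.offset = high - 1
    consumer.assign([partition])
    # Step backwards from the newest message until the predicate matches.
    for offset in range(high - 1, low - 1, -1):
        partition.offset = offset
        consumer.seek(partition)
        message = consumer.poll(timeout=2.)
        if message is not None and not message.error() and predicate(message):
            return message
    return None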
Example #4
def morning_notice():
    # Each stock gets its own topic whose partitions hold different futu data streams: partition 0 stores snapshots fetched from futu, partition 1 stores futu real-time quotes, partition 2 stores futu real-time candlesticks (K-lines), partition 3 stores futu real-time time-sharing data,
    # partition 4 stores futu real-time tick-by-tick trades, partition 5 stores futu real-time order book data, partition 6 stores futu real-time broker queue data, and partitions 7-9 are currently unused
    consumer = Consumer({
        'bootstrap.servers': 'kafka01',
        'group.id': 'test',
        'enable.auto.commit': False,
        'default.topic.config': {
            'auto.offset.reset': 'largest'
        }
    })

    (rise_ratio_list_smallest,
     rise_ratio_list_largest) = consumer.get_watermark_offsets(
         TopicPartition('test', 0))
    (volume_list_smallest,
     volume_list_largest) = consumer.get_watermark_offsets(
         TopicPartition('test', 1))
    try:
        consumer.assign(
            [TopicPartition('test', 0, rise_ratio_list_largest - 1)])
        consumer.seek(TopicPartition('test', 0, rise_ratio_list_largest - 1))
        print(consumer.position([TopicPartition('test', 0)]))
        print(consumer.position([TopicPartition('test', 1)]))
        latest_rise_ratio = consumer.poll(3.0)
        print(consumer.position([TopicPartition('test', 0)]))
        print(consumer.position([TopicPartition('test', 1)]))

        print(latest_rise_ratio)
        consumer.assign([TopicPartition('test', 1, volume_list_largest - 1)])
        consumer.seek(TopicPartition('test', 1, volume_list_largest - 1))
        print(consumer.position([TopicPartition('test', 0)]))
        print(consumer.position([TopicPartition('test', 1)]))
        latest_volume = consumer.poll(3.0).value()
        print(consumer.position([TopicPartition('test', 0)]))
        print(consumer.position([TopicPartition('test', 1)]))
        print(latest_volume)
    finally:
        consumer.close()
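
# The pattern above (look up the high watermark, assign/seek to high - 1, then
# poll) can be factored into a small helper. This is an illustrative sketch, not
# part of the original example; `read_latest_message` is a hypothetical name.
def read_latest_message(consumer, topic, partition, timeout=3.0):
    low, high = consumer.get_watermark_offsets(TopicPartition(topic, partition))
    if high <= low:
        return None  # the partition is empty
    tp = TopicPartition(topic, partition, high - 1)
    consumer.assign([tp])
    consumer.seek(tp)
    return consumer.poll(timeout)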
Example #5
class KafkaConsumer:
    def __init__(self, logger, cfg, influxdb_client, email_notification):
        """Class constructor

        Args:
            logger (TimedRotatingLogger): logger
            cfg (dict): parameter dictionary
            influxdb_client (InfluxBDProducer): object for writing log entries to InfluxDB
            email_notification (EmailNotification): object for sending email notifications

        """
        self.logger = logger
        self.cfg = cfg
        self.influxdb_client = influxdb_client
        self.email_notification = email_notification
        self.consumer = Consumer(self.cfg['kafka_broker']['consumer_config'])
        self.consumer.subscribe(self.cfg['kafka_broker']['consumer_topic'])

    def receive_message(self):
        """Read and return a single message from the queue with all its attributes

        Returns:
            (message): a message from the queue with all its attributes

        """
        try:
            msg = self.consumer.poll(self.cfg['kafka_broker']['poll_timeout'])

            if msg and not msg.error():
                return msg
            elif msg and msg.error().code() == KafkaError._PARTITION_EOF:
                self.logger.warning(
                    "{0} {1} reached end at offset {2}\n".format(
                        msg.topic(), msg.partition(), msg.offset()))
            elif msg and msg.error():
                raise KafkaException(msg.error())
            elif not msg:
                self.logger.warning('No more messages, end of the queue')
            else:
                self.logger.warning('Something new (unexpected turn)')
        except KafkaError as kf:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.logger.error("KafkaError\n{0}\n{1}".format(
                kf, traceback.extract_tb(exc_traceback)))
            self.influxdb_client.write_error(module="KAFKA_CONSUMER")
            self.email_notification.send_error_notification()
            self.consumer.close()
            sys.exit(1)
        except KafkaException as ke:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.logger.error("KafkaException\n{0}\n{1}".format(
                ke, traceback.extract_tb(exc_traceback)))
            self.influxdb_client.write_error(module="KAFKA_CONSUMER")
            self.email_notification.send_error_notification()
            self.consumer.close()
            sys.exit(1)

    def seek_message(self, partition, offset):
        """Reset the offset in a specific partition when message processing fails,
        so that the message can be processed again

        Args:
            partition (int): the partition in which to reset the offset
            offset (int): the offset to reset the marker to

        """
        try:
            topic_partition = TopicPartition(
                self.cfg['kafka_broker']['consumer_topic'][0], partition,
                offset)

            self.consumer.seek(topic_partition)

        except KafkaError as kf:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.logger.error("KafkaError\n{0}\n{1}".format(
                kf, traceback.extract_tb(exc_traceback)))
            self.influxdb_client.write_error(module="KAFKA_CONSUMER")
            self.email_notification.send_error_notification()
            self.consumer.close()
            sys.exit(1)
        except KafkaException as ke:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.logger.error("KafkaException\n{0}\n{1}".format(
                ke, traceback.extract_tb(exc_traceback)))
            self.influxdb_client.write_error(module="KAFKA_CONSUMER")
            self.email_notification.send_error_notification()
            self.consumer.close()
            sys.exit(1)
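
# Illustrative usage sketch (not part of the original class): read messages and,
# if processing fails, rewind the partition to the failed message's offset so it
# is delivered again on the next poll. `process` is a hypothetical callable.
def consume_forever(kafka_consumer, process):
    while True:
        msg = kafka_consumer.receive_message()
        if msg is None:
            continue
        try:
            process(msg.value())
        except Exception:
            # Re-deliver this message on a later poll.
            kafka_consumer.seek_message(msg.partition(), msg.offset())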
Example #6
class CRecommender(BatchProc):
    def __init__(self, batch_size, proc_id, proc_num):
        super(CRecommender, self).__init__("recommender", 4)
        self.firelinker = {  # data the current step hands to the next step
            "result": "Usurvive"  # initialized to "success"
        }
        self.abyss = {  # counters for exceptions and defects
            "Usurvive": 0,
            "URdead": 0,
            "UPoison": 0,
            "UPlay": 0
        }
        self.batch_size = batch_size
        self.proc_id = proc_id
        self.proc_num = proc_num

        self.latest_articles = {}
        self.earliest_user_log = ""
        self.kafka_consumer = None
        self.user_history = {}  # keyed by user ID; value is the user's history list
        self.user_recommend = {}
        self.topic_vectors = np.load(topic_vector_path)

    def init_kafka_consumer(self, items):
        for item in items:
            self.user_history[item["Fuser_id"]] = []

        self.kafka_consumer = Consumer({
            'bootstrap.servers': 'localhost',
            'group.id': 'mygroup',
            'default.topic.config': {
                'auto.offset.reset': 'largest'
            }
        })
        self.kafka_consumer.subscribe(list(self.user_history.keys()))
        self.kafka_consumer.poll(timeout=1.0)

    def __reset_monitor(self):
        self.firelinker = {  # data the current step hands to the next step
            "result": "Usurvive"  # initialized to "success"
        }
        self.abyss = {  # counters for exceptions and defects
            "Usurvive": 0,
            "URdead": 0,
            "UPoison": 0,
            "UPlay": 0
        }
        self.kafka_consumer.close()
        self.kafka_consumer = None
        self.user_history = {}
        self.user_recommend = {}

    def __bonfire(self):
        result = self.firelinker["result"]
        self.abyss[result] += 1
        self.abyss["UPlay"] += 1
        self.firelinker = {"result": "Usurvive"}

    def __failsafe(self):
        self._db.rollback()
        self.firelinker["result"] = "URdead"
        self.logger.log_info(traceback.format_exc())

    # Log the result of each processed item
    def gen_single_report(self, item):
        content_id = item['Fauto_id']
        origin_url = item['Fh5_url']
        result = self.firelinker["result"]
        self.logger.log_info("item with id %s,origin url %s, result:%s" %
                             (content_id, origin_url, result))

    # Log the result of each processed batch
    def gen_batch_report(self):
        UPlay = self.abyss["UPlay"]
        UPoison = self.abyss["UPoison"]
        URdead = self.abyss["URdead"]
        Usurvive = self.abyss["Usurvive"]

        if UPlay != 0:
            self.logger.log_info("You played %s times, survive %s times, \
            poisoned %s times, died %s times.\n \
                survival rate: %s, poison rate: %s, death rate: %s."\
                %(UPlay, Usurvive, UPoison, URdead, \
                    Usurvive/(UPlay), UPoison/UPlay, URdead/UPlay))
        else:
            self.logger.log_info(
                "You processed zero content, please check your Sql")

    def load_users_log(self, items):
        for item in items:
            try:
                user_number = item["Fuser_id"]
                self.kafka_consumer.seek(TopicPartition(
                    str(user_number), 0, 0))  # partition 0, offset 0
                while True:
                    article_id_set = set()
                    msg = self.kafka_consumer.poll(timeout=1.0)
                    if msg is None:
                        break
                    elif msg.error():
                        self.logger.log_info(msg.error())
                        break
                    else:
                        # filtering logic below
                        history_dict = json.loads(msg.value().decode())
                        action_date = datetime.strptime(
                            history_dict["action_date"], "%Y-%m-%d %H:%M:%S")
                        if action_date > self.earliest_user_log:
                            article_id = history_dict["article_id"]
                            if article_id not in article_id_set:
                                # article_cluster_id and article_vector are filled in later by load_articles_hidden
                                history_tuple = (action_date, article_id)
                                self.user_history[user_number].append(
                                    history_tuple)
                                article_id_set.add(article_id)
                        else:
                            break
            except Exception as e:
                self.logger.log_error(str(e))

    def load_articles_hidden(self):  # note: may raise if an article's hidden info cannot be found
        article_id_set = set()
        for user_log in self.user_history.values():
            article_id_set.update([log_item[1] for log_item in user_log])  # log_item[1] is the article ID

        if len(article_id_set) > 1:
            id_range_str = format(tuple(article_id_set))
            where = "Fauto_id in %s" % (id_range_str)
        elif len(article_id_set) == 1:
            id_str = list(article_id_set)[0]
            where = "Fauto_id = %s" % (id_str)
        else:
            return

        field_list = ["Fauto_id", "Fcluster_id", "Farticle_vector"]
        self._db.set_db_table("db_hiddens", "t_job_documents_hidden")
        DB_res = self._db.query(field_list, where)
        article_info_tmp = {}
        for res in DB_res:
            article_id = res["Fauto_id"]
            article_cluster_id = res["Fcluster_id"]
            article_vector = res["Farticle_vector"]
            article_info_tmp[article_id] = (article_cluster_id, article_vector)

        for user_id, history_tuple_list in self.user_history.items():
            for idx in range(len(history_tuple_list)):
                article_time = history_tuple_list[idx][0]
                article_id = history_tuple_list[idx][1]
                article_id = int(article_id)
                article_cluster_id = article_info_tmp[article_id][0]
                article_vector = article_info_tmp[article_id][1]
                new_history_tuple = (article_time, article_id,
                                     article_cluster_id, article_vector)
                self.user_history[user_id][idx] = new_history_tuple

    def classify_user(self, history):
        cluster_dict = {}
        self.firelinker["favored_clusters"] = []
        self.firelinker["favored_articles"] = []  # e.g. [(1, 20180516), (2, 20190618)]
        for action in history:
            action_date = action[0]
            article_id = action[1]
            article_cluster_id = action[2]
            article_vector = np.array(
                [float(elem) for elem in json.loads(action[3])])
            self.firelinker["favored_articles"].append(
                (article_id, article_cluster_id, article_vector, action_date))
            map_reduce(cluster_dict, article_cluster_id)

        for key, value in cluster_dict.items():
            if value >= FAVORED_THRESH:
                self.firelinker["favored_clusters"].append(key)

    def gen_simm_article(self):
        raw_result = {}
        for article_info in self.firelinker["favored_articles"]:
            article_id = article_info[0]
            article_cluster_id = article_info[1]
            article_vector = article_info[2]
            action_date = article_info[3]
            article_comp_list = self.latest_articles.get(
                article_cluster_id, [])
            article_candidates = []

            for article_comp in article_comp_list:
                article_comp_id = article_comp[0]
                article_comp_time = article_comp[1]
                article_comp_vector = article_comp[2]

                if article_id != article_comp_id:
                    dot_product = np.dot(article_vector, article_comp_vector)
                    norm_product = LA.norm(article_vector) * LA.norm(
                        article_comp_vector)
                    simm = (dot_product / norm_product)
                    if simm > SIMM_THRESH:
                        article_candidates.append(
                            (article_comp_id, article_comp_time))
            raw_result[article_id] = article_candidates
        self.firelinker["simm_articles"] = raw_result

    def gen_cluster_article(self):
        raw_result = {}
        for cluster_id in self.firelinker["favored_clusters"]:
            raw_result[cluster_id] = [
                (article_info[0], article_info[1])
                for article_info in self.latest_articles[cluster_id]
            ]
        self.firelinker["cluster_articles"] = raw_result

    def gen_other_article(self):
        raw_result = {}
        for cluster_id in self.latest_articles.keys():
            article_info_list = self.latest_articles[cluster_id]
            raw_result[cluster_id] = [(article_info[0], article_info[1])
                                      for article_info in article_info_list]  # 10 articles per cluster
        self.firelinker["other_articles"] = raw_result

    def gen_random_article(self):
        simm_articles = self.firelinker["simm_articles"]
        cluster_articles = self.firelinker["cluster_articles"]
        other_articles = self.firelinker["other_articles"]

        other_articles_new = {}
        article_id_set = set([])
        for article_id, article_list in simm_articles.items():
            simm_articles[article_id], article_id_set = intercept_article_list(
                article_list, 3, article_id_set)

        for cluster_id, article_list in cluster_articles.items():
            cluster_articles[
                cluster_id], article_id_set = intercept_article_list(
                    article_list, 4, article_id_set)

        if len(other_articles.keys()) > 10:
            cluster_id_list = random.sample(other_articles.keys(), 10)
        else:
            cluster_id_list = list(other_articles.keys())
        for cluster_id in cluster_id_list:
            other_articles_new[
                cluster_id], article_id_set = intercept_article_list(
                    other_articles[cluster_id], 2, article_id_set)

        self.firelinker["other_articles"] = other_articles_new

    def gen_result(self, user_id):
        temp_dict = {}
        simm_articles = self.firelinker["simm_articles"]
        cluster_articles = self.firelinker["cluster_articles"]
        other_articles = self.firelinker["other_articles"]

        article_id_list = []
        for key, value in simm_articles.items():
            article_id_list.append(key)
            for article in value:
                article_id_list.append(article[0])
        for value in cluster_articles.values():
            for article in value:
                article_id_list.append(article[0])
        for value in other_articles.values():
            for article in value:
                article_id_list.append(article[0])
        article_id_list = list(set(article_id_list))

        self.user_recommend[user_id] = ','.join(
            [str(article_id) for article_id in article_id_list])

        # For demo purposes only; remember to remove before release
        in_string = format(tuple(article_id_list))
        self._db.set_db_table("db_documents", "t_job_documents")
        where = "Fauto_id in %s" % (in_string)
        field_list = ["Fauto_id", "Ftitle", "Fh5_url", "Fsummary"]
        DB_res = self._db.query(field_list, where)
        self._db.commit()

        for res in DB_res:
            temp_dict[res["Fauto_id"]] = res["Ftitle"]

        show_dict = {}
        for docid, value in simm_articles.items():
            new_key = temp_dict[docid]
            print("--------------------------------")
            print("User applied for job: %s" % (new_key))
            for article in value:
                new_value = temp_dict[article[0]]
                print("Recommended job for the user: %s" % (new_value))

        for cluster_id, value in cluster_articles.items():
            print("--------------------------------")
            print("Based on the user's interest in cluster %s, the following jobs are recommended" % (cluster_id))
            for article in value:
                new_value = temp_dict[article[0]]
                print("%s" % (new_value))
        '''
        print("--------------------------------")
        print("Articles from other fields recommended to the user")
        for value in other_articles.values():
            for article in value:
                new_value = temp_dict[article[0]]
                print("%s"%(new_value))
        '''

    def process_user(self, user_id, history):
        try:
            self.classify_user(history)
            self.gen_simm_article()
            self.gen_cluster_article()
            self.gen_other_article()
            self.gen_random_article()
            self.gen_result(user_id)
        except Exception as e:
            self.__failsafe()
        finally:
            # self.gen_single_report(item)
            self.__bonfire()

    def update_user_recmmend(self):
        self._db.set_db_table('db_users', 't_user_recommends')
        field_list = ['Fuser_id', 'Frec_articles', 'Fmodify_time']
        data_list = []
        for user_id, recommends in self.user_recommend.items():
            modify_time = time_now()
            element = str((user_id, recommends, modify_time))
            data_list.append(element)

        self._db.update_batch(field_list, data_list)
        self._db.commit()

    def run(self, items):
        try:
            self.init_kafka_consumer(items)
            self.load_users_log(items)
            self.load_articles_hidden()
            for user_id, history in self.user_history.items():
                self.process_user(user_id, history)
            self.update_user_recmmend()
        except Exception as e:
            self.logger.log_error(traceback.format_exc())
        finally:
            self.__reset_monitor()

    # Load recent articles
    def prepare_articles(self):
        now = datetime.now()
        latest = now + timedelta(days=32 - now.day)
        time_cover = timedelta(days=ARTICLE_COVER)
        earliest = latest - time_cover
        self._db.set_db_table("db_hiddens", "t_job_documents_hidden")
        where = "Fcreate_time > '%s' and Fcreate_time < '%s' and Frec_state=1"\
                %(earliest, latest)
        field_list = [
            "Farticle_vector", "Fauto_id", "Fcluster_id", "Fcreate_time"
        ]
        DB_res = self._db.query(field_list, where)

        for item in DB_res:
            cluster_id = item["Fcluster_id"]
            article_id = item["Fauto_id"]
            article_time = item["Fcreate_time"]
            article_vector = np.array(
                [float(elem) for elem in json.loads(item["Farticle_vector"])])
            if cluster_id in self.latest_articles.keys():
                self.latest_articles[cluster_id].append(
                    (article_id, article_time, article_vector))
            else:
                self.latest_articles[cluster_id] = [(article_id, article_time,
                                                     article_vector)]

    def prepare_user_cover(self):
        now = datetime.now()
        time_cover = timedelta(days=USER_HISTORY_COVER)
        self.earliest_user_log = now - time_cover

    def main(self):
        # Initialize everything
        self.init_db()
        self.init_log()
        self.prepare_articles()
        self.prepare_user_cover()

        step = self.batch_size * self.proc_num
        offset = self.proc_id * self.batch_size

        while (True):
            where = "Fauto_id between %s and %s" % (offset + 1,
                                                    offset + self.batch_size)
            self.logger.log_info('process_id:%s, sql condition:%s' %
                                 (self.proc_id, where))
            field_list = ['Fuser_id']
            self._db.set_db_table("db_users", "t_user_recommends")
            items = self._db.query(field_list, where)
            self._db.commit()

            if not items:
                break
            self.run(items)
            offset += step
            #break
            time.sleep(5)

        self.close()
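
# Illustrative launch sketch (not part of the original class): main() shards the
# user table by Fauto_id using proc_id and proc_num, so one worker process can be
# started per shard. The batch size and process count below are assumptions.
def _run_recommender_worker(proc_id, batch_size=100, proc_num=4):
    CRecommender(batch_size, proc_id, proc_num).main()


if __name__ == "__main__":
    from multiprocessing import Process

    workers = [
        Process(target=_run_recommender_worker, args=(proc_id,))
        for proc_id in range(4)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()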
Example #7
# API keys held in a file that is not committed to version control
import credentials

# Subscribe to ATM_POSSIBLE_FRAUD_ENRICHED topic
settings = {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'python_pushbullet',
    'default.topic.config': {
        'auto.offset.reset': 'largest'
    }
}
c = Consumer(settings)
tp = TopicPartition('ATM_POSSIBLE_FRAUD_ENRICHED', 1, -1)

print('seek: Seeking to %s' % tp)
c.seek(tp)
c.subscribe(['ATM_POSSIBLE_FRAUD_ENRICHED'])

# Connect to pushbullet service
pb = Pushbullet(credentials.login['pushbullet_api_token'])

# Poll for messages, extract the JSON, and call Pushbullet for each message
while True:
    msg = c.poll()
    if msg.error():
        if msg.error().code() == KafkaError._PARTITION_EOF:
            continue
        else:
            print(msg.error())
            break
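    # The original example is truncated at this point: the branch that handles a
    # good message and pushes the notification is missing. A hedged sketch of
    # what it might look like; the notification title is an assumption.
    else:
        pb.push_note('Possible ATM fraud detected',
                     msg.value().decode('utf-8'))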
Example #8
File: kafka.py  Project: ruezetle/snuba
class KafkaConsumer(Consumer[TPayload]):
    """
    The behavior of this consumer differs slightly from the Confluent
    consumer during rebalancing operations. Whenever a partition is assigned
    to this consumer, offsets are *always* automatically reset to the
    committed offset for that partition (or if no offsets have been committed
    for that partition, the offset is reset in accordance with the
    ``auto.offset.reset`` configuration value.) This causes partitions that
    are maintained across a rebalance to have the same offset management
    behavior as a partition that is moved from one consumer to another. To
    prevent uncommitted messages from being consumed multiple times,
    ``commit`` should be called in the partition revocation callback.

    The behavior of ``auto.offset.reset`` also differs slightly from the
    Confluent consumer as well: offsets are only reset during initial
    assignment or subsequent rebalancing operations. Any other circumstances
    that would otherwise lead to preemptive offset reset (e.g. the consumer
    tries to read a message that is before the earliest offset, or the
    consumer attempts to read a message that is after the latest offset) will
    cause an exception to be thrown, rather than resetting the offset, as
    this could lead to chunks of messages being replayed or skipped, depending
    on the circumstances. This also means that if the committed offset is no
    longer available (such as when reading older messages from the log and
    those messages expire, or reading newer messages from the log and the
    leader crashes and partition ownership fails over to an out-of-date
    replica), the consumer will fail-stop rather than reset to the value of
    ``auto.offset.reset``.
    """

    # Set of logical offsets that do not correspond to actual log positions.
    # These offsets should be considered an implementation detail of the Kafka
    # consumer and not used publicly.
    # https://github.com/confluentinc/confluent-kafka-python/blob/443177e1c83d9b66ce30f5eb8775e062453a738b/tests/test_enums.py#L22-L25
    LOGICAL_OFFSETS = frozenset(
        [OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID])

    def __init__(
        self,
        configuration: Mapping[str, Any],
        codec: Codec[KafkaPayload, TPayload],
        *,
        commit_retry_policy: Optional[RetryPolicy] = None,
    ) -> None:
        if commit_retry_policy is None:
            commit_retry_policy = NoRetryPolicy()

        auto_offset_reset = configuration.get("auto.offset.reset", "largest")
        if auto_offset_reset in {"smallest", "earliest", "beginning"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_earliest)
        elif auto_offset_reset in {"largest", "latest", "end"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_latest)
        elif auto_offset_reset == "error":
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_error)
        else:
            raise ValueError(
                "invalid value for 'auto.offset.reset' configuration")

        if (as_kafka_configuration_bool(
                configuration.get("enable.auto.commit", "true")) is not False):
            raise ValueError(
                "invalid value for 'enable.auto.commit' configuration")

        if (as_kafka_configuration_bool(
                configuration.get("enable.auto.offset.store", "true"))
                is not False):
            raise ValueError(
                "invalid value for 'enable.auto.offset.store' configuration")

        # NOTE: Offsets are explicitly managed as part of the assignment
        # callback, so preemptively resetting offsets is not enabled.
        self.__consumer = ConfluentConsumer({
            **configuration, "auto.offset.reset":
            "error"
        })

        self.__codec = codec

        self.__offsets: MutableMapping[Partition, int] = {}
        self.__staged_offsets: MutableMapping[Partition, int] = {}
        self.__paused: Set[Partition] = set()

        self.__commit_retry_policy = commit_retry_policy

        self.__state = KafkaConsumerState.CONSUMING

    def __resolve_partition_offset_earliest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       low)

    def __resolve_partition_offset_latest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       high)

    def __resolve_partition_offset_error(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        raise ConsumerError("unable to resolve partition offsets")

    def subscribe(
        self,
        topics: Sequence[Topic],
        on_assign: Optional[Callable[[Mapping[Partition, int]], None]] = None,
        on_revoke: Optional[Callable[[Sequence[Partition]], None]] = None,
    ) -> None:
        """
        Subscribe to topics. This replaces a previous subscription.

        This method does not block. The subscription may not be fulfilled
        immediately: instead, the ``on_assign`` and ``on_revoke`` callbacks
        are called when the subscription state changes with the updated
        assignment for this consumer.

        If provided, the ``on_assign`` callback is called with a mapping of
        partitions to their offsets (at this point, the working offset and the
        committed offset are the same for each partition) on each subscription
        change. Similarly, the ``on_revoke`` callback (if provided) is called
        with a sequence of partitions that are being removed from this
        consumer's assignment. (This callback does not include the offsets,
        as the working offset and committed offset may differ, in some cases
        by substantial margin.)

        Raises an ``InvalidState`` exception if called on a closed consumer.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        def assignment_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.ASSIGNING

            try:
                assignment: MutableSequence[ConfluentTopicPartition] = []

                for partition in self.__consumer.committed(partitions):
                    if partition.offset >= 0:
                        assignment.append(partition)
                    elif partition.offset == OFFSET_INVALID:
                        assignment.append(
                            self.__resolve_partition_starting_offset(
                                partition))
                    else:
                        raise ValueError("received unexpected offset")

                offsets: MutableMapping[Partition, int] = {
                    Partition(Topic(i.topic), i.partition): i.offset
                    for i in assignment
                }
                self.__seek(offsets)

                # Ensure that all partitions are resumed on assignment to avoid
                # carrying over state from a previous assignment.
                self.__consumer.resume([
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset)
                    for partition, offset in offsets.items()
                ])

                for partition in offsets:
                    self.__paused.discard(partition)
            except Exception:
                self.__state = KafkaConsumerState.ERROR
                raise

            try:
                if on_assign is not None:
                    on_assign(offsets)
            finally:
                self.__state = KafkaConsumerState.CONSUMING

        def revocation_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.REVOKING

            partitions = [
                Partition(Topic(i.topic), i.partition) for i in partitions
            ]

            try:
                if on_revoke is not None:
                    on_revoke(partitions)
            finally:
                for partition in partitions:
                    # Staged offsets are deleted during partition revocation to
                    # prevent later committing offsets for partitions that are
                    # no longer owned by this consumer.
                    if partition in self.__staged_offsets:
                        logger.warning(
                            "Dropping staged offset for revoked partition (%r)!",
                            partition,
                        )
                        del self.__staged_offsets[partition]

                    try:
                        self.__offsets.pop(partition)
                    except KeyError:
                        # If there was an error during assignment, this
                        # partition may have never been added to the offsets
                        # mapping.
                        logger.warning(
                            "failed to delete offset for unknown partition: %r",
                            partition,
                        )

                    self.__paused.discard(partition)

                self.__state = KafkaConsumerState.CONSUMING

        self.__consumer.subscribe(
            [topic.name for topic in topics],
            on_assign=assignment_callback,
            on_revoke=revocation_callback,
        )

    def unsubscribe(self) -> None:
        """
        Unsubscribe from topics.

        Raises an ``InvalidState`` exception if called on a closed consumer.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        self.__consumer.unsubscribe()

    def poll(self,
             timeout: Optional[float] = None) -> Optional[Message[TPayload]]:
        """
        Return the next message available to be consumed, if one is
        available. If no message is available, this method will block up to
        the ``timeout`` value before returning ``None``. A timeout of
        ``0.0`` represents "do not block", while a timeout of ``None``
        represents "block until a message is available (or forever)".

        Calling this method may also invoke subscription state change
        callbacks.

        This method may also raise an ``EndOfPartition`` error (a subtype of
        ``ConsumerError``) when the consumer has reached the end of a
        partition that it is subscribed to and no additional messages are
        available. The ``partition`` attribute of the raised exception
        specifies which partition's end has been reached. (Since this
        consumer is multiplexing a set of partitions, this exception does not
        mean that *all* of the partitions that the consumer is subscribed to
        do not have any messages, just that it has reached the end of one of
        them. This also does not mean that additional messages won't be
        available in future poll calls.) Not every backend implementation
        supports this feature or is configured to raise in this scenario.

        Raises an ``InvalidState`` exception if called on a closed consumer.

        Raises a ``TransportError`` for various other consumption-related
        errors.
        """
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        message: Optional[ConfluentMessage] = self.__consumer.poll(
            *[timeout] if timeout is not None else [])
        if message is None:
            return None

        error: Optional[KafkaError] = message.error()
        if error is not None:
            code = error.code()
            if code == KafkaError._PARTITION_EOF:
                raise EndOfPartition(
                    Partition(Topic(message.topic()), message.partition()),
                    message.offset(),
                )
            elif code == KafkaError._TRANSPORT:
                raise TransportError(str(error))
            else:
                raise ConsumerError(str(error))

        headers: Optional[Headers] = message.headers()
        result = Message(
            Partition(Topic(message.topic()), message.partition()),
            message.offset(),
            self.__codec.decode(
                KafkaPayload(
                    message.key(),
                    message.value(),
                    headers if headers is not None else [],
                )),
            datetime.utcfromtimestamp(message.timestamp()[1] / 1000.0),
        )

        self.__offsets[result.partition] = result.get_next_offset()

        return result

    def tell(self) -> Mapping[Partition, int]:
        """
        Return the read offsets for all assigned partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        return self.__offsets

    def __validate_offsets(self, offsets: Mapping[Partition, int]) -> None:
        invalid_offsets: Mapping[Partition, int] = {
            partition: offset
            for partition, offset in offsets.items() if offset < 0
        }

        if invalid_offsets:
            raise ConsumerError(f"invalid offsets: {invalid_offsets!r}")

    def __seek(self, offsets: Mapping[Partition, int]) -> None:
        self.__validate_offsets(offsets)

        if self.__state is KafkaConsumerState.ASSIGNING:
            # Calling ``seek`` on the Confluent consumer from an assignment
            # callback will throw an "Erroneous state" error. Instead,
            # partition offsets have to be initialized by calling ``assign``.
            self.__consumer.assign([
                ConfluentTopicPartition(partition.topic.name, partition.index,
                                        offset)
                for partition, offset in offsets.items()
            ])
        else:
            for partition, offset in offsets.items():
                self.__consumer.seek(
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset))

        self.__offsets.update(offsets)

    def seek(self, offsets: Mapping[Partition, int]) -> None:
        """
        Change the read offsets for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError("cannot seek on unassigned partitions")

        self.__seek(offsets)

    def pause(self, partitions: Sequence[Partition]) -> None:
        """
        Pause the consumption of messages for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if set(partitions) - self.__offsets.keys():
            raise ConsumerError("cannot pause unassigned partitions")

        self.__consumer.pause([
            ConfluentTopicPartition(partition.topic.name, partition.index)
            for partition in partitions
        ])

        self.__paused.update(partitions)

        # XXX: Seeking to a specific partition offset and immediately pausing
        # that partition causes the seek to be ignored for some reason.
        self.seek({
            partition: offset
            for partition, offset in self.__offsets.items()
            if partition in partitions
        })

    def resume(self, partitions: Sequence[Partition]) -> None:
        """
        Resume the consumption of messages for the provided partitions.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if set(partitions) - self.__offsets.keys():
            raise ConsumerError("cannot resume unassigned partitions")

        self.__consumer.resume([
            ConfluentTopicPartition(partition.topic.name, partition.index)
            for partition in partitions
        ])

        for partition in partitions:
            self.__paused.discard(partition)

    def paused(self) -> Sequence[Partition]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        return [*self.__paused]

    def stage_offsets(self, offsets: Mapping[Partition, int]) -> None:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError(
                "cannot stage offsets for unassigned partitions")

        self.__validate_offsets(offsets)

        # TODO: Maybe log a warning if these offsets exceed the current
        # offsets, since that's probably a side effect of an incorrect usage
        # pattern?
        self.__staged_offsets.update(offsets)

    def __commit(self) -> Mapping[Partition, int]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        result: Optional[Sequence[ConfluentTopicPartition]]

        if self.__staged_offsets:
            result = self.__consumer.commit(
                offsets=[
                    ConfluentTopicPartition(partition.topic.name,
                                            partition.index, offset)
                    for partition, offset in self.__staged_offsets.items()
                ],
                asynchronous=False,
            )
        else:
            result = []

        assert result is not None  # synchronous commit should return result immediately

        self.__staged_offsets.clear()

        offsets: MutableMapping[Partition, int] = {}

        for value in result:
            # The Confluent Kafka Consumer will include logical offsets in the
            # sequence of ``Partition`` objects returned by ``commit``. These
            # are an implementation detail of the Kafka Consumer, so we don't
            # expose them here.
            # NOTE: These should no longer be seen now that we are forcing
            # offsets to be set as part of the assignment callback.
            if value.offset in self.LOGICAL_OFFSETS:
                continue

            assert value.offset >= 0, "expected non-negative offset"
            offsets[Partition(Topic(value.topic),
                              value.partition)] = value.offset

        return offsets

    def commit_offsets(self) -> Mapping[Partition, int]:
        """
        Commit staged offsets for all partitions that this consumer is
        assigned to. The return value of this method is a mapping of
        partitions with their committed offsets as values.

        Raises an ``InvalidState`` if called on a closed consumer.
        """
        return self.__commit_retry_policy.call(self.__commit)

    def close(self, timeout: Optional[float] = None) -> None:
        """
        Close the consumer. This stops consuming messages, *may* commit
        staged offsets (depending on the configuration), and ends its
        subscription.

        Raises an ``InvalidState`` if the consumer is unable to be closed
        before the timeout is reached.
        """
        try:
            self.__consumer.close()
        except RuntimeError:
            pass

        self.__state = KafkaConsumerState.CLOSED

    @property
    def closed(self) -> bool:
        return self.__state is KafkaConsumerState.CLOSED
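
# Illustrative usage sketch (not part of the snuba module): the class docstring
# above recommends committing in the partition revocation callback so that
# uncommitted work is not replayed after a rebalance. The topic name "events",
# the `process` callable and the `message.payload` attribute are assumptions
# based on the types used in this module.
def consume_loop(configuration, codec, process):
    consumer = KafkaConsumer(configuration, codec)

    def on_revoke(partitions):
        # Flush staged offsets before ownership of these partitions is lost.
        consumer.commit_offsets()

    consumer.subscribe([Topic("events")], on_revoke=on_revoke)

    while not consumer.closed:
        message = consumer.poll(timeout=1.0)
        if message is None:
            continue
        process(message.payload)
        # Stage the offset of the next message to read, then commit synchronously.
        consumer.stage_offsets({message.partition: message.get_next_offset()})
        consumer.commit_offsets()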
Example #9
consumer.assign([TopicPartition('test', 4)])

# Reset the offset
consumer.assign([TopicPartition('test', 4, 2)])

# Get the smallest and largest offsets of a partition
consumer.get_watermark_offsets(TopicPartition('test', 4))
# (0, 19)

# For a brand-new group.id you must consume at least one message first, otherwise
# the offset reset below has no effect; without consuming, the offset returned
# both before and after the reset is -1001
# Get the current offset position
consumer.position([TopicPartition('test', 3)])

# Reset the offset to an arbitrary position. committed determines the offset seen
# by the next connection (per consumer group) and does not affect the current
# connection, whose offset is determined by position.
# After resetting the committed offset, close and reconnect for it to take effect.
# position determines the current connection's offset and is changed with seek().
consumer.seek(TopicPartition('test', 3, 1))
consumer.commit(offsets=[TopicPartition('test', 3, 7)])

# Check the reset position
msg = consumer.committed([TopicPartition('test', 3)])
print(msg)

# offset: either an absolute offset (>=0) or a logical offset: OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID
while True:
    msg = consumer.poll(3.0)
    if msg is None:
        continue
    if msg.error():
        if msg.error().code() == KafkaError._PARTITION_EOF:
            continue
        else:
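            # The example is truncated here; a minimal hedged completion so the
            # error branch and the message handling have bodies.
            print(msg.error())
            break
    print(msg.value())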
Example #10
def main(options):
    if options.hostname:
        hostname = options.hostname
    else:
        hostname = gethostname()
    group_id = f'python_search@{hostname}'
    print(f'group_id = {group_id}')
    c = Consumer({
        'bootstrap.servers': options.bootstrap_servers,
        'group.id': group_id,
    })

    tp = TopicPartition(options.topic, 0, 0)  # or OFFSET_BEGINNING
    c.assign([tp])
    c.seek(tp)

    printed = 0
    print('begin')
    while True:
        msg = c.poll()
        if msg is None:
            continue
        if msg.error():
            if msg.error().code() == KafkaError._PARTITION_EOF:
                break
            else:
                print(msg.error())
                return
        offset = msg.offset()
        message_string = _bytes2string(msg.value())
        try:
            message = _json2dict_deserialize(message_string)
        except json.decoder.JSONDecodeError:
            if options.show_warnings:
                print('-> offset {0} : deserialize error'.format(offset))
        else:
            ts_type, ts_ms_value = msg.timestamp()
            if ts_type != TIMESTAMP_NOT_AVAILABLE and ts_ms_value:
                ts_value = int(ts_ms_value / 1000)
                received = datetime.fromtimestamp(ts_value)
            else:
                received = None
            try:
                criteria = eval(options.filter)
            except (KeyError, TypeError) as e:
                criteria = False
                if options.show_warnings:
                    print(f'-> offset {offset} : {e}')
            if criteria == True:
                if received:
                    print(
                        f'-> offset {offset}, received {received:%d-%m-%Y %H:%M:%S}'
                    )
                else:
                    print(f'-> offset {offset}')
                _json_pprint(message)
                # pprint(message)
                printed += 1
                if options.number and printed >= options.number:
                    break
        print('{0}'.format(offset), end='\r')
    c.close()
    print('end             ')
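
# The --filter option is evaluated with eval() against each deserialized
# `message` dict, so a filter is just a Python expression supplied on the
# command line. Hypothetical invocation (script and flag spellings assumed):
#
#   python kafka_search.py --bootstrap-servers localhost:9092 --topic my_topic \
#       --filter "message.get('level') == 'ERROR'" --number 10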
Example #11
def proc_replicate(topic,
                   src_partition,
                   end_offset,
                   part_map,
                   rerun=False,
                   commit=False):
    """
      Replicate one source partition of `topic` into the target cluster.

      part_map (list[list[int]]): for each source partition index, the list of
      target partitions its messages are distributed across (round-robin).
    """

    src = Consumer({
        'bootstrap.servers': SRC_BOOTSTRAP_SERVERS,
        'group.id': SRC_GROUP_ID,
        'enable.auto.commit': commit
    })

    logger.info(
        f"Starting process consumer topic: {topic} src_partition:{src_partition}"
    )

    if rerun:
        logger.info(
            f"Resetting source partition {src_partition} to beginning...")
        tp = TopicPartition(topic, src_partition,
                            confluent_kafka.OFFSET_BEGINNING)
        src.assign([tp])
        src.seek(tp)
        logger.info(
            f"Reset of source partition {src_partition} offset to {src.position([tp])} complete."
        )
    else:
        tp = TopicPartition(topic, src_partition)
        src.assign([tp])

    trg = Producer({
        'bootstrap.servers': TRG_BOOTSTRAP_SERVERS,
        'group.id': TRG_GROUP_ID
    })

    trg_part_ndx = 0
    trg_part_ndx_max = len(
        part_map[src_partition]) - 1  # ex: a length of 2 has 1 as the max

    msg_count = 0

    t0 = _t1 = time.time()
    ending_offset = end_offset

    cd = 30.0
    while True:
        st = time.time()
        msg = src.poll(1.0)

        if msg is None:
            if time.time() - st >= cd:
                logger.info(
                    f"timeout after {cd} secs for topic: {topic} src_partition:{src_partition}, ending"
                )
                break
            continue

        if msg.error():
            logger.error(
                f"Consumer error topic: {topic} src_partition:{src_partition}: {msg.error()} exiting"
            )
            sys.exit(1)

        msg_count += 1
        trg.produce(topic,
                    value=msg.value(),
                    partition=part_map[src_partition][trg_part_ndx])

        if commit: src.commit()

        # Flush with a 300 second timeout. We must do this to avoid losing
        # messages (learned during testing): without it, messages do get produced
        # but many never show up in the target cluster.
        trg.flush(300)
        trg_part_ndx += 1
        if trg_part_ndx > trg_part_ndx_max:
            trg_part_ndx = 0

        # Print status/stats
        if msg_count % LOG_INTERVAL == 0:
            _t1 = outputstat(t0, _t1, LOG_INTERVAL, src_partition, msg.offset(),
                             msg_count, ending_offset)

    logger.info(
        f"process consumer, source partition {src_partition} replication complete ========================"
    )
    _t1 = outputstat(t0, _t1, LOG_INTERVAL, src_partition, -1, msg_count,
                     ending_offset)
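
# Illustrative invocation sketch (not part of the original script): part_map maps
# each source partition index to the list of target partitions its messages are
# spread across round-robin. The topic, offsets and mapping below are assumptions.
if __name__ == "__main__":
    from multiprocessing import Process

    topic = "events"
    part_map = [[0, 1], [2, 3]]      # source partition 0 -> targets 0/1, 1 -> targets 2/3
    end_offsets = [100000, 100000]   # e.g. obtained beforehand via get_watermark_offsets()

    procs = [
        Process(target=proc_replicate,
                args=(topic, src_partition, end_offsets[src_partition], part_map),
                kwargs={"rerun": True})
        for src_partition in range(len(part_map))
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()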
Example #12
    "HK.00326", "HK.00883", "HK.06098", "HK.02869", "HK.01060", "HK.00728",
    "HK.00721", "HK.00700", "HK.01468", "HK.03993", "HK.02238", "HK.01066",
    "HK.00139", "HK.02007", "HK.00554", "HK.06878"
]

# stock_list = [ "HK.00788" ]

for stock in stock_list:
    # Store ticker and order_book data to csv files
    # Store ticker and order_book data to the influxdb database
    # ticker data lives in partition 4
    # ticker_sample_1 = '{"code":"HK.00386","time":"2018-09-03 15:59:50","price":7.67,"volume":4000,"turnover":30680.0,"ticker_direction":"BUY","sequence":6596904796962160642,"type":"AUTO_MATCH"}'
    # ticker_sample_2 = '{"code":"HK.00386","time":"2018-09-03 15:59:50","price":7.67,"volume":2000,"turnover":15340.0,"ticker_direction":"BUY","sequence":6596904796962160644,"type":"AUTO_MATCH"}'
    # ticker_sample_3 = '{"code":"HK.00386","time":"2018-09-03 15:59:51","price":7.67,"volume":2000,"turnover":15340.0,"ticker_direction":"BUY","sequence":6596904801257127938,"type":"AUTO_MATCH"}'
    consumer.assign([TopicPartition(stock, 4, 0)])
    consumer.seek(TopicPartition(stock, 4, 0))
    ticker_pd = pd.DataFrame(columns=[
        'code', 'time', 'price', 'volume', 'turnover', 'ticker_direction',
        'sequence', 'type'
    ])
    while True:
        msg = consumer.poll(3.0)
        if msg is None:
            continue
        if msg.error():
            if msg.error().code() == KafkaError._PARTITION_EOF:
                break
            else:
                print(msg.error())
                break
        # print('Received message: {}'.format(msg.value().decode('utf-8')))
Example #13
from confluent_kafka import Consumer, TopicPartition, OFFSET_BEGINNING

conf = {
    'bootstrap.servers': "localhost:9092",
    'group.id': 'my-new-group',
    'auto.offset.reset': 'earliest',
}

# consumer1 = Consumer(conf)
consumer = Consumer(conf)

topic = 'first_topic'

# creating a topic partition with topic - 'first_topic' and partition - 2
topicPartition = TopicPartition(topic=topic, partition=2)
print(topicPartition)

consumer.assign([topicPartition])

topicPartition.offset = OFFSET_BEGINNING
consumer.seek(topicPartition)

while True:
    message = consumer.poll(timeout=1.0)
    if message is None:
        continue
    if message.error():
        print(message.error())
        continue
    print(message.value())
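
# A note on the assign-then-seek pattern above: depending on timing, seek() can raise a
# KafkaException with an "Erroneous state" error when the partition is not yet being fetched
# (the same behaviour the tests and the snuba consumer further below work around). A sketch
# of the alternative (an assumption, not part of the example above) is to pass the starting
# offset straight to assign(), reusing the same topic and partition:
consumer.assign([TopicPartition(topic=topic, partition=2, offset=OFFSET_BEGINNING)])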
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(map(lambda part: TopicPartition("test", part), range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5, cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE),\
            str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT)

    kc.close()
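
# The test above wires a no-op dummy_commit_cb into the consumer's 'on_commit' configuration.
# As an illustrative sketch (not part of the test) of what that hook is typically used for,
# the callback below logs the outcome of each offset commit; the broker address and group id
# are assumptions:
def log_commit_result(err, partitions):
    # Called after each offset commit with either an error or the committed offsets.
    if err is not None:
        print("commit failed: %s" % err)
    else:
        for p in partitions:
            print("committed %s [%d] @ %d" % (p.topic, p.partition, p.offset))

consumer = Consumer({
    'bootstrap.servers': 'localhost:9092',  # assumed
    'group.id': 'example-group',            # assumed
    'on_commit': log_commit_result,
})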
Example #15
class Kafka_Confluent(object):
    Type = "Confluent-Kafka Wrapper Class"

    def __init__(self, kafka_client_config):

        print("=" * 50)
        print("Printing Kafka_Confluent kwargs...")
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(kafka_client_config)
        print("=" * 50)

        self.broker = kafka_client_config["broker"]
        self.producer_params = kafka_client_config["producer_params"]
        self.consumer_1_params = kafka_client_config["consumer_1_params"]
        self.consumer_2_params = kafka_client_config["consumer_2_params"]

        self.producer_topic = kafka_client_config.get('producer_topic')
        self.consumer_1_topic = kafka_client_config.get('consumer_1_topic')
        self.consumer_2_topic = kafka_client_config.get('consumer_2_topic')

        self.producer = None
        self.consumer_1 = None
        self.consumer_2 = None

        # Create Producer
        if (self.producer_topic):
            self.producer_params['bootstrap.servers'] = kafka_client_config[
                "broker"]
            self.producer = KafkaProducer(self.producer_params)
            print("Producer created successfully...")

        # Create Consumer 1
        if (self.consumer_1_topic):
            self.consumer_1_params['bootstrap.servers'] = kafka_client_config[
                "broker"]
            self.consumer_1 = KafkaConsumer(self.consumer_1_params)
            self.consumer_1.subscribe([self.consumer_1_topic])
            self.consumer_1.poll(timeout=0.01)
            print("Consumer 1 created successfully...")

        # Create Consumer 2
        if (self.consumer_2_topic):
            self.consumer_2_params['bootstrap.servers'] = kafka_client_config[
                "broker"]
            self.consumer_2 = KafkaConsumer(self.consumer_2_params)
            self.consumer_2.subscribe([self.consumer_2_topic])
            self.consumer_2.poll(timeout=0.01)
            print("Consumer 1 created successfully...")

        # TODO : Print Complete config

    def produce(self, output, source_data):
        value = dict_to_kafka(output, source_data)

        print("=" * 50)
        print("Producing Message")
        print("self.producer_topic", self.producer_topic)
        print("message size, ", str(len(value)))
        print("=" * 50)

        self.producer.produce(self.producer_topic, value)
        self.producer.poll(0)
        return (True)

    def consume1(self):
        print("=" * 50)
        print("Consuming Message")
        print("self.consumer_1_topic", self.consumer_1_topic)
        print("=" * 50)
        message_kafka = self.consumer_1.consume(num_messages=1)[0]
        message_dict = kafka_to_dict(message_kafka)
        return (message_dict)

    def consume2(self, block=True):

        print("=" * 50)
        print("Consuming Message")
        print("self.consumer_2_topic", self.consumer_2_topic)
        print("=" * 50)

        if (block):
            message_kafka = self.consumer_2.consume(num_messages=1)[0]
        else:
            message_kafka = self.consumer_2.poll(timeout=0.01)

        if (message_kafka):
            message_dict = kafka_to_dict(message_kafka)
        else:
            message_dict = None

        return (message_dict)

    def sync_consumers(self):

        m1 = self.consumer_1.consume(num_messages=1)[0]
        m2 = self.consumer_2.consume(num_messages=1)[0]

        m1_dict, m2_dict = kafka_to_dict(m1), kafka_to_dict(m2)

        try:
            assert (m2_dict["_id"] == m1_dict["source_id"])

        except AssertionError:
            logger.info("Consumers not synced. Syncing now...")

            kafka_source_id = m1_dict[
                "_kafka_source_id"]  #"{id}:{topic}:{partition}:{offset}"
            consumer_2_topic_name = kafka_source_id.split(":")[-3]  # 3rd last
            consumer_2_partition = int(
                kafka_source_id.split(":")[-2])  # 3rd last
            consumer_2_offset = int(kafka_source_id.split(":")[-1])
            consumer_2_topic_partition = TopicPartition(
                topic=consumer_2_topic_name,
                partition=consumer_2_partition,
                offset=consumer_2_offset)

            # Sync Consumer 2
            self.consumer_2.seek(consumer_2_topic_partition)
            m2 = self.consumer_2.consume(num_messages=1)[0]
            m2_dict = kafka_to_dict(m2)

        try:
            assert (m2_dict["_id"] == m1_dict["source_id"])
            return (m1_dict, m2_dict)
        except AssertionError:
            logger.info("Consumers not synced. Unknown error.")
            sys.exit(0)
Example #16
File: kafka.py  Project: Appva/snuba
class KafkaConsumer(Consumer[TopicPartition, int, bytes]):
    """
    The behavior of this consumer differs slightly from the Confluent
    consumer during rebalancing operations. Whenever a partition is assigned
    to this consumer, offsets are *always* automatically reset to the
    committed offset for that partition (or if no offsets have been committed
    for that partition, the offset is reset in accordance with the
    ``auto.offset.reset`` configuration value.) This causes partitions that
    are maintained across a rebalance to have the same offset management
    behavior as a partition that is moved from one consumer to another. To
    prevent uncommitted messages from being consumed multiple times,
    ``commit`` should be called in the partition revocation callback.

    The behavior of ``auto.offset.reset`` also differs slightly from the
    Confluent consumer as well: offsets are only reset during initial
    assignment or subsequent rebalancing operations. Any other circumstances
    that would otherwise lead to preemptive offset reset (e.g. the consumer
    tries to read a message that is before the earliest offset, or the
    consumer attempts to read a message that is after the latest offset) will
    cause an exception to be thrown, rather than resetting the offset, as
    this could lead to chunks of messages being replayed or skipped, depending
    on the circumstances. This also means that if the committed offset is no
    longer available (such as when reading older messages from the log and
    those messages expire, or reading newer messages from the log and the
    leader crashes and partition ownership fails over to an out-of-date
    replica), the consumer will fail-stop rather than reset to the value of
    ``auto.offset.reset``.
    """

    # Set of logical offsets that do not correspond to actual log positions.
    # These offsets should be considered an implementation detail of the Kafka
    # consumer and not used publicly.
    # https://github.com/confluentinc/confluent-kafka-python/blob/443177e1c83d9b66ce30f5eb8775e062453a738b/tests/test_enums.py#L22-L25
    LOGICAL_OFFSETS = frozenset(
        [OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID])

    def __init__(self, configuration: Mapping[str, Any]) -> None:
        auto_offset_reset = configuration.get("auto.offset.reset", "largest")
        if auto_offset_reset in {"smallest", "earliest", "beginning"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_earliest)
        elif auto_offset_reset in {"largest", "latest", "end"}:
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_latest)
        elif auto_offset_reset == "error":
            self.__resolve_partition_starting_offset = (
                self.__resolve_partition_offset_error)
        else:
            raise ValueError(
                "invalid value for 'auto.offset.reset' configuration")

        # NOTE: Offsets are explicitly managed as part of the assignment
        # callback, so preemptively resetting offsets is not enabled.
        self.__consumer = ConfluentConsumer(
            {**configuration, "auto.offset.reset": "error"})

        self.__offsets: MutableMapping[TopicPartition, int] = {}

        self.__state = KafkaConsumerState.CONSUMING

    def __resolve_partition_offset_earliest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       low)

    def __resolve_partition_offset_latest(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition,
                                       high)

    def __resolve_partition_offset_error(
            self,
            partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        raise ConsumerError("unable to resolve partition offsets")

    def subscribe(
        self,
        topics: Sequence[str],
        on_assign: Optional[Callable[[Sequence[TopicPartition]], None]] = None,
        on_revoke: Optional[Callable[[Sequence[TopicPartition]], None]] = None,
    ) -> None:
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        def assignment_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.ASSIGNING

            try:
                assignment: MutableSequence[ConfluentTopicPartition] = []

                for partition in self.__consumer.committed(partitions):
                    if partition.offset >= 0:
                        assignment.append(partition)
                    elif partition.offset == OFFSET_INVALID:
                        assignment.append(
                            self.__resolve_partition_starting_offset(
                                partition))
                    else:
                        raise ValueError("received unexpected offset")

                offsets: MutableMapping[TopicPartition, int] = {
                    TopicPartition(i.topic, i.partition): i.offset
                    for i in assignment
                }
                self.__seek(offsets)
            except Exception:
                self.__state = KafkaConsumerState.ERROR
                raise

            try:
                if on_assign is not None:
                    on_assign(list(offsets.keys()))
            finally:
                self.__state = KafkaConsumerState.CONSUMING

        def revocation_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.REVOKING

            streams = [
                TopicPartition(i.topic, i.partition) for i in partitions
            ]

            try:
                if on_revoke is not None:
                    on_revoke(streams)
            finally:
                for stream in streams:
                    try:
                        self.__offsets.pop(stream)
                    except KeyError:
                        # If there was an error during assignment, this stream
                        # may have never been added to the offsets mapping.
                        logger.warning(
                            "failed to delete offset for unknown stream: %r",
                            stream)

                self.__state = KafkaConsumerState.CONSUMING

        self.__consumer.subscribe(topics,
                                  on_assign=assignment_callback,
                                  on_revoke=revocation_callback)

    def unsubscribe(self) -> None:
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        self.__consumer.unsubscribe()

    def poll(self, timeout: Optional[float] = None) -> Optional[KafkaMessage]:
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        message: Optional[ConfluentMessage] = self.__consumer.poll(
            *[timeout] if timeout is not None else [])
        if message is None:
            return None

        error: Optional[KafkaError] = message.error()
        if error is not None:
            code = error.code()
            if code == KafkaError._PARTITION_EOF:
                raise EndOfStream(
                    TopicPartition(message.topic(), message.partition()),
                    message.offset(),
                )
            elif code == KafkaError._TRANSPORT:
                raise TransportError(str(error))
            else:
                raise ConsumerError(str(error))

        result = KafkaMessage(
            TopicPartition(message.topic(), message.partition()),
            message.offset(),
            message.value(),
        )

        self.__offsets[result.stream] = result.get_next_offset()

        return result

    def tell(self) -> Mapping[TopicPartition, int]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        return self.__offsets

    def __seek(self, offsets: Mapping[TopicPartition, int]) -> None:
        if self.__state is KafkaConsumerState.ASSIGNING:
            # Calling ``seek`` on the Confluent consumer from an assignment
            # callback will throw an "Erroneous state" error. Instead,
            # partition offsets have to be initialized by calling ``assign``.
            self.__consumer.assign([
                ConfluentTopicPartition(stream.topic, stream.partition, offset)
                for stream, offset in offsets.items()
            ])
        else:
            for stream, offset in offsets.items():
                self.__consumer.seek(
                    ConfluentTopicPartition(stream.topic, stream.partition,
                                            offset))

        self.__offsets.update(offsets)

    def seek(self, offsets: Mapping[TopicPartition, int]) -> None:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError("cannot seek on unassigned streams")

        self.__seek(offsets)

    def commit(self) -> Mapping[TopicPartition, int]:
        if self.__state in {
                KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR
        }:
            raise InvalidState(self.__state)

        result: Optional[Sequence[ConfluentTopicPartition]] = None

        retries_remaining = 3
        while result is None:
            try:
                result = self.__consumer.commit(asynchronous=False)
                assert result is not None
            except KafkaException as e:
                if not e.args[0].code() in (
                        KafkaError.REQUEST_TIMED_OUT,
                        KafkaError.NOT_COORDINATOR_FOR_GROUP,
                        KafkaError._WAIT_COORD,
                ):
                    raise

                if not retries_remaining:
                    raise

                logger.warning(
                    "Commit failed: %s (%d retries remaining)",
                    str(e),
                    retries_remaining,
                )
                retries_remaining -= 1
                time.sleep(1)

        offsets: MutableMapping[TopicPartition, int] = {}

        for value in result:
            # The Confluent Kafka Consumer will include logical offsets in the
            # sequence of ``TopicPartition`` objects returned by ``commit``.
            # These are an implementation detail of the Kafka Consumer, so we
            # don't expose them here.
            # NOTE: These should no longer be seen now that we are forcing
            # offsets to be set as part of the assignment callback.
            if value.offset in self.LOGICAL_OFFSETS:
                continue

            assert value.offset >= 0, "expected non-negative offset"
            offsets[TopicPartition(value.topic,
                                   value.partition)] = value.offset

        return offsets

    def close(self, timeout: Optional[float] = None) -> None:
        try:
            self.__consumer.close()
        except RuntimeError:
            pass

        self.__state = KafkaConsumerState.CLOSED
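
# The docstring above recommends committing from the partition revocation callback so that
# messages consumed since the last commit are not replayed after a rebalance. A minimal
# sketch of that wiring (an assumption, not from the snuba source), using only the
# subscribe()/commit() API shown in this class; broker, group id, and topic are illustrative:
consumer = KafkaConsumer({
    "bootstrap.servers": "localhost:9092",  # assumed
    "group.id": "example-group",            # assumed
})

def commit_on_revoke(partitions):
    # Flush the current offsets before the partitions move to another consumer.
    consumer.commit()

consumer.subscribe(["events"], on_revoke=commit_on_revoke)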
Example #17
def test_basic_api():
    """ Basic API tests, these wont really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({
        'group.id': 'test',
        'socket.timeout.ms': '100',
        'session.timeout.ms': 1000,  # Avoid close() blocking too long
        'on_commit': dummy_commit_cb
    })

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"],
                 on_assign=dummy_assign_revoke,
                 on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(
        map(lambda part: TopicPartition("test", part), range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0],
                                          timeout=0.5,
                                          cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE),\
            str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions
                if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    kc.close()
Example #18
class TestConsumer(TestBaseEP):
    def __init__(self, tc_drv, cfg_sink):
        logger = logging.getLogger()

        if not cfg_sink:
            raise ValueError("'cfg_sink' is a required parameter")
        if "type" not in cfg_sink:
            raise RuntimeError("'type' NOT found in 'sink' dict")
        self.tc_drv = tc_drv
        self.tc_id = tc_drv.get_id()
        self.type = cfg_sink["type"]
        self.poll_count = 10
        self.cons = None

        if self.type == "None":
            return

        if self.type == "Kafka":
            if "kafka" not in cfg_sink:
                raise RuntimeError("'kafka' NOT found in 'sink' dict")
            super(TestConsumer, self).__init__(tc_drv, cfg_sink)

            if "group" not in cfg_sink["kafka"]:
                raise RuntimeError("'group' NOT found in 'kafka' dict")
            if not isinstance(cfg_sink["kafka"]["group"], str):
                raise TypeError("'group' must be of type 'str'")
            self.group = cfg_sink["kafka"]["group"]

            if "timeout" in cfg_sink["kafka"]:
                if not isinstance(cfg_sink["kafka"]["timeout"], int):
                    raise TypeError("'timeout' must be of type 'int'")
                self.poll_count = cfg_sink["kafka"]["timeout"]

            self.tc_drv.set_exp_type(self.type)
        elif self.type == "CFKafka":
            if "cfkafka" not in cfg_sink:
                raise RuntimeError("'cfkafka' NOT found in 'sink' dict")
            super(TestConsumer, self).__init__(tc_drv, cfg_sink)

            if "group" not in cfg_sink["cfkafka"]:
                raise RuntimeError("'group' NOT found in 'cfkafka' dict")
            if not isinstance(cfg_sink["cfkafka"]["group"], str):
                raise TypeError("'group' must be of type 'str'")
            self.group = cfg_sink["cfkafka"]["group"]

            if "timeout" in cfg_sink["cfkafka"]:
                if not isinstance(cfg_sink["cfkafka"]["timeout"], int):
                    raise TypeError("'timeout' must be of type 'int'")
            self.poll_count = cfg_sink["cfkafka"]["timeout"]

            self.tc_drv.set_exp_type(self.type)
        else:
            raise RuntimeError("Unsupported 'type'='%s' in 'sink' dict" %
                               (cfg_sink["type"]))

        if not self.cons:
            self.connect()

    def __del__(self):
        logger = logging.getLogger()

        if self.cons:
            #self.cons.unsubscribe()
            self.cons.close()

    def connect(self):
        logger = logging.getLogger()

        if self.type == "None":
            return

        if self.type == "Kafka":
            logger.debug("brokers: {}, group: {}, topic: {}".format(
                self.brokers, self.group, self.topic))
            self.cons = Consumer({
                'bootstrap.servers': self.brokers,
                'group.id': self.group,
                'default.topic.config': {
                    'auto.offset.reset': 'smallest',
                }
            })

            self.cons.subscribe([self.topic])
        elif self.type == "CFKafka":
            logger.debug(
                "brokers: {}, schema_reg: {}, group: {}, topic: {}".format(
                    self.brokers, self.schema_reg, self.group, self.topic))
            self.cons = avro.AvroConsumer({
                'bootstrap.servers': self.brokers,
                'schema.registry.url': self.schema_reg,
                'group.id': self.group,
                'default.topic.config': {
                    'auto.offset.reset': 'smallest',
                }
            })

            self.cons.subscribe([self.topic])

    def __reset_pos(self):
        logger = logging.getLogger()

        if self.type == "None":
            return

        parts = [TopicPartition(self.topic, 0)]
        (start, end) = self.cons.get_watermark_offsets(parts[0])
        logger.debug("Currently at {}/{} offset <{}, {}>".format(
            parts[0].topic, parts[0].partition, start, end))
        if end > 0:
            parts[0].offset = end - 1
            self.cons.seek(parts[0])

    def drain(self):
        logger = logging.getLogger()

        if self.type == "None":
            return

        poll_count = 60
        logger.warning("topic: {}, will timeout in {} secs".format(
            self.topic, poll_count))
        poll_num = 0
        while True:
            try:
                poll_num += 1
                msg = self.cons.poll(timeout=1.0)
            except SerializerError as exc:
                continue

            if msg is None:
                if poll_num >= poll_count:
                    break
            elif msg.error():
                break

    def rx_one(self):
        logger = logging.getLogger()

        if self.type == "None":
            return None

        logger.warning("will timeout in {} secs".format(self.poll_count))
        #logger.debug("going to consume/poll")
        #msgs = self.cons.consume(num_messages=1, timeout=5.0)
        #if not msgs:
        #    raise RuntimeError("No msg received, timed-out!")
        poll_num = 0
        while True:
            try:
                poll_num += 1
                msg = self.cons.poll(timeout=1.0)
            except SerializerError as exc:
                raise RuntimeError(
                    "Message deserialization failed: {}".format(exc))

            if msg is None:
                #parts = self.cons.position(parts)
                #logger.debug("Currently at {}/{} offset {}".format(parts[0].topic,
                #    parts[0].partition, parts[0].offset))
                if poll_num < self.poll_count:
                    continue
                else:
                    raise RuntimeError(
                        "No msg received via {}, timed-out!".format(
                            self.topic))
            elif not msg.error():
                break
            elif msg.error().code() == KafkaError._PARTITION_EOF:
                #raise RuntimeError("End of partition reached {}/{}".format(
                #    msg.topic(), msg.partition()))
                #self.__reset_pos()
                #break
                continue
            else:
                raise RuntimeError(msg.error().str())

        test_out = msg.value()
        logger.debug("RX'ed '{}' : '{}'".format(type(test_out), test_out))
        if self.type == "CFKafka":
            self.tc_drv.store_rx_one(test_out)

        self.cons.commit()

        return test_out