Example #1
 def offsets_for_time(self, partitions_time: list, timestamp: int = -1):
     """
     Find the earliest offset in each partition at or after the specified time.
     :param partitions_time: list of (topic, partition) if timestamp > 0, (topic, partition, timestamp) if timestamp == -1
     :param timestamp: start time for the lookup; -1 means each partition carries its own timestamp
     :return:
     """
     if timestamp > 0:
         _partitions = {
             TopicPartition(_tuple[0], _tuple[1]): timestamp
             for _tuple in partitions_time
         }
     else:
         _partitions = {
             TopicPartition(_tuple[0], _tuple[1]): _tuple[2]
             for _tuple in partitions_time
         }
     try:
         result = self.consumer.offsets_for_times(_partitions)
     except (UnsupportedVersionError, ValueError, KafkaTimeoutError) as e:
         if e.__class__ == UnsupportedVersionError:
             log.tag_error(KafkaInfo.KafkaConsumer,
                           "API VERSION ERROR, DO NOT SUPPORT")
             raise ActionError(KafkaErr.NotSupport)
         if e.__class__ == ValueError:
             log.tag_error(KafkaInfo.KafkaConsumer,
                           "Value Error: Target Timestamp is negative")
         else:
             log.tag_error(KafkaInfo.KafkaConsumer,
                           "Get offset by timestamp failed, Time out")
         raise ActionError(KafkaErr.GetOffsetFailed)
     return result
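A minimal stand-alone sketch of the underlying kafka-python call the helper above wraps; the broker address, topic name, and lookback window are assumptions for illustration.

from kafka import KafkaConsumer, TopicPartition
import time

consumer = KafkaConsumer(bootstrap_servers="localhost:9092")  # assumed broker
one_hour_ago_ms = int(time.time() * 1000) - 3600 * 1000
query = {TopicPartition("my_topic", 0): one_hour_ago_ms,
         TopicPartition("my_topic", 1): one_hour_ago_ms}

# offsets_for_times() returns {TopicPartition: OffsetAndTimestamp or None};
# None means no message exists at or after the requested timestamp.
result = consumer.offsets_for_times(query)

tps = [tp for tp, oat in result.items() if oat is not None]
if tps:
    consumer.assign(tps)
    for tp in tps:
        consumer.seek(tp, result[tp].offset)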
Example #2
    def __init__(self):
        logging.info(
            'Going to initialize KafkaHandler for kafka at endpoint %s',
            kafka_endpoint)
        self.consumer = KafkaConsumer(bootstrap_servers=kafka_endpoint)
        self.dumps = {}
        end_offset = {}

        for topic in topics:
            self.dumps[topic] = collections.deque(maxlen=100)
            current_partition = TopicPartition(topic, 0)
            self.consumer.assign([current_partition])
            self.consumer.seek_to_end()
            offset = self.consumer.position(current_partition)
            end_offset[topic] = max(offset, 100)

        topic_partitions = [TopicPartition(topic, 0) for topic in topics]
        self.consumer.assign(topic_partitions)
        for topic in topics:
            self.consumer.seek(TopicPartition(topic, 0),
                               end_offset[topic] - 100)

        self.thread = threading.Thread(target=self.run, args=())
        self.thread.daemon = True  # Daemonize thread
        self.thread.start()  # Start the execution
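The run() method the thread targets is not shown in this example; a minimal sketch, assuming it only drains the consumer into the bounded per-topic deques created above (the poll timeout is an assumption):

    def run(self):
        # Hypothetical poll loop: append each record's value to the deque for
        # its topic; maxlen=100 keeps only the most recent messages around.
        while True:
            records = self.consumer.poll(timeout_ms=1000)
            for tp, messages in records.items():
                for message in messages:
                    self.dumps[tp.topic].append(message.value)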
Example #3
    def __init__(self, location, enable_ssl, cert_path, topic, group,
                 partition_id):
        self._location = location
        self._group = group
        self._topic = topic
        kwargs = _prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {}
        self._consumer = KafkaConsumer(
            bootstrap_servers=self._location,
            group_id=self._group,
            max_partition_fetch_bytes=10485760,
            consumer_timeout_ms=100,
            client_id="%s-%s" %
            (self._topic,
             str(partition_id) if partition_id is not None else "all"),
            request_timeout_ms=120 * 1000,
            heartbeat_interval_ms=10000,
            **kwargs)

        # explicitly causing consumer to bootstrap the cluster metadata
        self._consumer.topics()

        if partition_id is not None:
            self._partitions = [TopicPartition(self._topic, partition_id)]
            self._consumer.assign(self._partitions)
        else:
            self._partitions = [
                TopicPartition(self._topic, pid)
                for pid in self._consumer.partitions_for_topic(self._topic)
            ]
            self._consumer.subscribe(topics=[self._topic])
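A hedged sketch of a read helper that could sit alongside the constructor above; the method name and message cap are assumptions, but the early-exit behaviour follows from the consumer_timeout_ms=100 passed in the constructor.

    def get_messages(self, count=1000):
        # Hypothetical helper: iterating a KafkaConsumer built with
        # consumer_timeout_ms=100 stops once no message arrives for 100 ms,
        # so this returns at most `count` raw values and then exits.
        result = []
        for record in self._consumer:
            result.append(record.value)
            if len(result) >= count:
                break
        return result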
Example #4
def and_we_read_from_initial_offset(step):
    test_context = world.test_environment.load_context(SEND_RECEIVE_SCENARIO)

    knodes = world.pipeline_config.cluster.node_array
    topic = world.pipeline_config.get_user_topic('scratch_topic')
    consumer_group = world.pipeline_config.get_user_defined_consumer_group('scratch_group_1')

    kreader = telegraf.KafkaIngestRecordReader(topic, knodes, consumer_group)

    # show how many partitions this topic spans
    metadata = kreader.consumer.partitions_for_topic(topic)

    # TopicPartition named tuple consists of the topic and a partition number
    tp = TopicPartition(topic, 0)

    # manually assign one or more partitions to the consumer --
    # required if we want to use explicit offsets
    kreader.consumer.assign([tp])

    topic_partition = TopicPartition(topic, list(metadata)[0])
    kreader.consumer.seek(topic_partition, test_context.offset)

    world_relay = WorldRelay(record_type='direct_sales_record', stream_id='test_stream_id', asset_id='test_asset_id')
    test_context.num_received_records = kreader.read(world_relay, world.log)
    #test_context.num_received_records = kreader.num_commits_issued
    for rec in world_relay.read_list:
        test_context.consumed_raw_record_list.append(rec)
Example #5
    def store_offset_records(self):
        consumer = self._getconsumer()
        partition_set = consumer.partitions_for_topic(self.topic)
        counter = 0
        while counter < 5:
            counter += 1
            partition_set = consumer.partitions_for_topic(self.topic)
            if partition_set:
                break
            else:
                time.sleep(10)

        partitions = []
        for partition_id in partition_set:
            partitions.append(TopicPartition(self.topic, partition_id))

        curr_offsets = {}
        for partition in partitions:
            committed = consumer.committed(partition)
            curr_offsets[partition.partition] = committed

        end_offsets = consumer.end_offsets(partitions)

        for partition_id, value in curr_offsets.items():
            record = {
                'curr_offset':
                value,
                'end_offset':
                end_offsets[TopicPartition(topic=self.topic,
                                           partition=partition_id)]
            }
            self.offset_records[partition_id] = record
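The records gathered above are enough to derive consumer lag; a hedged sketch of such a helper (the method name is an assumption, and committed() returns None for partitions that have never committed, which is treated here as lag from offset 0):

    def total_lag(self):
        # Hypothetical helper: sum of (end_offset - curr_offset) per partition.
        lag = 0
        for partition_id, record in self.offset_records.items():
            curr = record['curr_offset'] or 0
            lag += record['end_offset'] - curr
        return lag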
Example #6
def when_we_read_from_initial_offset(step, checkpoint_freq):
    test_context = world.test_environment.load_context(DATA_COMMIT_SCENARIO)
    test_context.checkpoint_frequency = int(checkpoint_freq)

    knodes = world.pipeline_config.cluster.node_array
    topic = world.pipeline_config.get_user_topic('scratch_topic')
    consumer_group = world.pipeline_config.get_user_defined_consumer_group('scratch_group_1')

    kreader = telegraf.KafkaIngestRecordReader(topic, knodes, consumer_group)

    # show how many partitions this topic spans
    metadata = kreader.consumer.partitions_for_topic(topic)

    # TopicPartition named tuple consists of the topic and a partition number
    tp = TopicPartition(topic, 0)

    # manually assign one or more partitions to the consumer --
    # required if we want to use explicit offsets
    kreader.consumer.assign([tp])

    topic_partition = TopicPartition(topic, list(metadata)[0])
    kreader.consumer.seek(topic_partition, test_context.offset)
    
    world_relay = WorldRelay(record_type='direct_sales_record', stream_id='test_stream_id', asset_id='test_asset_id')


    # world.log.debug('calling read() on kafka reader with ckpt frequency of %d and interval of %d...' % (int(checkpoint_freq), 10))
    test_context.num_received_records = kreader.read(world_relay, world.log, checkpoint_frequency=test_context.checkpoint_frequency, checkpoint_interval=10)
    #xkreader.read(world_relay, world.log)

    test_context.num_successful_checkpoints = world_relay.checkpoint_successes
    test_context.num_checkpoint_errors = len(world_relay.checkpoint_errors)
Example #7
    def consume_data(self, offset=None):
        """
        :param action: none   -- consume from Kafka's normal `CURRENT-OFFSET`
                    custom -- start from the given offset
                    begin  -- consume the topic from its very beginning
                    end    -- consume only newly produced data; unconsumed messages are skipped, use with care
        :param offset: integer >= 0, effective when the mode is custom; consumption starts
                       from this offset (inclusive). If it exceeds the topic's latest offset,
                       consumption starts from the newest data.
        :return:
        """

        # Get all partitions of the topic and assign them to this consumer; to use assign(), the topic must not be passed to the KafkaConsumer constructor
        _ps = [
            TopicPartition(self.topic, p)
            for p in self.consumer.partitions_for_topic(self.topic)
        ]
        if offset is None:
            offset = self.get_last_position()
        self.consumer.assign(_ps)
        for p in self.consumer.partitions_for_topic(self.topic):
            # The offset could also be set for a single partition only
            self.consumer.seek(TopicPartition(self.topic, p), offset)
        # try:
        #     for message in self.consumer:
        #         yield message
        # except KeyboardInterrupt as e:
        #     print(e)
        for message in self.consumer:
            yield message
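A hedged usage sketch of the generator above; the wrapper instance name and the starting offset are assumptions.

# Hypothetical caller: start every partition at offset 42 and print what arrives.
try:
    for message in client.consume_data(offset=42):
        print(message.topic, message.partition, message.offset, message.value)
except KeyboardInterrupt:
    pass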
Example #8
def start(*args, **kwargs):
    topic_name = topic.get()
    server_host = server.get()
    server_host = server_host + ':9092'
    print server_host
    t_partition = partition.get()
    if not t_partition:
        t_partition = 0
    group = group_id.get()
    #print topic_name,server_host,t_partition,group
    # consume from a specified offset of a specified partition
    consumerx = KafkaConsumer(
        topic_name,
        bootstrap_servers=[
            server_host,
        ],
        auto_offset_reset='earliest',
        group_id=group,
    )
    consumerx.unsubscribe()
    consumerx.assign([
        TopicPartition(topic=topic_name, partition=t_partition),
    ])  # subscribe to the specified partition
    record_num = consumerx.end_offsets([
        TopicPartition(topic=topic_name, partition=t_partition),
    ])
    t.insert(1.0, record_num)
Example #9
def start_consumer():
    consumer = KafkaConsumer(bootstrap_servers=brokers)
    producer = KafkaProducer(bootstrap_servers=brokers, key_serializer=str.encode, value_serializer=str.encode)
    consumer.assign([
        TopicPartition(topic=my_topic,partition=0)
    ])
    consumer.seek(partition=TopicPartition(topic=my_topic,partition=0),offset=345)
    for msg in consumer:
        print(msg)
        print("topic = %s" % msg.topic) # topic default is string
        print("partition = %d" % msg.offset)
        print("value = %s" % msg.value.decode()) # bytes to string
        print("timestamp = %d" % msg.timestamp)
        print("time = ", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime( msg.timestamp/1000 )) )
        fund_content = json.loads(msg.value.decode())
        keywords = service.GetKeywords()
        results = keywords.operate(fund_content['description'])
        industries = []
        technology = []
        for result in results:
            if result['label'] == 'indu':
                industries.append(result['text'])
            elif result['label'] == 'tech':
                technology.append(result['text'])
        fund_content['industries'] = industries
        fund_content['technology'] = technology
        future = producer.send(my_topic ,  key= 'import_raw', value= json.dumps(fund_content), partition= 0)
        future.get(timeout=10)
Example #10
    def __init__(self, location, topic, group, partition_id):
        self._location = location
        self._group = group
        self._topic = topic
        self._consumer = KafkaConsumer(
            bootstrap_servers=self._location,
            group_id=self._group,
            max_partition_fetch_bytes=10485760,
            consumer_timeout_ms=100,
            client_id="%s-%s" %
            (self._topic,
             str(partition_id) if partition_id is not None else "all"),
            request_timeout_ms=120 * 1000,
        )

        if partition_id is not None:
            self._partition_ids = [TopicPartition(self._topic, partition_id)]
            self._consumer.assign(self._partition_ids)
        else:
            self._partition_ids = [
                TopicPartition(self._topic, pid)
                for pid in self._consumer.partitions_for_topic(self._topic)
            ]
            self._consumer.subscribe(topics=[self._topic])
            if self._consumer._use_consumer_group():
                self._consumer._coordinator.ensure_coordinator_known()
                self._consumer._coordinator.ensure_active_group()

        self._consumer._update_fetch_positions(self._partition_ids)
        self._start_looping_call()
Example #11
 def next_requests(self):
     tps = [TopicPartition(topic=self.topic, partition=p) for p in self.topic_partitions]
     self.consumer.assign(tps)
     for partition in self.topic_partitions:
         offset = self.get_partition_offset(partition)
         self.consumer.seek(TopicPartition(topic=self.topic, partition=partition), offset)
     return self.start_consumer()
Example #12
    def test_producing_with_batched_records(self):
        """
        Compared to the previous test, we are going to have batching in the Kafka producer (caused by the high 'linger.ms' value).
        So a single request that reaches a Kafka broker might carry more than one record, for different partitions.
        """
        messages_to_send = 100
        partition1 = TopicPartition('apricots', 0)
        partition2 = TopicPartition('berries', 0)

        # This ensures that records to the 'apricots' and 'berries' partitions are batched together.
        producer = KafkaProducer(
            bootstrap_servers=IntegrationTest.kafka_envoy_address(),
            api_version=(1, 0, 0),
            linger_ms=1000,
            batch_size=100)
        future_to_message1 = {}
        future_to_message2 = {}
        for _ in range(messages_to_send):
            message = Message()
            future1 = producer.send(key=message.key,
                                    value=message.value,
                                    headers=message.headers,
                                    topic=partition1.topic,
                                    partition=partition1.partition)
            future_to_message1[future1] = message

            message = Message()
            future2 = producer.send(key=message.key,
                                    value=message.value,
                                    headers=message.headers,
                                    topic=partition2.topic,
                                    partition=partition2.partition)
            future_to_message2[future2] = message

        offset_to_message1 = {}
        offset_to_message2 = {}
        for future in future_to_message1.keys():
            offset_to_message1[
                future.get().offset] = future_to_message1[future]
            self.assertTrue(future.get().offset >= 0)
        for future in future_to_message2.keys():
            offset_to_message2[
                future.get().offset] = future_to_message2[future]
            self.assertTrue(future.get().offset >= 0)
        self.assertTrue(len(offset_to_message1) == messages_to_send)
        self.assertTrue(len(offset_to_message2) == messages_to_send)
        producer.close()

        # Check the target clusters.
        self.__verify_target_kafka_cluster(
            IntegrationTest.kafka_cluster1_address(), partition1,
            offset_to_message1, partition2)
        self.__verify_target_kafka_cluster(
            IntegrationTest.kafka_cluster2_address(), partition2,
            offset_to_message2, partition1)

        # Check if requests have been received.
        self.metrics.collect_final_metrics()
        self.metrics.assert_metric_increase('produce', 1)
Example #13
    def _setup_consumer(self):
        """
        prepare offset numbers etc. for reading from Topic
        """
        # <WTF> https://github.com/dpkp/kafka-python/issues/601
        self.available_topics = self.client.topics()
        # </WTF>

        # might as well use it
        assert self.topic in self.available_topics

        if (self.start_params is None) != (self.end_params is None):
            raise ValueError("Both start and end params must be set or both must be None")

        if self.start_params is None:
            # setup partitions to read through
            # TODO not checked with multiple partitions since inheriting from foxglove
            # An offset is assigned to make repeatability (via a locking file) possible later on.
            # and it's easier to terminate the fetch loop this way.
            p_id = self.client.partitions_for_topic(self.topic)
            topic_partitions = [TopicPartition(topic=self.topic, partition=p) for p in list(p_id)]
            starts = self.client.beginning_offsets(topic_partitions)
            ends = self.client.end_offsets(topic_partitions)

            self.start_p_offsets = {
                tp: OffsetAndTimestamp(offset=offset, timestamp=None) for tp, offset in starts.items()
            }
            self.end_p_offsets = {
                tp: OffsetAndTimestamp(offset=offset - 1, timestamp=None) for tp, offset in ends.items()
            }

        else:
            # TODO - this code was inherited from Foxglove and hasn't been checked through
            # setup start and end partitions and offsets
            # self.client.seek_to_beginning()
            # datetime is only start/end implemented
            assert isinstance(self.start_params, datetime) and isinstance(self.end_params, datetime)
            start = int(self.start_params.timestamp() * 1000)
            end = int(self.end_params.timestamp() * 1000)

            partitions = self.client.partitions_for_topic(self.topic)
            tx = {TopicPartition(topic=self.topic, partition=p): start for p in list(partitions)}
            self.start_p_offsets = self.client.offsets_for_times(tx)

            # if you give a timestamp after the last record it returns None
            for tp, offset_details in self.start_p_offsets.items():
                if offset_details is None:
                    raise ValueError("Start date outside of available messages")

            tx = {TopicPartition(topic=self.topic, partition=p): end for p in list(partitions)}
            self.end_p_offsets = self.client.offsets_for_times(tx)

            # as above - out of range, for end offset give something useful
            for tp, offset_details in self.end_p_offsets.items():
                if offset_details is None:
                    # go to last message. I'm not 100% sure this is correct
                    end_offsets = self.client.end_offsets([tp])
                    offset = end_offsets[tp] - 1
                    self.end_p_offsets[tp] = OffsetAndTimestamp(offset=offset, timestamp=None)
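A minimal sketch of how the start/end offsets computed above might bound a fetch loop; it assumes self.client is the underlying KafkaConsumer (as the calls above imply), and the method name is an assumption.

    def _fetch(self):
        # Hypothetical generator: seek every partition to its start offset and
        # yield records until each partition's recorded end offset is reached.
        partitions = list(self.start_p_offsets.keys())
        self.client.assign(partitions)
        for tp, start in self.start_p_offsets.items():
            self.client.seek(tp, start.offset)
        remaining = set(partitions)
        while remaining:
            for tp, records in self.client.poll(timeout_ms=1000).items():
                if tp not in remaining:
                    continue
                for record in records:
                    yield record
                    if record.offset >= self.end_p_offsets[tp].offset:
                        remaining.discard(tp)
                        break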
Example #14
 def assign_and_seek(self, partoffs):
     tps = []
     for tpo in partoffs:
         tps.append(TopicPartition(tpo.topic, tpo.partition))
     super().assign(tps)
     for tpo in partoffs:
         if (tpo.offset > 0):
             super().seek(TopicPartition(tpo.topic, tpo.partition),
                          tpo.offset)
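A hedged usage sketch of assign_and_seek; the TopicPartitionOffset namedtuple and the consumer subclass instance are assumptions about the surrounding code, which is not shown here.

import collections

TopicPartitionOffset = collections.namedtuple(
    'TopicPartitionOffset', ['topic', 'partition', 'offset'])

# Hypothetical call: assign two partitions, resuming 'events'/0 at offset 500
# and leaving 'events'/1 where it is (an offset of 0 skips the seek).
consumer.assign_and_seek([
    TopicPartitionOffset('events', 0, 500),
    TopicPartitionOffset('events', 1, 0),
])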
Example #15
def test_convert_partition_offsets_translates_partition_offsets_to_committable_topic_offsets(
):
    offsets = convert_partition_offsets('foo', {0: 100, 1: 200})
    assert offsets == {
        TopicPartition(topic='foo', partition=0):
        OffsetAndMetadata(offset=100, metadata=''),
        TopicPartition(topic='foo', partition=1):
        OffsetAndMetadata(offset=200, metadata='')
    }
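A minimal implementation sketch consistent with the test above; the real convert_partition_offsets is not shown, so this body is an assumption.

from kafka import TopicPartition
from kafka.structs import OffsetAndMetadata

def convert_partition_offsets(topic, partition_offsets):
    # Map {partition_id: offset} into the {TopicPartition: OffsetAndMetadata}
    # shape that KafkaConsumer.commit() accepts.
    return {
        TopicPartition(topic=topic, partition=partition):
        OffsetAndMetadata(offset=offset, metadata='')
        for partition, offset in partition_offsets.items()
    }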
Example #16
    def test_producing(self):
        """
        This test verifies that producer can send messages through mesh filter.
        We are going to send messages to two topics: 'apples' and 'bananas'.
        The mesh filter is configured to forward records for topics starting with 'a' (like 'apples')
        to the first cluster, and the ones starting with 'b' (so 'bananas') to the second one.

        We are going to send messages one by one, so they will not be batched in the Kafka producer,
        and the filter is going to receive them one by one too.

        After sending, the consumers are going to read from Kafka clusters directly to make sure that
        nothing was lost.
        """

        messages_to_send = 100
        partition1 = TopicPartition('apples', 0)
        partition2 = TopicPartition('bananas', 0)

        producer = KafkaProducer(
            bootstrap_servers=IntegrationTest.kafka_envoy_address(),
            api_version=(1, 0, 0))
        offset_to_message1 = {}
        offset_to_message2 = {}
        for _ in range(messages_to_send):
            message = Message()
            future1 = producer.send(key=message.key,
                                    value=message.value,
                                    headers=message.headers,
                                    topic=partition1.topic,
                                    partition=partition1.partition)
            self.assertTrue(future1.get().offset >= 0)
            offset_to_message1[future1.get().offset] = message

            future2 = producer.send(key=message.key,
                                    value=message.value,
                                    headers=message.headers,
                                    topic=partition2.topic,
                                    partition=partition2.partition)
            self.assertTrue(future2.get().offset >= 0)
            offset_to_message2[future2.get().offset] = message
        self.assertTrue(len(offset_to_message1) == messages_to_send)
        self.assertTrue(len(offset_to_message2) == messages_to_send)
        producer.close()

        # Check the target clusters.
        self.__verify_target_kafka_cluster(
            IntegrationTest.kafka_cluster1_address(), partition1,
            offset_to_message1, partition2)
        self.__verify_target_kafka_cluster(
            IntegrationTest.kafka_cluster2_address(), partition2,
            offset_to_message2, partition1)

        # Check if requests have been received.
        self.metrics.collect_final_metrics()
        self.metrics.assert_metric_increase('produce', 200)
Example #17
def find_with_offset(offset_low=0, offset_high=-1):
    consumer = get_consumer()
    consumer.assign([TopicPartition(topic=TOPIC, partition=PARTITION_NUMBER)])
    consumer.seek(TopicPartition(topic=TOPIC, partition=PARTITION_NUMBER),
                  offset_low)

    filter_function = None
    if offset_high != -1:
        filter_function = lambda msg: msg.offset <= offset_high

    return filter(filter_function, consumer)
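A hedged usage sketch; TOPIC, PARTITION_NUMBER and get_consumer() come from the surrounding module, which is not shown, and the offset range is illustrative.

# Hypothetical caller: print every record between offsets 100 and 199 inclusive
# (iteration only ends on its own if the consumer was built with a consumer_timeout_ms).
for msg in find_with_offset(offset_low=100, offset_high=199):
    print(msg.offset, msg.value)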
Example #18
    def seek_messages_by_timestamp(self, input_dt):
        # Seek messages by timestamp
        assert datetime.datetime.strptime(input_dt, "%Y-%m-%d %H:%M:%S"), \
            'Please provide date input in the format "%Y-%m-%d %H:%M:%S"'

        try:
            assignments = []
            # We will manually assign topic partitions to read the messages from
            self.kc.unsubscribe()
            self.kc.topics()
            for topic in self.topics:
                partitions = self.kc.partitions_for_topic(topic)
                for p in partitions:
                    assignments.append(TopicPartition(topic, p))
            self.kc.assign(assignments)
            # self.kc.poll(timeout_ms=0)
            messages = []
            for topic in self.topics:
                # Get the offset based on timestamp
                offset_time = int((datetime.datetime.strptime(
                    input_dt, "%Y-%m-%d %H:%M:%S")).timestamp() * 1000)
                # print(offset_time)
                partitions = self.kc.partitions_for_topic(topic)
                # print(partitions)

                for p in partitions:
                    dc = {TopicPartition(topic, p): offset_time}
                    last_commit_for_partition = self.kc.committed(
                        TopicPartition(topic, p))
                    offset = [
                        x[0] for x in self.kc.offsets_for_times(dc).values()
                    ][0]
                    print("Topic: " + topic + "  Partition: " + str(p) +
                          "  Offset: " + str(offset))
                    # print(str(p) + " - " + str(last_commit_for_partition))

                    if last_commit_for_partition > offset:
                        self.kc.seek(TopicPartition(topic, p), offset)
                        for msg in self.kc:
                            messages.append(
                                (msg.offset, msg.value.decode('utf-8'),
                                 msg.key))
                            # print(str(p) + " - " + str(msg.offset))
                            if msg.offset >= (last_commit_for_partition - 1):
                                # print("Offset 1 using offset position successful.")
                                break
            return messages

        except (errors.KafkaTimeoutError, AssertionError, ValueError) as e:
            raise ValueError('%r' % e)
        except TypeError:
            raise ValueError('No last commit available for the given Topics')
        finally:
            self.kc.close()
Example #19
def get_consumer(topic, offset=-1):
    # Check for offset, otherwise return consumer with group_id
    if offset == -1:
        consumer = KafkaConsumer(topic,
                                 group_id='MovieLog1',
                                 consumer_timeout_ms=KAFKA_TIMEOUT)
    else:
        consumer = KafkaConsumer(consumer_timeout_ms=KAFKA_TIMEOUT)
        consumer.assign([TopicPartition(topic, offset)])
        consumer.seek_to_beginning(TopicPartition(topic, offset))
    return consumer
Example #20
def get_tweets_for_top_10_accounts(consumer):
    resubscribe(consumer)
    all_topics = get_all_acounts(consumer)

    top10 = []
    # offsets_for_times() expects epoch milliseconds, not seconds
    start_date = int(datetime.datetime.timestamp(datetime.datetime.now() -
                                                 datetime.timedelta(hours=3)) * 1000)

    for topic in all_topics:
        tp = TopicPartition(topic, 0)
        consumer.seek_to_end(tp)
        last_offset = int(consumer.position(tp))

        offset = consumer.offsets_for_times({tp: start_date})

        start_offset = 0
        if list(offset.values())[0]:
            start_offset = int(list(offset.values())[0].offset)

        top10.append({
            'user_id': topic,
            'amount': int(last_offset - start_offset),
            'length': last_offset
        })

    top10 = sorted(top10, key=lambda x: x['amount'], reverse=True)[:10]

    data['top10_producing_account_latest_tweets'] = []

    for topic in top10:
        tp = TopicPartition(topic['user_id'], 0)

        if topic['length'] >= 10:
            consumer.seek(tp, topic['length'] - 10)
        else:
            consumer.seek_to_beginning(tp)

        messages = consumer.poll(2000, 10)
        messages = list(messages.values())

        tweets = []

        for message in messages:
            tweets.append(message[0].value.decode("utf-8"))

        print(tweets)

        data['top10_producing_account_latest_tweets'].append({
            'user_id':
            topic['user_id'],
            'latest_tweets':
            tweets
        })
Example #21
    def seek_last_n_messages(self, last_n_offset):
        # Seek by offset
        assignments = []
        # We will manually assign topic partitions to read the messages from
        try:
            self.kc.unsubscribe()
            self.kc.topics()
            for topic in self.topics:
                partitions = self.kc.partitions_for_topic(topic)
                for p in partitions:
                    assignments.append(TopicPartition(topic, p))
            self.kc.assign(assignments)
            # self.kc.poll(timeout_ms=0)
            messages = []

            for topic in self.topics:
                partitions = self.kc.partitions_for_topic(topic)
                # print(partitions)
                last_offset = max([(self.kc.committed(TopicPartition(topic,
                                                                     p)))
                                   for p in partitions])
                offset = max(last_offset - last_n_offset, 0)
                print("Max offset - " + str(last_offset))

                for p in partitions:
                    last_commit_for_partition = self.kc.committed(
                        TopicPartition(topic, p))
                    print("Topic: " + topic + "  Partition: " + str(p))
                    print(
                        "LastOffset: {} ** Position: {} ** HighWaterMark: {}".
                        format(
                            str(last_commit_for_partition),
                            str(self.kc.position(TopicPartition(topic, p))),
                            str(self.kc.highwater(TopicPartition(topic, p)))))

                    # print(str(p) + " - " + str(last_commit_for_partition))

                    # if last_commit_for_partition > offset:
                    #     self.kc.seek(TopicPartition(topic, p), offset)
                    #     for msg in self.kc:
                    #         messages.append((msg.offset, msg.value.decode('utf-8'), msg.key))
                    # print(str(p) + " - " + str(msg.offset))
                    # if msg.offset >= (last_commit_for_partition - 1):
                    #     print("Offset 1 using offset position successful.")
                    #     break
            return messages

        except (errors.KafkaTimeoutError, AssertionError, ValueError) as e:
            raise ValueError('%r' % e)
        except TypeError:
            raise ValueError('No last commit available for the given Topics')
        finally:
            self.kc.close()
Example #22
    def run(self):

        consumer = KafkaConsumer(
            bootstrap_servers=self.server, 
            auto_offset_reset='earliest',
            group_id=self.groupid)

        if consumer.partitions_for_topic(self.topic) is None:
            print("El tópico %s no existe!" % self.topic)
            sys.exit(2)

        if self.partition is None:
            partitions = [TopicPartition(self.topic, partition) 
                           for partition in consumer.partitions_for_topic(self.topic)]
        else:
            partitions = [TopicPartition(self.topic, int(self.partition))]

        consumer.assign(partitions)

        if self.offset is None:
            if self.inicio:
                for partition in partitions:
                    consumer.seek_to_beginning(partition)
        else:
            for partition in partitions:
                consumer.seek(partition, int(self.offset))

        while not self.stop_event.is_set():
            try:
                for message in consumer:
                    logging.info(message)

                    try:
                        valor = json.loads(message.value)
                        if self.words:
                            valor = valor['words']
                            
                    except (ValueError):
                        valor = message.value.decode('utf-8')

                    print ("Recibiendo Mensaje (%s/%d/%d) %s" % (message.topic, 
                                              message.partition,
                                              message.offset, 
                                              #message.key,
                                              valor))

                    if self.stop_event.is_set():
                        break
            except IndexError:
                pass

        consumer.close()      
Example #23
def when_we_read_and_transform_the_records(step):
    test_context = world.test_environment.load_context(
        EXTRACT_TRANSFORM_CONSUME_SCENARIO)

    mssql_db = sqldbx.SQLServerDatabase('', 'Legacy')

    db_username = world.mssql_username
    db_password = world.mssql_password

    mssql_db.login(db_username, db_password, schema='mercury')
    pmgr = sqldbx.PersistenceManager(mssql_db)

    transform_map_filename = world.pipeline_config.transform_map
    map_file_path = os.path.join(world.data_dir, transform_map_filename)

    transformer_builder = dmap.RecordTransformerBuilder(map_file_path,
                                                        persistence_mgr=pmgr)

    tfmr = transformer_builder.build()

    knodes = world.pipeline_config.cluster.node_array

    # a kafka group is a numbered context shared by some number of consumers
    group = world.pipeline_config.get_user_defined_consumer_group(
        'scratch_group_2')
    topic = world.pipeline_config.raw_topic
    kreader = telegraf.KafkaIngestRecordReader(topic, knodes, group)

    # show how many partitions this topic spans
    metadata = kreader.consumer.partitions_for_topic(topic)
    print '### partitions for topic %s:\n%s' % (topic, '\n'.join(
        [str(p) for p in metadata]))

    # TopicPartition named tuple consists of the topic and a partition number
    tp = TopicPartition(topic, 0)

    # manually assign one or more partitions to the consumer --
    # required if we want to use explicit offsets
    kreader.consumer.assign([tp])

    offset = get_offset(topic)
    topic_partition = TopicPartition(topic, list(metadata)[0])
    kreader.consumer.seek(topic_partition, offset)

    world_relay = WorldRelay(transformer=tfmr)
    kreader.read(world_relay, log)

    for rec in world_relay.read_list:
        test_context.consumed_raw_into_sst_record_list.append(rec)
Example #24
def updateDeviceConfig(brokerArray, producer, consumer):
    #producer = KafkaProducer(bootstrap_servers=brokerArray, acks=0, linger_ms=1000, batch_size=1000000)
    #consumer = KafkaConsumer(bootstrap_servers=brokerArray,
    #                                  max_poll_interval_ms=1000,
    #                                  group_id=group_id_suffix + "-" + name,
    #                                  auto_commit_interval_ms=500 )
    tic = time.time()
    log("Starting Device Config Update")
    deviceList = getAgentConfig(brokerArray, producer, consumer)
    log("Got Device List")
    tp1 = TopicPartition(admin_devices_topic, 0)
    tp2 = TopicPartition(admin_topic, 0)
    consumer.assign([tp1, tp2])
    log("Assigned Topics")
    consumer.seek_to_end()
    foundConfig = 1

    message = {}

    producer.send(admin_devices_topic, genericMessage("get", name, message))
    log("Sent Devices Config Request")
    deviceDatabase = []

    tic = time.time()
    while (len(deviceList) > 0) and (time.time() < (DEFAULT_TIMEOUT_S + tic)):
        messages = consumer.poll()
        if tp1 in messages:
            for message in messages[tp1]:
                m = json.loads(message.value)
                if m['cmd'] == 'config-simulation':
                    log("Passed Config Message")
                    if m['name'] in deviceList:
                        deviceList.remove(m['name'])
                        deviceDatabase.append({
                            'name':
                            m['name'],
                            'type':
                            m['message']['type'],
                            'tagCount':
                            len(m['message']['tags']),
                            'replication':
                            m['message']['replication'],
                            'scantime':
                            m['message']['scantime']
                        })
                        foundConfig += 1

    lastUpdate = time.time()
    return deviceDatabase
Example #25
def amount_search():
    #array = []
    consumer.assign([TopicPartition('json_test', 0)])
    i = consumer.position(TopicPartition('json_test', 0))
    amount = raw_input(
        "\nPlease input an amount of messages before the current message offset to querry: \n(the latest offset is "
        + str(i - 1) + ")\n=>")
    consumer.seek(TopicPartition('json_test', 0), i - int(amount))

    for message in consumer:
        json_message = json.loads(message.value)
        print 'offset = ', message.offset, '\n', json_message, '\n'
        if message.offset == i - 1:
            time.sleep(5)
            break
Example #26
def time_search():
    times = raw_input(
        "\nPlease input a time period (in seconds) for searching the message to querry: \n(1 day = 86400 s, 1 hr = 3600 s)\n=>"
    )
    consumer.assign([TopicPartition('json_test', 0)])
    consumer.seek(TopicPartition('json_test', 0), 0)
    t = datetime.datetime.now()
    print '\n\ntime period: ', datetime.timedelta(seconds=int(times)), '\n'
    for message in consumer:
        json_message = json.loads(message.value)
        json_message_time = datetime.datetime.strptime(json_message['time'],
                                                       "%Y-%m-%d %H:%M:%S")

        if t - json_message_time < datetime.timedelta(seconds=int(times)):
            print 'offset = ', message.offset, '\n', json_message, '\n'
Example #27
def wait_for_kafka_topic(hostport, topic, timeout=60):
    """Wait for a Kafka topic to become available."""
    # Delay import to facilitate module use in limited virtualenvs.
    from kafka import SimpleClient, TopicPartition

    start = time.time()
    client = SimpleClient(hostport, client_id=b'dummy', timeout=1)
    while not client.has_metadata_for_topic(topic):
        if time.time() - start > timeout:
            raise Exception('timeout reached waiting for topic')

        time.sleep(0.1)
        client.load_metadata_for_topics()

    # And wait for all partitions in that topic to have a leader.
    while True:
        tps = [
            TopicPartition(topic, p)
            for p in client.topic_partitions.get(topic, [])
        ]

        if tps and all(client.topics_to_brokers.get(tp) for tp in tps):
            break

        if time.time() - start > timeout:
            raise Exception('timeout reached waiting for topic brokers')

        time.sleep(0.1)
        client.load_metadata_for_topics()
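A hedged usage sketch; the broker address, topic name, and timeout are assumptions.

# Hypothetical caller: block until 'my-topic' has a leader for every partition,
# or raise after 30 seconds.
wait_for_kafka_topic('localhost:9092', 'my-topic', timeout=30)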
Example #28
async def consume():
    consumer = AIOKafkaConsumer(
        'my_topic',
        loop=loop,
        bootstrap_servers='localhost:9092',
        group_id="my-group")
    # Get cluster layout and join group `my-group`
    await consumer.start()
    try:
        # Consume messages
        msg = await consumer.getone()
        logger.info(msg)
        logger.info(f'msg.offset = {msg.offset}')  # Unique msg autoincrement ID in this topic-partition.
        logger.info(f'msg.value = {msg.value}')

        tp = TopicPartition(msg.topic, msg.partition)

        position = await consumer.position(tp)
        # Position is the next fetched offset
        assert position == msg.offset + 1

        committed = await consumer.committed(tp)
        logger.info(f'committed = {committed}')
        # print(committed)

    finally:
        # Will leave consumer group; perform autocommit if enabled.
        await consumer.stop()
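A hedged sketch of driving the coroutine above; it assumes the module-level `loop` it references was created as shown.

import asyncio

loop = asyncio.get_event_loop()   # the `loop` passed to AIOKafkaConsumer above
loop.run_until_complete(consume())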
Example #29
def employeeportal():
    tp = TopicPartition('crashed-devices', 0)
    consumer = KafkaConsumer(
        'crashed-devices',
        bootstrap_servers=[
            'ec2-52-203-135-135.compute-1.amazonaws.com:9092',
            'ec2-52-70-111-222.compute-1.amazonaws.com:9092',
            'ec2-34-193-78-218.compute-1.amazonaws.com:9092'
        ],
        enable_auto_commit=True,
        group_id='my-group',
        auto_offset_reset='earliest',
        value_deserializer=lambda x: loads(x.decode('utf-8')))
    lastOffset = consumer.beginning_offsets([tp])[tp]
    latitudes = []
    longitudes = []
    i = 0
    for message in consumer:
        i += 1
        msg = message.value
        latitudes.append(msg['latitude'])
        longitudes.append(msg['longitude'])
        print(latitudes, longitudes)
        if i == 1:
            print("GOT HERE")
            consumer.commit()
            break

    consumer.close()

    return render_template("employeeportal.html",
                           APIkey='AIzaSyD9e3Rdo8fGQq6hzaXkdsdQzv9Hy0rTolE',
                           latitudes=latitudes,
                           longitudes=longitudes)
Example #30
    def debug(self, topic):
        c = KafkaConsumer(bootstrap_servers=KAFKA_HOSTS,
                          client_id=self._client_id,
                          group_id=None,
                          api_version=(0, 10))

        # assign/subscribe topic
        partitions = c.partitions_for_topic(topic)
        if not partitions:
            raise Exception("Topic " + topic + " not exist")
        c.assign([TopicPartition(topic, p) for p in partitions])

        # seek to beginning if needed
        c.seek_to_beginning()

        # fetch messages
        while True:
            partitions = c.poll(100)
            if partitions:
                for p in partitions:
                    for msg in partitions[p]:
                        yield msg.value.decode('utf-8')
            yield ""

        c.close()
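A hedged usage sketch of the debug() generator above; the owning instance and topic name are assumptions. Note that it yields an empty string whenever a poll comes back empty, which lets the caller decide when to stop.

# Hypothetical caller: tail a topic until 100 non-empty messages have arrived.
seen = 0
for value in handler.debug('my-topic'):
    if value:
        print(value)
        seen += 1
    if seen >= 100:
        break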