Example #1
    def test_send_produce_request_raises_when_noleader(self, protocol, conn):
        "Send producer request raises LeaderUnavailableError if leader is not available"

        conn.recv.return_value = 'response'  # anything but None

        brokers = {}
        brokers[0] = BrokerMetadata(0, 'broker_1', 4567)
        brokers[1] = BrokerMetadata(1, 'broker_2', 5678)

        topics = {}
        topics['topic_noleader'] = {
            0: PartitionMetadata('topic_noleader', 0, -1, [], []),
            1: PartitionMetadata('topic_noleader', 1, -1, [], [])
        }
        protocol.decode_metadata_response.return_value = (brokers, topics)

        client = KafkaClient(hosts=['broker_1:4567'])

        requests = [
            ProduceRequest(
                "topic_noleader", 0,
                [create_message("a"), create_message("b")])
        ]

        with self.assertRaises(LeaderUnavailableError):
            client.send_produce_request(requests)
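This test pins down the client-side contract: when metadata reports a partition whose leader is -1, send_produce_request fails immediately instead of retrying internally. A minimal sketch of how calling code might react, assuming the simple-client API used in these examples (load_metadata_for_topics, send_produce_request); the helper name and retry budget are illustrative, not part of kafka-python:

import time

from kafka.common import LeaderNotAvailableError  # named LeaderUnavailableError in older releases

def send_with_leader_retry(client, request, attempts=5, backoff=1.0):
    # Sketch: refresh metadata and retry while the partition still has no leader.
    for _ in range(attempts):
        try:
            return client.send_produce_request([request])
        except LeaderNotAvailableError:
            # A leader election may still be in progress; reload metadata and back off.
            client.load_metadata_for_topics(request.topic)
            time.sleep(backoff)
    raise LeaderNotAvailableError(request.topic)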
Example #2
    def test_send_produce_request_raises_when_noleader(self, protocol, conn):
        "Send producer request raises LeaderNotAvailableError if leader is not available"

        conn.recv.return_value = 'response'  # anything but None

        brokers = [
            BrokerMetadata(0, 'broker_1', 4567),
            BrokerMetadata(1, 'broker_2', 5678)
        ]

        topics = [
            TopicMetadata('topic_noleader', NO_ERROR, [
                PartitionMetadata('topic_noleader', 0, -1, [], [], NO_LEADER),
                PartitionMetadata('topic_noleader', 1, -1, [], [], NO_LEADER),
            ]),
        ]
        protocol.decode_metadata_response.return_value = MetadataResponse(
            brokers, topics)

        client = KafkaClient(hosts=['broker_1:4567'])

        requests = [
            ProduceRequest(
                "topic_noleader", 0,
                [create_message("a"), create_message("b")])
        ]

        with self.assertRaises(LeaderNotAvailableError):
            client.send_produce_request(requests)
Example #3
    def test_send_produce_request_raises_when_topic_unknown(
            self, protocol, conn):

        conn.recv.return_value = 'response'  # anything but None

        brokers = [
            BrokerMetadata(0, 'broker_1', 4567),
            BrokerMetadata(1, 'broker_2', 5678)
        ]

        topics = [
            TopicMetadata('topic_doesnt_exist', UNKNOWN_TOPIC_OR_PARTITION,
                          []),
        ]
        protocol.decode_metadata_response.return_value = MetadataResponse(
            brokers, topics)

        client = KafkaClient(hosts=['broker_1:4567'])

        requests = [
            ProduceRequest(
                "topic_doesnt_exist", 0,
                [create_message("a"), create_message("b")])
        ]

        with self.assertRaises(UnknownTopicOrPartitionError):
            client.send_produce_request(requests)
Example #4
    def test_send_produce_request_raises_when_noleader(self, protocol, conn):
        "Send producer request raises LeaderNotAvailableError if leader is not available"

        conn.recv.return_value = 'response'  # anything but None

        brokers = [
            BrokerMetadata(0, 'broker_1', 4567),
            BrokerMetadata(1, 'broker_2', 5678)
        ]

        topics = [
            TopicMetadata('topic_noleader', NO_ERROR, [
                PartitionMetadata('topic_noleader', 0, -1, [], [],
                                  NO_LEADER),
                PartitionMetadata('topic_noleader', 1, -1, [], [],
                                  NO_LEADER),
            ]),
        ]
        protocol.decode_metadata_response.return_value = MetadataResponse(brokers, topics)

        client = KafkaClient(hosts=['broker_1:4567'])

        requests = [ProduceRequest(
            "topic_noleader", 0,
            [create_message("a"), create_message("b")])]

        with self.assertRaises(LeaderNotAvailableError):
            client.send_produce_request(requests)
Example #5
    def test_send_produce_request_raises_when_topic_unknown(self, protocol, conn):

        conn.recv.return_value = "response"  # anything but None

        brokers = [BrokerMetadata(0, "broker_1", 4567), BrokerMetadata(1, "broker_2", 5678)]

        topics = [TopicMetadata("topic_doesnt_exist", UNKNOWN_TOPIC_OR_PARTITION, [])]
        protocol.decode_metadata_response.return_value = MetadataResponse(brokers, topics)

        client = KafkaClient(hosts=["broker_1:4567"])

        requests = [ProduceRequest("topic_doesnt_exist", 0, [create_message("a"), create_message("b")])]

        with self.assertRaises(UnknownTopicOrPartitionError):
            client.send_produce_request(requests)
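Examples #3 and #5 exercise the unknown-topic path: metadata flagged with UNKNOWN_TOPIC_OR_PARTITION makes the client raise UnknownTopicOrPartitionError. A hedged sketch of guarding a produce call up front, assuming the has_metadata_for_topic and ensure_topic_exists helpers of the same-era simple client; safe_produce is an illustrative name:

from kafka.common import ProduceRequest

def safe_produce(client, topic, messages):
    # Sketch: check topic metadata up front instead of relying on the exception.
    if not client.has_metadata_for_topic(topic):
        # May trigger broker-side auto-creation; raises if the topic never appears.
        client.ensure_topic_exists(topic)
    return client.send_produce_request([ProduceRequest(topic, 0, messages)])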
Example #6
    def produce_messages(self):
        """
        Produce sample messages
        """
        # TODO: Support different kafka port
        kafka = KafkaClient(self.config.kafka_host)

        total_messages = self.batches * self.batch_size
        messages_batch = [create_message(random.choice(self.sample_messages)) for _ in xrange(self.batch_size)]

        for i in range(self.batches):
            # TODO: Support writing to all partitions
            req = ProduceRequest(topic=self.config.kafka_topic, partition=0, messages=messages_batch)
            kafka.send_produce_request(payloads=[req], fail_on_error=True)
            sent_messages = (i + 1) * self.batch_size
            logging.info('Created %s out of %s sample messages', sent_messages, total_messages)
        kafka.close()
Example #7
    def produce_messages(self):
        """
        Produce sample messages
        """
        # TODO: Support different kafka port
        kafka = KafkaClient(self.config.kafka_host)

        total_messages = self.batches * self.batch_size
        messages_batch = [
            create_message(random.choice(self.sample_messages))
            for _ in xrange(self.batch_size)
        ]

        for i in range(self.batches):
            # TODO: Support writing to all partitions
            req = ProduceRequest(topic=self.config.kafka_topic,
                                 partition=0,
                                 messages=messages_batch)
            kafka.send_produce_request(payloads=[req], fail_on_error=True)
            sent_messages = (i + 1) * self.batch_size
            logging.info('Created %s out of %s sample messages', sent_messages,
                         total_messages)
        kafka.close()
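Both versions of produce_messages carry a TODO about writing to all partitions rather than hard-coding partition 0. A rough sketch of one way to do that, assuming the client exposes get_partition_ids_for_topic (present in later kafka-python 0.9.x releases); the helper name is illustrative:

import itertools

from kafka.common import ProduceRequest

def produce_to_all_partitions(kafka, topic, batches):
    # Sketch: round-robin each batch across the topic's partitions instead of
    # always writing to partition 0.
    partition_cycle = itertools.cycle(kafka.get_partition_ids_for_topic(topic))
    for batch in batches:
        req = ProduceRequest(topic=topic, partition=next(partition_cycle),
                             messages=batch)
        kafka.send_produce_request(payloads=[req], fail_on_error=True)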
Example #8
    def test_send_produce_request_raises_when_noleader(self, protocol, conn):
        "Send producer request raises LeaderUnavailableError if leader is not available"

        conn.recv.return_value = 'response'  # anything but None

        brokers = {}
        brokers[0] = BrokerMetadata(0, 'broker_1', 4567)
        brokers[1] = BrokerMetadata(1, 'broker_2', 5678)

        topics = {}
        topics['topic_noleader'] = {
            0: PartitionMetadata('topic_noleader', 0, -1, [], []),
            1: PartitionMetadata('topic_noleader', 1, -1, [], [])
        }
        protocol.decode_metadata_response.return_value = (brokers, topics)

        client = KafkaClient(hosts=['broker_1:4567'])

        requests = [ProduceRequest(
            "topic_noleader", 0,
            [create_message(b"a"), create_message(b"b")])]

        with self.assertRaises(LeaderUnavailableError):
            client.send_produce_request(requests)
Example #9
def low_level():
    '''low level'''
    from kafka import KafkaClient, create_message
    from kafka.protocol import KafkaProtocol
    from kafka.common import ProduceRequest

    kafka = KafkaClient(KAFKA_SERVER)

    req = ProduceRequest(topic=b'topic1', partition=1,
                         messages=[create_message(b'some message')])
    resps = kafka.send_produce_request(payloads=[req], fail_on_error=True)
    kafka.close()

    print resps[0].topic      # b'topic1'
    print resps[0].partition  # 1
    print resps[0].error      # 0 (hopefully)
    print resps[0].offset     # offset of the first message sent in this request
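The prints above assume the happy path. A small hedged addition that checks each per-partition error code before trusting the offset, using only the ProduceResponse fields already shown (topic, partition, error, offset):

for resp in resps:
    if resp.error != 0:
        # A non-zero error code means the broker rejected this partition's batch.
        raise RuntimeError("produce failed for %s[%d]: error code %d"
                           % (resp.topic, resp.partition, resp.error))
    print("first offset for %s[%d]: %d" % (resp.topic, resp.partition, resp.offset))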
Example #10
class KafkaProducer(object):
    """The KafkaProducer deals with buffering messages that need to be published
    into Kafka, preparing them for publication, and ultimately publishing them.

    Args:
        producer_position_callback (function): The producer position callback
            is called when the KafkaProducer is instantiated, and every time
            messages are published to notify the producer of current position
            information of successfully published messages.
        dry_run (Optional[bool]): When dry_run mode is on, the producer won't
            talk to a real Kafka topic or the real Schematizer.  Defaults to False.
    """
    @cached_property
    def envelope(self):
        return Envelope()

    def __init__(self, producer_position_callback, dry_run=False):
        self.producer_position_callback = producer_position_callback
        self.dry_run = dry_run
        self.kafka_client = KafkaClient(get_config().cluster_config.broker_list)
        self.position_data_tracker = PositionDataTracker()
        self._reset_message_buffer()
        self.skip_messages_with_pii = get_config().skip_messages_with_pii
        self._publish_retry_policy = RetryPolicy(
            ExpBackoffPolicy(with_jitter=True),
            max_retry_count=get_config().producer_max_publish_retry_count
        )
        self._automatic_flush_enabled = True

    @contextmanager
    def disable_automatic_flushing(self):
        """Prevents the producer from flushing automatically (e.g. for timeouts
        or batch size) while the context manager is open.
        """
        try:
            self._automatic_flush_enabled = False
            yield
        finally:
            self._automatic_flush_enabled = True

    def wake(self):
        """Should be called periodically if we're not otherwise waking up by
        publishing, to ensure that messages are actually published.
        """
        # if we haven't woken up in a while, we may need to flush messages
        self._flush_if_necessary()

    def publish(self, message):
        if message.contains_pii and self.skip_messages_with_pii:
            logger.info(
                "Skipping a PII message - "
                "uuid hex: {0}, "
                "schema_id: {1}, "
                "timestamp: {2}, "
                "type: {3}".format(
                    message.uuid_hex,
                    message.schema_id,
                    message.timestamp,
                    message.message_type.name
                )
            )
            return
        self._add_message_to_buffer(message)
        self.position_data_tracker.record_message_buffered(message)
        self._flush_if_necessary()

    def flush_buffered_messages(self):
        produce_method = (self._publish_produce_requests_dry_run
                          if self.dry_run else self._publish_produce_requests)
        produce_method(self._generate_produce_requests())
        self._reset_message_buffer()

    def close(self):
        self.flush_buffered_messages()
        self.kafka_client.close()

    def _publish_produce_requests(self, requests):
        """It will try to publish all the produce requests for topics, and
        retry a number of times until either all the requests are successfully
        published or it can no longer retry, in which case, the exception will
        be thrown.

        Each time the requests that are successfully published in the previous
        round will be removed from the requests and won't be published again.
        """
        unpublished_requests = list(requests)
        retry_handler = RetryHandler(self.kafka_client, unpublished_requests)

        def has_requests_to_be_sent():
            return bool(retry_handler.requests_to_be_sent)

        retry_handler = retry_on_condition(
            retry_policy=self._publish_retry_policy,
            retry_conditions=[Predicate(has_requests_to_be_sent)],
            func_to_retry=self._publish_requests,
            use_previous_result_as_param=True,
            retry_handler=retry_handler
        )
        if retry_handler.has_unpublished_request:
            raise MaxRetryError(last_result=retry_handler)

    def _publish_requests(self, retry_handler):
        """Main function to publish message requests.  This function is wrapped
        with retry function and will be retried based on specified retry policy

        Args:
            retry_handler: :class:`data_pipeline._producer_retry.RetryHandler`
                that determines which messages should be retried next time.
        """
        if not retry_handler.requests_to_be_sent:
            return retry_handler

        responses = self._try_send_produce_requests(
            retry_handler.requests_to_be_sent
        )

        retry_handler.update_requests_to_be_sent(
            responses,
            self.position_data_tracker.topic_to_kafka_offset_map
        )
        self._record_success_requests(retry_handler.success_topic_stats_map)
        return retry_handler

    def _try_send_produce_requests(self, requests):
        # Either it throws exceptions and none of them succeeds, or it returns
        # responses of all the requests (success or fail response).
        try:
            return self.kafka_client.send_produce_request(
                payloads=requests,
                acks=get_config().kafka_client_ack_count,
                fail_on_error=False
            )
        except Exception:
            # Exceptions like KafkaUnavailableError, LeaderNotAvailableError,
            # UnknownTopicOrPartitionError, etc., are not controlled by
            # `fail_on_error` flag and could be thrown from the kafka client,
            # and fail all the requests.  We will retry all the requests until
            # either all of them are successfully published or it exceeds the
            # maximum retry criteria.
            return []

    def _record_success_requests(self, success_topic_stats_map):
        for topic_partition, stats in success_topic_stats_map.iteritems():
            topic = topic_partition.topic_name
            assert stats.message_count == len(self.message_buffer[topic])
            self.position_data_tracker.record_messages_published(
                topic=topic,
                offset=stats.original_offset,
                message_count=stats.message_count
            )
            self.message_buffer.pop(topic)

    def _publish_produce_requests_dry_run(self, requests):
        for request in requests:
            self._publish_single_request_dry_run(request)

    def _publish_single_request_dry_run(self, request):
        topic = request.topic
        message_count = len(request.messages)
        self.position_data_tracker.record_messages_published(
            topic,
            -1,
            message_count
        )

    def _is_ready_to_flush(self):
        time_limit = get_config().kafka_producer_flush_time_limit_seconds
        return (self._automatic_flush_enabled and (
            (time.time() - self.start_time) >= time_limit or
            self.message_buffer_size >= get_config().kafka_producer_buffer_size
        ))

    def _flush_if_necessary(self):
        if self._is_ready_to_flush():
            self.flush_buffered_messages()

    def _add_message_to_buffer(self, message):
        topic = message.topic
        message = self._prepare_message(message)

        self.message_buffer[topic].append(message)
        self.message_buffer_size += 1

    def _generate_produce_requests(self):
        return [
            ProduceRequest(topic=topic, partition=0, messages=messages)
            for topic, messages in self._generate_prepared_topic_and_messages()
        ]

    def _generate_prepared_topic_and_messages(self):
        return self.message_buffer.iteritems()

    def _prepare_message(self, message):
        return _prepare(_EnvelopeAndMessage(envelope=self.envelope, message=message))

    def _reset_message_buffer(self):
        if not hasattr(self, 'message_buffer_size') or self.message_buffer_size > 0:
            self.producer_position_callback(self.position_data_tracker.get_position_data())
        self.start_time = time.time()
        self.message_buffer = defaultdict(list)
        self.message_buffer_size = 0
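The class docstring describes the intended call pattern, but none of the surrounding code shows it end to end. A minimal usage sketch based only on the public methods defined above (publish, disable_automatic_flushing, flush_buffered_messages, close); checkpoint_position, save_checkpoint and build_messages are placeholders, not part of the library:

def checkpoint_position(position_data):
    # Hypothetical callback: persist the latest successfully published positions
    # somewhere durable (save_checkpoint is a placeholder, not a library call).
    save_checkpoint(position_data)

producer = KafkaProducer(producer_position_callback=checkpoint_position)
try:
    # Buffer a batch atomically: no time- or size-based flush can fire mid-batch.
    with producer.disable_automatic_flushing():
        for message in build_messages():  # build_messages is a placeholder
            producer.publish(message)
    producer.flush_buffered_messages()
finally:
    producer.close()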
Example #11
    stream.add_filter('record-type', 'ribs')
    stream.add_filter('record-type', 'updates')

    stream.add_interval_filter(last_ts, 0)

    # Start the stream
    stream.start()

    client = KafkaClient(args.our_servers.split(","))
    count = 0
    for batch in group_by_n(
            messages_from_internal(iterate_stream(stream, args.collector)),
            1000):
        req = ProduceRequest("rib-{}".format(args.collector), 0, batch)
        for msg in reversed(req.messages):
            if msg.value is None:
                continue
            last_timestamp = json.loads(msg.value)["timestamp"]
            break

        count += len(batch)
        logger.info("sending %i", count)
        res = client.send_produce_request([req])
        try:
            # this is a bit buggy but it will do for now
            with open(save_file, "w") as f:
                f.write(str(last_timestamp))
        except:
            logger.warning("could not write offsets to %s", save_file)
            pass
Example #12
class KafkaProducer(object):
    """The KafkaProducer deals with buffering messages that need to be published
    into Kafka, preparing them for publication, and ultimately publishing them.

    Args:
        producer_position_callback (function): The producer position callback
            is called when the KafkaProducer is instantiated, and every time
            messages are published to notify the producer of current position
            information of successfully published messages.
        dry_run (Optional[bool]): When dry_run mode is on, the producer won't
            talk to a real Kafka topic or the real Schematizer.  Defaults to False.
    """
    @cached_property
    def envelope(self):
        return Envelope()

    def __init__(self, producer_position_callback, dry_run=False):
        self.producer_position_callback = producer_position_callback
        self.dry_run = dry_run
        self.kafka_client = KafkaClient(get_config().cluster_config.broker_list)
        self.position_data_tracker = PositionDataTracker()
        self._reset_message_buffer()
        self.skip_messages_with_pii = get_config().skip_messages_with_pii
        self._publish_retry_policy = RetryPolicy(
            ExpBackoffPolicy(with_jitter=True),
            max_retry_count=get_config().producer_max_publish_retry_count
        )
        self._automatic_flush_enabled = True

    @contextmanager
    def disable_automatic_flushing(self):
        """Prevents the producer from flushing automatically (e.g. for timeouts
        or batch size) while the context manager is open.
        """
        try:
            self._automatic_flush_enabled = False
            yield
        finally:
            self._automatic_flush_enabled = True

    def wake(self):
        """Should be called periodically if we're not otherwise waking up by
        publishing, to ensure that messages are actually published.
        """
        # if we haven't woken up in a while, we may need to flush messages
        self._flush_if_necessary()

    def publish(self, message):
        if message.contains_pii and self.skip_messages_with_pii:
            logger.info(
                "Skipping a PII message - "
                "uuid hex: {0}, "
                "schema_id: {1}, "
                "timestamp: {2}, "
                "type: {3}".format(
                    message.uuid_hex,
                    message.schema_id,
                    message.timestamp,
                    message.message_type.name
                )
            )
            return
        self._add_message_to_buffer(message)
        self.position_data_tracker.record_message_buffered(message)
        self._flush_if_necessary()

    def flush_buffered_messages(self):
        produce_method = (self._publish_produce_requests_dry_run
                          if self.dry_run else self._publish_produce_requests)
        produce_method(self._generate_produce_requests())
        self._reset_message_buffer()

    def close(self):
        self.flush_buffered_messages()
        self.kafka_client.close()

    def _publish_produce_requests(self, requests):
        """It will try to publish all the produce requests for topics, and
        retry a number of times until either all the requests are successfully
        published or it can no longer retry, in which case, the exception will
        be thrown.

        Each time the requests that are successfully published in the previous
        round will be removed from the requests and won't be published again.
        """
        unpublished_requests = list(requests)
        retry_handler = RetryHandler(self.kafka_client, unpublished_requests)

        def has_requests_to_be_sent():
            return bool(retry_handler.requests_to_be_sent)

        retry_handler = retry_on_condition(
            retry_policy=self._publish_retry_policy,
            retry_conditions=[Predicate(has_requests_to_be_sent)],
            func_to_retry=self._publish_requests,
            use_previous_result_as_param=True,
            retry_handler=retry_handler
        )
        if retry_handler.has_unpublished_request:
            raise MaxRetryError(last_result=retry_handler)

    def _publish_requests(self, retry_handler):
        """Main function to publish message requests.  This function is wrapped
        with retry function and will be retried based on specified retry policy

        Args:
            retry_handler: :class:`data_pipeline._producer_retry.RetryHandler`
                that determines which messages should be retried next time.
        """
        if not retry_handler.requests_to_be_sent:
            return retry_handler

        responses = self._try_send_produce_requests(
            retry_handler.requests_to_be_sent
        )

        topics_watermarks = self._populate_topics_to_offset_map(responses)
        self.position_data_tracker.topic_to_kafka_offset_map.update(
            topics_watermarks
        )

        retry_handler.update_requests_to_be_sent(
            responses,
            self.position_data_tracker.topic_to_kafka_offset_map
        )
        self._record_success_requests(retry_handler.success_topic_stats_map)
        return retry_handler

    def _populate_topics_to_offset_map(self, responses):
        topics_from_responses = [
            response.topic for response in responses
            if isinstance(response, ProduceResponse)
        ]

        topics_watermarks = get_topics_watermarks(
            kafka_client=self.kafka_client,
            topics=topics_from_responses,
            raise_on_error=True
        )
        topics_watermarks = {
            topic: partition_offsets[0].highmark
            for topic, partition_offsets in topics_watermarks.iteritems()
        }
        return topics_watermarks

    def _try_send_produce_requests(self, requests):
        # Either it throws exceptions and none of them succeeds, or it returns
        # responses of all the requests (success or fail response).
        try:
            return self.kafka_client.send_produce_request(
                payloads=requests,
                acks=get_config().kafka_client_ack_count,
                fail_on_error=False
            )
        except Exception:
            # Exceptions like KafkaUnavailableError, LeaderNotAvailableError,
            # UnknownTopicOrPartitionError, etc., are not controlled by
            # `fail_on_error` flag and could be thrown from the kafka client,
            # and fail all the requests.  We will retry all the requests until
            # either all of them are successfully published or it exceeds the
            # maximum retry criteria.
            return []

    def _record_success_requests(self, success_topic_stats_map):
        for topic_partition, stats in success_topic_stats_map.iteritems():
            topic = topic_partition.topic_name
            assert stats.message_count == len(self.message_buffer[topic])
            self.position_data_tracker.record_messages_published(
                topic=topic,
                offset=stats.original_offset,
                message_count=stats.message_count
            )
            self.message_buffer.pop(topic)

    def _publish_produce_requests_dry_run(self, requests):
        for request in requests:
            self._publish_single_request_dry_run(request)

    def _publish_single_request_dry_run(self, request):
        topic = request.topic
        message_count = len(request.messages)
        self.position_data_tracker.record_messages_published(
            topic,
            -1,
            message_count
        )

    def _is_ready_to_flush(self):
        time_limit = get_config().kafka_producer_flush_time_limit_seconds
        return (self._automatic_flush_enabled and (
            (time.time() - self.start_time) >= time_limit or
            self.message_buffer_size >= get_config().kafka_producer_buffer_size
        ))

    def _flush_if_necessary(self):
        if self._is_ready_to_flush():
            self.flush_buffered_messages()

    def _add_message_to_buffer(self, message):
        topic = message.topic
        message = self._prepare_message(message)

        self.message_buffer[topic].append(message)
        self.message_buffer_size += 1

    def _generate_produce_requests(self):
        return [
            ProduceRequest(topic=topic, partition=0, messages=messages)
            for topic, messages in self._generate_prepared_topic_and_messages()
        ]

    def _generate_prepared_topic_and_messages(self):
        return self.message_buffer.iteritems()

    def _prepare_message(self, message):
        return _prepare(_EnvelopeAndMessage(envelope=self.envelope, message=message))

    def _reset_message_buffer(self):
        if not hasattr(self, 'message_buffer_size') or self.message_buffer_size > 0:
            self.producer_position_callback(self.position_data_tracker.get_position_data())
        self.start_time = time.time()
        self.message_buffer = defaultdict(list)
        self.message_buffer_size = 0
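The retry contract documented in _publish_produce_requests (drop the requests that succeeded, retry the rest, raise once the policy is exhausted) can be illustrated without the RetryHandler and retry_on_condition machinery. A simplified, self-contained sketch of that loop, not the library's actual implementation:

def publish_with_retries(kafka_client, requests, max_retry_count=3):
    # Simplified sketch of the documented behaviour: requests that succeed in a
    # round are dropped, the rest are retried until the budget is exhausted.
    pending = list(requests)
    for _ in range(max_retry_count + 1):
        if not pending:
            return
        try:
            responses = kafka_client.send_produce_request(
                payloads=pending, fail_on_error=False)
        except Exception:
            # Client-level failures (metadata, leadership, ...) fail the whole
            # round; keep every request and try again.
            continue
        failed_topics = set(resp.topic for resp in responses
                            if getattr(resp, 'error', 0) != 0)
        # A request whose topic came back without an error is treated as published.
        pending = [req for req in pending if req.topic in failed_topics]
    if pending:
        raise RuntimeError("gave up on %d unpublished produce requests" % len(pending))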
Example #13
            relation.labels(args.collector).inc()
            filter_out = True
        if "direct" in msg:
            connected.labels(args.collector).inc()
            filter_out = True
        if msg.get("caida_private", False) is True:
            caida_private.labels(args.collector).inc()
            filter_out = True
        if msg.get("caida_as2org", False) is True:
            caida_as2org.labels(args.collector).inc()
            filter_out = True
        if msg.get("caida_relation", False) is True:
            caida_relation.labels(args.collector).inc()
            filter_out = True
        if msg.get("caida_cone", False) is True:
            caida_cone.labels(args.collector).inc()
            filter_out = True
        if msg.get("caida_as2rel", False) is True:
            caida_as2rel.labels(args.collector).inc()
            filter_out = True

        all_events.labels(args.collector, filter_out).inc()

        if filter_out:
            continue

        abnormal.labels(args.collector).inc()

        client.send_produce_request([
            ProduceRequest("conflicts", PARTITIONS[args.collector],
                           [create_message(json.dumps(msg))])
        ])

Example #14
#Listing A.1.5
from kafka import KafkaClient, SimpleProducer

kafka = KafkaClient("localhost:9092")
producer = SimpleProducer(kafka,
                          async=False,
                          req_acks=SimpleProducer.ACK_AFTER_CLUSTER_COMMIT,
                          ack_timeout=2000)

producer.send_messages("test-replicated-topic", "Hello Kafka Cluster!")
producer.send_messages("test-replicated-topic", "Message to be replicated.")
producer.send_messages("test-replicated-topic", "And so is this!")

#Listing A.1.8
from kafka import KafkaClient
from kafka.common import ProduceRequest
from kafka.protocol import KafkaProtocol, create_message

kafka = KafkaClient("localhost:9092")

f = open('A1.data', 'r')

for line in f:
    s = line.split("\t")[0]
    part = abs(hash(s)) % 3
    req = ProduceRequest(topic="click-streams",
                         partition=part,
                         messages=[create_message(s)])
    resps = kafka.send_produce_request(payloads=[req], fail_on_error=True)
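One caveat about Listing A.1.8: Python's built-in hash() is not guaranteed to be stable across interpreter versions or runs (hash randomization is on by default in Python 3), so abs(hash(s)) % 3 can send the same key to different partitions between runs. A hedged variant that keys on a stable digest instead:

import hashlib

def stable_partition(key, num_partitions=3):
    # md5 of the key is identical on every run, so a given key always lands on
    # the same partition regardless of interpreter version or hash randomization.
    digest = hashlib.md5(key.encode("utf-8")).hexdigest()
    return int(digest, 16) % num_partitions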
Example #15
for message in consumer:
    print(message)

#Listing A.1.5
from kafka import KafkaClient, SimpleProducer

kafka = KafkaClient("localhost:9092")
producer = SimpleProducer(kafka, async=False,
                          req_acks=SimpleProducer.ACK_AFTER_CLUSTER_COMMIT,
                          ack_timeout=2000)

producer.send_messages("test-replicated-topic", "Hello Kafka Cluster!")
producer.send_messages("test-replicated-topic", "Message to be replicated.")
producer.send_messages("test-replicated-topic", "And so is this!")

#Listing A.1.8
from kafka import KafkaClient
from kafka.common import ProduceRequest
from kafka.protocol import KafkaProtocol, create_message

kafka = KafkaClient("localhost:9092")

f = open('A1.data', 'r')

for line in f:
    s = line.split("\t")[0]
    part = abs(hash(s)) % 3
    req = ProduceRequest(topic="click-streams", partition=part,
                         messages=[create_message(s)])
    resps = kafka.send_produce_request(payloads=[req], fail_on_error=True)
Example #16
                             bootstrap_servers=args.ripe_servers.split(","))

    save_file = "offsets-{}".format(args.collector)
    if args.from_beginning:
        logger.info("starting from scratch")
        offsets = {("raw-{}".format(args.collector), i): 0 for i in range(0, 10)}
        consumer.set_topic_partitions(offsets)
    elif os.path.exists(save_file):
        with open(save_file, "r") as f:
            offsets = cPickle.load(f)
        logger.info("loading offsets from file: %s", offsets)
        consumer.set_topic_partitions(offsets)
    else:
        logger.info("starting from last messages")

    client = KafkaClient(args.our_servers.split(","))
    count = 0
    for batch in group_by_n(messages_from_internal(iterate_messages(consumer, args.collector)), 1000):
        req = ProduceRequest("rib-{}".format(args.collector), 0, batch)
        count += len(batch)
        logger.info("sending %i", count)
        res = client.send_produce_request([req])
        offsets = consumer.offsets("fetch")
        try:
            # this is a bit buggy but it will do for now
            with open(save_file, "w") as f:
                f.write(cPickle.dumps(offsets))
        except:
            logger.warning("could not write offsets to %s", save_file)
            pass