Example #1
import logging
import pickle

from kafka import KafkaProducer, SimpleClient

log = logging.getLogger(__name__)


class DFProducer:
    def __init__(self, bootstrap_servers):
        self.kafka_client = SimpleClient(bootstrap_servers)
        self.producer = KafkaProducer(
            bootstrap_servers=bootstrap_servers,
            api_version=(0, 10),
            retries=3,
            max_block_ms=60 * 1000,
            value_serializer=lambda m: pickle.dumps(m))

    def produce(self, reader):
        data_df = reader.read_data()
        if data_df.empty:
            log.info('No rows were collected in this run...')
            return
        key = reader.get_table_profile().get_key()
        bkey = bytes(key, encoding="utf8")
        tpc = reader.get_table_profile().get_pub_topic()
        tp_part = len(self.kafka_client.get_partition_ids_for_topic(tpc))
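        # Spread rows across partitions round-robin via the DataFrame index
        # (assumes a zero-based integer index, e.g. after reset_index()).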
        for idx, row in data_df.iterrows():
            part = idx % tp_part
            self.producer.send(tpc, value=row, key=bkey,
                               partition=part).add_errback(
                                   self.on_send_error, key, tpc)
        self.producer.flush()
        log.info('Successfully sent data for table key [{}] ({} rows) to Kafka...'.format(
            key, data_df.shape[0]))

    # Note: kafka-python's Future.add_errback binds the extra arguments first
    # and passes the exception last, hence this parameter order.
    def on_send_error(self, key, topic, ex):
        log.error('Failed to send data for table [{}] to Kafka topic [{}]: {}'.format(
            key, topic, ex))
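
To make the reader interface that produce() relies on concrete, here is a minimal hypothetical stub (the class names and broker address are placeholders, not part of the original project):

import pandas as pd

class StubProfile:
    """Hypothetical profile exposing the two getters produce() calls."""
    def get_key(self):
        return 'demo-key'

    def get_pub_topic(self):
        return 'demo-topic'

class StubReader:
    """Hypothetical reader whose read_data() returns a DataFrame."""
    def read_data(self):
        return pd.DataFrame({'value': [1, 2, 3]})

    def get_table_profile(self):
        return StubProfile()

producer = DFProducer('localhost:9092')  # placeholder broker address
producer.produce(StubReader())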
Example #2
File: utils.py Project: luzbetak/sparkly
def kafka_get_topics_offsets(host, topic, port=9092):
    """Return available partitions and their offsets for the given topic.

    Args:
        host (str): Kafka host.
        topic (str): Kafka topic.
        port (int): Kafka port.

    Returns:
        [(int, int, int)]: [(partition, start_offset, end_offset)].
    """
    brokers = ['{}:{}'.format(host, port)]
    client = SimpleClient(brokers)

    offsets = []
    partitions = client.get_partition_ids_for_topic(topic)

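    # In Kafka's legacy offset API, a time of -1 requests the latest offset
    # and -2 the earliest; the final argument (1) caps the number of offsets
    # returned per partition.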
    offsets_responses_end = client.send_offset_request([
        OffsetRequestPayload(topic, partition, -1, 1)
        for partition in partitions
    ])
    offsets_responses_start = client.send_offset_request([
        OffsetRequestPayload(topic, partition, -2, 1)
        for partition in partitions
    ])

    for start_offset, end_offset in zip(offsets_responses_start,
                                        offsets_responses_end):
        offsets.append((start_offset.partition, start_offset.offsets[0],
                        end_offset.offsets[0]))

    return offsets
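
A quick usage sketch (host and topic are placeholders); summing end minus start across partitions gives the number of messages currently retained in the topic:

offsets = kafka_get_topics_offsets('localhost', 'my-topic')
retained = sum(end - start for _, start, end in offsets)
print(retained)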
Example #3
    def _get_partition_ids(self, topic, bootstrap_server):
        """
        Get the number of partitions for a specific topic.
        :param topic: (string) topic name
        :param bootstrap_server: (string) single bootstrap server 'host:port'
        :return: (int) no. of partitions
        """
        client = SimpleClient(bootstrap_server)
        topic_partition_ids = client.get_partition_ids_for_topic(topic.encode('utf-8'))

        return len(topic_partition_ids)
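
For comparison, a sketch of the same count using kafka-python's current consumer API instead of the deprecated SimpleClient (broker address and topic are placeholders):

from kafka import KafkaConsumer

consumer = KafkaConsumer(bootstrap_servers='localhost:9092')
# partitions_for_topic returns a set of partition ids, or None if unknown.
partitions = consumer.partitions_for_topic('my-topic')
num_partitions = len(partitions) if partitions else 0
consumer.close()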
Example #4
import hashlib
import logging

from confluent_kafka import Producer
from kafka import KafkaClient  # legacy client API (renamed SimpleClient in kafka-python 1.x)


class KafkaWriter:
    def __init__(self, servers, topic):
        self._servers = servers
        self._topic = topic
        self._client = None
        self._partitions_count = 0

    def open(self):
        self._boot_topic()
        self._producer = Producer({'bootstrap.servers': self._servers})

    def write(self, event):
        self._producer.poll(0)

        # Asynchronously produce a message; the delivery report callback
        # will be triggered (from poll or flush) once the message has been
        # successfully delivered or has permanently failed.
        self._producer.produce(self._topic,
                               event.to_bytes(),
                               partition=self.partition_for_key(
                                   event.get_thread_id()),
                               callback=KafkaWriter.delivery_report)

    def close(self):
        self._producer.flush()
        self._client.close()

    def partition_for_key(self, thread_id):
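        # hashlib.sha512 requires bytes, so thread_id must already be bytes;
        # encode it first if events carry a str id.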
        return int(hashlib.sha512(thread_id).hexdigest(),
                   16) % self._partitions_count

    def _boot_topic(self):
        self._client = KafkaClient(self._servers)

        if not self._client.has_metadata_for_topic(self._topic):
            raise IOError('Kafka topic was not found.')

        self._partitions_count = len(
            self._client.get_partition_ids_for_topic(self._topic))

        if self._partitions_count == 0:
            raise IOError('Kafka topic does not have any partition.')

    @staticmethod
    def delivery_report(err, msg):
        if err is not None:
            logging.error('Event delivery failed: {}'.format(err))
        elif logging.getLogger().getEffectiveLevel() == logging.DEBUG:
            logging.debug('Event delivered to {} [{}]'.format(
                msg.topic(), msg.partition()))
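
A minimal usage sketch; the event type is hypothetical and only needs the two methods that write() and partition_for_key() call:

class StubEvent:
    """Hypothetical event exposing the two methods KafkaWriter uses."""
    def to_bytes(self):
        return b'payload'

    def get_thread_id(self):
        return b'thread-1'  # bytes, since partition_for_key hashes it

writer = KafkaWriter('localhost:9092', 'events')  # placeholder address/topic
writer.open()
writer.write(StubEvent())
writer.close()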
Example #5
class KafkaIntegrationTestCase(unittest.TestCase):
    create_client = True
    topic = None
    zk = None
    server = None

    def setUp(self):
        super(KafkaIntegrationTestCase, self).setUp()
        if not os.environ.get('KAFKA_VERSION'):
            self.skipTest('Integration test requires KAFKA_VERSION')

        if not self.topic:
            topic = "%s-%s" % (self.id()[self.id().rindex(".") + 1:], random_string(10))
            self.topic = topic

        if self.create_client:
            self.client = SimpleClient('%s:%d' % (self.server.host, self.server.port))

        timeout = time.time() + 30
        while time.time() < timeout:
            try:
                self.client.load_metadata_for_topics(self.topic, ignore_leadernotavailable=False)
                if self.client.has_metadata_for_topic(self.topic):
                    break
            except (LeaderNotAvailableError, InvalidTopicError):
                time.sleep(1)
        else:
            raise KafkaTimeoutError('Timeout loading topic metadata!')


        # Ensure topic partitions have been created on all brokers to avoid UnknownPartitionErrors
        # TODO: It might be a good idea to move this to self.client.ensure_topic_exists
        for partition in self.client.get_partition_ids_for_topic(self.topic):
            while True:
                try:
                    req = OffsetRequestPayload(self.topic, partition, -1, 100)
                    self.client.send_offset_request([req])
                    break
                except (NotLeaderForPartitionError, UnknownTopicOrPartitionError, FailedPayloadsError):
                    if time.time() > timeout:
                        raise KafkaTimeoutError('Timeout loading topic metadata!')
                    time.sleep(.1)

        self._messages = {}

    def tearDown(self):
        super(KafkaIntegrationTestCase, self).tearDown()
        if not os.environ.get('KAFKA_VERSION'):
            return

        if self.create_client:
            self.client.close()

    def current_offset(self, topic, partition):
        try:
            offsets, = self.client.send_offset_request([OffsetRequestPayload(topic,
                                                                             partition, -1, 1)])
        except Exception:
            # XXX: We've seen some UnknownErrors here and can't debug w/o server logs
            self.zk.child.dump_logs()
            self.server.child.dump_logs()
            raise
        else:
            return offsets.offsets[0]

    def msgs(self, iterable):
        return [self.msg(x) for x in iterable]

    def msg(self, s):
        if s not in self._messages:
            self._messages[s] = '%s-%s-%s' % (s, self.id(), str(uuid.uuid4()))

        return self._messages[s].encode('utf-8')

    def key(self, k):
        return k.encode('utf-8')
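
A hypothetical subclass sketch showing how the fixture's helpers fit together in a test (the produce step is elided):

class MyOffsetTest(KafkaIntegrationTestCase):
    def test_offsets_never_move_backwards(self):
        start = self.current_offset(self.topic, 0)
        # ... produce self.msg('a') to partition 0 here ...
        end = self.current_offset(self.topic, 0)
        self.assertGreaterEqual(end, start)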
Example #6
    def spoorer(self):
        kafka_client = None
        try:
            kafka_client = SimpleClient(self.kafka_hosts, timeout=self.timeout)
        except Exception:
            print("Error: cannot connect to the Kafka broker.")
            sys.exit(1)
        else:
            kafka_topics = kafka_client.topics
        finally:
            if kafka_client is not None:
                kafka_client.close()

        try:
            zookeeper_client = KazooClient(hosts=self.zookeeper_hosts,
                                           read_only=True,
                                           timeout=self.timeout)
            zookeeper_client.start()
        except Exception:
            print("Error: cannot connect to the ZooKeeper server.")
            sys.exit(1)

        try:
            groups = list(map(
                str,
                zookeeper_client.get_children(self.zookeeper_url +
                                              'consumers')))
        except NoNodeError:
            print("Error: invalid ZooKeeper url.")
            zookeeper_client.stop()
            sys.exit(2)
        else:
            for group in groups:
                if 'offsets' not in zookeeper_client.get_children(
                        self.zookeeper_url + 'consumers/%s' % group):
                    continue
                topic_path = 'consumers/%s/offsets' % (group)
                topics = list(map(
                    str,
                    zookeeper_client.get_children(self.zookeeper_url +
                                                  topic_path)))
                if not topics:
                    continue

                for topic in topics:
                    if topic not in self.white_topic_group.keys():
                        continue
                    elif group not in self.white_topic_group[topic].replace(
                            ' ', '').split(','):
                        continue
                    partition_path = 'consumers/%s/offsets/%s' % (group, topic)
                    partitions = list(map(
                        int,
                        zookeeper_client.get_children(self.zookeeper_url +
                                                      partition_path)))

                    for partition in partitions:
                        base_path = 'consumers/%s/%s/%s/%s' % (
                            group, '%s', topic, partition)
                        owner_path, offset_path = base_path % 'owners', base_path % 'offsets'
                        offset = zookeeper_client.get(self.zookeeper_url +
                                                      offset_path)[0]
                        try:
                            owner = zookeeper_client.get(self.zookeeper_url +
                                                         owner_path)[0]
                        except NoNodeError:
                            owner = 'null'

                        metric = {
                            'datetime': time.strftime("%Y-%m-%d %H:%M:%S",
                                                      time.localtime()),
                            'topic': topic,
                            'group': group,
                            'partition': int(partition),
                            'logsize': None,
                            'offset': int(offset),
                            'lag': None,
                            'owner': owner,
                        }
                        self.result.append(metric)
        finally:
            zookeeper_client.stop()

        kafka_consumer = None
        try:
            kafka_consumer = KafkaConsumer(bootstrap_servers=self.kafka_hosts)
        except Exception:
            print("Error: cannot connect to the Kafka broker.")
            sys.exit(1)
        else:
            for kafka_topic in kafka_topics:
                self.kafka_logsize[kafka_topic] = {}
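                # get_partition_ids_for_topic reads metadata cached when the
                # SimpleClient first connected, so it still works after close().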
                partitions = kafka_client.get_partition_ids_for_topic(
                    kafka_topic)

                for partition in partitions:
                    offset = kafka_consumer.get_partition_offsets(
                        kafka_topic, partition, -1, 1)[0]
                    self.kafka_logsize[kafka_topic][partition] = offset

            with open(self.log_file, 'w') as f1, \
                    open(self.log_day_file, 'a') as f2:
                for metric in self.result:
                    logsize = self.kafka_logsize[metric['topic']][
                        metric['partition']]
                    metric['logsize'] = int(logsize)
                    # Lag = broker log-end offset minus the group's committed offset.
                    metric['lag'] = int(logsize) - int(metric['offset'])

                    f1.write(json.dumps(metric, sort_keys=True) + '\n')
                    f1.flush()
                    f2.write(json.dumps(metric, sort_keys=True) + '\n')
                    f2.flush()
        finally:
            if kafka_consumer is not None:
                kafka_consumer.close()

        return ''
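
For reference, the lag computed above is simply the gap between the broker's log-end offset and the group's committed offset; a worked example with hypothetical numbers:

logsize = 1000  # broker's log-end offset for the partition
offset = 950    # consumer group's committed offset read from ZooKeeper
lag = logsize - offset  # 50 messages not yet consumed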