Example #1
class Partition(object):
    def __init__(self, cluster, topic, broker, number):
        self.cluster = cluster
        self.topic = topic
        self.broker = broker
        self.number = int(number)

    __repr__ = attribute_repr('topic', 'broker', 'number')

    def earliest_offset(self):
        return self.broker.client.offsets(self.topic.name, self.number,
                                          OFFSET_EARLIEST, 1)[0]

    def latest_offset(self):
        return self.broker.client.offsets(self.topic.name, self.number,
                                          OFFSET_LATEST, 1)[0]

    def publish(self, data):
        """
        Publishes one or more messages to this partition.
        """
        if isinstance(data, basestring):
            messages = [data]
        elif isinstance(data, collections.Sequence):
            messages = data
        else:
            raise TypeError('data must be a string or a sequence of strings')

        return self.broker.client.produce(self.topic.name, self.number,
                                          messages)

    def fetch(self, offset, size):
        return self.broker.client.fetch(self.topic.name, self.number, offset,
                                        size)

    def __hash__(self):
        return hash((self.topic, self.broker.id, self.number))

    def __eq__(self, other):
        # Equality is delegated to the hash of (topic, broker ID, number).
        return hash(self) == hash(other)

    def __ne__(self, other):
        return not self == other
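
A minimal usage sketch, not part of the original source (the connected ``cluster`` object and the topic name are assumptions):

>>> partition = iter(cluster.topics['events'].partitions).next()
>>> partition.earliest_offset()
0
>>> partition.publish('hello world')
>>> partition.publish(['first', 'second'])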
Example #2
class PartitionMap(object):
    """
    Manages the partitions associated with a topic on a per-broker basis.

    :param cluster: The cluster that this partition map is associated with.
    :type cluster: :class:`samsa.cluster.Cluster`
    :param topic: The topic that this partition map is associated with.
    :type topic: :class:`samsa.topics.Topic`
    """
    def __init__(self, cluster, topic):
        self.cluster = cluster
        self.topic = topic

        self.__brokers = {}

        self.topic_path = '/brokers/topics/%s' % self.topic.name
        self._topic_watcher = DataWatch(self.cluster.zookeeper,
                                        self.topic_path,
                                        self._topic_changed,
                                        allow_missing_node=True)

    __repr__ = attribute_repr('topic')

    def _topic_changed(self, data, stat):
        if stat:
            self._topic_child_watcher = ChildrenWatch(self.cluster.zookeeper,
                                                      self.topic_path,
                                                      self._configure)

    def _configure(self, broker_ids):
        logger.info('Looking up brokers for %s...', self)

        brokers = map(self.cluster.brokers.get, map(int, broker_ids))

        # Add any broker IDs that are not already present in the mapping.
        for broker in brokers:
            if broker not in self.__brokers:
                partitionset = PartitionSet(self.cluster, self.topic, broker)
                logger.info('Discovered new partition set: %s', partitionset)
                self.__brokers[broker] = partitionset

        # Remove any brokers that are no longer present in the mapping.
        dead = set(self.__brokers.keys()) - set(brokers)
        for broker in dead:
            logger.info('Removing broker %s from %s', broker, self)
            del self.__brokers[broker]

    def __len__(self):
        """
        Returns the total number of partitions for this partition map,
        including virtual partitions.
        """
        partitions = list(iter(self))
        return len(partitions)

    def __iter__(self):
        """
        Returns an iterator containing every known partition for this topic.

        This includes "virtual" partitions for brokers that are present in the
        cluster, but have not yet registered their topic/partitions with the
        ZooKeeper cluster.
        """
        return itertools.chain(self.actual, self.virtual)

    @property
    def _partition_sets(self):
        """
        Returns a list of :class:`samsa.partitions.PartitionSet`.
        """
        return self.__brokers.values()

    @property
    def actual(self):
        """
        Returns an iterator containing all of the partitions for this topic
        that have been registered by a broker in ZooKeeper.
        """
        return itertools.chain.from_iterable(
            itertools.imap(iter, self.__brokers.values()))

    @property
    def virtual(self):
        """
        Returns an iterator containing "virtual" partitions for this topic.

        Virtual partitions are placeholder partitions for brokers that are
        known to be alive but are not aware of a topic. Since these brokers
        haven't seen the topic yet, they will not have published the number of
        partitions that they're serving. Every broker should be able to accept
        writes on the 0th partition for each topic, however, so the virtual
        partitions provide partition objects for those partitions that have
        not yet been registered but are assumed to exist.
        """
        uninitialized_brokers = set(self.cluster.brokers.values()) - \
            set(self.__brokers.keys())
        create_virtual_partitionset = functools.partial(PartitionSet,
                                                        cluster=self.cluster,
                                                        topic=self.topic,
                                                        virtual=True)
        virtual_iterator = lambda broker: \
            iter(create_virtual_partitionset(broker=broker))
        return itertools.chain.from_iterable(
            itertools.imap(virtual_iterator, uninitialized_brokers))
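
A hedged inspection sketch, not from the original source (``cluster`` and the topic name are assumptions; outputs are illustrative):

>>> pmap = cluster.topics['events'].partitions
>>> len(pmap)                            # actual + virtual partitions
2
>>> [p.broker.id for p in pmap.actual]   # registered in ZooKeeper
[0, 1]
>>> list(pmap.virtual)                   # brokers not yet aware of the topic
[]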
Example #3
class PartitionSet(object):
    """
    Manages the partitions for a topic on a single broker.

    :param cluster: The cluster that this partition set is associated with.
    :type cluster: :class:`samsa.cluster.Cluster`
    :param topic: The topic that this partition set is associated with.
    :type topic: :class:`samsa.topics.Topic`
    :param broker: The broker this partition set is associated with.
    :type broker: :class:`samsa.brokers.Broker`
    :param virtual: Whether this is a "virtual" partition set or not. Virtual
        partition sets are used when a broker does not have any data associated
        with a specific topic.
    :type virtual: bool
    """
    def __init__(self, cluster, topic, broker, virtual=False):
        self.cluster = cluster
        self.topic = topic
        self.broker = broker
        self.virtual = virtual

        # TODO: At some point it might be wise to enable a "hint" size for
        # virtual partitions -- this way if brokers are serving a large number
        # of partitions, we can still maintain a reasonably even distribution
        # in a predictable environment. This might cause some problems in more
        # dynamic/less homogeneous environments, however, since there's no good
        # way to track down a failure on a produce request to an invalid
        # partition.
        self.__count = 1

        path = '/brokers/topics/%s/%s' % (self.topic.name, self.broker.id)
        if not self.virtual:
            self._partition_watcher = DataWatch(self.cluster.zookeeper,
                                                path,
                                                self._configure,
                                                allow_missing_node=True)

    __repr__ = attribute_repr('topic', 'broker', 'virtual')

    def _configure(self, data, stat):
        # If the node does not exist, this broker has not yet received any
        # writes for this topic. Assume it is handling at least one partition
        # for this topic, and update the count when more information arrives
        # via the watch on the node path.
        if data:
            count = int(data)
            logger.info('Found %s partitions for %s', count, self)
        else:
            count = 1
            logger.info(
                '%s is not registered in ZooKeeper, falling back '
                'to %s virtual partition(s)', self, count)

        self.__count = count

    def __len__(self):
        """
        Returns the total number of partitions available within this partition
        set.
        """
        return self.__count

    def __iter__(self):
        """
        Returns an iterator of :class:`samsa.topics.Partition` instances for
        this partition set.
        """
        for i in xrange(0, len(self)):
            yield Partition(self.cluster, self.topic, self.broker, i)
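
Because a virtual set registers no watch and keeps its count at one, iterating it always yields a single 0th partition. A hedged sketch, not from the original source (``cluster``, ``topic``, and ``broker`` are assumptions):

>>> pset = PartitionSet(cluster, topic, broker, virtual=True)
>>> [p.number for p in pset]
[0]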
Example #4
class Broker(object):
    """
    A Kafka broker.

    :param cluster: The cluster this broker is associated with.
    :type cluster: :class:`samsa.cluster.Cluster`
    :param id_: Kafka broker ID
    """
    def __init__(self, cluster, id_):
        self.cluster = cluster
        self.id = int(id_)

        self.__host = None
        self.__port = None

        self.is_dead = False

        self._node_path = '/brokers/ids/%s' % self.id
        self._config_watcher = DataWatch(self.cluster.zookeeper,
                                         self._node_path, self._configure)

    __repr__ = attribute_repr('id')

    def _configure(self, data, stat):
        """
        Configures a broker based on its state in ZooKeeper.
        """
        logger.info('Retrieved broker data for %s...', self)
        if data is None:
            logger.info('Broker data field was empty; '
                        'assuming the broker is dead.')
            self.__host = self.__port = None
        else:
            creator, self.__host, port = data.split(':')
            self.__port = int(port)

    @property
    def host(self):
        """
        The host that the broker is available at.
        """
        return self.__host

    @property
    def port(self):
        """
        The port that the broker is available at.
        """
        return self.__port

    @property
    def client(self):
        """
        The :class:`samsa.client.Client` object for this broker.

        Only one client is created per broker instance.
        """
        try:
            return self.__client
        except AttributeError:
            self.__client = Client(self.host,
                                   self.cluster.handler,
                                   port=self.port)
            return self.__client
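
The property memoizes its ``Client``, so everything talking to a given broker shares one connection. A hedged sketch, not from the original source (``cluster`` and the broker ID are assumptions; output is illustrative):

>>> broker = cluster.brokers[0]
>>> broker.client is broker.client  # one Client per Broker instance
True
>>> (broker.host, broker.port)
('localhost', 9092)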
Example #5
class Topic(object):
    """
    A topic within a Kafka cluster.

    :param cluster: The cluster that this topic is associated with.
    :type cluster: :class:`samsa.cluster.Cluster`
    :param name: The name of this topic.
    :param partitioner: callable that takes two arguments, ``partitions`` and
        ``key``, and returns a single :class:`~samsa.partitions.Partition`
        instance to publish the message to.
    :type partitioner: any callable type
    """
    def __init__(self, cluster, name, partitioner=random_partitioner):
        self.cluster = cluster
        self.name = name
        self.partitions = PartitionMap(self.cluster, self)
        self.partitioner = partitioner

    __repr__ = attribute_repr('name')

    def latest_offsets(self):
        return [(p.broker.id, p.latest_offset()) for p in self.partitions]

    def publish(self, data, key=None):
        """
        Publishes one or more messages to a partition of this topic, chosen
        by its partitioner (random by default).

        :param data: message(s) to be sent to the broker.
        :type data: ``str`` or sequence of ``str``.
        :param key: a key to be used for semantic partitioning
        :type key: implementation-specific
        """
        if len(self.partitions) < 1:
            raise NoAvailablePartitionsError(
                'No partitions are available to '
                'accept a write for this message. (Is your Kafka broker '
                'running?)')
        partition = self.partitioner(self.partitions, key)
        return partition.publish(data)

    def subscribe(
        self,
        group,
        backoff_increment=1,
        connect_retries=4,
        fetch_size=307200,
        offset_reset='nearest',
        rebalance_retries=4,
    ):
        """
        Returns a new consumer that can be used for reading from this topic.

        `backoff_increment` is used to progressively back off polling a
        partition when no messages are ready, increasing the wait time in
        seconds on each empty fetch.

        `offset_reset` is used to determine where to reset a partition's offset
        in the event of an OffsetOutOfRangeError. Valid values are:

        "earliest": Go to the earliest message in the partition
        "latest": Go to the latest message in the partition
        "nearest": If requested offset is before the earliest, go there,
                   otherwise, go to the latest message in the partition.

        `rebalance_retries` and `connect_retries` control how many times to
        try acquiring partitions before giving up.

        When samsa restarts, there can be a bit of lag before ZooKeeper
        realizes the old client is dead and releases the partitions it was
        consuming. Setting these retries means samsa will wait a bit and try
        to acquire the partitions again before throwing an error. Likewise,
        during a rebalance it can take a moment for another consumer to
        release the partition it is reading, and retrying accounts for that.

        :param group: The consumer group to join.
        :param backoff_increment: How fast to incrementally backoff when a
                                  partition has no messages to read.
        :param connect_retries: Retries before giving up on connecting
        :param fetch_size: Default fetch size (in bytes) to get from Kafka
        :param offset_reset: Where to reset when an OffsetOutOfRange happens
        :param rebalance_retries: Retries before giving up on rebalance
        :rtype: :class:`samsa.consumer.consumer.Consumer`
        """
        return Consumer(self.cluster,
                        self,
                        group,
                        backoff_increment=backoff_increment,
                        connect_retries=connect_retries,
                        fetch_size=fetch_size,
                        offset_reset=offset_reset,
                        rebalance_retries=rebalance_retries)
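
A hedged end-to-end sketch, not from the original source (``cluster`` is assumed to be a connected :class:`samsa.cluster.Cluster`; the topic and group names are hypothetical, and ``key`` only matters if the topic's partitioner consults it):

>>> topic = cluster.topics['events']
>>> topic.publish('hello world')
>>> topic.publish(['first', 'second'], key='user-42')
>>> consumer = topic.subscribe('my-group', offset_reset='earliest')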
Example #6
class Message(object):
    __slots__ = ('_headers', '_payload', '_raw', '_offset', '_len', '_valid')

    Header = NamedStruct('Header', (
        ('i', 'length'),
        ('b', 'magic'),
    ))

    VersionHeaders = VersionHeaderMap({
        0: NamedStruct('Header', (
            ('i', 'checksum'),
        )),
        1: NamedStruct('HeaderWithCompression', (
            ('b', 'compression'),
            ('i', 'checksum'),
        )),
    })

    def __init__(self, raw, offset=0):
        # Process headers
        self._headers = []
        header = self.Header.unpack_from(raw)
        self._headers.append(header)
        versioned_header = self.VersionHeaders[header.magic].unpack_from(
            raw, offset=self.Header.size)
        self._headers.append(versioned_header)

        # Cache values exposed as read-only properties so we don't
        # recalculate them on every access.
        self._raw = raw
        self._offset = offset
        self._len = len(self._raw)

        # Get the payload as a byte array
        start = self.Header.size + self.VersionHeaders[self['magic']].size
        self._payload = self._raw[start:]

        self._valid = self['checksum'] == crc32(self.payload)

    __repr__ = attribute_repr('raw', 'offset')

    def __len__(self):
        return self._len

    def __str__(self):
        return str(self.payload)

    def __getitem__(self, name):
        for header in self._headers:
            try:
                return getattr(header, name)
            except AttributeError:
                pass
        # No header has the requested field.
        raise AttributeError('%s does not have a field named "%s".' %
                             (repr(self), name))

    @property
    def headers(self):
        return reduce(lambda x, y: dict(x, **y),
                      methodimap('_asdict', self._headers), {})

    def get(self, name, default=None):
        try:
            return self[name]
        except AttributeError:
            return default

    @property
    def offset(self):
        return self._offset

    @property
    def next_offset(self):
        return self._offset + self._len

    @property
    def payload(self):
        return self._payload

    @property
    def raw(self):
        return self._raw

    @property
    def valid(self):
        return self._valid

    @classmethod
    def pack_into(cls, bytea, offset, payload, version, compression=None):
        """
        Packs a message payload into a buffer.

        :param bytea: buffer to pack the message into
        :type bytea: bytearray
        :param offset: offset to start writing at
        :type offset: int
        :param payload: message payload
        :type payload: str
        :param version: message version to publish ("magic number")
        :type version: int
        :param compression: which compression format to use
                            (only for version 1)
        :type compression: int
        :returns: total number of bytes written
        :rtype: int
        """
        if version < 1:
            if compression is not None:
                raise ValueError('Compression is not supported on version %s' %
                                 version)
        elif compression is not None:
            if compression not in COMPRESSION_TYPES:
                raise ValueError('%s is not a valid compression type' %
                                 compression)
            elif compression != COMPRESSION_TYPE_NONE:
                raise NotImplementedError  # TODO
        else:
            compression = COMPRESSION_TYPE_NONE

        VersionHeader = cls.VersionHeaders[version]

        # Write the generic message header; the length field does not count
        # its own four bytes.
        length = cls.Header.size + VersionHeader.size + len(payload)
        cls.Header.pack_into(bytea,
                             offset=offset,
                             length=length - 4,
                             magic=version)
        offset += cls.Header.size

        # Write versioned message header.
        version_kwargs = {}
        if compression is not None:
            version_kwargs['compression'] = compression
        VersionHeader.pack_into(bytea,
                                offset=offset,
                                checksum=crc32(payload),
                                **version_kwargs)
        offset += VersionHeader.size

        # Write message payload.
        bytea[offset:offset + len(payload)] = payload

        return length

    @classmethod
    def encode(cls, messages, version, **kwargs):
        """
        Encodes multiple messages.

        :param messages: messages to publish
        :type messages: sequence of strs
        :param version: message version to publish ("magic number")
        :type version: int
        :param \*\*kwargs: extra arguments to pass to :meth:`.pack_into`
        :returns: encoded messages
        :rtype: :class:`samsa.utils.structuredio.StructuredBytesIO`
        """
        message_header_length = (cls.Header.size +
                                 cls.VersionHeaders[version].size)
        length = (MessageSetFrameHeader.size + sum(map(len, messages)) +
                  (len(messages) * message_header_length))
        bytea = bytearray(length)
        MessageSetFrameHeader.pack_into(bytea, 0, length=length - 4)
        offset = MessageSetFrameHeader.size
        for message in messages:
            written = cls.pack_into(bytea,
                                    offset,
                                    payload=message,
                                    version=version,
                                    **kwargs)
            offset += written
        return StructuredBytesIO(bytea)
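
A hedged round-trip sketch, not from the original source, pairing ``pack_into`` with the parsing constructor for a version-0 message (assumes the structs pack without padding):

>>> payload = 'hello world'
>>> size = Message.Header.size + Message.VersionHeaders[0].size + len(payload)
>>> bytea = bytearray(size)
>>> written = Message.pack_into(bytea, 0, payload=payload, version=0)
>>> message = Message(str(bytea))
>>> (message.valid, message.payload)
(True, 'hello world')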
Example #7
class Client(object):
    """
    Low-level Kafka protocol client.

    :param host: broker host
    :param port: broker port number
    :param timeout: socket timeout
    """
    def __init__(self, host, handler, port=9092, timeout=30, autoconnect=True):
        self.connection = Connection(host, port, timeout)
        self.handler = handlers.RequestHandler(handler, self.connection)
        if autoconnect:
            self.connect()

    def connect(self):
        self.connection.connect()
        self.handler.start()

    def disconnect(self):
        self.handler.stop()
        self.connection.disconnect()

    __repr__ = attribute_repr('connection')

    # Protocol Implementation

    def produce(self,
                topic,
                partition,
                messages,
                version=DEFAULT_VERSION,
                **kwargs):
        """
        Sends messages to the broker on a single topic/partition combination.

        >>> client.produce('topic', 0, ('message',))

        :param topic: topic name
        :param partition: partition ID
        :param messages: the messages to be sent
        :type messages: list, generator, or other iterable of strings
        :param version: version of message encoding
        :type version: int
        :param \*\*kwargs: extra (version-specific) keyword arguments
                           to pass to message encoder
        """
        request = StructuredBytesIO()
        request.pack(2, REQUEST_TYPE_PRODUCE)
        write_request_header(request, topic, partition)
        request.write(Message.encode(messages, version=version, **kwargs))
        return self.handler.request(request, has_response=False)

    def multiproduce(self, data, version=DEFAULT_VERSION, **kwargs):
        """
        Sends messages to the broker on multiple topics and/or partitions.

        >>> client.multiproduce((
        ...    ('topic-1', 0, ('message',)),
        ...    ('topic-2', 0, ('message', 'message',)),
        ... ))

        :param data: sequence of 3-tuples of the format
                     ``(topic, partition, messages)``
        :type data: list, generator, or other iterable
        :param version: version of message encoding
        :type version: int
        :param \*\*kwargs: extra (version-specific) keyword arguments
                           to pass to message encoder
        """
        payloads = []
        for topic, partition, messages in data:
            payload = StructuredBytesIO()
            write_request_header(payload, topic, partition)
            payload.write(Message.encode(messages, version=version, **kwargs))
            payloads.append(payload)

        request = StructuredBytesIO()
        request.pack(2, REQUEST_TYPE_MULTIPRODUCE)
        request.pack(2, len(payloads))
        for payload in payloads:
            request.write(payload)
        return self.handler.request(request, has_response=False)

    def fetch(self, topic, partition, offset, size):
        """
        Fetches messages from the broker on a single topic/partition.

        >>> for offset, message in client.fetch('test', 0, 0, 1000):
        ...     print offset, message
        0L 'hello world'
        20L 'hello world'

        :param topic: topic name
        :param partition: partition ID
        :param offset: offset to begin read
        :type offset: integer
        :param size: the maximum number of bytes to return
        :rtype: generator of 2-tuples in ``(offset, message)`` format
        """
        # TODO: Document failure modes.
        request = StructuredBytesIO()
        request.pack(2, REQUEST_TYPE_FETCH)
        write_request_header(request, topic, partition)
        request.pack(8, offset)
        request.pack(4, size)

        response = self.handler.request(request)

        try:
            # N.B. Using generator here makes dealing with decode errors hard
            return list(decode_messages(response.get(), from_offset=offset))
        except SocketDisconnectedError:
            return []
        except MessageTooLargeError:
            # Try again, but larger!
            return self.fetch(topic, partition, offset, int(size * 1.5))

    def multifetch(self, data):
        """
        Fetches messages from the broker on multiple topics/partitions.

        >>> topics = (
        ...     ('topic-1', 0, 0, 1000),
        ...     ('topic-2', 0, 0, 1000),
        ... )
        >>> for i, response in enumerate(client.multifetch(topics)):
        ...     print 'response:', i
        ...     for offset, message in response:
        ...         print offset, message
        response: 0
        0L 'hello world'
        20L 'hello world'
        response: 1
        0L 'hello world'
        20L 'hello world'

        :param data: sequence of 4-tuples of the format
                     ``(topic, partition, offset, size)``
                     For more information, see :meth:`Client.fetch`.
        :rtype: generator of fetch responses (message generators).
            For more information, see :meth:`Client.fetch`.
        """
        payloads = []
        from_offsets = []
        for topic, partition, offset, size in data:
            payload = StructuredBytesIO()
            write_request_header(payload, topic, partition)
            from_offsets.append(offset)
            payload.pack(8, offset)
            payload.pack(4, size)
            payloads.append(payload)

        request = StructuredBytesIO()
        request.pack(2, REQUEST_TYPE_MULTIFETCH)
        request.pack(2, len(payloads))
        for payload in payloads:
            request.write(payload)
        response = self.handler.request(request)
        return decode_message_sets(response.get(), from_offsets)

    def offsets(self, topic, partition, time, max):
        """
        Returns message offsets before a certain time for the given
        topic/partition.

        >>> client.offsets('test', 0, OFFSET_EARLIEST, 1)
        [0]

        :param topic: topic name
        :param partition: partition ID
        :param time: the time in milliseconds since the UNIX epoch, or either
            ``OFFSET_EARLIEST`` or ``OFFSET_LATEST``.
        :type time: integer
        :param max: the maximum number of offsets to return
        :rtype: list of offsets
        """
        request = StructuredBytesIO()
        request.pack(2, REQUEST_TYPE_OFFSETS)
        write_request_header(request, topic, partition)
        request.pack(8, time)
        request.pack(4, max)
        response = self.handler.request(request)
        (count, ) = OffsetsResponseHeader.unpack_from(response.get())
        offsets = []
        for i in xrange(0, count):
            offsets.append(
                Offset.unpack_from(response.get(),
                                   offset=OffsetsResponseHeader.size +
                                   (i * Offset.size)).value)
        return offsets
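
A hedged end-to-end sketch tying the client calls together, not from the original source (the host, ``handler``, and topic name are assumptions; output mirrors the doctests above):

>>> client = Client('localhost', handler, port=9092)
>>> client.produce('test', 0, ['hello world'])
>>> start = client.offsets('test', 0, OFFSET_EARLIEST, 1)[0]
>>> for offset, message in client.fetch('test', 0, start, 1000):
...     print offset, message
0L 'hello world'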