Example #1
    def revocation_callback(streams: Sequence[TopicPartition]):
        revocation_callback.called = True
        assert streams == [TopicPartition(topic, 0)]
        assert consumer.tell() == {TopicPartition(topic, 0): 1}

        # Not sure why you'd want to do this, but it shouldn't error.
        consumer.seek({TopicPartition(topic, 0): 0})
Example #2
    def test_batch_time(self, mock_time: Any) -> None:
        consumer = FakeKafkaConsumer()
        worker = FakeWorker()
        batching_consumer = BatchingKafkaConsumer(
            consumer,
            'topic',
            worker=worker,
            max_batch_size=100,
            max_batch_time=2000,
            metrics=DummyMetricsBackend(strict=True),
        )

        mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 0).timetuple())
        consumer.items = [KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8')) for i in [1, 2, 3]]
        for x in range(len(consumer.items)):
            batching_consumer._run_once()

        mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 1).timetuple())
        consumer.items = [KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8')) for i in [4, 5, 6]]
        for x in range(len(consumer.items)):
            batching_consumer._run_once()

        mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 5).timetuple())
        consumer.items = [KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8')) for i in [7, 8, 9]]
        for x in range(len(consumer.items)):
            batching_consumer._run_once()

        batching_consumer._shutdown()

        assert worker.processed == [b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9']
        assert worker.flushed == [[b'1', b'2', b'3', b'4', b'5', b'6']]
        assert consumer.commit_calls == 1
        assert consumer.close_calls == 1
Example #3
    def assignment_callback(streams: Sequence[TopicPartition]):
        assignment_callback.called = True
        assert streams == [TopicPartition(topic, 0)]
        assert consumer.tell() == {TopicPartition(topic, 0): 0}

        consumer.seek({TopicPartition(topic, 0): 1})

        with pytest.raises(ConsumerError):
            consumer.seek({TopicPartition(topic, 1): 0})
Example #4
    def eventstream(dataset_name):
        dataset = get_dataset(dataset_name)
        ensure_table_exists(dataset)
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message = KafkaMessage(
            TopicPartition('topic', 0),
            0,
            http_request.data,
        )

        type_ = record[1]
        metrics = DummyMetricsBackend()
        if type_ == 'insert':
            from snuba.consumer import ConsumerWorker
            worker = ConsumerWorker(dataset,
                                    producer=None,
                                    replacements_topic=None,
                                    metrics=metrics)
        else:
            from snuba.replacer import ReplacerWorker
            worker = ReplacerWorker(clickhouse_rw, dataset, metrics=metrics)

        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

        return ('ok', 200, {'Content-Type': 'text/plain'})
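The handler above expects the request body to be a JSON array of the form [version, type, payload]: the version must be 2, 'insert' payloads go to ConsumerWorker, and anything else goes to ReplacerWorker. A minimal sketch of exercising it over HTTP; the URL and the payload contents are illustrative assumptions, not taken from the snippet:

    import json
    import requests

    # Hypothetical endpoint URL; the route actually registered for
    # ``eventstream`` is not shown in the snippet above.
    url = 'http://localhost:1218/tests/events/eventstream'

    # [version, type, payload] -- version must be 2; type 'insert' selects ConsumerWorker.
    body = json.dumps([2, 'insert', {'event_id': 'a' * 32, 'project_id': 1}])

    response = requests.post(url, data=body)
    assert response.status_code == 200  # the handler returns ('ok', 200, ...)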
Example #5
    def test_send_message(
        self,
        message: str,
        expected: Optional[ProcessedMessage],
    ) -> None:
        dataset = get_dataset("groupedmessage")
        snapshot_id = uuid1()
        transact_data = TransactionData(xmin=100,
                                        xmax=200,
                                        xip_list=[120, 130])

        worker = SnapshotAwareWorker(
            dataset=dataset,
            producer=FakeConfluentKafkaProducer(),
            snapshot_id=str(snapshot_id),
            transaction_data=transact_data,
            replacements_topic=None,
            metrics=DummyMetricsBackend(strict=True),
        )

        ret = worker.process_message(
            KafkaMessage(
                TopicPartition('topic', 0),
                1,
                message.encode('utf-8'),
            ))
        assert ret == expected
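The message and expected parameters suggest this test is driven by pytest parametrization; a hedged sketch of what the missing decorator could look like (the concrete cases are placeholders, not taken from the source):

    # Hypothetical parametrization; the real test cases are not shown in the snippet.
    @pytest.mark.parametrize("message, expected", [
        # e.g. a payload the snapshot-aware worker is expected to skip
        (json.dumps({"event": "ignored"}), None),
    ])
    def test_send_message(self, message, expected):  # signature as above
        ...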
Example #6
    def test_unmerge_insert(self):
        self.event['project_id'] = self.project_id
        self.event['group_id'] = 1
        self.event['primary_hash'] = 'a' * 32
        self.write_raw_events(self.event)

        assert self._issue_count(self.project_id) == [{'count': 1, 'issue': 1}]

        timestamp = datetime.now(tz=pytz.utc)

        project_id = self.project_id

        message = KafkaMessage(
            TopicPartition('replacements', 1),
            42,
            json.dumps((2, 'end_unmerge', {
                'project_id': project_id,
                'previous_group_id': 1,
                'new_group_id': 2,
                'hashes': ['a' * 32],
                'datetime': timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
            })).encode('utf-8'),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert self._issue_count(self.project_id) == [{'count': 1, 'issue': 2}]
Example #7
def test_auto_offset_reset_latest(topic: str) -> None:
    producer = ConfluentProducer(configuration)
    value = uuid.uuid1().hex.encode("utf-8")
    producer.produce(topic, value=value)
    assert producer.flush(5.0) == 0

    consumer = KafkaConsumer(
        {
            **configuration,
            "auto.offset.reset": "latest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "true",
            "enable.partition.eof": "true",
            "group.id": "test-latest",
        }
    )

    consumer.subscribe([topic])

    try:
        consumer.poll(10.0)  # XXX: getting the subscription is slow
    except EndOfStream as error:
        assert error.stream == TopicPartition(topic, 0)
        assert error.offset == 1
    else:
        raise AssertionError('expected EndOfStream error')

    consumer.close()
Example #8
def get_messages(events_file):
    """Create a fake Kafka message for each JSON event in the file."""
    messages = []
    with open(events_file) as f:
        raw_events = f.readlines()
    for raw_event in raw_events:
        messages.append(
            KafkaMessage(TopicPartition('events', 1), 0, raw_event.encode('utf-8'))
        )
    return messages
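A usage sketch: the fake messages can be fed straight through the process_message / flush_batch interface used elsewhere in these examples. replay_events is a hypothetical helper, and worker is assumed to be something like the ConsumerWorker from the other snippets:

    def replay_events(worker, events_file):
        """Feed every fake message from the file through a consumer worker."""
        batch = []
        for message in get_messages(events_file):
            processed = worker.process_message(message)
            if processed is not None:
                batch.append(processed)
        if batch:
            worker.flush_batch(batch)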
Example #9
    def test_delete_tag_promoted_insert(self):
        self.event['project_id'] = self.project_id
        self.event['group_id'] = 1
        self.event['data']['tags'].append(['browser.name', 'foo'])
        self.event['data']['tags'].append(['notbrowser', 'foo'])
        self.write_raw_events(self.event)

        project_id = self.project_id

        def _issue_count(total=False):
            return json.loads(
                self.app.post('/query', data=json.dumps({
                    'project': [project_id],
                    'aggregations': [['count()', '', 'count']],
                    'conditions': [] if total else [['tags[browser.name]', '=', 'foo']],
                    'groupby': ['issue'],
                })).data
            )['data']

        assert _issue_count() == [{'count': 1, 'issue': 1}]
        assert _issue_count(total=True) == [{'count': 1, 'issue': 1}]

        timestamp = datetime.now(tz=pytz.utc)

        message = KafkaMessage(
            TopicPartition('replacements', 1),
            42,
            json.dumps((2, 'end_delete_tag', {
                'project_id': project_id,
                'tag': 'browser.name',
                'datetime': timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
            })).encode('utf-8'),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert _issue_count() == []
        assert _issue_count(total=True) == [{'count': 1, 'issue': 1}]
Example #10
    def test_batch_size(self) -> None:
        consumer = FakeKafkaConsumer()
        worker = FakeWorker()
        batching_consumer = BatchingKafkaConsumer(
            consumer,
            'topic',
            worker=worker,
            max_batch_size=2,
            max_batch_time=100,
            metrics=DummyMetricsBackend(strict=True),
        )

        consumer.items = [KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8')) for i in [1, 2, 3]]
        for x in range(len(consumer.items)):
            batching_consumer._run_once()
        batching_consumer._shutdown()

        assert worker.processed == [b'1', b'2', b'3']
        assert worker.flushed == [[b'1', b'2']]
        assert consumer.commit_calls == 1
        assert consumer.close_calls == 1
Example #11
def test_commit_log_consumer(topic: str) -> None:
    # XXX: This would be better as an integration test (or at least a test
    # against an abstract Producer interface) rather than a test against a
    # mock.
    commit_log_producer = FakeConfluentKafkaProducer()

    consumer = KafkaConsumerWithCommitLog(
        {
            **configuration,
            "auto.offset.reset": "earliest",
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "true",
            "enable.partition.eof": "true",
            "group.id": "test",
            "session.timeout.ms": 10000,
        },
        commit_log_producer,
        'commit-log',
    )

    consumer.subscribe([topic])

    producer = ConfluentProducer(configuration)
    producer.produce(topic)
    assert producer.flush(5.0) == 0

    message = consumer.poll(10.0)  # XXX: getting the subscription is slow
    assert isinstance(message, Message)

    assert consumer.commit() == {TopicPartition(topic, 0): message.offset + 1}

    assert len(commit_log_producer.messages) == 1
    commit_message = commit_log_producer.messages[0]
    assert commit_message.topic() == 'commit-log'
    assert commit_message.key() == '{}:{}:{}'.format(topic, 0, 'test').encode('utf-8')
    # offsets are the last processed message offset + 1
    assert commit_message.value() == '{}'.format(message.offset + 1).encode('utf-8')
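The assertions above encode the commit-log convention: the key is topic:partition:group and the value is the next offset to consume. A small helper for decoding such an entry, hypothetical and based only on the format asserted here:

    def decode_commit_log_entry(key, value):
        """Return ((topic, partition, group), next_offset) from a commit-log message."""
        topic, partition, group = key.decode('utf-8').split(':')
        return (topic, int(partition), group), int(value.decode('utf-8'))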
Example #12
    def test_skip_too_old(self):
        replacement_topic = enforce_table_writer(
            self.dataset).get_stream_loader().get_replacement_topic_spec()
        test_worker = ConsumerWorker(self.dataset,
                                     FakeConfluentKafkaProducer(),
                                     replacement_topic.topic_name,
                                     self.metrics)

        event = self.event
        old_timestamp = datetime.utcnow() - timedelta(days=300)
        old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        event['datetime'] = old_timestamp_str
        event['data']['datetime'] = old_timestamp_str
        event['data']['received'] = int(
            calendar.timegm(old_timestamp.timetuple()))

        message = KafkaMessage(
            TopicPartition('events', 1),
            42,
            json.dumps((0, 'insert', event)).encode('utf-8'),
        )

        assert test_worker.process_message(message) is None
Example #13
    def test_offsets(self):
        event = self.event

        message = KafkaMessage(
            TopicPartition('events', 456),
            123,
            # the event payload itself doesn't really matter here
            json.dumps((0, 'insert', event)).encode('utf-8'),
        )

        replacement_topic = enforce_table_writer(
            self.dataset).get_stream_loader().get_replacement_topic_spec()
        test_worker = ConsumerWorker(self.dataset,
                                     FakeConfluentKafkaProducer(),
                                     replacement_topic.topic_name,
                                     self.metrics)
        batch = [test_worker.process_message(message)]
        test_worker.flush_batch(batch)

        assert self.clickhouse.execute(
            "SELECT project_id, event_id, offset, partition FROM %s" % self.table
        ) == [(self.event['project_id'], self.event['event_id'], 123, 456)]
Example #14
def test_consumer(topic: str) -> None:
    consumer = KafkaConsumer({
        **configuration,
        "auto.offset.reset": "latest",
        "enable.auto.commit": "false",
        "enable.auto.offset.store": "true",
        "enable.partition.eof": "true",
        "group.id": "test",
        "session.timeout.ms": 10000,
    })

    # TODO: It'd be much nicer if ``subscribe`` returned a future that we could
    # use to wait for assignment, but we'd need to be very careful to avoid
    # edge cases here. It's probably not worth the complexity for now.
    # XXX: There has got to be a better way to do this...
    assignment_callback = mock.MagicMock()
    revocation_callback = mock.MagicMock()
    consumer.subscribe([topic],
                       on_assign=assignment_callback,
                       on_revoke=revocation_callback)

    try:
        consumer.poll(10.0)  # XXX: getting the subscription is slow
    except EndOfStream as error:
        assert error.stream == TopicPartition(topic, 0)
        assert error.offset == 0
    else:
        raise AssertionError('expected EndOfStream error')

    assert assignment_callback.call_args_list == [
        mock.call([TopicPartition(topic, 0)])
    ]

    producer = ConfluentProducer(configuration)
    value = uuid.uuid1().hex.encode("utf-8")
    producer.produce(topic, value=value)
    assert producer.flush(5.0) == 0

    message = consumer.poll(1.0)
    assert isinstance(message, Message)
    assert message.stream == TopicPartition(topic, 0)
    assert message.offset == 0
    assert message.value == value

    try:
        assert consumer.poll(1.0) is None
    except EndOfStream as error:
        assert error.stream == TopicPartition(topic, 0)
        assert error.offset == 1
    else:
        raise AssertionError('expected EndOfStream error')

    assert consumer.commit() == {TopicPartition(topic, 0): message.offset + 1}

    consumer.unsubscribe()

    assert consumer.poll(1.0) is None

    assert revocation_callback.call_args_list == [
        mock.call([TopicPartition(topic, 0)])
    ]

    consumer.close()

    with pytest.raises(RuntimeError):
        consumer.subscribe([topic])

    with pytest.raises(RuntimeError):
        consumer.unsubscribe()

    with pytest.raises(RuntimeError):
        consumer.poll()

    with pytest.raises(RuntimeError):
        consumer.commit()

    consumer.close()
Example #15
    def _wrap(self, msg: str) -> KafkaMessage:
        return KafkaMessage(
            TopicPartition('replacements', 0),
            0,
            json.dumps(msg).encode('utf-8'),
        )
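For context, a hedged sketch of how such a helper is typically driven in the replacement tests above. The payload mirrors the end_delete_tag example, and json.dumps accepts the whole (version, action, payload) tuple even though the annotation says str:

        # Hypothetical usage inside a test method, mirroring the earlier replacement examples.
        message = self._wrap((2, 'end_delete_tag', {
            'project_id': 1,
            'tag': 'browser.name',
            'datetime': datetime.now(tz=pytz.utc).strftime(PAYLOAD_DATETIME_FORMAT),
        }))
        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])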
Example #16
def test_consumer(topic: str) -> None:

    def build_consumer() -> KafkaConsumer:
        return KafkaConsumer(
            {
                **configuration,
                "auto.offset.reset": "earliest",
                "enable.auto.commit": "false",
                "enable.auto.offset.store": "true",
                "enable.partition.eof": "true",
                "group.id": "test",
                "session.timeout.ms": 10000,
            }
        )

    producer = ConfluentProducer(configuration)
    value = uuid.uuid1().hex.encode("utf-8")
    for i in range(2):
        producer.produce(topic, value=value)
    assert producer.flush(5.0) == 0

    consumer = build_consumer()

    def assignment_callback(streams: Sequence[TopicPartition]):
        assignment_callback.called = True
        assert streams == [TopicPartition(topic, 0)]
        assert consumer.tell() == {TopicPartition(topic, 0): 0}

        consumer.seek({TopicPartition(topic, 0): 1})

        with pytest.raises(ConsumerError):
            consumer.seek({TopicPartition(topic, 1): 0})

    def revocation_callback(streams: Sequence[TopicPartition]):
        revocation_callback.called = True
        assert streams == [TopicPartition(topic, 0)]
        assert consumer.tell() == {TopicPartition(topic, 0): 1}

        # Not sure why you'd want to do this, but it shouldn't error.
        consumer.seek({TopicPartition(topic, 0): 0})

    # TODO: It'd be much nicer if ``subscribe`` returned a future that we could
    # use to wait for assignment, but we'd need to be very careful to avoid
    # edge cases here. It's probably not worth the complexity for now.
    consumer.subscribe([topic], on_assign=assignment_callback, on_revoke=revocation_callback)

    message = consumer.poll(10.0)  # XXX: getting the subscription is slow
    assert isinstance(message, Message)
    assert message.stream == TopicPartition(topic, 0)
    assert message.offset == 1
    assert message.value == value

    assert consumer.tell() == {TopicPartition(topic, 0): 2}
    assert getattr(assignment_callback, 'called', False)

    consumer.seek({TopicPartition(topic, 0): 0})
    assert consumer.tell() == {TopicPartition(topic, 0): 0}

    with pytest.raises(ConsumerError):
        consumer.seek({TopicPartition(topic, 1): 0})

    message = consumer.poll(1.0)
    assert isinstance(message, Message)
    assert message.stream == TopicPartition(topic, 0)
    assert message.offset == 0
    assert message.value == value

    assert consumer.commit() == {TopicPartition(topic, 0): message.get_next_offset()}

    consumer.unsubscribe()

    assert consumer.poll(1.0) is None

    assert consumer.tell() == {}

    with pytest.raises(ConsumerError):
        consumer.seek({TopicPartition(topic, 0): 0})

    consumer.close()

    with pytest.raises(RuntimeError):
        consumer.subscribe([topic])

    with pytest.raises(RuntimeError):
        consumer.unsubscribe()

    with pytest.raises(RuntimeError):
        consumer.poll()

    with pytest.raises(RuntimeError):
        consumer.tell()

    with pytest.raises(RuntimeError):
        consumer.seek({TopicPartition(topic, 0): 0})

    with pytest.raises(RuntimeError):
        consumer.commit()

    consumer.close()

    consumer = build_consumer()

    consumer.subscribe([topic])

    message = consumer.poll(10.0)  # XXX: getting the subscription is slow
    assert isinstance(message, Message)
    assert message.stream == TopicPartition(topic, 0)
    assert message.offset == 1
    assert message.value == value

    try:
        assert consumer.poll(1.0) is None
    except EndOfStream as error:
        assert error.stream == TopicPartition(topic, 0)
        assert error.offset == 2
    else:
        raise AssertionError('expected EndOfStream error')

    consumer.close()