Example #1
    def test_batch_time(self, mock_time: Any) -> None:
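        """With max_batch_time=2000 ms and a mocked clock, the messages consumed at t=0s and t=1s are flushed as a single batch once the deadline passes; the batch started at t=5s is still unflushed at shutdown."""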
        consumer = FakeKafkaConsumer()
        worker = FakeWorker()
        batching_consumer = BatchingKafkaConsumer(
            consumer,
            'topic',
            worker=worker,
            max_batch_size=100,
            max_batch_time=2000,
            metrics=DummyMetricsBackend(strict=True),
        )

        mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 0).timetuple())
        consumer.items = [KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8')) for i in [1, 2, 3]]
        for x in range(len(consumer.items)):
            batching_consumer._run_once()

        mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 1).timetuple())
        consumer.items = [KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8')) for i in [4, 5, 6]]
        for x in range(len(consumer.items)):
            batching_consumer._run_once()

        mock_time.return_value = time.mktime(datetime(2018, 1, 1, 0, 0, 5).timetuple())
        consumer.items = [KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8')) for i in [7, 8, 9]]
        for x in range(len(consumer.items)):
            batching_consumer._run_once()

        batching_consumer._shutdown()

        assert worker.processed == [b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9']
        assert worker.flushed == [[b'1', b'2', b'3', b'4', b'5', b'6']]
        assert consumer.commit_calls == 1
        assert consumer.close_calls == 1
Example #2
    def eventstream(dataset_name):
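        """Decode an eventstream payload from the HTTP request, wrap it in a fake KafkaMessage, and run it through the matching worker: ConsumerWorker for 'insert' records, ReplacerWorker otherwise."""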
        dataset = get_dataset(dataset_name)
        ensure_table_exists(dataset)
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message = KafkaMessage(
            TopicPartition('topic', 0),
            0,
            http_request.data,
        )

        type_ = record[1]
        metrics = DummyMetricsBackend()
        if type_ == 'insert':
            from snuba.consumer import ConsumerWorker
            worker = ConsumerWorker(dataset,
                                    producer=None,
                                    replacements_topic=None,
                                    metrics=metrics)
        else:
            from snuba.replacer import ReplacerWorker
            worker = ReplacerWorker(clickhouse_rw, dataset, metrics=metrics)

        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

        return ('ok', 200, {'Content-Type': 'text/plain'})
Example #3
    def test_unmerge_insert(self):
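        """Process an 'end_unmerge' replacement and verify the written event moves from group 1 to group 2."""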
        self.event['project_id'] = self.project_id
        self.event['group_id'] = 1
        self.event['primary_hash'] = 'a' * 32
        self.write_raw_events(self.event)

        assert self._issue_count(self.project_id) == [{'count': 1, 'issue': 1}]

        timestamp = datetime.now(tz=pytz.utc)

        project_id = self.project_id

        message = KafkaMessage(
            TopicPartition('replacements', 1),
            42,
            json.dumps((2, 'end_unmerge', {
                'project_id': project_id,
                'previous_group_id': 1,
                'new_group_id': 2,
                'hashes': ['a' * 32],
                'datetime': timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
            })).encode('utf-8'),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert self._issue_count(self.project_id) == [{'count': 1, 'issue': 2}]
Example #4
    def test_send_message(
        self,
        message: str,
        expected: Optional[ProcessedMessage],
    ) -> None:
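        """Run a raw message through SnapshotAwareWorker.process_message and compare the result against the expected ProcessedMessage."""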
        dataset = get_dataset("groupedmessage")
        snapshot_id = uuid1()
        transact_data = TransactionData(xmin=100,
                                        xmax=200,
                                        xip_list=[120, 130])

        worker = SnapshotAwareWorker(
            dataset=dataset,
            producer=FakeConfluentKafkaProducer(),
            snapshot_id=str(snapshot_id),
            transaction_data=transact_data,
            replacements_topic=None,
            metrics=DummyMetricsBackend(strict=True),
        )

        ret = worker.process_message(
            KafkaMessage(
                TopicPartition('topic', 0),
                1,
                message.encode('utf-8'),
            ))
        assert ret == expected
Example #5
def get_messages(events_file):
    """Create a fake Kafka message for each JSON event in the file."""
    messages = []
    # Use a context manager so the events file is closed after reading.
    with open(events_file) as f:
        raw_events = f.readlines()
    for raw_event in raw_events:
        messages.append(
            KafkaMessage(TopicPartition('events', 1), 0, raw_event.encode('utf-8'))
        )
    return messages
Example #6
    def test_delete_tag_promoted_insert(self):
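        """Process an 'end_delete_tag' replacement for a promoted tag and verify the tag condition no longer matches, while the total issue count is unchanged."""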
        self.event['project_id'] = self.project_id
        self.event['group_id'] = 1
        self.event['data']['tags'].append(['browser.name', 'foo'])
        self.event['data']['tags'].append(['notbrowser', 'foo'])
        self.write_raw_events(self.event)

        project_id = self.project_id

        def _issue_count(total=False):
            return json.loads(
                self.app.post('/query', data=json.dumps({
                    'project': [project_id],
                    'aggregations': [['count()', '', 'count']],
                    'conditions': [['tags[browser.name]', '=', 'foo']] if not total else [],
                    'groupby': ['issue'],
                })).data
            )['data']

        assert _issue_count() == [{'count': 1, 'issue': 1}]
        assert _issue_count(total=True) == [{'count': 1, 'issue': 1}]

        timestamp = datetime.now(tz=pytz.utc)

        message = KafkaMessage(
            TopicPartition('replacements', 1),
            42,
            json.dumps((2, 'end_delete_tag', {
                'project_id': project_id,
                'tag': 'browser.name',
                'datetime': timestamp.strftime(PAYLOAD_DATETIME_FORMAT),
            })).encode('utf-8'),
        )

        processed = self.replacer.process_message(message)
        self.replacer.flush_batch([processed])

        assert _issue_count() == []
        assert _issue_count(total=True) == [{'count': 1, 'issue': 1}]
Example #7
    def test_batch_size(self) -> None:
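        """With max_batch_size=2, the first two messages are flushed as a batch and the third remains unflushed at shutdown."""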
        consumer = FakeKafkaConsumer()
        worker = FakeWorker()
        batching_consumer = BatchingKafkaConsumer(
            consumer,
            'topic',
            worker=worker,
            max_batch_size=2,
            max_batch_time=100,
            metrics=DummyMetricsBackend(strict=True),
        )

        consumer.items = [KafkaMessage(TopicPartition('topic', 0), i, f'{i}'.encode('utf-8')) for i in [1, 2, 3]]
        for x in range(len(consumer.items)):
            batching_consumer._run_once()
        batching_consumer._shutdown()

        assert worker.processed == [b'1', b'2', b'3']
        assert worker.flushed == [[b'1', b'2']]
        assert consumer.commit_calls == 1
        assert consumer.close_calls == 1
Example #8
    def test_skip_too_old(self):
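        """Events with a timestamp far in the past (300 days old here) are dropped: ConsumerWorker.process_message returns None."""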
        replacement_topic = enforce_table_writer(
            self.dataset).get_stream_loader().get_replacement_topic_spec()
        test_worker = ConsumerWorker(self.dataset,
                                     FakeConfluentKafkaProducer(),
                                     replacement_topic.topic_name,
                                     self.metrics)

        event = self.event
        old_timestamp = datetime.utcnow() - timedelta(days=300)
        old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        event['datetime'] = old_timestamp_str
        event['data']['datetime'] = old_timestamp_str
        event['data']['received'] = int(
            calendar.timegm(old_timestamp.timetuple()))

        message = KafkaMessage(
            TopicPartition('events', 1),
            42,
            json.dumps((0, 'insert', event)).encode('utf-8'),
        )

        assert test_worker.process_message(message) is None
Example #9
    def test_offsets(self):
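        """The Kafka offset (123) and partition (456) of the consumed message are written through to the ClickHouse row."""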
        event = self.event

        message = KafkaMessage(
            TopicPartition('events', 456),
            123,
            json.dumps((0, 'insert', event)).encode('utf-8'),  # the event body doesn't really matter here
        )

        replacement_topic = enforce_table_writer(
            self.dataset).get_stream_loader().get_replacement_topic_spec()
        test_worker = ConsumerWorker(self.dataset,
                                     FakeConfluentKafkaProducer(),
                                     replacement_topic.topic_name,
                                     self.metrics)
        batch = [test_worker.process_message(message)]
        test_worker.flush_batch(batch)

        assert self.clickhouse.execute(
            "SELECT project_id, event_id, offset, partition FROM %s" % self.table
        ) == [(self.event['project_id'], self.event['event_id'], 123, 456)]
Example #10
    def _wrap(self, msg: str) -> KafkaMessage:
        """Wrap a replacement payload in a fake KafkaMessage on the 'replacements' topic."""
        return KafkaMessage(
            TopicPartition('replacements', 0),
            0,
            json.dumps(msg).encode('utf-8'),
        )