def test_process_record_json_compressed(self, kafka_consumer_mock):
    """Compressed JSON records round-trip: deletions applied, dupes counted."""
    records = [('AD{:d}'.format(idx), {'body': ' {:d}'.format(idx)}) for idx in range(1000)]
    records += [('AD{:d}'.format(idx), None) for idx in range(100, 200)]
    samples = get_kafka_msg_samples(records, msgformat='json', compress=True)
    scanner, _, messages = self._get_scanner_messages(
        samples, 3, kafka_consumer_mock,
        count_variations={0: 2, 1: 3, 2: 2},
        msgformat='json', decompress=True)
    by_key = {record['_key']: record['body'] for record in messages}
    self.assertEqual(len(set(by_key)), 900)
    self.assertEqual(scanner.scanned_count, 1100)
    self.assertEqual(scanner.issued_count, 900)
    self.assertEqual(scanner.deleted_count, 100)
    self.assertEqual(scanner.dupes_count, 100)
    self.assertEqual(scanner.test_count, 900)
    # Deleted keys must not appear in the issued set.
    for idx in range(100, 200):
        self.assertNotIn('AD%.3d' % idx, by_key)
def _test_kafka_scan_lower_offsets(self, kafka_consumer_mock, batchsize=10000):
    """Offsets below min_lower_offsets are skipped; deletions still honored."""
    records = [('AD%.3d' % idx, 'body %d' % idx) for idx in range(1000)]
    records += [('AD%.3d' % idx, None) for idx in range(100, 200)]
    samples = get_kafka_msg_samples(records)
    scanner, _, messages = self._get_scanner_messages(
        samples, 3, kafka_consumer_mock,
        count_variations={0: 2, 1: 3, 2: 2},
        batchsize=batchsize,
        min_lower_offsets={0: 100, 1: 100, 2: 100})
    by_key = {record['_key']: record['body'] for record in messages}
    self.assertEqual(len(set(by_key)), 700)
    self.assertEqual(scanner.scanned_count, 800)
    self.assertEqual(scanner.issued_count, 700)
    self.assertEqual(scanner.dupes_count, 0)
    self.assertEqual(scanner.deleted_count, 100)
    for idx in range(100, 200):
        self.assertNotIn('AD%.3d' % idx, by_key)
def test_kafka_scan_resume(
    self, client_mock, simple_consumer_mock, mp_consumer_mock,
    batchsize=10000, expected_batches=(1, 1, 1)
):
    """Consecutive jobs with keep_offsets resume where the previous one stopped."""
    samples = get_kafka_msg_samples([("AD%.3d" % i, "body %d" % i) for i in range(1000)])
    client_mock.return_value = FakeClient(samples, 3, {0: 235, 1: 443, 2: 322}, {0: 2, 1: 2, 2: 2})
    seen_keys = []
    cumulative_expected = 0
    runs = zip([(False, 400), (True, 400), (True, 200)], expected_batches)
    for (resume, run_expected), run_batches in runs:
        cumulative_expected += run_expected
        scanner, number_of_batches, messages = self._get_scanner_messages(
            client_mock,
            simple_consumer_mock,
            mp_consumer_mock,
            keep_offsets=resume,
            batchsize=batchsize,
            count=run_expected,
        )
        run_keys = [record["_key"] for record in messages]
        seen_keys.extend(run_keys)
        # Each run delivers exactly its quota, with no duplicates within
        # the run nor against previous runs.
        self.assertEqual(len(set(run_keys)), run_expected)
        self.assertEqual(len(run_keys), run_expected)
        self.assertEqual(number_of_batches, run_batches)
        self.assertEqual(len(set(seen_keys)), cumulative_expected)
        self.assertEqual(scanner.dupes_count, 0)
    self.assertEqual(len(seen_keys), cumulative_expected)
def test_encoding(self, client_mock, simple_consumer_mock, mp_consumer_mock):
    """A latin1-encoded body is decoded back with the configured encoding."""
    samples = get_kafka_msg_samples([("AD001", u"hol\xc3\xa1".encode("latin1"))])
    client_mock.return_value = FakeClient(samples, 1)
    _, _, messages = self._get_scanner_messages(
        client_mock, simple_consumer_mock, mp_consumer_mock, encoding="latin1"
    )
    self.assertEqual(messages[0]["body"], u"hol\xc3\xa1")
def test_encoding(self, kafka_consumer_mock):
    """Bodies encoded with a non-default codec decode back via `encoding`."""
    payload = u'hol\xc3\xa1'
    samples = get_kafka_msg_samples([('AD001', payload.encode('latin1'))])
    _, _, messages = self._get_scanner_messages(
        samples, 1, kafka_consumer_mock, encoding='latin1')
    self.assertEqual(messages[0]['body'], payload)
def test_wrong_encoding(self, client_mock, simple_consumer_mock, mp_consumer_mock):
    """A body that is not valid in the expected encoding is dropped.

    Uses a bytes literal: under Python 3 the plain str ">\xc4\xee" would be
    text (re-serialized as valid UTF-8) and the message would never be
    rejected; the sibling tests build the same payload with six.b(...).
    """
    msgs = [("AD001", b">\xc4\xee")]
    samples = get_kafka_msg_samples(msgs)
    client_mock.return_value = FakeClient(samples, 1)
    scanner, number_of_batches, messages = self._get_scanner_messages(
        client_mock, simple_consumer_mock, mp_consumer_mock
    )
    # The undecodable record must be filtered out entirely.
    self.assertEqual(messages, [])
def test_kafka_scan_batchcount(self, kafka_consumer_mock, batchsize=10000, batchcount=3,
                               num_partitions=1, expected_messages=1000):
    """batchcount caps the number of batches the scanner runs.

    Uses floor division: under Python 3, ``1000 / batchsize`` is true
    division (0.1 for the default batchsize), so ``min(batchcount, ...)``
    would compare against a float and the assertion could never hold.
    ``//`` matches the sibling ``_test_kafka_scan_batchcount`` versions.
    """
    msgs = [('AD%.3d' % i, 'body %d' % i) for i in range(1000)]
    samples = get_kafka_msg_samples(msgs)
    scanner, number_of_batches, messages = self._get_scanner_messages(
        samples, num_partitions, kafka_consumer_mock,
        count_variations={0: 2, 1: 3, 2: 2},
        batchsize=batchsize, batchcount=batchcount)
    self.assertEqual(number_of_batches, min(batchcount, 1000 // batchsize or 1))
    msgkeys = [m['_key'] for m in messages]
    self.assertEqual(len(msgkeys), expected_messages)
    self.assertEqual(len(set(msgkeys)), expected_messages)
def test_kafka_scan_dedupe_many(self, kafka_consumer_mock):
    """Every key written twice: half the stream is deduped away."""
    records = [('AD%.3d' % idx, 'body %d' % idx) for idx in range(1000)] * 2
    samples = get_kafka_msg_samples(records)
    scanner, _, messages = self._get_scanner_messages(
        samples, 3, kafka_consumer_mock,
        count_variations={0: 2, 1: 3, 2: 2},
        batchsize=250, logcount=250)
    by_key = {record['_key']: record['body'] for record in messages}
    self.assertEqual(len(by_key), 1000)
    self.assertEqual(scanner.issued_count, 1000)
    self.assertEqual(scanner.scanned_count, 2000)
    self.assertEqual(scanner.dupes_count, 1000)
def test_kafka_scan_dedupe_many(self, client_mock, simple_consumer_mock, mp_consumer_mock):
    """Duplicated stream: the scanner issues each key once, counting dupes."""
    duplicated = [("AD%.3d" % idx, "body %d" % idx) for idx in range(1000)] * 2
    samples = get_kafka_msg_samples(duplicated)
    client_mock.return_value = FakeClient(samples, 3, count_variations={0: 2, 1: 3, 2: 2})
    scanner, _, messages = self._get_scanner_messages(
        client_mock, simple_consumer_mock, mp_consumer_mock, batchsize=250, logcount=250
    )
    by_key = {record["_key"]: record["body"] for record in messages}
    self.assertEqual(len(by_key), 1000)
    self.assertEqual(scanner.issued_count, 1000)
    self.assertEqual(scanner.scanned_count, 2000)
    self.assertEqual(scanner.dupes_count, 1000)
def test_kafka_scan_lower_offsets(self, kafka_consumer_mock, batchsize=10000):
    """Records below min_lower_offsets never reach the output."""
    records = [('AD%.3d' % idx, 'body %d' % idx) for idx in range(1000)]
    records.extend(('AD%.3d' % idx, None) for idx in range(100, 200))
    samples = get_kafka_msg_samples(records)
    scanner, _, messages = self._get_scanner_messages(
        samples, 3, kafka_consumer_mock,
        count_variations={0: 2, 1: 3, 2: 2},
        batchsize=batchsize,
        min_lower_offsets={0: 100, 1: 100, 2: 100})
    by_key = {record['_key']: record['body'] for record in messages}
    self.assertEqual(len(set(by_key)), 700)
    self.assertEqual(scanner.scanned_count, 800)
    self.assertEqual(scanner.issued_count, 700)
    self.assertEqual(scanner.dupes_count, 0)
    self.assertEqual(scanner.deleted_count, 100)
    for idx in range(100, 200):
        self.assertNotIn('AD%.3d' % idx, by_key)
def test_kafka_scan_deleted(self, client_mock, simple_consumer_mock, mp_consumer_mock, batchsize=10000):
    """Null-body (tombstone) records delete their keys from the output."""
    stream = [("AD%.3d" % idx, "body %d" % idx) for idx in range(1000)]
    stream += [("AD%.3d" % idx, None) for idx in range(100, 200)]
    samples = get_kafka_msg_samples(stream)
    client_mock.return_value = FakeClient(samples, 3, count_variations={0: 2, 1: 3, 2: 2})
    scanner, _, messages = self._get_scanner_messages(
        client_mock, simple_consumer_mock, mp_consumer_mock
    )
    by_key = {record["_key"]: record["body"] for record in messages}
    self.assertEqual(len(set(by_key)), 900)
    self.assertEqual(scanner.scanned_count, 1100)
    self.assertEqual(scanner.issued_count, 900)
    self.assertEqual(scanner.dupes_count, 0)
    self.assertEqual(scanner.deleted_count, 100)
    for idx in range(100, 200):
        self.assertNotIn("AD%.3d" % idx, by_key)
def test_process_record(self, kafka_consumer_mock):
    """process_record hook runs per issued record (test_count) with deletions applied."""
    stream = [('AD%.3d' % idx, 'body %d' % idx) for idx in range(1000)]
    stream += [('AD%.3d' % idx, None) for idx in range(100, 200)]
    samples = get_kafka_msg_samples(stream)
    scanner, _, messages = self._get_scanner_messages(
        samples, 3, kafka_consumer_mock,
        count_variations={0: 2, 1: 3, 2: 2})
    by_key = {record['_key']: record['body'] for record in messages}
    self.assertEqual(len(set(by_key)), 900)
    self.assertEqual(scanner.scanned_count, 1100)
    self.assertEqual(scanner.issued_count, 900)
    self.assertEqual(scanner.deleted_count, 100)
    self.assertEqual(scanner.dupes_count, 100)
    self.assertEqual(scanner.test_count, 900)
    for idx in range(100, 200):
        self.assertNotIn('AD%.3d' % idx, by_key)
def test_kafka_scan_nodelete(self, kafka_consumer_mock):
    """With nodelete=True, tombstoned keys are still issued (body None)."""
    stream = [('AD%.3d' % idx, 'body %d' % idx) for idx in range(1000)]
    stream += [('AD%.3d' % idx, None) for idx in range(100, 200)]
    samples = get_kafka_msg_samples(stream)
    scanner, _, messages = self._get_scanner_messages(
        samples, 3, kafka_consumer_mock,
        count_variations={0: 2, 1: 3, 2: 2}, nodelete=True)
    by_key = {record['_key']: record.get('body', None) for record in messages}
    self.assertEqual(len(set(by_key)), 1000)
    self.assertEqual(scanner.scanned_count, 1100)
    self.assertEqual(scanner.issued_count, 1000)
    self.assertEqual(scanner.dupes_count, 100)
    self.assertEqual(scanner.deleted_count, 0)
    for idx in range(100, 200):
        self.assertEqual(by_key['AD%.3d' % idx], None)
def test_kafka_scan_seek(self, client_mock, simple_consumer_mock, mp_consumer_mock, batchsize=10000):
    """key_prefixes seeks near the target prefix instead of scanning everything."""
    stream = []
    for prefix in ["AD", "CN", "UK", "ZA"]:
        stream += [("%s%.3d" % (prefix, idx), "body %s %d" % (prefix, idx))
                   for idx in range(2000)]
    samples = get_kafka_msg_samples(stream)
    client_mock.return_value = FakeClient(samples, 3, count_variations={0: 2, 1: 3, 2: 2})
    scanner, _, messages = self._get_scanner_messages(
        client_mock,
        simple_consumer_mock,
        mp_consumer_mock,
        batchsize=batchsize,
        max_next_messages=200,
        key_prefixes=["CN"],
    )
    # All 2000 "CN" records are returned, while the seek keeps the total
    # scanned volume well below the full 8000-record stream.
    self.assertEqual(len(messages), 2000)
    self.assertTrue(scanner.scanned_count <= 4400)
def test_kafka_scan_dedupe_many(self, kafka_consumer_mock):
    """A fully duplicated stream is deduped down to one record per key."""
    base = [('AD%.3d' % idx, 'body %d' % idx) for idx in range(1000)]
    samples = get_kafka_msg_samples(base * 2)
    scanner, _, messages = self._get_scanner_messages(
        samples, 3, kafka_consumer_mock,
        count_variations={0: 2, 1: 3, 2: 2},
        batchsize=250, logcount=250)
    by_key = {record['_key']: record['body'] for record in messages}
    self.assertEqual(len(by_key), 1000)
    self.assertEqual(scanner.issued_count, 1000)
    self.assertEqual(scanner.scanned_count, 2000)
    self.assertEqual(scanner.dupes_count, 1000)
def test_kafka_scan_resume_after_fail(self, client_mock, simple_consumer_mock, mp_consumer_mock, batchsize=200):
    """After a simulated crash, a keep_offsets run delivers the remainder."""
    samples = get_kafka_msg_samples([("AD%.3d" % i, "body %d" % i) for i in range(1000)])
    client_mock.return_value = FakeClient(samples, 3, {0: 235, 1: 443, 2: 322}, {0: 2, 1: 2, 2: 2})
    # First run aborts mid-scan at the injected failure offset.
    self.assertRaisesRegexp(
        AssertionError,
        "Failed on offset 250",
        self._get_scanner_messages,
        client_mock,
        simple_consumer_mock,
        mp_consumer_mock,
        fail_on_offset=250,
        batchsize=batchsize,
    )
    # Second run resumes from the committed offsets and picks up the rest.
    _, _, messages = self._get_scanner_messages(
        client_mock, simple_consumer_mock, mp_consumer_mock,
        batchsize=batchsize, keep_offsets=True
    )
    self.assertEqual(len(messages), 802)
def test_process_record(self, kafka_consumer_mock):
    """Per-record hook counter matches the issued records; tombstones removed."""
    records = [('AD%.3d' % n, 'body %d' % n) for n in range(1000)]
    records.extend(('AD%.3d' % n, None) for n in range(100, 200))
    samples = get_kafka_msg_samples(records)
    scanner, _, messages = self._get_scanner_messages(
        samples, 3, kafka_consumer_mock,
        count_variations={0: 2, 1: 3, 2: 2})
    by_key = {msg['_key']: msg['body'] for msg in messages}
    self.assertEqual(len(set(by_key)), 900)
    self.assertEqual(scanner.scanned_count, 1100)
    self.assertEqual(scanner.issued_count, 900)
    self.assertEqual(scanner.deleted_count, 100)
    self.assertEqual(scanner.dupes_count, 100)
    self.assertEqual(scanner.test_count, 900)
    for n in range(100, 200):
        self.assertNotIn('AD%.3d' % n, by_key)
def _test_kafka_scan_dedupe(self, kafka_consumer_mock, batchsize=10000):
    """When a key is rewritten, the later body wins after dedupe."""
    stream = [('AD%.3d' % idx, 'body %d' % idx) for idx in range(1000)]
    stream += [('AD%.3d' % idx, 'body %dA' % idx) for idx in range(100, 200)]
    samples = get_kafka_msg_samples(stream)
    scanner, _, messages = self._get_scanner_messages(
        samples, 3, kafka_consumer_mock,
        count_variations={0: 2, 1: 3, 2: 2},
        batchsize=batchsize)
    by_key = {record['_key']: record['body'] for record in messages}
    self.assertEqual(len(by_key), 1000)
    self.assertEqual(scanner.issued_count, 1000)
    self.assertEqual(scanner.scanned_count, 1100)
    self.assertEqual(scanner.dupes_count, 100)
    # Rewritten keys carry the second ("A") body.
    for idx in range(100, 200):
        self.assertEqual(by_key['AD%.3d' % idx], 'body %dA' % idx)
def test_kafka_scan_nodelete(self, kafka_consumer_mock):
    """nodelete=True keeps tombstoned keys in the output with a None body."""
    records = [('AD%.3d' % n, 'body %d' % n) for n in range(1000)]
    records.extend(('AD%.3d' % n, None) for n in range(100, 200))
    samples = get_kafka_msg_samples(records)
    scanner, _, messages = self._get_scanner_messages(
        samples, 3, kafka_consumer_mock,
        count_variations={0: 2, 1: 3, 2: 2},
        nodelete=True)
    by_key = {msg['_key']: msg.get('body', None) for msg in messages}
    self.assertEqual(len(set(by_key)), 1000)
    self.assertEqual(scanner.scanned_count, 1100)
    self.assertEqual(scanner.issued_count, 1000)
    self.assertEqual(scanner.dupes_count, 100)
    self.assertEqual(scanner.deleted_count, 0)
    for n in range(100, 200):
        self.assertEqual(by_key['AD%.3d' % n], None)
def _test_kafka_scan_batchcount(self, kafka_consumer_mock, batchsize=10000, batchcount=3,
                                num_partitions=1, expected_messages=1000):
    """batchcount limits how many batches run before the scan stops."""
    samples = get_kafka_msg_samples(
        [('AD%.3d' % idx, 'body %d' % idx) for idx in range(1000)])
    scanner, number_of_batches, messages = self._get_scanner_messages(
        samples, num_partitions, kafka_consumer_mock,
        count_variations={0: 2, 1: 3, 2: 2},
        batchsize=batchsize, batchcount=batchcount)
    self.assertEqual(number_of_batches, min(batchcount, 1000 // batchsize or 1))
    keys = [record['_key'] for record in messages]
    self.assertEqual(len(keys), expected_messages)
    self.assertEqual(len(set(keys)), expected_messages)
class KafkaScannerTest(BaseScannerTest):
    """End-to-end scans of a fake Kafka topic through the default scanner."""

    msgs = [('AD%d' % i, 'body %d' % i) for i in range(7)]
    samples = get_kafka_msg_samples(msgs)

    def _test_kafka_scan(self, kafka_consumer_mock, num_partitions=1, expected_batches=1):
        """All seven sample records come back, in the expected batch count."""
        expected_messages = 7
        _, number_of_batches, messages = self._get_scanner_messages(
            self.samples, num_partitions, kafka_consumer_mock, nodedupe=True)
        keys = [record['_key'] for record in messages]
        self.assertEqual(len(set(keys)), expected_messages)
        self.assertEqual(len(keys), expected_messages)
        self.assertEqual(number_of_batches, expected_batches)

    test_kafka_scan = _test_kafka_scan

    def test_kafka_scan_partitions(self, kafka_consumer_mock):
        self._test_kafka_scan(kafka_consumer_mock, num_partitions=3, expected_batches=1)

    def _test_kafka_scan_count(self, kafka_consumer_mock, num_partitions=1, expected_batches=1):
        """count=2 caps the scan at two records."""
        expected_messages = 2
        _, number_of_batches, messages = self._get_scanner_messages(
            self.samples, num_partitions, kafka_consumer_mock, nodedupe=True, count=2)
        keys = [record['_key'] for record in messages]
        self.assertEqual(len(set(keys)), expected_messages)
        self.assertEqual(len(keys), expected_messages)
        self.assertEqual(number_of_batches, expected_batches)

    test_kafka_scan_count = _test_kafka_scan_count

    def test_kafka_scan_count_partitions(self, kafka_consumer_mock):
        self._test_kafka_scan_count(kafka_consumer_mock, num_partitions=3)

    def _test_kafka_scan_batchsize(self, kafka_consumer_mock, num_partitions=1):
        """batchsize=2 over seven records produces four batches."""
        expected_messages = 7
        expected_batches = 4
        _, number_of_batches, messages = self._get_scanner_messages(
            self.samples, num_partitions, kafka_consumer_mock, nodedupe=True, batchsize=2)
        keys = [record['_key'] for record in messages]
        self.assertEqual(len(set(keys)), expected_messages)
        self.assertEqual(len(keys), expected_messages)
        self.assertEqual(number_of_batches, expected_batches)

    test_kafka_scan_batchsize = _test_kafka_scan_batchsize

    def test_kafka_scan_batchsize_partitions(self, kafka_consumer_mock):
        self._test_kafka_scan_batchsize(kafka_consumer_mock, num_partitions=3)

    def _test_kafka_scan_batchsize_count(self, kafka_consumer_mock, num_partitions=1):
        """batchsize=2 with count=5 stops after three batches / five records."""
        expected_messages = 5
        expected_batches = 3
        _, number_of_batches, messages = self._get_scanner_messages(
            self.samples, num_partitions, kafka_consumer_mock,
            nodedupe=True, batchsize=2, count=5)
        keys = [record['_key'] for record in messages]
        self.assertEqual(len(set(keys)), expected_messages)
        self.assertEqual(len(keys), expected_messages)
        self.assertEqual(number_of_batches, expected_batches)

    test_kafka_scan_batchsize_count = _test_kafka_scan_batchsize_count

    def test_kafka_scan_batchsize_count_partitions(self, kafka_consumer_mock):
        self._test_kafka_scan_batchsize_count(kafka_consumer_mock, num_partitions=3)

    def _test_kafka_scan_batchcount(self, kafka_consumer_mock, batchsize=10000, batchcount=3,
                                    num_partitions=1, expected_messages=1000):
        """batchcount limits the number of batches actually scanned."""
        samples = get_kafka_msg_samples(
            [('AD%.3d' % i, 'body %d' % i) for i in range(1000)])
        scanner, number_of_batches, messages = self._get_scanner_messages(
            samples, num_partitions, kafka_consumer_mock,
            count_variations={0: 2, 1: 3, 2: 2},
            batchsize=batchsize, batchcount=batchcount)
        self.assertEqual(number_of_batches, min(batchcount, 1000 // batchsize or 1))
        keys = [record['_key'] for record in messages]
        self.assertEqual(len(keys), expected_messages)
        self.assertEqual(len(set(keys)), expected_messages)

    test_kafka_scan_batchcount = _test_kafka_scan_batchcount

    def test_kafka_scan_batchcount_batches(self, kafka_consumer_mock):
        self._test_kafka_scan_batchcount(kafka_consumer_mock, batchsize=200,
                                         expected_messages=600)

    def test_kafka_scan_batchcount_one_batch(self, kafka_consumer_mock):
        self._test_kafka_scan_batchcount(kafka_consumer_mock, batchsize=200, batchcount=1,
                                         expected_messages=200)

    def test_kafka_scan_batchcount_partitions(self, kafka_consumer_mock):
        self._test_kafka_scan_batchcount(kafka_consumer_mock, num_partitions=3)

    def test_kafka_scan_batchcount_batches_partitions(self, kafka_consumer_mock):
        self._test_kafka_scan_batchcount(kafka_consumer_mock, batchsize=200, num_partitions=3,
                                         expected_messages=600)

    def _test_kafka_scan_dedupe(self, kafka_consumer_mock, batchsize=10000):
        """Rewritten keys keep only the newest body after dedupe."""
        stream = [('AD%.3d' % i, 'body %d' % i) for i in range(1000)]
        stream += [('AD%.3d' % i, 'body %dA' % i) for i in range(100, 200)]
        samples = get_kafka_msg_samples(stream)
        scanner, _, messages = self._get_scanner_messages(
            samples, 3, kafka_consumer_mock,
            count_variations={0: 2, 1: 3, 2: 2}, batchsize=batchsize)
        by_key = {record['_key']: record['body'] for record in messages}
        self.assertEqual(len(by_key), 1000)
        self.assertEqual(scanner.issued_count, 1000)
        self.assertEqual(scanner.scanned_count, 1100)
        self.assertEqual(scanner.dupes_count, 100)
        for i in range(100, 200):
            self.assertEqual(by_key['AD%.3d' % i], 'body %dA' % i)

    test_kafka_scan_dedupe = _test_kafka_scan_dedupe

    def test_kafka_scan_dedupe_batches(self, kafka_consumer_mock):
        self._test_kafka_scan_dedupe(kafka_consumer_mock, batchsize=200)

    def _test_kafka_scan_deleted(self, kafka_consumer_mock, batchsize=10000):
        """Tombstones written after the bodies remove those keys from the output."""
        stream = [('AD%.3d' % i, 'body %d' % i) for i in range(1000)]
        stream += [('AD%.3d' % i, None) for i in range(100, 200)]
        samples = get_kafka_msg_samples(stream)
        scanner, _, messages = self._get_scanner_messages(
            samples, 3, kafka_consumer_mock,
            count_variations={0: 2, 1: 3, 2: 2}, batchsize=batchsize)
        by_key = {record['_key']: record['body'] for record in messages}
        self.assertEqual(len(set(by_key)), 900)
        self.assertEqual(scanner.scanned_count, 1100)
        self.assertEqual(scanner.issued_count, 900)
        self.assertEqual(scanner.dupes_count, 100)
        self.assertEqual(scanner.deleted_count, 100)
        for i in range(100, 200):
            self.assertNotIn('AD%.3d' % i, by_key)

    test_kafka_scan_deleted = _test_kafka_scan_deleted

    def test_kafka_scan_deleted_batches(self, kafka_consumer_mock):
        self._test_kafka_scan_deleted(kafka_consumer_mock, batchsize=200)

    def _test_kafka_scan_deleted_before(self, kafka_consumer_mock, batchsize=10000):
        """Tombstones that precede the bodies do not delete anything."""
        stream = [('AD%.3d' % i, None) for i in range(100, 200)]
        stream += [('AD%.3d' % i, 'body %d' % i) for i in range(1000)]
        samples = get_kafka_msg_samples(stream)
        scanner, _, messages = self._get_scanner_messages(
            samples, 3, kafka_consumer_mock,
            count_variations={0: 2, 1: 3, 2: 2}, batchsize=batchsize)
        by_key = {record['_key']: record['body'] for record in messages}
        self.assertEqual(len(set(by_key)), 1000)
        self.assertEqual(scanner.scanned_count, 1100)
        self.assertEqual(scanner.issued_count, 1000)
        self.assertEqual(scanner.dupes_count, 100)
        self.assertEqual(scanner.deleted_count, 0)

    test_kafka_scan_deleted_before = _test_kafka_scan_deleted_before

    def test_kafka_scan_deleted_before_batches(self, kafka_consumer_mock):
        self._test_kafka_scan_deleted_before(kafka_consumer_mock, batchsize=200)

    def test_kafka_scan_nodelete(self, kafka_consumer_mock):
        """nodelete=True issues tombstoned keys with a None body."""
        stream = [('AD%.3d' % i, 'body %d' % i) for i in range(1000)]
        stream += [('AD%.3d' % i, None) for i in range(100, 200)]
        samples = get_kafka_msg_samples(stream)
        scanner, _, messages = self._get_scanner_messages(
            samples, 3, kafka_consumer_mock,
            count_variations={0: 2, 1: 3, 2: 2}, nodelete=True)
        by_key = {record['_key']: record.get('body', None) for record in messages}
        self.assertEqual(len(set(by_key)), 1000)
        self.assertEqual(scanner.scanned_count, 1100)
        self.assertEqual(scanner.issued_count, 1000)
        self.assertEqual(scanner.dupes_count, 100)
        self.assertEqual(scanner.deleted_count, 0)
        for i in range(100, 200):
            self.assertEqual(by_key['AD%.3d' % i], None)

    def test_kafka_scan_dedupe_many(self, kafka_consumer_mock):
        """A fully duplicated stream is reduced to one record per key."""
        samples = get_kafka_msg_samples(
            [('AD%.3d' % i, 'body %d' % i) for i in range(1000)] * 2)
        scanner, _, messages = self._get_scanner_messages(
            samples, 3, kafka_consumer_mock,
            count_variations={0: 2, 1: 3, 2: 2},
            batchsize=250, logcount=250)
        by_key = {record['_key']: record['body'] for record in messages}
        self.assertEqual(len(by_key), 1000)
        self.assertEqual(scanner.issued_count, 1000)
        self.assertEqual(scanner.scanned_count, 2000)
        self.assertEqual(scanner.dupes_count, 1000)

    def _test_kafka_scan_lower_offsets(self, kafka_consumer_mock, batchsize=10000):
        """min_lower_offsets bounds the scan; earlier records never appear."""
        stream = [('AD%.3d' % i, 'body %d' % i) for i in range(1000)]
        stream += [('AD%.3d' % i, None) for i in range(100, 200)]
        samples = get_kafka_msg_samples(stream)
        scanner, _, messages = self._get_scanner_messages(
            samples, 3, kafka_consumer_mock,
            count_variations={0: 2, 1: 3, 2: 2},
            batchsize=batchsize,
            min_lower_offsets={0: 100, 1: 100, 2: 100})
        by_key = {record['_key']: record['body'] for record in messages}
        self.assertEqual(len(set(by_key)), 700)
        self.assertEqual(scanner.scanned_count, 800)
        self.assertEqual(scanner.issued_count, 700)
        self.assertEqual(scanner.dupes_count, 0)
        self.assertEqual(scanner.deleted_count, 100)
        for i in range(100, 200):
            self.assertNotIn('AD%.3d' % i, by_key)

    test_kafka_scan_lower_offsets = _test_kafka_scan_lower_offsets

    def test_kafka_scan_lower_offsets_batches(self, kafka_consumer_mock):
        self._test_kafka_scan_lower_offsets(kafka_consumer_mock, batchsize=200)

    def test_encoding(self, kafka_consumer_mock):
        """Bodies stored in latin1 decode back via the `encoding` option."""
        samples = get_kafka_msg_samples([('AD001', u'hol\xc3\xa1'.encode('latin1'))])
        _, _, messages = self._get_scanner_messages(
            samples, 1, kafka_consumer_mock, encoding='latin1')
        self.assertEqual(messages[0]['body'], u'hol\xc3\xa1')

    def test_wrong_encoding(self, kafka_consumer_mock):
        """An undecodable body causes the record to be dropped."""
        samples = get_kafka_msg_samples([('AD001', six.b('>\xc4\xee'))])
        _, _, messages = self._get_scanner_messages(samples, 1, kafka_consumer_mock)
        self.assertEqual(messages, [])
class KafkaScannerDirectResumeTest(BaseScannerTest):
    """Resume behaviour of KafkaScannerDirect sharing one fake client."""

    scannerclass = KafkaScannerDirect
    msgs = [('AD%.3d' % i, 'body %d' % i) for i in range(1000)]
    samples = get_kafka_msg_samples(msgs)

    def test_kafka_scan_resume_simple_partition(self, kafka_consumer_mock, batchsize=100):
        """Each keep_offsets run continues exactly where the last one stopped."""
        seen = set()
        total = 0
        client = FakeClient(self.samples, 1)
        for batchcount in (2, 2, 4, 2):
            _, _, messages = self._get_scanner_messages(
                None, None, kafka_consumer_mock, client=client,
                keep_offsets=True, batchsize=batchsize, batchcount=batchcount,
                group='test_group')
            keys = set(record['_key'] for record in messages)
            total += len(keys)
            seen.update(keys)
            self.assertEqual(len(keys), batchcount * batchsize)
            self.assertTrue(
                batchsize * (batchcount - 1) <= len(keys) <= batchsize * batchcount)
        # No overlap across runs, and together they cover the whole topic.
        self.assertEqual(len(seen), total)
        self.assertEqual(total, 1000)

    def test_kafka_scan_resume_simple_partition_after_fail(
            self, kafka_consumer_mock, batchsize=100):
        """A failed run plus a resumed run together lose no messages."""
        seen = set()
        total = 0
        client = FakeClient(self.samples, 1)
        # this run will fail on offset 0: 450
        _, _, messages = self._get_scanner_messages(
            None, None, kafka_consumer_mock, client=client,
            keep_offsets=True, batchsize=batchsize, batchcount=5,
            max_next_messages=100, fail_on_offset=450, group='test_group')
        keys = set(record['_key'] for record in messages)
        total += len(keys)
        seen.update(keys)
        # check new run is started correctly, so no messages are lost
        _, _, messages = self._get_scanner_messages(
            None, None, kafka_consumer_mock, client=client,
            keep_offsets=True, batchsize=batchsize, batchcount=7,
            max_next_messages=100, group='test_group')
        keys = set(record['_key'] for record in messages)
        total += len(keys)
        seen.update(keys)
        expected_keys = set(m[0] for m in self.msgs)
        self.assertEqual(expected_keys.difference(seen), set())
        self.assertEqual(len(seen), total)
class KafkaScannerDirectTest(BaseScannerTest):
    """Batching and offset-window behaviour of KafkaScannerDirect."""

    scannerclass = KafkaScannerDirect
    msgs = [('AD%.3d' % i, 'body %d' % i) for i in range(1000)] + \
        [('AD%.3d' % i, None) for i in range(100, 200)]
    samples = get_kafka_msg_samples(msgs)

    def test_kafka_scan_batch(self, kafka_consumer_mock):
        """Direct scan issues every record (incl. tombstones) in fixed batches."""
        _, number_of_batches, messages = self._get_scanner_messages(
            self.samples, 3, kafka_consumer_mock,
            count_variations={0: 2, 1: 3, 2: 2},
            batchsize=200, group='test_group')
        by_key = {record['_key']: record.get('body', None) for record in messages}
        self.assertEqual(len(messages), 1100)
        self.assertEqual(len(set(by_key)), 1000)
        self.assertEqual(number_of_batches, 6)

    def test_kafka_scan_batches_batchcount(self, kafka_consumer_mock, batchsize=100, batchcount=3):
        """batchcount stops the direct scan after the requested batches."""
        _, number_of_batches, messages = self._get_scanner_messages(
            self.samples, 3, kafka_consumer_mock,
            count_variations={0: 2, 1: 3, 2: 2},
            batchsize=batchsize, batchcount=batchcount, group='test_group')
        by_key = {record['_key']: record['body'] for record in messages}
        self.assertTrue('AD000' in by_key)
        self.assertEqual(number_of_batches, 3)
        keys = set(by_key.keys())
        self.assertTrue(
            batchsize * (batchcount - 1) <= len(keys) <= batchsize * batchcount)

    def test_kafka_starting_offsets(self, kafka_consumer_mock):
        """start_offsets skips the head of each partition."""
        _, number_of_batches, messages = self._get_scanner_messages(
            self.samples, 3, kafka_consumer_mock,
            count_variations={0: 2, 1: 3, 2: 2},
            batchsize=200,
            start_offsets={0: 150, 1: 150, 2: 200},
            group='test_group')
        self.assertEqual(len(messages), 600)
        self.assertEqual(number_of_batches, 3)

    def test_kafka_stop_offsets(self, kafka_consumer_mock):
        """stop_offsets bounds the run; the next run resumes from there."""
        client = FakeClient(self.samples, 3)
        _, number_of_batches, messages = self._get_scanner_messages(
            None, None, kafka_consumer_mock, client=client,
            count_variations={0: 2, 1: 3, 2: 2},
            batchsize=200,
            stop_offsets={0: 150, 1: 150, 2: 200},
            group='test_group')
        self.assertEqual(len(messages), 500)
        self.assertEqual(number_of_batches, 3)
        # ensure that next run resumes from previous stop offsets
        _, number_of_batches, messages = self._get_scanner_messages(
            None, None, kafka_consumer_mock, client=client,
            count_variations={0: 2, 1: 3, 2: 2},
            batchsize=200, group='test_group')
        self.assertEqual(len(messages), 600)
        self.assertEqual(number_of_batches, 3)
def test_wrong_encoding(self, kafka_consumer_mock):
    """A body that is not valid in the default encoding is dropped."""
    samples = get_kafka_msg_samples([('AD001', six.b('>\xc4\xee'))])
    _, _, messages = self._get_scanner_messages(samples, 1, kafka_consumer_mock)
    self.assertEqual(messages, [])
def test_wrong_encoding(self, kafka_consumer_mock):
    """A body that is not valid in the default encoding is dropped.

    Wrap the literal with six.b(), matching the sibling copy of this test:
    a plain str '>\xc4\xee' is text under Python 3 and would be serialized
    as valid UTF-8, so the "wrong encoding" path would never trigger.
    """
    msgs = [('AD001', six.b('>\xc4\xee'))]
    samples = get_kafka_msg_samples(msgs)
    _, _, messages = self._get_scanner_messages(samples, 1, kafka_consumer_mock)
    self.assertEqual(messages, [])