# Variant of this test that still passes use_fastavro to WriteToAvro /
# ReadAllFromAvro: it writes the same records with both implementations and
# cross-checks the two outputs. A second variant without the flag follows
# below.
def test_avro_it(self):
  num_records = self.test_pipeline.get_option('records')
  num_records = int(num_records) if num_records else 1000000

  # Seed a `PCollection` with indices that will each be FlatMap'd into
  # `batch_size` records, to avoid having a too-large list in memory at
  # the outset
  batch_size = self.test_pipeline.get_option('batch-size')
  batch_size = int(batch_size) if batch_size else 10000

  # pylint: disable=range-builtin-not-iterating
  batches = range(int(num_records / batch_size))

  def batch_indices(start):
    # pylint: disable=range-builtin-not-iterating
    return range(start * batch_size, (start + 1) * batch_size)

  # A `PCollection` with `num_records` avro records
  records_pcoll = \
      self.test_pipeline \
      | 'create-batches' >> Create(batches) \
      | 'expand-batches' >> FlatMap(batch_indices) \
      | 'create-records' >> Map(record)

  fastavro_output = '/'.join([self.output, 'fastavro'])
  avro_output = '/'.join([self.output, 'avro'])
  self.addCleanup(delete_files, [self.output + '*'])

  # pylint: disable=expression-not-assigned
  records_pcoll \
      | 'write_fastavro' >> WriteToAvro(
          fastavro_output,
          self.SCHEMA,
          use_fastavro=True
      )

  # pylint: disable=expression-not-assigned
  records_pcoll \
      | 'write_avro' >> WriteToAvro(
          avro_output,
          self.SCHEMA,
          use_fastavro=False
      )

  result = self.test_pipeline.run()
  result.wait_until_finish()
  assert result.state == PipelineState.DONE

  fastavro_read_pipeline = TestPipeline(is_integration_test=True)

  fastavro_records = \
      fastavro_read_pipeline \
      | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
      | 'read-fastavro' >> ReadAllFromAvro(use_fastavro=True) \
      | Map(lambda rec: (rec['number'], rec))

  avro_records = \
      fastavro_read_pipeline \
      | 'create-avro' >> Create(['%s*' % avro_output]) \
      | 'read-avro' >> ReadAllFromAvro(use_fastavro=False) \
      | Map(lambda rec: (rec['number'], rec))

  def check(elem):
    v = elem[1]

    def assertEqual(l, r):
      if l != r:
        raise BeamAssertException('Assertion failed: %s == %s' % (l, r))

    # Sort the keys so the comparison is independent of dict ordering and
    # works on Python 3, where keys() is a view rather than a list.
    assertEqual(sorted(v.keys()), ['avro', 'fastavro'])
    avro_values = v['avro']
    fastavro_values = v['fastavro']
    assertEqual(avro_values, fastavro_values)
    assertEqual(len(avro_values), 1)

  # pylint: disable=expression-not-assigned
  {
      'avro': avro_records,
      'fastavro': fastavro_records
  } \
      | CoGroupByKey() \
      | Map(check)

  # Assert on the read pipeline's result, not the already-checked write result.
  read_result = fastavro_read_pipeline.run()
  read_result.wait_until_finish()
  assert read_result.state == PipelineState.DONE
def test_avro_it(self):
  num_records = self.test_pipeline.get_option('records')
  num_records = int(num_records) if num_records else 1000000
  fastavro_output = '/'.join([self.output, 'fastavro'])

  # Seed a `PCollection` with indices that will each be FlatMap'd into
  # `batch_size` records, to avoid having a too-large list in memory at
  # the outset
  batch_size = self.test_pipeline.get_option('batch-size')
  batch_size = int(batch_size) if batch_size else 10000

  # pylint: disable=bad-option-value
  batches = range(int(num_records / batch_size))

  def batch_indices(start):
    # pylint: disable=bad-option-value
    return range(start * batch_size, (start + 1) * batch_size)

  # A `PCollection` with `num_records` avro records
  records_pcoll = \
      self.test_pipeline \
      | 'create-batches' >> Create(batches) \
      | 'expand-batches' >> FlatMap(batch_indices) \
      | 'create-records' >> Map(record)

  # pylint: disable=expression-not-assigned
  records_pcoll \
      | 'write_fastavro' >> WriteToAvro(
          fastavro_output,
          parse_schema(json.loads(self.SCHEMA_STRING)),
      )
  result = self.test_pipeline.run()
  result.wait_until_finish()

  fastavro_pcoll = self.test_pipeline \
      | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
      | 'read-fastavro' >> ReadAllFromAvro()

  mapped_fastavro_pcoll = fastavro_pcoll | "map_fastavro" >> Map(
      lambda x: (x['number'], x))
  mapped_record_pcoll = records_pcoll | "map_record" >> Map(
      lambda x: (x['number'], x))

  def validate_record(elem):
    v = elem[1]

    def assertEqual(l, r):
      if l != r:
        raise BeamAssertException('Assertion failed: %s == %s' % (l, r))

    assertEqual(sorted(v.keys()), ['fastavro', 'record_pcoll'])
    record_pcoll_values = v['record_pcoll']
    fastavro_values = v['fastavro']
    assertEqual(record_pcoll_values, fastavro_values)
    assertEqual(len(record_pcoll_values), 1)

  {
      "record_pcoll": mapped_record_pcoll,
      "fastavro": mapped_fastavro_pcoll
  } | CoGroupByKey() | Map(validate_record)

  result = self.test_pipeline.run()
  result.wait_until_finish()
  self.addCleanup(delete_files, [self.output])
  assert result.state == PipelineState.DONE
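# Both variants above rely on a module-level `record(i)` helper, a schema
# attribute on the test class (self.SCHEMA / self.SCHEMA_STRING), and Beam
# test utilities (TestPipeline, BeamAssertException, delete_files) that are
# not shown in this snippet. Below is a minimal sketch of the helper and
# schema; only the 'number' field is implied by the tests (it is the join
# key), so the second field and the exact schema shape are illustrative
# assumptions rather than the canonical definitions.
import json

SCHEMA_STRING = json.dumps({
    'namespace': 'example.avro',
    'type': 'record',
    'name': 'TestRecord',
    'fields': [
        {'name': 'number', 'type': 'int'},
        {'name': 'number_str', 'type': 'string'},  # assumed extra field
    ]
})


def record(i):
  # One Avro-compatible dict per index; the tests join written and expected
  # records on the 'number' key.
  return {'number': i, 'number_str': str(i)}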