def test_create_transform(self):
  with TestPipeline() as p:
    assert_that(p | 'Empty' >> Create([]), equal_to([]), label='empty')
    assert_that(p | 'One' >> Create([None]), equal_to([None]), label='one')
    assert_that(p | Create(list(range(10))), equal_to(list(range(10))))

def test(self):
  p = TestPipeline(DirectRunner())
  test_user = {'account': {'id': 1}, 'country': 'Germany'}
  test_account_offer = {
      'account_id': 1,
      'account_offer_id': 2,
      'offer_id': 3,
  }
  test_offer = {'offer_id': 3, 'offer_name': 'offer name'}

  users = p | "Create users" >> Create([test_user])
  account_offers = p | "Create account offers" >> Create(
      [test_account_offer])
  offers = p | "Create offers" >> Create([test_offer])

  result = {
      'users': users,
      'account_offers': account_offers,
      'offers': offers
  } | OfferStatTransform()

  assert_that(result, self.assertSimple)
  p.run()

def test_create_uses_coder_for_pickling(self):
  coders.registry.register_coder(_Unpicklable, _UnpicklableCoder)
  create = Create([_Unpicklable(1), _Unpicklable(2), _Unpicklable(3)])
  unpickled_create = pickler.loads(pickler.dumps(create))
  self.assertEqual(
      sorted(create.values, key=lambda v: v.value),
      sorted(unpickled_create.values, key=lambda v: v.value))

  with self.assertRaises(NotImplementedError):
    # As there is no special coder for Union types, this will fall back to
    # FastPrimitivesCoder, which in turn falls back to pickling.
    create_mixed_types = Create([_Unpicklable(1), 2])
    pickler.dumps(create_mixed_types)

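# A hedged sketch of the helper types the test above assumes. The real
# _Unpicklable and _UnpicklableCoder are defined elsewhere in the test module,
# so treat these definitions as illustrative, not authoritative: the value
# type refuses ordinary pickling, and the registered coder round-trips it
# without falling back to pickle.
class _Unpicklable(object):
  def __init__(self, value):
    self.value = value

  def __getstate__(self):
    # Refuse plain pickling so only the registered coder can serialize it.
    raise NotImplementedError()

  def __setstate__(self, state):
    raise NotImplementedError()


class _UnpicklableCoder(coders.Coder):
  def encode(self, value):
    return str(value.value).encode()

  def decode(self, encoded):
    return _Unpicklable(int(encoded.decode()))

  def to_type_hint(self):
    return _Unpicklable

  def is_deterministic(self):
    return True
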
def test_read_all_from_parquet_file_pattern(self):
  file_pattern = self._write_pattern(5)
  with TestPipeline() as p:
    assert_that(
        p \
        | Create([file_pattern]) \
        | ReadAllFromParquet(),
        equal_to(self.RECORDS * 5))
  with TestPipeline() as p:
    assert_that(
        p \
        | Create([file_pattern]) \
        | ReadAllFromParquetBatched(),
        equal_to([self._records_as_arrow()] * 5))

def test_sdf_with_side_inputs(self):
  with TestPipeline() as p:
    side1 = p | 'Create1' >> Create(['1', '2'])
    side2 = p | 'Create2' >> Create(['3', '4'])
    side3 = p | 'Create3' >> Create(['5'])
    result = (
        p
        | 'create_main' >> beam.Create(['a', 'b', 'c'])
        | beam.ParDo(
            ExpandStrings(), AsList(side1), AsList(side2), AsSingleton(side3)))

    expected_result = []
    for c in ['a', 'b', 'c']:
      for i in range(5):
        expected_result.append(c + ':' + str(i + 1))
    assert_that(result, equal_to(expected_result))

def test_read_all_from_parquet_single_file(self):
  path = self._write_data()
  with TestPipeline() as p:
    assert_that(
        p \
        | Create([path]) \
        | ReadAllFromParquet(),
        equal_to(self.RECORDS))
  with TestPipeline() as p:
    assert_that(
        p \
        | Create([path]) \
        | ReadAllFromParquetBatched(),
        equal_to([self._records_as_arrow()]))

def test_reified_value_assert_fail_unmatched_timestamp(self):
  expected = [TestWindowedValue(v, 1, [GlobalWindow()]) for v in [1, 2, 3]]
  with self.assertRaises(Exception):
    with TestPipeline() as p:
      assert_that(
          p | Create([2, 3, 1]), equal_to(expected), reify_windows=True)

def expand(self, pbegin):
  if self._read_operations is not None and isinstance(pbegin, PBegin):
    pcoll = pbegin.pipeline | Create(self._read_operations)
  elif not isinstance(pbegin, PBegin):
    if self._read_operations is not None:
      raise ValueError(
          "Read operations in the constructor only work when applied at "
          "the root of the pipeline.")
    pcoll = pbegin
  else:
    raise ValueError(
        "Spanner requires a read operation, an sql query, or a table "
        "with columns.")

  if self._transaction is None:
    # Batch read: use the Spanner partitioning query to create partitions
    # that can be read in parallel.
    p = (
        pcoll
        | 'Generate Partitions' >> ParDo(
            _CreateReadPartitions(spanner_configuration=self._configuration))
        | 'Reshuffle' >> Reshuffle()
        | 'Read From Partitions' >> ParDo(
            _ReadFromPartitionFn(spanner_configuration=self._configuration)))
  else:
    # Naive read: no batching; each read operation is executed as a single
    # query within the supplied transaction.
    p = (
        pcoll
        | 'Reshuffle' >> Reshuffle().with_input_types(ReadOperation)
        | 'Perform Read' >> ParDo(
            _NaiveSpannerReadDoFn(spanner_configuration=self._configuration),
            AsSingleton(self._transaction)))
  return p

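# A minimal usage sketch for the expand() above, assuming it belongs to a
# ReadFromSpanner-style composite transform. The constructor arguments shown
# here (project_id, instance_id, database_id, sql) are illustrative
# assumptions, not a confirmed signature.
with Pipeline() as p:
  # No transaction supplied, so the transform takes the batch path:
  # 'Generate Partitions' -> 'Reshuffle' -> 'Read From Partitions'.
  rows = p | ReadFromSpanner(
      project_id='my-project',
      instance_id='my-instance',
      database_id='my-db',
      sql='SELECT * FROM users')
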
def test_assert_missing_and_unexpected(self):
  with self.assertRaisesRegex(
      BeamAssertException,
      r"unexpected elements \['c'\].*missing elements \['d'\]"):
    with TestPipeline() as p:
      assert_that(p | Create(['a', 'b', 'c']), equal_to(['a', 'b', 'd']))

def test_read_all_from_parquet_many_file_patterns(self):
  file_pattern1 = self._write_pattern(5)
  file_pattern2 = self._write_pattern(2)
  file_pattern3 = self._write_pattern(3)
  with TestPipeline() as p:
    assert_that(
        p \
        | Create([file_pattern1, file_pattern2, file_pattern3]) \
        | ReadAllFromParquet(),
        equal_to(self.RECORDS * 10))
  with TestPipeline() as p:
    assert_that(
        p \
        | Create([file_pattern1, file_pattern2, file_pattern3]) \
        | ReadAllFromParquetBatched(),
        equal_to([self._records_as_arrow()] * 10))

def test_reified_value_assert_fail_unmatched_window(self):
  expected = [
      TestWindowedValue(v, MIN_TIMESTAMP, [IntervalWindow(0, 1)])
      for v in [1, 2, 3]
  ]
  with self.assertRaises(Exception):
    with TestPipeline() as p:
      assert_that(
          p | Create([2, 3, 1]), equal_to(expected), reify_windows=True)

def test_on_direct_runner(self):
  class FakeSink(NativeSink):
    """A fake sink outputting a number of elements."""
    def __init__(self):
      self.written_values = []
      self.writer_instance = FakeSinkWriter(self.written_values)

    def writer(self):
      return self.writer_instance

  class FakeSinkWriter(NativeSinkWriter):
    """A fake sink writer for testing."""
    def __init__(self, written_values):
      self.written_values = written_values

    def __enter__(self):
      return self

    def __exit__(self, *unused_args):
      pass

    def Write(self, value):
      self.written_values.append(value)

  with TestPipeline() as p:
    sink = FakeSink()
    p | Create(['a', 'b', 'c']) | _NativeWrite(sink)  # pylint: disable=expression-not-assigned

  self.assertEqual(['a', 'b', 'c'], sorted(sink.written_values))

def test_read_all_from_parquet_many_single_files(self):
  path1 = self._write_data()
  path2 = self._write_data()
  path3 = self._write_data()
  with TestPipeline() as p:
    assert_that(
        p \
        | Create([path1, path2, path3]) \
        | ReadAllFromParquet(),
        equal_to(self.RECORDS * 3))
  with TestPipeline() as p:
    assert_that(
        p \
        | Create([path1, path2, path3]) \
        | ReadAllFromParquetBatched(),
        equal_to([self._records_as_arrow()] * 3))

def test_on_direct_runner(self):
  class FakeSink(NativeSink):
    """A fake sink outputting a number of elements."""
    def __init__(self):
      self.written_values = []
      self.writer_instance = FakeSinkWriter(self.written_values)

    def writer(self):
      return self.writer_instance

  class FakeSinkWriter(NativeSinkWriter):
    """A fake sink writer for testing."""
    def __init__(self, written_values):
      self.written_values = written_values

    def __enter__(self):
      return self

    def __exit__(self, *unused_args):
      pass

    def Write(self, value):
      self.written_values.append(value)

  # Records writes in memory, so this only works on the direct runner.
  p = TestPipeline(runner='DirectRunner')
  sink = FakeSink()
  p | Create(['a', 'b', 'c']) | _NativeWrite(sink)  # pylint: disable=expression-not-assigned
  p.run()
  self.assertEqual(['a', 'b', 'c'], sink.written_values)

def test_read_all_from_avro_file_pattern(self):
  file_pattern = self._write_pattern(5)
  with TestPipeline() as p:
    assert_that(
        p \
        | Create([file_pattern]) \
        | avroio.ReadAllFromAvro(use_fastavro=self.use_fastavro),
        equal_to(self.RECORDS * 5))

def test_read_all_from_avro_single_file(self):
  path = self._write_data()
  with TestPipeline() as p:
    assert_that(
        p \
        | Create([path]) \
        | avroio.ReadAllFromAvro(use_fastavro=self.use_fastavro),
        equal_to(self.RECORDS))

def test_read_all_from_avro_many_single_files(self):
  path1 = self._write_data()
  path2 = self._write_data()
  path3 = self._write_data()
  with TestPipeline() as p:
    assert_that(
        p
        | Create([path1, path2, path3])
        | avroio.ReadAllFromAvro(),
        equal_to(self.RECORDS * 3))

def test_reified_value_passes(self):
  expected = [
      TestWindowedValue(v, MIN_TIMESTAMP, [GlobalWindow()]) for v in [1, 2, 3]
  ]
  with TestPipeline() as p:
    assert_that(
        p | Create([2, 3, 1]), equal_to(expected), reify_windows=True)

def test_read_all_from_avro_with_filename(self):
  file_pattern, file_paths = self._write_pattern(3, return_filenames=True)
  result = [(path, record) for path in file_paths for record in self.RECORDS]
  with TestPipeline() as p:
    assert_that(
        p \
        | Create([file_pattern]) \
        | avroio.ReadAllFromAvro(with_filename=True),
        equal_to(result))

def test_read_all_from_avro_many_file_patterns(self):
  file_pattern1 = self._write_pattern(5)
  file_pattern2 = self._write_pattern(2)
  file_pattern3 = self._write_pattern(3)
  with TestPipeline() as p:
    assert_that(
        p
        | Create([file_pattern1, file_pattern2, file_pattern3])
        | avroio.ReadAllFromAvro(),
        equal_to(self.RECORDS * 10))

def test_sink_transform_int96(self):
  with tempfile.NamedTemporaryFile() as dst:
    path = dst.name
    # pylint: disable=c-extension-no-member
    with self.assertRaises(pl.ArrowInvalid):
      with TestPipeline() as p:
        _ = p \
            | Create(self.RECORDS) \
            | WriteToParquet(
                path, self.SCHEMA96, num_shards=1, shard_name_template='')

def main(argv=None):
  options = PipelineOptions(argv)
  p = Pipeline(options=options)
  (p
   | Create(["a", "b", "c", "d", "e"], reshuffle=False)
   | Print("hello", expansion_service(options)))
  p.run()

def test_sink_transform_multiple_row_group(self):
  with tempfile.NamedTemporaryFile() as dst:
    path = dst.name
    with TestPipeline() as p:
      # Writes about 623200 bytes of data; with a 250000-byte row group
      # buffer this flushes two full row groups plus a final partial one.
      _ = p \
          | Create(self.RECORDS * 4000) \
          | WriteToParquet(
              path,
              self.SCHEMA,
              num_shards=1,
              codec='none',
              shard_name_template='',
              row_group_buffer_size=250000)
    self.assertEqual(pq.read_metadata(path).num_row_groups, 3)

def test_process_auto(self):
  with TempDir() as temp_dir:
    path = temp_dir.create_temp_file('result.gz')
    _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
    with TestPipeline() as p:
      result = (
          p
          | Create([path])
          | ReadAllFromTFRecord(
              coder=coders.BytesCoder(),
              compression_type=CompressionTypes.AUTO))
      assert_that(result, equal_to([b'foo', b'bar']))

def test_process_glob(self):
  with TempDir() as temp_dir:
    self._write_glob(temp_dir, 'result')
    glob = temp_dir.get_path() + os.path.sep + '*result'
    with TestPipeline() as p:
      result = (
          p
          | Create([glob])
          | ReadAllFromTFRecord(
              coder=coders.BytesCoder(),
              compression_type=CompressionTypes.AUTO))
      assert_that(result, equal_to([b'foo', b'bar'] * 3))

def test_sink_transform_multiple_row_group(self):
  with TemporaryDirectory() as tmp_dirname:
    path = os.path.join(tmp_dirname, "tmp_filename")
    with TestPipeline() as p:
      # Writes about 623200 bytes of data; with a 250000-byte row group
      # buffer this flushes two full row groups plus a final partial one.
      _ = p \
          | Create(self.RECORDS * 4000) \
          | WriteToParquet(
              path,
              self.SCHEMA,
              num_shards=1,
              codec='none',
              shard_name_template='',
              row_group_buffer_size=250000)
    self.assertEqual(pq.read_metadata(path).num_row_groups, 3)

def test_sink_transform_int96(self):
  with tempfile.NamedTemporaryFile() as dst:
    path = dst.name
    # pylint: disable=c-extension-no-member
    with self.assertRaises(pl.ArrowInvalid):
      # Should throw an error "ArrowInvalid: Casting from timestamp[ns] to
      # timestamp[us] would lose data"
      with TestPipeline() as p:
        _ = p \
            | Create(self.RECORDS) \
            | WriteToParquet(
                path, self.SCHEMA96, num_shards=1, shard_name_template='')

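# The SCHEMA96 fixture above is assumed to contain a nanosecond-precision
# timestamp column, roughly along these lines (field names are illustrative):
SCHEMA96 = pa.schema([
    ('name', pa.binary()),
    ('nanos_timestamp', pa.timestamp('ns')),
])
# The writer here stores timestamps at microsecond precision, so casting
# timestamp[ns] down to timestamp[us] would lose data, which is why the
# write is expected to raise ArrowInvalid.
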
def _generate_data(self, p, output_prefix, init_size, data_size):
  init_data = [x for x in range(init_size)]

  lines = (
      p
      | 'create' >> Create(init_data)
      | 'produce' >> ParDo(ProducerFn(data_size)))

  schema = pa.schema([('name', pa.binary()), ('number', pa.int64())])

  files = lines | 'write' >> WriteToParquet(
      output_prefix, schema, codec='snappy', file_name_suffix='.parquet')
  return files

def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True
  p = Pipeline(options=options)
  (p
   | Create(list(range(NUM_SHARDS)))
   | FlatMap(
       lambda _: (
           bytes(ELEMENT_BYTES) for _ in range(NUM_ELEMENTS_PER_SHARD)))
   | WithKeys('')
   | ParDo(BigBagDoFn()))
  p.run()

def expand(self, pcoll):
  """
  :return: PCollection[kind_name]
  """
  from google.cloud import datastore
  from apache_beam import Create
  import logging

  query = datastore.Client(self.project_id).query(kind='__kind__')
  query.keys_only()
  kinds = [entity.key.id_or_name for entity in query.fetch()]
  logging.info("kinds: {}".format(kinds))
  return pcoll.pipeline | 'Kind' >> Create(kinds)