Example #1
 def test_create_transform(self):
     with TestPipeline() as p:
         assert_that(p | 'Empty' >> Create([]), equal_to([]), label='empty')
         assert_that(p | 'One' >> Create([None]),
                     equal_to([None]),
                     label='one')
         assert_that(p | Create(list(range(10))), equal_to(list(range(10))))
Example #2
    def test(self):
        p = TestPipeline(DirectRunner())

        test_user = {'account': {'id': 1}, 'country': 'Germany'}
        test_account_offer = {
            'account_id': 1,
            'account_offer_id': 2,
            'offer_id': 3,
        }
        test_offer = {'offer_id': 3, 'offer_name': 'offer name'}

        users = p | "Create users" >> Create([test_user])
        account_offers = p | "Create account offers" >> Create(
            [test_account_offer])
        offers = p | "Create offers" >> Create([test_offer])

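        # A dict of labelled PCollections can be applied to a composite
        # transform; OfferStatTransform.expand() receives this dict as input.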
        result = {
            'users': users,
            'account_offers': account_offers,
            'offers': offers
        } | OfferStatTransform()

        assert_that(result, self.assertSimple)

        p.run()
Example #3
    def test_create_uses_coder_for_pickling(self):
        coders.registry.register_coder(_Unpicklable, _UnpicklableCoder)
        create = Create([_Unpicklable(1), _Unpicklable(2), _Unpicklable(3)])
        unpickled_create = pickler.loads(pickler.dumps(create))
        self.assertEqual(
            sorted(create.values, key=lambda v: v.value),
            sorted(unpickled_create.values, key=lambda v: v.value))

        with self.assertRaises(NotImplementedError):
            # As there is no special coder for Union types, this will fall back to
            # FastPrimitivesCoder, which in turn falls back to pickling.
            create_mixed_types = Create([_Unpicklable(1), 2])
            pickler.dumps(create_mixed_types)
Example #4
 def test_read_all_from_parquet_file_pattern(self):
     file_pattern = self._write_pattern(5)
     with TestPipeline() as p:
         assert_that(
             p \
             | Create([file_pattern]) \
             | ReadAllFromParquet(),
             equal_to(self.RECORDS * 5))
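     # The batched variant emits pyarrow Tables rather than one dict per record.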
     with TestPipeline() as p:
         assert_that(
             p \
             | Create([file_pattern]) \
             | ReadAllFromParquetBatched(),
             equal_to([self._records_as_arrow()] * 5))
Example #5
    def test_sdf_with_side_inputs(self):
        with TestPipeline() as p:
            side1 = p | 'Create1' >> Create(['1', '2'])
            side2 = p | 'Create2' >> Create(['3', '4'])
            side3 = p | 'Create3' >> Create(['5'])
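            # AsList materializes side1/side2 as Python lists and AsSingleton
            # passes side3 as a single value to the DoFn.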
            result = (p
                      | 'create_main' >> beam.Create(['a', 'b', 'c'])
                      | beam.ParDo(ExpandStrings(), AsList(side1),
                                   AsList(side2), AsSingleton(side3)))

            expected_result = []
            for c in ['a', 'b', 'c']:
                for i in range(5):
                    expected_result.append(c + ':' + str(i + 1))
            assert_that(result, equal_to(expected_result))
Example #6
    def test_read_all_from_parquet_single_file(self):
        path = self._write_data()
        with TestPipeline() as p:
            assert_that(
                p \
                | Create([path]) \
                | ReadAllFromParquet(),
                equal_to(self.RECORDS))

        with TestPipeline() as p:
            assert_that(
                p \
                | Create([path]) \
                | ReadAllFromParquetBatched(),
                equal_to([self._records_as_arrow()]))
Example #7
 def test_reified_value_assert_fail_unmatched_timestamp(self):
   expected = [TestWindowedValue(v, 1, [GlobalWindow()])
               for v in [1, 2, 3]]
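   # With reify_windows=True the matcher compares full windowed values; Create
   # assigns MIN_TIMESTAMP, so the expected timestamp of 1 cannot match.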
   with self.assertRaises(Exception):
     with TestPipeline() as p:
       assert_that(p | Create([2, 3, 1]), equal_to(expected),
                   reify_windows=True)
Example #8
  def expand(self, pbegin):
    if self._read_operations is not None and isinstance(pbegin, PBegin):
      pcoll = pbegin.pipeline | Create(self._read_operations)
    elif not isinstance(pbegin, PBegin):
      if self._read_operations is not None:
        raise ValueError(
            "Read operation in the constructor only works with "
            "the root of the pipeline.")
      pcoll = pbegin
    else:
      raise ValueError(
          "Spanner required read operation, sql or table "
          "with columns.")

    if self._transaction is None:
      # Batch read: use the Spanner partitioning query to split the read
      # into batches.
      p = (
          pcoll
          | 'Generate Partitions' >> ParDo(
              _CreateReadPartitions(spanner_configuration=self._configuration))
          | 'Reshuffle' >> Reshuffle()
          | 'Read From Partitions' >> ParDo(
              _ReadFromPartitionFn(spanner_configuration=self._configuration)))
    else:
      # Naive read: execute the queries as a single read, without batching.
      p = (
          pcoll
          | 'Reshuffle' >> Reshuffle().with_input_types(ReadOperation)
          | 'Perform Read' >> ParDo(
              _NaiveSpannerReadDoFn(spanner_configuration=self._configuration),
              AsSingleton(self._transaction)))
    return p
Example #9
 def test_assert_missing_and_unexpected(self):
   with self.assertRaisesRegex(
       BeamAssertException,
       r"unexpected elements \['c'\].*missing elements \['d'\]"):
     with TestPipeline() as p:
       assert_that(p | Create(['a', 'b', 'c']),
                   equal_to(['a', 'b', 'd']))
Example #10
 def test_read_all_from_parquet_many_file_patterns(self):
     file_pattern1 = self._write_pattern(5)
     file_pattern2 = self._write_pattern(2)
     file_pattern3 = self._write_pattern(3)
     with TestPipeline() as p:
         assert_that(
             p \
             | Create([file_pattern1, file_pattern2, file_pattern3]) \
             | ReadAllFromParquet(),
             equal_to(self.RECORDS * 10))
     with TestPipeline() as p:
         assert_that(
             p \
             | Create([file_pattern1, file_pattern2, file_pattern3]) \
             | ReadAllFromParquetBatched(),
             equal_to([self._records_as_arrow()] * 10))
Example #11
 def test_reified_value_assert_fail_unmatched_window(self):
   expected = [TestWindowedValue(v, MIN_TIMESTAMP, [IntervalWindow(0, 1)])
               for v in [1, 2, 3]]
   with self.assertRaises(Exception):
     with TestPipeline() as p:
       assert_that(p | Create([2, 3, 1]), equal_to(expected),
                   reify_windows=True)
Example #12
    def test_on_direct_runner(self):
        class FakeSink(NativeSink):
            """A fake sink outputing a number of elements."""
            def __init__(self):
                self.written_values = []
                self.writer_instance = FakeSinkWriter(self.written_values)

            def writer(self):
                return self.writer_instance

        class FakeSinkWriter(NativeSinkWriter):
            """A fake sink writer for testing."""
            def __init__(self, written_values):
                self.written_values = written_values

            def __enter__(self):
                return self

            def __exit__(self, *unused_args):
                pass

            def Write(self, value):
                self.written_values.append(value)

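        # Recording writes in memory like this only works on the direct runner.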
        with TestPipeline() as p:
            sink = FakeSink()
            p | Create(['a', 'b', 'c']) | _NativeWrite(sink)  # pylint: disable=expression-not-assigned

        self.assertEqual(['a', 'b', 'c'], sorted(sink.written_values))
Example #13
 def test_read_all_from_parquet_many_single_files(self):
     path1 = self._write_data()
     path2 = self._write_data()
     path3 = self._write_data()
     with TestPipeline() as p:
         assert_that(
             p \
             | Create([path1, path2, path3]) \
             | ReadAllFromParquet(),
             equal_to(self.RECORDS * 3))
     with TestPipeline() as p:
         assert_that(
             p \
             | Create([path1, path2, path3]) \
             | ReadAllFromParquetBatched(),
             equal_to([self._records_as_arrow()] * 3))
Example #14
    def test_on_direct_runner(self):
        class FakeSink(NativeSink):
            """A fake sink outputing a number of elements."""
            def __init__(self):
                self.written_values = []
                self.writer_instance = FakeSinkWriter(self.written_values)

            def writer(self):
                return self.writer_instance

        class FakeSinkWriter(NativeSinkWriter):
            """A fake sink writer for testing."""
            def __init__(self, written_values):
                self.written_values = written_values

            def __enter__(self):
                return self

            def __exit__(self, *unused_args):
                pass

            def Write(self, value):
                self.written_values.append(value)

        # Records in-memory writes, only works on Direct runner.
        p = TestPipeline(runner='DirectRunner')
        sink = FakeSink()
        p | Create(['a', 'b', 'c']) | _NativeWrite(sink)  # pylint: disable=expression-not-assigned
        p.run()

        self.assertEqual(['a', 'b', 'c'], sink.written_values)
Example #15
 def test_read_all_from_avro_file_pattern(self):
   file_pattern = self._write_pattern(5)
   with TestPipeline() as p:
     assert_that(
         p \
         | Create([file_pattern]) \
         | avroio.ReadAllFromAvro(use_fastavro=self.use_fastavro),
         equal_to(self.RECORDS * 5))
Example #16
 def test_read_all_from_avro_single_file(self):
   path = self._write_data()
   with TestPipeline() as p:
     assert_that(
         p \
         | Create([path]) \
         | avroio.ReadAllFromAvro(use_fastavro=self.use_fastavro),
         equal_to(self.RECORDS))
Example #17
 def test_read_all_from_avro_many_single_files(self):
     path1 = self._write_data()
     path2 = self._write_data()
     path3 = self._write_data()
     with TestPipeline() as p:
         assert_that(
             p | Create([path1, path2, path3]) | avroio.ReadAllFromAvro(),
             equal_to(self.RECORDS * 3))
Example #18
 def test_reified_value_passes(self):
     expected = [
         TestWindowedValue(v, MIN_TIMESTAMP, [GlobalWindow()])
         for v in [1, 2, 3]
     ]
     with TestPipeline() as p:
         assert_that(p | Create([2, 3, 1]),
                     equal_to(expected),
                     reify_windows=True)
Example #19
 def test_read_all_from_avro_with_filename(self):
   file_pattern, file_paths = self._write_pattern(3, return_filenames=True)
   result = [(path, record) for path in file_paths for record in self.RECORDS]
   with TestPipeline() as p:
     assert_that(
         p \
         | Create([file_pattern]) \
         | avroio.ReadAllFromAvro(with_filename=True),
         equal_to(result))
Example #20
 def test_read_all_from_avro_many_file_patterns(self):
     file_pattern1 = self._write_pattern(5)
     file_pattern2 = self._write_pattern(2)
     file_pattern3 = self._write_pattern(3)
     with TestPipeline() as p:
         assert_that(
             p
             | Create([file_pattern1, file_pattern2, file_pattern3])
             | avroio.ReadAllFromAvro(), equal_to(self.RECORDS * 10))
Example #21
 def test_sink_transform_int96(self):
   with tempfile.NamedTemporaryFile() as dst:
     path = dst.name
     # pylint: disable=c-extension-no-member
     with self.assertRaises(pl.ArrowInvalid):
       with TestPipeline() as p:
         _ = p \
         | Create(self.RECORDS) \
         | WriteToParquet(
             path, self.SCHEMA96, num_shards=1, shard_name_template='')
Example #22
def main(argv=None):
  options = PipelineOptions(argv)

  p = Pipeline(options=options)

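  # reshuffle=False skips the Reshuffle that Create normally inserts before
  # handing the elements to the Print transform from the expansion service.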
  (p
   | Create(["a", "b", "c", "d", "e"], reshuffle=False)
   | Print("hello", expansion_service(options)))

  p.run()
Example #23
 def test_sink_transform_multiple_row_group(self):
     with tempfile.NamedTemporaryFile() as dst:
         path = dst.name
         with TestPipeline() as p:
             # writing 623200 bytes of data
             _ = p \
             | Create(self.RECORDS * 4000) \
             | WriteToParquet(
                 path, self.SCHEMA, num_shards=1, codec='none',
                 shard_name_template='', row_group_buffer_size=250000)
         self.assertEqual(pq.read_metadata(path).num_row_groups, 3)
Example #24
 def test_process_auto(self):
   with TempDir() as temp_dir:
     path = temp_dir.create_temp_file('result.gz')
     _write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with TestPipeline() as p:
       result = (p
                 | Create([path])
                 | ReadAllFromTFRecord(
                     coder=coders.BytesCoder(),
                     compression_type=CompressionTypes.AUTO))
       assert_that(result, equal_to([b'foo', b'bar']))
Example #25
 def test_process_glob(self):
   with TempDir() as temp_dir:
     self._write_glob(temp_dir, 'result')
     glob = temp_dir.get_path() + os.path.sep + '*result'
     with TestPipeline() as p:
       result = (p
                 | Create([glob])
                 | ReadAllFromTFRecord(
                     coder=coders.BytesCoder(),
                     compression_type=CompressionTypes.AUTO))
       assert_that(result, equal_to([b'foo', b'bar'] * 3))
Example #26
 def test_sink_transform_multiple_row_group(self):
     with TemporaryDirectory() as tmp_dirname:
         path = os.path.join(tmp_dirname, "tmp_filename")
         with TestPipeline() as p:
             # writing 623200 bytes of data
             _ = p \
             | Create(self.RECORDS * 4000) \
             | WriteToParquet(
                 path, self.SCHEMA, num_shards=1, codec='none',
                 shard_name_template='', row_group_buffer_size=250000)
         self.assertEqual(pq.read_metadata(path).num_row_groups, 3)
Example #27
 def test_sink_transform_int96(self):
     with tempfile.NamedTemporaryFile() as dst:
         path = dst.name
         # pylint: disable=c-extension-no-member
         with self.assertRaises(pl.ArrowInvalid):
             # Should throw an error "ArrowInvalid: Casting from timestamp[ns] to
             # timestamp[us] would lose data"
             with TestPipeline() as p:
                 _ = p \
                 | Create(self.RECORDS) \
                 | WriteToParquet(
                     path, self.SCHEMA96, num_shards=1, shard_name_template='')
Example #28
    def _generate_data(self, p, output_prefix, init_size, data_size):
        init_data = list(range(init_size))

        lines = (p
                 | 'create' >> Create(init_data)
                 | 'produce' >> ParDo(ProducerFn(data_size)))

        schema = pa.schema([('name', pa.binary()), ('number', pa.int64())])

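        # Write the records as Snappy-compressed Parquet files and return the
        # resulting PCollection from the write.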
        files = lines | 'write' >> WriteToParquet(
            output_prefix, schema, codec='snappy', file_name_suffix='.parquet')

        return files
Example #29
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    p = Pipeline(options=options)
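    # One element per shard; each fans out into NUM_ELEMENTS_PER_SHARD
    # zero-filled payloads of ELEMENT_BYTES bytes, all keyed by '' for BigBagDoFn.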
    (p
     | Create(list(range(NUM_SHARDS)))
     | FlatMap(lambda _:
               (bytes(ELEMENT_BYTES) for _ in range(NUM_ELEMENTS_PER_SHARD)))
     | WithKeys('')
     | ParDo(BigBagDoFn()))

    p.run()
Example #30
    def expand(self, pcoll):
        """

        :return: PCollection[kind_name]
        """
        from google.cloud import datastore
        from apache_beam import Create
        import logging
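        # Datastore exposes kind names through the reserved __kind__ kind;
        # a keys-only query returns one key per kind.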
        query = datastore.Client(self.project_id).query(kind='__kind__')
        query.keys_only()
        kinds = [entity.key.id_or_name for entity in query.fetch()]
        logging.info("kinds: {}".format(kinds))
        return pcoll.pipeline | 'Kind' >> Create(kinds)