Example 1
 def test_read_do_write_with_start_bundle(self):
   input_path = self.create_temp_file('01234567890123456789\n0123456789')
   output_path = '%s.out' % input_path
   finish_path = '%s.finish' % input_path
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           fileio.TextFileSource(file_path=input_path,
                                 start_offset=0,
                                 end_offset=15,
                                 strip_trailing_newlines=True,
                                 coder=coders.StrUtf8Coder()),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
           DoFnUsingStartBundle(finish_path)),
                          output_tags=['out'],
                          output_coders=[self.OUTPUT_CODER],
                          input=(0, 0),
                          side_inputs=None),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(1, 0),
           output_coders=(coders.ToStringCoder(),))
   ]))
   with open(output_path) as f:
     self.assertEqual('XYZ: 01234567890123456789\n', f.read())
   # Check that the finish_bundle method of the custom DoFn object left the
   # expected side effect by writing a file with specific content.
   with open(finish_path) as f:
     self.assertEqual('finish called.', f.read())
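
The DoFnUsingStartBundle class referenced above is defined elsewhere in the test module. Based on the assertions in this test (the 'XYZ: ' prefix on the output and the 'finish called.' marker file), a minimal sketch might look like the following; the body, and the assumption that the SDK's DoFn base class is reachable as df.DoFn, are illustrative and not taken from the original source:

class DoFnUsingStartBundle(df.DoFn):  # hypothetical sketch, not the original definition
  def __init__(self, finish_path):
    self.finish_path = finish_path

  def start_bundle(self, context):
    # Per-bundle setup; the prefix matches the 'XYZ: ' output asserted above.
    self.prefix = 'XYZ: '

  def process(self, context):
    yield self.prefix + context.element

  def finish_bundle(self, context):
    # Side effect that the test checks for after execution.
    with open(self.finish_path, 'w') as f:
      f.write('finish called.')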
Example 2
 def test_create_do_write(self):
   output_path = self.create_temp_file('n/a')
   elements = ['abc', 'def', 'ghi']
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           inmemory.InMemorySource(
               elements=[pickler.dumps(e) for e in elements],
               # Start at the last element.
               start_index=2,
               # Go beyond the end to test that case is handled.
               end_index=15),
           output_coders=[coders.ToStringCoder()]),
       maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                          output_tags=['out'],
                          output_coders=[self.OUTPUT_CODER],
                          input=(0, 0),
                          side_inputs=None),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(1, 0),
           output_coders=(coders.ToStringCoder(),))
   ]))
   with open(output_path) as f:
     self.assertEqual('XYZ: ghi\n', f.read())
Example 3
 def test_shuffle_read_do_write(self):
   output_path = self.create_temp_file('n/a')
   work_spec = [
       maptask.WorkerGroupingShuffleRead(shuffle_reader_config='none',
                                         start_shuffle_position='aaa',
                                         end_shuffle_position='zzz',
                                         coder=self.SHUFFLE_CODER,
                                         output_coders=[self.SHUFFLE_CODER]),
       maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(
               lambda (k, vs): [str((k, v)) for v in vs])),
                          output_tags=['out'],
                          output_coders=[self.OUTPUT_CODER],
                          input=(0, 0),
                          side_inputs=None),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(1, 0),
           output_coders=(coders.ToStringCoder(),))
   ]
   shuffle_source_mock = mock.MagicMock()
   shuffle_source_mock.reader().__enter__().__iter__.return_value = [
       (10, [1, 2]), (20, [3])]
   executor.MapTaskExecutor().execute(
       make_map_task(work_spec),
       test_shuffle_source=shuffle_source_mock)
   with open(output_path) as f:
     self.assertEqual('(10, 1)\n(10, 2)\n(20, 3)\n', f.read())
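
Note that the lambda (k, vs): ... form above relies on Python 2 tuple-parameter unpacking, which was removed in Python 3 (PEP 3113); an equivalent callable that unpacks the pair explicitly would be:

ptransform.CallableWrapperDoFn(
    lambda kv: [str((kv[0], v)) for v in kv[1]])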
Example 4
 def test_read_do_write(self):
   input_path = self.create_temp_file('01234567890123456789\n0123456789')
   output_path = '%s.out' % input_path
   executor.MapTaskExecutor().execute(make_map_task([
       maptask.WorkerRead(
           fileio.TextFileSource(file_path=input_path,
                                 start_offset=0,
                                 end_offset=15,
                                 strip_trailing_newlines=True,
                                 coder=coders.StrUtf8Coder()),
           output_coders=[self.OUTPUT_CODER]),
       maptask.WorkerDoFn(serialized_fn=pickle_with_side_inputs(
           ptransform.CallableWrapperDoFn(lambda x: ['XYZ: %s' % x])),
                          output_tags=['out'],
                          output_coders=[self.OUTPUT_CODER],
                          input=(0, 0),
                          side_inputs=None),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(1, 0),
           output_coders=(coders.ToStringCoder(),))
   ]))
   with open(output_path) as f:
     self.assertEqual('XYZ: 01234567890123456789\n', f.read())
Example 5
def make_text_sink(output_path, input, coder=coders.ToStringCoder()):
    return maptask.WorkerWrite(fileio.NativeTextFileSink(
        file_path_prefix=output_path,
        append_trailing_newlines=True,
        coder=coder),
                               input=input,
                               output_coders=(coder, ))
Example 6
    def test_file_sink_writing(self):
        temp_path = tempfile.NamedTemporaryFile().name
        sink = MyFileSink(temp_path,
                          file_name_suffix='.foo',
                          coder=coders.ToStringCoder())

        # Manually invoke the generic Sink API.
        init_token = sink.initialize_write()

        writer1 = sink.open_writer(init_token, '1')
        writer1.write('a')
        writer1.write('b')
        res1 = writer1.close()

        writer2 = sink.open_writer(init_token, '2')
        writer2.write('x')
        writer2.write('y')
        writer2.write('z')
        res2 = writer2.close()

        res = list(sink.finalize_write(init_token, [res1, res2]))
        # Retry the finalize operation (as if the first attempt was lost).
        res = list(sink.finalize_write(init_token, [res1, res2]))

        # Check the results.
        shard1 = temp_path + '-00000-of-00002.foo'
        shard2 = temp_path + '-00001-of-00002.foo'
        self.assertEqual(res, [shard1, shard2])
        self.assertEqual(open(shard1).read(), '[start][a][b][end]')
        self.assertEqual(open(shard2).read(), '[start][x][y][z][end]')

        # Check that any temp files are deleted.
        self.assertItemsEqual([shard1, shard2], glob.glob(temp_path + '*'))
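
The MyFileSink class used in this and the following tests is defined elsewhere in the test module. Judging from the '[start][a][b][end]' style assertions, a rough sketch over the file-based sink hooks could look like this; the base class and hook names (open, write_encoded_record, close) are assumptions about this SDK version, not copied from the original:

class MyFileSink(fileio.FileSink):  # hypothetical sketch for illustration
    def open(self, temp_path):
        file_handle = super(MyFileSink, self).open(temp_path)
        file_handle.write('[start]')
        return file_handle

    def write_encoded_record(self, file_handle, encoded_value):
        # Wrap every encoded element in brackets, e.g. '[a]'.
        file_handle.write('[%s]' % encoded_value)

    def close(self, file_handle):
        file_handle.write('[end]')
        super(MyFileSink, self).close(file_handle)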
Example 7
    def test_file_sink_multi_shards(self):
        temp_path = tempfile.NamedTemporaryFile().name
        sink = MyFileSink(temp_path,
                          file_name_suffix='.foo',
                          coder=coders.ToStringCoder())

        # Manually invoke the generic Sink API.
        init_token = sink.initialize_write()

        num_shards = 1000
        writer_results = []
        for i in range(num_shards):
            uuid = 'uuid-%05d' % i
            writer = sink.open_writer(init_token, uuid)
            writer.write('a')
            writer.write('b')
            writer.write(uuid)
            writer_results.append(writer.close())

        res_first = list(sink.finalize_write(init_token, writer_results))
        # Retry the finalize operation (as if the first attempt was lost).
        res_second = list(sink.finalize_write(init_token, writer_results))

        self.assertItemsEqual(res_first, res_second)

        res = sorted(res_second)
        for i in range(num_shards):
            shard_name = '%s-%05d-of-%05d.foo' % (temp_path, i, num_shards)
            uuid = 'uuid-%05d' % i
            self.assertEqual(res[i], shard_name)
            self.assertEqual(
                open(shard_name).read(), ('[start][a][b][%s][end]' % uuid))

        # Check that any temp files are deleted.
        self.assertItemsEqual(res, glob.glob(temp_path + '*'))
Example 8
    def __init__(self,
                 file_path_prefix,
                 append_trailing_newlines=True,
                 file_name_suffix='',
                 num_shards=0,
                 shard_name_template=None,
                 validate=True,
                 coder=coders.ToStringCoder()):
        # We initialize a file_path attribute containing just the prefix part for
        # local runner environment. For now, sharding is not supported in the local
        # runner and sharding options (template, num, suffix) are ignored.
        # The attribute is also used in the worker environment when we just write
        # to a specific file.
        # TODO(silviuc): Add support for file sharding in the local runner.
        self.file_path = file_path_prefix
        self.append_trailing_newlines = append_trailing_newlines
        self.coder = coder

        self.is_gcs_sink = self.file_path.startswith('gs://')

        self.file_name_prefix = file_path_prefix
        self.file_name_suffix = file_name_suffix
        self.num_shards = num_shards
        # TODO(silviuc): Update this when the service supports more patterns.
        self.shard_name_template = ('-SSSSS-of-NNNNN'
                                    if shard_name_template is None else
                                    shard_name_template)
        # TODO(silviuc): Implement sink validation.
        self.validate = validate
Example 9
 def test_empty_write(self):
     temp_path = tempfile.NamedTemporaryFile().name
     sink = MyFileSink(temp_path,
                       file_name_suffix='.foo',
                       coder=coders.ToStringCoder())
     p = df.Pipeline('DirectPipelineRunner')
     p | df.Create([]) | df.io.Write(sink)  # pylint: disable=expression-not-assigned
     p.run()
     self.assertEqual(
         open(temp_path + '-00000-of-00001.foo').read(), '[start][end]')
Example 10
 def __init__(self,
              file_path_prefix,
              file_name_suffix='',
              coder=coders.ToStringCoder(),
              append_trailing_newlines=True):
   super(PureTextFileSink, self).__init__(file_path_prefix,
                                          file_name_suffix=file_name_suffix,
                                          coder=coder,
                                          mime_type='text/plain')
   self.append_trailing_newlines = append_trailing_newlines
Example 11
 def test_ungrouped_shuffle_read_and_write(self):
   output_path = self.create_temp_file('n/a')
   work_spec = [
       maptask.WorkerUngroupedShuffleRead(shuffle_reader_config='none',
                                          start_shuffle_position='aaa',
                                          end_shuffle_position='zzz',
                                          coder=self.SHUFFLE_CODER,
                                          output_coders=[self.SHUFFLE_CODER]),
       maptask.WorkerWrite(
           fileio.TextFileSink(file_path_prefix=output_path,
                               append_trailing_newlines=True,
                               coder=coders.ToStringCoder()),
           input=(0, 0),
           output_coders=(coders.ToStringCoder(),))
   ]
   shuffle_source_mock = mock.MagicMock()
   shuffle_source_mock.reader().__enter__().__iter__.return_value = [1, 2, 3]
   executor.MapTaskExecutor().execute(
       make_map_task(work_spec),
       test_shuffle_source=shuffle_source_mock)
   with open(output_path) as f:
     self.assertEqual('1\n2\n3\n', f.read())
Example 12
    def test_fixed_shard_write(self):
        temp_path = tempfile.NamedTemporaryFile().name
        sink = MyFileSink(temp_path,
                          file_name_suffix='.foo',
                          num_shards=3,
                          shard_name_template='_NN_SSS_',
                          coder=coders.ToStringCoder())
        p = df.Pipeline('DirectPipelineRunner')
        p | df.Create(['a', 'b']) | df.io.Write(sink)  # pylint: disable=expression-not-assigned

        p.run()

        concat = ''.join(
            open(temp_path + '_03_%03d_.foo' % shard_num).read()
            for shard_num in range(3))
        self.assertTrue('][a][' in concat, concat)
        self.assertTrue('][b][' in concat, concat)
Example 13
    def test_file_sink_io_error(self):
        temp_path = tempfile.NamedTemporaryFile().name
        sink = MyFileSink(temp_path,
                          file_name_suffix='.foo',
                          coder=coders.ToStringCoder())

        # Manually invoke the generic Sink API.
        init_token = sink.initialize_write()

        writer1 = sink.open_writer(init_token, '1')
        writer1.write('a')
        writer1.write('b')
        res1 = writer1.close()

        writer2 = sink.open_writer(init_token, '2')
        writer2.write('x')
        writer2.write('y')
        writer2.write('z')
        res2 = writer2.close()

        os.remove(res2)
        with self.assertRaises(IOError):
            list(sink.finalize_write(init_token, [res1, res2]))
Example 14
    def __init__(
        self,
        file_path_prefix,
        file_name_suffix='',
        append_trailing_newlines=True,
        num_shards=0,
        shard_name_template=None,
        coder=coders.ToStringCoder(),
        compression_type=CompressionTypes.NO_COMPRESSION,
    ):
        """Initialize a TextFileSink.

    Args:
      file_path_prefix: The file path to write to. The files written will begin
        with this prefix, followed by a shard identifier (see num_shards), and
        end in a common extension, if given by file_name_suffix. In most cases,
        only this argument is specified and num_shards, shard_name_template, and
        file_name_suffix use default values.
      file_name_suffix: Suffix for the files written.
      append_trailing_newlines: indicates whether this sink should write an
        additional newline char after writing each element.
      num_shards: The number of files (shards) used for output. If not set, the
        service will decide on the optimal number of shards.
        Constraining the number of shards is likely to reduce
        the performance of a pipeline.  Setting this value is not recommended
        unless you require a specific number of output files.
      shard_name_template: A template string containing placeholders for
        the shard number and shard count. Currently only '' and
        '-SSSSS-of-NNNNN' are patterns accepted by the service.
        When constructing a filename for a particular shard number, the
        upper-case letters 'S' and 'N' are replaced with the 0-padded shard
        number and shard count respectively.  This argument can be '' in which
        case it behaves as if num_shards was set to 1 and only one file will be
        generated. The default pattern used is '-SSSSS-of-NNNNN'.
      coder: Coder used to encode each line.
      compression_type: Type of compression to use for this sink.

    Raises:
      TypeError: if file path parameters are not strings or if compression_type
        is not a member of CompressionTypes.
      ValueError: if shard_name_template is not of expected format.

    Returns:
      A TextFileSink object usable for writing.
    """
        if not isinstance(file_path_prefix, basestring):
            raise TypeError(
                'TextFileSink: file_path_prefix must be a string; got %r instead'
                % file_path_prefix)
        if not isinstance(file_name_suffix, basestring):
            raise TypeError(
                'TextFileSink: file_name_suffix must be a string; got %r instead'
                % file_name_suffix)

        if not CompressionTypes.valid_compression_type(compression_type):
            raise TypeError(
                'compression_type must be CompressionType object but '
                'was %s' % type(compression_type))
        if compression_type == CompressionTypes.DEFLATE:
            mime_type = 'application/x-gzip'
        else:
            mime_type = 'text/plain'

        super(TextFileSink,
              self).__init__(file_path_prefix,
                             file_name_suffix=file_name_suffix,
                             num_shards=num_shards,
                             shard_name_template=shard_name_template,
                             coder=coder,
                             mime_type=mime_type)

        self.compression_type = compression_type
        self.append_trailing_newlines = append_trailing_newlines
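
Usage sketch: following the pipeline pattern from the earlier tests (df.Pipeline, df.Create, df.io.Write), a TextFileSink constructed with this initializer could be driven as below; the '/tmp/out' prefix and the element strings are made-up example values:

sink = TextFileSink('/tmp/out', file_name_suffix='.txt', num_shards=2)
p = df.Pipeline('DirectPipelineRunner')
p | df.Create(['line one', 'line two']) | df.io.Write(sink)  # pylint: disable=expression-not-assigned
p.run()
# With the default '-SSSSS-of-NNNNN' template the shards come out as
# /tmp/out-00000-of-00002.txt and /tmp/out-00001-of-00002.txt.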
Example 15
  def __init__(self, file_path_prefix,
               append_trailing_newlines=True,
               file_name_suffix='',
               num_shards=0,
               shard_name_template=None,
               validate=True,
               coder=coders.ToStringCoder()):
    """Initialize a TextSink.

    Args:
      file_path_prefix: The file path to write to. The files written will begin
        with this prefix, followed by a shard identifier (see num_shards), and
        end in a common extension, if given by file_name_suffix. In most cases,
        only this argument is specified and num_shards, shard_name_template, and
        file_name_suffix use default values.
      append_trailing_newlines: indicates whether this sink should write an
          additional newline char after writing each element.
      file_name_suffix: Suffix for the files written.
      num_shards: The number of files (shards) used for output. If not set, the
        service will decide on the optimal number of shards.
        Constraining the number of shards is likely to reduce
        the performance of a pipeline.  Setting this value is not recommended
        unless you require a specific number of output files.
      shard_name_template: A template string containing placeholders for
        the shard number and shard count. Currently only '' and
        '-SSSSS-of-NNNNN' are patterns accepted by the service.
        When constructing a filename for a particular shard number, the
        upper-case letters 'S' and 'N' are replaced with the 0-padded shard
        number and shard count respectively.  This argument can be '' in which
        case it behaves as if num_shards was set to 1 and only one file will be
        generated. The default pattern used is '-SSSSS-of-NNNNN'.
      validate: Enable path validation on pipeline creation.
      coder: Coder used to encode each line.

    Raises:
      TypeError: if file_path_prefix or file_name_suffix is not a string.
      ValueError: if shard_name_template is not of expected format.
    """
    if not isinstance(file_path_prefix, basestring):
      raise TypeError(
          '%s: file_path_prefix must be a string; got %r instead' %
          (self.__class__.__name__, file_path_prefix))
    if not isinstance(file_name_suffix, basestring):
      raise TypeError(
          '%s: file_name_suffix must be a string; got %r instead' %
          (self.__class__.__name__, file_name_suffix))

    # We initialize a file_path attribute containing just the prefix part for
    # local runner environment. For now, sharding is not supported in the local
    # runner and sharding options (template, num, suffix) are ignored.
    # The attribute is also used in the worker environment when we just write
    # to a specific file.
    # TODO(silviuc): Add support for file sharding in the local runner.
    self.file_path = file_path_prefix
    self.append_trailing_newlines = append_trailing_newlines
    self.coder = coder

    self.is_gcs_sink = self.file_path.startswith('gs://')

    self.file_name_prefix = file_path_prefix
    self.file_name_suffix = file_name_suffix
    self.num_shards = num_shards
    # TODO(silviuc): Update this when the service supports more patterns.
    if shard_name_template not in (None, '', '-SSSSS-of-NNNNN'):
      raise ValueError(
          'The shard_name_template argument must be an empty string or the '
          'pattern -SSSSS-of-NNNNN instead of %s' % shard_name_template)
    self.shard_name_template = (
        shard_name_template if shard_name_template is not None
        else '-SSSSS-of-NNNNN')
    # TODO(silviuc): Implement sink validation.
    self.validate = validate
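
To make the '-SSSSS-of-NNNNN' template described in the docstring concrete, here is a small standalone sketch of the substitution it describes (illustration only, not the SDK's internal code):

import re

def expand_shard_template(template, shard_number, shard_count):
  # Replace the run of 'S' placeholders with the 0-padded shard number and
  # the run of 'N' placeholders with the 0-padded shard count; the padding
  # width is the length of the placeholder run.
  template = re.sub(
      r'S+', lambda m: str(shard_number).zfill(len(m.group())), template)
  return re.sub(
      r'N+', lambda m: str(shard_count).zfill(len(m.group())), template)

# '-SSSSS-of-NNNNN' for shard 0 of 2 expands to '-00000-of-00002'.
assert expand_shard_template('-SSSSS-of-NNNNN', 0, 2) == '-00000-of-00002'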
Example 16
def TextFileSink(file_path_prefix,     # pylint: disable=invalid-name
                 append_trailing_newlines=True,
                 file_name_suffix='',
                 num_shards=0,
                 shard_name_template=None,
                 validate=True,
                 coder=coders.ToStringCoder()):
  """Initialize a TextSink.

  Args:
    file_path_prefix: The file path to write to. The files written will begin
      with this prefix, followed by a shard identifier (see num_shards), and
      end in a common extension, if given by file_name_suffix. In most cases,
      only this argument is specified and num_shards, shard_name_template, and
      file_name_suffix use default values.
    append_trailing_newlines: indicates whether this sink should write an
        additional newline char after writing each element.
    file_name_suffix: Suffix for the files written.
    num_shards: The number of files (shards) used for output. If not set, the
      service will decide on the optimal number of shards.
      Constraining the number of shards is likely to reduce
      the performance of a pipeline.  Setting this value is not recommended
      unless you require a specific number of output files.
    shard_name_template: A template string containing placeholders for
      the shard number and shard count. Currently only '' and
      '-SSSSS-of-NNNNN' are patterns accepted by the service.
      When constructing a filename for a particular shard number, the
      upper-case letters 'S' and 'N' are replaced with the 0-padded shard
      number and shard count respectively.  This argument can be '' in which
      case it behaves as if num_shards was set to 1 and only one file will be
      generated. The default pattern used is '-SSSSS-of-NNNNN'.
    validate: Enable path validation on pipeline creation.
    coder: Coder used to encode each line.

  Raises:
    TypeError: if file_path_prefix or file_name_suffix is not a string.
    ValueError: if shard_name_template is not of expected format.

  Returns:
    A TextFileSink object usable for writing.
  """
  if not isinstance(file_path_prefix, basestring):
    raise TypeError(
        'TextFileSink: file_path_prefix must be a string; got %r instead' %
        file_path_prefix)
  if not isinstance(file_name_suffix, basestring):
    raise TypeError(
        'TextFileSink: file_name_suffix must be a string; got %r instead' %
        file_name_suffix)
  if shard_name_template not in (None, '', '-SSSSS-of-NNNNN'):
    raise ValueError(
        'The shard_name_template argument must be an empty string or the '
        'pattern -SSSSS-of-NNNNN instead of %s' % shard_name_template)
  if shard_name_template == '':  # pylint: disable=g-explicit-bool-comparison
    num_shards = 1

  if num_shards:
    return NativeTextFileSink(file_path_prefix,
                              append_trailing_newlines=append_trailing_newlines,
                              file_name_suffix=file_name_suffix,
                              num_shards=num_shards,
                              shard_name_template=shard_name_template,
                              validate=validate,
                              coder=coder)
  else:
    return PureTextFileSink(file_path_prefix,
                            append_trailing_newlines=append_trailing_newlines,
                            file_name_suffix=file_name_suffix,
                            coder=coder)
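
For reference, given the dispatch above, these calls (with a made-up '/tmp/out' prefix) would resolve as follows:

TextFileSink('/tmp/out', num_shards=3)            # explicit sharding -> NativeTextFileSink
TextFileSink('/tmp/out', shard_name_template='')  # '' implies one shard -> NativeTextFileSink
TextFileSink('/tmp/out')                          # default -> PureTextFileSink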