Example #1
  def __init__(self,
               file_pattern,
               min_bundle_size=0,
               compression_type=CompressionTypes.AUTO,
               splittable=True,
               validate=True):
    """Initializes :class:`FileBasedSource`.

    Args:
      file_pattern (str): the file glob to read, a string or a
        :class:`~apache_beam.options.value_provider.ValueProvider`
        (placeholder to inject a runtime value).
      min_bundle_size (int): minimum size of bundles that should be generated
        when performing initial splitting on this source.
      compression_type (str): Used to handle compressed input files.
        Typical value is :attr:`CompressionTypes.AUTO
        <apache_beam.io.filesystem.CompressionTypes.AUTO>`,
        in which case the final file path's extension will be used to detect
        the compression.
      splittable (bool): whether :class:`FileBasedSource` should try to
        logically split a single file into data ranges so that different parts
        of the same file can be read in parallel. If set to :data:`False`,
        :class:`FileBasedSource` will prevent both initial and dynamic splitting
        of sources for single files. File patterns that represent multiple files
        may still get split into sources for individual files. Even if set to
        :data:`True` by the user, :class:`FileBasedSource` may choose to not
        split the file, for example, for compressed files where currently it is
        not possible to efficiently read a data range without decompressing the
        whole file.
      validate (bool): Boolean flag to verify that the files exist at
        pipeline creation time.

    Raises:
      ~exceptions.TypeError: when **compression_type** is not valid or if
        **file_pattern** is not a :class:`str` or a
        :class:`~apache_beam.options.value_provider.ValueProvider`.
      ~exceptions.ValueError: when a compression type is specified for
        splittable files.
      ~exceptions.IOError: when the file pattern specified yields an empty
        result.
    """

    if not isinstance(file_pattern, (basestring, ValueProvider)):
      raise TypeError('%s: file_pattern must be of type string'
                      ' or ValueProvider; got %r instead'
                      % (self.__class__.__name__, file_pattern))

    if isinstance(file_pattern, basestring):
      file_pattern = StaticValueProvider(str, file_pattern)
    self._pattern = file_pattern

    self._concat_source = None
    self._min_bundle_size = min_bundle_size
    if not CompressionTypes.is_valid_compression_type(compression_type):
      raise TypeError('compression_type must be CompressionType object but '
                      'was %s' % type(compression_type))
    self._compression_type = compression_type
    self._splittable = splittable
    if validate and file_pattern.is_accessible():
      self._validate()
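
A subclass supplies the record-reading logic that this constructor only
configures. Below is a minimal usage sketch, assuming Python 3 and a recent
apache_beam release; LineSource and the input path are illustrative, not part
of Beam. Passing splittable=False keeps each file in one bundle, so
read_records can stream the file from the start without negotiating split
points with the range tracker.

import apache_beam as beam
from apache_beam.io import filebasedsource


class LineSource(filebasedsource.FileBasedSource):
  """Illustrative FileBasedSource subclass: emits each line of each file."""

  def __init__(self, file_pattern):
    # splittable=False: one bundle per matched file (see the docstring above
    # for what splitting would otherwise do).
    super().__init__(file_pattern, splittable=False)

  def read_records(self, file_name, offset_range_tracker):
    # open_file() honors the compression_type given to __init__; with the
    # default CompressionTypes.AUTO the file extension decides.
    with self.open_file(file_name) as f:
      for line in f:
        yield line.rstrip(b'\n')


with beam.Pipeline() as p:
  _ = (p
       | beam.io.Read(LineSource('/tmp/input*.txt'))  # illustrative path
       | beam.Map(print))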
Example #2
    def __init__(self,
                 file_pattern,
                 min_bundle_size=0,
                 compression_type=CompressionTypes.AUTO,
                 splittable=True,
                 validate=True):
        """Initializes ``FileBasedSource``.

    Args:
      file_pattern: the file glob to read a string or a ValueProvider
                    (placeholder to inject a runtime value).
      min_bundle_size: minimum size of bundles that should be generated when
                       performing initial splitting on this source.
      compression_type: compression type to use
      splittable: whether FileBasedSource should try to logically split a single
                  file into data ranges so that different parts of the same file
                  can be read in parallel. If set to False, FileBasedSource will
                  prevent both initial and dynamic splitting of sources for
                  single files. File patterns that represent multiple files may
                  still get split into sources for individual files. Even if set
                  to True by the user, FileBasedSource may choose to not split
                  the file, for example, for compressed files where currently
                  it is not possible to efficiently read a data range without
                  decompressing the whole file.
      validate: Boolean flag to verify that the files exist during the pipeline
                creation time.
    Raises:
      TypeError: when compression_type is not valid or if file_pattern is not a
                 string or a ValueProvider.
      ValueError: when compression and splittable files are specified.
      IOError: when the file pattern specified yields an empty result.
    """

        if not isinstance(file_pattern, (basestring, ValueProvider)):
            raise TypeError('%s: file_pattern must be of type string'
                            ' or ValueProvider; got %r instead' %
                            (self.__class__.__name__, file_pattern))

        if isinstance(file_pattern, basestring):
            file_pattern = StaticValueProvider(str, file_pattern)
        self._pattern = file_pattern

        self._concat_source = None
        self._min_bundle_size = min_bundle_size
        if not CompressionTypes.is_valid_compression_type(compression_type):
            raise TypeError(
                'compression_type must be CompressionType object but '
                'was %s' % type(compression_type))
        self._compression_type = compression_type
        if compression_type in (CompressionTypes.UNCOMPRESSED,
                                CompressionTypes.AUTO):
            self._splittable = splittable
        else:
            # We can't split compressed files efficiently so turn off splitting.
            self._splittable = False
        if validate and file_pattern.is_accessible():
            self._validate()
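
The only substantive change from Example #1 is the guard that forces
_splittable to False whenever an explicit compression type is given. Here is
a minimal standalone restatement of that decision, assuming a recent
apache_beam release; effective_splittable is an illustrative helper, not
Beam API.

from apache_beam.io.filesystem import CompressionTypes


def effective_splittable(requested, compression_type):
    # Mirrors the guard above: only uncompressed input (or AUTO, where the
    # file extension decides later) may honor the caller's splittable flag.
    if compression_type in (CompressionTypes.UNCOMPRESSED,
                            CompressionTypes.AUTO):
        return requested
    # A compressed file cannot be read from an arbitrary offset without
    # decompressing everything before it, so splitting is disabled.
    return False


assert effective_splittable(True, CompressionTypes.GZIP) is False
assert effective_splittable(True, CompressionTypes.UNCOMPRESSED) is True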
Example #3
def __init__(self,
             reader,
             file_patterns,
             min_bundle_size=0,
             compression_type=CompressionTypes.AUTO,
             splittable=True,
             validate=True):
    if not isinstance(file_patterns, ValueProvider):
        file_patterns = StaticValueProvider(list, file_patterns)
    self._patterns = file_patterns
    # Serialize the reader now so this source object stays picklable; it is
    # rebuilt from _pickle_reader on first use (hence _reader starts as None).
    self._pickle_reader = pickler.dumps(reader)
    self._reader = None
    self._concat_source = None
    self._min_bundle_size = min_bundle_size
    if not CompressionTypes.is_valid_compression_type(compression_type):
        raise TypeError(
            'compression_type must be CompressionType object but '
            'was %s' % type(compression_type))
    self._compression_type = compression_type
    self._splittable = splittable
    if validate and file_patterns.is_accessible():
        self._validate()
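
Example #3 accepts an externally supplied reader and a list of patterns; the
reader is pickled eagerly so the source object itself stays serializable and
is rebuilt on the workers before the first read. A minimal sketch of that
round trip, assuming apache_beam's internal dill-based pickler; make_reader
is an illustrative stand-in for the reader argument.

from apache_beam.internal import pickler


def make_reader(file_name):
    # Illustrative reader factory standing in for the real reader argument.
    return open(file_name, 'rb')


# What __init__ does at construction time ...
pickled = pickler.dumps(make_reader)

# ... and what a worker would do before reading (cf. self._reader = None).
restored = pickler.loads(pickled)
with restored(__file__) as f:
    assert f.read(1)  # the rebuilt reader opens files just like the original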