def __init__(self,
             file_pattern,
             min_bundle_size=0,
             compression_type=CompressionTypes.AUTO,
             splittable=True,
             validate=True):
  """Initializes :class:`FileBasedSource`.

  Args:
    file_pattern (str): the file glob to read a string or a
      :class:`~apache_beam.options.value_provider.ValueProvider`
      (placeholder to inject a runtime value).
    min_bundle_size (int): minimum size of bundles that should be generated
      when performing initial splitting on this source.
    compression_type (str): Used to handle compressed output files.
      Typical value is :attr:`CompressionTypes.AUTO
      <apache_beam.io.filesystem.CompressionTypes.AUTO>`, in which case the
      final file path's extension will be used to detect the compression.
    splittable (bool): whether :class:`FileBasedSource` should try to
      logically split a single file into data ranges so that different
      parts of the same file can be read in parallel. If set to
      :data:`False`, :class:`FileBasedSource` will prevent both initial and
      dynamic splitting of sources for single files. File patterns that
      represent multiple files may still get split into sources for
      individual files. Even if set to :data:`True` by the user,
      :class:`FileBasedSource` may choose to not split the file, for
      example, for compressed files where currently it is not possible to
      efficiently read a data range without decompressing the whole file.
    validate (bool): Boolean flag to verify that the files exist during the
      pipeline creation time.

  Raises:
    ~exceptions.TypeError: when **compression_type** is not valid or if
      **file_pattern** is not a :class:`str` or a
      :class:`~apache_beam.options.value_provider.ValueProvider`.
    ~exceptions.ValueError: when compression and splittable files are
      specified.
    ~exceptions.IOError: when the file pattern specified yields an empty
      result.
  """
  if not isinstance(file_pattern, (basestring, ValueProvider)):
    raise TypeError('%s: file_pattern must be of type string'
                    ' or ValueProvider; got %r instead'
                    % (self.__class__.__name__, file_pattern))

  if isinstance(file_pattern, basestring):
    file_pattern = StaticValueProvider(str, file_pattern)
  self._pattern = file_pattern

  self._concat_source = None
  self._min_bundle_size = min_bundle_size
  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError('compression_type must be CompressionType object but '
                    'was %s' % type(compression_type))
  self._compression_type = compression_type
  if compression_type in (CompressionTypes.UNCOMPRESSED,
                          CompressionTypes.AUTO):
    self._splittable = splittable
  else:
    # Compressed files cannot be read efficiently from an arbitrary offset
    # without decompressing everything before it, so splitting is forced
    # off regardless of the user's request.
    self._splittable = False
  if validate and file_pattern.is_accessible():
    self._validate()
def __init__(self,
             file_pattern,
             min_bundle_size=0,
             compression_type=CompressionTypes.AUTO,
             splittable=True,
             validate=True):
  """Initializes ``FileBasedSource``.

  Args:
    file_pattern: the file glob to read a string or a ValueProvider
      (placeholder to inject a runtime value).
    min_bundle_size: minimum size of bundles that should be generated when
      performing initial splitting on this source.
    compression_type: compression type to use
    splittable: whether FileBasedSource should try to logically split a
      single file into data ranges so that different parts of the same file
      can be read in parallel. If set to False, FileBasedSource will
      prevent both initial and dynamic splitting of sources for single
      files. File patterns that represent multiple files may still get
      split into sources for individual files. Even if set to True by the
      user, FileBasedSource may choose to not split the file, for example,
      for compressed files where currently it is not possible to
      efficiently read a data range without decompressing the whole file.
    validate: Boolean flag to verify that the files exist during the
      pipeline creation time.

  Raises:
    TypeError: when compression_type is not valid or if file_pattern is not
      a string or a ValueProvider.
    ValueError: when compression and splittable files are specified.
    IOError: when the file pattern specified yields an empty result.
  """
  if not isinstance(file_pattern, (basestring, ValueProvider)):
    raise TypeError('%s: file_pattern must be of type string'
                    ' or ValueProvider; got %r instead'
                    % (self.__class__.__name__, file_pattern))
  if isinstance(file_pattern, basestring):
    file_pattern = StaticValueProvider(str, file_pattern)
  self._pattern = file_pattern
  self._concat_source = None
  self._min_bundle_size = min_bundle_size

  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError(
        'compression_type must be CompressionType object but '
        'was %s' % type(compression_type))
  self._compression_type = compression_type

  # Reading a sub-range of a compressed file requires decompressing
  # everything before it, so splitting is honored only when the input is
  # (possibly) uncompressed.
  may_split = compression_type in (CompressionTypes.UNCOMPRESSED,
                                   CompressionTypes.AUTO)
  self._splittable = splittable if may_split else False

  if validate and file_pattern.is_accessible():
    self._validate()
def __init__(self,
             reader,
             file_patterns,
             min_bundle_size=0,
             compression_type=CompressionTypes.AUTO,
             splittable=True,
             validate=True):
  """Initializes the source with a reader and a set of file patterns.

  Args:
    reader: object used to read matched files; stored pickled so this
      source itself remains serializable.
    file_patterns: a list of file globs, or a ValueProvider producing one
      (a plain list is wrapped in a StaticValueProvider).
    min_bundle_size: minimum size of the bundles produced by initial
      splitting of this source.
    compression_type: how to handle compressed input files; with AUTO the
      compression is detected from each file path's extension.
    splittable: whether single files may be split into data ranges that can
      be read in parallel.
    validate: if True (and the patterns are already accessible at
      construction time), verify that the patterns match existing files.

  Raises:
    TypeError: when compression_type is not a valid CompressionType.
  """
  patterns = (file_patterns if isinstance(file_patterns, ValueProvider)
              else StaticValueProvider(list, file_patterns))
  self._patterns = patterns
  self._pickle_reader = pickler.dumps(reader)
  # The live reader is recreated lazily from the pickled form on first use.
  self._reader = None
  self._concat_source = None
  self._min_bundle_size = min_bundle_size

  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError(
        'compression_type must be CompressionType object but '
        'was %s' % type(compression_type))
  self._compression_type = compression_type
  self._splittable = splittable

  if validate and patterns.is_accessible():
    self._validate()