示例#1
0
    def match(self, url_patterns, limits=None):
        """Find the files matching each URL pattern.

        Args:
          url_patterns: list of URL patterns to look up.
          limits: optional list holding one maximum result count per pattern.
            When omitted, no pattern is limited.

        Returns:
          List of ``MatchResult`` objects, in the order of ``url_patterns``.

        Raises:
          ``BeamIOError``: if the two lists differ in length, or if any
            lookup fails (the per-pattern exceptions are attached).
        """
        if limits is None:
            limits = [None] * len(url_patterns)

        if len(url_patterns) != len(limits):
            raise BeamIOError(
                'Patterns and limits should be equal in length: %d != %d' %
                (len(url_patterns), len(limits)))

        # TODO(udim): Update client to allow batched results.
        def _match(path_pattern, limit):
            """Build the MatchResult for one already-parsed pattern."""
            listing = self._hdfs_client.ls(path_pattern, detail=True)
            metadata_list = []
            # A limit of None slices the whole listing.
            for entry in listing[:limit]:
                metadata_list.append(
                    FileMetadata(entry['name'], entry['size']))
            return MatchResult(path_pattern, metadata_list)

        failures = {}
        matches = []
        for url_pattern, limit in zip(url_patterns, limits):
            try:
                matches.append(_match(self._parse_url(url_pattern), limit))
            except Exception as e:  # pylint: disable=broad-except
                # Keep going so every failing pattern is reported at once.
                failures[url_pattern] = e

        if failures:
            raise BeamIOError('Match operation failed', failures)
        return matches
示例#2
0
  def match(self, url_patterns, limits=None):
    """Find the files matching each URL pattern.

    Args:
      url_patterns: list of URL patterns to look up.
      limits: optional list holding one maximum result count per pattern.
        When omitted, no pattern is limited.

    Returns:
      List of ``MatchResult`` objects, in the order of ``url_patterns``.

    Raises:
      ``BeamIOError``: if the two lists differ in length, or if any lookup
        fails (the per-pattern exceptions are attached).
    """
    if limits is None:
      limits = [None] * len(url_patterns)

    if len(url_patterns) != len(limits):
      raise BeamIOError(
          'Patterns and limits should be equal in length: %d != %d' % (
              len(url_patterns), len(limits)))

    def _match(path_pattern, limit):
      """Build the MatchResult for one already-parsed pattern."""
      status = self._hdfs_client.status(path_pattern, strict=False)
      if status and status[_FILE_STATUS_TYPE] == _FILE_STATUS_TYPE_FILE:
        # Wrap the single status in a pair so both branches can be read
        # through index 1 below.
        entries = [(status[_FILE_STATUS_PATH_SUFFIX], status)][:limit]
      else:
        entries = self._hdfs_client.list(path_pattern, status=True)[:limit]

      metadata_list = []
      for entry in entries:
        entry_status = entry[1]
        metadata_list.append(
            FileMetadata(entry_status[_FILE_STATUS_NAME],
                         entry_status[_FILE_STATUS_SIZE]))
      return MatchResult(path_pattern, metadata_list)

    failures = {}
    matches = []
    for url_pattern, limit in zip(url_patterns, limits):
      try:
        matches.append(_match(self._parse_url(url_pattern), limit))
      except Exception as e:  # pylint: disable=broad-except
        # Keep going so every failing pattern is reported at once.
        failures[url_pattern] = e

    if failures:
      raise BeamIOError('Match operation failed', failures)
    return matches
示例#3
0
    def copy(self, source_file_names, destination_file_names):
        """Copy files or directory trees from sources to destinations.

        It is an error if any file to copy already exists at the destination.

        Args:
          source_file_names: sequence of URLs (``len()`` is applied to it).
          destination_file_names: sequence of URLs, same length as the
            sources.

        Raises:
          ``BeamIOError``: if the inputs differ in length or if any copy
            fails (the per-pair exceptions are attached).
        """
        if len(source_file_names) != len(destination_file_names):
            raise BeamIOError(
                'source_file_names and destination_file_names should '
                'be equal in length: %d != %d' %
                (len(source_file_names), len(destination_file_names)))

        def _copy_file(source, destination):
            """Stream one file to its destination in fixed-size chunks."""
            with self._open(source) as f1:
                with self._create(destination) as f2:
                    while True:
                        buf = f1.read(_COPY_BUFFER_SIZE)
                        if not buf:
                            break
                        f2.write(buf)

        def _copy_path(source, destination):
            """Recursively copy the file tree from the source to the destination."""
            if self._hdfs_client.status(
                    source)[_FILE_STATUS_TYPE] != _FILE_STATUS_TYPE_DIRECTORY:
                _copy_file(source, destination)
                return

            for path, dirs, files in self._hdfs_client.walk(source):
                # Position of the directory being visited, relative to the
                # root of the copy; the root itself maps to ''.
                rel_path = posixpath.relpath(path, source)
                if rel_path == '.':
                    rel_path = ''

                for dir_name in dirs:
                    # Include rel_path so nested directories are mirrored at
                    # the same depth rather than directly under destination.
                    new_dir = self._join(destination, rel_path, dir_name)
                    if not self._exists(new_dir):
                        self._mkdirs(new_dir)

                for file_name in files:
                    _copy_file(self._join(path, file_name),
                               self._join(destination, rel_path, file_name))

        exceptions = {}
        for source, destination in zip(source_file_names,
                                       destination_file_names):
            try:
                rel_source = self._parse_url(source)
                rel_destination = self._parse_url(destination)
                _copy_path(rel_source, rel_destination)
            except Exception as e:  # pylint: disable=broad-except
                # Keep going so every failing pair is reported at once.
                exceptions[(source, destination)] = e

        if exceptions:
            raise BeamIOError('Copy operation failed', exceptions)
示例#4
0
    def rename(self, source_file_names, destination_file_names):
        """Rename each source URL to the destination URL at the same index.

        Args:
          source_file_names: iterable of source URLs.
          destination_file_names: iterable of destination URLs, one per
            source.

        Raises:
          ``BeamIOError``: if the inputs differ in length or if any rename
            fails (the per-pair exceptions are attached).
        """
        # Materialize first so plain iterators are still accepted, then
        # refuse mismatched inputs instead of letting zip() drop the tail.
        sources = list(source_file_names)
        destinations = list(destination_file_names)
        if len(sources) != len(destinations):
            raise BeamIOError(
                'source_file_names and destination_file_names should '
                'be equal in length: %d != %d' %
                (len(sources), len(destinations)))

        exceptions = {}
        for source, destination in zip(sources, destinations):
            try:
                rel_source = self._parse_url(source)
                rel_destination = self._parse_url(destination)
                # A falsy return from mv() is treated as a failed rename.
                if not self._hdfs_client.mv(rel_source, rel_destination):
                    raise BeamIOError('libhdfs error in renaming %s to %s' %
                                      (source, destination))
            except Exception as e:  # pylint: disable=broad-except
                # Keep going so every failing pair is reported at once.
                exceptions[(source, destination)] = e

        if exceptions:
            raise BeamIOError('Rename operation failed', exceptions)
示例#5
0
    def delete(self, paths):
        """Delete the files or directories at the given local paths.

        Directories are removed together with their contents.

        Args:
          paths: list of paths that give the file objects to be deleted

        Raises:
          ``BeamIOError`` if any of the delete operations fail
        """
        def _delete_path(path):
            """Remove one path, re-raising ``OSError`` as ``IOError``."""
            try:
                remover = shutil.rmtree if os.path.isdir(path) else os.remove
                remover(path)
            except OSError as err:
                raise IOError(err)

        failures = {}
        for target in paths:
            try:
                _delete_path(target)
            except Exception as e:  # pylint: disable=broad-except
                # Keep going so every failing path is reported at once.
                failures[target] = e

        if failures:
            raise BeamIOError("Delete operation failed", failures)
示例#6
0
    def _list(self, dir_or_prefix):
        """Yield metadata for the entries directly inside a location.

        Listing is non-recursive, for filesystems that support directories.

        Args:
          dir_or_prefix: (string) A directory or location prefix (for
            filesystems that don't have directories).

        Returns:
          Generator of ``FileMetadata`` objects. Nothing is yielded when the
          location does not exist.

        Raises:
          ``BeamIOError`` if listing fails, but not if no files were found.
        """
        if not self.exists(dir_or_prefix):
            return

        try:
            for entry_name in os.listdir(dir_or_prefix):
                entry_path = self.join(dir_or_prefix, entry_name)
                try:
                    yield FileMetadata(entry_path,
                                       os.path.getsize(entry_path))
                except OSError:
                    # Files may disappear, such as when listing /tmp.
                    continue
        except Exception as e:  # pylint: disable=broad-except
            raise BeamIOError("List operation failed", {dir_or_prefix: e})
示例#7
0
文件: filesystems.py 项目: tedyu/beam
 def get_filesystem(path):
     """Get the correct filesystem for the specified path.

     NOTE(review): the inner ``get_filesystem`` call presumably resolves to a
     module-level helper defined or imported elsewhere in the file; if this
     function is itself that module-level name, the call would recurse.
     Confirm against the full file.

     Args:
       path: path whose filesystem is requested.

     Raises:
       ``BeamIOError``: if the lookup raises any exception; the original
         exception is attached, keyed by ``path``.
     """
     try:
         return get_filesystem(path)
     except Exception as e:
         raise BeamIOError('Unable to get the Filesystem', {path: e})
示例#8
0
 def get_filesystem(path):
     """Instantiate the filesystem whose scheme matches ``path``.

     Args:
       path: path whose scheme selects the filesystem.

     Returns:
       An instance of the single matching ``FileSystem`` subclass.

     Raises:
       ValueError: if zero or more than one subclass matches the scheme.
       ``BeamIOError``: if any other exception occurs during the lookup.
     """
     try:
         wanted_scheme = FileSystems.get_scheme(path)
         candidates = [
             fs_class for fs_class in FileSystem.get_all_subclasses()
             if fs_class.scheme() == wanted_scheme
         ]
         if not candidates:
             raise ValueError('Unable to get the Filesystem for path %s' %
                              path)
         if len(candidates) > 1:
             raise ValueError('Found more than one filesystem for path %s' %
                              path)

         # Pipeline options could come either from the Pipeline itself (using
         # direct runner), or via RuntimeValueProvider (other runners).
         options = (FileSystems._pipeline_options
                    or RuntimeValueProvider.runtime_options)
         return candidates[0](pipeline_options=options)
     except ValueError:
         # Lookup errors raised above pass through unwrapped.
         raise
     except Exception as e:
         raise BeamIOError('Unable to get the Filesystem', {path: e})
示例#9
0
    def rename(self, source_file_names, destination_file_names):
        """Rename each local source path to the destination at the same index.

        Source and destination lists should be of the same size.

        Args:
          source_file_names: List of file paths that need to be moved
          destination_file_names: List of destination_file_names for the files

        Raises:
          ``BeamIOError`` if any of the rename operations fail
        """
        assert len(source_file_names) == len(destination_file_names), (
            "source_file_names and destination_file_names should "
            "be equal in length")

        def _rename_file(old_path, new_path):
            """Move one path, re-raising ``OSError`` as ``IOError``."""
            try:
                os.rename(old_path, new_path)
            except OSError as err:
                raise IOError(err)

        failures = {}
        pairs = zip(source_file_names, destination_file_names)
        for source, destination in pairs:
            try:
                _rename_file(source, destination)
            except Exception as e:  # pylint: disable=broad-except
                # Keep going so every failing pair is reported at once.
                failures[(source, destination)] = e

        if failures:
            raise BeamIOError("Rename operation failed", failures)
示例#10
0
    def delete(self, paths):
        """Delete the files or directories at the provided paths.

        Directories will be deleted recursively.

        Args:
          paths: list of paths that give the file objects to be deleted

        Raises:
          ``BeamIOError``: if deleting any path fails (the per-path
            exceptions are attached).
        """
        def _delete_path(path):
            """Delete everything matched by one path."""
            # A trailing slash is expanded with '*' before matching.
            pattern = path + '*' if path.endswith('/') else path
            matched = self.match([pattern])[0]
            targets = [m.path for m in matched.metadata_list]
            statuses = gcsio.GcsIO().delete_batch(targets)
            failures = [err for (_, err) in statuses if err is not None]
            if failures:
                # Only the first failure is surfaced for this path.
                raise failures[0]

        errors_by_path = {}
        for path in paths:
            try:
                _delete_path(path)
            except Exception as e:  # pylint: disable=broad-except
                errors_by_path[path] = e

        if errors_by_path:
            raise BeamIOError("Delete operation failed", errors_by_path)
示例#11
0
  def copy(self, source_file_names, destination_file_names):
    """Copy each source to the destination at the same index.

    Sources ending in '/' are copied as trees, everything else as a single
    object.

    Args:
      source_file_names: list of source file objects that needs to be copied
      destination_file_names: list of destination of the new object

    Raises:
      ``BeamIOError``: if any of the copy operations fail
    """
    assert len(source_file_names) == len(destination_file_names), (
        "source_file_names and destination_file_names should "
        "be equal in length")

    def _copy_path(source, destination):
      """Copy one source to one destination, which must be a GCS path."""
      if not destination.startswith(GCSFileSystem.GCS_PREFIX):
        raise ValueError('Destination %r must be GCS path.' % destination)
      client = gcsio.GcsIO()
      # Use copy_tree if the path ends with / as it is a directory
      if source.endswith('/'):
        client.copytree(source, destination)
      else:
        client.copy(source, destination)

    failures = {}
    pairs = zip(source_file_names, destination_file_names)
    for source, destination in pairs:
      try:
        _copy_path(source, destination)
      except Exception as e:  # pylint: disable=broad-except
        # Keep going so every failing pair is reported at once.
        failures[(source, destination)] = e

    if failures:
      raise BeamIOError("Copy operation failed", failures)
示例#12
0
 def get_filesystem(path):
     # type: (str) -> FileSystems
     """Instantiate the filesystem whose scheme matches ``path``.

     Args:
       path: path whose scheme selects the filesystem.

     Returns:
       An instance of the single matching ``FileSystem`` subclass.

     Raises:
       ValueError: if zero or more than one subclass matches the scheme.
       ``BeamIOError``: if any other exception occurs during the lookup.
     """
     try:
         wanted_scheme = FileSystems.get_scheme(path)
         candidates = [
             fs_class for fs_class in FileSystem.get_all_subclasses()
             if fs_class.scheme() == wanted_scheme
         ]
         if not candidates:
             raise ValueError(
                 'Unable to get filesystem from specified path, please use the '
                 'correct path or ensure the required dependency is installed, '
                 'e.g., pip install apache_beam[gcp]. Path specified: %s' %
                 path)
         if len(candidates) > 1:
             raise ValueError('Found more than one filesystem for path %s' %
                              path)

         # Pipeline options could come either from the Pipeline itself (using
         # direct runner), or via RuntimeValueProvider (other runners).
         options = (FileSystems._pipeline_options
                    or RuntimeValueProvider.runtime_options)
         return candidates[0](pipeline_options=options)
     except ValueError:
         # Lookup errors raised above pass through unwrapped.
         raise
     except Exception as e:
         raise BeamIOError('Unable to get the Filesystem', {path: e})
示例#13
0
    def __init__(self, hdfs_client, path):
        """Open ``path`` for writing, refusing to reuse an existing path.

        Args:
          hdfs_client: client object providing ``status`` and ``write``.
          path: path to create.

        Raises:
          ``BeamIOError``: if ``status`` reports that the path already exists.
        """
        self._hdfs_client = hdfs_client
        existing_status = hdfs_client.status(path, strict=False)
        if existing_status is not None:
            raise BeamIOError('Path already exists: %s' % path)

        # The write context is entered by hand and kept on the instance
        # alongside the handle it produced.
        write_context = hdfs_client.write(path)
        self._handle_context = write_context
        self._handle = write_context.__enter__()
示例#14
0
 def _list(self, url):
     """Yield ``FileMetadata`` for every entry listed under ``url``.

     Args:
       url: URL of the location to list.

     Raises:
       ``BeamIOError``: if parsing the URL or listing it fails.
     """
     try:
         parsed_path = self._parse_url(url)
         listing = self._hdfs_client.list(parsed_path, status=True)
         for entry in listing:
             # Each entry is indexed as (name, status).
             entry_name = entry[0]
             entry_status = entry[1]
             yield FileMetadata(
                 _HDFS_PREFIX + self._join(parsed_path, entry_name),
                 entry_status[_FILE_STATUS_LENGTH])
     except Exception as e:  # pylint: disable=broad-except
         raise BeamIOError('Match operation failed', {url: e})
示例#15
0
    def rename(self, source_file_names, destination_file_names):
        """Rename each source path to the destination at the same index.

        Source and destination lists should be of the same size.

        Args:
          source_file_names: List of file paths that need to be moved
          destination_file_names: List of destination_file_names for the files

        Raises:
          ``BeamIOError``: if the lists differ in length or if any of the
            rename operations fail
        """
        if len(source_file_names) != len(destination_file_names):
            raise BeamIOError(
                'Unable to rename unequal number of sources and destinations')

        src_dest_pairs = list(zip(source_file_names, destination_file_names))
        client = s3io.S3IO(options=self._options)
        outcomes = client.rename_files(src_dest_pairs)

        # Each outcome is (src, dest, error); a None error means success.
        exceptions = {}
        for src, dest, error in outcomes:
            if error is not None:
                exceptions[(src, dest)] = error

        if exceptions:
            raise BeamIOError("Rename operation failed", exceptions)
示例#16
0
    def delete(self, urls):
        """Recursively delete every URL in ``urls``.

        Args:
          urls: iterable of URLs to delete.

        Raises:
          ``BeamIOError``: if parsing or deleting any URL fails (the per-URL
            exceptions are attached).
        """
        failures = {}
        for url in urls:
            try:
                self._hdfs_client.delete(self._parse_url(url), recursive=True)
            except Exception as e:  # pylint: disable=broad-except
                # Keep going so every failing URL is reported at once.
                failures[url] = e

        if failures:
            raise BeamIOError("Delete operation failed", failures)
示例#17
0
  def process(self, file_pattern: str) -> List[filesystem.FileMetadata]:
    """Return the metadata of all files matching ``file_pattern``.

    Raises:
      ``BeamIOError``: if nothing matched and the configured empty-match
        treatment does not allow that for this pattern.
    """
    # TODO: Should we batch the lookups?
    match_result = filesystems.FileSystems.match([file_pattern])[0]

    if not match_result.metadata_list:
      empty_allowed = EmptyMatchTreatment.allow_empty_match(
          file_pattern, self._empty_match_treatment)
      if not empty_allowed:
        raise BeamIOError(
            'Empty match for pattern %s. Disallowed.' % file_pattern)

    return match_result.metadata_list
示例#18
0
    def exists(self, path):
        """Report whether ``path`` exists on the FileSystem.

        Args:
          path: string path that needs to be checked.

        Returns: boolean flag indicating if path exists

        Raises:
          ``BeamIOError``: if the underlying check raises.
        """
        try:
            client = s3io.S3IO(options=self._options)
            return client.exists(path)
        except Exception as e:  # pylint: disable=broad-except
            raise BeamIOError("exists() operation failed", {path: e})
示例#19
0
  def delete(self, paths):
    """Delete the files or directories at the provided paths.

    Directories will be deleted recursively.

    Args:
      paths: list of paths that give the file objects to be deleted

    Raises:
      ``BeamIOError``: if any path is reported with an error.
    """
    outcomes = s3io.S3IO().delete_paths(paths)

    # outcomes maps each path to its error; None means it was deleted.
    exceptions = {}
    for path, error in outcomes.items():
      if error is not None:
        exceptions[path] = error

    if exceptions:
      raise BeamIOError("Delete operation failed", exceptions)
示例#20
0
 def exists(self, path):
     # type: (str) -> bool
     """Report whether ``path`` exists on the FileSystem.

     Args:
       path: string path that needs to be checked.

     Returns: boolean flag indicating if path exists

     Raises:
       ``BeamIOError``: if the underlying check raises.
     """
     try:
         client = blobstorageio.BlobStorageIO()
         return client.exists(path)
     except Exception as e:  # pylint: disable=broad-except
         raise BeamIOError("exists() operation failed", {path: e})
示例#21
0
  def test_file_sink_rename_error(self, rename_mock):
    """finalize_write surfaces the description of a failed rename."""
    output_prefix = os.path.join(self._new_tempdir(), 'rename_error')
    sink = MyFileBasedSink(
        output_prefix,
        file_name_suffix='.output',
        coder=coders.ToStringCoder())
    init_token, writer_results = self._common_init(sink)
    pre_finalize_results = sink.pre_finalize(init_token, writer_results)

    error_str = 'mock rename error description'
    # Make the mocked rename fail with a known, searchable description.
    rename_mock.side_effect = BeamIOError(
        'mock rename error', {('src', 'dst'): error_str})

    with self.assertRaisesRegexp(Exception, error_str):
      # finalize_write's result is consumed with list() to drive it fully.
      list(
          sink.finalize_write(
              init_token, writer_results, pre_finalize_results))
示例#22
0
    def process(self, file_metadata):
        """Yield a ``ReadableFile`` for one matched path or metadata object.

        Args:
          file_metadata: either a path string, which is wrapped in a
            ``FileMetadata`` with size 0, or an object exposing ``path``.

        Yields:
          A ``ReadableFile`` for the input, unless it is a skipped directory.

        Raises:
          ``BeamIOError``: if the path ends with '/' and directories are not
            being skipped.
        """
        metadata = (filesystem.FileMetadata(file_metadata, 0) if isinstance(
            file_metadata, (str, unicode)) else file_metadata)

        if metadata.path.endswith('/'):
            if self._skip_directories:
                return
            # The trailing space keeps the two sentences from running
            # together when the adjacent literals are concatenated.
            raise BeamIOError(
                'Directories are not allowed in ReadMatches transform. '
                'Found %s.' % metadata.path)

        # TODO: Mime type? Other arguments? Maybe arguments passed in to transform?
        yield ReadableFile(metadata)
示例#23
0
    def _check_state_for_finalize_write(self, writer_results, num_shards):
        """Classify writer output files by what finalize_write must do.

        Returns:
          src_files, dst_files: Lists of files to rename. For each i,
            finalize_write should rename(src_files[i], dst_files[i]).
          delete_files: Src files to delete. These could be leftovers from an
            incomplete (non-atomic) rename operation.
          num_skipped: Tally of writer results files already renamed, such as
            from a previous run of finalize_write().

        Raises:
          ``BeamIOError``: if neither the source nor the destination of a
            shard exists.
        """
        if not writer_results:
            return [], [], [], 0

        def _matched_paths(glob):
            """Return the set of paths currently matching ``glob``."""
            return {
                file_metadata.path
                for mr in FileSystems.match([glob])
                for file_metadata in mr.metadata_list
            }

        # All writer results are looked up in the directory of the first one.
        src_dir = FileSystems.split(writer_results[0])[0]
        src_glob = FileSystems.join(src_dir, '*')
        dst_glob = self._get_final_name_glob(num_shards)
        existing_sources = _matched_paths(src_glob)
        existing_destinations = _matched_paths(dst_glob)

        src_files = []
        dst_files = []
        delete_files = []
        num_skipped = 0
        for shard_num, src in enumerate(writer_results):
            dst = self._get_final_name(shard_num, num_shards)
            src_exists = src in existing_sources
            dst_exists = dst in existing_destinations

            if not src_exists:
                if not dst_exists:
                    raise BeamIOError(
                        'src and dst files do not exist. src: %s, dst: %s' %
                        (src, dst))
                logging.debug('src: %s -> dst: %s already renamed, skipping',
                              src, dst)
                num_skipped += 1
                continue

            # From here on the source exists.
            if dst_exists and (FileSystems.checksum(src)
                               == FileSystems.checksum(dst)):
                logging.debug('src: %s == dst: %s, deleting src', src, dst)
                delete_files.append(src)
                continue

            src_files.append(src)
            dst_files.append(dst)
        return src_files, dst_files, delete_files, num_skipped
示例#24
0
  def last_updated(self, path):
    """Return the modification time of ``path`` as UNIX Epoch seconds.

    Args:
      path: string path of file.

    Returns: float UNIX Epoch time

    Raises:
      ``BeamIOError``: if path doesn't exist.
    """
    if self.exists(path):
      return os.path.getmtime(path)
    raise BeamIOError('Path does not exist: %s' % path)
示例#25
0
    def copy(self, source_file_names, destination_file_names):
        """Copy each source to the destination at the same index.

        Args:
          source_file_names: list of source file objects that needs to be
            copied
          destination_file_names: list of destination of the new object

        Returns:
          Whatever ``S3IO.copy_paths`` returns for the source/destination
          pairs.

        Raises:
          ``BeamIOError``: if the two lists differ in length.
        """
        if len(source_file_names) != len(destination_file_names):
            raise BeamIOError(
                'Unable to copy unequal number of sources and destinations')

        src_dest_pairs = list(zip(source_file_names, destination_file_names))
        client = s3io.S3IO(options=self._options)
        return client.copy_paths(src_dest_pairs)
示例#26
0
    def size(self, path):
        """Return the size in bytes of a file on the FileSystem.

        Args:
          path: string filepath of file.

        Returns: int size of file according to the FileSystem.

        Raises:
          ``BeamIOError``: if the underlying size lookup raises.
        """
        try:
            client = blobstorageio.BlobStorageIO()
            return client.size(path)
        except Exception as e:  # pylint: disable=broad-except
            raise BeamIOError("Size operation failed", {path: e})
示例#27
0
    def checksum(self, path):
        """Return the checksum of a file on the
        :class:`~apache_beam.io.filesystem.FileSystem`.

        The file size, rendered as a string, serves as the checksum.

        Args:
          path: string path of a file.

        Returns: string containing file size.

        Raises:
          ``BeamIOError`` if path doesn't exist.
        """
        if self.exists(path):
            return str(os.path.getsize(path))
        raise BeamIOError('Path does not exist: %s' % path)
示例#28
0
    def size(self, path):
        """Return the size of ``path`` on the local FileSystem.

        Args:
          path: string path in question.

        Returns: int size of path according to the FileSystem.

        Raises:
          ``BeamIOError`` if the size cannot be read, e.g. because the path
            doesn't exist.
        """
        try:
            num_bytes = os.path.getsize(path)
        except Exception as e:  # pylint: disable=broad-except
            raise BeamIOError("Size operation failed", {path: e})
        return num_bytes
示例#29
0
    def last_updated(self, path):
        """Return the last-updated time of ``path`` as UNIX Epoch seconds.

        Args:
          path: string path of file.

        Returns: float UNIX Epoch time

        Raises:
          ``BeamIOError``: if the underlying lookup raises.
        """
        try:
            client = s3io.S3IO(options=self._options)
            return client.last_updated(path)
        except Exception as e:  # pylint: disable=broad-except
            raise BeamIOError("last_updated operation failed", {path: e})
示例#30
0
    def checksum(self, path):
        """Return the checksum metadata of a file on the
        :class:`~apache_beam.io.filesystem.FileSystem`.

        Args:
          path: string path of a file.

        Returns: string containing checksum

        Raises:
          ``BeamIOError``: if the underlying lookup raises.
        """
        try:
            client = s3io.S3IO(options=self._options)
            return client.checksum(path)
        except Exception as e:  # pylint: disable=broad-except
            raise BeamIOError("Checksum operation failed", {path: e})