def match(self, url_patterns, limits=None):
  if limits is None:
    limits = [None] * len(url_patterns)

  if len(url_patterns) != len(limits):
    raise BeamIOError(
        'Patterns and limits should be equal in length: %d != %d' %
        (len(url_patterns), len(limits)))

  # TODO(udim): Update client to allow batched results.
  def _match(path_pattern, limit):
    """Find all matching paths to the pattern provided."""
    file_infos = self._hdfs_client.ls(path_pattern, detail=True)[:limit]
    metadata_list = [
        FileMetadata(file_info['name'], file_info['size'])
        for file_info in file_infos
    ]
    return MatchResult(path_pattern, metadata_list)

  exceptions = {}
  result = []
  for url_pattern, limit in zip(url_patterns, limits):
    try:
      path_pattern = self._parse_url(url_pattern)
      result.append(_match(path_pattern, limit))
    except Exception as e:  # pylint: disable=broad-except
      exceptions[url_pattern] = e

  if exceptions:
    raise BeamIOError('Match operation failed', exceptions)
  return result
def match(self, url_patterns, limits=None):
  if limits is None:
    limits = [None] * len(url_patterns)

  if len(url_patterns) != len(limits):
    raise BeamIOError(
        'Patterns and limits should be equal in length: %d != %d' % (
            len(url_patterns), len(limits)))

  def _match(path_pattern, limit):
    """Find all matching paths to the pattern provided."""
    fs = self._hdfs_client.status(path_pattern, strict=False)
    if fs and fs[_FILE_STATUS_TYPE] == _FILE_STATUS_TYPE_FILE:
      file_statuses = [(fs[_FILE_STATUS_PATH_SUFFIX], fs)][:limit]
    else:
      file_statuses = self._hdfs_client.list(path_pattern,
                                             status=True)[:limit]
    metadata_list = [FileMetadata(file_status[1][_FILE_STATUS_NAME],
                                  file_status[1][_FILE_STATUS_SIZE])
                     for file_status in file_statuses]
    return MatchResult(path_pattern, metadata_list)

  exceptions = {}
  result = []
  for url_pattern, limit in zip(url_patterns, limits):
    try:
      path_pattern = self._parse_url(url_pattern)
      result.append(_match(path_pattern, limit))
    except Exception as e:  # pylint: disable=broad-except
      exceptions[url_pattern] = e

  if exceptions:
    raise BeamIOError('Match operation failed', exceptions)
  return result
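A hedged usage sketch (not from the original source): driving match() directly on a HadoopFileSystem instance. The host, port, user, and the hdfs:// pattern below are placeholder assumptions; a reachable cluster plus the optional hdfs client dependency are required.

from apache_beam.io.hadoopfilesystem import HadoopFileSystem
from apache_beam.options.pipeline_options import PipelineOptions

# hdfs_host/hdfs_port/hdfs_user are the standard HadoopFileSystemOptions
# flags; the values here are placeholders for illustration only.
fs = HadoopFileSystem(pipeline_options=PipelineOptions(
    ['--hdfs_host=namenode.example.com', '--hdfs_port=9870',
     '--hdfs_user=beam']))

# match() takes parallel lists of URL patterns and per-pattern result limits;
# a limit of None means "return everything that matches".
for match_result in fs.match(['hdfs://tmp/input-*'], limits=[10]):
  for metadata in match_result.metadata_list:
    print(metadata.path, metadata.size_in_bytes)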
def copy(self, source_file_names, destination_file_names):
  """It is an error if any file to copy already exists at the destination.

  Raises ``BeamIOError`` if any error occurred.

  Args:
    source_file_names: iterable of URLs.
    destination_file_names: iterable of URLs.
  """
  if len(source_file_names) != len(destination_file_names):
    raise BeamIOError(
        'source_file_names and destination_file_names should '
        'be equal in length: %d != %d' % (len(source_file_names),
                                          len(destination_file_names)))

  def _copy_file(source, destination):
    with self._open(source) as f1:
      with self._create(destination) as f2:
        while True:
          buf = f1.read(_COPY_BUFFER_SIZE)
          if not buf:
            break
          f2.write(buf)

  def _copy_path(source, destination):
    """Recursively copy the file tree from the source to the destination."""
    if self._hdfs_client.status(
        source)[_FILE_STATUS_TYPE] != _FILE_STATUS_TYPE_DIRECTORY:
      _copy_file(source, destination)
      return

    for path, dirs, files in self._hdfs_client.walk(source):
      for dir in dirs:
        new_dir = self._join(destination, dir)
        if not self._exists(new_dir):
          self._mkdirs(new_dir)

      rel_path = posixpath.relpath(path, source)
      if rel_path == '.':
        rel_path = ''
      for file in files:
        _copy_file(self._join(path, file),
                   self._join(destination, rel_path, file))

  exceptions = {}
  for source, destination in zip(source_file_names, destination_file_names):
    try:
      rel_source = self._parse_url(source)
      rel_destination = self._parse_url(destination)
      _copy_path(rel_source, rel_destination)
    except Exception as e:  # pylint: disable=broad-except
      exceptions[(source, destination)] = e

  if exceptions:
    raise BeamIOError('Copy operation failed', exceptions)
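A self-contained sketch (illustrative, not from the original source) of the chunked copy loop used by _copy_file above, using in-memory streams and an assumed 16 MiB buffer size so it runs without HDFS.

import io

_COPY_BUFFER_SIZE = 2 ** 24  # assumed 16 MiB chunk size for illustration

def copy_stream(src, dst, buffer_size=_COPY_BUFFER_SIZE):
  """Copy src to dst in fixed-size chunks, mirroring _copy_file above."""
  while True:
    buf = src.read(buffer_size)
    if not buf:
      break
    dst.write(buf)

# Example with in-memory streams standing in for the HDFS file handles.
source = io.BytesIO(b'example payload' * 1000)
destination = io.BytesIO()
copy_stream(source, destination)
assert destination.getvalue() == b'example payload' * 1000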
def rename(self, source_file_names, destination_file_names):
  exceptions = {}
  for source, destination in zip(source_file_names, destination_file_names):
    try:
      rel_source = self._parse_url(source)
      rel_destination = self._parse_url(destination)
      if not self._hdfs_client.mv(rel_source, rel_destination):
        raise BeamIOError(
            'libhdfs error in renaming %s to %s' % (source, destination))
    except Exception as e:  # pylint: disable=broad-except
      exceptions[(source, destination)] = e

  if exceptions:
    raise BeamIOError('Rename operation failed', exceptions)
def delete(self, paths): """Deletes files or directories at the provided paths. Directories will be deleted recursively. Args: paths: list of paths that give the file objects to be deleted Raises: ``BeamIOError`` if any of the delete operations fail """ def _delete_path(path): """Recursively delete the file or directory at the provided path. """ try: if os.path.isdir(path): shutil.rmtree(path) else: os.remove(path) except OSError as err: raise IOError(err) exceptions = {} for path in paths: try: _delete_path(path) except Exception as e: # pylint: disable=broad-except exceptions[path] = e if exceptions: raise BeamIOError("Delete operation failed", exceptions)
def _list(self, dir_or_prefix):
  """List files in a location.

  Listing is non-recursive, for filesystems that support directories.

  Args:
    dir_or_prefix: (string) A directory or location prefix (for filesystems
      that don't have directories).

  Returns:
    Generator of ``FileMetadata`` objects.

  Raises:
    ``BeamIOError``: if listing fails, but not if no files were found.
  """
  if not self.exists(dir_or_prefix):
    return

  try:
    for f in os.listdir(dir_or_prefix):
      f = self.join(dir_or_prefix, f)
      try:
        yield FileMetadata(f, os.path.getsize(f))
      except OSError:
        # Files may disappear, such as when listing /tmp.
        pass
  except Exception as e:  # pylint: disable=broad-except
    raise BeamIOError("List operation failed", {dir_or_prefix: e})
def get_filesystem(path): """Get the correct filesystem for the specified path """ try: return get_filesystem(path) except Exception as e: raise BeamIOError('Enable to get the Filesystem', {path: e})
def get_filesystem(path): """Get the correct filesystem for the specified path """ try: path_scheme = FileSystems.get_scheme(path) systems = [ fs for fs in FileSystem.get_all_subclasses() if fs.scheme() == path_scheme ] if len(systems) == 0: raise ValueError('Unable to get the Filesystem for path %s' % path) elif len(systems) == 1: # Pipeline options could come either from the Pipeline itself (using # direct runner), or via RuntimeValueProvider (other runners). options = (FileSystems._pipeline_options or RuntimeValueProvider.runtime_options) return systems[0](pipeline_options=options) else: raise ValueError('Found more than one filesystem for path %s' % path) except ValueError: raise except Exception as e: raise BeamIOError('Unable to get the Filesystem', {path: e})
def rename(self, source_file_names, destination_file_names): """Rename the files at the source list to the destination list. Source and destination lists should be of the same size. Args: source_file_names: List of file paths that need to be moved destination_file_names: List of destination_file_names for the files Raises: ``BeamIOError`` if any of the rename operations fail """ err_msg = ("source_file_names and destination_file_names should " "be equal in length") assert len(source_file_names) == len(destination_file_names), err_msg def _rename_file(source, destination): """Rename a single file object""" try: os.rename(source, destination) except OSError as err: raise IOError(err) exceptions = {} for source, destination in zip(source_file_names, destination_file_names): try: _rename_file(source, destination) except Exception as e: # pylint: disable=broad-except exceptions[(source, destination)] = e if exceptions: raise BeamIOError("Rename operation failed", exceptions)
def delete(self, paths): """Deletes files or directories at the provided paths. Directories will be deleted recursively. Args: paths: list of paths that give the file objects to be deleted """ def _delete_path(path): """Recursively delete the file or directory at the provided path. """ if path.endswith('/'): path_to_use = path + '*' else: path_to_use = path match_result = self.match([path_to_use])[0] statuses = gcsio.GcsIO().delete_batch( [m.path for m in match_result.metadata_list]) failures = [e for (_, e) in statuses if e is not None] if failures: raise failures[0] exceptions = {} for path in paths: try: _delete_path(path) except Exception as e: # pylint: disable=broad-except exceptions[path] = e if exceptions: raise BeamIOError("Delete operation failed", exceptions)
def copy(self, source_file_names, destination_file_names): """Recursively copy the file tree from the source to the destination Args: source_file_names: list of source file objects that needs to be copied destination_file_names: list of destination of the new object Raises: ``BeamIOError``: if any of the copy operations fail """ err_msg = ( "source_file_names and destination_file_names should " "be equal in length") assert len(source_file_names) == len(destination_file_names), err_msg def _copy_path(source, destination): """Recursively copy the file tree from the source to the destination """ if not destination.startswith(GCSFileSystem.GCS_PREFIX): raise ValueError('Destination %r must be GCS path.' % destination) # Use copy_tree if the path ends with / as it is a directory if source.endswith('/'): gcsio.GcsIO().copytree(source, destination) else: gcsio.GcsIO().copy(source, destination) exceptions = {} for source, destination in zip(source_file_names, destination_file_names): try: _copy_path(source, destination) except Exception as e: # pylint: disable=broad-except exceptions[(source, destination)] = e if exceptions: raise BeamIOError("Copy operation failed", exceptions)
def get_filesystem(path): # type: (str) -> FileSystems """Get the correct filesystem for the specified path """ try: path_scheme = FileSystems.get_scheme(path) systems = [ fs for fs in FileSystem.get_all_subclasses() if fs.scheme() == path_scheme ] if len(systems) == 0: raise ValueError( 'Unable to get filesystem from specified path, please use the ' 'correct path or ensure the required dependency is installed, ' 'e.g., pip install apache_beam[gcp]. Path specified: %s' % path) elif len(systems) == 1: # Pipeline options could come either from the Pipeline itself (using # direct runner), or via RuntimeValueProvider (other runners). options = (FileSystems._pipeline_options or RuntimeValueProvider.runtime_options) return systems[0](pipeline_options=options) else: raise ValueError('Found more than one filesystem for path %s' % path) except ValueError: raise except Exception as e: raise BeamIOError('Unable to get the Filesystem', {path: e})
def __init__(self, hdfs_client, path):
  self._hdfs_client = hdfs_client
  if self._hdfs_client.status(path, strict=False) is not None:
    raise BeamIOError('Path already exists: %s' % path)

  self._handle_context = self._hdfs_client.write(path)
  self._handle = self._handle_context.__enter__()
def _list(self, url):
  try:
    path = self._parse_url(url)
    for res in self._hdfs_client.list(path, status=True):
      yield FileMetadata(_HDFS_PREFIX + self._join(path, res[0]),
                         res[1][_FILE_STATUS_LENGTH])
  except Exception as e:  # pylint: disable=broad-except
    raise BeamIOError('Match operation failed', {url: e})
def rename(self, source_file_names, destination_file_names): """Rename the files at the source list to the destination list. Source and destination lists should be of the same size. Args: source_file_names: List of file paths that need to be moved destination_file_names: List of destination_file_names for the files Raises: ``BeamIOError``: if any of the rename operations fail """ if not len(source_file_names) == len(destination_file_names): message = 'Unable to rename unequal number of sources and destinations' raise BeamIOError(message) src_dest_pairs = list(zip(source_file_names, destination_file_names)) results = s3io.S3IO(options=self._options).rename_files(src_dest_pairs) exceptions = {(src, dest): error for (src, dest, error) in results if error is not None} if exceptions: raise BeamIOError("Rename operation failed", exceptions)
def delete(self, urls):
  exceptions = {}
  for url in urls:
    try:
      path = self._parse_url(url)
      self._hdfs_client.delete(path, recursive=True)
    except Exception as e:  # pylint: disable=broad-except
      exceptions[url] = e

  if exceptions:
    raise BeamIOError("Delete operation failed", exceptions)
def process(self, file_pattern: str) -> List[filesystem.FileMetadata]: # TODO: Should we batch the lookups? match_results = filesystems.FileSystems.match([file_pattern]) match_result = match_results[0] if (not match_result.metadata_list and not EmptyMatchTreatment.allow_empty_match(file_pattern, self._empty_match_treatment)): raise BeamIOError( 'Empty match for pattern %s. Disallowed.' % file_pattern) return match_result.metadata_list
def exists(self, path): """Check if the provided path exists on the FileSystem. Args: path: string path that needs to be checked. Returns: boolean flag indicating if path exists """ try: return s3io.S3IO(options=self._options).exists(path) except Exception as e: # pylint: disable=broad-except raise BeamIOError("exists() operation failed", {path: e})
def delete(self, paths): """Deletes files or directories at the provided paths. Directories will be deleted recursively. Args: paths: list of paths that give the file objects to be deleted """ results = s3io.S3IO().delete_paths(paths) exceptions = {path: error for (path, error) in results.items() if error is not None} if exceptions: raise BeamIOError("Delete operation failed", exceptions)
def exists(self, path): # type: (str) -> bool """Check if the provided path exists on the FileSystem. Args: path: string path that needs to be checked. Returns: boolean flag indicating if path exists """ try: return blobstorageio.BlobStorageIO().exists(path) except Exception as e: # pylint: disable=broad-except raise BeamIOError("exists() operation failed", {path: e})
def test_file_sink_rename_error(self, rename_mock):
  temp_path = os.path.join(self._new_tempdir(), 'rename_error')
  sink = MyFileBasedSink(
      temp_path, file_name_suffix='.output', coder=coders.ToStringCoder())
  init_token, writer_results = self._common_init(sink)
  pre_finalize_results = sink.pre_finalize(init_token, writer_results)

  error_str = 'mock rename error description'
  rename_mock.side_effect = BeamIOError(
      'mock rename error', {('src', 'dst'): error_str})
  with self.assertRaisesRegexp(Exception, error_str):
    list(sink.finalize_write(init_token, writer_results,
                             pre_finalize_results))
def process(self, file_metadata):
  metadata = (filesystem.FileMetadata(file_metadata, 0)
              if isinstance(file_metadata, (str, unicode))
              else file_metadata)

  if metadata.path.endswith('/') and self._skip_directories:
    return
  elif metadata.path.endswith('/'):
    raise BeamIOError(
        'Directories are not allowed in ReadMatches transform. '
        'Found %s.' % metadata.path)

  # TODO: Mime type? Other arguments? Maybe arguments passed in to transform?
  yield ReadableFile(metadata)
def _check_state_for_finalize_write(self, writer_results, num_shards):
  """Checks writer output files' states.

  Returns:
    src_files, dst_files: Lists of files to rename. For each i, finalize_write
      should rename(src_files[i], dst_files[i]).
    delete_files: Src files to delete. These could be leftovers from an
      incomplete (non-atomic) rename operation.
    num_skipped: Tally of writer results files already renamed, such as from a
      previous run of finalize_write().
  """
  if not writer_results:
    return [], [], [], 0

  src_glob = FileSystems.join(FileSystems.split(writer_results[0])[0], '*')
  dst_glob = self._get_final_name_glob(num_shards)
  src_glob_files = set(
      file_metadata.path for mr in FileSystems.match([src_glob])
      for file_metadata in mr.metadata_list)
  dst_glob_files = set(
      file_metadata.path for mr in FileSystems.match([dst_glob])
      for file_metadata in mr.metadata_list)

  src_files = []
  dst_files = []
  delete_files = []
  num_skipped = 0
  for shard_num, src in enumerate(writer_results):
    final_name = self._get_final_name(shard_num, num_shards)
    dst = final_name
    src_exists = src in src_glob_files
    dst_exists = dst in dst_glob_files
    if not src_exists and not dst_exists:
      raise BeamIOError(
          'src and dst files do not exist. src: %s, dst: %s' % (src, dst))
    if not src_exists and dst_exists:
      logging.debug('src: %s -> dst: %s already renamed, skipping', src, dst)
      num_skipped += 1
      continue
    if (src_exists and dst_exists and
        FileSystems.checksum(src) == FileSystems.checksum(dst)):
      logging.debug('src: %s == dst: %s, deleting src', src, dst)
      delete_files.append(src)
      continue

    src_files.append(src)
    dst_files.append(dst)

  return src_files, dst_files, delete_files, num_skipped
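The per-shard rename/skip/delete decision above can be read in isolation; the following self-contained sketch (simplified names, with in-memory sets standing in for the glob results and a plain dict as a fake checksum source, not from the original source) reproduces that classification with a tiny worked example.

def classify_shards(writer_results, final_names, src_files_on_disk,
                    dst_files_on_disk, checksum):
  """Mirror of the decision table above: rename, skip, or delete per shard."""
  src_files, dst_files, delete_files, num_skipped = [], [], [], 0
  for src, dst in zip(writer_results, final_names):
    src_exists = src in src_files_on_disk
    dst_exists = dst in dst_files_on_disk
    if not src_exists and not dst_exists:
      raise RuntimeError('src and dst files do not exist: %s, %s' % (src, dst))
    if not src_exists and dst_exists:
      num_skipped += 1          # already renamed by a previous attempt
    elif src_exists and dst_exists and checksum(src) == checksum(dst):
      delete_files.append(src)  # leftover src from an interrupted rename
    else:
      src_files.append(src)
      dst_files.append(dst)
  return src_files, dst_files, delete_files, num_skipped

# Worked example with fake state: shard 0 still needs renaming, shard 1 was
# already renamed, shard 2 has both copies with equal checksums.
sums = {'tmp/s0': 'a', 'tmp/s2': 'c', 'out/s1': 'b', 'out/s2': 'c'}
print(classify_shards(
    ['tmp/s0', 'tmp/s1', 'tmp/s2'], ['out/s0', 'out/s1', 'out/s2'],
    {'tmp/s0', 'tmp/s2'}, {'out/s1', 'out/s2'}, sums.get))
# -> (['tmp/s0'], ['out/s0'], ['tmp/s2'], 1)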
def last_updated(self, path): """Get UNIX Epoch time in seconds on the FileSystem. Args: path: string path of file. Returns: float UNIX Epoch time Raises: ``BeamIOError``: if path doesn't exist. """ if not self.exists(path): raise BeamIOError('Path does not exist: %s' % path) return os.path.getmtime(path)
def copy(self, source_file_names, destination_file_names): """Recursively copy the file tree from the source to the destination Args: source_file_names: list of source file objects that needs to be copied destination_file_names: list of destination of the new object Raises: ``BeamIOError``: if any of the copy operations fail """ if not len(source_file_names) == len(destination_file_names): message = 'Unable to copy unequal number of sources and destinations' raise BeamIOError(message) src_dest_pairs = list(zip(source_file_names, destination_file_names)) return s3io.S3IO(options=self._options).copy_paths(src_dest_pairs)
def size(self, path): """Get size in bytes of a file on the FileSystem. Args: path: string filepath of file. Returns: int size of file according to the FileSystem. Raises: ``BeamIOError``: if path doesn't exist. """ try: return blobstorageio.BlobStorageIO().size(path) except Exception as e: # pylint: disable=broad-except raise BeamIOError("Size operation failed", {path: e})
def checksum(self, path): """Fetch checksum metadata of a file on the :class:`~apache_beam.io.filesystem.FileSystem`. Args: path: string path of a file. Returns: string containing file size. Raises: ``BeamIOError`` if path isn't a file or doesn't exist. """ if not self.exists(path): raise BeamIOError('Path does not exist: %s' % path) return str(os.path.getsize(path))
def size(self, path): """Get size of path on the FileSystem. Args: path: string path in question. Returns: int size of path according to the FileSystem. Raises: ``BeamIOError`` if path doesn't exist. """ try: return os.path.getsize(path) except Exception as e: # pylint: disable=broad-except raise BeamIOError("Size operation failed", {path: e})
def last_updated(self, path): """Get UNIX Epoch time in seconds on the FileSystem. Args: path: string path of file. Returns: float UNIX Epoch time Raises: ``BeamIOError``: if path doesn't exist. """ try: return s3io.S3IO(options=self._options).last_updated(path) except Exception as e: # pylint: disable=broad-except raise BeamIOError("last_updated operation failed", {path: e})
def checksum(self, path): """Fetch checksum metadata of a file on the :class:`~apache_beam.io.filesystem.FileSystem`. Args: path: string path of a file. Returns: string containing checksum Raises: ``BeamIOError``: if path isn't a file or doesn't exist. """ try: return s3io.S3IO(options=self._options).checksum(path) except Exception as e: # pylint: disable=broad-except raise BeamIOError("Checksum operation failed", {path: e})