def test_get_filesystem(self):
  """Check which filesystem ``FileSystems.get_filesystem`` picks for a path.

  Two local-looking paths are expected to resolve to ``LocalFileSystem``;
  a path with the ``error://`` scheme is expected to raise ``ValueError``.
  """
  self.assertIsInstance(
      FileSystems.get_filesystem('/tmp'), localfilesystem.LocalFileSystem)
  # Both backslashes are escaped explicitly: a lone ``\d`` is an invalid
  # escape sequence that newer Python versions warn about. The resulting
  # string value (c:\abc\def) is unchanged.
  self.assertIsInstance(
      FileSystems.get_filesystem('c:\\abc\\def'),
      localfilesystem.LocalFileSystem)
  with self.assertRaises(ValueError):
    FileSystems.get_filesystem('error://abc/def')
Exemplo n.º 2
0
 def test_get_filesystem(self):
   """Check which filesystem ``FileSystems.get_filesystem`` picks for a path.

   Two local-looking paths are expected to resolve to ``LocalFileSystem``;
   a path with the ``error://`` scheme is expected to raise ``ValueError``.
   """
   self.assertIsInstance(
       FileSystems.get_filesystem('/tmp'), localfilesystem.LocalFileSystem)
   # Both backslashes are escaped explicitly: a lone ``\d`` is an invalid
   # escape sequence that newer Python versions warn about. The resulting
   # string value (c:\abc\def) is unchanged.
   self.assertIsInstance(
       FileSystems.get_filesystem('c:\\abc\\def'),
       localfilesystem.LocalFileSystem)
   with self.assertRaises(ValueError):
     FileSystems.get_filesystem('error://abc/def')
Exemplo n.º 3
0
 def test_get_filesystem(self):
     """Check which filesystem ``FileSystems.get_filesystem`` picks for a path.

     Two local-looking paths are expected to resolve to ``LocalFileSystem``;
     a path with the ``error://`` scheme is expected to raise ``ValueError``.
     """
     # assertIsInstance reports the actual type on failure, unlike
     # assertTrue(isinstance(...)), which only says "False is not true".
     self.assertIsInstance(
         FileSystems.get_filesystem('/tmp'),
         localfilesystem.LocalFileSystem)
     self.assertIsInstance(
         FileSystems.get_filesystem('c:\\abc\\def'),
         localfilesystem.LocalFileSystem)
     with self.assertRaises(ValueError):
         FileSystems.get_filesystem('error://abc/def')
Exemplo n.º 4
0
def get_file_size(file_path):
    """
    Return the size, in bytes, of the file at ``file_path``.

    The path is first parsed with ``parse_linked_bundle_url``. Three cases
    follow from the parsed result:

    * Not a Beam-backed archive: the path is treated as a local path and
      its ``os.stat`` size is returned.
    * Beam-backed archive with an archive subpath: the size recorded for
      that entry in the indexed archive is returned.
    * Beam-backed archive without a subpath: for an archive directory the
      filesystem-reported size of the bundle is returned; otherwise the
      bundle is opened through ``OpenFile`` and the end-of-stream offset
      is returned.

    Raises:
        FileNotFoundError: if the local path does not exist, or the
            requested entry is missing from the archive.
    """
    bundle = parse_linked_bundle_url(file_path)

    if not (bundle.uses_beam and bundle.is_archive):
        # Local path
        if get_path_exists(file_path):
            return os.stat(file_path).st_size
        raise FileNotFoundError(file_path)

    archive_path = bundle.bundle_path

    if bundle.archive_subpath:
        # A subpath was requested: look the entry up inside the indexed archive.
        with OpenIndexedArchiveFile(archive_path) as archive:
            # Subpath lookups are only expected for archive directories.
            assert bundle.is_archive_dir
            entry_name = "/" + bundle.archive_subpath
            entry_info = archive.getFileInfo(entry_name)
            if entry_info is None:
                raise FileNotFoundError(entry_name)
            return entry_info.size

    # No subpath: report the size of the bundle as a whole.
    if bundle.is_archive_dir:
        # Size of the stored archive as reported by its filesystem.
        return FileSystems.get_filesystem(archive_path).size(archive_path)

    # Single-file archive: seek to the end of the stream and read the offset.
    with OpenFile(archive_path, 'rb') as stream:
        stream.seek(0, os.SEEK_END)
        return stream.tell()
Exemplo n.º 5
0
def path_exists(path, d_pl_options, is_dir):
    """Check whether ``path`` exists and return the path that was checked.

    Args:
        path: Path to check; its filesystem is looked up via ``FileSystems``.
        d_pl_options: Options forwarded to ``gcs_correct_dir_path_form``.
        is_dir: When true and the path is on GCS, the path is first rewritten
            by ``gcs_correct_dir_path_form`` (with ``strip_prefix=False``).

    Returns:
        A ``(exists, checked_path)`` tuple, where ``checked_path`` is either
        ``path`` itself or its GCS-corrected directory form.
    """
    fs = FileSystems.get_filesystem(path)
    checked_path = path
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of GCSFileSystem.
    if is_dir and isinstance(fs, GCSFileSystem):
        checked_path = gcs_correct_dir_path_form(
            path, d_pl_options, strip_prefix=False)
    return FileSystems.exists(checked_path), checked_path
Exemplo n.º 6
0
def list_dir(dir_path, d_pl_options, exclude_subdir=False):
    """List the entries under ``dir_path``.

    Args:
        dir_path: Directory (or GCS prefix) to list.
        d_pl_options: Options forwarded to ``gcs_correct_dir_path_form``.
        exclude_subdir: Accepted for interface compatibility; this function
            does not currently read it.

    Returns:
        For GCS paths, the result of ``GcsIO.list_prefix`` on the corrected
        directory form of ``dir_path``; otherwise the result of
        ``tf.io.gfile.listdir(dir_path)``.
    """
    fs = FileSystems.get_filesystem(dir_path)
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of GCSFileSystem.
    if isinstance(fs, GCSFileSystem):
        return gcsio.GcsIO().list_prefix(
            gcs_correct_dir_path_form(dir_path,
                                      d_pl_options,
                                      strip_prefix=False))
    return tf.io.gfile.listdir(dir_path)
def get_file_size(file_path):
    """
    Gets the size of the file, in bytes.

    Args:
        file_path: Path of the file to measure.

    Returns:
        The size reported by the filesystem that ``FileSystems`` selects
        for ``file_path``.

    Raises:
        FileNotFoundError: if ``get_path_exists(file_path)`` is false. The
            missing path is carried in the exception.
    """
    if not get_path_exists(file_path):
        # Include the path so the error says which file was missing.
        raise FileNotFoundError(file_path)
    # TODO: add a FileSystems.size() method to Apache Beam to make this less verbose.
    filesystem = FileSystems.get_filesystem(file_path)
    return filesystem.size(file_path)
Exemplo n.º 8
0
def make_dirs(path, d_pl_options):
    """Create the directory ``path``, best effort.

    Args:
        path: Directory path to create.
        d_pl_options: Options forwarded to ``gcs_correct_dir_path_form`` and
            ``get_gcs_bucket``.

    Returns:
        For GCS paths, the result of uploading an empty string to the blob
        named by the corrected, prefix-stripped form of ``path``. Otherwise
        the result of ``FileSystems.mkdirs(path)``, or ``None`` if that call
        raised.
    """
    import logging

    fs = FileSystems.get_filesystem(path)
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of GCSFileSystem.
    if isinstance(fs, GCSFileSystem):
        gcs_form_path = gcs_correct_dir_path_form(path,
                                                  d_pl_options,
                                                  strip_prefix=True)
        placeholder_blob = get_gcs_bucket(d_pl_options).blob(gcs_form_path)
        # An empty upload stands in for the directory on GCS.
        return placeholder_blob.upload_from_string(
            '', content_type='application/x-www-form-urlencoded;charset=UTF-8')

    try:
        return FileSystems.mkdirs(path)
    except Exception:
        # Creation stays best-effort (callers get None, as before), but the
        # failure is recorded instead of being silently discarded.
        logging.getLogger(__name__).debug(
            "FileSystems.mkdirs(%r) failed; continuing", path, exc_info=True)
        return None
Exemplo n.º 9
0
def get_file_size(fpath):
    """Return the size of the file at ``fpath``.

    GCS paths are measured with ``GcsIO.size``; any other path is measured
    through ``FileIO(fpath, "rb").size()``.
    """
    fs = FileSystems.get_filesystem(fpath)
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of GCSFileSystem.
    if isinstance(fs, GCSFileSystem):
        return gcsio.GcsIO().size(fpath)
    # NOTE(review): the FileIO object is never explicitly closed here;
    # confirm whether this FileIO implementation holds an open handle.
    return FileIO(fpath, "rb").size()
Exemplo n.º 10
0
def _gcs_delete_candidates(path, d_pl_options):
    """Lazily yield ``(label, blob_name)`` spellings of ``path`` to try, in order."""
    # Each alternative is computed only if the previous one did not exist,
    # because the generator runs just far enough to produce the next item.
    yield "not stripped, not gcs corrected", path
    yield ("not stripped, gcs corrected",
           gcs_correct_dir_path_form(path, d_pl_options, strip_prefix=False))
    yield ("stripped, not gcs corrected",
           gcs_path_strip_prefix(path, d_pl_options))
    yield ("stripped, gcs corrected",
           gcs_correct_dir_path_form(path, d_pl_options, strip_prefix=True))


def delete_file(path, d_pl_options, recursive=False, r_level=0, debug=False):
    """Delete ``path`` from its filesystem.

    For non-GCS paths this delegates to ``FileSystems.delete([path])``.

    For GCS paths, children returned by ``list_dir`` are deleted first when
    ``recursive`` is true. Then up to four spellings of the blob name are
    tried in order (raw, GCS-corrected, prefix-stripped, prefix-stripped and
    GCS-corrected); the first one that exists is deleted.

    Args:
        path: Path to delete.
        d_pl_options: Options forwarded to the GCS helper functions.
        recursive: On GCS, delete listed children before ``path`` itself.
        r_level: Recursion depth; only affects the indentation of debug output.
        debug: Print progress messages, including for recursive calls.

    Returns:
        For GCS, the result of ``blob.delete`` on the first existing
        candidate, or ``False`` if no candidate exists. Otherwise the result
        of ``FileSystems.delete``.
    """
    fs = FileSystems.get_filesystem(path)
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of GCSFileSystem.
    if not isinstance(fs, GCSFileSystem):
        return FileSystems.delete([path])

    gcs_client = get_gcs_client()
    indent = '-' * r_level
    if debug:
        print(
            f"{indent} delete_file (debug): path: {path}, recursive: {recursive}"
        )

    if recursive:
        for child_path in list_dir(path, d_pl_options, exclude_subdir=False):
            if child_path == path:
                continue
            if debug:
                print(
                    f"{'-'*(r_level+1)} delete_file (debug): path {path} has child: {child_path}"
                )
            # Forward ``debug`` so nested calls report at their own depth;
            # ``r_level`` has no other purpose than indenting that output.
            delete_file(child_path,
                        d_pl_options,
                        recursive=True,
                        r_level=r_level + 1,
                        debug=debug)

    for label, candidate in _gcs_delete_candidates(path, d_pl_options):
        blob = get_gcs_bucket(d_pl_options).blob(candidate)
        exists = blob.exists(gcs_client)
        if debug:
            print(f"{indent} {candidate} ({label}): {blob}, exists: {exists}")
        if not exists:
            continue
        delete_result = blob.delete(gcs_client)
        if debug:
            print(
                f"{indent} {candidate} ({label}): {blob}, exists: {blob.exists(gcs_client)} (after delete attempt)"
            )
        return delete_result

    if debug:
        print(f"{indent} out of options trying to delete base path {path}!")
    return False
Exemplo n.º 11
0
def open_file_write(fpath):
    """Open ``fpath`` for writing and return the resulting file object.

    GCS paths are opened with ``GcsIO.open(fpath, mode='w')``; any other
    path is created through ``FileSystems.create``. The caller is
    responsible for closing the returned object.
    """
    fs = FileSystems.get_filesystem(fpath)
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of GCSFileSystem.
    if isinstance(fs, GCSFileSystem):
        return gcsio.GcsIO().open(fpath, mode='w')
    return FileSystems.create(fpath)
Exemplo n.º 12
0
def open_file_read(fpath):
    """Open ``fpath`` for reading and return the resulting file object.

    GCS paths are opened with ``GcsIO.open(fpath)``; any other path is
    opened through ``FileSystems.open``. The caller is responsible for
    closing the returned object.
    """
    fs = FileSystems.get_filesystem(fpath)
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of GCSFileSystem.
    if isinstance(fs, GCSFileSystem):
        return gcsio.GcsIO().open(fpath)
    return FileSystems.open(fpath)