Example #1
def copy(source, dest, filesystem_from=None, filesystem_to=None):
    """
    Copy a file from the source to the destination file system

    :param source: (str) urlpath of the file to copy
    :param dest: (str) urlpath of the folder where to save the file
    :param filesystem_from: (`fsspec` compatible file system instance)
    :param filesystem_to: (`fsspec` compatible file system instance)
    :return: (str) urlpath of the copied file
    """
    _, filename = os.path.split(source)
    target = os.path.join(dest, filename)

    filesystem_from = filesystem_from or \
        fsspec.filesystem(split_protocol(source)[0])
    filesystem_to = filesystem_to or \
        fsspec.filesystem(split_protocol(dest)[0])

    with filesystem_from.open(source, "rb") as f_read:
        filesystem_to.makedirs(dest, exist_ok=True)
        with filesystem_to.open(target, "wb") as f_write:
            if isinstance(filesystem_to, dcachefs.dCacheFileSystem):
                f_write.write(f_read)  # stream upload of file-like object
            else:
                data = True
                while data:
                    data = f_read.read(CHUNKSIZE)
                    f_write.write(data)

    return target
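A minimal usage sketch, assuming the snippet's imports (`os`, `fsspec`, `split_protocol`, `dcachefs`) and a hypothetical CHUNKSIZE, which the snippet does not show. The in-memory backend keeps it runnable without credentials, though `dcachefs` must still be importable for the isinstance check:

import fsspec

CHUNKSIZE = 4 * 1024 * 1024  # hypothetical 4 MB buffer; not defined in the snippet

mem = fsspec.filesystem("memory")
with mem.open("memory://src/data.txt", "wb") as f:
    f.write(b"hello")

target = copy("memory://src/data.txt", "memory://dst",
              filesystem_from=mem, filesystem_to=mem)
print(target)  # memory://dst/data.txt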
Example #2
def upload_output_directory(url):
    if url is None:
        yield None, None
        return

    protocol, _ = split_protocol(url)
    if protocol is not None:
        # To avoid extra network load, write all output files locally at runtime,
        # then upload to the remote fs at the end.
        with tempfile.TemporaryDirectory() as tmpdir:
            fs, remote_path = get_fs_and_path(url)
            if path_exists(url):
                fs.get(url, tmpdir + "/", recursive=True)

            def put_fn():
                fs.put(tmpdir, remote_path, recursive=True)

            # Write to temp directory locally
            yield tmpdir, put_fn

            # Upload to remote when finished
            put_fn()
    else:
        makedirs(url, exist_ok=True)
        # Just use the output directory directly if using a local filesystem
        yield url, None
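Since the function yields, it is presumably wrapped with @contextlib.contextmanager in its source project; a hedged sketch of that usage, wrapping it by hand (the S3 URL is hypothetical, and helpers like get_fs_and_path and path_exists come from the same project):

import contextlib
import os

with contextlib.contextmanager(upload_output_directory)("s3://bucket/run-1") as (outdir, put_fn):
    with open(os.path.join(outdir, "metrics.json"), "w") as f:
        f.write("{}")
    # put_fn() can also be called mid-run to sync intermediate results
# the final upload to s3://bucket/run-1 happens when the block exits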
Example #3
def read_bytes_collection(
    vineyard_socket, prefix, storage_options, proc_num, proc_index
):
    """Read a set of files as a collection of ByteStreams."""
    client = vineyard.connect(vineyard_socket)

    protocol, prefix_path = split_protocol(prefix)
    fs = fsspec.filesystem(protocol, **storage_options)

    worker_prefix = os.path.join(prefix_path, '%s-%s' % (proc_num, proc_index))

    logger.info("start creating blobs ...")
    queue: "ConcurrentQueue[Tuple[ByteStream, str]]" = ConcurrentQueue()
    stream_id = read_stream_collections(client, fs, queue, worker_prefix, worker_prefix)

    client.persist(stream_id)
    report_success(stream_id)

    logger.info("start reading blobs ...")
    executor = ThreadStreamExecutor(
        ReadToByteStreamExecutor,
        parallism=1,
        client=client,
        fs=fs,
        task_queue=queue,
        chunk_size=CHUNK_SIZE,
    )
    executor.execute()
Example #4
def rename(src, tgt):
    protocol, _ = split_protocol(tgt)
    if protocol is not None:
        fs = fsspec.filesystem(protocol)
        fs.mv(src, tgt, recursive=True)
    else:
        safe_move_file(src, tgt)
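Both branches in a hedged sketch. Note that `fs.mv` moves within a single filesystem, so a remote `src` and `tgt` are expected to share a protocol; the bucket below is hypothetical:

rename("s3://my-bucket/tmp/run-1", "s3://my-bucket/final/run-1")  # remote: fs.mv
rename("/data/part-0.parquet", "/data/final.parquet")             # no protocol: safe_move_file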
Example #5
def __init__(self, path=None, **storage_options):
    from fsspec import filesystem
    from fsspec.core import split_protocol
    self.pdir = make_path_posix(path or conf.get('persist_path'))
    protocol, _ = split_protocol(self.pdir)
    path = posixpath.join(self.pdir, 'cat.yaml')
    self.fs = filesystem(protocol, **storage_options)
    super(PersistStore, self).__init__(path)
Example #6
def get_fs_and_path(url):
    protocol, path = split_protocol(url)
    # Parse the url to get only the escaped url path
    path = unquote(urlparse(path).path)
    # Create a windows compatible path from url path
    path = os.fspath(pathlib.PurePosixPath(path))
    fs = fsspec.filesystem(protocol)
    return fs, path
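A sketch of what this variant returns for a percent-escaped URL, assuming the snippet's imports (urlparse, unquote, pathlib, os, fsspec, split_protocol); the `file` protocol keeps it runnable without extra backends:

fs, path = get_fs_and_path("file:///tmp/my%20data/table.csv")
print(type(fs).__name__)  # LocalFileSystem
print(path)               # /tmp/my data/table.csv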
Example #7
def _get_fs_and_protocol(self):
    storage_options = self.storage_options or {}
    protocol, path = split_protocol(self.prefix_path)
    cls = fsspec.get_filesystem_class(protocol)
    options = cls._get_kwargs_from_urls(self.prefix_path)
    update_storage_options(options, storage_options)
    fs = cls(**options)
    return fs, protocol
Example #8
    def _get_protocol_path(self, urlpath) -> Tuple[str, List[str]]:
        if isinstance(urlpath, str):
            return split_protocol(urlpath)

        protocols, paths = zip(*map(split_protocol, urlpath))
        assert (len(set(protocols)) == 1
                ), "Cannot mix file protocols in a single operation"
        return protocols[0], list(paths)
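The list branch, rendered standalone for illustration:

from fsspec.core import split_protocol

urls = ["s3://bucket/a.parquet", "s3://bucket/b.parquet"]
protocols, paths = zip(*map(split_protocol, urls))
assert len(set(protocols)) == 1, "Cannot mix file protocols in a single operation"
print(protocols[0], list(paths))  # s3 ['bucket/a.parquet', 'bucket/b.parquet']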
Example #9
def upload_output_file(url):
    """Takes a remote URL as input, returns a temp filename, then uploads it when done."""
    protocol, _ = split_protocol(url)
    if protocol is not None:
        fs = fsspec.filesystem(protocol)
        with tempfile.TemporaryDirectory() as tmpdir:
            local_fname = os.path.join(tmpdir, "tmpfile")
            yield local_fname
            fs.put(local_fname, url, recursive=True)
    else:
        yield url
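As with Example #2, this generator is presumably decorated with @contextlib.contextmanager in its source project; wrapping it by hand gives the same effect (the S3 URL is hypothetical):

import contextlib

with contextlib.contextmanager(upload_output_file)("s3://bucket/model.bin") as fname:
    with open(fname, "wb") as f:
        f.write(b"\x00" * 16)
# the temp file is uploaded to the remote URL on exit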
Example #10
def get_dir(path):
    if '://' in path:
        protocol, _ = split_protocol(path)
        out = get_filesystem_class(protocol)._parent(path)
        if "://" not in out:
            # some FSs strip this, some do not
            out = protocol + "://" + out
        return out
    path = make_path_posix(os.path.join(os.getcwd(), os.path.dirname(path)))
    if path[-1] != '/':
        path += '/'
    return path
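A hedged sketch of both branches; the remote case requires the matching fsspec backend (s3fs here), and the local output is illustrative:

print(get_dir("s3://bucket/data/part-0.csv"))  # s3://bucket/data
print(get_dir("notes/report.txt"))             # e.g. /home/user/project/notes/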
Example #11
def upload_h5(url):
    protocol, _ = split_protocol(url)
    if protocol is not None:
        fs = fsspec.filesystem(protocol)
        with tempfile.TemporaryDirectory() as tmpdir:
            local_fname = os.path.join(tmpdir, 'file.h5')
            with h5py.File(local_fname, 'w') as f:
                yield f
            fs.put(local_fname, url, recursive=True)
    else:
        mode = 'r+' if path_exists(url) else 'w'
        with h5py.File(url, mode) as f:
            yield f
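Again presumably used as a context manager in its source project; wrapping by hand for illustration. A local target exercises the else branch, and h5py plus the project's path_exists helper are assumed available:

import contextlib

with contextlib.contextmanager(upload_h5)("/tmp/example.h5") as f:
    f.create_dataset("x", data=[1, 2, 3])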
Example #12
    def decode(
        self,
        ctx: FlyteContext,
        flyte_value: literals.StructuredDataset,
        current_task_metadata: StructuredDatasetMetadata,
    ) -> pa.Table:
        uri = flyte_value.uri
        if not ctx.file_access.is_remote(uri):
            Path(uri).parent.mkdir(parents=True, exist_ok=True)
        _, path = split_protocol(uri)

        columns = None
        if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
            columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]
        try:
            fp = FSSpecPersistence(data_config=ctx.file_access.data_config)
            fs = fp.get_filesystem(uri)
            return pq.read_table(path, filesystem=fs, columns=columns)
        except NoCredentialsError as e:
            logger.debug("S3 source detected, attempting anonymous S3 access")
            fs = FSSpecPersistence.get_anonymous_filesystem(uri)
            if fs is not None:
                return pq.read_table(path, filesystem=fs, columns=columns)
            raise e
Example #13
def _get_fs_and_protocol(self):
    protocol, path = split_protocol(self.prefix_path)
    fs = fsspec.filesystem(protocol, **self.storage_options)
    return fs, protocol
Example #14
def get_path(path):
    protocol, _ = split_protocol(path)
    if protocol is not None:
        return path
    return pathlib.Path(os.path.abspath(path)).as_uri()
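A quick sketch of both branches (the second output depends on the current working directory):

print(get_path("s3://bucket/key"))  # s3://bucket/key (already qualified, unchanged)
print(get_path("data/train.csv"))   # file:///abs/path/to/data/train.csv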
Example #15
def get_localized_path(self, path):
    _, lpath = split_protocol(path)
    return lpath
Example #16
def upgrade_http(urlpath):
    protocol, url = split_protocol(urlpath)
    if protocol == "http":
        return "https://" + url
    return None
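A quick check of both branches:

print(upgrade_http("http://example.com/data.csv"))   # https://example.com/data.csv
print(upgrade_http("https://example.com/data.csv"))  # None (not plain http)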
Example #17
def is_http(urlpath):
    protocol, _ = split_protocol(urlpath)
    return protocol == "http" or protocol == "https"
Example #18
def has_remote_protocol(url):
    protocol, _ = split_protocol(url)
    return protocol and protocol != "file"
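The two predicates above in action:

print(is_http("https://example.com/x"))      # True
print(has_remote_protocol("s3://bucket/x"))  # True
print(has_remote_protocol("file:///tmp/x"))  # False ('file' counts as local)
print(has_remote_protocol("/tmp/x"))         # None (falsy: no protocol at all)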
Example #19
def read_bytes(
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket to connect to
        path (str): External storage path to read from
        storage_options (dict): Configuration of the external storage
        read_options (dict): Additional options that control the read behavior
        proc_num (int): Total number of processes
        proc_index (int): The index of this process

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    builder = ByteStreamBuilder(client)

    serialization_mode = read_options.pop('serialization_mode', False)
    if serialization_mode:
        parsed = urlparse(path)
        try:
            fs = fsspec.filesystem(parsed.scheme)
        except ValueError as e:
            report_status("error", str(e))
            raise
        meta_file = f"{path}_{proc_index}.meta"
        blob_file = f"{path}_{proc_index}"
        if not fs.exists(meta_file) or not fs.exists(blob_file):
            report_status(
                "error",
                f"Some serialization file cannot be found. Expected: {meta_file} and {blob_file}"
            )
            raise FileNotFoundError('{}, {}'.format(meta_file, blob_file))
        # Used for read bytes of serialized graph
        meta_file = fsspec.open(meta_file, mode="rb", **storage_options)
        with meta_file as f:
            meta = f.read().decode('utf-8')
            meta = json.loads(meta)
        lengths = meta.pop("lengths")
        for k, v in meta.items():
            builder[k] = v
        stream = builder.seal(client)
        client.persist(stream)
        ret = {"type": "return", "content": repr(stream.id)}
        print(json.dumps(ret), flush=True)
        writer = stream.open_writer(client)
        of = fsspec.open(blob_file, mode="rb", **storage_options)
        with of as f:
            try:
                total_size = f.size()
            except TypeError:
                total_size = f.size
            assert total_size == sum(lengths), "Target file is corrupted"
            for length in lengths:
                buf = f.read(length)
                chunk = writer.next(length)
                buf_writer = pa.FixedSizeBufferWriter(pa.py_buffer(chunk))
                buf_writer.write(buf)
                buf_writer.close()
        writer.finish()
    else:
        # Used when reading tables from external storage,
        # usually for loading a property graph.
        header_row = read_options.get("header_row", False)
        for k, v in read_options.items():
            if k in ("header_row", "include_all_columns"):
                builder[k] = "1" if v else "0"
            elif k == "delimiter":
                builder[k] = bytes(v, "utf-8").decode("unicode_escape")
            else:
                builder[k] = v

        try:
            protocol = split_protocol(path)[0]
            fs = fsspec.filesystem(protocol, **storage_options)
        except Exception:
            report_status("error",
                          f"Cannot initialize such filesystem for '{path}'")
            raise

        if fs.isfile(path):
            files = [path]
        else:
            try:
                files = fs.glob(path + '*')
                assert files, f"Cannot find such files: {path}"
            except Exception:
                report_status("error", f"Cannot find such files for '{path}'")
                raise
        ''' Note [Semantic of read_block with delimiter]:

        read_block(fp, begin, size, delimiter) will:

            - find the first `delimiter` from `begin`, then start reading
            - after `size` bytes, read on until the next `delimiter` or EOF, then
              finish. Note that the returned size may exceed `size`.
        '''

        chunk_size = 1024 * 1024 * 4
        for index, file_path in enumerate(files):
            with fs.open(file_path, mode="rb") as f:
                offset = 0
                # Only process header line when processing first file
                # And open the writer when processing first file
                if index == 0:
                    header_line = read_block(f, 0, 1, b'\n')
                    builder["header_line"] = header_line.decode(
                        "unicode_escape")
                    if header_row:
                        offset = len(header_line)
                    stream = builder.seal(client)
                    client.persist(stream)
                    ret = {"type": "return", "content": repr(stream.id)}
                    print(json.dumps(ret), flush=True)
                    writer = stream.open_writer(client)

                try:
                    total_size = f.size()
                except TypeError:
                    total_size = f.size
                part_size = (total_size - offset) // proc_num
                begin = part_size * proc_index + offset
                end = min(begin + part_size, total_size)

                # See Note [Semantic of read_block with delimiter].
                if index == 0 and proc_index == 0:
                    begin -= int(header_row)

                while begin < end:
                    buf = read_block(f,
                                     begin,
                                     min(chunk_size, end - begin),
                                     delimiter=b"\n")
                    size = len(buf)
                    if not size:
                        break
                    begin += size - 1
                    chunk = writer.next(size)
                    buf_writer = pa.FixedSizeBufferWriter(pa.py_buffer(chunk))
                    buf_writer.write(buf)
                    buf_writer.close()
        writer.finish()
Example #20
def get_fs_and_path(url):
    protocol, path = split_protocol(url)
    fs = fsspec.filesystem(protocol)
    return fs, path
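A minimal demonstration with the in-memory backend, which needs no credentials:

fs, path = get_fs_and_path("memory://tmp/example.txt")
fs.pipe(path, b"hello")  # write bytes at the returned path
print(fs.cat(path))      # b'hello'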
Example #21
def read_bytes(  # noqa: C901
    vineyard_socket: str,
    path: str,
    storage_options: Dict,
    read_options: Dict,
    proc_num: int,
    proc_index: int,
):
    """Read bytes from external storage and produce a ByteStream,
    which will later be assembled into a ParallelStream.

    Args:
        vineyard_socket (str): IPC socket to connect to
        path (str): External storage path to read from
        storage_options (dict): Configuration of the external storage
        read_options (dict): Additional options that control the read behavior
        proc_num (int): Total number of processes
        proc_index (int): The index of this process

    Raises:
        ValueError: If the stream is invalid.
    """
    client = vineyard.connect(vineyard_socket)
    params = dict()

    read_block_delimiter = read_options.pop('read_block_delimiter', '\n')
    if read_block_delimiter is not None:
        read_block_delimiter = read_block_delimiter.encode('utf-8')

    # Used when reading tables from external storage,
    # usually for loading a property graph.
    header_row = read_options.get("header_row", False)
    for k, v in read_options.items():
        if k in ("header_row", "include_all_columns"):
            params[k] = "1" if v else "0"
        elif k == "delimiter":
            params[k] = bytes(v, "utf-8").decode("unicode_escape")
        else:
            params[k] = v

    try:
        protocol = split_protocol(path)[0]
        fs = fsspec.filesystem(protocol, **storage_options)
    except Exception:
        report_error(
            f"Cannot initialize such filesystem for '{path}', "
            f"exception is:\n{traceback.format_exc()}"
        )
        sys.exit(-1)

    if fs.isfile(path):
        files = [path]
    else:
        try:
            files = fs.glob(path + '*')
            assert files, f"Cannot find such files: {path}"
        except Exception:
            report_error(f"Cannot find such files for '{path}'")
            sys.exit(-1)
    ''' Note [Semantic of read_block with delimiter]:

    read_block(fp, begin, size, delimiter) will:

        - find the first `delimiter` from `begin`, then start reading
        - after `size` bytes, read on until the next `delimiter` or EOF, then
          finish. Note that the returned size may exceed `size`.
    '''

    stream, writer = None, None
    if 'chunk_size' in storage_options:
        chunk_size = parse_readable_size(storage_options['chunk_size'])
    else:
        chunk_size = 1024 * 1024 * 64  # default: 64MB

    try:
        for index, file_path in enumerate(files):
            with fs.open(file_path, mode="rb") as f:
                offset = 0
                # Only process header line when processing first file
                # And open the writer when processing first file
                if index == 0:
                    if header_row:
                        header_line = read_block(f, 0, 1, read_block_delimiter)
                        params["header_line"] = header_line.decode("unicode_escape")
                        offset = len(header_line)
                    stream = ByteStream.new(client, params)
                    client.persist(stream.id)
                    report_success(stream.id)
                    writer = stream.open_writer(client)

                try:
                    total_size = f.size()
                except TypeError:
                    total_size = f.size
                part_size = (total_size - offset) // proc_num
                begin = part_size * proc_index + offset
                end = min(begin + part_size, total_size)

                # See Note [Semantic of read_block with delimiter].
                if index == 0 and proc_index == 0:
                    begin -= int(header_row)

                while begin < end:
                    buffer = read_block(
                        f,
                        begin,
                        min(chunk_size, end - begin),
                        delimiter=read_block_delimiter,
                    )
                    size = len(buffer)
                    if size <= 0:
                        break
                    begin += size - 1
                    chunk = writer.next(size)
                    vineyard.memory_copy(chunk, 0, buffer)
        writer.finish()
    except Exception:
        report_exception()
        if writer is not None:
            writer.fail()
        sys.exit(-1)
Example #22
def move_asset_file_to_item(item,
                            asset_href,
                            asset_subdirectory=None,
                            copy=False,
                            ignore_conflicts=False):
    """Moves an asset file to be alongside that item.

    Args:
        item (Item): The PySTAC Item
            to perform the asset transformation on.
        asset_href (str): The absolute HREF to the asset file.
        asset_subdirectory (str or None): A subdirectory that will be used
            to store the assets. If not supplied, the assets will be moved
            or copied to the same directory as their item.
        copy (bool): If False this function will move the asset file; if True,
            the asset file will be copied.
        ignore_conflicts (bool): If the asset destination file already exists,
            this function will throw an error unless ignore_conflicts is True.

    Returns:
        str: The new absolute href for the asset file
    """
    item_href = item.get_self_href()
    if item_href is None:
        raise ValueError(
            'Self HREF is not available for item {}. This operation '
            'requires that the Item HREFs are available.'.format(item))

    if not is_absolute_href(asset_href):
        raise ValueError('asset_href must be absolute.')

    item_dir = os.path.dirname(item_href)

    fname = os.path.basename(asset_href)
    if asset_subdirectory is None:
        target_dir = item_dir
    else:
        target_dir = os.path.join(item_dir, asset_subdirectory)
    new_asset_href = os.path.join(target_dir, fname)

    if asset_href != new_asset_href:
        dest_protocol = split_protocol(new_asset_href)[0]
        fs_dest = get_filesystem_class(dest_protocol)()
        op = None

        if fs_dest.exists(new_asset_href):
            if not ignore_conflicts:
                raise FileExistsError(
                    '{} already exists'.format(new_asset_href))
        else:
            if copy:

                def _op1(dry_run=False):
                    logger.info("Copying {} to {}...".format(
                        asset_href, new_asset_href))
                    if not dry_run:
                        fs_dest.makedirs(os.path.dirname(new_asset_href),
                                         exist_ok=True)
                        with fsspec.open(asset_href, 'rb') as f_src:
                            with fsspec.open(new_asset_href, 'wb') as f_dst:
                                f_dst.write(f_src.read())

                op = _op1
            else:
                source_protocol = split_protocol(asset_href)[0]

                if source_protocol == dest_protocol:

                    def _op2(dry_run=False):
                        logger.info("Moving {} to {}...".format(
                            asset_href, new_asset_href))
                        if not dry_run:
                            fs_dest.makedirs(os.path.dirname(new_asset_href),
                                             exist_ok=True)
                            fs_dest.move(asset_href, new_asset_href)

                    op = _op2
                else:

                    def _op3(dry_run=False):
                        logger.info("Moving {} to {}...".format(
                            asset_href, new_asset_href))
                        if not dry_run:
                            fs_source = get_filesystem_class(source_protocol)()
                            fs_dest.makedirs(os.path.dirname(new_asset_href),
                                             exist_ok=True)
                            with fsspec.open(asset_href, 'rb') as f_src:
                                with fsspec.open(new_asset_href,
                                                 'wb') as f_dst:
                                    f_dst.write(f_src.read())
                            fs_source.delete(asset_href)

                    op = _op3

        if op is not None:
            op(dry_run=False)

    return new_asset_href