Example 1
def test_simple(dir_server):
    root = "http://localhost:8999/"
    fn = files[0]
    f = open_files(root + fn)[0]
    with f as f:
        data = f.read()
    assert data == open(os.path.join(dir_server, fn), "rb").read()
Example 2
def test_list():
    here = os.path.abspath(os.path.dirname(__file__))
    flist = os.listdir(here)
    plist = [os.path.join(here, p) for p in flist]
    of = open_files(plist)
    assert len(of) == len(flist)
    assert [f.path for f in of] == plist
Example 3
def test_pathobject():
    import pathlib

    here = os.path.abspath(os.path.dirname(__file__))
    flist = os.listdir(here)
    plist_str = [os.path.join(here, p) for p in flist]
    plist = [pathlib.Path(p) for p in plist_str]
    of = open_files(plist)
    assert len(of) == len(flist)
    assert [f.path for f in of] == plist_str

    of = open_files(plist[0])
    assert len(of) == 1
    assert of[0].path == plist_str[0]
    with of[0] as f:
        assert f.read() == open(plist_str[0], "rb").read()
Example 4
def test_files(dir_server):
    root = "http://localhost:8999/"
    fs = open_files([root + f for f in files])
    for f, f2 in zip(fs, files):
        with f as f:
            with open(os.path.join(dir_server, f2), "rb") as expected:
                assert f.read() == expected.read()
Example 5
def test_open_files():
    with filetexts(files, mode="b"):
        myfiles = open_files("./.test.accounts.*")
        assert len(myfiles) == len(files)
        for lazy_file, data_file in zip(myfiles, sorted(files)):
            with lazy_file as f:
                x = f.read()
                assert x == files[data_file]
Example 6
def test_pathobject(tmpdir):
    import pathlib

    tmpdir = str(tmpdir)
    plist_str = [os.path.join(str(tmpdir), f).replace("\\", "/") for f in ["a", "b"]]
    open(plist_str[0], "w").write("first file")
    open(plist_str[1], "w").write("second file")
    plist = [pathlib.Path(p) for p in plist_str]
    of = open_files(plist)
    assert len(of) == 2
    assert [f.path for f in of] == plist_str

    of = open_files(plist[0])
    assert len(of) == 1
    assert of[0].path == plist_str[0]
    with of[0] as f:
        assert f.read() == open(plist_str[0], "rb").read()
Example 7
def test_open_files_text_mode(encoding):
    with filetexts(files, mode="b"):
        myfiles = open_files("./.test.accounts.*", mode="rt", encoding=encoding)
        assert len(myfiles) == len(files)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        assert list(data) == [files[k].decode(encoding) for k in sorted(files)]
Example 8
def test_fetch_range_with_headers(dir_server):
    # https://github.com/dask/dask/issues/4479
    root = "http://localhost:8999/"
    fn = files[0]
    headers = {"Date": "Wed, 21 Oct 2015 07:28:00 GMT"}
    f = open_files(root + fn, headers=headers)[0]
    with f as f:
        data = f.read(length=1) + f.read(length=-1)
    assert data == open(os.path.join(dir_server, fn), "rb").read()
Example 9
def test_py2_local_bytes(tmpdir):
    fn = str(tmpdir / "myfile.txt.gz")
    with gzip.open(fn, mode="wb") as f:
        f.write(b"hello\nworld")

    files = open_files(fn, compression="gzip", mode="rt")

    with files[0] as f:
        assert all(isinstance(line, str) for line in f)
Example 10
def test_open_files_write(s3, s3so):
    paths = ["s3://" + test_bucket_name + "/more/" + f for f in files]
    fils = open_files(paths, mode="wb", **s3so)
    for fil, data in zip(fils, files.values()):
        with fil as f:
            f.write(data)
    sample, values = read_bytes(
        "s3://" + test_bucket_name + "/more/test/accounts.*", **s3so)
    results = compute(*concat(values))
    assert set(list(files.values())) == set(results)
Example 11
def test_open_files(s3, mode, s3so):
    myfiles = open_files("s3://" + test_bucket_name + "/test/accounts.*",
                         mode=mode,
                         **s3so)
    assert len(myfiles) == len(files)
    for lazy_file, path in zip(myfiles, sorted(files)):
        with lazy_file as f:
            data = f.read()
            sol = files[path]
            assert data == sol if mode == "rb" else sol.decode()
Example 12
def test_ops_blocksize(dir_server):
    root = "http://localhost:8999/"
    fn = files[0]
    f = open_files(root + fn, block_size=2)[0]
    data = open(os.path.join(dir_server, fn), "rb").read()
    with f as f:
        # it's OK to read the whole file
        assert f.read() == data
        # and now the file magically has a size
        assert f.size == len(data)

    # note that if we reuse f from above, because it is tokenized, we get
    # the same open file - where is this cached?
    fn = files[1]
    f = open_files(root + fn, block_size=2)[0]
    with f as f:
        # fails because we want only 12 bytes
        with pytest.raises(ValueError):
            assert f.read(10) == data[:10]
Example 13
def test_pickability_of_lazy_files(tmpdir):
    cloudpickle = pytest.importorskip('cloudpickle')

    with filetexts(files, mode='b'):
        myfiles = open_files('./.test.accounts.*')
        myfiles2 = cloudpickle.loads(cloudpickle.dumps(myfiles))

        for f, f2 in zip(myfiles, myfiles2):
            assert f.path == f2.path
            assert type(f.fs) == type(f2.fs)
            with f as f_open, f2 as f2_open:
                assert f_open.read() == f2_open.read()
Example 14
def test_multi_context(tmpdir):
    fns = [os.path.join(tmpdir, fn) for fn in ["a", "b"]]
    files = open_files(fns, "wb")
    assert isinstance(files, OpenFiles)
    assert isinstance(files[0], OpenFile)
    assert len(files) == 2
    with files as of:
        assert len(of) == 2
        assert not of[0].closed
        assert of[0].name.endswith("a")
    assert of[0].closed
    assert repr(files) == "<List of 2 OpenFile instances>"
Example 15
def test_pickability_of_lazy_files(tmpdir):
    tmpdir = str(tmpdir)

    with filetexts(files, mode="b"):
        myfiles = open_files(".test.accounts.*")
        myfiles2 = cloudpickle.loads(cloudpickle.dumps(myfiles))

        for f, f2 in zip(myfiles, myfiles2):
            assert f.path == f2.path
            assert type(f.fs) == type(f2.fs)
            with f as f_open, f2 as f2_open:
                assert f_open.read() == f2_open.read()
Example 16
def test_errors(dir_server):
    f = open_files("http://localhost:8999/doesnotexist")[0]
    with pytest.raises(errs):
        with f as f:
            f.read()
    f = open_files("http://nohost/")[0]

    expected = FileNotFoundError

    with pytest.raises(expected):
        with f as f:
            f.read()
    root = "http://localhost:8999/"
    fn = files[0]
    f = open_files(root + fn, mode="wb")[0]
    with pytest.raises(NotImplementedError):
        with f:
            pass
    f = open_files(root + fn)[0]
    with f as f:
        with pytest.raises(ValueError):
            f.seek(-1)
Example 17
def test_pickability_of_lazy_files(tmpdir):
    tmpdir = str(tmpdir)
    cloudpickle = pytest.importorskip("cloudpickle")

    with filetexts(files, mode="b"):
        myfiles = open_files("./.test.accounts.*")
        myfiles2 = cloudpickle.loads(cloudpickle.dumps(myfiles))

        for f, f2 in zip(myfiles, myfiles2):
            assert f.path == f2.path
            assert isinstance(f.fs, type(f2.fs))
            with f as f_open, f2 as f2_open:
                assert f_open.read() == f2_open.read()
Example 18
def test_open_files_write(hdfs):
    path = "hdfs://%s/" % basedir
    data = [b"test data %i" % i for i in range(5)]

    files = open_files(path, num=len(data), mode="wb")
    for fil, b in zip(files, data):
        with fil as f:
            f.write(b)

    sample, vals = read_bytes("hdfs://%s/*.part" % basedir)

    (results,) = dask.compute(list(concat(vals)))
    assert data == results
Example 19
def test_ops(dir_server, block_size):
    root = "http://localhost:8999/"
    fn = files[0]
    f = open_files(root + fn)[0]
    data = open(os.path.join(dir_server, fn), "rb").read()
    with f as f:
        # these pass because the default block size covers the whole file
        assert f.read(10) == data[:10]
        f.seek(0)
        assert f.read(10) == data[:10]
        assert f.read(10) == data[10:20]
        f.seek(-10, 2)
        assert f.read() == data[-10:]
Example 20
    def _open_dataset(self):
        """
        Main entry function that finds a set of files and passes them to the
        reader.
        """
        from fsspec.core import open_files

        files = open_files(self.urlpath, **self.storage_options)
        if len(files) == 0:
            raise Exception("No files found at {}".format(self.urlpath))
        if len(files) == 1:
            self._ds = reader(files[0], self.chunks, **self._kwargs)
        else:
            self._ds = self._open_files(files)
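
The snippet above follows a common fsspec pattern: expand the urlpath with open_files, fail early when nothing matches, then hand each lazy OpenFile to a reader. A minimal standalone sketch of that pattern (the local glob and the line read are hypothetical placeholders; only fsspec.core.open_files itself is assumed):

from fsspec.core import open_files

# Hypothetical local glob; any fsspec-supported protocol works the same way.
files = open_files("./data/*.csv")
if len(files) == 0:
    raise FileNotFoundError("No files found at ./data/*.csv")

# Each OpenFile is lazy: the real file object only exists inside the context.
for of in files:
    with of as f:
        header = f.readline()  # e.g. peek at the first line of each file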
Example 21
def test_open_files_compression(mode, fmt):
    if fmt not in compress:
        pytest.skip("compression function not provided")
    files2 = valmap(compress[fmt], files)
    with filetexts(files2, mode="b"):
        myfiles = open_files(".test.accounts.*", mode=mode, compression=fmt)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        sol = [files[k] for k in sorted(files)]
        if mode == "rt":
            sol = [b.decode() for b in sol]
        assert list(data) == sol
Example 22
def test_loc(dir_server):
    root = "http://localhost:8999/"
    fn = files[0]
    f = open_files(root + fn)[0]
    expected = open(os.path.join(dir_server, fn), "rb").read()
    with f as f:
        data = f.read(2)
        assert data == expected[:2]
        assert f.loc == 2
        f.seek(0)
        data = f.read(3)
        assert data == expected[:3]
        f.seek(1, 1)
        assert f.loc == 4
Example 23
def test_open_files_write(tmpdir, compression_opener):
    compression, opener = compression_opener
    tmpdir = str(tmpdir)
    files = open_files(tmpdir, num=2, mode='wb', compression=compression)
    assert len(files) == 2
    assert {f.mode for f in files} == {'wb'}
    for fil in files:
        with fil as f:
            f.write(b'000')
    files = sorted(os.listdir(tmpdir))
    assert files == ['0.part', '1.part']

    with opener(os.path.join(tmpdir, files[0]), 'rb') as f:
        d = f.read()
    assert d == b'000'
Example 24
def test_open_files_write(tmpdir, compression_opener):
    compression, opener = compression_opener
    tmpdir = str(tmpdir)
    files = open_files(tmpdir, num=2, mode="wb", compression=compression)
    assert len(files) == 2
    assert {f.mode for f in files} == {"wb"}
    for fil in files:
        with fil as f:
            f.write(b"000")
    files = sorted(os.listdir(tmpdir))
    assert files == ["0.part", "1.part"]

    with opener(os.path.join(tmpdir, files[0]), "rb") as f:
        d = f.read()
    assert d == b"000"
Example 25
def test_ops_blocksize(dir_server):
    root = "http://localhost:8999/"
    fn = files[0]
    f = open_files(root + fn, block_size=2)[0]
    with open(os.path.join(dir_server, fn), "rb") as expected:
        expected = expected.read()
        with f as f:
            # it's OK to read the whole file
            assert f.read() == expected
            # and now the file magically has a size
            assert f.size == len(expected)

        # note that if we reuse f from above, because it is tokenized, we get
        # the same open file - where is this cached?
        fn = files[1]
        f = open_files(root + fn, block_size=2)[0]
        with f as f:
            if parse_version(fsspec.__version__) < parse_version("2021.11.1"):
                # fails because we want only 12 bytes
                with pytest.raises(ValueError):
                    assert f.read(10) == expected[:10]
            else:
                # fixed in https://github.com/fsspec/filesystem_spec/pull/830
                assert f.read(10) == expected[:10]
Example 26
def read_text(
    urlpath,
    blocksize=None,
    compression="infer",
    encoding=system_encoding,
    errors="strict",
    linedelimiter=None,
    collection=True,
    storage_options=None,
    files_per_partition=None,
    include_path=False,
):
    """Read lines from text files

    Parameters
    ----------
    urlpath : string or list
        Absolute or relative filepath(s). Prefix with a protocol like ``s3://``
        to read from alternative filesystems. To read from multiple files you
        can pass a globstring or a list of paths, with the caveat that they
        must all have the same protocol.
    blocksize: None, int, or str
        Size (in bytes) to cut up larger files.  Streams by default.
        Can be ``None`` for streaming, an integer number of bytes, or a string
        like "128MiB"
    compression: string
        Compression format like 'gzip' or 'xz'.  Defaults to 'infer'
    encoding: string
    errors: string
    linedelimiter: string or None
    collection: bool, optional
        Return dask.bag if True, or list of delayed values if false
    storage_options: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.
    files_per_partition: None or int
        If set, group input files into partitions of the requested size,
        instead of one partition per file. Mutually exclusive with blocksize.
    include_path: bool
        Whether or not to include the path in the bag.
        If true, elements are tuples of (line, path).
        Default is False.

    Examples
    --------
    >>> b = read_text('myfiles.1.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt.gz')  # doctest: +SKIP
    >>> b = read_text('s3://bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('s3://key:secret@bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('hdfs://namenode.example.com/myfiles.*.txt')  # doctest: +SKIP

    Parallelize a large file by providing the number of uncompressed bytes to
    load into each partition.

    >>> b = read_text('largefile.txt', blocksize='10MB')  # doctest: +SKIP

    Get file paths of the bag by setting include_path=True

    >>> b = read_text('myfiles.*.txt', include_path=True) # doctest: +SKIP
    >>> b.take(1) # doctest: +SKIP
    (('first line of the first file', '/home/dask/myfiles.0.txt'),)

    Returns
    -------
    dask.bag.Bag or list
        dask.bag.Bag if collection is True or list of Delayed lists otherwise.

    See Also
    --------
    from_sequence: Build bag from Python sequence
    """
    if blocksize is not None and files_per_partition is not None:
        raise ValueError(
            "Only one of blocksize or files_per_partition can be set")
    if isinstance(blocksize, str):
        blocksize = parse_bytes(blocksize)

    if blocksize is None:
        if linedelimiter in [None, "", "\n", "\r", "\r\n"]:
            newline = linedelimiter
            linedelimiter = None
        else:
            newline = ""
        files = open_files(urlpath,
                           mode="rt",
                           encoding=encoding,
                           errors=errors,
                           compression=compression,
                           newline=newline,
                           **(storage_options or {}))
        if files_per_partition is None:
            blocks = [
                delayed(list)(delayed(
                    partial(file_to_blocks,
                            include_path,
                            delimiter=linedelimiter))(fil)) for fil in files
            ]
        else:
            blocks = []
            for start in range(0, len(files), files_per_partition):
                block_files = files[start:(start + files_per_partition)]
                block_lines = delayed(concat)(delayed(map)(
                    partial(file_to_blocks,
                            include_path,
                            delimiter=linedelimiter),
                    block_files,
                ))
                blocks.append(block_lines)
    else:
        # special case for linedelimiter=None: we will need to split on an actual bytestring
        # and the line reader will then use "universal" mode. Just as well that \r\n and \n
        # will both work (thankfully \r for MacOS is no longer a thing)
        o = read_bytes(urlpath,
                       delimiter=linedelimiter.encode()
                       if linedelimiter is not None else b"\n",
                       blocksize=blocksize,
                       sample=False,
                       compression=compression,
                       include_path=include_path,
                       **(storage_options or {}))
        raw_blocks = o[1]
        blocks = [
            delayed(decode)(b, encoding, errors, linedelimiter)
            for b in concat(raw_blocks)
        ]
        if include_path:
            paths = list(
                concat([[path] * len(raw_blocks[i])
                        for i, path in enumerate(o[2])]))
            blocks = [
                delayed(attach_path)(entry, path)
                for entry, path in zip(blocks, paths)
            ]

    if not blocks:
        raise ValueError("No files found", urlpath)

    if collection:
        blocks = from_delayed(blocks)

    return blocks
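
A brief usage sketch of the three partitioning modes described in the docstring above; the logs/*.txt glob is a hypothetical path:

import dask.bag as db

# Default: streaming, one partition per input file.
b1 = db.read_text("logs/*.txt")

# Line-delimited chunking into roughly 16 MiB partitions.
b2 = db.read_text("logs/*.txt", blocksize="16MiB")

# Group small files, ten per partition (mutually exclusive with blocksize).
b3 = db.read_text("logs/*.txt", files_per_partition=10)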
Example 27
def test_mismatch():
    with pytest.raises(ValueError, match="protocol"):
        open_files(["s3://test/path.csv", "/other/path.csv"])
Example 28
File: json.py Project: keewis/dask
def to_json(
    df,
    url_path,
    orient="records",
    lines=None,
    storage_options=None,
    compute=True,
    encoding="utf-8",
    errors="strict",
    compression=None,
    compute_kwargs=None,
    name_function=None,
    **kwargs,
):
    """Write dataframe into JSON text files

    This utilises ``pandas.DataFrame.to_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    produces the kind of JSON output that is most common in big-data
    applications, and which can be chunked when reading (see ``read_json()``).

    Parameters
    ----------
    df: dask.DataFrame
        Data to save
    url_path: str, list of str
        Location to write to. If a string, and there are more than one
        partitions in df, should include a glob character to expand into a
        set of file names, or provide a ``name_function=`` parameter.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    compute: bool
        If true, immediately executes. If False, returns a set of delayed
        objects, which can be computed at a later time.
    compute_kwargs : dict, optional
        Options to be passed in to the compute method
    encoding, errors:
        Text conversion, see ``str.encode()``
    compression : string or None
        String like 'gzip' or 'xz'.
    name_function : callable, default None
        Function accepting an integer (partition index) and producing a
        string to replace the asterisk in the given filename globstring.
        Should preserve the lexicographic order of partitions.
    """
    if lines is None:
        lines = orient == "records"
    if orient != "records" and lines:
        raise ValueError(
            "Line-delimited JSON is only available with" 'orient="records".'
        )
    kwargs["orient"] = orient
    kwargs["lines"] = lines and orient == "records"
    outfiles = open_files(
        url_path,
        "wt",
        encoding=encoding,
        errors=errors,
        name_function=name_function,
        num=df.npartitions,
        compression=compression,
        **(storage_options or {}),
    )
    parts = [
        delayed(write_json_partition)(d, outfile, kwargs)
        for outfile, d in zip(outfiles, df.to_delayed())
    ]
    if compute:
        if compute_kwargs is None:
            compute_kwargs = dict()
        return list(dask_compute(*parts, **compute_kwargs))
    else:
        return parts
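
A short usage sketch of to_json under the defaults described above; the toy frame and the out/ directory are hypothetical:

import pandas as pd
import dask.dataframe as dd

# Hypothetical two-partition frame, just to exercise the '*' expansion.
df = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)

# Writes one line-delimited JSON file per partition; '*' becomes the
# partition index unless a name_function is supplied.
dd.to_json(df, "out/records-*.json")

# compute=False returns the delayed writes instead of executing them.
parts = dd.to_json(df, "out/records-*.json", compute=False)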
Example 29
File: json.py Project: keewis/dask
def read_json(
    url_path,
    orient="records",
    lines=None,
    storage_options=None,
    blocksize=None,
    sample=2 ** 20,
    encoding="utf-8",
    errors="strict",
    compression="infer",
    meta=None,
    engine=pd.read_json,
    **kwargs,
):
    """Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON output
    that is most common in big-data scenarios, and which can be chunked when
    reading (see ``read_json()``). All other options require blocksize=None,
    i.e., one partition per input file.

    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the nearest
        newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe structure
        to any blocks without data. Only relevant if using blocksize.
    encoding, errors:
        Text conversion, see ``bytes.decode()``
    compression : string or None
        String like 'gzip' or 'xz'.
    engine : function object, default ``pd.read_json``
        The underlying function that dask will use to read JSON files. By
        default, this will be the pandas JSON reader (``pd.read_json``).
    $META

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx
    256MB size

    >> dd.read_json('data/file*.csv', blocksize=2**28)
    """
    if lines is None:
        lines = orient == "records"
    if orient != "records" and lines:
        raise ValueError(
            "Line-delimited JSON is only available with" 'orient="records".'
        )
    if blocksize and (orient != "records" or not lines):
        raise ValueError(
            "JSON file chunking only allowed for JSON-lines"
            "input (orient='records', lines=True)."
        )
    storage_options = storage_options or {}
    if blocksize:
        first, chunks = read_bytes(
            url_path,
            b"\n",
            blocksize=blocksize,
            sample=sample,
            compression=compression,
            **storage_options,
        )
        chunks = list(flatten(chunks))
        if meta is None:
            meta = read_json_chunk(first, encoding, errors, engine, kwargs)
        meta = make_meta(meta)
        parts = [
            delayed(read_json_chunk)(chunk, encoding, errors, engine, kwargs, meta=meta)
            for chunk in chunks
        ]
        return from_delayed(parts, meta=meta)
    else:
        files = open_files(
            url_path,
            "rt",
            encoding=encoding,
            errors=errors,
            compression=compression,
            **storage_options,
        )
        parts = [
            delayed(read_json_file)(f, orient, lines, engine, kwargs) for f in files
        ]
        return from_delayed(parts, meta=meta)
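
Reading line-delimited output back, with and without byte-range chunking; the out/records-*.json glob and the single-column meta are hypothetical and only illustrate the blocksize and meta parameters documented above:

import pandas as pd
import dask.dataframe as dd

# One partition per input file (works for any orient).
df = dd.read_json("out/records-*.json")

# Byte-range chunking, only valid for orient='records' with lines=True.
df = dd.read_json("out/records-*.json", blocksize=2**26)

# Supplying meta skips inferring the schema from the sampled bytes.
meta = pd.DataFrame({"x": pd.Series(dtype="int64")})
df = dd.read_json("out/records-*.json", blocksize=2**26, meta=meta)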
Example 30
def test_open_glob(dir_server):
    root = "http://localhost:8999/"
    fs = open_files(root + "/*")
    assert fs[0].path == "http://localhost:8999/a"
    assert fs[1].path == "http://localhost:8999/b"