def test_simple(dir_server):
    root = "http://localhost:8999/"
    fn = files[0]
    f = open_files(root + fn)[0]
    with f as f:
        data = f.read()
    assert data == open(os.path.join(dir_server, fn), "rb").read()
def test_list():
    here = os.path.abspath(os.path.dirname(__file__))
    flist = os.listdir(here)
    plist = [os.path.join(here, p) for p in flist]
    of = open_files(plist)
    assert len(of) == len(flist)
    assert [f.path for f in of] == plist
def test_pathobject():
    import pathlib

    here = os.path.abspath(os.path.dirname(__file__))
    flist = os.listdir(here)
    plist_str = [os.path.join(here, p) for p in flist]
    plist = [pathlib.Path(p) for p in plist_str]
    of = open_files(plist)
    assert len(of) == len(flist)
    assert [f.path for f in of] == plist_str

    of = open_files(plist[0])
    assert len(of) == 1
    assert of[0].path == plist_str[0]
    with of[0] as f:
        assert f.read() == open(plist_str[0], "rb").read()
def test_files(dir_server):
    root = "http://localhost:8999/"
    fs = open_files([root + f for f in files])
    for f, f2 in zip(fs, files):
        with f as f:
            with open(os.path.join(dir_server, f2), "rb") as expected:
                assert f.read() == expected.read()
def test_open_files():
    with filetexts(files, mode="b"):
        myfiles = open_files("./.test.accounts.*")
        assert len(myfiles) == len(files)
        for lazy_file, data_file in zip(myfiles, sorted(files)):
            with lazy_file as f:
                x = f.read()
                assert x == files[data_file]
def test_pathobject(tmpdir):
    import pathlib

    tmpdir = str(tmpdir)
    plist_str = [os.path.join(str(tmpdir), f).replace("\\", "/") for f in ["a", "b"]]
    open(plist_str[0], "w").write("first file")
    open(plist_str[1], "w").write("second file")
    plist = [pathlib.Path(p) for p in plist_str]
    of = open_files(plist)
    assert len(of) == 2
    assert [f.path for f in of] == plist_str

    of = open_files(plist[0])
    assert len(of) == 1
    assert of[0].path == plist_str[0]
    with of[0] as f:
        assert f.read() == open(plist_str[0], "rb").read()
def test_open_files_text_mode(encoding):
    with filetexts(files, mode="b"):
        myfiles = open_files("./.test.accounts.*", mode="rt", encoding=encoding)
        assert len(myfiles) == len(files)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        assert list(data) == [files[k].decode(encoding) for k in sorted(files)]
def test_fetch_range_with_headers(dir_server):
    # https://github.com/dask/dask/issues/4479
    root = "http://localhost:8999/"
    fn = files[0]
    headers = {"Date": "Wed, 21 Oct 2015 07:28:00 GMT"}
    f = open_files(root + fn, headers=headers)[0]
    with f as f:
        data = f.read(length=1) + f.read(length=-1)
    assert data == open(os.path.join(dir_server, fn), "rb").read()
def test_py2_local_bytes(tmpdir):
    fn = str(tmpdir / "myfile.txt.gz")
    with gzip.open(fn, mode="wb") as f:
        f.write(b"hello\nworld")

    files = open_files(fn, compression="gzip", mode="rt")
    with files[0] as f:
        assert all(isinstance(line, str) for line in f)
def test_open_files_write(s3, s3so):
    paths = ["s3://" + test_bucket_name + "/more/" + f for f in files]
    fils = open_files(paths, mode="wb", **s3so)
    for fil, data in zip(fils, files.values()):
        with fil as f:
            f.write(data)
    sample, values = read_bytes(
        "s3://" + test_bucket_name + "/more/test/accounts.*", **s3so
    )
    results = compute(*concat(values))
    assert set(list(files.values())) == set(results)
def test_open_files(s3, mode, s3so):
    myfiles = open_files(
        "s3://" + test_bucket_name + "/test/accounts.*", mode=mode, **s3so
    )
    assert len(myfiles) == len(files)
    for lazy_file, path in zip(myfiles, sorted(files)):
        with lazy_file as f:
            data = f.read()
            sol = files[path]
            # parenthesise the conditional so the comparison applies in both modes
            assert data == (sol if mode == "rb" else sol.decode())
def test_ops_blocksize(dir_server):
    root = "http://localhost:8999/"
    fn = files[0]
    f = open_files(root + fn, block_size=2)[0]
    data = open(os.path.join(dir_server, fn), "rb").read()
    with f as f:
        # it's OK to read the whole file
        assert f.read() == data
        # and now the file magically has a size
        assert f.size == len(data)

    # note that if we reuse f from above, because it is tokenized, we get
    # the same open file - where is this cached?
    fn = files[1]
    f = open_files(root + fn, block_size=2)[0]
    with f as f:
        # fails because we want only 12 bytes
        with pytest.raises(ValueError):
            assert f.read(10) == data[:10]
def test_pickability_of_lazy_files(tmpdir):
    cloudpickle = pytest.importorskip('cloudpickle')

    with filetexts(files, mode='b'):
        myfiles = open_files('./.test.accounts.*')
        myfiles2 = cloudpickle.loads(cloudpickle.dumps(myfiles))

        for f, f2 in zip(myfiles, myfiles2):
            assert f.path == f2.path
            assert type(f.fs) == type(f2.fs)
            with f as f_open, f2 as f2_open:
                assert f_open.read() == f2_open.read()
def test_multi_context(tmpdir):
    fns = [os.path.join(tmpdir, fn) for fn in ["a", "b"]]
    files = open_files(fns, "wb")
    assert isinstance(files, OpenFiles)
    assert isinstance(files[0], OpenFile)
    assert len(files) == 2
    with files as of:
        assert len(of) == 2
        assert not of[0].closed
        assert of[0].name.endswith("a")
    assert of[0].closed
    assert repr(files) == "<List of 2 OpenFile instances>"
def test_pickability_of_lazy_files(tmpdir):
    tmpdir = str(tmpdir)

    with filetexts(files, mode="b"):
        myfiles = open_files(".test.accounts.*")
        myfiles2 = cloudpickle.loads(cloudpickle.dumps(myfiles))

        for f, f2 in zip(myfiles, myfiles2):
            assert f.path == f2.path
            assert type(f.fs) == type(f2.fs)
            with f as f_open, f2 as f2_open:
                assert f_open.read() == f2_open.read()
def test_errors(dir_server):
    f = open_files("http://localhost:8999/doesnotexist")[0]
    with pytest.raises(errs):
        with f as f:
            f.read()

    f = open_files("http://nohost/")[0]
    expected = FileNotFoundError
    with pytest.raises(expected):
        with f as f:
            f.read()

    root = "http://localhost:8999/"
    fn = files[0]
    f = open_files(root + fn, mode="wb")[0]
    with pytest.raises(NotImplementedError):
        with f:
            pass

    f = open_files(root + fn)[0]
    with f as f:
        with pytest.raises(ValueError):
            f.seek(-1)
def test_pickability_of_lazy_files(tmpdir):
    tmpdir = str(tmpdir)
    cloudpickle = pytest.importorskip("cloudpickle")

    with filetexts(files, mode="b"):
        myfiles = open_files("./.test.accounts.*")
        myfiles2 = cloudpickle.loads(cloudpickle.dumps(myfiles))

        for f, f2 in zip(myfiles, myfiles2):
            assert f.path == f2.path
            assert isinstance(f.fs, type(f2.fs))
            with f as f_open, f2 as f2_open:
                assert f_open.read() == f2_open.read()
def test_open_files_write(hdfs):
    path = "hdfs://%s/" % basedir
    data = [b"test data %i" % i for i in range(5)]

    files = open_files(path, num=len(data), mode="wb")
    for fil, b in zip(files, data):
        with fil as f:
            f.write(b)

    sample, vals = read_bytes("hdfs://%s/*.part" % basedir)
    (results,) = dask.compute(list(concat(vals)))
    assert data == results
def test_ops(dir_server, block_size):
    root = "http://localhost:8999/"
    fn = files[0]
    f = open_files(root + fn)[0]
    data = open(os.path.join(dir_server, fn), "rb").read()
    with f as f:
        # these pass because the default
        assert f.read(10) == data[:10]
        f.seek(0)
        assert f.read(10) == data[:10]
        assert f.read(10) == data[10:20]
        f.seek(-10, 2)
        assert f.read() == data[-10:]
def _open_dataset(self):
    """
    Main entry function that finds a set of files and passes them to the
    reader.
    """
    from fsspec.core import open_files

    files = open_files(self.urlpath, **self.storage_options)
    if len(files) == 0:
        raise Exception("No files found at {}".format(self.urlpath))
    if len(files) == 1:
        self._ds = reader(files[0], self.chunks, **self._kwargs)
    else:
        self._ds = self._open_files(files)
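# A minimal, hedged sketch of the fsspec pattern that _open_dataset above relies on:
# open_files expands a URL or glob into a list of OpenFile objects, and each OpenFile
# is a context manager that yields a file-like object. The glob "./data/*.nc" and the
# 4-byte peek are illustrative assumptions, not part of the original source.
from fsspec.core import open_files

ofs = open_files("./data/*.nc")  # hypothetical local glob; returns a list of OpenFile
for of in ofs:
    with of as f:  # entering the context gives a binary file-like object (default mode "rb")
        print(of.path, f.read(4))  # e.g. peek at the first few bytes of each file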
def test_open_files_compression(mode, fmt):
    if fmt not in compress:
        pytest.skip("compression function not provided")
    files2 = valmap(compress[fmt], files)
    with filetexts(files2, mode="b"):
        myfiles = open_files(".test.accounts.*", mode=mode, compression=fmt)
        data = []
        for file in myfiles:
            with file as f:
                data.append(f.read())
        sol = [files[k] for k in sorted(files)]
        if mode == "rt":
            sol = [b.decode() for b in sol]
        assert list(data) == sol
def test_loc(dir_server):
    root = "http://localhost:8999/"
    fn = files[0]
    f = open_files(root + fn)[0]
    expected = open(os.path.join(dir_server, fn), "rb").read()
    with f as f:
        data = f.read(2)
        assert data == expected[:2]
        assert f.loc == 2
        f.seek(0)
        data = f.read(3)
        assert data == expected[:3]
        f.seek(1, 1)
        assert f.loc == 4
def test_open_files_write(tmpdir, compression_opener):
    compression, opener = compression_opener
    tmpdir = str(tmpdir)
    files = open_files(tmpdir, num=2, mode="wb", compression=compression)
    assert len(files) == 2
    assert {f.mode for f in files} == {"wb"}
    for fil in files:
        with fil as f:
            f.write(b"000")
    files = sorted(os.listdir(tmpdir))
    assert files == ["0.part", "1.part"]

    with opener(os.path.join(tmpdir, files[0]), "rb") as f:
        d = f.read()
        assert d == b"000"
def test_ops_blocksize(dir_server):
    root = "http://localhost:8999/"
    fn = files[0]
    f = open_files(root + fn, block_size=2)[0]
    with open(os.path.join(dir_server, fn), "rb") as expected:
        expected = expected.read()
    with f as f:
        # it's OK to read the whole file
        assert f.read() == expected
        # and now the file magically has a size
        assert f.size == len(expected)

    # note that if we reuse f from above, because it is tokenized, we get
    # the same open file - where is this cached?
    fn = files[1]
    f = open_files(root + fn, block_size=2)[0]
    with f as f:
        if parse_version(fsspec.__version__) < parse_version("2021.11.1"):
            # fails because we want only 12 bytes
            with pytest.raises(ValueError):
                assert f.read(10) == expected[:10]
        else:
            # fixed in https://github.com/fsspec/filesystem_spec/pull/830
            assert f.read(10) == expected[:10]
def read_text(
    urlpath,
    blocksize=None,
    compression="infer",
    encoding=system_encoding,
    errors="strict",
    linedelimiter=None,
    collection=True,
    storage_options=None,
    files_per_partition=None,
    include_path=False,
):
    """Read lines from text files

    Parameters
    ----------
    urlpath : string or list
        Absolute or relative filepath(s). Prefix with a protocol like
        ``s3://`` to read from alternative filesystems. To read from multiple
        files you can pass a globstring or a list of paths, with the caveat
        that they must all have the same protocol.
    blocksize: None, int, or str
        Size (in bytes) to cut up larger files. Streams by default.
        Can be ``None`` for streaming, an integer number of bytes, or a
        string like "128MiB"
    compression: string
        Compression format like 'gzip' or 'xz'. Defaults to 'infer'
    encoding: string
    errors: string
    linedelimiter: string or None
    collection: bool, optional
        Return dask.bag if True, or list of delayed values if false
    storage_options: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.
    files_per_partition: None or int
        If set, group input files into partitions of the requested size,
        instead of one partition per file. Mutually exclusive with blocksize.
    include_path: bool
        Whether or not to include the path in the bag. If true, elements are
        tuples of (line, path). Default is False.

    Examples
    --------
    >>> b = read_text('myfiles.1.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt.gz')  # doctest: +SKIP
    >>> b = read_text('s3://bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('s3://key:secret@bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('hdfs://namenode.example.com/myfiles.*.txt')  # doctest: +SKIP

    Parallelize a large file by providing the number of uncompressed bytes to
    load into each partition.

    >>> b = read_text('largefile.txt', blocksize='10MB')  # doctest: +SKIP

    Get file paths of the bag by setting include_path=True

    >>> b = read_text('myfiles.*.txt', include_path=True)  # doctest: +SKIP
    >>> b.take(1)  # doctest: +SKIP
    (('first line of the first file', '/home/dask/myfiles.0.txt'),)

    Returns
    -------
    dask.bag.Bag or list
        dask.bag.Bag if collection is True or list of Delayed lists otherwise.

    See Also
    --------
    from_sequence: Build bag from Python sequence
    """
    if blocksize is not None and files_per_partition is not None:
        raise ValueError("Only one of blocksize or files_per_partition can be set")
    if isinstance(blocksize, str):
        blocksize = parse_bytes(blocksize)

    if blocksize is None:
        if linedelimiter in [None, "", "\n", "\r", "\r\n"]:
            newline = linedelimiter
            linedelimiter = None
        else:
            newline = ""
        files = open_files(
            urlpath,
            mode="rt",
            encoding=encoding,
            errors=errors,
            compression=compression,
            newline=newline,
            **(storage_options or {}),
        )
        if files_per_partition is None:
            blocks = [
                delayed(list)(
                    delayed(
                        partial(file_to_blocks, include_path, delimiter=linedelimiter)
                    )(fil)
                )
                for fil in files
            ]
        else:
            blocks = []
            for start in range(0, len(files), files_per_partition):
                block_files = files[start : (start + files_per_partition)]
                block_lines = delayed(concat)(
                    delayed(map)(
                        partial(file_to_blocks, include_path, delimiter=linedelimiter),
                        block_files,
                    )
                )
                blocks.append(block_lines)
    else:
        # special case for linedelimiter=None: we will need to split on an actual
        # bytestring and the line reader will then use "universal" mode. Just as well
        # that \r\n and \n will both work (thankfully \r for MacOS is no longer a thing)
        o = read_bytes(
            urlpath,
            delimiter=linedelimiter.encode() if linedelimiter is not None else b"\n",
            blocksize=blocksize,
            sample=False,
            compression=compression,
            include_path=include_path,
            **(storage_options or {}),
        )
        raw_blocks = o[1]
        blocks = [
            delayed(decode)(b, encoding, errors, linedelimiter)
            for b in concat(raw_blocks)
        ]
        if include_path:
            paths = list(
                concat([[path] * len(raw_blocks[i]) for i, path in enumerate(o[2])])
            )
            blocks = [
                delayed(attach_path)(entry, path)
                for entry, path in zip(blocks, paths)
            ]

    if not blocks:
        raise ValueError("No files found", urlpath)

    if collection:
        blocks = from_delayed(blocks)

    return blocks
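# A hedged usage sketch for read_text above (the file names are assumptions): with the
# default blocksize=None every input file becomes one partition, optionally grouped via
# files_per_partition, while an integer or string blocksize splits files into byte
# ranges that end on the line delimiter. blocksize and files_per_partition are mutually
# exclusive, as the ValueError at the top of the function enforces.
import dask.bag as db

b_per_file = db.read_text("logs/*.txt")                        # one partition per file
b_grouped = db.read_text("logs/*.txt", files_per_partition=4)  # four files per partition
b_chunked = db.read_text("logs/big.txt", blocksize="16MiB")    # ~16 MiB blocks, split on newlines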
def test_mismatch():
    with pytest.raises(ValueError, match="protocol"):
        open_files(["s3://test/path.csv", "/other/path.csv"])
def to_json(
    df,
    url_path,
    orient="records",
    lines=None,
    storage_options=None,
    compute=True,
    encoding="utf-8",
    errors="strict",
    compression=None,
    compute_kwargs=None,
    name_function=None,
    **kwargs,
):
    """Write dataframe into JSON text files

    This utilises ``pandas.DataFrame.to_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    produces the kind of JSON output that is most common in big-data
    applications, and which can be chunked when reading (see ``read_json()``).

    Parameters
    ----------
    df: dask.DataFrame
        Data to save
    url_path: str, list of str
        Location to write to. If a string, and there are more than one
        partitions in df, should include a glob character to expand into a
        set of file names, or provide a ``name_function=`` parameter.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    compute: bool
        If true, immediately executes. If False, returns a set of delayed
        objects, which can be computed at a later time.
    compute_kwargs : dict, optional
        Options to be passed in to the compute method
    encoding, errors:
        Text conversion, ``see str.encode()``
    compression : string or None
        String like 'gzip' or 'xz'.
    name_function : callable, default None
        Function accepting an integer (partition index) and producing a
        string to replace the asterisk in the given filename globstring.
        Should preserve the lexicographic order of partitions.
    """
    if lines is None:
        lines = orient == "records"
    if orient != "records" and lines:
        raise ValueError(
            'Line-delimited JSON is only available with orient="records".'
        )
    kwargs["orient"] = orient
    kwargs["lines"] = lines and orient == "records"
    outfiles = open_files(
        url_path,
        "wt",
        encoding=encoding,
        errors=errors,
        name_function=name_function,
        num=df.npartitions,
        compression=compression,
        **(storage_options or {}),
    )
    parts = [
        delayed(write_json_partition)(d, outfile, kwargs)
        for outfile, d in zip(outfiles, df.to_delayed())
    ]
    if compute:
        if compute_kwargs is None:
            compute_kwargs = dict()
        return list(dask_compute(*parts, **compute_kwargs))
    else:
        return parts
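# A small sketch (output path and name_function are assumptions) of how the
# name_function parameter above replaces the "*" in the globstring for each
# partition index while keeping the partition order lexicographic.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)
ddf.to_json("out/records-*.json", name_function=lambda i: f"{i:04d}")
# writes out/records-0000.json and out/records-0001.json, one JSON-lines file per partition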
def read_json(
    url_path,
    orient="records",
    lines=None,
    storage_options=None,
    blocksize=None,
    sample=2 ** 20,
    encoding="utf-8",
    errors="strict",
    compression="infer",
    meta=None,
    engine=pd.read_json,
    **kwargs,
):
    """Create a dataframe from a set of JSON files

    This utilises ``pandas.read_json()``, and most parameters are
    passed through - see its docstring.

    Differences: orient is 'records' by default, with lines=True; this
    is appropriate for line-delimited "JSON-lines" data, the kind of JSON
    output that is most common in big-data scenarios, and which can be chunked
    when reading (see ``read_json()``). All other options require
    blocksize=None, i.e., one partition per input file.

    Parameters
    ----------
    url_path: str, list of str
        Location to read from. If a string, can include a glob character to
        find a set of file names.
        Supports protocol specifications such as ``"s3://"``.
    encoding, errors:
        The text encoding to implement, e.g., "utf-8" and how to respond
        to errors in the conversion (see ``str.encode()``).
    orient, lines, kwargs
        passed to pandas; if not specified, lines=True when orient='records',
        False otherwise.
    storage_options: dict
        Passed to backend file-system implementation
    blocksize: None or int
        If None, files are not blocked, and you get one partition per input
        file. If int, which can only be used for line-delimited JSON files,
        each partition will be approximately this size in bytes, to the
        nearest newline character.
    sample: int
        Number of bytes to pre-load, to provide an empty dataframe structure
        to any blocks without data. Only relevant if using blocksize.
    encoding, errors:
        Text conversion, ``see bytes.decode()``
    compression : string or None
        String like 'gzip' or 'xz'.
    engine : function object, default ``pd.read_json``
        The underlying function that dask will use to read JSON files. By
        default, this will be the pandas JSON reader (``pd.read_json``).
    $META

    Returns
    -------
    dask.DataFrame

    Examples
    --------
    Load single file

    >>> dd.read_json('myfile.1.json')  # doctest: +SKIP

    Load multiple files

    >>> dd.read_json('myfile.*.json')  # doctest: +SKIP

    >>> dd.read_json(['myfile.1.json', 'myfile.2.json'])  # doctest: +SKIP

    Load large line-delimited JSON files using partitions of approx
    256MB size

    >>> dd.read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
    """
    if lines is None:
        lines = orient == "records"
    if orient != "records" and lines:
        raise ValueError(
            'Line-delimited JSON is only available with orient="records".'
        )
    if blocksize and (orient != "records" or not lines):
        raise ValueError(
            "JSON file chunking only allowed for JSON-lines "
            "input (orient='records', lines=True)."
        )
    storage_options = storage_options or {}
    if blocksize:
        first, chunks = read_bytes(
            url_path,
            b"\n",
            blocksize=blocksize,
            sample=sample,
            compression=compression,
            **storage_options,
        )
        chunks = list(flatten(chunks))
        if meta is None:
            meta = read_json_chunk(first, encoding, errors, engine, kwargs)
        meta = make_meta(meta)
        parts = [
            delayed(read_json_chunk)(
                chunk, encoding, errors, engine, kwargs, meta=meta
            )
            for chunk in chunks
        ]
        return from_delayed(parts, meta=meta)
    else:
        files = open_files(
            url_path,
            "rt",
            encoding=encoding,
            errors=errors,
            compression=compression,
            **storage_options,
        )
        parts = [
            delayed(read_json_file)(f, orient, lines, engine, kwargs) for f in files
        ]
        return from_delayed(parts, meta=meta)
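# A hedged example (file names assumed) of the chunking constraint enforced above:
# blocksize-based partitioning is only valid for JSON-lines input, i.e.
# orient="records" with lines=True; any other layout needs blocksize=None and
# yields one partition per input file.
import dask.dataframe as dd

df_lines = dd.read_json("data/events-*.jsonl", blocksize="64MiB")            # line-delimited, chunked by bytes
df_whole = dd.read_json("data/records-*.json", lines=False, blocksize=None)  # one partition per file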
def test_open_glob(dir_server):
    root = "http://localhost:8999/"
    fs = open_files(root + "/*")
    assert fs[0].path == "http://localhost:8999/a"
    assert fs[1].path == "http://localhost:8999/b"