def _persist(source, path, encoder=None):
    """Write the contents of ``source`` to ``part.*`` text files under ``path``.

    Parameters
    ----------
    source : object
        Must provide ``to_dask()``; if that raises ``NotImplementedError``,
        ``source.read()`` is used instead and wrapped in a one-partition bag.
    path : str
        Directory-like prefix; output files match ``<path>/part.*``.
    encoder : None or one of 'str' | 'json' | 'pickle'
        Name of the per-item encoder. ``None`` is equivalent to ``'str'``.
        Any other value raises ``KeyError``.

    Returns
    -------
    TextFilesSource
        A source pointing at the ``<path>/part.*`` glob.
    """
    import posixpath
    from dask.bytes import open_files
    import dask
    import pickle
    import json
    from intake.source.textfiles import TextFilesSource

    encoders_by_name = {
        None: str,
        'str': str,
        'json': json.dumps,
        'pickle': pickle.dumps,
    }
    encode = encoders_by_name[encoder]

    try:
        bag = source.to_dask()
    except NotImplementedError:
        # No native dask representation: load eagerly into a single partition.
        import dask.bag as db
        bag = db.from_sequence(source.read(), npartitions=1)

    target_glob = posixpath.join(path, 'part.*')
    handles = open_files(target_glob, mode='wt', num=bag.npartitions)

    # One delayed write task per (partition, output file) pair.
    # NOTE(review): write_file is defined outside this block; presumably it
    # applies the encoder to each item of the partition -- confirm.
    delayed_write = dask.delayed(write_file)
    tasks = []
    for partition, handle in zip(bag.to_delayed(), handles):
        tasks.append(delayed_write(partition, handle, encode))
    dask.compute(tasks)

    return TextFilesSource(target_glob)
def _data_to_source(b, path, encoder=None, storage_options=None, **kwargs):
    """Write ``b`` to ``part.*`` text files under ``path`` and return a source.

    Parameters
    ----------
    b : object
        Either something exposing ``to_textfiles`` (used as-is) or a sequence
        that ``dask.bag.from_sequence`` can wrap in a single partition.
    path : str
        Prefix under which the ``part.*`` files are created.
    encoder : None, 'str', 'json', 'pickle', or other
        Known names are mapped to ``str`` / ``json.dumps`` / ``pickle.dumps``
        (``None`` maps to ``str``); any other value is passed through
        unchanged to the writer.
    storage_options : dict or None
        Extra keyword arguments for ``fsspec.open_files``; also forwarded to
        the returned ``TextFilesSource``.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    TextFilesSource
        A source pointing at the ``<path>/part.*`` glob.

    Raises
    ------
    NotImplementedError
        If ``b`` has no ``to_textfiles`` and wrapping it raises ``TypeError``.
    """
    import dask.bag as db
    import posixpath
    from fsspec import open_files
    import dask
    import pickle
    import json
    from intake.source.textfiles import TextFilesSource

    named_encoders = {
        None: str,
        'str': str,
        'json': json.dumps,
        'pickle': pickle.dumps,
    }
    encode = named_encoders.get(encoder, encoder)

    if hasattr(b, 'to_textfiles'):
        bag = b
    else:
        try:
            bag = db.from_sequence(b, npartitions=1)
        except TypeError:
            raise NotImplementedError

    target_glob = posixpath.join(path, 'part.*')
    fs_options = storage_options or {}
    handles = open_files(target_glob, mode='wt', num=bag.npartitions,
                         **fs_options)

    # One delayed write task per (partition, output file) pair.
    # NOTE(review): write_file is defined outside this block -- confirm how it
    # applies the encoder.
    delayed_write = dask.delayed(write_file)
    tasks = []
    for partition, handle in zip(bag.to_delayed(), handles):
        tasks.append(delayed_write(partition, handle, encode))
    dask.compute(tasks)

    return TextFilesSource(target_glob, storage_options=storage_options)
def test_complex_bytes(tempdir, comp, pars):
    """Round-trip binary-encoded records through ``TextFilesSource``.

    ``pars`` unpacks to ``(dump, load, read)``: the dotted name of the dump
    function, the decoder handed to the source, and a flag selecting whether
    ``dump`` returns bytes to be written or writes to the file object itself.
    """
    dump_name, decoder, read = pars
    dump = import_name(dump_name)
    # using bytestrings means not needing extra en/decode argument to msgpack
    record = {b'something': b'simple', b'and': 0}
    data = [record] * 2

    # Setup: write the same payload into two (optionally compressed) files.
    for filename in ['1.out', '2.out']:
        target = os.path.join(tempdir, filename)
        opened = open_files([target], mode='wb', compression=comp)[0]
        with opened as fo:
            if read:
                fo.write(dump(data))
            else:
                dump(data, fo)

    # Exercise the source over both files.
    pattern = os.path.join(tempdir, '*.out')
    source = TextFilesSource(pattern, text_mode=False, compression=comp,
                             decoder=decoder, read=read)
    source.discover()
    assert source.npartitions == 2
    assert source._get_partition(0) == source.to_dask().to_delayed()[0].compute()

    result = source.read()
    assert isinstance(result, list)
    assert result[0] == data[0]
def test_textfiles(tempdir):
    """Read two plain-text files through ``TextFilesSource``.

    Checks that one partition is reported per file, that the direct partition
    read agrees with the dask path, and that ``read()`` returns a list whose
    first element is the first line including its newline.
    """
    # Write the fixtures inside ``with`` blocks so each handle is flushed and
    # closed before the source opens the files (the previous bare
    # ``open(...).write(...)`` relied on garbage collection to do that).
    for filename in ('1.txt', '2.txt'):
        with open(os.path.join(tempdir, filename), 'wt') as fo:
            fo.write('hello\nworld')

    path = os.path.join(tempdir, '*.txt')
    t = TextFilesSource(path)
    t.discover()
    assert t.npartitions == 2
    assert t._get_partition(0) == t.to_dask().to_delayed()[0].compute()

    out = t.read()
    assert isinstance(out, list)
    assert out[0] == 'hello\n'
def _data_to_source(b, path, encoder=None, **kwargs):
    """Write ``b`` to ``part.*`` text files under ``path`` and return a source.

    Parameters
    ----------
    b : object
        Either something exposing ``to_textfiles`` (used as-is) or a sequence
        that ``dask.bag.from_sequence`` can wrap in a single partition.
    path : str
        Prefix under which the ``part.*`` files are created.
    encoder : object
        Passed through unchanged to the writer.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    TextFilesSource
        A source pointing at the ``<path>/part.*`` glob.

    Raises
    ------
    NotImplementedError
        If ``b`` has no ``to_textfiles`` and wrapping it raises ``TypeError``.
    """
    import dask.bag as db
    import posixpath
    from fsspec import open_files
    import dask
    from intake.source.textfiles import TextFilesSource

    if hasattr(b, 'to_textfiles'):
        bag = b
    else:
        try:
            bag = db.from_sequence(b, npartitions=1)
        except TypeError:
            raise NotImplementedError

    target_glob = posixpath.join(path, 'part.*')
    handles = open_files(target_glob, mode='wt', num=bag.npartitions)

    # One delayed write task per (partition, output file) pair.
    # NOTE(review): write_file is defined outside this block -- confirm how it
    # uses the encoder argument.
    delayed_write = dask.delayed(write_file)
    tasks = []
    for partition, handle in zip(bag.to_delayed(), handles):
        tasks.append(delayed_write(partition, handle, encoder))
    dask.compute(tasks)

    return TextFilesSource(target_glob)
def test_complex_text(tempdir, comp):
    """Round-trip JSON text records through ``TextFilesSource``.

    Two files are written in text mode (with compression ``comp``), each
    holding the JSON dump of the same two-record list; the source is then
    expected to report two partitions and decode the payload via
    ``json.loads``.
    """
    dump = import_name('json.dumps')
    load = 'json.loads'
    data = [{'something': 'simple', 'and': 0}] * 2

    # Setup: the dump function returns a string, so it is always written via
    # ``fo.write``; the former ``dump(data, fo)`` alternative was unreachable.
    for f in ['1.out', '2.out']:
        fn = os.path.join(tempdir, f)
        with open_files([fn], mode='wt', compression=comp)[0] as fo:
            fo.write(dump(data))

    path = os.path.join(tempdir, '*.out')
    t = TextFilesSource(path, text_mode=True, compression=comp, decoder=load)
    t.discover()
    assert t.npartitions == 2
    assert t._get_partition(0) == t.to_dask().to_delayed()[0].compute()

    out = t.read()
    assert isinstance(out, list)
    assert out[0] == data[0]
def test_backtrack(temp_cache):
    """Backtracking a persisted source must compare equal to the original.

    ``temp_cache`` is requested as a fixture but not referenced in the body.
    """
    original = TextFilesSource("*.py")
    persisted = original.persist()
    recovered = store.backtrack(persisted)
    assert recovered == original
def test_persist_with_nonnumeric_ttl_raises_error(temp_cache):
    """``persist`` must reject a string ``ttl`` with a ``ValueError``.

    ``temp_cache`` is requested as a fixture but not referenced in the body.
    """
    source = TextFilesSource("*.py")
    expected_message = "User-provided ttl was a string"
    with pytest.raises(ValueError, match=expected_message):
        source.persist(ttl='a string')