Example #1
    def _persist(source, path, encoder=None):
        """Save list to files using encoding

        encoder : None or one of str|json|pickle
            None is equivalent to str
        """
        import posixpath
        from dask.bytes import open_files
        import dask
        import pickle
        import json
        from intake.source.textfiles import TextFilesSource
        encoder = {
            None: str,
            'str': str,
            'json': json.dumps,
            'pickle': pickle.dumps
        }[encoder]
        # Materialise the source as a dask bag; if the source cannot produce
        # one, fall back to reading it eagerly into a single partition.
        try:
            b = source.to_dask()
        except NotImplementedError:
            import dask.bag as db
            b = db.from_sequence(source.read(), npartitions=1)
        # One output file per bag partition, written in text mode.
        files = open_files(posixpath.join(path, 'part.*'),
                           mode='wt',
                           num=b.npartitions)
        dwrite = dask.delayed(write_file)
        out = [
            dwrite(part, f, encoder) for part, f in zip(b.to_delayed(), files)
        ]
        dask.compute(out)
        s = TextFilesSource(posixpath.join(path, 'part.*'))
        return s
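
The persistence helpers in these examples delegate the actual writing to a `write_file` function that the snippets do not show. A minimal sketch of what such a helper could look like, assuming each partition is an iterable of records and `f` is an fsspec `OpenFile` opened in text mode (the real implementation in intake may differ):

def write_file(data, f, encoder):
    # Hypothetical sketch: serialize each record with `encoder` and write
    # the partition as newline-separated lines into the open file.
    with f as stream:
        stream.write('\n'.join(encoder(d) for d in data))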
Example #2
    def _data_to_source(b, path, encoder=None, storage_options=None, **kwargs):
        """Write ``b`` as text files under ``path`` and return a TextFilesSource."""
        import dask.bag as db
        import posixpath
        from fsspec import open_files
        import dask
        import pickle
        import json
        from intake.source.textfiles import TextFilesSource
        # Map the named encoders to callables; an unrecognised value (e.g. a
        # user-supplied callable) is passed through unchanged.
        encoder = {None: str, 'str': str, 'json': json.dumps,
                   'pickle': pickle.dumps}.get(encoder, encoder)

        if not hasattr(b, 'to_textfiles'):
            try:
                b = db.from_sequence(b, npartitions=1)
            except TypeError:
                raise NotImplementedError

        files = open_files(posixpath.join(path, 'part.*'), mode='wt',
                           num=b.npartitions, **(storage_options or {}))
        dwrite = dask.delayed(write_file)
        out = [dwrite(part, f, encoder)
               for part, f in zip(b.to_delayed(), files)]
        dask.compute(out)
        s = TextFilesSource(posixpath.join(path, 'part.*'), storage_options=storage_options)
        return s
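
For context, a hedged sketch of how a helper like Example #2's could be driven. The record values and output path are purely illustrative, and `_data_to_source` is assumed to be callable as a plain function with the `write_file` helper in scope:

import dask.bag as db

# Hypothetical usage: JSON-encode an in-memory list into part files and
# get back a TextFilesSource pointing at them.
records = [{'id': 1}, {'id': 2}, {'id': 3}]
bag = db.from_sequence(records, npartitions=2)
source = _data_to_source(bag, '/tmp/example-output', encoder='json')
print(source.read())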
Example #3
def test_complex_bytes(tempdir, comp, pars):
    dump, load, read = pars
    dump = import_name(dump)
    # using bytestrings means no extra encode/decode arguments are needed for msgpack
    data = [{b'something': b'simple', b'and': 0}] * 2
    for f in ['1.out', '2.out']:
        fn = os.path.join(tempdir, f)
        with open_files([fn], mode='wb', compression=comp)[0] as fo:
            if read:
                fo.write(dump(data))
            else:
                dump(data, fo)
    # that was all setup

    path = os.path.join(tempdir, '*.out')
    t = TextFilesSource(path,
                        text_mode=False,
                        compression=comp,
                        decoder=load,
                        read=read)
    t.discover()
    assert t.npartitions == 2
    assert t._get_partition(0) == t.to_dask().to_delayed()[0].compute()
    out = t.read()
    assert isinstance(out, list)
    assert out[0] == data[0]
Example #4
def test_textfiles(tempdir):
    for fname in ['1.txt', '2.txt']:
        with open(os.path.join(tempdir, fname), 'wt') as f:
            f.write('hello\nworld')
    path = os.path.join(tempdir, '*.txt')
    t = TextFilesSource(path)
    t.discover()
    assert t.npartitions == 2
    assert t._get_partition(0) == t.to_dask().to_delayed()[0].compute()
    out = t.read()
    assert isinstance(out, list)
    assert out[0] == 'hello\n'
Example #5
    def _data_to_source(b, path, encoder=None, **kwargs):
        """Write ``b`` as text files under ``path`` and return a TextFilesSource."""
        import dask.bag as db
        import posixpath
        from fsspec import open_files
        import dask
        from intake.source.textfiles import TextFilesSource
        if not hasattr(b, 'to_textfiles'):
            try:
                b = db.from_sequence(b, npartitions=1)
            except TypeError:
                raise NotImplementedError

        files = open_files(posixpath.join(path, 'part.*'), mode='wt',
                           num=b.npartitions)
        dwrite = dask.delayed(write_file)
        out = [dwrite(part, f, encoder)
               for part, f in zip(b.to_delayed(), files)]
        dask.compute(out)
        s = TextFilesSource(posixpath.join(path, 'part.*'))
        return s
Example #6
def test_complex_text(tempdir, comp):
    dump, load, read = 'json.dumps', 'json.loads', True
    dump = import_name(dump)
    data = [{'something': 'simple', 'and': 0}] * 2
    for f in ['1.out', '2.out']:
        fn = os.path.join(tempdir, f)
        with open_files([fn], mode='wt', compression=comp)[0] as fo:
            if read:
                fo.write(dump(data))
            else:
                dump(data, fo)
    # that was all setup

    path = os.path.join(tempdir, '*.out')
    t = TextFilesSource(path, text_mode=True, compression=comp, decoder=load)
    t.discover()
    assert t.npartitions == 2
    assert t._get_partition(0) == t.to_dask().to_delayed()[0].compute()
    out = t.read()
    assert isinstance(out, list)
    assert out[0] == data[0]
Example #7
def test_backtrack(temp_cache):
    s = TextFilesSource("*.py")
    s2 = s.persist()
    s3 = store.backtrack(s2)
    assert s3 == s
Example #8
def test_persist_with_nonnumeric_ttl_raises_error(temp_cache):
    s = TextFilesSource("*.py")
    with pytest.raises(ValueError, match="User-provided ttl was a string"):
        s.persist(ttl='a string')
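
Examples #7 and #8 exercise the persist machinery rather than the writers. A hedged usage note, assuming `store` is the module-level persist store imported by the test and that `ttl` is expected to be numeric (seconds):

s = TextFilesSource('*.py')
s2 = s.persist(ttl=600)         # a numeric ttl is accepted; the string above raises ValueError
original = store.backtrack(s2)  # recover the source that was persisted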