Example #1
def test_glob(hdfs):
    if type(hdfs).__module__.startswith("hdfs3"):
        from dask.bytes.hdfs3 import HDFS3HadoopFileSystem

        hdfs = HDFS3HadoopFileSystem.from_hdfs3(hdfs)
    else:
        from dask.bytes.pyarrow import PyArrowHadoopFileSystem

        hdfs = PyArrowHadoopFileSystem.from_pyarrow(hdfs)

    tree = {
        basedir: (["c", "c2"], ["a", "a1", "a2", "a3", "b1"]),
        basedir + "/c": (["d"], ["x1", "x2"]),
        basedir + "/c2": (["d"], ["x1", "x2"]),
        basedir + "/c/d": ([], ["x3"]),
    }

    hdfs.mkdirs(basedir + "/c/d/")
    hdfs.mkdirs(basedir + "/c2/d/")
    for fn in (posixpath.join(dirname, f)
               for (dirname, (_, fils)) in tree.items() for f in fils):
        with hdfs.open(fn, mode="wb") as f2:
            f2.write(b"000")

    assert set(hdfs.glob(basedir + "/a*")) == {
        basedir + p
        for p in ["/a", "/a1", "/a2", "/a3"]
    }

    assert set(hdfs.glob(basedir + "/c/*")) == {
        basedir + p
        for p in ["/c/x1", "/c/x2", "/c/d"]
    }

    assert set(hdfs.glob(basedir + "/*/x*")) == {
        basedir + p
        for p in ["/c/x1", "/c/x2", "/c2/x1", "/c2/x2"]
    }
    assert set(hdfs.glob(basedir + "/*/x1")) == {
        basedir + p
        for p in ["/c/x1", "/c2/x1"]
    }

    assert hdfs.glob(basedir + "/c") == [basedir + "/c"]
    assert hdfs.glob(basedir + "/c/") == [basedir + "/c/"]
    assert hdfs.glob(basedir + "/a") == [basedir + "/a"]

    assert hdfs.glob("/this-path-doesnt-exist") == []
    assert hdfs.glob(basedir + "/missing/") == []
    assert hdfs.glob(basedir + "/missing/x1") == []
    assert hdfs.glob(basedir + "/missing/*") == []
    assert hdfs.glob(basedir + "/*/missing") == []

    assert set(hdfs.glob(basedir + "/*")) == {
        basedir + p
        for p in ["/a", "/a1", "/a2", "/a3", "/b1", "/c", "/c2"]
    }
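
A note on setup: in the test, the `hdfs` argument comes from a fixture. Outside the test suite the same wrapper can be built directly from an hdfs3 client. A minimal sketch, assuming a NameNode reachable at localhost:8020 (host, port, and the path passed to glob are placeholders):

# Hypothetical setup; host/port must point at a real HDFS NameNode.
import hdfs3
from dask.bytes.hdfs3 import HDFS3HadoopFileSystem

raw = hdfs3.HDFileSystem(host="localhost", port=8020)  # low-level hdfs3 client
fs = HDFS3HadoopFileSystem.from_hdfs3(raw)             # dask wrapper used in the test

print(fs.glob("/tmp/*"))  # same glob API exercised by the assertions above
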
Example #2
def test_glob(hdfs):
    if type(hdfs).__module__.startswith('hdfs3'):
        from dask.bytes.hdfs3 import HDFS3HadoopFileSystem
        hdfs = HDFS3HadoopFileSystem.from_hdfs3(hdfs)
    else:
        from dask.bytes.pyarrow import PyArrowHadoopFileSystem
        hdfs = PyArrowHadoopFileSystem.from_pyarrow(hdfs)

    tree = {
        basedir: (['c', 'c2'], ['a', 'a1', 'a2', 'a3', 'b1']),
        basedir + '/c': (['d'], ['x1', 'x2']),
        basedir + '/c2': (['d'], ['x1', 'x2']),
        basedir + '/c/d': ([], ['x3'])
    }

    hdfs.mkdirs(basedir + '/c/d/')
    hdfs.mkdirs(basedir + '/c2/d/')
    for fn in (posixpath.join(dirname, f)
               for (dirname, (_, fils)) in tree.items() for f in fils):
        with hdfs.open(fn, mode='wb') as f2:
            f2.write(b'000')

    assert (set(hdfs.glob(basedir + '/a*')) == {
        basedir + p
        for p in ['/a', '/a1', '/a2', '/a3']
    })

    assert (set(hdfs.glob(basedir + '/c/*')) == {
        basedir + p
        for p in ['/c/x1', '/c/x2', '/c/d']
    })

    assert (set(hdfs.glob(basedir + '/*/x*')) == {
        basedir + p
        for p in ['/c/x1', '/c/x2', '/c2/x1', '/c2/x2']
    })
    assert (set(hdfs.glob(basedir + '/*/x1')) == {
        basedir + p
        for p in ['/c/x1', '/c2/x1']
    })

    assert hdfs.glob(basedir + '/c') == [basedir + '/c']
    assert hdfs.glob(basedir + '/c/') == [basedir + '/c/']
    assert hdfs.glob(basedir + '/a') == [basedir + '/a']

    assert hdfs.glob('/this-path-doesnt-exist') == []
    assert hdfs.glob(basedir + '/missing/') == []
    assert hdfs.glob(basedir + '/missing/x1') == []
    assert hdfs.glob(basedir + '/missing/*') == []
    assert hdfs.glob(basedir + '/*/missing') == []

    assert (set(hdfs.glob(basedir + '/*')) == {
        basedir + p
        for p in ['/a', '/a1', '/a2', '/a3', '/b1', '/c', '/c2']
    })
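
The assertions encode standard glob semantics: `*` matches within a single path component and does not cross `/`, and a pattern that names an existing entry returns that entry itself. The same behaviour can be reproduced on a local filesystem with the standard-library glob module (a throwaway sketch on a temporary directory, independent of HDFS):

import glob
import os
import tempfile

# Build a local tree mirroring the one created on HDFS above.
base = tempfile.mkdtemp()
for d in ['c/d', 'c2/d']:
    os.makedirs(os.path.join(base, d))
for f in ['a', 'a1', 'a2', 'a3', 'b1', 'c/x1', 'c/x2', 'c2/x1', 'c2/x2', 'c/d/x3']:
    open(os.path.join(base, f), 'w').close()

print(sorted(glob.glob(base + '/*/x1')))  # only c/x1 and c2/x1: '*' stays within one component
print(sorted(glob.glob(base + '/a*')))    # a, a1, a2, a3
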
Example #3
File: test_hdfs.py  Project: mrocklin/dask
def test_glob(hdfs):
    if type(hdfs).__module__.startswith('hdfs3'):
        from dask.bytes.hdfs3 import HDFS3HadoopFileSystem
        hdfs = HDFS3HadoopFileSystem.from_hdfs3(hdfs)
    else:
        from dask.bytes.pyarrow import PyArrowHadoopFileSystem
        hdfs = PyArrowHadoopFileSystem.from_pyarrow(hdfs)

    tree = {basedir: (['c', 'c2'], ['a', 'a1', 'a2', 'a3', 'b1']),
            basedir + '/c': (['d'], ['x1', 'x2']),
            basedir + '/c2': (['d'], ['x1', 'x2']),
            basedir + '/c/d': ([], ['x3'])}

    hdfs.mkdirs(basedir + '/c/d/')
    hdfs.mkdirs(basedir + '/c2/d/')
    for fn in (posixpath.join(dirname, f)
               for (dirname, (_, fils)) in tree.items()
               for f in fils):
        with hdfs.open(fn, mode='wb') as f2:
            f2.write(b'000')

    assert (set(hdfs.glob(basedir + '/a*')) ==
            {basedir + p for p in ['/a', '/a1', '/a2', '/a3']})

    assert (set(hdfs.glob(basedir + '/c/*')) ==
            {basedir + p for p in ['/c/x1', '/c/x2', '/c/d']})

    assert (set(hdfs.glob(basedir + '/*/x*')) ==
            {basedir + p for p in ['/c/x1', '/c/x2', '/c2/x1', '/c2/x2']})
    assert (set(hdfs.glob(basedir + '/*/x1')) ==
            {basedir + p for p in ['/c/x1', '/c2/x1']})

    assert hdfs.glob(basedir + '/c') == [basedir + '/c']
    assert hdfs.glob(basedir + '/c/') == [basedir + '/c/']
    assert hdfs.glob(basedir + '/a') == [basedir + '/a']

    assert hdfs.glob('/this-path-doesnt-exist') == []
    assert hdfs.glob(basedir + '/missing/') == []
    assert hdfs.glob(basedir + '/missing/x1') == []
    assert hdfs.glob(basedir + '/missing/*') == []
    assert hdfs.glob(basedir + '/*/missing') == []

    assert (set(hdfs.glob(basedir + '/*')) ==
            {basedir + p for p in ['/a', '/a1', '/a2', '/a3', '/b1', '/c', '/c2']})
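
All three variants above rely on two module-level names from test_hdfs.py that the snippets do not show: the standard-library posixpath import and basedir, the HDFS directory the test writes under. A minimal stand-in for that preamble (the basedir value here is illustrative, not necessarily the one dask uses):

# Names the snippet assumes from the surrounding module; basedir is a guess.
import posixpath

basedir = '/tmp/test-dask'
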
Example #4
def test_pyarrow_compat():
    from dask.bytes.hdfs3 import HDFS3HadoopFileSystem
    dhdfs = HDFS3HadoopFileSystem()
    pa_hdfs = dhdfs._get_pyarrow_filesystem()
    assert isinstance(pa_hdfs, pyarrow.filesystem.FileSystem)
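
The object returned by `_get_pyarrow_filesystem()` implements pyarrow's legacy pyarrow.filesystem.FileSystem interface, so it can in principle be handed to older pyarrow APIs that accept a filesystem argument. A hedged sketch, assuming an old pyarrow release that still ships pyarrow.filesystem, a reachable HDFS cluster, and a placeholder Parquet path:

import pyarrow.parquet as pq
from dask.bytes.hdfs3 import HDFS3HadoopFileSystem

pa_fs = HDFS3HadoopFileSystem()._get_pyarrow_filesystem()
# '/data/example.parquet' is a placeholder path on the cluster.
dataset = pq.ParquetDataset('/data/example.parquet', filesystem=pa_fs)
table = dataset.read()
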