import posixpath

import pyarrow

# ``basedir`` (the temporary HDFS test directory) and the ``hdfs`` fixture are
# provided elsewhere in this test module.


def test_glob(hdfs):
    # Normalize the fixture to the matching dask filesystem wrapper.
    if type(hdfs).__module__.startswith("hdfs3"):
        from dask.bytes.hdfs3 import HDFS3HadoopFileSystem

        hdfs = HDFS3HadoopFileSystem.from_hdfs3(hdfs)
    else:
        from dask.bytes.pyarrow import PyArrowHadoopFileSystem

        hdfs = PyArrowHadoopFileSystem.from_pyarrow(hdfs)

    # Layout to create: {directory: (subdirectories, files)}.
    tree = {
        basedir: (["c", "c2"], ["a", "a1", "a2", "a3", "b1"]),
        basedir + "/c": (["d"], ["x1", "x2"]),
        basedir + "/c2": (["d"], ["x1", "x2"]),
        basedir + "/c/d": ([], ["x3"]),
    }
    hdfs.mkdirs(basedir + "/c/d/")
    hdfs.mkdirs(basedir + "/c2/d/")
    for fn in (
        posixpath.join(dirname, f)
        for (dirname, (_, fils)) in tree.items()
        for f in fils
    ):
        with hdfs.open(fn, mode="wb") as f2:
            f2.write(b"000")

    # Wildcards over file names.
    assert set(hdfs.glob(basedir + "/a*")) == {
        basedir + p for p in ["/a", "/a1", "/a2", "/a3"]
    }
    assert set(hdfs.glob(basedir + "/c/*")) == {
        basedir + p for p in ["/c/x1", "/c/x2", "/c/d"]
    }
    # Wildcards over directory components.
    assert set(hdfs.glob(basedir + "/*/x*")) == {
        basedir + p for p in ["/c/x1", "/c/x2", "/c2/x1", "/c2/x2"]
    }
    assert set(hdfs.glob(basedir + "/*/x1")) == {
        basedir + p for p in ["/c/x1", "/c2/x1"]
    }
    # Literal paths (no wildcard) return themselves when they exist.
    assert hdfs.glob(basedir + "/c") == [basedir + "/c"]
    assert hdfs.glob(basedir + "/c/") == [basedir + "/c/"]
    assert hdfs.glob(basedir + "/a") == [basedir + "/a"]
    # Missing paths glob to the empty list.
    assert hdfs.glob("/this-path-doesnt-exist") == []
    assert hdfs.glob(basedir + "/missing/") == []
    assert hdfs.glob(basedir + "/missing/x1") == []
    assert hdfs.glob(basedir + "/missing/*") == []
    assert hdfs.glob(basedir + "/*/missing") == []
    assert set(hdfs.glob(basedir + "/*")) == {
        basedir + p for p in ["/a", "/a1", "/a2", "/a3", "/b1", "/c", "/c2"]
    }
def test_pyarrow_compat():
    from dask.bytes.hdfs3 import HDFS3HadoopFileSystem

    dhdfs = HDFS3HadoopFileSystem()
    # The hdfs3-backed filesystem should expose a pyarrow-compatible view.
    pa_hdfs = dhdfs._get_pyarrow_filesystem()
    assert isinstance(pa_hdfs, pyarrow.filesystem.FileSystem)