def _get_files(self, path):
    """Yield the files located at *path* in a deterministic order.

    *path* is resolved with ``os.path.realpath`` first.  If it is a
    directory, the tree below it is walked and the full path of every file
    whose name does not start with ``'.'`` is yielded.  Otherwise the
    resolved path itself is yielded as the single result (no existence
    check is made).

    Files are yielded sorted by name within each directory, and
    sub-directories are visited in sorted order, so the output no longer
    depends on the listing order of the underlying filesystem.  Hidden
    directories are still traversed, only hidden *files* are skipped.
    """
    path = os.path.realpath(path)
    if not os.path.isdir(path):
        yield path
        return

    for root, dirs, names in walk(path):
        # Sort in place: a top-down walk reads ``dirs`` back to decide
        # which sub-directories to visit next, and in what order.
        dirs.sort()
        for n in sorted(names):
            if not n.startswith('.'):
                yield os.path.join(root, n)
def table(self, path, **kwargs):
    """Load a table whose column names are stored in a ``.field_names`` file.

    Args:
        path: a directory path, or a list/tuple of paths.  When a sequence
            is given only its first entry is searched for ``.field_names``;
            the whole value is still handed to ``self.tableFile``.
        **kwargs: forwarded unchanged to ``self.tableFile``.

    Returns:
        ``self.tableFile(path, **kwargs).asTable(fields)`` where ``fields``
        is the tab-separated content of the first ``.field_names`` file
        found while walking the directory tree.

    Raises:
        Exception: if no ``.field_names`` file exists below the path.
    """
    dpath = path[0] if isinstance(path, (list, tuple)) else path
    for root, dirs, names in walk(dpath):
        if '.field_names' in names:
            p = os.path.join(root, '.field_names')
            with open(p) as f:
                # Drop a trailing line terminator (e.g. from a hand-edited
                # file) so it does not end up inside the last field name.
                fields = f.read().rstrip('\r\n').split('\t')
            break
    else:
        # Wrap in a 1-tuple: with a bare tuple ``path`` the ``%`` operator
        # would treat its items as separate format arguments and raise
        # TypeError instead of the intended error.
        raise Exception("no .field_names found in %s" % (path,))
    return self.tableFile(path, **kwargs).asTable(fields)
def textFile(self, path, ext='', followLink=True, maxdepth=0,
             cls=TextFileRDD, *ka, **kws):
    """Build an RDD over one file, a directory tree, or several paths.

    Args:
        path: a file path, a directory path, or a list/tuple of either.
            A sequence produces the union of the RDDs of its entries.
        ext: only files whose name ends with this suffix are picked up
            when walking a directory (the default ``''`` matches all).
        followLink: passed to ``walk`` as ``followlinks``; when false,
            symlinked files are skipped as well.
        maxdepth: when > 0, limits how deep a directory is walked.  The
            directory itself is depth 1, so ``maxdepth=1`` reads only the
            files directly inside it.  ``0`` means unlimited.
        cls: RDD class instantiated as ``cls(self, file_path, *ka, **kws)``.
            With the default ``TextFileRDD``, paths ending in ``.bz2`` or
            ``.gz`` use ``BZip2FileRDD`` / ``GZipFileRDD`` instead.
        *ka, **kws: extra arguments forwarded to the RDD constructor.

    Returns:
        A single RDD for a plain file, otherwise ``self.union`` of the
        per-file (or per-path) RDDs.

    Files and directories whose names start with ``'.'`` are ignored while
    walking.  Files are ordered by name within each directory and
    sub-directories are visited in sorted order.
    """
    self.init()
    if isinstance(path, (list, tuple)):
        return self.union([
            self.textFile(p, ext, followLink, maxdepth, cls, *ka, **kws)
            for p in path
        ])

    path = os.path.realpath(path)

    def create_rdd(_cls, _path, *_ka, **_kw):
        # Compressed inputs are only special-cased for the default class;
        # an explicitly requested class is always honoured.
        if _cls is TextFileRDD:
            if _path.endswith('.bz2'):
                return BZip2FileRDD(self, _path, *_ka, **_kw)
            elif _path.endswith('.gz'):
                return GZipFileRDD(self, _path, *_ka, **_kw)
        return _cls(self, _path, *_ka, **_kw)

    if not os.path.isdir(path):
        return create_rdd(cls, path, *ka, **kws)

    paths = []
    for root, dirs, names in walk(path, followlinks=followLink):
        for n in sorted(names):
            if n.endswith(ext) and not n.startswith('.'):
                p = os.path.join(root, n)
                if followLink or not os.path.islink(p):
                    paths.append(p)

        if maxdepth > 0:
            # Depth of ``root`` relative to ``path``; ``path`` itself is 1.
            parts = [_f for _f in root[len(path):].split(os.sep) if _f]
            if len(parts) + 1 >= maxdepth:
                # Stop descending below this directory but keep walking
                # its siblings.  Breaking out of the loop here would drop
                # every directory not yet visited, including ones that
                # are within the depth limit.
                del dirs[:]
                continue

        # In-place update so the walk skips hidden directories and visits
        # the remaining ones in sorted order.
        dirs[:] = sorted(d for d in dirs if not d.startswith('.'))

    rdds = [create_rdd(cls, p, *ka, **kws) for p in paths]
    return self.union(rdds)
def walk_dir(path, followlinks=False):
    """Return the visible files below *path* and report how long it took.

    The tree is walked with ``followlinks`` passed through to ``walk``.
    Files and directories whose names start with ``'.'`` are ignored, and
    symlinked files are left out unless ``followlinks`` is true.  Entries
    are ordered by name within each directory and sub-directories are
    visited in sorted order.  A one-line timing summary is printed before
    the list of paths is returned.
    """
    started = time.time()
    found = []
    for root, dirs, names in walk(path, followlinks=followlinks):
        for name in sorted(names):
            if name.startswith('.'):
                continue
            full = os.path.join(root, name)
            if not followlinks and os.path.islink(full):
                continue
            found.append(full)
        # Replace the contents in place so the walk itself sees the
        # sorted, hidden-free directory list.
        dirs[:] = sorted(d for d in dirs if not d.startswith('.'))
    elapsed = time.time() - started
    print("walk {} files use {}s".format(len(found), elapsed))
    return found
def textFile(self, path, ext='', followLink=True, maxdepth=0,
             cls=TextFileRDD, *ka, **kws):
    """Create an RDD from a file, a directory tree, or a sequence of paths.

    Args:
        path: a file path, a directory path, or a list/tuple of either.
            For a sequence, each entry is loaded with the same options
            and the results are combined with ``self.union``.
        ext: suffix a file name must end with to be included when a
            directory is walked (``''`` accepts every name).
        followLink: forwarded to ``walk`` as ``followlinks``; when false,
            files that are symlinks are also excluded.
        maxdepth: maximum directory depth to read when > 0, counting the
            given directory as depth 1.  ``0`` disables the limit.
        cls: RDD class, called as ``cls(self, file_path, *ka, **kws)``.
            When it is ``TextFileRDD``, ``.bz2`` and ``.gz`` paths are
            routed to ``BZip2FileRDD`` and ``GZipFileRDD`` respectively.
        *ka, **kws: additional constructor arguments for the RDD class.

    Returns:
        One RDD for a non-directory path; otherwise the ``self.union`` of
        the RDDs created for every selected file (or every given path).

    Names starting with ``'.'`` (files and directories) are skipped during
    the walk; files are taken in name order and directories are visited in
    sorted order.
    """
    self.init()
    if isinstance(path, (list, tuple)):
        return self.union([
            self.textFile(p, ext, followLink, maxdepth, cls, *ka, **kws)
            for p in path
        ])

    path = os.path.realpath(path)

    def create_rdd(_cls, _path, *_ka, **_kw):
        # Only the default class gets compression-aware substitution.
        if _cls is TextFileRDD:
            if _path.endswith('.bz2'):
                return BZip2FileRDD(self, _path, *_ka, **_kw)
            elif _path.endswith('.gz'):
                return GZipFileRDD(self, _path, *_ka, **_kw)
        return _cls(self, _path, *_ka, **_kw)

    if not os.path.isdir(path):
        return create_rdd(cls, path, *ka, **kws)

    paths = []
    for root, dirs, names in walk(path, followlinks=followLink):
        for n in sorted(names):
            if n.endswith(ext) and not n.startswith('.'):
                p = os.path.join(root, n)
                if followLink or not os.path.islink(p):
                    paths.append(p)

        if maxdepth > 0:
            # ``path`` is depth 1; each extra path component adds one.
            depth = len(
                [_f for _f in root[len(path):].split(os.sep) if _f]) + 1
            if depth >= maxdepth:
                # Prune instead of ``break``: a top-down walk visits a
                # directory's children before its siblings, so leaving the
                # loop at the first too-deep directory would skip sibling
                # directories that are still inside the limit.
                del dirs[:]
                continue

        # Mutate in place so the walk honours the filtered, sorted list.
        dirs[:] = sorted(d for d in dirs if not d.startswith('.'))

    rdds = [create_rdd(cls, p, *ka, **kws) for p in paths]
    return self.union(rdds)