예제 #1
파일: dfs.py 프로젝트: tristanbuckner/happy
def getFileSystem(fs="dfs"):
    """
    Returns a Hadoop FileSystem object, either "dfs" (default) or "local".

    Args:
        fs: Name of the filesystem to return: "dfs" or "local".

    Returns:
        FileSystem.get(...) for "dfs" or FileSystem.getLocal(...) for "local",
        both built from the current happy job configuration.

    Raises:
        ValueError: If fs is neither "dfs" nor "local".
    """
    if fs == "dfs":
        return FileSystem.get(happy.getJobConf())
    if fs == "local":
        return FileSystem.getLocal(happy.getJobConf())
    # %-formatting keeps the message readable for non-string arguments;
    # plain concatenation would raise TypeError and hide the real problem.
    raise ValueError("Unknown filesystem %s" % (fs,))
예제 #2
파일: dfs.py 프로젝트: tristanbuckner/happy
def merge(path, dst):
    """
    Merges files in a specified directory to a specified file.

    Args:
        path: Directory whose files are to be merged.
        dst: File that is meant to receive the merged content.
    """
    # NOTE(review): the code visible here only builds the two DatasetPath
    # handles; no call that actually performs the merge is present.
    # Confirm against the full dfs.py before relying on this function.
    srcPath = DatasetPath(happy.getJobConf(), path)
    dstPath = DatasetPath(happy.getJobConf(), dst)
예제 #3
파일: dfs.py 프로젝트: tristanbuckner/happy
def read(path):
    """
    Returns a Python file-like object for a specified DFS file or directory.
    Merges files in a specified directory.

    Args:
        path: DFS file or directory to read.

    Returns:
        A ReaderFile wrapping the reader obtained from the path.
    """
    # this is a hack because PyFile doesn't support Readers:
    reader = DatasetPath(happy.getJobConf(), path).getReader()
    return ReaderFile(reader)
예제 #4
파일: dfs.py 프로젝트: tristanbuckner/happy
def createCollector(path, fs="dfs", type="text", compressiontype="lzo", sequencetype="BLOCK"):
    """
    Creates a file collector at the specified path.

    Collectors are automatically closed at the end of the job: when a happy
    job is active, the collector is registered with it as a closeable.

    Args:
        path: Destination path for the collector.
        fs: Filesystem name handed to getFileSystem ("dfs" by default).
        type: Collector kind: "text" (default), "sequence" or "bjson".
        compressiontype: Passed to _getCodecInstance; only used by the
            "sequence" and "bjson" collectors.
        sequencetype: Passed to _getSequenceFileType; only used by the
            "sequence" and "bjson" collectors.

    Returns:
        The newly created collector.

    Raises:
        ValueError: If type is not one of the supported collector kinds.
    """
    filesystem = getFileSystem(fs)
    # NOTE(review): datasetPath is never read below.  It is kept in case
    # constructing it has an effect callers rely on -- confirm and remove.
    datasetPath = DatasetPath(filesystem, path)
    if type == "sequence":
        collector = TextSequenceFileCollector(filesystem, happy.getJobConf(), Path(path),
                                              _getSequenceFileType(sequencetype),
                                              _getCodecInstance(compressiontype))
    elif type == "text":
        collector = TextFileCollector(filesystem, happy.getJobConf(), Path(path))
    elif type == "bjson":
        collector = BJSONCollector(filesystem, happy.getJobConf(), Path(path),
                                   _getSequenceFileType(sequencetype),
                                   _getCodecInstance(compressiontype))
    else:
        # %-formatting keeps the message readable for non-string arguments.
        raise ValueError("Unknown collector type %s" % (type,))
    # add as a closeable so that it is closed correctly:
    if happy.job is not None:
        happy.job.addCloseable(collector)
    return collector
예제 #5
파일: dfs.py 프로젝트: tristanbuckner/happy
def mktemp(name=None):
    """
    Generate a directory path safe to use for temporary data.

    An optional name will be used to prefix the path for easier debugging.
    The path is generated within the current hadoop.tmp.dir; its last
    component ends with the current time rounded to whole seconds followed
    by a random integer.  Only the path string is built here -- nothing is
    created on the filesystem by this function.

    Args:
        name: Optional prefix for the last path component.

    Returns:
        The generated path as a string.
    """
    path = happy.getJobConf().get("hadoop.tmp.dir") + "/"
    if name:
        path += str(name) + "-"
    # randint needs integer bounds: the float literal 1E5 is rejected by
    # newer Python versions, so spell the same bound as an int.
    path += "%.0f%i" % (time.time(), random.randint(0, 100000))
    return path
예제 #6
파일: dfs.py 프로젝트: tristanbuckner/happy
def rename(src, dst):
    """
    Renames a DFS path.

    Args:
        src: Existing path to rename.
        dst: New path.

    Returns:
        None; any result of the underlying rename call is discarded.
    """
    source = DatasetPath(happy.getJobConf(), src)
    source.rename(dst)
예제 #7
파일: dfs.py 프로젝트: tristanbuckner/happy
def copyFromLocal(localpath, path):
    """
    Copies a local path to a DFS file.

    Args:
        localpath: Local path to copy from.
        path: DFS path to copy to.
    """
    target = DatasetPath(happy.getJobConf(), path)
    target.copyFromLocal(localpath)
예제 #8
파일: dfs.py 프로젝트: tristanbuckner/happy
def copyToLocal(path, localpath):
    """
    Copies a DFS path to a local file.  Merges files in a specified directory.

    Args:
        path: DFS path to copy from.
        localpath: Local file to copy to.
    """
    source = DatasetPath(happy.getJobConf(), path)
    source.copyToLocal(localpath)
예제 #9
파일: dfs.py 프로젝트: tristanbuckner/happy
def delete(path):
    """
    Deletes a specified DFS path.

    Args:
        path: DFS path to delete.
    """
    target = DatasetPath(happy.getJobConf(), path)
    target.deletePath()
예제 #10
파일: dfs.py 프로젝트: tristanbuckner/happy
def grep(path, regex):
    """
    Returns an iterator over lines in a path that contain a given regular expression.
    Uses the Java regex syntax.

    Args:
        path: DFS path whose lines are searched.
        regex: Regular expression, in Java syntax, to look for.
    """
    matches = DatasetPath(happy.getJobConf(), path).grepLines(regex)
    return StringIterator.getIterator(matches)
예제 #11
파일: dfs.py 프로젝트: tristanbuckner/happy
def write(path, compressiontype=None):
    """
    Returns a Python file-like object for a specified DFS file.  Uses a specified compression codec.

    Args:
        path: DFS file to write.
        compressiontype: Passed to _getCodec to select the codec; defaults
            to None (presumably meaning no compression -- confirm against
            _getCodec).

    Returns:
        A WriterFile wrapping the writer obtained from the path.
    """
    writer = DatasetPath(happy.getJobConf(), path).getWriter(_getCodec(compressiontype))
    return WriterFile(writer)
예제 #12
파일: dfs.py 프로젝트: tristanbuckner/happy
def openMapDir(path):
    """
    Opens a MapDir map over a directory of MapFiles.

    Args:
        path: Directory of MapFiles, looked up on the filesystem that
            getFileSystem() returns by default.

    Returns:
        The object returned by PyMapDir.openMapDir.
    """
    filesystem = getFileSystem()
    return PyMapDir.openMapDir(filesystem, path, happy.getJobConf())
예제 #13
파일: dfs.py 프로젝트: tristanbuckner/happy
def exists(path):
    """
    Returns True if this path exists.

    Args:
        path: DFS path to check.
    """
    target = DatasetPath(happy.getJobConf(), path)
    return target.exists()
예제 #14
파일: dfs.py 프로젝트: tristanbuckner/happy
def fileStatus(path):
    """
    Returns the org.apache.hadoop.fs.FileStatus object for this path.

    Args:
        path: DFS path to inspect.
    """
    target = DatasetPath(happy.getJobConf(), path)
    return target.getFileStatus()