def main(args):
    """Configure and launch the word-count MapReduce job.

    Usage: wordcount [-m <num_maps>] [-r <num_reduces>] <input> <output>
    """
    conf = JobConf(WordCountMap)
    conf.setJobName("wordcount")
    conf.setOutputKeyClass(Text)
    conf.setOutputValueClass(IntWritable)
    conf.setMapperClass(WordCountMap)
    conf.setCombinerClass(Summer)
    conf.setReducerClass(Summer)
    try:
        opts, remaining = getopt.getopt(args[1:], "m:r:")
    except getopt.GetoptError:
        printUsage(1)
    # Exactly two positional arguments are required: input and output paths.
    if len(remaining) != 2:
        printUsage(1)
    for opt, val in opts:
        if opt == "-m":
            conf.setNumMapTasks(int(val))
        elif opt == "-r":
            conf.setNumReduceTasks(int(val))
    conf.setInputPath(Path(remaining[0]))
    conf.setOutputPath(Path(remaining[1]))
    JobClient.runJob(conf)
def main(args): if len(args) < 6: printUsage(1); inDir = args[1]; outDir = args[2]; numOfReducers = int(args[3]); theInputFormat = args[4]; specFile = args[5]; print "numOfReducers: ", numOfReducers, "theInputFormat: ", theInputFormat, "specFile: ", specFile conf = JobConf(AbacusMapper); conf.setJobName("recordcount"); conf.addDefaultResource(Path(specFile)); if theInputFormat=="textinputformat": conf.setInputFormat(TextInputFormat); else: conf.setInputFormat(SequenceFileInputFormat); conf.setOutputFormat(TextOutputFormat); conf.setMapOutputKeyClass(Text); conf.setMapOutputValueClass(Text); conf.setOutputKeyClass(Text); conf.setOutputValueClass(Text); conf.setNumMapTasks(1); conf.setNumReduceTasks(numOfReducers); conf.setMapperClass(AbacusMapper); conf.setCombinerClass(AbacusCombiner); conf.setReducerClass(AbacusReducer); conf.setInputPath(Path(args[1])) conf.setOutputPath(Path(args[2])) JobClient.runJob(conf);
def hdfs_folder_exists(self, folder):
    """Return True if `folder` exists on HDFS and is a directory.

    Returns False on any failure (missing path, permission error, etc.).
    """
    path = Path(folder)
    fs = path.getFileSystem(Configuration())
    try:
        status = fs.getFileStatus(path)
        # TODO: there could be problems if it exists but is a simple file
        return status.isDir()
    except:
        # NOTE(review): bare except may be deliberate — under Jython it also
        # catches Java exceptions (e.g. FileNotFoundException) that a plain
        # `except Exception` would miss; confirm before narrowing.
        return False
def createInputFile(self, fs, fileName, input_data):
    """Create `fileName` on `fs` and write each entry of `input_data` as a line.

    Raises IOException if the file already exists on the minicluster.
    """
    if fs.exists(Path(fileName)):
        raise IOException("File " + fileName + " already exists on the minicluster")
    stream = fs.create(Path(fileName))
    pw = PrintWriter(OutputStreamWriter(stream, "UTF-8"))
    try:
        # Iterate directly instead of indexing via xrange(len(...)).
        for line in input_data:
            pw.println(line)
    finally:
        # Always release the writer (and underlying HDFS stream),
        # even if a write fails.
        pw.close()
def mv(self, srcfpath, trgfpath):
    """Rename/move `srcfpath` to `trgfpath` within HDFS; logs on failure."""
    try:
        sp = Path(srcfpath)
        tp = Path(trgfpath)
        # Needs work...
        self.fsHd.rename(sp, tp)
    except JException as ex:
        # Bug fix: the original formatted the undefined name `fpath`,
        # which raised NameError and masked the real exception.
        self.logger.error("Exception in HdfsUtil.mv({} -> {}): ex[{}]".format(
            srcfpath, trgfpath, ex))
def _open_write(self, path):
    """Open a SequenceFile writer at `path`, creating parent dirs first."""
    target = Path(path)
    # Ensure the destination directory exists before creating the writer.
    self._fs.mkdirs(target.getParent())
    writer = SequenceFile.createWriter(
        self._fs, self._conf, target,
        self._key.getClass(), self._value.getClass())
    return WrappedWriter(writer, self._key, self._value)
def __init__(self, hdfsCluster):
    """Build a Hadoop Configuration for `hdfsCluster` and open its FileSystem."""
    self.logger = Logger.getLogger("Hdfs")
    # self.logger.setLevel(Level.DEBUG)
    coreSite = "/etc/hadoop/conf/core-site.xml"
    hdfsSite = "/etc/hadoop/conf/hdfs-site.xml"
    # (removed the dead no-op self-assignment `hdfsCluster = hdfsCluster`)
    self.cHdfs = Configuration()
    self.cHdfs.addResource(Path(coreSite))
    self.cHdfs.addResource(Path(hdfsSite))
    # Point the config at the requested cluster before opening the FS handle.
    self.cHdfs.set("fs.defaultFS", hdfsCluster)
    self.fileSystem = FileSystem.get(self.cHdfs)
    self.fileUtil = FileUtil()
def lsIterator(self, fpath):
    """Return an iterator over files and dirs directly under `fpath`
    (no recursion)."""
    self.lsListIterator = self.fsHd.listLocatedStatus(Path(fpath))
    return self.lsListIterator
def lsFileIterator(self, fpath, recurse=False):
    """Return an iterator over files (only) under `fpath`, optionally
    descending into subdirectories when `recurse` is True."""
    self.fileListIterator = self.fsHd.listFiles(Path(fpath), recurse)
    return self.fileListIterator
def isDir(self, fpath):
    """Return True if `fpath` is a directory on HDFS.

    Logs and returns None if the check raises a Java exception.
    """
    try:
        return self.fsHd.isDirectory(Path(fpath))
    except JException as ex:
        self.logger.error("Exception in HdfsUtil.isDir({}): ex[{}]".format(
            fpath, ex))
def setOwner(self, fpath, user, group):
    """Set the owning user and group of `fpath`; logs on failure."""
    try:
        return self.fsHd.setOwner(Path(fpath), user, group)
    except JException as ex:
        self.logger.error(
            "Exception in HdfsUtil.setOwner({}): ex[{}]".format(fpath, ex))
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.

    If we specified a relative path, prepend it with the home folder of the
    user on HDFS. If we are running in local mode, don't do anything.

    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.running_mode == "hadoop":
        # A path is already absolute if it carries a scheme or starts at root.
        is_absolute = (output_folder != "" and
                       (output_folder[0:5] in ("hdfs:", "file:") or
                        output_folder.startswith("/")))
        if not is_absolute:
            fs = Path("/").getFileSystem(Configuration())
            return fs.getHomeDirectory().toString() + "/" + output_folder
    return output_folder
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.

    Only if we specified a relative path and no scheme, prepend it with the
    home folder of the user on HDFS. This behavior is similar to how
    "hadoop fs" works. If we are running in local mode, don't do anything.

    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.config['pycascading.running_mode'] == 'hadoop':
        # str.startswith accepts a tuple, replacing the any(map(...)) chain.
        schemes = ('hdfs:', 'file:', 's3:', 's3n:', '/')
        if not output_folder.startswith(schemes):
            fs = Path('/').getFileSystem(Configuration())
            home_folder = fs.getHomeDirectory().toString()
            return home_folder + '/' + output_folder
    return output_folder
def touch(self, fpath):
    """Create an empty file at `fpath` on HDFS; logs on failure."""
    try:
        fp = Path(fpath)
        # Renamed from `os`, which shadowed the standard `os` module name.
        stream = self.fsHd.create(fp)
        stream.close()
    except JException as ex:
        self.logger.error("Exception in HdfsUtil.touch({}): ex[{}]".format(
            fpath, ex))
def getFileStat(self, fpath):
    """Return the FileStatus for `fpath`.

    Logs and returns None if the lookup raises a Java exception.
    """
    try:
        return self.fsHd.getFileStatus(Path(fpath))
    except JException as ex:
        self.logger.error(
            "Exception in HdfsUtil.getFileStat({}): ex[{}]".format(
                fpath, ex))
def setPerms(self, fpath, perms):
    """Set the permissions of `fpath`.

    `perms` is an octal permission string such as "755"; logs on failure.
    """
    try:
        fp = Path(fpath)
        fsPerm = FsPermission(perms)
        # Bug fix: Hadoop's FileSystem has no setPerms() method — the call
        # is setPermission(), and it takes the FsPermission object that the
        # original built but never used.
        return self.fsHd.setPermission(fp, fsPerm)
    except JException as ex:
        self.logger.error(
            "Exception in HdfsUtil.setPerms({}): ex[{}]".format(fpath, ex))
def ls(self, fpath):
    """List every entry in `fpath` at once.

    The resulting list can be very long and memory-hungry; prefer
    lsIterator() or lsFileIterator() to keep memory usage down.
    """
    self.fileList = self.fsHd.listStatus(Path(fpath))
    return self.fileList
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.

    If we specified a relative path, prepend it with the home folder of the
    user on HDFS. If we are running in local mode, don't do anything.

    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.running_mode == 'hadoop':
        # Expand when empty, or when there is neither a scheme nor a root.
        needs_home = (output_folder == '' or
                      (output_folder[0:5] not in ('hdfs:', 'file:') and
                       not output_folder.startswith('/')))
        if needs_home:
            home_folder = \
                Path('/').getFileSystem(Configuration()).getHomeDirectory()
            return home_folder.toString() + '/' + output_folder
    return output_folder
def setRep(self, fpath, replication):
    """Set the replication factor of `fpath`; logs on failure."""
    try:
        return self.fsHd.setReplication(Path(fpath), replication)
    except JException as ex:
        self.logger.error(
            "Exception in HdfsUtil.setReplication({}): ex[{}]".format(
                fpath, ex))
def mkdir(self, fpath, perms=755):
    """Create directory `fpath` with permissions `perms` (e.g. 755).

    Logs an error if creation fails or raises.
    """
    try:
        p = Path(fpath)
        permObj = FsPermission("{}".format(perms))
        if not self.fsHd.mkdirs(p, permObj):
            # Bug fix: this branch formatted the undefined name `ex`,
            # which raised NameError instead of logging the failure.
            self.logger.error(
                "HdfsUtil.mkdir({}): Failed to create dir.".format(fpath))
    except JException as ex:
        self.logger.error("Exception in HdfsUtil.mkdir({}): ex[{}]".format(
            fpath, ex))
def cp(self, srcfpath, trgfpath):
    "Copy data within the current HDFS."
    try:
        fileList = self.fsHd.globStatus(Path(srcfpath))
        if fileList is None or len(fileList) == 0:
            # Bug fix: the original had a bare `None` expression here, so
            # execution fell through and copy() ran with an empty source list.
            self.logger.error(
                "HdfsUtil.cp({}): no files matched the source glob.".format(
                    srcfpath))
            return
        sp = [status.getPath() for status in fileList]
        sfs = FileSystem.newInstance(self.hdfs.cHdfs)
        tfs = FileSystem.newInstance(self.hdfs.cHdfs)
        tp = Path(trgfpath)
        delSrc = False
        overWrite = True
        self.hdfs.fileUtil.copy(sfs, sp, tfs, tp, delSrc, overWrite,
                                self.hdfs.cHdfs)
    except JException as ex:
        self.logger.error(
            "Exception in HdfsUtil.cp({} -> {}): ex[{}]".format(
                srcfpath, trgfpath, ex))
def exists(self, fpath):
    """Return True if `fpath` matches at least one entry on HDFS."""
    try:
        # (removed unused `sp = []` and the stale copy-pasted comment)
        fileList = self.fsHd.globStatus(Path(fpath))
        # globStatus yields None (or an empty array) when nothing matched.
        if fileList is None or len(fileList) == 0:
            return False
        return True
    except JException as ex:
        self.logger.error(
            "Exception in HdfsUtil.exists({}): ex[{}]".format(fpath, ex))
        # Explicit False instead of the original implicit None; both are
        # falsy, so callers are unaffected.
        return False
def openRead(self, fPath):
    """Open `fPath` for reading, transparently gunzipping *.gz files.

    Returns a BufferedReader over the (possibly decompressed) stream.
    """
    fpHdfs = Path(fPath)
    fsInput = self.fsHd.open(fpHdfs)
    # Bug fix: the original matched the undefined name `hst` (NameError);
    # the gzip check must inspect the requested path string.
    if re.search(r'.*\.gz', fPath) is None:
        reader = BufferedReader(InputStreamReader(fsInput))
    else:
        # The file stream is in GZip format...
        reader = BufferedReader(InputStreamReader(
            GZIPInputStream(fsInput)))
    return reader
def testStore(self):
    """Run the Pig script with STORE enabled and verify output was written."""
    # Security/robustness fix: mkdtemp actually creates a fresh private
    # directory, avoiding the classic mktemp() name-race vulnerability.
    from tempfile import mkdtemp
    tempdir = mkdtemp()
    outfile = tempdir + '/top_3_queries'
    args = [
        "n=3",
        "reducers=1",
        "input=" + self.INPUT_FILE,
        "output=" + outfile,
    ]
    proxy = PigProxy.from_file(self.PIG_SCRIPT, args)
    # By default all STORE and DUMP commands are removed
    proxy.unoverride("STORE")
    proxy.run_script()
    cluster = Cluster(proxy.pig.getPigContext())
    # delete() returns True only if the output path existed.
    self.assert_(cluster.delete(Path(outfile)))
def __init__(self, hdfsCluster, fpath): self.hdfs = Hdfs(hdfsCluster) self.fsHd = self.hdfs.fileSystem fpHdfs = Path(fpath) fsInput = self.fsHd.open(fpHdfs) # The file has text so we want to use read the input stream via the BufferedReader. reader = BufferedReader(InputStreamReader(fsInput)) self.lineCount = 0 self.lines = [] line = reader.readLine() while line is not None: # print line self.lines.append(line) self.lineCount = self.lineCount + 1 if ((self.lineCount % 1000) == 0): print self.lineCount line = reader.readLine()
def rm(self, fpath, **kwargs):
    """Delete everything matching the glob `fpath`.

    Keyword args:
        recurse -- delete directories recursively (default False).
    """
    try:
        # (removed unused `sp = []`)
        fileList = self.fsHd.globStatus(Path(fpath))
        if fileList is None:
            # self.logger.warn("No Files found in: [{}]".format(fpath))
            return
        # Idiomatic default lookup instead of an if/else on `in kwargs`.
        recurse = kwargs.get('recurse', False)
        for sfp in fileList:
            self.fsHd.delete(sfp.getPath(), recurse)
    except JException as ex:
        self.logger.error("Exception in HdfsUtil.rm({}): ex[{}]".format(
            fpath, ex))
# Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from org.apache.hadoop.fs import Path # Test for PIG-1824 p = Path('foo') @outputSchemaFunction("squareSchema") def square(num): if num == None: return None return ((num) * (num)) @schemaFunction("squareSchema") def squareSchema(input): return input @outputSchema("word:chararray")
def copyContentFromLocalFile(self, content, dest_path, overwrite = True):
    """Write `content` to `dest_path` on HDFS, replacing any existing file
    when `overwrite` is set."""
    target = Path(dest_path)
    fs = target.getFileSystem(self.configuration)
    if overwrite and fs.exists(target):
        fs.delete(target, True)
    self.createInputFile(fs, dest_path, content)
def copyContentFromLocalFile(self, content, dest_path, overwrite=True):
    """Copy `content` into HDFS at `dest_path`, optionally clobbering an
    existing file first."""
    hdfs_path = Path(dest_path)
    filesystem = hdfs_path.getFileSystem(self.configuration)
    if overwrite:
        if filesystem.exists(hdfs_path):
            filesystem.delete(hdfs_path, True)
    self.createInputFile(filesystem, dest_path, content)
from java.util import UUID
from org.apache.hadoop.fs import Path

# NOTE(review): `fs` and `fsh` are not defined in this script — presumably
# injected by the hosting script runner (FileSystem and FsShell wrappers);
# confirm against the embedding application.
print "Home dir is " + str(fs.homeDirectory)
print "Work dir is " + str(fs.workingDirectory)
print "/user exists " + str(fs.exists("/user"))
# Copy a local properties file to a uniquely named HDFS file.
name = UUID.randomUUID().toString()
scriptName = "src/test/resources/test.properties"
fs.copyFromLocalFile(scriptName, name)
print Path(name).makeQualified(fs)
# use the shell
dir = "script-dir/"
if not fsh.test(dir):
    # NOTE(review): the original source arrived with flattened indentation;
    # only the mkdir is assumed conditional — confirm against upstream.
    fsh.mkdir(dir)
fsh.cp(name, dir)
fsh.chmodr(700, dir)
print "File content is " + str(fsh.cat(dir + name))
print str(fsh.ls(dir))
fsh.rmr(dir)
fs.getLength(name)
import sys import time startTime = time.time() if len(sys.argv) != 2: raise sys.argv[0] + ' <basedir>' # Get reference to the Hadoop FileSystem object. Everything we do in # this script that interacts with HDFS is through this object. fs = FileSystem.get(Configuration()) # Make sure the requested collection exists. collection = sys.argv[1] collectionDir = Path(collection) if not fs.exists(collectionDir): print '\nERROR: no collection directory: %s' % collectionDir System.exit(1) # Check for "guard" file. Like a semaphore, ensures that we don't try # to update this collection while it's in the middle of being updated. # Since file creation in HDFS is atomic, we don't check for the existence # of the guardFile, rather we try to create it. If the file already exists # then fs.createNewFile() will return False guardFile = Path(collectionDir, '_updating') if not fs.createNewFile(guardFile): print '\nERROR: collection update already in progress: %s' % guardFile System.exit(1)
def openWrite(self, fPath):
    """Create (or overwrite) `fPath` on HDFS and return its output stream."""
    return self.fsHd.create(Path(fPath))