def main(args):
    """Configure and launch the word-count MapReduce job.

    Usage: wordcount [-m <num_maps>] [-r <num_reduces>] <input> <output>
    """
    conf = JobConf(WordCountMap)
    conf.setJobName("wordcount")
    conf.setOutputKeyClass(Text)
    conf.setOutputValueClass(IntWritable)
    conf.setMapperClass(WordCountMap)
    conf.setCombinerClass(Summer)
    conf.setReducerClass(Summer)
    try:
        opts, remaining = getopt.getopt(args[1:], "m:r:")
    except getopt.GetoptError:
        printUsage(1)
    # Exactly two positional arguments are required: input and output paths.
    if len(remaining) != 2:
        printUsage(1)
    for opt, val in opts:
        if opt == "-m":
            conf.setNumMapTasks(int(val))
        elif opt == "-r":
            conf.setNumReduceTasks(int(val))
    conf.setInputPath(Path(remaining[0]))
    conf.setOutputPath(Path(remaining[1]))
    JobClient.runJob(conf)
def main(args): if len(args) < 6: printUsage(1); inDir = args[1]; outDir = args[2]; numOfReducers = int(args[3]); theInputFormat = args[4]; specFile = args[5]; print "numOfReducers: ", numOfReducers, "theInputFormat: ", theInputFormat, "specFile: ", specFile conf = JobConf(AbacusMapper); conf.setJobName("recordcount"); conf.addDefaultResource(Path(specFile)); if theInputFormat=="textinputformat": conf.setInputFormat(TextInputFormat); else: conf.setInputFormat(SequenceFileInputFormat); conf.setOutputFormat(TextOutputFormat); conf.setMapOutputKeyClass(Text); conf.setMapOutputValueClass(Text); conf.setOutputKeyClass(Text); conf.setOutputValueClass(Text); conf.setNumMapTasks(1); conf.setNumReduceTasks(numOfReducers); conf.setMapperClass(AbacusMapper); conf.setCombinerClass(AbacusCombiner); conf.setReducerClass(AbacusReducer); conf.setInputPath(Path(args[1])) conf.setOutputPath(Path(args[2])) JobClient.runJob(conf);
def hdfs_folder_exists(self, folder):
    """Return True if `folder` exists on HDFS and is a directory.

    Returns False on any failure (missing path, permission error, etc.).
    """
    path = Path(folder)
    fs = path.getFileSystem(Configuration())
    try:
        status = fs.getFileStatus(path)
        # TODO: there could be problems if it exists but is a simple file
        return status.isDir()
    except:
        # NOTE(review): bare except may be deliberate — under Jython it also
        # catches Java exceptions (e.g. FileNotFoundException) that a plain
        # `except Exception` would miss; confirm before narrowing.
        return False
def createInputFile(self, fs, fileName, input_data):
    """Create `fileName` on `fs` and write each entry of `input_data` as a line.

    Raises IOException if the file already exists on the minicluster.
    """
    if fs.exists(Path(fileName)):
        raise IOException("File " + fileName + " already exists on the minicluster")
    stream = fs.create(Path(fileName))
    pw = PrintWriter(OutputStreamWriter(stream, "UTF-8"))
    try:
        # Iterate directly instead of indexing via xrange(len(...)).
        for line in input_data:
            pw.println(line)
    finally:
        # Always release the writer (and underlying HDFS stream),
        # even if a write fails.
        pw.close()
def mv(self, srcfpath, trgfpath):
    """Rename/move `srcfpath` to `trgfpath` within HDFS; logs on failure."""
    try:
        sp = Path(srcfpath)
        tp = Path(trgfpath)
        # Needs work...
        self.fsHd.rename(sp, tp)
    except JException as ex:
        # Bug fix: the original formatted the undefined name `fpath`,
        # which raised NameError and masked the real exception.
        self.logger.error("Exception in HdfsUtil.mv({} -> {}): ex[{}]".format(
            srcfpath, trgfpath, ex))
def _open_write(self, path):
    """Open a SequenceFile writer at `path`, creating parent dirs first."""
    target = Path(path)
    # Ensure the destination directory exists before creating the writer.
    self._fs.mkdirs(target.getParent())
    writer = SequenceFile.createWriter(
        self._fs, self._conf, target,
        self._key.getClass(), self._value.getClass())
    return WrappedWriter(writer, self._key, self._value)
def __init__(self, hdfsCluster):
    """Build a Hadoop Configuration for `hdfsCluster` and open its FileSystem."""
    self.logger = Logger.getLogger("Hdfs")
    # self.logger.setLevel(Level.DEBUG)
    coreSite = "/etc/hadoop/conf/core-site.xml"
    hdfsSite = "/etc/hadoop/conf/hdfs-site.xml"
    # (removed the dead no-op self-assignment `hdfsCluster = hdfsCluster`)
    self.cHdfs = Configuration()
    self.cHdfs.addResource(Path(coreSite))
    self.cHdfs.addResource(Path(hdfsSite))
    # Point the config at the requested cluster before opening the FS handle.
    self.cHdfs.set("fs.defaultFS", hdfsCluster)
    self.fileSystem = FileSystem.get(self.cHdfs)
    self.fileUtil = FileUtil()
def lsIterator(self, fpath):
    """Return an iterator over files and dirs directly under `fpath`
    (no recursion)."""
    self.lsListIterator = self.fsHd.listLocatedStatus(Path(fpath))
    return self.lsListIterator
def lsFileIterator(self, fpath, recurse=False):
    """Return an iterator over files (only) under `fpath`, optionally
    descending into subdirectories when `recurse` is True."""
    self.fileListIterator = self.fsHd.listFiles(Path(fpath), recurse)
    return self.fileListIterator
def isDir(self, fpath):
    """Return True if `fpath` is a directory on HDFS.

    Logs and returns None if the check raises a Java exception.
    """
    try:
        return self.fsHd.isDirectory(Path(fpath))
    except JException as ex:
        self.logger.error("Exception in HdfsUtil.isDir({}): ex[{}]".format(
            fpath, ex))
def setOwner(self, fpath, user, group):
    """Set the owning user and group of `fpath`; logs on failure."""
    try:
        return self.fsHd.setOwner(Path(fpath), user, group)
    except JException as ex:
        self.logger.error(
            "Exception in HdfsUtil.setOwner({}): ex[{}]".format(fpath, ex))
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.

    If we specified a relative path, prepend it with the home folder of the
    user on HDFS. If we are running in local mode, don't do anything.

    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.running_mode == "hadoop":
        # A path is already absolute if it carries a scheme or starts at root.
        is_absolute = (output_folder != "" and
                       (output_folder[0:5] in ("hdfs:", "file:") or
                        output_folder.startswith("/")))
        if not is_absolute:
            fs = Path("/").getFileSystem(Configuration())
            return fs.getHomeDirectory().toString() + "/" + output_folder
    return output_folder
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.

    Only if we specified a relative path and no scheme, prepend it with the
    home folder of the user on HDFS. This behavior is similar to how
    "hadoop fs" works. If we are running in local mode, don't do anything.

    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.config['pycascading.running_mode'] == 'hadoop':
        # str.startswith accepts a tuple, replacing the any(map(...)) chain.
        schemes = ('hdfs:', 'file:', 's3:', 's3n:', '/')
        if not output_folder.startswith(schemes):
            fs = Path('/').getFileSystem(Configuration())
            home_folder = fs.getHomeDirectory().toString()
            return home_folder + '/' + output_folder
    return output_folder
def touch(self, fpath):
    """Create an empty file at `fpath` on HDFS; logs on failure."""
    try:
        fp = Path(fpath)
        # Renamed from `os`, which shadowed the standard `os` module name.
        stream = self.fsHd.create(fp)
        stream.close()
    except JException as ex:
        self.logger.error("Exception in HdfsUtil.touch({}): ex[{}]".format(
            fpath, ex))
def getFileStat(self, fpath):
    """Return the FileStatus for `fpath`.

    Logs and returns None if the lookup raises a Java exception.
    """
    try:
        return self.fsHd.getFileStatus(Path(fpath))
    except JException as ex:
        self.logger.error(
            "Exception in HdfsUtil.getFileStat({}): ex[{}]".format(
                fpath, ex))
def setPerms(self, fpath, perms):
    """Set the permissions of `fpath`.

    `perms` is an octal permission string such as "755"; logs on failure.
    """
    try:
        fp = Path(fpath)
        fsPerm = FsPermission(perms)
        # Bug fix: Hadoop's FileSystem has no setPerms() method — the call
        # is setPermission(), and it takes the FsPermission object that the
        # original built but never used.
        return self.fsHd.setPermission(fp, fsPerm)
    except JException as ex:
        self.logger.error(
            "Exception in HdfsUtil.setPerms({}): ex[{}]".format(fpath, ex))
def ls(self, fpath):
    """List every entry in `fpath` at once.

    The resulting list can be very long and memory-hungry; prefer
    lsIterator() or lsFileIterator() to keep memory usage down.
    """
    self.fileList = self.fsHd.listStatus(Path(fpath))
    return self.fileList
def expand_path_with_home(output_folder):
    """Prepend the home folder to a relative location on HDFS if necessary.

    If we specified a relative path, prepend it with the home folder of the
    user on HDFS. If we are running in local mode, don't do anything.

    Arguments:
    output_folder -- the absolute or relative path of the output HDFS folder
    """
    import pycascading.pipe
    if pycascading.pipe.running_mode == 'hadoop':
        # Expand when empty, or when there is neither a scheme nor a root.
        needs_home = (output_folder == '' or
                      (output_folder[0:5] not in ('hdfs:', 'file:') and
                       not output_folder.startswith('/')))
        if needs_home:
            home_folder = \
                Path('/').getFileSystem(Configuration()).getHomeDirectory()
            return home_folder.toString() + '/' + output_folder
    return output_folder
def setRep(self, fpath, replication):
    """Set the replication factor of `fpath`; logs on failure."""
    try:
        return self.fsHd.setReplication(Path(fpath), replication)
    except JException as ex:
        self.logger.error(
            "Exception in HdfsUtil.setReplication({}): ex[{}]".format(
                fpath, ex))
def mkdir(self, fpath, perms=755):
    """Create directory `fpath` with permissions `perms` (e.g. 755).

    Logs an error if creation fails or raises.
    """
    try:
        p = Path(fpath)
        permObj = FsPermission("{}".format(perms))
        if not self.fsHd.mkdirs(p, permObj):
            # Bug fix: this branch formatted the undefined name `ex`,
            # which raised NameError instead of logging the failure.
            self.logger.error(
                "HdfsUtil.mkdir({}): Failed to create dir.".format(fpath))
    except JException as ex:
        self.logger.error("Exception in HdfsUtil.mkdir({}): ex[{}]".format(
            fpath, ex))
def cp(self, srcfpath, trgfpath):
    "Copy data within the current HDFS."
    try:
        fileList = self.fsHd.globStatus(Path(srcfpath))
        if fileList is None or len(fileList) == 0:
            # Bug fix: the original had a bare `None` expression here, so
            # execution fell through and copy() ran with an empty source list.
            self.logger.error(
                "HdfsUtil.cp({}): no files matched the source glob.".format(
                    srcfpath))
            return
        sp = [status.getPath() for status in fileList]
        sfs = FileSystem.newInstance(self.hdfs.cHdfs)
        tfs = FileSystem.newInstance(self.hdfs.cHdfs)
        tp = Path(trgfpath)
        delSrc = False
        overWrite = True
        self.hdfs.fileUtil.copy(sfs, sp, tfs, tp, delSrc, overWrite,
                                self.hdfs.cHdfs)
    except JException as ex:
        self.logger.error(
            "Exception in HdfsUtil.cp({} -> {}): ex[{}]".format(
                srcfpath, trgfpath, ex))
def exists(self, fpath):
    """Return True if `fpath` matches at least one entry on HDFS."""
    try:
        # (removed unused `sp = []` and the stale copy-pasted comment)
        fileList = self.fsHd.globStatus(Path(fpath))
        # globStatus yields None (or an empty array) when nothing matched.
        if fileList is None or len(fileList) == 0:
            return False
        return True
    except JException as ex:
        self.logger.error(
            "Exception in HdfsUtil.exists({}): ex[{}]".format(fpath, ex))
        # Explicit False instead of the original implicit None; both are
        # falsy, so callers are unaffected.
        return False
def openRead(self, fPath):
    """Open `fPath` for reading, transparently gunzipping *.gz files.

    Returns a BufferedReader over the (possibly decompressed) stream.
    """
    fpHdfs = Path(fPath)
    fsInput = self.fsHd.open(fpHdfs)
    # Bug fix: the original matched the undefined name `hst` (NameError);
    # the gzip check must inspect the requested path string.
    if re.search(r'.*\.gz', fPath) is None:
        reader = BufferedReader(InputStreamReader(fsInput))
    else:
        # The file stream is in GZip format...
        reader = BufferedReader(InputStreamReader(
            GZIPInputStream(fsInput)))
    return reader
def testStore(self):
    """Run the Pig script with STORE enabled and verify output was written."""
    # Security/robustness fix: mkdtemp actually creates a fresh private
    # directory, avoiding the classic mktemp() name-race vulnerability.
    from tempfile import mkdtemp
    tempdir = mkdtemp()
    outfile = tempdir + '/top_3_queries'
    args = [
        "n=3",
        "reducers=1",
        "input=" + self.INPUT_FILE,
        "output=" + outfile,
    ]
    proxy = PigProxy.from_file(self.PIG_SCRIPT, args)
    # By default all STORE and DUMP commands are removed
    proxy.unoverride("STORE")
    proxy.run_script()
    cluster = Cluster(proxy.pig.getPigContext())
    # delete() returns True only if the output path existed.
    self.assert_(cluster.delete(Path(outfile)))
def __init__(self, hdfsCluster, fpath): self.hdfs = Hdfs(hdfsCluster) self.fsHd = self.hdfs.fileSystem fpHdfs = Path(fpath) fsInput = self.fsHd.open(fpHdfs) # The file has text so we want to use read the input stream via the BufferedReader. reader = BufferedReader(InputStreamReader(fsInput)) self.lineCount = 0 self.lines = [] line = reader.readLine() while line is not None: # print line self.lines.append(line) self.lineCount = self.lineCount + 1 if ((self.lineCount % 1000) == 0): print self.lineCount line = reader.readLine()
def rm(self, fpath, **kwargs):
    """Delete everything matching the glob `fpath`.

    Keyword args:
        recurse -- delete directories recursively (default False).
    """
    try:
        # (removed unused `sp = []`)
        fileList = self.fsHd.globStatus(Path(fpath))
        if fileList is None:
            # self.logger.warn("No Files found in: [{}]".format(fpath))
            return
        # Idiomatic default lookup instead of an if/else on `in kwargs`.
        recurse = kwargs.get('recurse', False)
        for sfp in fileList:
            self.fsHd.delete(sfp.getPath(), recurse)
    except JException as ex:
        self.logger.error("Exception in HdfsUtil.rm({}): ex[{}]".format(
            fpath, ex))
# Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from org.apache.hadoop.fs import Path # Test for PIG-1824 p = Path('foo') @outputSchemaFunction("squareSchema") def square(num): if num == None: return None return ((num) * (num)) @schemaFunction("squareSchema") def squareSchema(input): return input @outputSchema("word:chararray")
def copyContentFromLocalFile(self, content, dest_path, overwrite = True):
    """Write `content` to `dest_path` on HDFS, replacing any existing file
    when `overwrite` is set."""
    target = Path(dest_path)
    fs = target.getFileSystem(self.configuration)
    if overwrite and fs.exists(target):
        fs.delete(target, True)
    self.createInputFile(fs, dest_path, content)
def copyContentFromLocalFile(self, content, dest_path, overwrite=True):
    """Copy `content` into HDFS at `dest_path`, optionally clobbering an
    existing file first."""
    hdfs_path = Path(dest_path)
    filesystem = hdfs_path.getFileSystem(self.configuration)
    if overwrite:
        if filesystem.exists(hdfs_path):
            filesystem.delete(hdfs_path, True)
    self.createInputFile(filesystem, dest_path, content)
from java.util import UUID
from org.apache.hadoop.fs import Path

# NOTE(review): `fs` and `fsh` are not defined in this script — presumably
# injected by the hosting script runner (FileSystem and FsShell wrappers);
# confirm against the embedding application.
print "Home dir is " + str(fs.homeDirectory)
print "Work dir is " + str(fs.workingDirectory)
print "/user exists " + str(fs.exists("/user"))
# Copy a local properties file to a uniquely named HDFS file.
name = UUID.randomUUID().toString()
scriptName = "src/test/resources/test.properties"
fs.copyFromLocalFile(scriptName, name)
print Path(name).makeQualified(fs)
# use the shell
dir = "script-dir/"
if not fsh.test(dir):
    # NOTE(review): the original source arrived with flattened indentation;
    # only the mkdir is assumed conditional — confirm against upstream.
    fsh.mkdir(dir)
fsh.cp(name, dir)
fsh.chmodr(700, dir)
print "File content is " + str(fsh.cat(dir + name))
print str(fsh.ls(dir))
fsh.rmr(dir)
fs.getLength(name)
import sys import time startTime = time.time() if len(sys.argv) != 2: raise sys.argv[0] + ' <basedir>' # Get reference to the Hadoop FileSystem object. Everything we do in # this script that interacts with HDFS is through this object. fs = FileSystem.get(Configuration()) # Make sure the requested collection exists. collection = sys.argv[1] collectionDir = Path(collection) if not fs.exists(collectionDir): print '\nERROR: no collection directory: %s' % collectionDir System.exit(1) # Check for "guard" file. Like a semaphore, ensures that we don't try # to update this collection while it's in the middle of being updated. # Since file creation in HDFS is atomic, we don't check for the existence # of the guardFile, rather we try to create it. If the file already exists # then fs.createNewFile() will return False guardFile = Path(collectionDir, '_updating') if not fs.createNewFile(guardFile): print '\nERROR: collection update already in progress: %s' % guardFile System.exit(1)
def openWrite(self, fPath):
    """Create (or overwrite) `fPath` on HDFS and return its output stream."""
    return self.fsHd.create(Path(fPath))