def CheckInputFile(self, inputfile, hdfsinputdir, outputdir, force, uncompress):
    """Ensure one input file exists on HDFS, uploading it if necessary.

    Args:
        inputfile: Local path of the input file (may end in ".gz").
        hdfsinputdir: HDFS directory that should contain the input.
        outputdir: Local scratch directory used when uncompressing.
        force: If true, re-upload even when the file already exists on HDFS.
        uncompress: If true, gunzip ".gz" inputs locally before uploading.

    Returns:
        A command-line fragment " --input <hdfs path>" naming the file on
        HDFS.  (Assumes pyutil.printError aborts the process — TODO confirm;
        if it returns, failure paths previously returned "" and now return
        the fragment as well.)
    """
    # Drop the ".gz" suffix when the uploaded copy will be uncompressed.
    if inputfile.endswith(".gz") and uncompress:
        input_filename = os.path.basename(inputfile).replace(".gz", "")
    else:
        input_filename = os.path.basename(inputfile)
    pyutil.printDebug(1, "Processing input " + input_filename + "\n")

    # Final HDFS location; also the value of the " --input" argument.
    # Built once here instead of re-concatenated in every branch.
    hdfsinputfile = hdfsinputdir + "/" + input_filename
    input_file_list = " --input " + hdfsinputfile

    if not self.CheckHDFSFile(hdfsinputfile) or force:
        # File is absent from HDFS (or refresh forced): (re)upload it.
        pyutil.printInfo("Regenerating HDFS input: " + hdfsinputfile)
        if not self.CheckHDir(hdfsinputdir):
            pyutil.runCommand(self.hadoopmkdir_ + hdfsinputdir)
        if inputfile.endswith(".gz") and uncompress:
            # Uncompress into local scratch, then move the result to HDFS.
            new_input = outputdir + "/" + input_filename
            unzipcmd = "gunzip -c " + inputfile + " > " + new_input
            if pyutil.runCommand(unzipcmd) != 0:
                pyutil.printError(12, "Unable to unzip file: " + inputfile)
            pyutil.runCommand(self.hadoopmove_ + new_input + " " + hdfsinputdir)
        else:
            # Already-uncompressed (or kept-compressed) file: put directly.
            pyutil.runCommand(self.hadoopput_ + inputfile + " " + hdfsinputdir)
        # Verify the upload actually landed on HDFS.
        if not self.CheckHDFSFile(hdfsinputfile):
            pyutil.printError(10, "Unable to create input on HDFS: " + hdfsinputfile)
    else:
        pyutil.printDebug(5, "Found file on HDFS: " + hdfsinputfile)
    return input_file_list
def RunMR(self, input_files, outputdir, reduce_tasks, reducer, mapper, mroptions):
    """Assemble and execute a Hadoop streaming MapReduce command.

    Args:
        input_files: Pre-built " --input ..." argument string (one or more
            fragments, e.g. from CheckInputFile).
        outputdir: HDFS output directory for the job.
        reduce_tasks: Number of reduce tasks; negative values leave the
            framework default (only consulted when a reducer is given).
        reducer: Reducer command, or a falsy value for a map-only job.
        mapper: Mapper command.
        mroptions: Optional extra flags placed right after the base
            streaming command, or a falsy value for none.

    Calls pyutil.printError with code 33 when the job command fails.
    """
    mr_str = self.hadoopmr_
    if mroptions:
        mr_str += mroptions + " "
    mr_str += self.hadooplibpath_ + input_files
    if not reducer:
        # Map-only job: explicitly disable the reduce phase.
        mr_str += " -numReduceTasks 0 --reducer None "
    else:
        if int(reduce_tasks) >= 0:
            mr_str += " -numReduceTasks " + str(reduce_tasks)
        mr_str += " --reducer " + reducer
    mr_str += " --output " + outputdir
    mr_str += " --mapper " + mapper
    pyutil.printInfo("Running MR on: " + input_files)
    if pyutil.runCommand(mr_str) != 0:
        # Separator added so the failing command is readable in the message.
        pyutil.printError(33, "Error running MR: " + mr_str)
"--streamingloc does not specify a valid jar files for the " + "streaming interface (checked: " + streamingloc) if (not os.path.isdir(options.refrbin) or not os.path.exists(options.refrbin + "/run-model")): optParse.error("--refrbin directory must be the Reranker Framework bin " + "direcotry. Checked: " + options.refrbin) ## Collect input filenames. filenames = [] for inputstring in options.inputlist: for tmpfile in inputstring.split(): filenames += glob.glob(tmpfile) for input in filenames: pyutil.printInfo("Input file: " + input) if (not os.path.exists(input)): pyutil.printError(130, "Input file not found: " + input) if (options.develdata and not os.path.exists(options.develdata)): pyutil.printError( 131, "Specified devel data file not found: " + options.develdata) ## Create output directory if it does not exist. if (not os.path.isdir(options.outputdir)): os.makedirs(options.outputdir) ## @var hdproc # HadoopInterface object used to process all Hadoop MR utils. hdproc = hadooputil.HadoopInterface(hadooproot, streamingloc, options.minsplitsize, options.tasktimeout,
"streaming interface (checked: " + streamingloc) if (not os.path.isdir(options.refrbin) or not os.path.exists(options.refrbin + "/run-model")): optParse.error("--refrbin directory must be the Reranker Framework bin " + "direcotry. Checked: " + options.refrbin) ## Collect input filenames. filenames = [] for inputstring in options.inputlist: for tmpfile in inputstring.split(): filenames += glob.glob(tmpfile) for input in filenames: pyutil.printInfo("Input file: " + input) if (not os.path.exists(input)): pyutil.printError(130, "Input file not found: " + input) if (options.develdata and not os.path.exists(options.develdata)): pyutil.printError(131, "Specified devel data file not found: " + options.develdata) ## Create output directory if it does not exist. if (not os.path.isdir(options.outputdir)): os.makedirs(options.outputdir) ## @var hdproc # HadoopInterface object used to process all Hadoop MR utils. hdproc = hadooputil.HadoopInterface(hadooproot, streamingloc, options.minsplitsize,