Example No. 1
 def CheckInputFile(self, inputfile, hdfsinputdir, outputdir, force, uncompress):
   """Check that the input exists, copy it to HDFS if necessary, and return
   the " --input <hdfs path>" argument fragment for the streaming command."""
   if inputfile.endswith(".gz") and uncompress:
     input_filename = os.path.basename(inputfile).replace(".gz", "")
   else:
     input_filename = os.path.basename(inputfile)
   pyutil.printDebug(1, "Processing input " + input_filename + "\n")
   hdfsinputfile = hdfsinputdir + "/" + input_filename
   if not self.CheckHDFSFile(hdfsinputfile) or force:
     pyutil.printInfo("Regenerating HDFS input: " + hdfsinputfile)
     if not self.CheckHDir(hdfsinputdir):
       pyutil.runCommand(self.hadoopmkdir_ + hdfsinputdir)
     if inputfile.endswith(".gz") and uncompress:
       # Uncompress locally first, then move the plain file to HDFS.
       new_input = outputdir + "/" + input_filename
       unzipcmd = "gunzip -c " + inputfile + " > " + new_input
       if pyutil.runCommand(unzipcmd) != 0:
         pyutil.printError(12, "Unable to unzip file: " + inputfile)
       pyutil.runCommand(self.hadoopmove_ + new_input + " " + hdfsinputdir)
     else:
       pyutil.runCommand(self.hadoopput_ + inputfile + " " + hdfsinputdir)
     if not self.CheckHDFSFile(hdfsinputfile):
       pyutil.printError(10, "Unable to create input on HDFS: " + hdfsinputfile)
   else:
     pyutil.printDebug(5, "Found file on HDFS: " + hdfsinputfile)
   return " --input " + hdfsinputfile
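
The string returned above is the " --input <hdfs path>" fragment that a driver script splices into the streaming command line. A minimal usage sketch, assuming hdproc is an already-constructed hadooputil.HadoopInterface and that hdfsinputdir, filenames, and the flag values are placeholders rather than names from this snippet:

# Hypothetical usage sketch: hdproc, hdfsinputdir, filenames, and
# options.outputdir are assumed to come from a driver script such as
# the one shown in Example No. 3.
input_args = ""
for inputfile in filenames:
  # Each call uploads the file (uncompressing .gz inputs) and returns
  # the " --input <hdfs path>" fragment for the streaming command.
  input_args += hdproc.CheckInputFile(inputfile, hdfsinputdir,
                                      options.outputdir, force=False,
                                      uncompress=True)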
Example No. 2
 def RunMR(self, input_files, outputdir, reduce_tasks, reducer, mapper, mroptions):
   """Assemble and run a Hadoop streaming MapReduce command."""
   mr_str = self.hadoopmr_
   if mroptions:
     mr_str += mroptions + " "
   mr_str += self.hadooplibpath_ + input_files
   if not reducer:
     # Map-only job: disable the reduce phase entirely.
     mr_str += " -numReduceTasks 0 --reducer None "
   else:
     if int(reduce_tasks) >= 0:
       mr_str += " -numReduceTasks " + str(reduce_tasks)
     mr_str += " --reducer " + reducer
   mr_str += " --output " + outputdir
   mr_str += " --mapper " + mapper
   pyutil.printInfo("Running MR on: " + input_files)
   if pyutil.runCommand(mr_str) != 0:
     pyutil.printError(33, "Error running MR: " + mr_str)
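
A short sketch of how CheckInputFile and RunMR might be chained by a driver script; the input filename, mapper command, and output path below are assumptions for illustration only:

# Hypothetical driver sketch; hdproc is assumed to be a configured
# hadooputil.HadoopInterface and hdfsinputdir an HDFS directory path.
input_args = hdproc.CheckInputFile("train.txt.gz", hdfsinputdir,
                                   options.outputdir, force=False,
                                   uncompress=True)
# Passing reducer=None selects the map-only branch
# ("-numReduceTasks 0 --reducer None") in RunMR above.
hdproc.RunMR(input_args, hdfsinputdir + "/mapped", 0, None,
             "python feature-mapper.py", None)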
Example No. 3
        "--streamingloc does not specify a valid jar files for the " +
        "streaming interface (checked: " + streamingloc)

if (not os.path.isdir(options.refrbin)
        or not os.path.exists(options.refrbin + "/run-model")):
    optParse.error("--refrbin directory must be the Reranker Framework bin " +
                   "directory.  Checked: " + options.refrbin)

## Collect input filenames.
filenames = []
for inputstring in options.inputlist:
    for tmpfile in inputstring.split():
        filenames += glob.glob(tmpfile)

for input in filenames:
    pyutil.printInfo("Input file: " + input)
    if (not os.path.exists(input)):
        pyutil.printError(130, "Input file not found: " + input)

if (options.develdata and not os.path.exists(options.develdata)):
    pyutil.printError(
        131, "Specified devel data file not found: " + options.develdata)

## Create output directory if it does not exist.
if (not os.path.isdir(options.outputdir)):
    os.makedirs(options.outputdir)

## @var hdproc
#  HadoopInterface object used to process all Hadoop MR utils.
hdproc = hadooputil.HadoopInterface(hadooproot, streamingloc,
                                    options.minsplitsize, options.tasktimeout,
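
The options values referenced throughout this fragment come from an optparse parser (only its error calls are visible above). A hedged sketch of how those options might be declared; the flag names, defaults, and help strings are assumptions, not taken from the original script:

# Hypothetical option declarations; the real script's flags may differ.
import optparse

optParse = optparse.OptionParser()
optParse.add_option("--refrbin", dest="refrbin",
                    help="Reranker Framework bin directory")
optParse.add_option("--input", dest="inputlist", action="append", default=[],
                    help="input file glob (may be repeated)")
optParse.add_option("--develdata", dest="develdata",
                    help="development data file")
optParse.add_option("--outputdir", dest="outputdir",
                    help="local output directory")
(options, args) = optParse.parse_args()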