def runHadoopStreamingJobSerially(input, output, mapperCommand, reducerCommand=None):
    """Emulate a Hadoop streaming job locally, running mapper and reducer serially.

    Each file in *input* is piped through *mapperCommand* via the shell, with
    all mapper output appended to a single temporary file. If *reducerCommand*
    is given, the concatenated mapper output is piped through it; otherwise the
    mapper output is used directly. The final result is written to
    ``<output>/part-00000`` to mimic Hadoop's output naming convention.

    Args:
        input: Directory containing the input files to stream.
        output: Output directory; the comment below suggests it should not
            exist prior to execution.
        mapperCommand: Shell command evaluated once per input file.
        reducerCommand: Optional shell command applied to the combined mapper
            output; when None, mapper output is copied as-is.

    NOTE(review): commands are assembled by string concatenation and run
    through the shell via ``eval`` — callers must only pass trusted commands
    and paths (shell-injection risk otherwise).
    """
    tmpMapperOutFile = output + "/" + "mapper-out"
    reducerOutFile = output + "/" + "part-00000"
    LinuxUtils.mkdir(output)  # output dir should not exist prior to job execution
    LinuxUtils.rmPath(tmpMapperOutFile)  # '>>' below appends, so start from a clean file
    # sorted() makes runs deterministic: os.listdir order is arbitrary.
    for filename in sorted(os.listdir(input)):
        inputFilePath = input + "/" + filename
        if not os.path.isfile(inputFilePath):
            continue  # skip subdirectories: 'cat' on a directory would fail
        mapCommand = ["cat", inputFilePath, "| eval ", mapperCommand, ">>", tmpMapperOutFile]
        LinuxUtils.runLinuxCommand(" ".join(mapCommand))
    if reducerCommand is None:
        # No reducer: the combined mapper output is the job result.
        LinuxUtils.cpPath(tmpMapperOutFile, reducerOutFile)
    else:
        redCommand = ["cat", tmpMapperOutFile, "| eval ", reducerCommand, ">", reducerOutFile]
        LinuxUtils.runLinuxCommand(" ".join(redCommand))
def mergeHDFSFiles(hdfsInputDir, filesNamesPattern, hdfsOutputFilePath):
    """Merge all HDFS files matching a pattern into a single HDFS file.

    Streams ``hdfs dfs -cat`` over every file in *hdfsInputDir* matching
    *filesNamesPattern* and pipes the concatenated stream into
    ``hdfs dfs -put -f -`` to (over)write *hdfsOutputFilePath*.
    """
    # Build the two halves of the pipeline, then join them with '|'.
    catCommand = "hdfs dfs -cat " + hdfsInputDir + "/" + filesNamesPattern
    putCommand = "hdfs dfs -put -f - " + hdfsOutputFilePath
    LinuxUtils.runLinuxCommand(catCommand + " | " + putCommand)