def classify(mapperExecPath, reducerExecPath, inputDir, outputDir, centroidsFile, testImgsJsonFile, N):
    """Launch a Hadoop streaming job that classifies test images.

    The centroids file and the test-images JSON file are shipped to the
    workers via the streaming job's file distribution (``filesArray``), so
    the mapper command refers to them by bare filename only.

    Args:
        mapperExecPath: path to the mapper executable.
        reducerExecPath: path to the reducer executable.
        inputDir: HDFS input directory for the job.
        outputDir: HDFS output directory (removed first if it exists).
        centroidsFile: path to the centroids file (local-style path; resolved to HDFS).
        testImgsJsonFile: path to the test-images JSON file.
        N: numeric parameter forwarded to both mapper and reducer.
    """
    # Resolve full HDFS paths for the side files; the commands use just the basenames.
    centroidsHDFSFullPath = HadoopUtils.getHDFSFullPath(centroidsFile)
    testImgsJsonHDFSFullPath = HadoopUtils.getHDFSFullPath(testImgsJsonFile)
    centroidsBaseName = centroidsFile.split("/")[-1]
    testImgsJsonBaseName = testImgsJsonFile.split("/")[-1]

    mapperCmd = HadoopUtils.buildExecCommandStr(
        [mapperExecPath, centroidsBaseName, testImgsJsonBaseName, str(N)]
    )
    reducerCmd = HadoopUtils.buildExecCommandStr([reducerExecPath, str(N)])

    # Streaming jobs fail if the output directory already exists.
    HadoopUtils.removeHDFSDirIfExists(outputDir)
    HadoopUtils.runHadoopStreamingJob(
        input=inputDir,
        output=outputDir,
        mapperCommand=mapperCmd,
        reducerCommand=reducerCmd,
        filesArray=[centroidsHDFSFullPath, testImgsJsonHDFSFullPath],
    )
def preprocessInput(mapperExecPath, reducerExecPath, inputDir, outputDir, centroidsFile):
    """Launch a Hadoop streaming job that preprocesses the input data.

    The centroids file is distributed to the workers (``filesArray``), so
    the mapper command references it by bare filename. The job is forced to
    a single reducer task, and the reducer is invoked as a plain executable
    (no extra arguments).

    Args:
        mapperExecPath: path to the mapper executable.
        reducerExecPath: path to the reducer executable (used as-is).
        inputDir: HDFS input directory for the job.
        outputDir: HDFS output directory (removed first if it exists).
        centroidsFile: path to the centroids file (resolved to a full HDFS path).
    """
    centroidsHDFSFullPath = HadoopUtils.getHDFSFullPath(centroidsFile)
    centroidsBaseName = centroidsFile.split("/")[-1]

    # NOTE(review): the literal 2 is an argument the mapper expects — presumably
    # a fixed mode/parameter; confirm against the mapper executable.
    mapperCmd = HadoopUtils.buildExecCommandStr([mapperExecPath, centroidsBaseName, str(2)])

    # Streaming jobs fail if the output directory already exists.
    HadoopUtils.removeHDFSDirIfExists(outputDir)
    HadoopUtils.runHadoopStreamingJob(
        input=inputDir,
        output=outputDir,
        mapperCommand=mapperCmd,
        numReducerTasks=1,
        reducerCommand=reducerExecPath,
        filesArray=[centroidsHDFSFullPath],
    )