Пример #1
0
def classify(mapperExecPath, reducerExecPath, inputDir, outputDir, centroidsFile, testImgsJsonFile, N):
    """Launch the classification Hadoop streaming job.

    The centroids file and the test-images JSON file are handed to the job
    through ``filesArray`` by their HDFS full paths, while the mapper command
    refers to them by bare file name (the component after the last "/").

    Args:
        mapperExecPath: Mapper executable; first element of the mapper command.
        reducerExecPath: Reducer executable; first element of the reducer
            command.
        inputDir: Passed as the job's ``input``.
        outputDir: Passed as the job's ``output``. It is handed to
            ``HadoopUtils.removeHDFSDirIfExists`` before the job starts.
        centroidsFile: "/"-separated path to the centroids file.
        testImgsJsonFile: "/"-separated path to the test-images JSON file.
        N: Value stringified with ``str()`` and appended to both the mapper
            and the reducer command lines. Its meaning is defined by those
            executables -- NOTE(review): not visible from here, confirm.
    """
    # NOTE: indentation is uniformly 4 spaces. The previous mix of tabs and
    # 8-space indents in this body is a TabError under Python 3.
    centroidsHDFSFullPath = HadoopUtils.getHDFSFullPath(centroidsFile)
    centroidsFileName = centroidsFile.split("/")[-1]
    testImgsJsonHDFSFullPath = HadoopUtils.getHDFSFullPath(testImgsJsonFile)
    testImgsJsonFileName = testImgsJsonFile.split("/")[-1]

    # The mapper is given bare file names, not the HDFS paths; presumably the
    # files listed in filesArray are made available to the tasks under those
    # names -- TODO confirm against HadoopUtils.runHadoopStreamingJob.
    mapperCommandStr = HadoopUtils.buildExecCommandStr(
        [mapperExecPath, centroidsFileName, testImgsJsonFileName, str(N)])
    reducerCommandStr = HadoopUtils.buildExecCommandStr([reducerExecPath, str(N)])

    # Clear any previous output directory before launching the job.
    HadoopUtils.removeHDFSDirIfExists(outputDir)

    HadoopUtils.runHadoopStreamingJob(input=inputDir,
                                      output=outputDir,
                                      mapperCommand=mapperCommandStr,
                                      reducerCommand=reducerCommandStr,
                                      filesArray=[centroidsHDFSFullPath, testImgsJsonHDFSFullPath])
Пример #2
0
def preprocessInput(mapperExecPath,reducerExecPath,inputDir,outputDir,centroidsFile):
    """Launch the input-preprocessing Hadoop streaming job.

    The centroids file is handed to the job through ``filesArray`` by its
    HDFS full path; the mapper command refers to it by bare file name (the
    component after the last "/"), followed by the literal argument "2".
    The reducer executable path is passed as the reducer command as-is, and
    the job is requested with a single reducer task.

    Args:
        mapperExecPath: Mapper executable; first element of the mapper command.
        reducerExecPath: Used directly as the job's ``reducerCommand``.
        inputDir: Passed as the job's ``input``.
        outputDir: Passed as the job's ``output``. It is handed to
            ``HadoopUtils.removeHDFSDirIfExists`` before the job starts.
        centroidsFile: "/"-separated path to the centroids file.
    """
    hdfsCentroids = HadoopUtils.getHDFSFullPath(centroidsFile)

    # Everything after the last "/" (the whole string when there is none).
    baseName = centroidsFile.rpartition("/")[2]

    # NOTE(review): the trailing "2" is a fixed mapper argument; its meaning
    # is defined by the mapper executable and is not visible from here.
    mapperArgs = [mapperExecPath, baseName, str(2)]
    mapperCmd = HadoopUtils.buildExecCommandStr(mapperArgs)

    # Clear any previous output directory before launching the job.
    HadoopUtils.removeHDFSDirIfExists(outputDir)

    jobOptions = {
        "input": inputDir,
        "output": outputDir,
        "mapperCommand": mapperCmd,
        "numReducerTasks": 1,
        "reducerCommand": reducerExecPath,
        "filesArray": [hdfsCentroids],
    }
    HadoopUtils.runHadoopStreamingJob(**jobOptions)