def createHadoopWordcountDefinition():
    """Build a shell-process definition that runs the Hadoop wordcount example jar.

    The definition has one input-file port and one side-effect output port,
    ordered so the input precedes the output on the command line.
    """
    ordering = DefinitionModule.createParameterOrderingTable()
    orderingRow = ordering.addRow()
    orderingRow.setColumn('source', 'input file')
    orderingRow.setColumn('target', 'output file')

    # TODO:
    # need to be able to customize this for each host
    wordcountExecutable = HadoopModule.JarExecutable()
    wordcountExecutable.stageable(False)
    wordcountExecutable.path([HadoopModule.getExecutablePath()])
    wordcountExecutable.jarFile([getExamplesJar()])
    wordcountExecutable.jarClass(['wordcount'])

    inputPorts = {
        'input file': {
            ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
            ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE: True,
        },
        'output file': {
            ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
            ParameterModule.PORT_ATTRIBUTE_ISSIDEEFFECT: True,
        },
    }
    return DefinitionModule.createShellProcessDefinition(
        inputParameters=inputPorts,
        parameterOrderings=ordering,
        executable=wordcountExecutable,
    )
def createHadoopStreamingDefinition():
    """Build a shell-process definition for a Hadoop streaming job.

    Ports: an HDFS input and output plus mapper/reducer program files, each
    passed on the command line behind its streaming flag (-input, -output,
    -mapper, -reducer), ordered input -> output -> mapper -> reducer.
    """
    ordering = DefinitionModule.createParameterOrderingTable()
    for sourceName, targetName in [('input file', 'output file'),
                                   ('output file', 'mapper'),
                                   ('mapper', 'reducer')]:
        orderingRow = ordering.addRow()
        orderingRow.setColumn('source', sourceName)
        orderingRow.setColumn('target', targetName)

    # TODO:
    # need to be able to customize this for each host
    streamingExecutable = HadoopModule.JarExecutable()
    streamingExecutable.stageable(False)
    streamingExecutable.path([HadoopModule.getExecutablePath()])
    streamingExecutable.jarFile([getStreamingJar()])
    streamingExecutable.jarClass([])

    def portAttributes(flag, sideEffect=False):
        # Every port goes on the command line behind its streaming flag;
        # the output port is a side effect, all others are input files.
        attributes = {
            ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
            ParameterModule.PORT_ATTRIBUTE_COMMANDLINE_OPTIONS: {
                ParameterModule.COMMANDLINE_PREFIX_FLAG: [flag]
            },
        }
        if sideEffect:
            attributes[ParameterModule.PORT_ATTRIBUTE_ISSIDEEFFECT] = True
        else:
            attributes[ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE] = True
        return attributes

    inputPorts = {
        'input file': portAttributes('-input'),
        'output file': portAttributes('-output', sideEffect=True),
        'mapper': portAttributes('-mapper'),
        'reducer': portAttributes('-reducer'),
    }
    return DefinitionModule.createShellProcessDefinition(
        inputParameters=inputPorts,
        parameterOrderings=ordering,
        executable=streamingExecutable,
    )
def setUp(self):
    """Per-test setup: require a local hadoop binary, then clear stale output.

    Asserting on the executable first gives a clear failure message when
    Hadoop is not installed, instead of an obscure error later in the test.
    """
    hadoopPath = HadoopModule.getExecutablePath()
    self.assertTrue(os.path.exists(hadoopPath))
    BaseModule.BaseTestClass.setUp(self)
    # TODO:
    # will need to place test data into HDFS
    self.removeOutputFiles()
def testCreatePipesExecutable(self):
    """The builder should yield an unstageable PipesExecutable for the given path."""
    pipesPath = os.path.sep.join(['', 'path', 'to', 'pipes'])
    built = self.builder.createPipesExecutableObject(pipesPath)
    self.assertTrue(isinstance(built, HadoopModule.PipesExecutable))
    self.assertFalse(built.stageable())
    self.assertEquals([HadoopModule.getExecutablePath()], built.path())
    self.assertEquals([pipesPath], built.pipesFile())
def testCreateStreamingExecutable(self):
    """The builder should yield an unstageable JarExecutable wrapping the streaming jar."""
    built = self.builder.createStreamingExecutableObject()
    self.assertTrue(isinstance(built, HadoopModule.JarExecutable))
    self.assertFalse(built.stageable())
    self.assertEquals([HadoopModule.getExecutablePath()], built.path())
    self.assertEquals([HadoopModule.getStreamingJar()], built.jarFile())
    # streaming jobs supply no main class on the command line
    self.assertEquals([], built.jarClass())
def testCreateDefaultExecutable(self):
    """The builder should wrap an arbitrary jar file and class in a JarExecutable."""
    expectedJarFile = 'myJarFile'
    expectedJarClass = 'myJarClass'
    built = self.builder.createExecutableObject(expectedJarFile, expectedJarClass)
    self.assertTrue(isinstance(built, HadoopModule.JarExecutable))
    self.assertFalse(built.stageable())
    self.assertEquals([HadoopModule.getExecutablePath()], built.path())
    self.assertEquals([expectedJarFile], built.jarFile())
    self.assertEquals([expectedJarClass], built.jarClass())
def createHadoopPipesDefinition():
    """Build a shell-process definition for a Hadoop pipes job.

    One HDFS input port and one side-effect output port, each passed on the
    command line behind its pipes flag (-input / -output), input first.
    """
    ordering = DefinitionModule.createParameterOrderingTable()
    orderingRow = ordering.addRow()
    orderingRow.setColumn('source', 'input file')
    orderingRow.setColumn('target', 'output file')

    # TODO:
    # need to be able to customize this for each host
    pipesCommand = ['pipesProgram']
    pipesExecutable = HadoopModule.PipesExecutable()
    pipesExecutable.stageable(False)
    pipesExecutable.path([HadoopModule.getExecutablePath()])
    pipesExecutable.pipesFile(pipesCommand)

    inputPorts = {
        'input file': {
            ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
            ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE: True,
            ParameterModule.PORT_ATTRIBUTE_COMMANDLINE_OPTIONS: {
                ParameterModule.COMMANDLINE_PREFIX_FLAG: ['-input']
            },
        },
        'output file': {
            ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
            ParameterModule.PORT_ATTRIBUTE_ISSIDEEFFECT: True,
            ParameterModule.PORT_ATTRIBUTE_COMMANDLINE_OPTIONS: {
                ParameterModule.COMMANDLINE_PREFIX_FLAG: ['-output']
            },
        },
    }
    return DefinitionModule.createShellProcessDefinition(
        inputParameters=inputPorts,
        parameterOrderings=ordering,
        executable=pipesExecutable,
    )
def fileExists(self, file):
    """Return True if *file* exists in HDFS, probed via ``hadoop fs -stat``.

    ``os.system`` returns the command's exit status; ``-stat`` exits 0
    only when the path exists.
    """
    status = os.system('%s fs -stat %s' % (HadoopModule.getExecutablePath(), file))
    # FIX: the original returned ``status is 0`` — an identity test against
    # an int literal.  It only worked because CPython interns small ints;
    # ``is`` with a literal is implementation-defined and a SyntaxWarning on
    # modern Pythons.  Exit statuses must be compared with ``==``.
    return status == 0
def removeOutputFiles(self):
    """Delete the test output directory from HDFS if a previous run left it behind."""
    outputDir = TestHadoopStreaming1.DIR_OUTPUT
    # nothing to clean up when the directory is absent
    if not self.fileExists(outputDir):
        return
    os.system('%s fs -rmr %s' % (HadoopModule.getExecutablePath(), outputDir))