def createHadoopWordcountDefinition():
    """Build a shell-process definition that runs the Hadoop wordcount example jar.

    The definition takes an 'input file' port (staged as an input file) and an
    'output file' port (a side effect), ordered input-before-output on the
    command line.

    Returns:
        The definition object produced by
        DefinitionModule.createShellProcessDefinition.
    """
    # Input must precede output on the generated command line.
    ordering = DefinitionModule.createParameterOrderingTable()
    orderingRow = ordering.addRow()
    orderingRow.setColumn('source', 'input file')
    orderingRow.setColumn('target', 'output file')

    # TODO: need to be able to customize this for each host
    exe = HadoopModule.JarExecutable()
    exe.stageable(False)  # Hadoop binary lives on the host; never staged.
    exe.path([HadoopModule.getExecutablePath()])
    exe.jarFile([getExamplesJar()])
    exe.jarClass(['wordcount'])

    return DefinitionModule.createShellProcessDefinition(
        inputParameters={
            'input file': {
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
                ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE: True,
            },
            'output file': {
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
                ParameterModule.PORT_ATTRIBUTE_ISSIDEEFFECT: True,
            },
        },
        parameterOrderings=ordering,
        executable=exe,
    )
def createWordCountDefinition(dir=None):
    """Build a stageable shell-process definition for the local wordcount.py script.

    Args:
        dir: path components (a list) of the directory holding wordcount.py.
            Defaults to <cwd>/resources/testdata/TestExecute.
            NOTE(review): the name shadows the builtin ``dir`` but is part of
            the public keyword interface, so it is kept as-is.

    Returns:
        A library definition named 'wordcount mapper' with id ID_WORDCOUNT.
    """
    # Input must precede output on the generated command line.
    ordering = DefinitionModule.createParameterOrderingTable()
    orderingRow = ordering.addRow()
    orderingRow.setColumn('source', 'input file')
    orderingRow.setColumn('target', 'output file')

    if dir is None:
        # Default location: test-data directory under the current working dir.
        dir = os.getcwd().split(os.path.sep) + ['resources', 'testdata', 'TestExecute']
    scriptPath = dir + ['wordcount.py']

    exe = TaskCommandModule.Executable()
    exe.stageable(True)  # script is shipped to the execution host
    exe.path(scriptPath)
    exe.staticArgs([])

    wordcountDefinition = DefinitionModule.createShellProcessDefinition(
        inputParameters={
            'input file': {
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
                ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE: True,
            },
            'output file': {
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
                ParameterModule.PORT_ATTRIBUTE_ISSIDEEFFECT: True,
            },
        },
        parameterOrderings=ordering,
        executable=exe,
    )
    wordcountDefinition.name('wordcount mapper')
    wordcountDefinition.id(ID_WORDCOUNT)
    wordcountDefinition.isLibraryDefinition(True)
    return wordcountDefinition
def createHadoopStreamingDefinition():
    """Build a shell-process definition that invokes the Hadoop streaming jar.

    Declares four command-line ports — 'input file', 'output file', 'mapper',
    'reducer' — each emitted with its corresponding streaming flag
    (-input / -output / -mapper / -reducer), ordered in that sequence.

    Returns:
        The definition object produced by
        DefinitionModule.createShellProcessDefinition.
    """
    # Fix the flag order: input -> output -> mapper -> reducer.
    ordering = DefinitionModule.createParameterOrderingTable()
    for before, after in (
        ('input file', 'output file'),
        ('output file', 'mapper'),
        ('mapper', 'reducer'),
    ):
        orderingRow = ordering.addRow()
        orderingRow.setColumn('source', before)
        orderingRow.setColumn('target', after)

    # TODO: need to be able to customize this for each host
    exe = HadoopModule.JarExecutable()
    exe.stageable(False)  # Hadoop binary lives on the host; never staged.
    exe.path([HadoopModule.getExecutablePath()])
    exe.jarFile([getStreamingJar()])
    exe.jarClass([])  # streaming jar has no explicit main-class argument

    return DefinitionModule.createShellProcessDefinition(
        inputParameters={
            'input file': {
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
                ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE: True,
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE_OPTIONS: {
                    ParameterModule.COMMANDLINE_PREFIX_FLAG: ['-input']
                },
            },
            'output file': {
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
                ParameterModule.PORT_ATTRIBUTE_ISSIDEEFFECT: True,
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE_OPTIONS: {
                    ParameterModule.COMMANDLINE_PREFIX_FLAG: ['-output']
                },
            },
            'mapper': {
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
                ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE: True,
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE_OPTIONS: {
                    ParameterModule.COMMANDLINE_PREFIX_FLAG: ['-mapper']
                },
            },
            'reducer': {
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
                ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE: True,
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE_OPTIONS: {
                    ParameterModule.COMMANDLINE_PREFIX_FLAG: ['-reducer']
                },
            },
        },
        parameterOrderings=ordering,
        executable=exe,
    )
def createEchoDefinition():
    """Build a shell-process definition named 'echo' wrapping /bin/echo.

    Exposes a single command-line port, 'item to echo', with no parameter
    ordering (only one port) and no static arguments.

    Returns:
        The definition object produced by
        DefinitionModule.createShellProcessDefinition.
    """
    exe = TaskCommandModule.Executable()
    exe.stageable(False)  # system binary; nothing to stage
    exe.path(['/bin/echo'])
    exe.staticArgs([])

    echoDefinition = DefinitionModule.createShellProcessDefinition(
        inputParameters={
            'item to echo': {ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True}
        },
        parameterOrderings=None,
        executable=exe,
    )
    echoDefinition.name('echo')
    return echoDefinition
def createHadoopPipesDefinition():
    """Build a shell-process definition that runs a Hadoop Pipes program.

    Declares 'input file' and 'output file' command-line ports emitted with
    the -input / -output flags, input ordered before output. The pipes
    program path is currently the placeholder 'pipesProgram'.

    Returns:
        The definition object produced by
        DefinitionModule.createShellProcessDefinition.
    """
    # Input must precede output on the generated command line.
    ordering = DefinitionModule.createParameterOrderingTable()
    orderingRow = ordering.addRow()
    orderingRow.setColumn('source', 'input file')
    orderingRow.setColumn('target', 'output file')

    # TODO: need to be able to customize this for each host
    exe = HadoopModule.PipesExecutable()
    exe.stageable(False)  # Hadoop binary lives on the host; never staged.
    exe.path([HadoopModule.getExecutablePath()])
    exe.pipesFile(['pipesProgram'])

    return DefinitionModule.createShellProcessDefinition(
        inputParameters={
            'input file': {
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
                ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE: True,
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE_OPTIONS: {
                    ParameterModule.COMMANDLINE_PREFIX_FLAG: ['-input']
                },
            },
            'output file': {
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
                ParameterModule.PORT_ATTRIBUTE_ISSIDEEFFECT: True,
                ParameterModule.PORT_ATTRIBUTE_COMMANDLINE_OPTIONS: {
                    ParameterModule.COMMANDLINE_PREFIX_FLAG: ['-output']
                },
            },
        },
        parameterOrderings=ordering,
        executable=exe,
    )