class PythonSourceToFile(Module):
    """Specifies a Python code snippet to be run with Hadoop Streaming.

    The snippet is written to a temporary Python file; the code itself is
    not passed downstream.
    """
    _settings = ModuleSettings(
        namespace='hadoop',
        configure_widget=PythonSourceToFileConfigurationWidget)
    _input_ports = [IPort('Input File', File),
                    IPort('source', String, optional=True)]
    _output_ports = [OPort('Temporary File', File)]

    def compute(self):
        inputFile = self.force_get_input('Input File')
        if inputFile is not None:
            # tempFile = file_pool.make_local_copy(inputFile.name)
            tempFile = inputFile
        else:
            source = urllib.unquote(self.force_get_input('source', ''))
            tempFile = self.interpreter.filePool.create_file()
            with open(tempFile.name, 'w') as f:
                f.write(source)
        self.set_output('Temporary File', tempFile)
class vtkPolyDataInspector(vtkDataSetInspector):
    _settings = ModuleSettings(abstract=False, signature=vtk_hasher)
    _input_ports = [('SetInputConnection0', 'vtkAlgorithmOutput'),
                    ('SetInput', 'vtkDataSet')]
    _output_ports = [('GetVerts', 'vtkCellArray'),
                     ('GetLines', 'vtkCellArray'),
                     ('GetPolys', 'vtkCellArray'),
                     ('GetStrips', 'vtkCellArray'),
                     ('GetPoints', 'vtkPoints'),
                     ('GetNumberOfVerts', [Integer]),
                     ('GetNumberOfLines', [Integer]),
                     ('GetNumberOfPolys', [Integer]),
                     ('GetNumberOfStrips', [Integer])]

    def compute(self):
        vtk_object = None
        if self.has_input("SetInputConnection0"):
            port_object = self.get_input("SetInputConnection0")
            if hasattr(port_object, "vtkInstance"):
                port_object = port_object.vtkInstance
            producer = port_object.GetProducer()
            try:
                vtk_object = producer.GetOutput()
            except AttributeError:
                raise ModuleError(self,
                                  "expected a module that supports GetOutput")
        elif self.has_input("SetInput"):
            vtk_object = self.get_input("SetInput")
            if hasattr(vtk_object, "vtkInstance"):
                vtk_object = vtk_object.vtkInstance
        if vtk_object:
            self.auto_set_results(vtk_object)
class RichTextOutput(FileOutput):
    _settings = ModuleSettings(
        configure_widget="vistrails.gui.modules.output_configuration:"
                         "OutputModuleConfigurationWidget")
    _input_ports = [('value', 'File')]
    _output_modes = [HtmlToFileMode, (FileToStdoutMode, 50), IPythonHtmlMode]
class RichTextOutput(OutputModule):
    _settings = ModuleSettings(
        configure_widget="vistrails.gui.modules.output_configuration:"
                         "OutputModuleConfigurationWidget")
    # need specific spreadsheet richtext mode here
    pass
class MplFigureOutput(OutputModule):
    _settings = ModuleSettings(
        configure_widget="vistrails.gui.modules.output_configuration:"
                         "OutputModuleConfigurationWidget")
    _input_ports = [('value', 'MplFigure')]
    _output_modes = [MplFigureToFile, MplIPythonMode]
class vtkDataSetInspector(vtkBaseInspector):
    _settings = ModuleSettings(abstract=False, signature=vtk_hasher)
    _input_ports = [('SetInputConnection0', 'vtkAlgorithmOutput'),
                    ('SetInput', 'vtkDataSet')]
    _output_ports = [('GetBounds', [Float] * 6),
                     ('GetScalarRange', [Float] * 2),
                     ('GetLength', [Float]),
                     ('GetCenter', [Float] * 3),
                     ('GetNumberOfPoints', [Integer]),
                     ('GetNumberOfCells', [Integer]),
                     ('GetPointData', 'vtkPointData'),
                     ('GetCellData', 'vtkCellData')]

    def compute(self):
        port_object = None
        if self.has_input("SetInputConnection0"):
            ic = self.get_input("SetInputConnection0")
            if hasattr(ic, "vtkInstance"):
                ic = ic.vtkInstance
            producer = ic.GetProducer()
            try:
                port_object = producer.GetOutput()
            except AttributeError:
                raise ModuleError(self,
                                  "expected a module that supports GetOutput")
        elif self.has_input("SetInput"):
            port_object = self.get_input("SetInput")
            if hasattr(port_object, "vtkInstance"):
                port_object = port_object.vtkInstance
        if port_object:
            self.auto_set_results(port_object)
class URICreator(HadoopBaseModule):
    """Caches an HDFS file onto the TaskNode's local drive.
    """
    _settings = ModuleSettings(namespace='hadoop')
    _input_ports = [IPort('HDFS File/URI', String),
                    IPort('Symlink', String),
                    IPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)')]
    _output_ports = [OPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)'),
                     OPort('URI', String)]

    def compute(self):
        machine = self.get_machine()
        jm = self.job_monitor()
        id = self.signature
        job = jm.getCache(id)
        if not job:
            uri = self.force_get_input('HDFS File/URI')
            symlink = self.force_get_input('Symlink')
            if uri is None or symlink is None:
                raise ModuleError(self,
                                  "Missing 'HDFS File/URI' or 'Symlink' values")
            if '://' not in uri:
                uri = self.add_prefix(uri, machine)
            uri += '#' + symlink
            d = {'uri': uri}
            self.set_job_machine(d, machine)
            jm.setCache(id, d, self.job_name())
            job = jm.getCache(id)
        self.set_output('URI', job.parameters['uri'])
        self.set_output('Machine', machine)
class TrainTestSplit(Module):
    """Randomly splits data into training and test sets."""
    _settings = ModuleSettings(namespace="cross-validation")
    _input_ports = [("data", "basic:List", {'shape': 'circle'}),
                    ("target", "basic:List", {'shape': 'circle'}),
                    ("test_size", "basic:Float", {"defaults": [.25]})]
    _output_ports = [("training_data", "basic:List", {'shape': 'circle'}),
                     ("training_target", "basic:List", {'shape': 'circle'}),
                     ("test_data", "basic:List", {'shape': 'circle'}),
                     ("test_target", "basic:List", {'shape': 'circle'})]

    def compute(self):
        X_train, X_test, y_train, y_test = \
            train_test_split(self.get_input("data"),
                             self.get_input("target"),
                             test_size=try_convert(self.get_input("test_size")))
        self.set_output("training_data", X_train)
        self.set_output("training_target", y_train)
        self.set_output("test_data", X_test)
        self.set_output("test_target", y_test)
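# A minimal standalone sketch of the call TrainTestSplit wraps. The import
# path below is sklearn.model_selection, where current scikit-learn exposes
# train_test_split (older releases used sklearn.cross_validation, which is
# presumably what this package imports); the data values are illustrative.
from sklearn.model_selection import train_test_split

_data = [[0], [1], [2], [3]]
_target = [0, 0, 1, 1]
_X_train, _X_test, _y_train, _y_test = train_test_split(
    _data, _target, test_size=0.25)
# with four samples and test_size=0.25, one sample lands in the test set
assert len(_X_test) == 1 and len(_X_train) == 3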
class HDFSEnsureNew(HadoopBaseModule):
    """Makes sure the HDFS entry does not exist, removing it if necessary.
    """
    _settings = ModuleSettings(namespace='hadoop')
    _input_ports = [IPort('Name', String),
                    IPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)')]
    _output_ports = [OPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)'),
                     OPort('Name', String)]

    def __init__(self):
        HadoopBaseModule.__init__(self)

    def compute(self):
        machine = self.get_machine()
        jm = self.job_monitor()
        id = self.signature
        job = jm.getCache(id)
        if not job:
            entry_name = self.get_input('Name')
            if '://' not in entry_name:
                entry_name = self.add_prefix(entry_name, machine)
            if not int(self.call_hdfs('dfs -test -e ' + entry_name +
                                      '; echo $?', machine)):
                # self.call_hdfs('dfs -rm -r ' + entry_name, machine)
                # we are using -rmr but it is deprecated
                self.call_hdfs('dfs -rmr ' + entry_name, machine)
            d = {'entry_name': entry_name}
            self.set_job_machine(d, machine)
            jm.setCache(id, d, self.job_name())
            job = jm.getCache(id)
        self.set_output('Name', job.parameters['entry_name'])
        self.set_output('Machine', machine)
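# A minimal sketch of the exit-status idiom used in compute() above:
# `hdfs dfs -test -e PATH` exits with status 0 when PATH exists, so
# `echo $?` prints "0" for an existing entry, and int("0") is falsy --
# hence `not int(...)` reads as "the entry exists". The call_hdfs argument
# is a stand-in for HadoopBaseModule.call_hdfs.
def hdfs_entry_exists(call_hdfs, machine, path):
    status = call_hdfs('dfs -test -e %s; echo $?' % path, machine)
    return int(status) == 0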
class TableOutput(OutputModule):
    _settings = ModuleSettings(
        configure_widget="vistrails.gui.modules.output_configuration:"
                         "OutputModuleConfigurationWidget")
    _input_ports = [('value', 'Table')]
    _output_modes = [TableToFileMode]
class HDFSGet(HadoopBaseModule):
    """Gets a file out of HDFS, then fetches it from the server.
    """
    _settings = ModuleSettings(namespace='hadoop')
    _input_ports = [IPort('Local File', Path),
                    IPort('Remote Location', String),
                    IPort('Override', Boolean),
                    IPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)')]
    _output_ports = [OPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)'),
                     OPort('Local File', File)]

    def __init__(self):
        HadoopBaseModule.__init__(self)

    def compute(self):
        machine = self.get_machine()
        jm = self.job_monitor()
        id = self.signature
        job = jm.getCache(id)
        if not job:
            remote = self.get_input('Remote Location')
            local = self.get_input('Local File')
            override = self.force_get_input('Override', False)
            if '://' not in remote:
                remote = self.add_prefix(remote, machine)
            if os.path.exists(local.name):
                if not override:
                    raise ModuleError(self, 'Output already exists')
                elif os.path.isdir(local.name):
                    shutil.rmtree(local.name)
                else:
                    os.unlink(local.name)
            tempfile = machine.remote.send_command('mktemp -d -u').strip()
            result = self.call_hdfs('dfs -get %s %s' % (remote, tempfile),
                                    machine)
            # fetching file-by-file is too slow with many files:
            # res = machine.send_command("get -r %s %s" % (tempfile, local.name))
            # so tar the files instead to increase speed
            result = machine.local.send_command('mkdir %s' % local.name)
            result = machine.sync(local.name, tempfile,
                                  mode=machine.MODE_REMOTE_LOCAL, use_tar=True)
            result = machine.remote.rm(tempfile, force=True, recursively=True)
            d = {'remote': remote, 'local': local.name}
            self.set_job_machine(d, machine)
            jm.setCache(id, d, self.job_name())
            job = jm.getCache(id)
        self.set_output('Local File', PathObject(job.parameters['local']))
        self.set_output('Machine', machine)
class FileOutput(OutputModule):
    _settings = ModuleSettings(
        configure_widget="vistrails.gui.modules.output_configuration:"
                         "OutputModuleConfigurationWidget")
    _input_ports = [('value', 'File')]
    # Stdout is low priority, probably a bad plan
    _output_modes = [(FileToStdoutMode, 50), (FileToFileMode, 200)]
class ImageOutput(FileOutput):
    _settings = ModuleSettings(
        configure_widget="vistrails.gui.modules.output_configuration:"
                         "OutputModuleConfigurationWidget")
    _input_ports = [('value', 'File')]
    # FileToStdoutMode is disabled, since the content is definitely binary
    _output_modes = [FileToFileMode, (FileToStdoutMode, -1)]
class FileOutput(OutputModule):
    _settings = ModuleSettings(
        configure_widget="vistrails.gui.modules.output_configuration:"
                         "OutputModuleConfigurationWidget")
    # should set file as a higher priority here...
    _input_ports = [('value', 'File')]
    _output_modes = [FileToStdoutMode, FileToFileMode]
class Abstraction(Group):
    # We need Abstraction to be a subclass of Group so that the hierarchy
    # of modules is right, but the pipeline comes from somewhere else, so
    # skip Group's transfer_attrs()
    _settings = ModuleSettings(name="SubWorkflow", hide_descriptor=True)

    def transfer_attrs(self, module):
        Module.transfer_attrs(self, module)
class SubmitShellJob(BaseSubmitJob):
    """Submits a shell script.
    """
    _settings = ModuleSettings(
        configure_widget=('%s.widgets' % this_pkg,
                          'ShellSourceConfigurationWidget'))
    _input_ports = [('source', '(basic:String)')]
    _output_ports = [('stderr', '(basic:File)'),
                     ('stdout', '(basic:File)')]

    def job_start(self, params):
        """Creates a temporary job with the given source, uploads and submits it.
        """
        queue = QueueCache.get(params['destination'], params['queue'],
                               params.get('setup_runtime') or None,
                               params.get('need_runtime') or None)

        # First, check if the job already exists
        try:
            with ServerLogger.hide_output():
                queue.status(params['job_id'])
        except (tej.JobNotFound, tej.QueueDoesntExist):
            pass
        else:
            return params

        # Alright, submit a new job
        directory = self.interpreter.filePool.create_directory(
            prefix='vt_tmp_shelljob_').name
        # We use io.open() here because we could be writing scripts on
        # Windows before uploading them to a POSIX server
        source = urllib.unquote(self.get_input('source'))
        if isinstance(source, bytes):
            kwargs = {'mode': 'wb'}
        else:
            kwargs = {'mode': 'w', 'newline': '\n'}
        with io.open(os.path.join(directory, 'start.sh'), **kwargs) as fp:
            fp.write(source)
        queue.submit(params['job_id'], directory)

        return params

    def job_set_results(self, params):
        """Gets stderr and stdout.
        """
        super(SubmitShellJob, self).job_set_results(params)

        temp_dir = self.interpreter.filePool.create_directory(
            prefix='vt_tmp_shelljobout_').name
        queue = QueueCache.get(params['destination'], params['queue'],
                               params.get('setup_runtime') or None,
                               params.get('need_runtime') or None)
        queue.download(params['job_id'], ['_stderr', '_stdout'],
                       directory=temp_dir)
        self.set_output('stderr', PathObject(os.path.join(temp_dir, '_stderr')))
        self.set_output('stdout', PathObject(os.path.join(temp_dir, '_stdout')))
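# A minimal sketch of the io.open() behavior that job_start() relies on:
# passing newline='\n' disables universal-newline translation, so a script
# written on Windows keeps POSIX (LF) line endings before being uploaded.
# The temporary path here is illustrative.
import io
import os
import tempfile

_path = os.path.join(tempfile.mkdtemp(), 'start.sh')
with io.open(_path, mode='w', newline='\n') as _fp:
    _fp.write(u'#!/bin/sh\necho hello\n')
with io.open(_path, mode='rb') as _fp:
    assert b'\r\n' not in _fp.read()  # LF only, even on Windows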
class TFOperation(Module):
    """A TensorFlow operation that will be run by Run as part of the graph.
    """
    _settings = ModuleSettings(abstract=True)
    _output_ports = [('output',
                      '(org.vistrails.vistrails.tensorflow:TFOperation)')]

    def compute(self):
        raise NotImplementedError
class BuildTable(Module):
    """Builds a table by putting together columns from multiple sources.

    Input can be a mix of lists, which will be used as single columns,
    and whole tables, whose column names will be mangled.
    """
    _settings = ModuleSettings(
        configure_widget='vistrails.packages.tabledata.widgets:BuildTableWidget')
    _output_ports = [('value', Table)]

    def __init__(self):
        Module.__init__(self)
        self.input_ports_order = []

    def transfer_attrs(self, module):
        Module.transfer_attrs(self, module)
        self.input_ports_order = [p.name for p in module.input_port_specs]

    def compute(self):
        items = None
        if self.input_ports_order:  # pragma: no branch
            items = [(p, self.get_input(p)) for p in self.input_ports_order]
        if not items:
            raise ModuleError(self, "No inputs were provided")

        nb_rows = None
        cols = []
        names = []
        for portname, item in items:
            if isinstance(item, TableObject):
                if nb_rows is not None:
                    if item.rows != nb_rows:
                        raise ModuleError(
                            self,
                            "Different row counts: %d != %d" % (item.rows,
                                                                nb_rows))
                else:
                    nb_rows = item.rows
                cols.extend(item.get_column(c) for c in xrange(item.columns))
                if item.names is not None:
                    names.extend(item.names)
                else:
                    names.extend("%s col %d" % (portname, i)
                                 for i in xrange(len(cols) - len(names)))
            else:
                if nb_rows is not None:
                    if len(item) != nb_rows:
                        raise ModuleError(
                            self,
                            "Different row counts: %d != %d" % (len(item),
                                                                nb_rows))
                else:
                    nb_rows = len(item)
                cols.append(item)
                names.append(portname)

        self.set_output('value', TableObject(cols, nb_rows, names))
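# A minimal standalone sketch of BuildTable's column-assembly rule for the
# simple (list-valued) case, independent of VisTrails: every column must
# have the same row count, and each port name becomes a column name. Plain
# lists stand in for TableObject here.
def build_columns(items):
    """items: list of (port_name, list_of_values) pairs."""
    nb_rows = None
    cols, names = [], []
    for portname, values in items:
        if nb_rows is not None and len(values) != nb_rows:
            raise ValueError("Different row counts: %d != %d"
                             % (len(values), nb_rows))
        nb_rows = len(values)
        cols.append(values)
        names.append(portname)
    return cols, nb_rows, names

_cols, _nb_rows, _names = build_columns([('a', [1, 2, 3]), ('b', [4, 5, 6])])
assert _names == ['a', 'b'] and _nb_rows == 3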
class SubmitJob(AssembleDirectoryMixin, BaseSubmitJob):
    """Submits a generic job (a directory).
    """
    _settings = ModuleSettings(
        configure_widget=('%s.widgets' % this_pkg,
                          'DirectoryConfigurationWidget'))
    _input_ports = [('job', '(basic:Directory)', {'optional': True}),
                    ('script', '(basic:String)',
                     {'optional': True, 'defaults': "['start.sh']"})]

    def __init__(self):
        AssembleDirectoryMixin.__init__(self)
        Module.__init__(self)

    def job_start(self, params):
        """Sends the directory and submits the job.
        """
        queue = QueueCache.get(params['destination'], params['queue'],
                               params.get('setup_runtime') or None,
                               params.get('need_runtime') or None)

        # First, check if the job already exists
        try:
            with ServerLogger.hide_output():
                queue.status(params['job_id'])
        except (tej.JobNotFound, tej.QueueDoesntExist):
            pass
        else:
            return params

        if self.has_input('job'):
            job_dir = self.get_input('job')
            if not os.path.exists(job_dir.name):
                raise ModuleError(self, "Directory doesn't exist")
        else:
            job_dir = None

        # Use AssembleDirectoryMixin to get additional files from port specs
        job_dir = self.assemble_directory(job_dir, False)

        # Check that the script exists
        script = self.get_input('script')
        if not os.path.exists(os.path.join(job_dir.name, script)):
            raise ModuleError(self, "Script does not exist")

        # Alright, submit a new job
        queue.submit(params['job_id'], job_dir.name, script)

        return params
class UnsupervisedEstimator(Estimator):
    """Base class for all sklearn transformer modules."""
    _settings = ModuleSettings(abstract=True)

    def compute(self):
        params = dict([(p, try_convert(self.get_input(p)))
                       for p in self.inputPorts
                       if p not in ["training_data", "training_target"]])
        trans = self._estimator_class(**params)
        if "training_data" in self.inputPorts:
            training_data = np.vstack(self.get_input("training_data"))
            trans.fit(training_data)
        self.set_output("model", trans)
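# A minimal sketch of the pattern UnsupervisedEstimator wraps: build an
# estimator from per-port parameters, then fit it on vstack-ed training
# data. KMeans is only an illustrative _estimator_class; any sklearn
# estimator with a fit() method follows the same shape.
import numpy as np
from sklearn.cluster import KMeans

_params = {'n_clusters': 2, 'random_state': 0}  # values that would arrive on input ports
_training_data = np.vstack([[1.0, 2.0], [1.5, 1.8], [8.0, 8.0], [9.0, 9.5]])
_model = KMeans(**_params).fit(_training_data)  # what ends up on the 'model' port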
class ManifoldLearner(Module):
    """Base class for all sklearn manifold modules.
    """
    _settings = ModuleSettings(abstract=True)
    _output_ports = [("transformed_data", "basic:List", {'shape': 'circle'})]

    def compute(self):
        params = dict([(p, try_convert(self.get_input(p)))
                       for p in self.inputPorts
                       if p not in ["training_data"]])
        trans = self._estimator_class(**params)
        training_data = np.vstack(self.get_input("training_data"))
        transformed_data = trans.fit_transform(training_data)
        self.set_output("transformed_data", transformed_data)
class vtkRendererOutput(OutputModule):
    _settings = ModuleSettings(
        configure_widget="vistrails.gui.modules."
                         "output_configuration:OutputModuleConfigurationWidget")
    _input_ports = [('value', 'vtkRenderer', {'depth': 1}),
                    ('interactorStyle', 'vtkInteractorStyle'),
                    ('picker', 'vtkAbstractPicker')]
    _output_modes = [vtkRendererToFile, vtkRendererToIPythonMode]
    if registry.has_module('org.vistrails.vistrails.spreadsheet',
                           'SpreadsheetCell'):
        from .vtkcell import vtkRendererToSpreadsheet
        _output_modes.append(vtkRendererToSpreadsheet)
class Iris(Module):
    """Example dataset: iris.
    """
    _settings = ModuleSettings(namespace="datasets")
    _output_ports = [("data", "basic:List", {'shape': 'circle'}),
                     ("target", "basic:List", {'shape': 'circle'})]

    def compute(self):
        data = datasets.load_iris()
        self.set_output("data", data.data)
        self.set_output("target", data.target)
class MakeDirectory(AssembleDirectoryMixin, Module):
    """Creates a temporary directory and puts the given files in it.
    """
    _settings = ModuleSettings(
        configure_widget=('%s.widgets' % this_pkg,
                          'DirectoryConfigurationWidget'))
    _output_ports = [('directory', '(basic:Directory)')]

    def __init__(self):
        AssembleDirectoryMixin.__init__(self)
        Module.__init__(self)

    def compute(self):
        directory = self.assemble_directory()
        self.set_output('directory', directory)
class vtkDataArrayInspector(vtkBaseInspector):
    _settings = ModuleSettings(abstract=False, signature=vtk_hasher)
    _input_ports = [('SetInput', 'vtkDataArray')]
    _output_ports = [('GetMaxNorm', [Float]),
                     ('GetRange', [Float] * 2)]

    def compute(self):
        vtk_object = None
        if self.has_input("SetInput"):
            vtk_object = self.get_input("SetInput")
            if hasattr(vtk_object, "vtkInstance"):
                vtk_object = vtk_object.vtkInstance
        if vtk_object:
            self.auto_set_results(vtk_object)
class minimize(TFOperation):
    __doc__ = tensorflow.train.Optimizer.__doc__
    _settings = ModuleSettings(namespace='train|optimizer')
    _input_ports = [('optimizer', Optimizer),
                    ('loss', TFOperation),
                    ('global_step', Variable, {'optional': True}),
                    ('var_list', Variable, {'depth': 1, 'optional': True}),
                    ('gate_gradients', '(basic:String)',
                     {'optional': True, 'entry_types': '["enum"]',
                      'values': '[["GATE_NONE", "GATE_OP", "GATE_GRAPH"]]'}),
                    ('name', '(basic:String)', {'optional': True})]

    _GATE_GRADIENTS = {
        'GATE_NONE': tensorflow.train.Optimizer.GATE_NONE,
        'GATE_OP': tensorflow.train.Optimizer.GATE_OP,
        'GATE_GRAPH': tensorflow.train.Optimizer.GATE_GRAPH,
    }

    def compute(self):
        if self.has_input('gate_gradients'):
            gate_gradients = self._GATE_GRADIENTS[
                self.get_input('gate_gradients')]
        else:
            gate_gradients = None
        name = self.force_get_input('name')

        def output(optimizer, loss, **kwargs):
            kw = {'loss': loss, 'name': name}
            if gate_gradients is not None:
                kw['gate_gradients'] = gate_gradients
            kw.update(kwargs)
            return optimizer.minimize(**kw)

        kwargs = {'optimizer': self.get_input('optimizer'),
                  'loss': self.get_input('loss')}
        if self.has_input('global_step'):
            kwargs['global_step'] = self.get_input('global_step')
        if self.has_input('var_list'):
            kwargs['var_list'] = self.get_input('var_list')
        self.set_output('output', Op(output, kwargs))
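# A minimal sketch of what the deferred Op above ultimately executes, in
# plain TensorFlow 1.x (the tensorflow.train API this module targets). The
# loss and the choice of GradientDescentOptimizer are illustrative
# stand-ins for whatever arrives on the 'loss' and 'optimizer' ports.
import tensorflow as tf

_w = tf.Variable(0.0)
_loss = tf.square(_w - 3.0)
_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
# this mirrors the kw dict built inside output(): loss, name, gate_gradients
_train_op = _optimizer.minimize(
    _loss, gate_gradients=tf.train.Optimizer.GATE_OP, name='minimize_w')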
class vtkRendererOutput(OutputModule):
    # DAK: no render view here, use a separate module for this...
    _settings = ModuleSettings(
        configure_widget="vistrails.gui.modules.output_configuration:"
                         "OutputModuleConfigurationWidget")
    _input_ports = [('value', 'vtkRenderer')]
    # DK: these ports can be enabled, I think; they just have to be loaded
    # without the spreadsheet being enabled
    # ('interactionHandler', 'vtkInteractionHandler'),
    # ('interactorStyle', 'vtkInteractorStyle'),
    # ('picker', 'vtkAbstractPicker')]
    _output_modes = [vtkRendererToFile]
def gen_class_module(spec, lib, klasses, **module_settings):
    """Creates a module from a Python class specification.

    Parameters
    ----------
    spec : ClassSpec
        A class-to-module specification
    lib : module
        The library the wrapped class comes from; stored on the class as `_lib`
    klasses : dict
        Maps module names to generated classes; used to resolve the
        superclass and updated with the new class
    module_settings : dict
        Extra keyword arguments merged into the ModuleSettings
    """
    module_settings.update(spec.get_module_settings())
    _settings = ModuleSettings(**module_settings)

    # convert input/output specs into VT port objects
    input_ports = [CIPort(ispec.name, ispec.get_port_type(),
                          **ispec.get_port_attrs())
                   for ispec in spec.input_port_specs]
    output_ports = [COPort(ospec.name, ospec.get_port_type(),
                           **ospec.get_port_attrs())
                    for ospec in spec.output_port_specs]
    # add the instance output port
    output_ports.insert(0, COPort('Instance', spec.module_name))

    _input_spec_table = {}
    for ps in spec.input_port_specs:
        _input_spec_table[ps.name] = ps
    _output_spec_table = {}
    for ps in spec.output_port_specs:
        _output_spec_table[ps.name] = ps

    d = {'__module__': __name__,
         '_settings': _settings,
         '__doc__': spec.docstring,
         '__name__': spec.name or spec.module_name,
         '_input_ports': input_ports,
         '_output_ports': output_ports,
         '_input_spec_table': _input_spec_table,
         '_output_spec_table': _output_spec_table,
         '_module_spec': spec,
         'is_cacheable': lambda self: spec.cacheable,
         '_lib': lib}

    superklass = klasses.get(spec.superklass, BaseClassModule)
    new_klass = type(str(spec.module_name), (superklass,), d)
    klasses[spec.module_name] = new_klass
    return new_klass
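# A minimal standalone sketch of the dynamic class-creation pattern used by
# gen_class_module(): type(name, bases, dict) builds a class at runtime from
# a name, a base-class tuple, and an attribute dict. The base class and
# attributes here are hypothetical stand-ins for the real spec-driven ones.
class _BaseClassModule(object):
    pass

_d = {
    '__doc__': 'Generated wrapper module.',
    'is_cacheable': lambda self: True,
}
_GeneratedModule = type('MyWrappedClass', (_BaseClassModule,), _d)

_m = _GeneratedModule()
assert _m.is_cacheable() and _GeneratedModule.__name__ == 'MyWrappedClass'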
class HDFSPut(HadoopBaseModule):
    """Puts a local file into HDFS, copying it to the server first.
    """
    _settings = ModuleSettings(namespace='hadoop')
    _input_ports = [IPort('Local File', File),
                    IPort('Remote Location', String),
                    IPort('Override', Boolean),
                    IPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)')]
    _output_ports = [OPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)'),
                     OPort('Remote Location', String)]

    def __init__(self):
        HadoopBaseModule.__init__(self)

    def compute(self):
        machine = self.get_machine()
        jm = self.job_monitor()
        id = self.signature
        job = jm.getCache(id)
        if not job:
            remote = self.get_input('Remote Location')
            local = self.get_input('Local File')
            override = self.force_get_input('Override', False)
            if '://' not in remote:
                remote = self.add_prefix(remote, machine)
            # `dfs -test -e` exits 0 when the entry exists
            if not int(self.call_hdfs('dfs -test -e ' + remote +
                                      '; echo $?', machine)):
                if override:
                    self.call_hdfs('dfs -rm -r ' + remote, machine)
                else:
                    raise ModuleError(self, 'Remote entry already exists')
            tempfile = machine.remote.send_command('mktemp -u').strip()
            result = machine.sendfile(local.name, tempfile)
            self.call_hdfs('dfs -put %s %s' % (tempfile, remote), machine)
            result = machine.remote.rm(tempfile, force=True, recursively=True)
            d = {'remote': remote, 'local': local.name}
            self.set_job_machine(d, machine)
            jm.setCache(id, d, self.job_name())
            job = jm.getCache(id)
        self.set_output('Remote Location', job.parameters['remote'])
        self.set_output('Machine', machine)
class PersistedDir(PersistedPath):
    """Records a directory in the file store.
    """
    _input_ports = [IPort('path', Directory),
                    IPort('metadata', Metadata, optional=True)]
    _output_ports = [OPort('path', Directory)]
    _settings = ModuleSettings(
        configure_widget='vistrails.packages.persistent_archive.'
                         'widgets:SetMetadataWidget')

    def check_path_type(self, path):
        if not os.path.isdir(path):
            raise ModuleError(self, "Path is not a directory")