Пример #1
0
 def test_adjust_state(self):
     """Test adjusting state for modules that do not require execution."""
     # Current database state
     datasets = {
         'A': DatasetDescriptor(identifier='123'),
         'B': DatasetDescriptor(identifier='345'),
         'C': DatasetDescriptor(identifier='567')
     }
     # Read 'A', write 'B', delete 'C' and create new dataset 'D'
     prov = ModuleProvenance(read={
         'A': '123',
         'B': '345'
     },
                             write={
                                 'B': DatasetDescriptor(identifier='666'),
                                 'D': DatasetDescriptor(identifier='999')
                             },
                             delete=['C'])
     self.assertFalse(prov.requires_exec(datasets))
     state = prov.get_database_state(prev_state=datasets)
     # The resulting start should contain 'A'->123, 'B'->666, and 'D'->999
     self.assertEqual(len(state), 3)
     for name in ['A', 'B', 'D']:
         self.assertTrue(name in state)
     self.assertEqual(state['A'].identifier, '123')
     self.assertEqual(state['B'].identifier, '666')
     self.assertEqual(state['D'].identifier, '999')
Пример #2
0
    def compute_empty_dataset(self, args, context):
        """Execute empty dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        outputs = ModuleOutputs()
        default_columns = [("''", "unnamed_column")]
        ds_name = args.get_value(pckg.PARA_NAME).lower()
        if ds_name in context.datasets:
            raise ValueError('dataset \'' + ds_name + '\' exists')
        if not is_valid_name(ds_name):
            raise ValueError('invalid dataset name \'' + ds_name + '\'')
        try:
            source = "SELECT {};".format(", ".join(
                default_val + " AS " + col_name
                for default_val, col_name in default_columns))
            view_name, dependencies = mimir.createView(dict(), source)

            columns = [
                MimirDatasetColumn(identifier=col_id,
                                   name_in_dataset=col_defn[1])
                for col_defn, col_id in zip(default_columns,
                                            range(len(default_columns)))
            ]

            ds = context.datastore.register_dataset(table_name=view_name,
                                                    columns=columns,
                                                    row_counter=1)
            provenance = ModuleProvenance(
                write={
                    ds_name:
                    DatasetDescriptor(identifier=ds.identifier,
                                      columns=ds.columns,
                                      row_count=ds.row_count)
                },
                read=dict(
                )  # Need to explicitly declare a lack of dependencies.
            )
            outputs.stdout.append(
                TextOutput("Empty dataset '{}' created".format(ds_name)))
        except Exception as ex:
            provenance = ModuleProvenance()
            outputs.error(ex)
        return ExecResult(is_success=(len(outputs.stderr) == 0),
                          outputs=outputs,
                          provenance=provenance)
 def test_single_append(self):
     """Test appending a single module to an empty viztrail branch."""
     base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(base_path)
     vt = OSViztrailHandle.create_viztrail(identifier='ABC',
                                           properties={},
                                           base_path=base_path)
     branch = vt.get_default_branch()
     command = python_cell(source='print 2+2')
     ts = get_current_time()
     module = OSModuleHandle.create_module(
         command=command,
         external_form='print 2+2',
         state=MODULE_SUCCESS,
         outputs=ModuleOutputs(stdout=[TextOutput('4')]),
         provenance=ModuleProvenance(),
         timestamp=ModuleTimestamp(created_at=ts,
                                   started_at=ts,
                                   finished_at=ts),
         module_folder=vt.modules_folder,
         object_store=vt.object_store)
     wf = branch.append_workflow(modules=[module],
                                 action=ACTION_INSERT,
                                 command=command)
     # We expect that there exists a file for the workflow handle and one for
     # the new module
     self.assertTrue(
         os.path.isfile(os.path.join(branch.base_path, wf.identifier)))
     self.assertTrue(
         os.path.isfile(os.path.join(wf.modules[-1].module_path)))
     # Load the viztrail and get the module at the branch head
     vt = OSViztrailHandle.load_viztrail(base_path)
     module = vt.get_default_branch().get_head().modules[-1]
     self.assertEqual(module.external_form, 'print 2+2')
     self.assertEqual(module.outputs.stdout[-1].value, '4')
Пример #4
0
    def set_success(self, 
            finished_at: datetime = get_current_time(), 
            outputs: ModuleOutputs = ModuleOutputs(), 
            provenance: ModuleProvenance = ModuleProvenance(),
            updated_arguments: Optional[ModuleArguments] = None
        ):
        """Set status of the module to success. The finished_at property of the
        timestamp is set to the given value or the current time (if None).

        If case of a successful module execution the database state and module
        provenance information are also adjusted together with the module
        output streams.

        Parameters
        ----------
        finished_at: datetime.datetime, optional
            Timestamp when module started running
        outputs: vizier.viztrail.module.output.ModuleOutputs, optional
            Output streams for module
        provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
            Provenance information about datasets that were read and writen by
            previous execution of the module.
        """
        # Update state, timestamp, database state, outputs and provenance
        # information.
        super().set_success(finished_at, outputs, provenance, updated_arguments)
        # Materialize module state
        self.write_safe()
Пример #5
0
    def set_success(self,
                    finished_at: datetime = get_current_time(),
                    outputs: ModuleOutputs = ModuleOutputs(),
                    provenance: ModuleProvenance = ModuleProvenance(),
                    updated_arguments: Optional[ModuleArguments] = None):
        """Set status of the module to success. The finished_at property of the
        timestamp is set to the given value or the current time (if None).

        If case of a successful module execution the database state and module
        provenance information are also adjusted together with the module
        output streams.

        Parameters
        ----------
        finished_at: datetime.datetime, optional
            Timestamp when module started running
        outputs: vizier.viztrail.module.output.ModuleOutputs, optional
            Output streams for module
        provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
            Provenance information about datasets that were read and writen by
            previous execution of the module.
        """
        # Update state, timestamp, database state, outputs and provenance
        # information.
        self.state = MODULE_SUCCESS
        self.timestamp.finished_at = finished_at
        # If the module is set to success straight from pending state the
        # started_at timestamp may not have been set.
        if self.timestamp.started_at is None:
            self.timestamp.started_at = self.timestamp.finished_at
        if updated_arguments is not None:
            self.command.arguments = updated_arguments
        self.outputs = outputs
        self.provenance = provenance
Пример #6
0
    def compute_drop_dataset(self, args, context):
        """Execute drop dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get dataset name and remove the associated entry from the
        # dictionary of datasets in the context. Will raise exception if the
        # specified dataset does not exist.
        ds_name = args.get_value(pckg.PARA_DATASET).lower()
        ds = context.get_dataset(ds_name)
        datasets = dict(context.datasets)
        del datasets[ds_name]
        return ExecResult(outputs=ModuleOutputs(
            stdout=[TextOutput('Dataset \'' + ds_name + '\' deleted')]),
                          provenance=ModuleProvenance(read=dict(),
                                                      write=dict(),
                                                      delete=[ds_name]))
Пример #7
0
    def execute_script(self, args, context):
        """Execute a R script in the given context.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get R script from user arguments
        source = args.get_value(cmd.PARA_R_SOURCE)
        # Redirect standard output and standard error streams
        out = sys.stdout
        err = sys.stderr
        stream = list()
        sys.stdout = OutputStream(tag='out', stream=stream)
        sys.stderr = OutputStream(tag='err', stream=stream)
        outputs = ModuleOutputs()
        
        mimir_table_names = dict()
        for ds_name_o in context.datasets:
            dataset_id = context.datasets[ds_name_o]
            dataset = context.datastore.get_dataset(dataset_id)
            if dataset is None:
                raise ValueError('unknown dataset \'' + ds_name_o + '\'')
            mimir_table_names[ds_name_o] = dataset.identifier
        # Run the r code
        try:
            evalresp = mimir.evalR(mimir_table_names, source)
            ostd = evalresp['stdout']
            oerr = evalresp['stderr']
            if not ostd == '':
                outputs.stdout.append(HtmlOutput(ostd))
            if not oerr == '':
                outputs.stderr.append(TextOutput(oerr))
        except Exception as ex:
            outputs.error(ex)
        finally:
            # Make sure to reverse redirection of output streams
            sys.stdout = out
            sys.stderr = err
        # Set module outputs
        for tag, text in stream:
            text = ''.join(text).strip()
            if tag == 'out':
                outputs.stdout.append(HtmlOutput(text))
            else:
                outputs.stderr.append(TextOutput(text))
        provenance = ModuleProvenance()
        # Return execution result
        return ExecResult(
            is_success=(len(outputs.stderr) == 0),
            outputs=outputs,
            provenance=provenance
        )
Пример #8
0
    def __init__(self,
                 is_success: bool = True,
                 outputs: ModuleOutputs = ModuleOutputs(),
                 provenance: ModuleProvenance = ModuleProvenance(),
                 updated_arguments: ModuleArguments = None):
        """Initialize the result components.

        Parameters
        ----------
        is_success: bool
            Flag indicating if execution was successful
        outputs: vizier.viztrail.module.output.ModuleOutputs, optional
            Outputs to STDOUT and STDERR generated during task execution
        provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
            Provenance information about datasets that were read and writen
            during task execution.
        updated_arguments: vizier.viztrail.command.ModuleArguments, optional
            If provided, the module's arguments will be overridden by the provided
            argument list.  This functionality should *only* be used when the module
            needs to infer/guess some of its arguments (e.g., Load Dataset needs to 
            actually try to load the dataset to have type/name information for 
            columns).  If updated arguments are provided, it is up to the processor 
            to guarantee that the updated arguments are *idempotent* with its 
            current execution (although idempotence with changes to the data and/or
            processor implementation need not be enforced.)
        """
        self.is_success = is_success
        self.outputs = outputs
        self.provenance = provenance
        self.updated_arguments = updated_arguments
 def test_load_with_dataset_delete(self):
     """Test loading workflows where each module creates a new dataset and
     deletes the previous dataset (except for the first module).
     """
     base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(base_path)
     vt = OSViztrailHandle.create_viztrail(identifier='ABC',
                                           properties={},
                                           base_path=base_path)
     branch = vt.get_default_branch()
     # Append ten modules
     for i in range(5):
         ts = get_current_time()
         deleted_datasets = list()
         if i > 0:
             deleted_datasets.append('DS' + str(i - 1))
         command = python_cell(source='print ' + str(i) + '+' + str(i))
         module = OSModuleHandle.create_module(
             command=command,
             external_form='print ' + str(i) + '+' + str(i),
             state=MODULE_SUCCESS,
             outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
             provenance=ModuleProvenance(write={
                 'DS' + str(i):
                 DatasetDescriptor(
                     identifier=str(i),
                     name='DS' + str(i),
                     columns=[
                         DatasetColumn(identifier=j, name=str(j))
                         for j in range(i)
                     ],
                 )
             },
                                         delete=deleted_datasets),
             timestamp=ModuleTimestamp(created_at=ts,
                                       started_at=ts,
                                       finished_at=ts),
             module_folder=vt.modules_folder,
             object_store=vt.object_store)
         if not branch.head is None:
             modules = branch.head.modules + [module]
         else:
             modules = [module]
         branch.append_workflow(modules=modules,
                                action=ACTION_INSERT,
                                command=command)
     vt = OSViztrailHandle.load_viztrail(base_path)
     workflow = vt.get_default_branch().get_head()
     self.assertEqual(len(workflow.modules), 5)
     datasets = {}
     for i in range(5):
         module = workflow.modules[i]
         datasets = module.provenance.get_database_state(datasets)
         self.assertEqual(len(datasets), 1)
         key = 'DS' + str(i)
         self.assertTrue(key in datasets)
         self.assertEqual(len(datasets[key].columns), i)
Пример #10
0
    def compute_simple_chart(self, args, context):
        """Execute simple chart command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get dataset name and the associated dataset. This will raise an
        # exception if the dataset name is unknown.
        ds_name = args.get_value(pckg.PARA_DATASET)
        ds = context.get_dataset(ds_name)
        # Get user-provided name for the new chart and verify that it is a
        # valid name
        chart_name = args.get_value(pckg.PARA_NAME,
                                    default_value=ds_name + ' Plot')
        if chart_name == '' or chart_name == None:
            chart_name = ds_name + ' Plot'
        if not is_valid_name(chart_name):
            raise ValueError('invalid chart name \'' + str(chart_name) + '\'')
        chart_args = args.get_value(cmd.PARA_CHART)
        chart_type = chart_args.get_value(cmd.PARA_CHART_TYPE)
        grouped_chart = chart_args.get_value(cmd.PARA_CHART_GROUPED)
        # Create a new chart view handle and add the series definitions
        view = ChartViewHandle(dataset_name=ds_name,
                               chart_name=chart_name,
                               chart_type=chart_type,
                               grouped_chart=grouped_chart)
        # The data series index for x-axis values is optional
        if args.has(cmd.PARA_XAXIS):
            x_axis = args.get_value(cmd.PARA_XAXIS)
            # X-Axis column may be empty. In that case, we ignore the
            # x-axis spec
            add_data_series(args=x_axis,
                            view=view,
                            dataset=ds,
                            col_arg_id=cmd.PARA_XAXIS_COLUMN,
                            range_arg_id=cmd.PARA_XAXIS_RANGE)
            view.x_axis = 0
        # Definition of data series. Each series is a pair of column
        # identifier and a printable label.
        for data_series in args.get_value(cmd.PARA_SERIES):
            add_data_series(args=data_series, view=view, dataset=ds)
        # Execute the query and get the result
        rows = ChartQuery.exec_query(ds, view)
        # Add chart view handle as module output
        return ExecResult(
            outputs=ModuleOutputs(stdout=[ChartOutput(view=view, rows=rows)]),
            provenance=ModuleProvenance(read={ds_name: ds.identifier},
                                        write=dict(),
                                        charts=[view]))
Пример #11
0
 def test_timestamps(self):
     """Test reading and writing modules with different timestamp values."""
     mod0 = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         outputs=ModuleOutputs(),
         provenance=ModuleProvenance(),
         timestamp=ModuleTimestamp(),
         module_folder=MODULE_DIR)
     m = OSModuleHandle.load_module(identifier=mod0.identifier,
                                    module_path=mod0.module_path)
     # Test timestamps
     created_at = m.timestamp.created_at
     started_at = to_datetime('2018-11-26T13:00:00.000000')
     m.timestamp.started_at = started_at
     m.write_module()
     m = OSModuleHandle.load_module(identifier=mod0.identifier,
                                    module_path=mod0.module_path)
     self.assertEqual(m.timestamp.created_at, created_at)
     self.assertEqual(m.timestamp.started_at, started_at)
     finished_at = to_datetime('2018-11-26T13:00:00.000010')
     m.timestamp.created_at = finished_at
     m.timestamp.finished_at = finished_at
     m.write_module()
     m = OSModuleHandle.load_module(identifier=mod0.identifier,
                                    module_path=mod0.module_path)
     self.assertEqual(m.timestamp.created_at, finished_at)
     self.assertEqual(m.timestamp.started_at, started_at)
     self.assertEqual(m.timestamp.finished_at, finished_at)
     mod0 = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         outputs=ModuleOutputs(),
         provenance=ModuleProvenance(),
         timestamp=ModuleTimestamp(created_at=created_at,
                                   started_at=started_at),
         module_folder=MODULE_DIR)
     m = OSModuleHandle.load_module(identifier=mod0.identifier,
                                    module_path=mod0.module_path)
     self.assertEqual(m.timestamp.created_at, created_at)
     self.assertEqual(m.timestamp.started_at, started_at)
     self.assertIsNone(m.timestamp.finished_at)
 def test_load_active(self):
     """Test loading workflows with active modules."""
     base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(base_path)
     vt = OSViztrailHandle.create_viztrail(identifier='ABC',
                                           properties=None,
                                           base_path=base_path)
     branch = vt.get_default_branch()
     # Append ten modules
     for i in range(5):
         ts = get_current_time()
         command = python_cell(source='print ' + str(i) + '+' + str(i))
         module = OSModuleHandle.create_module(
             command=command,
             external_form='print ' + str(i) + '+' + str(i),
             state=MODULE_SUCCESS,
             datasets=dict(),
             outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
             provenance=ModuleProvenance(),
             timestamp=ModuleTimestamp(created_at=ts,
                                       started_at=ts,
                                       finished_at=ts),
             module_folder=vt.modules_folder,
             object_store=vt.object_store)
         if not branch.head is None:
             modules = branch.head.modules + [module]
         else:
             modules = [module]
         branch.append_workflow(modules=modules,
                                action=ACTION_INSERT,
                                command=command)
         self.assertEqual(len(branch.get_history()), (i + 1))
     # This is a hack to simulate loading workflows with active modules
     # Change state of last two modules in branch head to an active state
     m = branch.get_head().modules[-2]
     m.state = MODULE_RUNNING
     m.write_module()
     m = branch.get_head().modules[-1]
     m.state = MODULE_RUNNING
     m.write_module()
     vt = OSViztrailHandle.load_viztrail(base_path)
     branch = vt.get_default_branch()
     self.assertTrue(branch.get_head().modules[0].is_success)
     self.assertTrue(branch.get_head().modules[1].is_success)
     self.assertTrue(branch.get_head().modules[2].is_success)
     self.assertTrue(branch.get_head().modules[3].is_canceled)
     self.assertTrue(branch.get_head().modules[4].is_canceled)
     # Change state of last module in second workflow to an active state
     m = branch.get_head().modules[1]
     m.state = MODULE_RUNNING
     m.write_module()
     vt = OSViztrailHandle.load_viztrail(base_path)
     branch = vt.get_default_branch()
     wf = branch.get_workflow(branch.get_history()[1].identifier)
     self.assertTrue(wf.modules[0].is_success)
     self.assertTrue(wf.modules[1].is_canceled)
Пример #13
0
 def test_datasets(self):
     """Test reading and writing modules with dataset information."""
     mod0 = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         outputs=ModuleOutputs(),
         provenance=ModuleProvenance(write=DATASETS),
         timestamp=ModuleTimestamp(),
         module_folder=MODULE_DIR,
         datasets=DATASETS)
     m = OSModuleHandle.load_module(identifier=mod0.identifier,
                                    module_path=mod0.module_path,
                                    prev_state=dict())
     self.assertEqual(len(m.datasets), 0)
     mod0 = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_SUCCESS,
         outputs=ModuleOutputs(),
         provenance=ModuleProvenance(write=DATASETS),
         timestamp=ModuleTimestamp(),
         module_folder=MODULE_DIR,
         datasets=DATASETS)
     m = OSModuleHandle.load_module(identifier=mod0.identifier,
                                    module_path=mod0.module_path,
                                    prev_state=dict())
     self.assertEqual(len(m.datasets), 2)
     self.assertEqual(m.datasets['DS1'].identifier, 'ID1')
     self.assertEqual(len(m.datasets['DS1'].columns), 0)
     self.assertEqual(m.datasets['DS1'].row_count, 0)
     ds2 = m.datasets['DS2']
     self.assertEqual(ds2.identifier, 'ID2')
     self.assertEqual(len(ds2.columns), 2)
     col0 = ds2.columns[0]
     self.assertEqual(col0.identifier, 0)
     self.assertEqual(col0.name, 'ABC')
     self.assertEqual(col0.data_type, 'int')
     col1 = ds2.columns[1]
     self.assertEqual(col1.identifier, 1)
     self.assertEqual(col1.name, 'xyz')
     self.assertEqual(col1.data_type, 'real')
     self.assertEqual(ds2.row_count, 100)
Пример #14
0
    def __init__(self,
                 command: ModuleCommand,
                 external_form: Optional[str],
                 identifier: Optional[str] = None,
                 state: int = MODULE_PENDING,
                 timestamp: ModuleTimestamp = ModuleTimestamp(),
                 outputs: ModuleOutputs = ModuleOutputs(),
                 provenance: ModuleProvenance = ModuleProvenance()):
        """Initialize the module handle. For new modules, datasets and outputs
        are initially empty.

        Parameters
        ----------
        command : vizier.viztrail.command.ModuleCommand
            Specification of the module (i.e., package, name, and arguments)
        external_form: string
            Printable representation of module command
        identifier : string, optional
            Unique module identifier
        state: int
            Module state (one of PENDING, RUNNING, CANCELED, ERROR, SUCCESS)
        timestamp: vizier.viztrail.module.timestamp.ModuleTimestamp, optional
            Module timestamp
        outputs: vizier.viztrail.module.output.ModuleOutputs, optional
            Module output streams STDOUT and STDERR
        provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
            Provenance information about datasets that were read and writen by
            previous execution of the module.
        """
        super(ModuleHandle, self).__init__(
            state=state if not state is None else MODULE_PENDING)
        self.identifier = identifier
        self.command = command
        self.external_form = external_form
        self.outputs = outputs if not outputs is None else ModuleOutputs()
        self.provenance = provenance if not provenance is None else ModuleProvenance(
        )
        self.timestamp = timestamp if not timestamp is None else ModuleTimestamp(
        )
Пример #15
0
    def execute_script(self, args, context):
        """Execute a Markdown script in the given context.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get Markdown script from user arguments
        source = args.get_value(cmd.PARA_MARKDOWN_SOURCE)
        # Redirect standard output and standard error streams
        out = sys.stdout
        err = sys.stderr
        stream = list()
        sys.stdout = OutputStream(tag='out', stream=stream)
        sys.stderr = OutputStream(tag='err', stream=stream)
        outputs = ModuleOutputs()
        # Run the markdown code
        try:
            #we should validate the markdown here
            ostd = source
            oerr = ''
            if not ostd == '':
                outputs.stdout.append(MarkdownOutput(ostd))
            if not oerr == '':
                outputs.stderr.append(TextOutput(oerr))
        except Exception as ex:
            outputs.error(ex)
        finally:
            # Make sure to reverse redirection of output streams
            sys.stdout = out
            sys.stderr = err
        # Set module outputs
        for tag, text in stream:
            text = ''.join(text).strip()
            if tag == 'out':
                outputs.stdout.append(MarkdownOutput(text))
            else:
                outputs.stderr.append(TextOutput(text))
        provenance = ModuleProvenance()
        # Return execution result
        return ExecResult(is_success=(len(outputs.stderr) == 0),
                          outputs=outputs,
                          provenance=provenance)
 def test_completed_append(self):
     """Test appending a completed workflow to a branch."""
     base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(base_path)
     vt = OSViztrailHandle.create_viztrail(identifier='ABC',
                                           properties=None,
                                           base_path=base_path)
     branch = vt.get_default_branch()
     for i in range(10):
         ts = get_current_time()
         command = python_cell(source='print ' + str(i) + '+' + str(i))
         module = OSModuleHandle.create_module(
             command=command,
             external_form='print ' + str(i) + '+' + str(i),
             state=MODULE_SUCCESS,
             datasets=dict(),
             outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
             provenance=ModuleProvenance(),
             timestamp=ModuleTimestamp(created_at=ts,
                                       started_at=ts,
                                       finished_at=ts),
             module_folder=vt.modules_folder,
             object_store=vt.object_store)
         if not branch.head is None:
             modules = branch.head.modules + [module]
         else:
             modules = [module]
         branch.append_workflow(modules=modules,
                                action=ACTION_INSERT,
                                command=command)
     head_modules = branch.get_head().modules
     wf = branch.append_workflow(modules=head_modules[:-1],
                                 action=ACTION_DELETE,
                                 command=head_modules[-1].command)
     self.assertEqual(len(wf.modules), 9)
     self.assertEqual(wf.descriptor.identifier, '0000000A')
     self.assertEqual(wf.descriptor.action, ACTION_DELETE)
     self.assertEqual(wf.descriptor.package_id, PACKAGE_PYTHON)
     self.assertEqual(wf.descriptor.command_id, PYTHON_CODE)
     vt = OSViztrailHandle.load_viztrail(base_path)
     branch = vt.get_default_branch()
     history = branch.get_history()
     self.assertEqual(len(history), 11)
     wf = branch.get_head()
     self.assertEqual(len(wf.modules), 9)
     self.assertEqual(wf.descriptor.identifier, '0000000A')
     self.assertEqual(wf.descriptor.action, ACTION_DELETE)
     self.assertEqual(wf.descriptor.package_id, PACKAGE_PYTHON)
     self.assertEqual(wf.descriptor.command_id, PYTHON_CODE)
 def test_running(self):
     """Update module state from pending to running."""
     # Create original module
     module = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         module_folder=MODULE_DIR,
         timestamp=ModuleTimestamp(),
         datasets={'DS1': DS1},
         outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
         provenance=ModuleProvenance(
             read={'DS1': 'ID1'},
             write={'DS1': DatasetDescriptor(identifier='ID2')},
             resources={'fileid': '0123456789'}))
     self.assertTrue(module.is_pending)
     module.set_running(external_form='TEST MODULE')
     self.assertTrue(module.is_running)
     self.assertIsNotNone(module.timestamp.started_at)
     self.assertEqual(len(module.datasets), 0)
     self.assertEqual(len(module.outputs.stderr), 0)
     self.assertEqual(len(module.outputs.stdout), 0)
     self.assertIsNotNone(module.provenance.read)
     self.assertIsNotNone(module.provenance.write)
     self.assertIsNotNone(module.provenance.resources)
     # Read module from object store and ensure that tall changes have been
     # materialized properly
     module = OSModuleHandle.load_module(identifier=module.identifier,
                                         module_path=module.module_path)
     self.assertTrue(module.is_running)
     self.assertIsNotNone(module.timestamp.started_at)
     self.assertEqual(len(module.datasets), 0)
     self.assertEqual(len(module.outputs.stderr), 0)
     self.assertEqual(len(module.outputs.stdout), 0)
     self.assertIsNotNone(module.provenance.read)
     self.assertIsNotNone(module.provenance.write)
     self.assertIsNotNone(module.provenance.resources)
     # Set running with all optional parameters
     module.set_running(started_at=module.timestamp.created_at,
                        external_form='Some form')
     self.assertEqual(module.timestamp.started_at,
                      module.timestamp.created_at)
     self.assertEqual(module.external_form, 'Some form')
     module = OSModuleHandle.load_module(identifier=module.identifier,
                                         module_path=module.module_path)
     self.assertEqual(module.timestamp.started_at,
                      module.timestamp.created_at)
     self.assertEqual(module.external_form, 'Some form')
 def test_state(self):
     """Ensure that only one of the state flag is True at the same time."""
     # Create original module
     module = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         module_folder=MODULE_DIR,
         timestamp=ModuleTimestamp(),
         outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
         provenance=ModuleProvenance(
             read={'DS1': 'ID1'},
             write={'DS1': DatasetDescriptor(identifier='ID2',
                                             name='ID2')}))
     # Pending
     self.assertTrue(module.is_pending)
     self.assertFalse(module.is_canceled)
     self.assertFalse(module.is_error)
     self.assertFalse(module.is_running)
     self.assertFalse(module.is_success)
     # Running
     module.set_running(external_form='TEST MODULE')
     self.assertFalse(module.is_pending)
     self.assertFalse(module.is_canceled)
     self.assertFalse(module.is_error)
     self.assertTrue(module.is_running)
     self.assertFalse(module.is_success)
     # Canceled
     module.set_canceled()
     self.assertFalse(module.is_pending)
     self.assertTrue(module.is_canceled)
     self.assertFalse(module.is_error)
     self.assertFalse(module.is_running)
     self.assertFalse(module.is_success)
     # Error
     module.set_error()
     self.assertFalse(module.is_pending)
     self.assertFalse(module.is_canceled)
     self.assertTrue(module.is_error)
     self.assertFalse(module.is_running)
     self.assertFalse(module.is_success)
     # Success
     module.set_success()
     self.assertFalse(module.is_pending)
     self.assertFalse(module.is_canceled)
     self.assertFalse(module.is_error)
     self.assertFalse(module.is_running)
     self.assertTrue(module.is_success)
Пример #19
0
    def __init__(self, 
            identifier: str, 
            command: ModuleCommand, 
            external_form: str, 
            module_path: str,
            state: int = mstate.MODULE_PENDING, 
            timestamp: ModuleTimestamp = ModuleTimestamp(), 
            outputs: ModuleOutputs = ModuleOutputs(),
            provenance: ModuleProvenance = ModuleProvenance(), 
            object_store: ObjectStore = DefaultObjectStore()
        ):
        """Initialize the module handle. For new modules, datasets and outputs
        are initially empty.

        Parameters
        ----------
        identifier : string
            Unique module identifier
        command : vizier.viztrail.command.ModuleCommand
            Specification of the module (i.e., package, name, and arguments)
        external_form: string
            Printable representation of module command
        module_path: string
            Path to module resource in object store
        state: int
            Module state (one of PENDING, RUNNING, CANCELED, ERROR, SUCCESS)
        timestamp: vizier.viztrail.module.timestamp.ModuleTimestamp, optional
            Module timestamp
        outputs: vizier.viztrail.module.output.ModuleOutputs, optional
            Module output streams STDOUT and STDERR
        provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
            Provenance information about datasets that were read and writen by
            previous execution of the module.
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources
        """
        super(OSModuleHandle, self).__init__(
            identifier=identifier,
            command=command,
            external_form=external_form,
            state=state,
            timestamp=timestamp,
            outputs= outputs,
            provenance=provenance,
        )
        self.module_path = module_path
        self.object_store = object_store
Пример #20
0
    def create_exec_result(self,
                           dataset_name,
                           input_dataset=None,
                           output_dataset=None,
                           database_state=None,
                           stdout=None,
                           resources=None):
        """Create execution result object for a successfully completed task.
        Assumes that a single datasets has been modified.

        Note that this method is not suitable to generate the result object for
        the drop dataset and rename dataset commands.

        Parameters
        ----------
        dataset_name: string
            Name of the manipulated dataset
        input_dataset: vizier.datastore.dataset.DatasetDescriptor
            Descriptor for the input dataset
        output_dataset: vizier.datastore.dataset.DatasetDescriptor, optional
            Descriptor for the resulting dataset
        database_state: dict, optional
            Identifier for datasets in the database state agains which a task
            was executed (keyed by user-provided name)
        stdout= list(string), optional
            Lines in the command output
        resources: dict, optional
            Optional resources that were generated by the command

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        if not output_dataset is None:
            ds = DatasetDescriptor(identifier=output_dataset.identifier,
                                   columns=output_dataset.columns,
                                   row_count=output_dataset.row_count)
        else:
            ds = None
        return ExecResult(
            outputs=ModuleOutputs(stdout=[TextOutput(line)
                                          for line in stdout]),
            provenance=ModuleProvenance(
                read={dataset_name: input_dataset.identifier}
                if not input_dataset is None else None,
                write={dataset_name: ds},
                resources=resources))
Пример #21
0
    def __init__(self, is_success=True, outputs=None, provenance=None):
        """Initialize the result components.

        Parameters
        ----------
        is_success: bool
            Flag indicating if execution was successful
        outputs: vizier.viztrail.module.output.ModuleOutputs, optional
            Outputs to STDOUT and STDERR generated during task execution
        provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
            Provenance information about datasets that were read and writen
            during task execution.
        """
        self.is_success = is_success
        self.outputs = outputs if not outputs is None else ModuleOutputs()
        self.provenance = provenance if not provenance is None else ModuleProvenance(
        )
 def test_multi_append(self):
     """Test appending modules to viztrail branch."""
     base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(base_path)
     vt = OSViztrailHandle.create_viztrail(identifier='ABC',
                                           properties=None,
                                           base_path=base_path)
     branch = vt.get_default_branch()
     # Append ten modules
     for i in range(10):
         ts = get_current_time()
         command = python_cell(source='print ' + str(i) + '+' + str(i))
         module = OSModuleHandle.create_module(
             command=command,
             external_form='print ' + str(i) + '+' + str(i),
             state=MODULE_SUCCESS,
             datasets=dict(),
             outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
             provenance=ModuleProvenance(),
             timestamp=ModuleTimestamp(created_at=ts,
                                       started_at=ts,
                                       finished_at=ts),
             module_folder=vt.modules_folder,
             object_store=vt.object_store)
         if not branch.head is None:
             modules = branch.head.modules + [module]
         else:
             modules = [module]
         branch.append_workflow(modules=modules,
                                action=ACTION_INSERT,
                                command=command)
         self.assertEqual(len(branch.get_history()), (i + 1))
     vt = OSViztrailHandle.load_viztrail(base_path)
     branch = vt.get_default_branch()
     history = branch.get_history()
     self.assertEqual(len(history), 10)
     for i in range(10):
         wf = branch.get_workflow(history[i].identifier)
         self.assertEqual(len(wf.modules), (i + 1))
         for m in range(i + 1):
             module = wf.modules[m]
             self.assertEqual(module.external_form,
                              'print ' + str(m) + '+' + str(m))
             self.assertEqual(module.outputs.stdout[-1].value, str(m + m))
Пример #23
0
    def __init__(self,
                 command,
                 external_form,
                 identifier=None,
                 state=None,
                 timestamp=None,
                 datasets=None,
                 outputs=None,
                 provenance=None):
        """Initialize the module handle. For new modules, datasets and outputs
        are initially empty.

        Parameters
        ----------
        command : vizier.viztrail.command.ModuleCommand
            Specification of the module (i.e., package, name, and arguments)
        external_form: string
            Printable representation of module command
        identifier : string, optional
            Unique module identifier
        state: int
            Module state (one of PENDING, RUNNING, CANCELED, ERROR, SUCCESS)
        timestamp: vizier.viztrail.module.timestamp.ModuleTimestamp, optional
            Module timestamp
        datasets : dict(vizier.datastore.dataset.DatasetDescriptor), optional
            Dictionary of resulting datasets. The user-specified name is the key
            and the unique dataset identifier the value.
        outputs: vizier.viztrail.module.output.ModuleOutputs, optional
            Module output streams STDOUT and STDERR
        provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
            Provenance information about datasets that were read and writen by
            previous execution of the module.
        """
        super(ModuleHandle, self).__init__(
            state=state if not state is None else MODULE_PENDING)
        self.identifier = identifier
        self.command = command
        self.external_form = external_form
        self.datasets = datasets if not datasets is None else dict()
        self.outputs = outputs if not outputs is None else ModuleOutputs()
        self.provenance = provenance if not provenance is None else ModuleProvenance(
        )
        self.timestamp = timestamp if not timestamp is None else ModuleTimestamp(
        )
Пример #24
0
 def test_create_and_delete_branch_with_default_workflow(self):
     """Ensure that creating and loading branches works if the head workflow
     for the new branch is given.
     """
     base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(base_path)
     vt = OSViztrailHandle.create_viztrail(
         identifier='DEF',
         properties={PROPERTY_NAME: 'My Viztrail'},
         base_path=base_path)
     self.assertEqual(vt.last_modified_at,
                      vt.default_branch.last_modified_at)
     # Create five modules
     modules = list()
     for i in range(5):
         identifier = OSModuleHandle.create_module(
             command=python_cell(source='print ' + str(i)),
             external_form='TEST MODULE ' + str(i),
             state=MODULE_SUCCESS,
             outputs=ModuleOutputs(),
             provenance=ModuleProvenance(),
             timestamp=ModuleTimestamp(),
             datasets=dict(),
             module_folder=vt.modules_folder,
         ).identifier
         modules.append(identifier)
     branch = vt.create_branch(properties={PROPERTY_NAME: 'My Branch'},
                               modules=modules)
     self.assertIsNotNone(branch.head)
     self.assertEqual(len(branch.workflows), 1)
     vt = OSViztrailHandle.load_viztrail(base_path)
     branch = vt.get_branch(branch.identifier)
     self.assertIsNotNone(branch.head)
     self.assertEqual(len(branch.workflows), 1)
     wf = branch.get_workflow(branch.head.identifier)
     self.assertEqual(len(wf.modules), 5)
     for i in range(5):
         self.assertEqual(wf.modules[i].external_form,
                          'TEST MODULE ' + str(i))
     self.assertEqual(vt.last_modified_at, branch.last_modified_at)
     self.assertEqual(vt.last_modified_at, branch.last_modified_at)
Пример #25
0
def PROVENANCE(obj):
    """Convert the serialization of module provenance information into an
    instance of the ModuleProvenance class.

    Parameters
    ----------
    obj: dict
        Serialization of module provenance information

    Returns
    -------
    vizier.viztrail.module.provenance.ModuleProvenance
    """
    # All components are optional
    # Dictionary of datasets that were read
    read = None
    if 'read' in obj:
        read = dict()
        for ds in obj['read']:
            read[ds[labels.NAME]] = ds[labels.ID]
    # Dictionary of datasets that were written
    write = None
    if 'write' in obj:
        write = dict()
        for ds in obj['write']:
            if 'dataset' in ds:
                write[ds[labels.NAME]] = DATASET_DESCRIPTOR(ds['dataset'])
            else:
                write[ds[labels.NAME]] = None
    # names of datasets that were deleted
    delete = None
    if 'delete' in obj:
        delete = obj['delete']
    # Optional resource information (dictionary)
    resources = None
    if 'resources' in obj:
        resources = obj['resources']
    return ModuleProvenance(read=read,
                            write=write,
                            delete=delete,
                            resources=resources)
Пример #26
0
    def set_success(self,
                    finished_at=None,
                    datasets=None,
                    outputs=None,
                    provenance=None):
        """Set status of the module to success. The finished_at property of the
        timestamp is set to the given value or the current time (if None).

        If case of a successful module execution the database state and module
        provenance information are also adjusted together with the module
        output streams.

        Parameters
        ----------
        finished_at: datetime.datetime, optional
            Timestamp when module started running
        datasets : dict(vizier.datastore.dataset.DatasetDescriptor), optional
            Dictionary of resulting datasets. The user-specified name is the key
            for the dataset descriptor.
        outputs: vizier.viztrail.module.output.ModuleOutputs, optional
            Output streams for module
        provenance: vizier.viztrail.module.provenance.ModuleProvenance, optional
            Provenance information about datasets that were read and writen by
            previous execution of the module.
        """
        # Update state, timestamp, database state, outputs and provenance
        # information.
        self.state = mstate.MODULE_SUCCESS
        self.timestamp.finished_at = finished_at if not finished_at is None else get_current_time(
        )
        # If the module is set to success straight from pending state the
        # started_at timestamp may not have been set.
        if self.timestamp.started_at is None:
            self.timestamp.started_at = self.timestamp.finished_at
        self.outputs = outputs if not outputs is None else ModuleOutputs()
        self.datasets = datasets if not datasets is None else dict()
        self.provenance = provenance if not provenance is None else ModuleProvenance(
        )
        # Materialize module state
        self.write_safe()
 def test_load_with_missing_modules(self):
     """Test loading workflows with active modules."""
     base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(base_path)
     vt = OSViztrailHandle.create_viztrail(identifier='ABC',
                                           properties=None,
                                           base_path=base_path)
     branch = vt.get_default_branch()
     # Append ten modules
     for i in range(5):
         ts = get_current_time()
         command = python_cell(source='print ' + str(i) + '+' + str(i))
         module = OSModuleHandle.create_module(
             command=command,
             external_form='print ' + str(i) + '+' + str(i),
             state=MODULE_SUCCESS,
             datasets=dict(),
             outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
             provenance=ModuleProvenance(),
             timestamp=ModuleTimestamp(created_at=ts,
                                       started_at=ts,
                                       finished_at=ts),
             module_folder=vt.modules_folder,
             object_store=vt.object_store)
         if not branch.head is None:
             modules = branch.head.modules + [module]
         else:
             modules = [module]
         branch.append_workflow(modules=modules,
                                action=ACTION_INSERT,
                                command=command)
         self.assertEqual(len(branch.get_history()), (i + 1))
     # Delete the file for the third module to simulate an error condition in
     # which a file wasn't written properly
     os.remove(branch.head.modules[2].module_path)
     self.assertFalse(os.path.isfile(branch.head.modules[2].module_path))
     vt = OSViztrailHandle.load_viztrail(base_path)
     branch = vt.get_default_branch()
     self.assertTrue(branch.head.get_state().is_error)
     self.assertTrue(branch.head.modules[2].is_error)
Пример #28
0
    def compute_rename_dataset(self, args, context):
        """Execute rename dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get name of existing dataset and the new dataset name. Raise
        # exception if a dataset with the new name already exists or if the new
        # dataset name is not a valid name.
        ds_name = args.get_value(pckg.PARA_DATASET).lower()
        new_name = args.get_value(pckg.PARA_NAME).lower()
        if new_name in context.datasets:
            raise ValueError('dataset \'' + new_name + '\' exists')
        if not is_valid_name(new_name):
            raise ValueError('invalid dataset name \'' + new_name + '\'')
        #  Get dataset. Raises exception if the dataset does not exist.
        ds = context.get_dataset(ds_name)
        # Adjust database state
        datasets = dict(context.datasets)
        del datasets[ds_name]
        datasets[new_name] = ds
        return ExecResult(
            outputs=ModuleOutputs(stdout=[TextOutput('1 dataset renamed')]),
            provenance=ModuleProvenance(read=dict(),
                                        write={
                                            new_name:
                                            DatasetDescriptor(
                                                identifier=ds.identifier,
                                                columns=ds.columns,
                                                row_count=ds.row_count)
                                        },
                                        delete=[ds_name]))
Пример #29
0
 def test_create_branch_of_active_workflow(self):
     """Ensure thatan exception is raised when attempting to branch of a
     workflow with active modules. None of the branch resources should be
     created.
     """
     base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(base_path)
     vt = OSViztrailHandle.create_viztrail(
         identifier='DEF',
         properties={PROPERTY_NAME: 'My Viztrail'},
         base_path=base_path)
     # Create one branch
     branch = vt.create_branch(properties={PROPERTY_NAME: 'My Branch'})
     branch_path = os.path.join(base_path, viztrail.FOLDER_BRANCHES,
                                branch.identifier)
     self.assertTrue(os.path.isdir(branch_path))
     files = os.listdir(os.path.join(base_path, viztrail.FOLDER_BRANCHES))
     # Create five modules. The last one is active
     modules = list()
     for i in range(5):
         m = OSModuleHandle.create_module(
             command=python_cell(source='print ' + str(i)),
             external_form='TEST MODULE ' + str(i),
             state=MODULE_SUCCESS,
             outputs=ModuleOutputs(),
             provenance=ModuleProvenance(),
             timestamp=ModuleTimestamp(),
             datasets=dict(),
             module_folder=vt.modules_folder,
         )
         modules.append(m.identifier)
     m.set_running(external_form='TEST MODULE')
     with self.assertRaises(ValueError):
         vt.create_branch(properties={PROPERTY_NAME: 'My Branch'},
                          modules=modules)
     # Ensure that no additional entry in the branches folder is created
     self.assertEqual(
         len(files),
         len(os.listdir(os.path.join(base_path, viztrail.FOLDER_BRANCHES))))
 def test_safe_write(self):
     """Update module state with write error."""
     # Create original module
     module = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         module_folder=MODULE_DIR,
         timestamp=ModuleTimestamp(),
         outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
         provenance=ModuleProvenance(
             read={'DS1': 'ID1'},
             write={'DS1': DatasetDescriptor(identifier='ID2',
                                             name='ID2')}))
     self.assertTrue(module.is_pending)
     module.set_running(external_form='TEST MODULE')
     self.assertTrue(module.is_running)
     module.set_success(outputs=ModuleOutputs(stderr=[None]))
     self.assertTrue(module.is_error)
     module = OSModuleHandle.load_module(identifier=module.identifier,
                                         module_path=module.module_path)
     self.assertTrue(module.is_running)