Example #1
 def test_adjust_state(self):
     """Test adjusting state for modules that do not require execution."""
     # Current database state
     datasets = {
         'A': DatasetDescriptor(identifier='123'),
         'B': DatasetDescriptor(identifier='345'),
         'C': DatasetDescriptor(identifier='567')
     }
     # Read 'A' and 'B', write 'B', delete 'C', and create new dataset 'D'
     prov = ModuleProvenance(read={
         'A': '123',
         'B': '345'
     },
                             write={
                                 'B': DatasetDescriptor(identifier='666'),
                                 'D': DatasetDescriptor(identifier='999')
                             },
                             delete=['C'])
     self.assertFalse(prov.requires_exec(datasets))
     state = prov.get_database_state(prev_state=datasets)
     # The resulting state should contain 'A'->123, 'B'->666, and 'D'->999
     self.assertEqual(len(state), 3)
     for name in ['A', 'B', 'D']:
         self.assertTrue(name in state)
     self.assertEqual(state['A'].identifier, '123')
     self.assertEqual(state['B'].identifier, '666')
     self.assertEqual(state['D'].identifier, '999')
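For reference, a minimal self-contained sketch of the state adjustment this test exercises; the dict-based logic below is distilled from the assertions and is an assumption, not vizier's actual ModuleProvenance.get_database_state.

def get_database_state(prev_state, write, delete):
    # Start from the previous state, drop deleted names, then apply writes.
    state = dict(prev_state)
    for name in delete:
        state.pop(name, None)
    state.update(write)
    return state

prev = {'A': '123', 'B': '345', 'C': '567'}
state = get_database_state(prev, write={'B': '666', 'D': '999'}, delete=['C'])
assert state == {'A': '123', 'B': '666', 'D': '999'}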
Example #2
 def test_unique_name(self):
     """Test method that computes unique column names."""
     ds = DatasetDescriptor(identifier='0',
                            columns=[
                                DatasetColumn(identifier=0, name='ABC'),
                                DatasetColumn(identifier=1, name='A'),
                                DatasetColumn(identifier=2, name='ABC_1'),
                                DatasetColumn(identifier=3, name='DEF'),
                                DatasetColumn(identifier=4, name='xyz'),
                            ])
     self.assertEqual(ds.get_unique_name('Age'), 'Age')
     self.assertEqual(ds.get_unique_name('XYZ'), 'XYZ_1')
     self.assertEqual(ds.get_unique_name('xyz'), 'xyz_1')
     self.assertEqual(ds.get_unique_name('ABC'), 'ABC_2')
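A plausible, self-contained version of the case-insensitive suffixing that the assertions above imply; treat it as a sketch, not the actual DatasetDescriptor.get_unique_name.

def get_unique_name(column_names, name):
    # Case-insensitive check; append _1, _2, ... until the name is free.
    taken = {n.lower() for n in column_names}
    if name.lower() not in taken:
        return name
    suffix = 1
    while '{}_{}'.format(name, suffix).lower() in taken:
        suffix += 1
    return '{}_{}'.format(name, suffix)

names = ['ABC', 'A', 'ABC_1', 'DEF', 'xyz']
assert get_unique_name(names, 'Age') == 'Age'
assert get_unique_name(names, 'XYZ') == 'XYZ_1'
assert get_unique_name(names, 'ABC') == 'ABC_2'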
Example #3
 def test_load_with_dataset_delete(self):
     """Test loading workflows where each module creates a new dataset and
     deletes the previous dataset (except for the first module).
     """
     base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(base_path)
     vt = OSViztrailHandle.create_viztrail(identifier='ABC',
                                           properties={},
                                           base_path=base_path)
     branch = vt.get_default_branch()
     # Append five modules
     for i in range(5):
         ts = get_current_time()
         deleted_datasets = list()
         if i > 0:
             deleted_datasets.append('DS' + str(i - 1))
         command = python_cell(source='print ' + str(i) + '+' + str(i))
         module = OSModuleHandle.create_module(
             command=command,
             external_form='print ' + str(i) + '+' + str(i),
             state=MODULE_SUCCESS,
             outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
             provenance=ModuleProvenance(write={
                 'DS' + str(i):
                 DatasetDescriptor(
                     identifier=str(i),
                     name='DS' + str(i),
                     columns=[
                         DatasetColumn(identifier=j, name=str(j))
                         for j in range(i)
                     ],
                 )
             },
                                         delete=deleted_datasets),
             timestamp=ModuleTimestamp(created_at=ts,
                                       started_at=ts,
                                       finished_at=ts),
             module_folder=vt.modules_folder,
             object_store=vt.object_store)
         if branch.head is not None:
             modules = branch.head.modules + [module]
         else:
             modules = [module]
         branch.append_workflow(modules=modules,
                                action=ACTION_INSERT,
                                command=command)
     vt = OSViztrailHandle.load_viztrail(base_path)
     workflow = vt.get_default_branch().get_head()
     self.assertEqual(len(workflow.modules), 5)
     datasets = {}
     for i in range(5):
         module = workflow.modules[i]
         datasets = module.provenance.get_database_state(datasets)
         self.assertEqual(len(datasets), 1)
         key = 'DS' + str(i)
         self.assertTrue(key in datasets)
         self.assertEqual(len(datasets[key].columns), i)
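The replay loop at the end of the test reduces to the following standalone sketch: each module deletes its predecessor's dataset and writes its own, so the state always holds exactly one entry (names assumed).

state = {}
for i in range(5):
    if i > 0:
        state.pop('DS' + str(i - 1), None)   # delete predecessor
    state['DS' + str(i)] = str(i)            # write this module's dataset
    assert len(state) == 1 and 'DS' + str(i) in state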
Example #4
    def create_dataset(self, columns, rows, annotations=None):
        """Create a new dataset in the datastore. Expects at least the list of
        columns and the rows for the dataset.

        Raises ValueError if (1) the column identifiers are not unique, (2)
        the row identifiers are not unique, (3) the number of columns and
        values in a row do not match, (4) any of the column or row identifiers
        have a negative value, or (5) the given column or row counter has a
        value lower than or equal to any of the column or row identifiers.

        Parameters
        ----------
        columns: list(vizier.datastore.dataset.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.dataset.DatasetRow)
            List of dataset rows.
        annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.dataset.DatasetDescriptor
        """
        # Validate (i) that each column has a unique identifier, (ii) each row
        # has a unique identifier, and (iii) that every row has exactly one
        # value per column.
        _, max_row_id = validate_dataset(columns=columns, rows=rows)
        # Get new identifier and create directory for new dataset
        identifier = get_unique_identifier()
        dataset_dir = self.get_dataset_dir(identifier)
        os.makedirs(dataset_dir)
        # Write rows to data file
        data_file = os.path.join(dataset_dir, DATA_FILE)
        DefaultJsonDatasetReader(data_file).write(rows)
        # Filter annotations for non-existing resources
        if annotations is not None:
            annotations = annotations.filter(
                columns=[c.identifier for c in columns],
                rows=[r.identifier for r in rows])
        # Create dataset and write dataset file
        dataset = FileSystemDatasetHandle(identifier=identifier,
                                          columns=columns,
                                          data_file=data_file,
                                          row_count=len(rows),
                                          max_row_id=max_row_id,
                                          annotations=annotations)
        dataset.to_file(
            descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
        # Write metadata file if annotations are given
        if annotations is not None:
            dataset.annotations.to_file(self.get_metadata_filename(identifier))
        # Return descriptor for the new dataset
        return DatasetDescriptor(identifier=dataset.identifier,
                                 columns=dataset.columns,
                                 row_count=dataset.row_count)
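The five ValueError conditions in the docstring can be illustrated with a self-contained validator; this is a sketch of the documented contract, not vizier's validate_dataset.

def validate_dataset(columns, rows):
    # Returns (max_column_id, max_row_id); raises ValueError on violations.
    col_ids = [c.identifier for c in columns]
    if len(set(col_ids)) != len(col_ids):
        raise ValueError('duplicate column identifier')
    if any(i < 0 for i in col_ids):
        raise ValueError('negative column identifier')
    row_ids = set()
    for row in rows:
        if row.identifier < 0 or row.identifier in row_ids:
            raise ValueError('invalid row identifier')
        if len(row.values) != len(columns):
            raise ValueError('row and column count do not match')
        row_ids.add(row.identifier)
    return max(col_ids, default=-1), max(row_ids, default=-1)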
Example #5
    def compute_empty_dataset(self, args, context):
        """Execute empty dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        outputs = ModuleOutputs()
        default_columns = [("''", "unnamed_column")]
        ds_name = args.get_value(pckg.PARA_NAME).lower()
        if ds_name in context.datasets:
            raise ValueError('dataset \'' + ds_name + '\' exists')
        if not is_valid_name(ds_name):
            raise ValueError('invalid dataset name \'' + ds_name + '\'')
        try:
            source = "SELECT {};".format(", ".join(
                default_val + " AS " + col_name
                for default_val, col_name in default_columns))
            view_name, dependencies = mimir.createView(dict(), source)

            columns = [
                MimirDatasetColumn(identifier=col_id,
                                   name_in_dataset=col_defn[1])
                for col_defn, col_id in zip(default_columns,
                                            range(len(default_columns)))
            ]

            ds = context.datastore.register_dataset(table_name=view_name,
                                                    columns=columns,
                                                    row_counter=1)
            provenance = ModuleProvenance(
                write={
                    ds_name:
                    DatasetDescriptor(identifier=ds.identifier,
                                      columns=ds.columns,
                                      row_count=ds.row_count)
                },
                read=dict()  # Need to explicitly declare a lack of dependencies.
            )
            outputs.stdout.append(
                TextOutput("Empty dataset '{}' created".format(ds_name)))
        except Exception as ex:
            provenance = ModuleProvenance()
            outputs.error(ex)
        return ExecResult(is_success=(len(outputs.stderr) == 0),
                          outputs=outputs,
                          provenance=provenance)
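In isolation, the SELECT statement assembled in the try block produces one literal per default column, aliased to the column name:

default_columns = [("''", "unnamed_column")]
source = "SELECT {};".format(", ".join(
    default_val + " AS " + col_name
    for default_val, col_name in default_columns))
assert source == "SELECT '' AS unnamed_column;"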
Example #6
def DATASET_DESCRIPTOR(obj):
    """Convert a dictionary into a dataset descriptor.

    Parameters
    ----------
    obj: dict
        Default serialization for a dataset descriptor

    Returns
    -------
    vizier.datastore.dataset.DatasetDescriptor
    """
    return DatasetDescriptor(identifier=obj[labels.ID],
                             columns=DATASET_COLUMNS(obj[labels.COLUMNS]),
                             row_count=obj[labels.ROWCOUNT])
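For illustration, one plausible shape of the serialized dictionary this function consumes; the literal keys below are stand-ins for the label constants in vizier's labels module, not their actual values.

obj = {
    'id': 'DS123',                          # labels.ID (assumed key)
    'columns': [{'id': 0, 'name': 'ABC'}],  # labels.COLUMNS (assumed key)
    'rowCount': 100                         # labels.ROWCOUNT (assumed key)
}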
Example #7
def DATASET_DESCRIPTOR(obj: Dict[str, Any]) -> DatasetDescriptor:
    """Convert a dictionary into a dataset descriptor.

    Parameters
    ----------
    obj: dict
        Default serialization for a dataset descriptor

    Returns
    -------
    vizier.datastore.dataset.DatasetDescriptor
    """
    return DatasetDescriptor(identifier=obj[labels.ID],
                             name=obj[labels.NAME],
                             columns=DATASET_COLUMNS(obj[labels.COLUMNS]))
Example #8
 def test_running(self):
     """Update module state from pending to running."""
     # Create original module
     module = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         module_folder=MODULE_DIR,
         timestamp=ModuleTimestamp(),
         datasets={'DS1': DS1},
         outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
         provenance=ModuleProvenance(
             read={'DS1': 'ID1'},
             write={'DS1': DatasetDescriptor(identifier='ID2')},
             resources={'fileid': '0123456789'}))
     self.assertTrue(module.is_pending)
     module.set_running(external_form='TEST MODULE')
     self.assertTrue(module.is_running)
     self.assertIsNotNone(module.timestamp.started_at)
     self.assertEqual(len(module.datasets), 0)
     self.assertEqual(len(module.outputs.stderr), 0)
     self.assertEqual(len(module.outputs.stdout), 0)
     self.assertIsNotNone(module.provenance.read)
     self.assertIsNotNone(module.provenance.write)
     self.assertIsNotNone(module.provenance.resources)
     # Read module from object store and ensure that all changes have been
     # materialized properly
     module = OSModuleHandle.load_module(identifier=module.identifier,
                                         module_path=module.module_path)
     self.assertTrue(module.is_running)
     self.assertIsNotNone(module.timestamp.started_at)
     self.assertEqual(len(module.datasets), 0)
     self.assertEqual(len(module.outputs.stderr), 0)
     self.assertEqual(len(module.outputs.stdout), 0)
     self.assertIsNotNone(module.provenance.read)
     self.assertIsNotNone(module.provenance.write)
     self.assertIsNotNone(module.provenance.resources)
     # Set running with all optional parameters
     module.set_running(started_at=module.timestamp.created_at,
                        external_form='Some form')
     self.assertEqual(module.timestamp.started_at,
                      module.timestamp.created_at)
     self.assertEqual(module.external_form, 'Some form')
     module = OSModuleHandle.load_module(identifier=module.identifier,
                                         module_path=module.module_path)
     self.assertEqual(module.timestamp.started_at,
                      module.timestamp.created_at)
     self.assertEqual(module.external_form, 'Some form')
Example #9
 def test_state(self):
     """Ensure that only one of the state flag is True at the same time."""
     # Create original module
     module = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         module_folder=MODULE_DIR,
         timestamp=ModuleTimestamp(),
         outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
         provenance=ModuleProvenance(
             read={'DS1': 'ID1'},
             write={'DS1': DatasetDescriptor(identifier='ID2',
                                             name='ID2')}))
     # Pending
     self.assertTrue(module.is_pending)
     self.assertFalse(module.is_canceled)
     self.assertFalse(module.is_error)
     self.assertFalse(module.is_running)
     self.assertFalse(module.is_success)
     # Running
     module.set_running(external_form='TEST MODULE')
     self.assertFalse(module.is_pending)
     self.assertFalse(module.is_canceled)
     self.assertFalse(module.is_error)
     self.assertTrue(module.is_running)
     self.assertFalse(module.is_success)
     # Canceled
     module.set_canceled()
     self.assertFalse(module.is_pending)
     self.assertTrue(module.is_canceled)
     self.assertFalse(module.is_error)
     self.assertFalse(module.is_running)
     self.assertFalse(module.is_success)
     # Error
     module.set_error()
     self.assertFalse(module.is_pending)
     self.assertFalse(module.is_canceled)
     self.assertTrue(module.is_error)
     self.assertFalse(module.is_running)
     self.assertFalse(module.is_success)
     # Success
     module.set_success()
     self.assertFalse(module.is_pending)
     self.assertFalse(module.is_canceled)
     self.assertFalse(module.is_error)
     self.assertFalse(module.is_running)
     self.assertTrue(module.is_success)
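The mutual-exclusion invariant checked above is consistent with a single state field behind the is_* flags; a sketch of that assumed design:

MODULE_PENDING, MODULE_RUNNING, MODULE_CANCELED, MODULE_ERROR, MODULE_SUCCESS = range(5)

class StateFlags:
    def __init__(self, state):
        self.state = state

    @property
    def is_pending(self):
        return self.state == MODULE_PENDING

    @property
    def is_running(self):
        return self.state == MODULE_RUNNING

m = StateFlags(MODULE_RUNNING)
assert m.is_running and not m.is_pending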
Example #10
    def create_exec_result(self,
                           dataset_name,
                           input_dataset=None,
                           output_dataset=None,
                           database_state=None,
                           stdout=None,
                           resources=None):
        """Create execution result object for a successfully completed task.
        Assumes that a single dataset has been modified.

        Note that this method is not suitable to generate the result object for
        the drop dataset and rename dataset commands.

        Parameters
        ----------
        dataset_name: string
            Name of the manipulated dataset
        input_dataset: vizier.datastore.dataset.DatasetDescriptor
            Descriptor for the input dataset
        output_dataset: vizier.datastore.dataset.DatasetDescriptor, optional
            Descriptor for the resulting dataset
        database_state: dict, optional
            Identifier for datasets in the database state against which a task
            was executed (keyed by user-provided name)
        stdout: list(string), optional
            Lines in the command output
        resources: dict, optional
            Optional resources that were generated by the command

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        if output_dataset is not None:
            ds = DatasetDescriptor(identifier=output_dataset.identifier,
                                   columns=output_dataset.columns,
                                   row_count=output_dataset.row_count)
        else:
            ds = None
        return ExecResult(
            outputs=ModuleOutputs(stdout=[TextOutput(line)
                                          for line in (stdout or [])]),
            provenance=ModuleProvenance(
                read={dataset_name: input_dataset.identifier}
                if input_dataset is not None else None,
                write={dataset_name: ds},
                resources=resources))
Example #11
    def compute_rename_dataset(self, args, context):
        """Execute rename dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get name of existing dataset and the new dataset name. Raise
        # exception if a dataset with the new name already exists or if the new
        # dataset name is not a valid name.
        ds_name = args.get_value(pckg.PARA_DATASET).lower()
        new_name = args.get_value(pckg.PARA_NAME).lower()
        if new_name in context.datasets:
            raise ValueError('dataset \'' + new_name + '\' exists')
        if not is_valid_name(new_name):
            raise ValueError('invalid dataset name \'' + new_name + '\'')
        # Get dataset. Raises exception if the dataset does not exist.
        ds = context.get_dataset(ds_name)
        # Adjust database state
        datasets = dict(context.datasets)
        del datasets[ds_name]
        datasets[new_name] = ds
        return ExecResult(
            outputs=ModuleOutputs(stdout=[TextOutput('1 dataset renamed')]),
            provenance=ModuleProvenance(read=dict(),
                                        write={
                                            new_name:
                                            DatasetDescriptor(
                                                identifier=ds.identifier,
                                                columns=ds.columns,
                                                row_count=ds.row_count)
                                        },
                                        delete=[ds_name]))
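Stripped of the vizier types, the rename provenance amounts to writing the descriptor under the new name and deleting the old one:

def rename(state, old_name, new_name):
    state = dict(state)
    state[new_name] = state.pop(old_name)  # write new name, delete old
    return state

assert rename({'ds': 'ID1'}, 'ds', 'ds2') == {'ds2': 'ID1'}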
Example #12
 def test_safe_write(self):
     """Update module state with write error."""
     # Create original module
     module = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         module_folder=MODULE_DIR,
         timestamp=ModuleTimestamp(),
         outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
         provenance=ModuleProvenance(
             read={'DS1': 'ID1'},
             write={'DS1': DatasetDescriptor(identifier='ID2',
                                             name='ID2')}))
     self.assertTrue(module.is_pending)
     module.set_running(external_form='TEST MODULE')
     self.assertTrue(module.is_running)
     module.set_success(outputs=ModuleOutputs(stderr=[None]))
     self.assertTrue(module.is_error)
     module = OSModuleHandle.load_module(identifier=module.identifier,
                                         module_path=module.module_path)
     self.assertTrue(module.is_running)
Example #13
    def execute_query(self, args, context):
        """Execute a SQL query in the given context.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get the SQL source code that is in this cell
        source = args.get_value(cmd.PARA_SQL_SOURCE)
        if not source.endswith(';'):
            source = source + ';'
        ds_name = args.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
        # Get mapping of datasets in the context to their respective table
        # name in the Mimir backend
        mimir_table_names = dict()
        for ds_name_o in context.datasets:
            dataset_id = context.datasets[ds_name_o]
            dataset = context.datastore.get_dataset(dataset_id)
            if dataset is None:
                raise ValueError('unknown dataset \'' + ds_name_o + '\'')
            mimir_table_names[ds_name_o] = dataset.table_name
        # Module outputs
        outputs = ModuleOutputs()
        try:
            # Create the view from the SQL source
            view_name, dependencies = mimir.createView(mimir_table_names,
                                                       source)
            sql = 'SELECT * FROM ' + view_name
            mimirSchema = mimir.getSchema(sql)

            columns = list()

            for col in mimirSchema:
                col_id = len(columns)
                name_in_dataset = col['name']
                col = MimirDatasetColumn(identifier=col_id,
                                         name_in_dataset=name_in_dataset)
                columns.append(col)

            row_count = mimir.countRows(view_name)

            provenance = None
            if ds_name is None or ds_name == '':
                ds_name = "TEMPORARY_RESULT"

            ds = context.datastore.register_dataset(table_name=view_name,
                                                    columns=columns,
                                                    row_counter=row_count)
            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10)
            ds_output['name'] = ds_name

            dependencies = dict((dep_name.lower(),
                                 context.datasets.get(dep_name.lower(), None))
                                for dep_name in dependencies)
            # print("---- SQL DATASETS ----\n{}\n{}".format(context.datasets, dependencies))

            outputs.stdout.append(DatasetOutput(ds_output))
            provenance = ModuleProvenance(write={
                ds_name:
                DatasetDescriptor(identifier=ds.identifier,
                                  columns=ds.columns,
                                  row_count=ds.row_count)
            },
                                          read=dependencies)
        except Exception as ex:
            provenance = ModuleProvenance()
            outputs.error(ex)
        # Return execution result
        return ExecResult(is_success=(len(outputs.stderr) == 0),
                          outputs=outputs,
                          provenance=provenance)
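The dependency mapping built before the provenance object reduces to the following: each view dependency is lower-cased and resolved against the datasets in the current context (None when unknown).

context_datasets = {'sales': 'ID1'}
dependencies = ['Sales', 'Unknown']
resolved = dict((dep_name.lower(),
                 context_datasets.get(dep_name.lower(), None))
                for dep_name in dependencies)
assert resolved == {'sales': 'ID1', 'unknown': None}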
Example #14
 def test_column_index(self):
     """Test access to columns based on identifier and name."""
     ds = DatasetDescriptor(identifier='0',
                            columns=[
                                DatasetColumn(identifier=0, name='ABC'),
                                DatasetColumn(identifier=1, name='A'),
                                DatasetColumn(identifier=2, name='ABC'),
                                DatasetColumn(identifier=3, name='DEF'),
                                DatasetColumn(identifier=4, name='xyz'),
                            ])
     # Get column by identifier
     self.assertEqual(ds.column_by_id(0).name, 'ABC')
     self.assertEqual(ds.column_by_id(1).name, 'A')
     self.assertEqual(ds.column_by_id(2).name, 'ABC')
     self.assertEqual(ds.column_by_id(3).name, 'DEF')
     self.assertEqual(ds.column_by_id(4).name, 'xyz')
     with self.assertRaises(ValueError):
         ds.column_by_id(6)
     with self.assertRaises(ValueError):
         ds.column_by_id(-1)
     # Get column by name
     self.assertEqual(ds.column_by_name('ABC').identifier, 0)
     self.assertEqual(ds.column_by_name('A').identifier, 1)
     self.assertEqual(
         ds.column_by_name('abc', ignore_case=True).identifier, 0)
     self.assertEqual(
         ds.column_by_name('XYZ', ignore_case=True).identifier, 4)
     self.assertIsNone(ds.column_by_name('4'))
     # Get column index
     self.assertEqual(ds.column_index(0), 0)
     self.assertEqual(ds.column_index(1), 1)
     self.assertEqual(ds.column_index('DEF'), 3)
     self.assertEqual(ds.column_index('XYZ'), 4)
     self.assertEqual(ds.column_index('A'), 1)
     self.assertEqual(ds.column_index('B'), 1)
     self.assertEqual(ds.column_index('C'), 2)
     self.assertEqual(ds.column_index('D'), 3)
     self.assertEqual(ds.column_index('E'), 4)
     for i in range(len(ds.columns)):
         self.assertEqual(ds.get_index(i), i)
     with self.assertRaises(ValueError):
         ds.column_index('ABC')
     with self.assertRaises(ValueError):
         ds.column_index('abc')
     # Create a descriptor when column identifier does not match the index
     # position in the schema
     ds = DatasetDescriptor(identifier='0',
                            columns=[
                                DatasetColumn(identifier=4, name='ABC'),
                                DatasetColumn(identifier=2, name='A'),
                                DatasetColumn(identifier=3, name='ABC'),
                                DatasetColumn(identifier=0, name='DEF'),
                                DatasetColumn(identifier=1, name='xyz'),
                            ])
     self.assertEqual(ds.column_by_id(0).name, 'DEF')
     self.assertEqual(ds.column_by_id(1).name, 'xyz')
     self.assertEqual(ds.column_by_id(2).name, 'A')
     self.assertEqual(ds.column_by_id(3).name, 'ABC')
     self.assertEqual(ds.column_by_id(4).name, 'ABC')
     self.assertEqual(ds.column_index(0), 0)
     self.assertEqual(ds.column_index(1), 1)
     self.assertEqual(ds.column_index('DEF'), 3)
     self.assertEqual(ds.column_index('XYZ'), 4)
     self.assertEqual(ds.column_index('A'), 1)
     self.assertEqual(ds.column_index('B'), 1)
     self.assertEqual(ds.column_index('C'), 2)
     self.assertEqual(ds.column_index('D'), 3)
     self.assertEqual(ds.column_index('E'), 4)
     self.assertEqual(ds.get_index(0), 3)
     self.assertEqual(ds.get_index(1), 4)
     self.assertEqual(ds.get_index(2), 1)
     self.assertEqual(ds.get_index(3), 2)
     self.assertEqual(ds.get_index(4), 0)
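The assertions imply a three-step resolution order for column_index, reproduced in the sketch below (an assumption, not the actual implementation): integers pass through as positions, strings match column names case-insensitively, and single letters fall back to spreadsheet-style labels (A=0, B=1, ...).

def column_index(names, key):
    if isinstance(key, int):
        return key
    matches = [i for i, n in enumerate(names) if n.upper() == key.upper()]
    if len(matches) == 1:
        return matches[0]
    if len(matches) > 1:
        raise ValueError('ambiguous column name: ' + key)
    if len(key) == 1 and key.isalpha():
        return ord(key.upper()) - ord('A')
    raise ValueError('unknown column: ' + key)

names = ['ABC', 'A', 'ABC', 'DEF', 'xyz']
assert column_index(names, 'DEF') == 3
assert column_index(names, 'B') == 1   # spreadsheet-style fallback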
Example #15
import unittest

from vizier.core.timestamp import get_current_time, to_datetime
from vizier.datastore.dataset import DatasetColumn, DatasetDescriptor
from vizier.view.chart import ChartViewHandle, DataSeriesHandle
from vizier.viztrail.objectstore.module import OSModuleHandle
from vizier.viztrail.module.base import MODULE_PENDING, MODULE_SUCCESS
from vizier.viztrail.module.output import ModuleOutputs, OutputObject, TextOutput
from vizier.viztrail.module.provenance import ModuleProvenance
from vizier.viztrail.module.timestamp import ModuleTimestamp
from vizier.engine.packages.plot.command import create_plot
from vizier.engine.packages.pycell.command import python_cell

MODULE_DIR = './.temp'

DATASETS = {
    'DS1':
    DatasetDescriptor(identifier='ID1'),
    'DS2':
    DatasetDescriptor(identifier='ID2',
                      columns=[
                          DatasetColumn(identifier=0,
                                        name='ABC',
                                        data_type='int'),
                          DatasetColumn(identifier=1,
                                        name='xyz',
                                        data_type='real')
                      ],
                      row_count=100)
}


class TestOSModuleIO(unittest.TestCase):
Example #16
    @staticmethod
    def load_module(
            identifier: str, 
            module_path: str, 
            prev_state: Optional[Dict[str, ArtifactDescriptor]] = None, 
            object_store: ObjectStore = DefaultObjectStore()
        ) -> "OSModuleHandle":
        """Load module from given object store.

        Parameters
        ----------
        identifier: string
            Unique module identifier
        module_path: string
            Resource path for module object
        prev_state: dict(string: vizier.datastore.dataset.DatasetDescriptor)
            Dataset descriptors keyed by the user-provided name that exist in
            the database state of the previous module (in sequence of occurrence
            in the workflow)
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources

        Returns
        -------
        vizier.viztrail.objectstore.module.OSModuleHandle
        """
        # Read object from store. This may raise a ValueError to indicate that
        # the module does not exist (a system error condition). In this case
        # we return a new module that is in an error state.
        try:
            obj = cast(Dict[str, Any], object_store.read_object(object_path=module_path))
        except ValueError:
            return OSModuleHandle(
                identifier=identifier,
                command=ModuleCommand(
                    package_id=UNKNOWN_ID,
                    command_id=UNKNOWN_ID,
                    arguments=list(),
                    packages=None
                ),
                external_form='fatal error: object not found',
                module_path=module_path,
                state=mstate.MODULE_ERROR,
                object_store=object_store
            )
        # Create module command
        command = ModuleCommand(
            package_id=obj[KEY_COMMAND][KEY_PACKAGE_ID],
            command_id=obj[KEY_COMMAND][KEY_COMMAND_ID],
            arguments=obj[KEY_COMMAND][KEY_ARGUMENTS],
            packages=None
        )
        # Create module timestamps
        created_at = to_datetime(obj[KEY_TIMESTAMP][KEY_CREATED_AT])
        if KEY_STARTED_AT in obj[KEY_TIMESTAMP]:
            started_at: Optional[datetime] = to_datetime(obj[KEY_TIMESTAMP][KEY_STARTED_AT])
        else:
            started_at = None
        if KEY_FINISHED_AT in obj[KEY_TIMESTAMP]:
            finished_at: Optional[datetime] = to_datetime(obj[KEY_TIMESTAMP][KEY_FINISHED_AT])
        else:
            finished_at = None
        timestamp = ModuleTimestamp(
            created_at=created_at,
            started_at=started_at,
            finished_at=finished_at
        )
        # Create module output streams.
        outputs = ModuleOutputs(
            stdout=get_output_stream(obj[KEY_OUTPUTS][KEY_STDOUT]),
            stderr=get_output_stream(obj[KEY_OUTPUTS][KEY_STDERR])
        )
        # Create module provenance information
        read_prov = None
        if KEY_PROVENANCE_READ in obj[KEY_PROVENANCE]:
            read_prov = dict()
            for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_READ]:
                read_prov[ds[KEY_DATASET_NAME]] = ds[KEY_DATASET_ID]
        write_prov = None
        if KEY_PROVENANCE_WRITE in obj[KEY_PROVENANCE]:
            write_prov = dict()
            for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_WRITE]:
                if KEY_DATAOBJECT_TYPE in ds:
                    descriptor = ArtifactDescriptor(
                        identifier=ds[KEY_DATAOBJECT_ID],
                        name=ds[KEY_DATAOBJECT_NAME],
                        artifact_type=ds[KEY_DATAOBJECT_TYPE])
                else: 
                    descriptor = DatasetDescriptor(
                        identifier=ds[KEY_DATASET_ID],
                        name=ds[KEY_DATASET_NAME],
                        columns=[
                            DatasetColumn(
                                identifier=col[KEY_COLUMN_ID],
                                name=col[KEY_COLUMN_NAME],
                                data_type=col[KEY_COLUMN_TYPE]
                            ) for col in ds[KEY_DATASET_COLUMNS]
                        ]
                    )
                write_prov[ds[KEY_DATASET_NAME]] = descriptor
        if KEY_PROVENANCE_DELETE in obj[KEY_PROVENANCE]:
            delete_prov = set(obj[KEY_PROVENANCE][KEY_PROVENANCE_DELETE])
        else:
            delete_prov = set()
        if KEY_PROVENANCE_RESOURCES in obj[KEY_PROVENANCE]:
            res_prov = cast(Dict[str, Any], obj[KEY_PROVENANCE][KEY_PROVENANCE_RESOURCES])
        else:
            res_prov = dict()
        if KEY_PROVENANCE_CHARTS in obj[KEY_PROVENANCE]:
            charts_prov = [
                ( 
                    c[0], 
                    ChartViewHandle.from_dict(c[1])  # type: ignore[no-untyped-call]
                ) if isinstance(c, list) else 
                (
                    "Chart",
                    ChartViewHandle.from_dict(c)
                )
                for c in obj[KEY_PROVENANCE][KEY_PROVENANCE_CHARTS]
            ]
        else:
            charts_prov = list()
        provenance = ModuleProvenance(
            read=read_prov,
            write=write_prov,
            delete=delete_prov,
            resources=res_prov,
            charts=charts_prov
        )
        # Return module handle
        return OSModuleHandle(
            identifier=identifier,
            command=command,
            external_form=obj[KEY_EXTERNAL_FORM],
            module_path=module_path,
            state=obj[KEY_STATE],
            timestamp=timestamp,
            outputs=outputs,
            provenance=provenance,
            object_store=object_store,
        )
Example #17
import os
import shutil
import unittest

from vizier.core.timestamp import get_current_time
from vizier.datastore.dataset import DatasetDescriptor
from vizier.viztrail.objectstore.module import OSModuleHandle
from vizier.viztrail.module.base import MODULE_PENDING
from vizier.viztrail.module.output import ModuleOutputs, TextOutput
from vizier.viztrail.module.provenance import ModuleProvenance
from vizier.viztrail.module.timestamp import ModuleTimestamp
from vizier.engine.packages.pycell.command import python_cell

MODULE_DIR = './.temp'

DS1 = DatasetDescriptor(identifier='ID1', name='ID1')
DS2 = DatasetDescriptor(identifier='ID2', name='ID2')


class TestModuleState(unittest.TestCase):
    def setUp(self):
        """Create an empty directory."""
        if os.path.isdir(MODULE_DIR):
            shutil.rmtree(MODULE_DIR)
        os.makedirs(MODULE_DIR)

    def tearDown(self):
        """Delete directory.
        """
        shutil.rmtree(MODULE_DIR)
Example #18
    def execute_query(self, args: ModuleArguments,
                      context: TaskContext) -> ExecResult:
        """Execute a SQL query in the given context.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get the SQL source code that is in this cell
        source = args.get_value(cmd.PARA_SQL_SOURCE)
        if not source.endswith(';'):
            source = source + ';'
        ds_name = args.get_value(cmd.PARA_OUTPUT_DATASET, raise_error=False)
        # Get mapping of datasets in the context to their respective table
        # name in the Mimir backend
        mimir_table_names = dict()
        for ds_name_o in context.datasets:
            dataset_id = context.datasets[ds_name_o].identifier
            dataset = context.datastore.get_dataset(dataset_id)
            if dataset is None:
                raise ValueError('unknown dataset \'' + ds_name_o + '\'')
            mimir_table_names[ds_name_o] = dataset.identifier
        # Module outputs
        outputs = ModuleOutputs()
        is_success = True
        functions = {
            name: context.dataobjects[name].identifier
            for name in context.dataobjects
            if context.dataobjects[name].obj_type == ARTIFACT_TYPE_PYTHON
        }
        try:
            # Create the view from the SQL source
            view_name, dependencies, mimirSchema, properties, functionDeps = mimir.createView(
                datasets=mimir_table_names,
                query=source,
                functions=dict(functions))
            ds = MimirDatasetHandle.from_mimir_result(view_name, mimirSchema,
                                                      properties, ds_name)

            print(mimirSchema)

            if ds_name is None or ds_name == '':
                ds_name = "TEMPORARY_RESULT"

            from vizier.api.webservice import server

            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10)
            if ds_output is None:
                outputs.stderr.append(
                    TextOutput("Error displaying dataset {}".format(ds_name)))
            else:
                ds_output['name'] = ds_name
                outputs.stdout.append(DatasetOutput(ds_output))

            dependenciesDict: Dict[str, str] = {
                dep_name.lower(): get_artifact_id(dep)
                for dep_name, dep in [(
                    dep_name, context.datasets.get(dep_name.lower(), None))
                                      for dep_name in dependencies]
                if dep is not None
            }
            functionDepDict: Dict[str, str] = {
                dep_name.lower(): get_artifact_id(dep)
                for dep_name, dep in [(
                    dep_name, context.dataobjects.get(dep_name.lower(), None))
                                      for dep_name in functionDeps]
                if dep is not None
            }
            # print("---- SQL DATASETS ----\n{}\n{}".format(context.datasets, dependencies))

            provenance = ModuleProvenance(write={
                ds_name:
                DatasetDescriptor(identifier=ds.identifier,
                                  name=ds_name,
                                  columns=ds.columns)
            },
                                          read={
                                              **dependenciesDict,
                                              **functionDepDict
                                          })
        except Exception as ex:
            provenance = ModuleProvenance()
            outputs.error(ex)
            is_success = False
        # Return execution result
        return ExecResult(is_success=is_success,
                          outputs=outputs,
                          provenance=provenance)
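The read set handed to ModuleProvenance at the end is simply the merge of the dataset and function dependency maps:

dataset_deps = {'sales': 'ID1'}
function_deps = {'tax_fn': 'FN9'}
read = {**dataset_deps, **function_deps}    # later keys win on collision
assert read == {'sales': 'ID1', 'tax_fn': 'FN9'}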
Example #19
import unittest

from vizier.engine.packages.mimir.command import mimir_geocode
from vizier.engine.packages.mimir.command import mimir_key_repair, mimir_missing_key
from vizier.engine.packages.mimir.command import mimir_missing_value, mimir_picker
from vizier.datastore.dataset import DatasetColumn, DatasetDescriptor

import vizier.engine.packages.base as pckg
import vizier.engine.packages.mimir.base as mimir
import vizier.viztrail.command as md

DATASETS = {
    'ds':
    DatasetDescriptor(identifier='0000',
                      name='ds',
                      columns=[
                          DatasetColumn(identifier=2, name='Some Name'),
                          DatasetColumn(identifier=1, name='Street')
                      ])
}
PACKAGE = pckg.PackageIndex(mimir.MIMIR_LENSES)


class TestValidateMimir(unittest.TestCase):
    def test_mimir_geocode(self):
        """Test validation of Mimir geocode lens."""
        cmd = mimir_geocode(dataset_name='ds',
                            geocoder='GOOGLE',
                            street=1,
                            city=2,
                            materialize_input=False,
                            validate=True).to_external_form(
Example #20
    def compute(self, command_id, arguments, context):
        """Compute results for commands in the Mimir package using the set of
        user-provided arguments and the current database state.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        outputs = ModuleOutputs()
        store_as_dataset = None
        update_rows = False
        lens_annotations = []
        # Get dataset. Raise exception if dataset is unknown.
        ds_name = arguments.get_value(pckg.PARA_DATASET).lower()
        dataset = context.get_dataset(ds_name)
        mimir_table_name = dataset.table_name
        # Keep track of the name of the input dataset for the provenance
        # information.
        input_ds_name = ds_name
        if command_id == cmd.MIMIR_DOMAIN:
            column = dataset.column_by_id(arguments.get_value(
                pckg.PARA_COLUMN))
            params = [column.name_in_rdb]
        elif command_id == cmd.MIMIR_GEOCODE:
            geocoder = arguments.get_value(cmd.PARA_GEOCODER)
            params = ['GEOCODER(' + geocoder + ')']
            add_column_parameter(params, 'HOUSE_NUMBER', dataset, arguments,
                                 cmd.PARA_HOUSE_NUMBER)
            add_column_parameter(params, 'STREET', dataset, arguments,
                                 cmd.PARA_STREET)
            add_column_parameter(params, 'CITY', dataset, arguments,
                                 cmd.PARA_CITY)
            add_column_parameter(params, 'STATE', dataset, arguments,
                                 cmd.PARA_STATE)
            # Add columns for LATITUDE and LONGITUDE
            column_counter = dataset.max_column_id() + 1
            cname_lat = dataset.get_unique_name('LATITUDE')
            cname_lon = dataset.get_unique_name('LONGITUDE')
            dataset.columns.append(
                MimirDatasetColumn(identifier=column_counter,
                                   name_in_dataset=cname_lat,
                                   data_type=DATATYPE_REAL))
            dataset.columns.append(
                MimirDatasetColumn(identifier=column_counter + 1,
                                   name_in_dataset=cname_lon,
                                   data_type=DATATYPE_REAL))
            params.append('RESULT_COLUMNS(' + cname_lat + ',' + cname_lon +
                          ')')
        elif command_id == cmd.MIMIR_KEY_REPAIR:
            column = dataset.column_by_id(arguments.get_value(
                pckg.PARA_COLUMN))
            params = [column.name_in_rdb]
            update_rows = True
        elif command_id == cmd.MIMIR_MISSING_KEY:
            column = dataset.column_by_id(arguments.get_value(
                pckg.PARA_COLUMN))
            params = [column.name_in_rdb]
            # Set MISSING ONLY to FALSE to ensure that all rows are returned
            params += ['MISSING_ONLY(FALSE)']
            # Need to run this lens twice in order to generate row ids for
            # any potential new tuple
            mimir_lens_response = mimir.createLens(
                dataset.table_name, params, command_id,
                arguments.get_value(cmd.PARA_MATERIALIZE_INPUT,
                                    default_value=True))
            (mimir_table_name,
             lens_annotations) = (mimir_lens_response.lensName(),
                                  mimir_lens_response.annotations())
            params = [ROW_ID, 'MISSING_ONLY(FALSE)']
            update_rows = True
        elif command_id == cmd.MIMIR_MISSING_VALUE:
            params = list()
            for col in arguments.get_value(cmd.PARA_COLUMNS, default_value=[]):
                f_col = dataset.column_by_id(col.get_value(pckg.PARA_COLUMN))
                param = f_col.name_in_rdb
                col_constraint = col.get_value(cmd.PARA_COLUMNS_CONSTRAINT,
                                               raise_error=False)
                if col_constraint == '':
                    col_constraint = None
                if col_constraint is not None:
                    param = param + ' ' + str(col_constraint).replace(
                        "'", "\'\'").replace("OR", ") OR (")
                param = '\'(' + param + ')\''
                params.append(param)
        elif command_id == cmd.MIMIR_PICKER:
            pick_from = list()
            column_names = list()
            for col in arguments.get_value(cmd.PARA_SCHEMA):
                c_col = col.get_value(cmd.PARA_PICKFROM)
                column = dataset.column_by_id(c_col)
                pick_from.append(column.name_in_rdb)
                column_names.append(column.name.upper().replace(' ', '_'))
            # Add result column to dataset schema
            pick_as = arguments.get_value(cmd.PARA_PICKAS,
                                          default_value='PICK_ONE_' +
                                          '_'.join(column_names))
            pick_as = dataset.get_unique_name(pick_as.strip().upper())
            dataset.columns.append(
                MimirDatasetColumn(identifier=dataset.max_column_id() + 1,
                                   name_in_dataset=pick_as))
            params = ['PICK_FROM(' + ','.join(pick_from) + ')']
            params.append('PICK_AS(' + pick_as + ')')
        elif command_id == cmd.MIMIR_SCHEMA_MATCHING:
            store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
            if store_as_dataset in context.datasets:
                raise ValueError('dataset \'' + store_as_dataset + '\' exists')
            if not is_valid_name(store_as_dataset):
                raise ValueError('invalid dataset name \'' + store_as_dataset +
                                 '\'')
            column_names = list()
            params = ['\'' + ROW_ID + ' int\'']
            for col in arguments.get_value(cmd.PARA_SCHEMA):
                c_name = col.get_value(pckg.PARA_COLUMN)
                c_type = col.get_value(cmd.PARA_TYPE)
                params.append('\'' + c_name + ' ' + c_type + '\'')
                column_names.append(c_name)
        elif command_id == cmd.MIMIR_TYPE_INFERENCE:
            params = [str(arguments.get_value(cmd.PARA_PERCENT_CONFORM))]
        elif command_id == cmd.MIMIR_SHAPE_DETECTOR:
            dseModel = arguments.get_value(cmd.PARA_MODEL_NAME)
            params = []
            if dseModel is not None:
                params = [str(dseModel)]
        elif command_id == cmd.MIMIR_COMMENT:
            params = []
            for comment in arguments.get_value(cmd.PARA_COMMENTS):
                c_expr = comment.get_value(cmd.PARA_EXPRESSION)
                c_cmnt = comment.get_value(cmd.PARA_COMMENT)
                c_rowid = comment.get_value(cmd.PARA_ROWID)
                if c_rowid is None:
                    params.append('COMMENT(' + c_expr + ', \'' + c_cmnt +
                                  '\') ')
                else:
                    params.append('COMMENT(' + c_expr + ', \'' + c_cmnt +
                                  '\', \'' + c_rowid + '\') ')
            result_cols = []
            for col in arguments.get_value(cmd.PARA_RESULT_COLUMNS):
                c_name = col.get_value(pckg.PARA_COLUMN)
                result_cols.append(c_name)
            if len(result_cols) > 0:
                params.append('RESULT_COLUMNS(' + ','.join(result_cols) + ')')
        else:
            raise ValueError('unknown Mimir lens \'' + str(command_id) + '\'')
        # Create Mimir lens
        if command_id in [
                cmd.MIMIR_SCHEMA_MATCHING, cmd.MIMIR_TYPE_INFERENCE,
                cmd.MIMIR_SHAPE_DETECTOR
        ]:
            lens_name = mimir.createAdaptiveSchema(mimir_table_name, params,
                                                   command_id.upper())
        else:
            mimir_lens_response = mimir.createLens(
                mimir_table_name,
                params,
                command_id.upper(),
                arguments.get_value(cmd.PARA_MATERIALIZE_INPUT,
                                    default_value=True),
                human_readable_name=ds_name.upper())
            (lens_name,
             lens_annotations) = (mimir_lens_response['lensName'],
                                  mimir_lens_response['annotations'])
        # Create a view including missing row ids for the result of a
        # MISSING KEY lens
        if command_id == cmd.MIMIR_MISSING_KEY:
            lens_name, row_counter = create_missing_key_view(
                dataset, lens_name, column)
            dataset.row_counter = row_counter
        # Create datastore entry for lens.
        if store_as_dataset is not None:
            columns = list()
            for c_name in column_names:
                col_id = len(columns)
                columns.append(
                    MimirDatasetColumn(identifier=col_id,
                                       name_in_dataset=c_name))
            ds = context.datastore.register_dataset(
                table_name=lens_name,
                columns=columns,
                annotations=dataset.annotations)
            ds_name = store_as_dataset
        else:
            ds = context.datastore.register_dataset(
                table_name=lens_name,
                columns=dataset.columns,
                annotations=dataset.annotations)
        # Add dataset schema and returned annotations to output
        if command_id in [
                cmd.MIMIR_SCHEMA_MATCHING, cmd.MIMIR_TYPE_INFERENCE,
                cmd.MIMIR_SHAPE_DETECTOR
        ]:
            print_dataset_schema(outputs, ds_name, ds.columns)
        else:
            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10)
            outputs.stdout.append(DatasetOutput(ds_output))

        print_lens_annotations(outputs, lens_annotations)
        dsd = DatasetDescriptor(identifier=ds.identifier,
                                columns=ds.columns,
                                row_count=ds.row_count)
        result_resources = dict()
        result_resources[base.RESOURCE_DATASET] = ds.identifier

        # Return task result
        return ExecResult(outputs=outputs,
                          provenance=ModuleProvenance(
                              read={input_ds_name: dataset.identifier},
                              write={ds_name: dsd},
                              resources=result_resources))
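The constraint rewriting inside the MISSING_VALUE branch is easiest to see on its own: single quotes are doubled and OR clauses re-parenthesized before the parameter is wrapped.

col_constraint = '> 0 OR IS NULL'
param = 'AGE' + ' ' + str(col_constraint).replace("'", "''").replace("OR", ") OR (")
param = "'(" + param + ")'"
assert param == "'(AGE > 0 ) OR ( IS NULL)'"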
Example #21
    def compute(self, command_id: str, arguments: "ModuleArguments",
                context: TaskContext) -> ExecResult:
        """Compute results for commands in the sampling package using 
        the set of user-provided arguments and the current database 
        state.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """

        input_ds_name = arguments.get_value(cmd.PARA_INPUT_DATASET).lower()
        input_dataset: DatasetDescriptor = context.get_dataset(input_ds_name)
        if input_dataset is None:
            raise ValueError('unknown dataset \'' + input_ds_name + '\'')

        output_ds_name = arguments.get_value(cmd.PARA_OUTPUT_DATASET,
                                             raise_error=False)
        if output_ds_name is None or output_ds_name == "":
            output_ds_name = input_ds_name + "_SAMPLE"
        output_ds_name = output_ds_name.lower()

        # Load the sampling configuration
        sample_mode = None

        if command_id == cmd.BASIC_SAMPLE:
            sampling_rate = float(arguments.get_value(cmd.PARA_SAMPLING_RATE))
            if sampling_rate > 1.0 or sampling_rate < 0.0:
                raise Exception("Sampling rate must be between 0.0 and 1.0")
            sample_mode = {
                "mode": cmd.SAMPLING_MODE_UNIFORM_PROBABILITY,
                "probability": sampling_rate
            }
        elif command_id == cmd.MANUAL_STRATIFIED_SAMPLE or command_id == cmd.AUTOMATIC_STRATIFIED_SAMPLE:
            column = arguments.get_value(cmd.PARA_STRATIFICATION_COLUMN)
            column_defn = input_dataset.columns[column]
            if command_id == cmd.MANUAL_STRATIFIED_SAMPLE:
                strata = [{
                    "value":
                    stratum.get_value(cmd.PARA_STRATUM_VALUE),
                    "probability":
                    stratum.get_value(cmd.PARA_SAMPLING_RATE)
                } for stratum in arguments.get_value(cmd.PARA_STRATA)]
            else:
                probability = arguments.get_value(cmd.PARA_SAMPLING_RATE)
                strata = self.get_automatic_strata(input_dataset, column_defn,
                                                   probability)
            sample_mode = {
                "mode": cmd.SAMPLING_MODE_STRATIFIED_ON,
                "column": column_defn.name,
                "type": column_defn.data_type,
                "strata": strata
            }
        else:
            raise Exception("Unknown sampling command: {}".format(command_id))

        table_name, schema = mimir.createSample(input_dataset.identifier,
                                                sample_mode,
                                                result_name="SAMPLE_" +
                                                get_unique_identifier())
        ds = MimirDatasetHandle.from_mimir_result(table_name,
                                                  schema,
                                                  properties={},
                                                  name=output_ds_name)

        # And start rendering some output
        outputs = ModuleOutputs()
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10)
        if ds_output is not None:
            ds_output['name'] = output_ds_name
            outputs.stdout.append(DatasetOutput(ds_output))
        else:
            outputs.stderr.append(TextOutput("Error displaying dataset"))

        # Record Reads and writes
        provenance = ModuleProvenance(
            read={input_ds_name: input_dataset.identifier},
            write={
                output_ds_name:
                DatasetDescriptor(identifier=ds.identifier,
                                  name=output_ds_name,
                                  columns=ds.columns)
            })

        # Return task result
        return ExecResult(outputs=outputs, provenance=provenance)
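The two sample_mode configurations assembled above have the following shapes; the mode strings below are placeholders for the cmd.SAMPLING_MODE_* constants, and the values are illustrative.

uniform = {'mode': 'uniform_probability', 'probability': 0.1}
stratified = {
    'mode': 'stratified_on',
    'column': 'state',           # stratification column name
    'type': 'varchar',           # column data type
    'strata': [{'value': 'NY', 'probability': 0.5}]
}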
Example #22
 def test_success(self) -> None:
     """Update module state from pending to success."""
     # Create original module
     module = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         module_folder=MODULE_DIR,
         timestamp=ModuleTimestamp(),
         outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
         provenance=ModuleProvenance(
             read={'DS1': 'ID1'},
             write={'DS1': DatasetDescriptor(identifier='ID2',
                                             name='ID2')}))
     self.assertTrue(module.is_pending)
     module.set_running(external_form='TEST MODULE')
     module.set_success()
     self.assertTrue(module.is_success)
     self.assertIsNotNone(module.timestamp.started_at)
     self.assertIsNotNone(module.timestamp.finished_at)
     self.assertEqual(len(module.outputs.stderr), 0)
     self.assertEqual(len(module.outputs.stdout), 0)
     self.assertTrue(module.provenance.read == {})
     self.assertTrue(module.provenance.write == {})
     # Read module from object store and ensure that all changes have been
     # materialized properly
     module = OSModuleHandle.load_module(identifier=module.identifier,
                                         module_path=module.module_path)
     self.assertTrue(module.is_success)
     self.assertIsNotNone(module.timestamp.started_at)
     self.assertIsNotNone(module.timestamp.finished_at)
     self.assertEqual(len(module.outputs.stderr), 0)
     self.assertEqual(len(module.outputs.stdout), 0)
     self.assertTrue(module.provenance.read == {})
     self.assertTrue(module.provenance.write == {})
     # Set success with all optional parameters
     ts = get_current_time()
     module.set_success(
         finished_at=ts,
         outputs=ModuleOutputs(stdout=[TextOutput('XYZ')]),
         provenance=ModuleProvenance(
             read={'DS1': 'ID1'},
             write={'DS1': DatasetDescriptor(identifier='ID2',
                                             name='ID2')}))
     self.assertTrue(module.is_success)
     self.assertIsNotNone(module.timestamp.started_at)
     self.assertIsNotNone(module.timestamp.finished_at)
     self.assertEqual(module.timestamp.finished_at, ts)
     self.assertEqual(len(module.outputs.stderr), 0)
     self.assertEqual(len(module.outputs.stdout), 1)
     self.assertEqual(module.outputs.stdout[0].value, 'XYZ')
     self.assertIsNotNone(module.provenance.read)
     self.assertEqual(module.provenance.read['DS1'], 'ID1')
     self.assertIsNotNone(module.provenance.write)
     self.assertEqual(module.provenance.write['DS1'].identifier, 'ID2')
     module = OSModuleHandle.load_module(identifier=module.identifier,
                                         module_path=module.module_path)
     module = OSModuleHandle.load_module(identifier=module.identifier,
                                         module_path=module.module_path,
                                         prev_state=dict())
     self.assertTrue(module.is_success)
     self.assertIsNotNone(module.timestamp.started_at)
     self.assertIsNotNone(module.timestamp.finished_at)
     self.assertEqual(module.timestamp.finished_at, ts)
     self.assertEqual(len(module.outputs.stderr), 0)
     self.assertEqual(len(module.outputs.stdout), 1)
     self.assertEqual(module.outputs.stdout[0].value, 'XYZ')
     self.assertIsNotNone(module.provenance.read)
     self.assertEqual(module.provenance.read['DS1'], 'ID1')
     self.assertIsNotNone(module.provenance.write)
     self.assertEqual(module.provenance.write['DS1'].identifier, 'ID2')
Example #23
 def test_requires_exec(self):
     """Test .requires_exec() method for the module provenance object."""
     # Current database state
     datasets = {
         'A': DatasetDescriptor(identifier='123'),
         'B': DatasetDescriptor(identifier='345'),
         'C': DatasetDescriptor(identifier='567')
     }
     # If either the read set or the write set is undefined, the
     # .requires_exec() method should always return True
     self.assertTrue(ModuleProvenance().requires_exec(datasets))
     self.assertTrue(
         ModuleProvenance(read={
             'A': '123'
         }).requires_exec(datasets))
     self.assertTrue(
         ModuleProvenance(write={
             'A': DatasetDescriptor(identifier='789')
         },
                          delete=['A']).requires_exec(datasets))
     # If the module modifies a dataset that it does not read but that does
     # exist, the result is True
     prov = ModuleProvenance(
         read={'A': '123'},
         write={'C': DatasetDescriptor(identifier='567')},
         delete=['A'])
     self.assertTrue(prov.requires_exec(datasets))
     # If the input data has changed the module needs to execute
     prov = ModuleProvenance(
         read={'A': 'abc'},
         write={'A': DatasetDescriptor(identifier='123')})
     self.assertTrue(prov.requires_exec(datasets))
     # No execution needed if all input data is present and in the expected
     # state
     prov = ModuleProvenance(
         read={'A': '123'},
         write={'A': DatasetDescriptor(identifier='abc')},
         delete=['A'])
     self.assertFalse(prov.requires_exec(datasets))
     prov = ModuleProvenance(
         read={
             'B': '345',
             'C': '567'
         },
         write={'B': DatasetDescriptor(identifier='abc')})
     self.assertFalse(prov.requires_exec(datasets))
     prov = ModuleProvenance(read={'B': '345', 'C': '567'}, write={})
     self.assertFalse(prov.requires_exec(datasets))
     # Re-execute only if a dataset is being deleted that does not exist in
     # the current state (the first case below deletes nothing)
     prov = ModuleProvenance(
         read={
             'B': '345',
             'C': '567'
         },
         write={'B': DatasetDescriptor(identifier='345')})
     self.assertFalse(prov.requires_exec(datasets))
     prov = ModuleProvenance(
         read={
             'B': '345',
             'C': '567'
         },
         write={'B': DatasetDescriptor(identifier='345')},
         delete=['D'])
     self.assertTrue(prov.requires_exec(datasets))
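Taken together, the assertions above pin down the caching rule. Below is a minimal sketch of that rule, assuming only the behavior exercised in this test; the real ModuleProvenance.requires_exec() may differ in detail:

def needs_rerun(prov, datasets):
    """Sketch of the re-execution check exercised by the assertions above."""
    # Unknown read or write set: always re-execute.
    if prov.read is None or prov.write is None:
        return True
    # A read input is missing or has a different identifier: re-execute.
    for name, identifier in prov.read.items():
        if name not in datasets or datasets[name].identifier != identifier:
            return True
    # Deleting a dataset that does not exist: re-execute.
    for name in (prov.delete or []):
        if name not in datasets:
            return True
    # Overwriting an existing dataset that was not read: re-execute.
    for name in prov.write:
        if name in datasets and name not in prov.read:
            return True
    return False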
Example #24
    def compute_load_dataset(self, args, context):
        """Execute load dataset command.

        Parameters
        ----------
        args: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        # Get the new dataset name. Raise exception if a dataset with the
        # specified name already exists.
        ds_name = args.get_value(pckg.PARA_NAME).lower()
        if ds_name in context.datasets:
            raise ValueError('dataset \'' + ds_name + '\' exists')
        if not is_valid_name(ds_name):
            raise ValueError('invalid dataset name \'' + ds_name + '\'')
        # Get components of the load source. Raise exception if the source
        # descriptor is invalid.
        source_desc = args.get_value(cmd.PARA_FILE)
        file_id = None
        url = None
        if pckg.FILE_ID in source_desc and source_desc[
                pckg.FILE_ID] is not None:
            file_id = source_desc[pckg.FILE_ID]
        elif pckg.FILE_URL in source_desc and source_desc[
                pckg.FILE_URL] is not None:
            url = source_desc[pckg.FILE_URL]
        else:
            raise ValueError('invalid source descriptor')
        username = source_desc[
            pckg.FILE_USERNAME] if pckg.FILE_USERNAME in source_desc else None
        password = source_desc[
            pckg.FILE_PASSWORD] if pckg.FILE_PASSWORD in source_desc else None
        reload = source_desc[
            pckg.FILE_RELOAD] if pckg.FILE_RELOAD in source_desc else False
        load_format = args.get_value(cmd.PARA_LOAD_FORMAT)
        detect_headers = args.get_value(cmd.PARA_DETECT_HEADERS,
                                        raise_error=False,
                                        default_value=True)
        infer_types = args.get_value(cmd.PARA_INFER_TYPES,
                                     raise_error=False,
                                     default_value=True)
        options = args.get_value(cmd.PARA_LOAD_OPTIONS, raise_error=False)
        m_opts = []
        if args.get_value(cmd.PARA_LOAD_DSE,
                          raise_error=False,
                          default_value=False):
            m_opts.append({'name': 'datasourceErrors', 'value': 'true'})
        if options is not None:
            for option in options:
                load_opt_key = option.get_value(cmd.PARA_LOAD_OPTION_KEY)
                load_opt_val = option.get_value(cmd.PARA_LOAD_OPTION_VALUE)
                m_opts.append({'name': load_opt_key, 'value': load_opt_val})
        # Execute load command.
        result = self.api.load_dataset(datastore=context.datastore,
                                       filestore=context.filestore,
                                       file_id=file_id,
                                       url=url,
                                       detect_headers=detect_headers,
                                       infer_types=infer_types,
                                       load_format=load_format,
                                       options=m_opts,
                                       username=username,
                                       password=password,
                                       resources=context.resources,
                                       reload=reload,
                                       human_readable_name=ds_name.upper())
        # Delete the uploaded file (if the load was from a file). A reference
        # to the created dataset is in the resources and will be used if the
        # module is re-executed.
        #if not file_id is None:
        #    context.filestore.delete_file(file_id)
        ds = DatasetDescriptor(identifier=result.dataset.identifier,
                               columns=result.dataset.columns,
                               row_count=result.dataset.row_count)
        ds_output = server.api.datasets.get_dataset(
            project_id=context.project_id,
            dataset_id=ds.identifier,
            offset=0,
            limit=10)
        outputs = ModuleOutputs()
        if ds_output is not None:
            ds_output['name'] = ds_name
            outputs.stdout.append(DatasetOutput(ds_output))
        else:
            outputs.stderr.append(TextOutput("Error displaying dataset"))
        return ExecResult(
            outputs=outputs,
            provenance=ModuleProvenance(
                read=dict(
                ),  # need to explicitly declare a lack of dependencies
                write={ds_name: ds},
                resources=result.resources))
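The read=dict() above is deliberate. Under the requires_exec() semantics exercised in the earlier test, a provenance object with no recorded read set always forces re-execution, whereas an explicitly empty one declares the module dependency-free and therefore cacheable. A small sketch of the distinction, assuming the same classes used throughout these examples and made-up identifiers:

# Unset vs. explicitly empty read sets (identifiers are made up).
state = {'A': DatasetDescriptor(identifier='123')}
unknown = ModuleProvenance()           # read/write never recorded
explicit = ModuleProvenance(
    read={},                           # declared: reads nothing
    write={'out': DatasetDescriptor(identifier='9')})
assert unknown.requires_exec(state)        # must always re-run
assert not explicit.requires_exec(state)   # safe to reuse cached result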
Example #25
    def load_module(identifier,
                    module_path,
                    prev_state=None,
                    object_store=None):
        """Load module from given object store.

        Parameters
        ----------
        identifier: string
            Unique module identifier
        module_path: string
            Resource path for module object
        prev_state: dict(string: vizier.datastore.dataset.DatasetDescriptor)
            Dataset descriptors keyed by the user-provided name that exist in
            the database state of the previous module (in sequence of
            occurrence in the workflow)
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources

        Returns
        -------
        vizier.viztrail.objectstore.module.OSModuleHandle
        """
        # Make sure the object store is not None
        if object_store is None:
            object_store = DefaultObjectStore()
        # Read object from store. This may raise a ValueError to indicate that
        # the module does not exist (a system error condition). In this case
        # we return a new module that is in an error state.
        try:
            obj = object_store.read_object(object_path=module_path)
        except ValueError:
            return OSModuleHandle(
                identifier=identifier,
                command=ModuleCommand(package_id=UNKNOWN_ID,
                                      command_id=UNKNOWN_ID),
                external_form='fatal error: object not found',
                module_path=module_path,
                state=mstate.MODULE_ERROR,
                object_store=object_store)
        # Create module command
        command = ModuleCommand(package_id=obj[KEY_COMMAND][KEY_PACKAGE_ID],
                                command_id=obj[KEY_COMMAND][KEY_COMMAND_ID],
                                arguments=obj[KEY_COMMAND][KEY_ARGUMENTS])
        # Create module timestamps
        created_at = to_datetime(obj[KEY_TIMESTAMP][KEY_CREATED_AT])
        if KEY_STARTED_AT in obj[KEY_TIMESTAMP]:
            started_at = to_datetime(obj[KEY_TIMESTAMP][KEY_STARTED_AT])
        else:
            started_at = None
        if KEY_FINISHED_AT in obj[KEY_TIMESTAMP]:
            finished_at = to_datetime(obj[KEY_TIMESTAMP][KEY_FINISHED_AT])
        else:
            finished_at = None
        timestamp = ModuleTimestamp(created_at=created_at,
                                    started_at=started_at,
                                    finished_at=finished_at)
        # Create module output streams.
        outputs = ModuleOutputs(
            stdout=get_output_stream(obj[KEY_OUTPUTS][KEY_STDOUT]),
            stderr=get_output_stream(obj[KEY_OUTPUTS][KEY_STDERR]))
        # Create module provenance information
        read_prov = None
        if KEY_PROVENANCE_READ in obj[KEY_PROVENANCE]:
            read_prov = dict()
            for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_READ]:
                read_prov[ds[KEY_DATASET_NAME]] = ds[KEY_DATASET_ID]
        write_prov = None
        if KEY_PROVENANCE_WRITE in obj[KEY_PROVENANCE]:
            write_prov = dict()
            for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_WRITE]:
                descriptor = DatasetDescriptor(
                    identifier=ds[KEY_DATASET_ID],
                    columns=[
                        DatasetColumn(identifier=col[KEY_COLUMN_ID],
                                      name=col[KEY_COLUMN_NAME],
                                      data_type=col[KEY_COLUMN_TYPE])
                        for col in ds[KEY_DATASET_COLUMNS]
                    ],
                    row_count=ds[KEY_DATASET_ROWCOUNT])
                write_prov[ds[KEY_DATASET_NAME]] = descriptor
        delete_prov = None
        if KEY_PROVENANCE_DELETE in obj[KEY_PROVENANCE]:
            delete_prov = obj[KEY_PROVENANCE][KEY_PROVENANCE_DELETE]
        res_prov = None
        if KEY_PROVENANCE_RESOURCES in obj[KEY_PROVENANCE]:
            res_prov = obj[KEY_PROVENANCE][KEY_PROVENANCE_RESOURCES]
        charts_prov = None
        if KEY_PROVENANCE_CHARTS in obj[KEY_PROVENANCE]:
            charts_prov = [
                ChartViewHandle.from_dict(c)
                for c in obj[KEY_PROVENANCE][KEY_PROVENANCE_CHARTS]
            ]
        provenance = ModuleProvenance(read=read_prov,
                                      write=write_prov,
                                      delete=delete_prov,
                                      resources=res_prov,
                                      charts=charts_prov)
        # Create dictionary of dataset descriptors only if previous state is
        # given and the module is in SUCCESS state. Otherwise, the database
        # state is empty.
        if obj[KEY_STATE] == mstate.MODULE_SUCCESS and prev_state is not None:
            datasets = provenance.get_database_state(prev_state)
        else:
            datasets = dict()
        # Return module handle
        return OSModuleHandle(identifier=identifier,
                              command=command,
                              external_form=obj[KEY_EXTERNAL_FORM],
                              module_path=module_path,
                              state=obj[KEY_STATE],
                              timestamp=timestamp,
                              datasets=datasets,
                              outputs=outputs,
                              provenance=provenance,
                              object_store=object_store)
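A minimal usage sketch of the loader; the import paths follow the docstrings in these examples, while the identifier, path, and dataset literals are hypothetical:

from vizier.datastore.dataset import DatasetDescriptor
from vizier.viztrail.objectstore.module import OSModuleHandle

# Load a stored module, threading through the database state left behind by
# the previous module so that `datasets` can be reconstructed on success.
module = OSModuleHandle.load_module(
    identifier='MOD0',                              # hypothetical
    module_path='viztrails/ABC/modules/MOD0.json',  # hypothetical
    prev_state={'DS1': DatasetDescriptor(identifier='ID1')})
if module.is_success:
    print(sorted(module.datasets))  # provenance applied to prev_state
else:
    print(module.state, module.external_form)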
Example #26
    def compute(self, command_id, arguments, context):
        """Compute results for commands in the Mimir package using the set of
        user-provided arguments and the current database state.

        Parameters
        ----------
        command_id: string
            Unique identifier for a command in a package declaration
        arguments: vizier.viztrail.command.ModuleArguments
            User-provided command arguments
        context: vizier.engine.task.base.TaskContext
            Context in which a task is being executed

        Returns
        -------
        vizier.engine.task.processor.ExecResult
        """
        outputs = ModuleOutputs()
        # Get dataset. Raise exception if dataset is unknown.
        ds_name = arguments.get_value(pckg.PARA_DATASET).lower()
        dataset = context.get_dataset(ds_name)
        mimir_table_name = dataset.identifier
        # Keep track of the name of the input dataset for the provenance
        # information.
        input_ds_name = ds_name
        if command_id == cmd.MIMIR_GEOCODE:
            geocoder = arguments.get_value(cmd.PARA_GEOCODER)
            # Add columns for LATITUDE and LONGITUDE
            column_counter = dataset.max_column_id() + 1
            cname_lat = dataset.get_unique_name('LATITUDE')
            cname_lon = dataset.get_unique_name('LONGITUDE')
            dataset.columns.append(
                MimirDatasetColumn(
                    identifier=column_counter,
                    name_in_dataset=cname_lat,
                    data_type=DATATYPE_REAL
                )
            )
            dataset.columns.append(
                MimirDatasetColumn(
                    identifier=column_counter + 1,
                    name_in_dataset=cname_lon,
                    data_type=DATATYPE_REAL
                )
            )
            house = arguments.get_value(cmd.PARA_HOUSE_NUMBER, raise_error=False, default_value=None)
            street = arguments.get_value(cmd.PARA_STREET, raise_error=False, default_value=None)
            city = arguments.get_value(cmd.PARA_CITY, raise_error=False, default_value=None)
            state = arguments.get_value(cmd.PARA_STATE, raise_error=False, default_value=None)

            params = {
                'houseColumn': dataset.column_by_id(house).name_in_rdb   if house  is not None and house  != '' else None,
                'streetColumn': dataset.column_by_id(street).name_in_rdb if street is not None and street != '' else None,
                'cityColumn': dataset.column_by_id(city).name_in_rdb     if city   is not None and city   != '' else None,
                'stateColumn': dataset.column_by_id(state).name_in_rdb   if state  is not None and state  != '' else None,
                'geocoder': geocoder#,
                #'latitudeColumn': Option[String],
                #'longitudeColumn': Option[String],
                #'cacheCode': Option[String]
            }
        elif command_id == cmd.MIMIR_KEY_REPAIR:
            column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
            params = { "key" : column.name_in_rdb }
        elif command_id == cmd.MIMIR_MISSING_KEY:
            column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
            params = column.name_in_rdb
            # Set MISSING ONLY to FALSE to ensure that all rows are returned
            #params += ['MISSING_ONLY(FALSE)']
            # Need to run this lens twice in order to generate row ids for
            # any potential new tuple
        elif command_id == cmd.MIMIR_MISSING_VALUE:
            params = list()
            for col in arguments.get_value(cmd.PARA_COLUMNS, default_value=[]):
                f_col = dataset.column_by_id(col.get_value(pckg.PARA_COLUMN))
                param = f_col.name_in_rdb
                col_constraint = col.get_value(
                    cmd.PARA_COLUMNS_CONSTRAINT,
                    raise_error=False
                )
                if col_constraint == '':
                    col_constraint = None
                #if not col_constraint is None:
                #    param = param + ' ' + str(col_constraint).replace("'", "\'\'").replace("OR", ") OR (")
                #param = '\'(' + param + ')\''
                params.append(param)
        elif command_id == cmd.MIMIR_PICKER:
            # Compute the input columns
            inputs = []
            for col in arguments.get_value(cmd.PARA_SCHEMA):
                c_col = col.get_value(cmd.PARA_PICKFROM)
                column = dataset.column_by_id(c_col)
                inputs.append(column.name_in_rdb)

            # Compute the output column
            output = arguments.get_value(cmd.PARA_PICKAS, default_value = inputs[0])
            if output == "":
                output = inputs[0]
            else:
                output = dataset.get_unique_name(output.strip().upper())

            # Compute the final parameter list
            params = {
                "inputs" : inputs,
                "output" : output
            }
        elif command_id == cmd.MIMIR_TYPE_INFERENCE:
            params = [str(arguments.get_value(cmd.PARA_PERCENT_CONFORM))]
        elif command_id == cmd.MIMIR_SHAPE_DETECTOR:
            dseModel = arguments.get_value(cmd.PARA_MODEL_NAME)
            params = []
            if dseModel is not None:
                params = [str(dseModel)]
        elif command_id == cmd.MIMIR_COMMENT:
            commentsParams = []
            for idx, comment in enumerate(arguments.get_value(cmd.PARA_COMMENTS)):
                commentParam = {}
                
                # If target is defined, it is the column that we're trying to annotate
                # If unset (or empty), it means we're annotating the row.
                column_id = comment.get_value(cmd.PARA_EXPRESSION,
                                              raise_error=False,
                                              default_value=None)

                if column_id is not None:
                    column = dataset.column_by_id(column_id)
                    commentParam['target'] = column.name_in_rdb

                # The comment
                commentParam['comment'] = comment.get_value(cmd.PARA_COMMENT)

                # If rowid is defined, it is the row that we're trying to annotate.  
                # If unset (or empty), it means that we're annotating all rows
                rowid = comment.get_value(cmd.PARA_ROWID,
                                          raise_error=False,
                                          default_value=None)
                if (rowid is not None) and (rowid != ""):
                    # If rowid begins with '=', it's a formula
                    if rowid[0] == '=':
                        commentParam['condition'] = rowid[1:]
                    else:
                        commentParam['rows'] = [ int(rowid) ]
                
                #TODO: handle result columns
                commentsParams.append(commentParam)
            params = {'comments' : commentsParams}
        elif command_id == cmd.MIMIR_PIVOT:
            column = dataset.column_by_id(arguments.get_value(pckg.PARA_COLUMN))
            params = {
                "target" : column.name_in_rdb,
                "keys" : [],
                "values" : []
            }
            for col_arg in arguments.get_value(cmd.PARA_VALUES):
                col = dataset.column_by_id(col_arg.get_value(cmd.PARA_VALUE))
                params["values"].append(col.name_in_rdb)
            for col_arg in arguments.get_value(cmd.PARA_KEYS, default_value=[]):
                col = dataset.column_by_id(col_arg.get_value(cmd.PARA_KEY))
                params["keys"].append(col.name_in_rdb)
            if len(params["values"]) < 1:
                raise ValueError("Need at least one value column")
            # store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
        elif command_id == cmd.MIMIR_SHRED:
            params = { 
                "keepOriginalColumns" : arguments.get_value(cmd.PARA_KEEP_ORIGINAL)
            }
            shreds = []
            global_input_col = dataset.column_by_id(arguments.get_value(cmd.PARA_COLUMN_NAME))
            for idx, shred in enumerate(arguments.get_value(cmd.PARA_COLUMNS)):
                output_col = shred.get_value(cmd.PARA_OUTPUT_COLUMN)
                if output_col is None:
                    # Derive a default name from the input column's name in
                    # the database, not the column object's repr.
                    output_col = "{}_{}".format(
                        global_input_col.name_in_rdb, idx)
                config = {}
                shred_type = shred.get_value(cmd.PARA_TYPE)
                expression = shred.get_value(cmd.PARA_EXPRESSION)
                group = shred.get_value(cmd.PARA_INDEX)
                if shred_type == "pattern":
                    config["regexp"] = expression
                    config["group"] = int(group)
                elif shred_type == "field":
                    config["separator"] = expression
                    config["field"] = int(group)
                elif shred_type == "explode":
                    config["separator"] = expression
                elif shred_type == "pass":
                    pass
                elif shred_type == "substring":
                    range_parts = re.match("([0-9]+)(([+\\-])([0-9]+))?", expression)
                    # print(range_parts)

                    # Mimir expects ranges to be given from start (inclusive) to end (exclusive)
                    # in a zero-based numbering scheme.

                    # Vizier expects input ranges to be given in a one-based numbering scheme.

                    # Convert to this format

                    if range_parts is None:
                        raise ValueError("Substring requires a range of the form '10', '10-11', or '10+1', but got '{}'".format(expression))
                    config["start"] = int(range_parts.group(1))-1 # Convert 1-based numbering to 0-based
                    if range_parts.group(2) is None:
                        config["end"] = config["start"] + 1 # if only one character, split one character
                    elif range_parts.group(3) == "+":
                        config["end"] = config["start"] + int(range_parts.group(4)) # start + length
                    elif range_parts.group(3) == "-":
                        config["end"] = int(range_parts.group(4)) # Explicit end, 1-based -> 0-based and exclusive cancel out
                    else:
                        raise ValueError("Invalid expression '{}' in substring shredder".format(expression))
                    # print("Shredding {} <- {} -- {}".format(output_col,config["start"],config["end"]))
                else:
                    raise ValueError("Invalid Shredding Type '{}'".format(shred_type))

                shreds.append({
                    **config,
                    "op" : shred_type,
                    "input" : global_input_col.name_in_rdb,
                    "output" : output_col,
                })
            params["shreds"] = shreds
            # store_as_dataset = arguments.get_value(cmd.PARA_RESULT_DATASET)
        else:
            raise ValueError("Unknown Mimir lens '{}'".format(command_id))
        # Create Mimir lens
        mimir_lens_response = mimir.createLens(
            mimir_table_name,
            params,
            command_id,
            arguments.get_value(cmd.PARA_MATERIALIZE_INPUT, default_value=True),
            human_readable_name = ds_name.upper()
        )
        lens_name = mimir_lens_response['name']
        lens_schema = mimir_lens_response['schema']
        lens_properties = mimir_lens_response['properties']

        ds = MimirDatasetHandle.from_mimir_result(lens_name, lens_schema, lens_properties, ds_name)

        if command_id in LENSES_THAT_SHOULD_NOT_DISPLAY_TABLES:
            print_dataset_schema(outputs, ds_name, ds.columns)
        else:
            from vizier.api.webservice import server
            ds_output = server.api.datasets.get_dataset(
                project_id=context.project_id,
                dataset_id=ds.identifier,
                offset=0,
                limit=10
            )
            outputs.stdout.append(DatasetOutput(ds_output))
        
        # Return task result
        return ExecResult(
            outputs=outputs,
            provenance=ModuleProvenance(
                read={input_ds_name: dataset.identifier},
                write={ds_name: DatasetDescriptor(
                    identifier=ds.identifier,
                    name=ds_name,
                    columns=ds.columns
                )}
            )
        )
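The substring branch above converts Vizier's one-based, inclusive ranges into the zero-based, end-exclusive ranges Mimir expects. A standalone sketch of that conversion, with the three accepted forms worked through:

import re

def parse_substring_range(expression):
    # Accepts '10' (one character), '10+5' (start plus length), and '10-15'
    # (explicit end); returns a zero-based, end-exclusive (start, end) pair.
    parts = re.match("([0-9]+)(([+\\-])([0-9]+))?", expression)
    if parts is None:
        raise ValueError("bad range '{}'".format(expression))
    start = int(parts.group(1)) - 1            # one-based -> zero-based
    if parts.group(2) is None:
        return (start, start + 1)              # single character
    if parts.group(3) == "+":
        return (start, start + int(parts.group(4)))
    # Inclusive one-based end equals exclusive zero-based end.
    return (start, int(parts.group(4)))

assert parse_substring_range("10") == (9, 10)
assert parse_substring_range("10+5") == (9, 14)
assert parse_substring_range("10-15") == (9, 15)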
Example #27
    def create_dataset(
            self,
            columns: List[DatasetColumn],
            rows: List[DatasetRow],
            properties: Optional[Dict[str, Any]] = None,
            human_readable_name: str = "Untitled Dataset",
            backend_options: Optional[List[Tuple[str, str]]] = None,
            dependencies: Optional[List[str]] = None) -> DatasetDescriptor:
        """Create a new dataset in the datastore. Expects at least the list of
        columns and the rows for the dataset.

        Raises ValueError if (1) the column identifiers are not unique, (2) the
        row identifiers are not unique, (3) the number of columns and values in
        a row do not match, (4) any column or row identifier has a negative
        value, or (5) the given column or row counter is lower than or equal to
        any column or row identifier.

        Parameters
        ----------
        columns: list(vizier.datastore.dataset.DatasetColumn)
            List of columns. It is expected that each column has a unique
            identifier.
        rows: list(vizier.datastore.dataset.DatasetRow)
            List of dataset rows.
        properties: dict(string, ANY), optional
            Properties for dataset components

        Returns
        -------
        vizier.datastore.dataset.DatasetDescriptor
        """
        # Validate (i) that each column has a unique identifier, (ii) each row
        # has a unique identifier, and (iii) that every row has exactly one
        # value per column.
        properties = {} if properties is None else properties
        dependencies = [] if dependencies is None else dependencies
        identifiers = set(
            int(row.identifier) for row in rows
            if row.identifier is not None and int(row.identifier) >= 0)
        identifiers.add(0)
        max_row_id = max(identifiers)
        # Assign fresh identifiers (strictly above the current maximum) to
        # rows that have none; existing non-negative identifiers are kept.
        rows = [
            DatasetRow(identifier=row.identifier if row.identifier is not None
                       and int(row.identifier) >= 0
                       else str(idx + max_row_id + 1),
                       values=row.values,
                       caveats=row.caveats) for idx, row in enumerate(rows)
        ]
        _, max_row_id = validate_dataset(columns=columns, rows=rows)
        # Get new identifier and create directory for new dataset
        identifier = get_unique_identifier()
        dataset_dir = self.get_dataset_dir(identifier)
        os.makedirs(dataset_dir)
        # Write rows to data file
        data_file = os.path.join(dataset_dir, DATA_FILE)
        DefaultJsonDatasetReader(data_file).write(rows)
        # Create dataset and write the dataset file
        dataset = FileSystemDatasetHandle(identifier=identifier,
                                          columns=columns,
                                          data_file=data_file,
                                          row_count=len(rows),
                                          max_row_id=max_row_id,
                                          properties=properties)
        dataset.to_file(
            descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
        # Write metadata file if any properties are given
        if properties:
            dataset.write_properties_to_file(
                self.get_properties_filename(identifier))
        # Return handle for new dataset
        return DatasetDescriptor(identifier=dataset.identifier,
                                 name=human_readable_name,
                                 columns=dataset.columns)
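A minimal usage sketch under the validation rules above (unique, non-negative column and row identifiers; one value per column). The datastore instance and all literal values are assumptions:

from vizier.datastore.dataset import DatasetColumn, DatasetRow

columns = [DatasetColumn(identifier=0, name='NAME'),
           DatasetColumn(identifier=1, name='AGE')]
rows = [DatasetRow(identifier='0', values=['Alice', 23]),
        DatasetRow(identifier='1', values=['Bob', 32])]
# `datastore` is assumed to be a configured instance of the class above.
ds = datastore.create_dataset(columns=columns,
                              rows=rows,
                              human_readable_name='people')
print(ds.identifier, [col.name for col in ds.columns])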
Example #28
 def test_error(self):
     """Update module state from pending to error."""
     # Create original module
     module = OSModuleHandle.create_module(
         command=python_cell(source='print 2+2'),
         external_form='TEST MODULE',
         state=MODULE_PENDING,
         module_folder=MODULE_DIR,
         outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
         provenance=ModuleProvenance(
             read={'DS1': 'ID1'},
             write={'DS1': DatasetDescriptor(identifier='ID2', name='ID2')},
             resources={'fileid': '0123456789'}),
         timestamp=ModuleTimestamp())
     module.set_error()
     self.assertTrue(module.is_error)
     self.assertIsNotNone(module.timestamp.finished_at)
     self.assertEqual(len(module.outputs.stderr), 0)
     self.assertEqual(len(module.outputs.stdout), 0)
     self.assertIsNotNone(module.provenance.read)
     self.assertIsNotNone(module.provenance.write)
     self.assertIsNotNone(module.provenance.resources)
     self.assertEqual(module.provenance.resources['fileid'], '0123456789')
     # Read module from object store and ensure that all changes have been
     # materialized properly
     module = OSModuleHandle.load_module(identifier=module.identifier,
                                         module_path=module.module_path)
     self.assertTrue(module.is_error)
     self.assertIsNotNone(module.timestamp.finished_at)
     self.assertEqual(len(module.outputs.stderr), 0)
     self.assertEqual(len(module.outputs.stdout), 0)
     self.assertIsNotNone(module.provenance.read)
     self.assertIsNotNone(module.provenance.write)
     self.assertIsNotNone(module.provenance.resources)
     self.assertEqual(module.provenance.resources['fileid'], '0123456789')
     # Set error state with timestamp and output information
     ts = get_current_time()
     module.set_error(
         finished_at=ts,
         outputs=ModuleOutputs(stderr=[TextOutput('Some Error')]))
     self.assertTrue(module.is_error)
     self.assertIsNotNone(module.timestamp.finished_at)
     self.assertEqual(module.timestamp.finished_at, ts)
     self.assertEqual(len(module.outputs.stderr), 1)
     self.assertEqual(module.outputs.stderr[0].value, 'Some Error')
     self.assertEqual(len(module.outputs.stdout), 0)
     self.assertIsNotNone(module.provenance.read)
     self.assertIsNotNone(module.provenance.write)
     self.assertIsNotNone(module.provenance.resources)
     self.assertEqual(module.provenance.resources['fileid'], '0123456789')
     module = OSModuleHandle.load_module(identifier=module.identifier,
                                         module_path=module.module_path)
     self.assertTrue(module.is_error)
     self.assertIsNotNone(module.timestamp.finished_at)
     self.assertEqual(module.timestamp.finished_at, ts)
     self.assertEqual(len(module.outputs.stderr), 1)
     self.assertEqual(module.outputs.stderr[0].value, 'Some Error')
     self.assertEqual(len(module.outputs.stdout), 0)
     self.assertIsNotNone(module.provenance.read)
     self.assertIsNotNone(module.provenance.write)
     self.assertIsNotNone(module.provenance.resources)
     self.assertEqual(module.provenance.resources['fileid'], '0123456789')