def test_deduplicate_annotations(self):
     """Test removing duplicated annotations."""
     store = FileSystemDatastore(STORE_DIR)
     ds = store.create_dataset(
         columns=[
             DatasetColumn(identifier=0, name='A'),
             DatasetColumn(identifier=1, name='B')
         ],
         rows=[DatasetRow(identifier=0, values=['a', 'b'])],
         annotations=DatasetMetadata(
             cells=[
                 DatasetAnnotation(column_id=0, row_id=0, key='X', value=1),
                 DatasetAnnotation(column_id=0, row_id=0, key='X', value=2),
                 DatasetAnnotation(column_id=1, row_id=0, key='X', value=3),
                 DatasetAnnotation(column_id=1, row_id=1, key='X', value=3),
                 DatasetAnnotation(column_id=0, row_id=0, key='Y', value=1),
                 DatasetAnnotation(column_id=0, row_id=0, key='X', value=1),
                 DatasetAnnotation(column_id=0, row_id=0, key='X', value=2),
                 DatasetAnnotation(column_id=1, row_id=0, key='X', value=3),
                 DatasetAnnotation(column_id=1, row_id=1, key='X', value=3),
             ],
             columns=[
                 DatasetAnnotation(column_id=0, key='A', value='x'),
                 DatasetAnnotation(column_id=1, key='A', value='x'),
                 DatasetAnnotation(column_id=0, key='A', value='x'),
                 DatasetAnnotation(column_id=1, key='A', value='x'),
                 DatasetAnnotation(column_id=0, key='A', value='x'),
                 DatasetAnnotation(column_id=1, key='A', value='x'),
                 DatasetAnnotation(column_id=0, key='A', value='x'),
                 DatasetAnnotation(column_id=1, key='A', value='x')
                 ],
             rows=[
                 DatasetAnnotation(row_id=0, key='E', value=100),
                 DatasetAnnotation(row_id=0, key='E', value=100)
             ]
         )
     )
     ds = store.get_dataset(ds.identifier)
     self.assertEqual(len(ds.annotations.cells), 4)
     self.assertEqual(len(ds.annotations.columns), 2)
     self.assertEqual(len(ds.annotations.rows), 1)
     annos = ds.annotations.for_cell(column_id=0, row_id=0)
     self.assertEqual(len(annos), 3)
     self.assertTrue(1 in [a.value for a in annos])
     self.assertTrue(2 in [a.value for a in annos])
     self.assertFalse(3 in [a.value for a in annos])
     self.assertEqual(len(ds.annotations.find_all(values=annos, key='X')), 2)
     with self.assertRaises(ValueError):
         ds.annotations.find_one(values=annos, key='X')
     self.assertEqual(len(ds.annotations.for_column(column_id=0)), 1)
     self.assertEqual(len(ds.annotations.for_row(row_id=0)), 1)
     annotations = ds.annotations.filter(columns=[1])
     self.assertEqual(len(annotations.cells), 1)
     self.assertEqual(len(annotations.columns), 1)
     self.assertEqual(len(annotations.rows), 1)
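
The expected counts above encode two rules: annotations that agree on (column_id, row_id, key, value) collapse to a single entry, and cell annotations referencing a row that does not exist in the dataset (row_id=1 here) are dropped. A minimal sketch of those rules, assuming annotations compare by their full identity tuple (not the store's actual code):

def deduplicate(annotations, valid_row_ids):
    # Keep the first occurrence of each (column_id, row_id, key, value)
    # tuple; drop annotations that point at a missing row.
    seen = set()
    result = []
    for a in annotations:
        identity = (a.column_id, a.row_id, a.key, a.value)
        if identity in seen:
            continue
        if a.row_id is not None and a.row_id not in valid_row_ids:
            continue
        seen.add(identity)
        result.append(a)
    return result
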
 def test_query_annotations(self):
     """Test retrieving annotations via the datastore."""
     store = FileSystemDatastore(STORE_DIR)
     ds = store.create_dataset(
         columns=[
             DatasetColumn(identifier=0, name='A'),
             DatasetColumn(identifier=1, name='B')
         ],
         rows=[DatasetRow(identifier=0, values=['a', 'b'])],
         properties=EXAMPLE_PROPERTIES
     )
     properties = store.get_properties(ds.identifier)
     self.assertEqual(len(properties["columns"]), 2)
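
EXAMPLE_PROPERTIES is a module-level constant not shown in this excerpt. Judging from the assertions here and in test_properties further below, a dictionary of the following shape would satisfy them (an assumed stand-in, not the original definition):

EXAMPLE_PROPERTIES = {
    'columns': [
        {'name': 'A'},  # one property dict per column; any other keys omitted
        {'name': 'B'},
    ]
}
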
Example #3
 def test_unique_name(self):
     """Test method that computes unique column names."""
     ds = DatasetDescriptor(identifier='0',
                            columns=[
                                DatasetColumn(identifier=0, name='ABC'),
                                DatasetColumn(identifier=1, name='A'),
                                DatasetColumn(identifier=2, name='ABC_1'),
                                DatasetColumn(identifier=3, name='DEF'),
                                DatasetColumn(identifier=4, name='xyz'),
                            ])
     self.assertEqual(ds.get_unique_name('Age'), 'Age')
     self.assertEqual(ds.get_unique_name('XYZ'), 'XYZ_1')
     self.assertEqual(ds.get_unique_name('xyz'), 'xyz_1')
     self.assertEqual(ds.get_unique_name('ABC'), 'ABC_2')
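
The assertions imply a case-insensitive suffixing rule that also skips suffixes already taken in the schema (ABC becomes ABC_2 because ABC_1 exists). A minimal sketch of that rule, assumed rather than taken from the library:

def get_unique_name(self, name):
    # Case-insensitive uniqueness check against the current schema.
    names = {col.name.lower() for col in self.columns}
    if name.lower() not in names:
        return name
    index = 1
    while ('%s_%d' % (name, index)).lower() in names:
        index += 1
    return '%s_%d' % (name, index)
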
Example #4
    def from_file(descriptor_file, data_file, annotations=None):
        """Read dataset descriptor from file and return a new instance of the
        dataset handle.

        Parameters
        ----------
        descriptor_file: string
            Path to the file containing the dataset descriptor
        data_file: string
            Path to the file that contains the dataset rows.
        annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional
            Annotations for dataset components

        Returns
        -------
        vizier.datastore.fs.dataset.FileSystemDatasetHandle
        """
        with open(descriptor_file, 'r') as f:
            doc = json.loads(f.read())
        return FileSystemDatasetHandle(identifier=doc[KEY_IDENTIFIER],
                                       columns=[
                                           DatasetColumn(
                                               identifier=col[KEY_COLUMN_ID],
                                               name=col[KEY_COLUMN_NAME],
                                               data_type=col[KEY_COLUMN_TYPE])
                                           for col in doc[KEY_COLUMNS]
                                       ],
                                       data_file=data_file,
                                       row_count=doc[KEY_ROWCOUNT],
                                       max_row_id=doc[KEY_MAXROWID],
                                       annotations=annotations)
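
A hedged usage sketch; the directory layout and file names are assumptions mirroring the DATA_FILE and DESCRIPTOR_FILE constants used by load_dataset further below:

import os

dataset_dir = '/path/to/datastore/datasets/DS1'  # assumed layout
handle = FileSystemDatasetHandle.from_file(
    descriptor_file=os.path.join(dataset_dir, 'descriptor.json'),
    data_file=os.path.join(dataset_dir, 'data.json'))
print(handle.identifier, [col.name for col in handle.columns])
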
Example #5
    def from_dict(obj):
        """Create the descriptor from a dictionary serialization.

        Parameters
        ----------
        obj: dict
            Dictionary serialization for dataset descriptor as returned by the
            server.

        Returns
        -------
        vizier.api.client.resources.dataset.DatasetDescriptor
        """
        return DatasetDescriptor(
            identifier=obj[labels.ID],
            name=obj[labels.NAME],
            columns=[
                DatasetColumn(
                    identifier=col[labels.ID],
                    name=col[labels.NAME],
                    data_type=col[labels.DATATYPE]
                ) for col in obj['columns']
            ],
            links=deserialize.HATEOAS(links=obj[labels.LINKS])
        )
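
For illustration, a dictionary of the shape from_dict expects; the concrete key strings behind labels.ID, labels.NAME, labels.DATATYPE, and labels.LINKS are assumptions here:

doc = {
    'id': 'DS1',  # hypothetical key names; the real ones live in labels
    'name': 'people',
    'columns': [
        {'id': 0, 'name': 'Name', 'datatype': 'varchar'},
        {'id': 1, 'name': 'Age', 'datatype': 'int'}
    ],
    'links': []
}
descriptor = DatasetDescriptor.from_dict(doc)
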
Example #6
    def insert_column(self, 
        name: str, 
        data_type: str = "varchar", 
        position: Optional[int] = None
    ) -> DatasetColumn:
        """Add a new column to the dataset schema.

        Parameters
        ----------
        name: string
            Name of the new column
        data_type: string, default 'varchar'
            Data type of the new column
        position: int, optional
            Position in the dataset schema where new column is inserted. If
            None, the column is appended to the list of dataset columns.

        Returns
        -------
        DatasetColumn
        """
        column = DatasetColumn(name=name, data_type=data_type)
        self.columns = list(self.columns)
        if position is not None:
            self.columns.insert(position, column)
            # Add a null value to each row for the new column
            for row in self.rows:
                row.values.insert(position, None)
        else:
            self.columns.append(column)
            # Add a null value to each row for the new column
            for row in self.rows:
                row.values.append(None)
        return column
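
Usage against a dataset with columns A and B, as in the tests above:

col = ds.insert_column('C', data_type='int', position=1)
assert [c.name for c in ds.columns] == ['A', 'C', 'B']
assert all(row.values[1] is None for row in ds.rows)  # new cells are null
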
Example #7
    def filter_columns(self, identifier: str, columns: List[int],
                       names: List[str],
                       datastore: Datastore) -> VizualApiResult:
        """Dataset projection operator. Returns a copy of the dataset with the
        given identifier that contains only those columns listed in columns.
        The list of names contains optional new names for the filtered columns.
        A value of None in names indicates that the name of the corresponding
        column is not changed.

        Raises ValueError if no dataset with given identifier exists or if any
        of the filter columns are unknown.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        columns: list(int)
            List of column identifier for columns in the result.
        names: list(string)
            Optional new names for filtered columns.
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # The schema of the new dataset only contains the columns in the given
        # list. Keep track of their index positions to filter values.
        schema = list()
        val_filter = list()
        for i in range(len(columns)):
            col_idx = dataset.get_index(columns[i])
            if col_idx is None:
                raise ValueError('unknown column identifier \'' +
                                 str(columns[i]) + '\'')
            col = dataset.columns[col_idx]
            if names[i] is not None:
                schema.append(
                    DatasetColumn(identifier=col.identifier,
                                  name=names[i],
                                  data_type=col.data_type))
            else:
                schema.append(col)
            val_filter.append(col_idx)
        # Create a list of projected rows
        rows = list()
        for row in dataset.fetch_rows():
            values = list()
            for v_idx in val_filter:
                values.append(row.values[v_idx])
            rows.append(DatasetRow(identifier=row.identifier, values=values))
        # Store updated dataset to get new identifier
        ds = datastore.create_dataset(columns=schema, rows=rows, properties={})
        return VizualApiResult(ds)
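
A hedged usage sketch; the api instance and the dataset attribute on VizualApiResult are assumptions:

result = api.filter_columns(
    identifier=ds.identifier,
    columns=[1, 0],      # keep both columns, swapped
    names=['B2', None],  # rename column 1 to 'B2'; None keeps column 0's name
    datastore=datastore)
print([col.name for col in result.dataset.columns])  # assumed attribute
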
 def test_query_annotations(self):
     """Test retrieving annotations via the datastore."""
     store = FileSystemDatastore(STORE_DIR)
     ds = store.create_dataset(
         columns=[
             DatasetColumn(identifier=0, name='A'),
             DatasetColumn(identifier=1, name='B')
         ],
         rows=[DatasetRow(identifier=0, values=['a', 'b'])],
         annotations=DatasetMetadata(
             cells=[
                 DatasetAnnotation(column_id=0, row_id=0, key='X', value=1),
                 DatasetAnnotation(column_id=0, row_id=0, key='X', value=2),
                 DatasetAnnotation(column_id=1, row_id=0, key='X', value=3),
                 DatasetAnnotation(column_id=0, row_id=0, key='Y', value=1)
             ],
             columns=[
                 DatasetAnnotation(column_id=0, key='A', value='x'),
                 DatasetAnnotation(column_id=1, key='A', value='x')
                 ],
             rows=[
                 DatasetAnnotation(row_id=0, key='E', value=100)
             ]
         )
     )
     annos = store.get_annotations(ds.identifier, column_id=1)
     self.assertEqual(len(annos.columns), 1)
     self.assertEqual(len(annos.rows), 0)
     self.assertEqual(len(annos.cells), 0)
     annos = store.get_annotations(ds.identifier, column_id=0)
     self.assertEqual(len(annos.columns), 1)
     self.assertEqual(len(annos.rows), 0)
     self.assertEqual(len(annos.cells), 0)
     annos = store.get_annotations(ds.identifier, row_id=0)
     self.assertEqual(len(annos.columns), 0)
     self.assertEqual(len(annos.rows), 1)
     self.assertEqual(len(annos.cells), 0)
     annos = store.get_annotations(ds.identifier, column_id=1, row_id=0)
     self.assertEqual(len(annos.columns), 0)
     self.assertEqual(len(annos.rows), 0)
     self.assertEqual(len(annos.cells), 1)
     annos = store.get_annotations(ds.identifier, column_id=0, row_id=0)
     self.assertEqual(len(annos.columns), 0)
     self.assertEqual(len(annos.rows), 0)
     self.assertEqual(len(annos.cells), 3)
 def test_load_with_dataset_delete(self):
     """Test loading workflows where each module creates a new dataset and
     deletes the previous dataset (except for the first module).
     """
     base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
     os.makedirs(base_path)
     vt = OSViztrailHandle.create_viztrail(identifier='ABC',
                                           properties={},
                                           base_path=base_path)
     branch = vt.get_default_branch()
      # Append five modules
     for i in range(5):
         ts = get_current_time()
         deleted_datasets = list()
         if i > 0:
             deleted_datasets.append('DS' + str(i - 1))
         command = python_cell(source='print ' + str(i) + '+' + str(i))
         module = OSModuleHandle.create_module(
             command=command,
             external_form='print ' + str(i) + '+' + str(i),
             state=MODULE_SUCCESS,
             outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
              provenance=ModuleProvenance(
                  write={
                      'DS' + str(i): DatasetDescriptor(
                          identifier=str(i),
                          name='DS' + str(i),
                          columns=[
                              DatasetColumn(identifier=j, name=str(j))
                              for j in range(i)
                          ],
                      )
                  },
                  delete=deleted_datasets),
             timestamp=ModuleTimestamp(created_at=ts,
                                       started_at=ts,
                                       finished_at=ts),
             module_folder=vt.modules_folder,
             object_store=vt.object_store)
          if branch.head is not None:
             modules = branch.head.modules + [module]
         else:
             modules = [module]
         branch.append_workflow(modules=modules,
                                action=ACTION_INSERT,
                                command=command)
     vt = OSViztrailHandle.load_viztrail(base_path)
     workflow = vt.get_default_branch().get_head()
     self.assertEqual(len(workflow.modules), 5)
     datasets = {}
     for i in range(5):
         module = workflow.modules[i]
         datasets = module.provenance.get_database_state(datasets)
         self.assertEqual(len(datasets), 1)
         key = 'DS' + str(i)
         self.assertTrue(key in datasets)
         self.assertEqual(len(datasets[key].columns), i)
Example #10
    def load_dataset(
        self,
        f_handle: FileHandle,
        proposed_schema: List[Tuple[str,
                                    str]] = []) -> FileSystemDatasetHandle:
        """Create a new dataset from a given file.

        Raises ValueError if the given file could not be loaded as a dataset.

        Parameters
        ----------
        f_handle : vizier.filestore.base.FileHandle
            Handle for an uploaded file

        Returns
        -------
        vizier.datastore.fs.dataset.FileSystemDatasetHandle
        """
        # The file handle might be None in which case an exception is raised
        if f_handle is None:
            raise ValueError('unknown file')
        # Expects a file in a supported tabular data format.
        if not f_handle.is_tabular:
            raise ValueError('cannot create dataset from file \'' +
                             f_handle.name + '\'')
        # Open the file as a csv file. Expects that the first row contains the
        # column names. Read dataset schema and dataset rows into two separate
        # lists.
        columns: List[DatasetColumn] = []
        rows: List[DatasetRow] = []
        with f_handle.open() as csvfile:
            reader = csv.reader(csvfile, delimiter=f_handle.delimiter)
            for col_name in next(reader):
                columns.append(
                    DatasetColumn(identifier=len(columns),
                                  name=col_name.strip()))
            for row in reader:
                values = [cast(v.strip()) for v in row]
                rows.append(
                    DatasetRow(identifier=str(len(rows)), values=values))
        # Get unique identifier and create subfolder for the new dataset
        identifier = get_unique_identifier()
        dataset_dir = self.get_dataset_dir(identifier)
        os.makedirs(dataset_dir)
        # Write rows to data file
        data_file = os.path.join(dataset_dir, DATA_FILE)
        DefaultJsonDatasetReader(data_file).write(rows)
        # Create dataset and write descriptor to file
        dataset = FileSystemDatasetHandle(identifier=identifier,
                                          columns=columns,
                                          data_file=data_file,
                                          row_count=len(rows),
                                          max_row_id=len(rows) - 1)
        dataset.to_file(
            descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE))
        return dataset
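
A hedged usage sketch; the filestore object and the CSV path are assumptions (compare the upload_file call in the dataset_column_index test further below):

f_handle = filestore.upload_file('/path/to/people.csv')
dataset = store.load_dataset(f_handle)
print(dataset.row_count, [col.name for col in dataset.columns])
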
Example #11
    def create_dataset(self, name, dataset, backend_options=[]):
        """Create a new dataset with given name.

        Raises ValueError if a dataset with given name already exists.

        Parameters
        ----------
        name : string
            Unique dataset name
        dataset : vizier.datastore.client.DatasetClient
            Dataset object

        Returns
        -------
        vizier.datastore.client.DatasetClient
        """
        # Raise an exception if a dataset with the given name already exists or
        # if the name is not valid
        if self.has_dataset_identifier(name):
            # Record access to the datasets
            self.read.add(name.lower())
            raise ValueError('dataset \'' + name + '\' already exists')
        if not is_valid_name(name):
            raise ValueError('invalid dataset name \'' + name + '\'')
        # Create list of columns for new dataset. Ensure that every column has
        # a non-negative identifier
        columns = list()
        if len(dataset.columns) > 0:
            column_counter = max(
                max([col.identifier for col in dataset.columns]) + 1, 0)
            for col in dataset.columns:
                if col.identifier < 0:
                    col.identifier = column_counter
                    column_counter += 1
                columns.append(
                    DatasetColumn(identifier=col.identifier,
                                  name=col.name,
                                  data_type=col.data_type))
        rows = dataset.rows
        if len(rows) > 0:
            # Ensure that all rows have non-negative identifiers
            row_counter = max(max([row.identifier for row in rows]) + 1, 0)
            for row in rows:
                if row.identifier < 0:
                    row.identifier = row_counter
                    row_counter += 1
        # Write dataset to datastore and add new dataset to context
        ds = self.datastore.create_dataset(columns=columns,
                                           rows=rows,
                                           annotations=dataset.annotations,
                                           human_readable_name=name.upper(),
                                           backend_options=backend_options)
        self.set_dataset_identifier(name, ds.identifier)
        self.descriptors[ds.identifier] = ds
        return DatasetClient(dataset=self.datastore.get_dataset(ds.identifier))
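
The identifier-assignment loop above can be traced in isolation. Treating a negative identifier as "not yet assigned" is an assumption consistent with the col.identifier < 0 check:

cols = [DatasetColumn(identifier=0, name='A'),
        DatasetColumn(identifier=-1, name='B'),  # new, unassigned
        DatasetColumn(identifier=5, name='C')]
counter = max(max(c.identifier for c in cols) + 1, 0)  # -> 6
for c in cols:
    if c.identifier < 0:
        c.identifier = counter
        counter += 1
# 'B' now has identifier 6; the existing ids 0 and 5 are untouched.
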
 def test_properties(self):
     """Test loading a dataset from file."""
     store = FileSystemDatastore(STORE_DIR)
     ds = store.create_dataset(
         columns=[
             DatasetColumn(identifier=0, name='A'),
             DatasetColumn(identifier=1, name='B')
         ],
         rows=[DatasetRow(identifier=0, values=[1, 2])],
         properties=EXAMPLE_PROPERTIES
     )
     ds = store.get_dataset(ds.identifier)
     column_props = ds.properties['columns']
     self.assertEqual(len(column_props), 2)
     self.assertTrue('A' in [prop['name'] for prop in column_props])
     # Reload datastore
     store = FileSystemDatastore(STORE_DIR)
     ds = store.get_dataset(ds.identifier)
     column_props = ds.properties['columns']
     self.assertEqual(len(column_props), 2)
Example #13
 def test_create_dataset(self):
     """Test loading a dataset from file."""
     store = FileSystemDatastore(STORE_DIR)
     ds = store.create_dataset(
         columns=[
             DatasetColumn(identifier=0, name='A'),
             DatasetColumn(identifier=1, name='B')
         ],
         rows=[DatasetRow(identifier=0, values=['a', 'b'])]
     )
     ds = store.get_dataset(ds.identifier)
     column_ids = [col.identifier for col in ds.columns]
     self.assertEqual(len(ds.columns), 2)
     for id in [0, 1]:
         self.assertTrue(id in column_ids)
     column_names = [col.name for col in ds.columns]
     for name in ['A', 'B']:
         self.assertTrue(name in column_names)
     rows = ds.fetch_rows()
     self.assertEqual(len(rows), 1)
     self.assertEqual(rows[0].values, ['a', 'b'])
     self.assertEqual(len(ds.annotations.cells), 0)
     self.assertEqual(len(ds.annotations.columns), 0)
     self.assertEqual(len(ds.annotations.rows), 0)
     # Reload the datastore
     store = FileSystemDatastore(STORE_DIR)
     ds = store.get_dataset(ds.identifier)
     column_ids = [col.identifier for col in ds.columns]
     self.assertEqual(len(ds.columns), 2)
     for id in [0, 1]:
         self.assertTrue(id in column_ids)
     column_names = [col.name for col in ds.columns]
     for name in ['A', 'B']:
         self.assertTrue(name in column_names)
     rows = ds.fetch_rows()
     self.assertEqual(len(rows), 1)
     self.assertEqual(rows[0].values, ['a', 'b'])
     self.assertEqual(len(ds.annotations.cells), 0)
     self.assertEqual(len(ds.annotations.columns), 0)
     self.assertEqual(len(ds.annotations.rows), 0)
 def test_validate_dataset(self):
     """Test the validate dataset function."""
     columns = []
     rows = []
     # Empty dataset
     max_col_id, max_row_id = validate_dataset(columns, rows)
     self.assertEqual(max_col_id, -1)
     self.assertEqual(max_row_id, -1)
     max_col_id, max_row_id = validate_dataset(
         columns=columns,
         rows=rows
     )
     self.assertEqual(max_col_id, -1)
     self.assertEqual(max_row_id, -1)
     # Valid set of columns and rows
     columns = [DatasetColumn(0, 'A'), DatasetColumn(10, 'B')]
     rows = [DatasetRow(0, [1, 2]), DatasetRow(4, [None, 2]), DatasetRow(2, [0, 0])]
     max_col_id, max_row_id = validate_dataset(columns, rows)
     self.assertEqual(max_col_id, 10)
     self.assertEqual(max_row_id, 4)
     max_col_id, max_row_id = validate_dataset(
         columns=columns,
         rows=rows
     )
     self.assertEqual(max_col_id, 10)
     self.assertEqual(max_row_id, 4)
     # Column errors
     with self.assertRaises(ValueError):
         validate_dataset(columns + [DatasetColumn()], [])
     with self.assertRaises(ValueError):
         validate_dataset(columns + [DatasetColumn(10, 'C')], [])
     # Row errors
     with self.assertRaises(ValueError):
         validate_dataset(columns, rows + [DatasetRow(1000, [0, 1, 3])])
     with self.assertRaises(ValueError):
         validate_dataset(columns, rows + [DatasetRow(-1, [1, 3])])
     with self.assertRaises(ValueError):
         validate_dataset(columns, rows + [DatasetRow(0, [1, 3])])
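
A minimal sketch of the checks these assertions imply; the real validate_dataset may differ in details such as error messages:

def validate_dataset(columns, rows):
    # Returns (max column id, max row id), or (-1, -1) for an empty dataset.
    max_col_id, col_ids = -1, set()
    for col in columns:
        if col.identifier is None or col.identifier < 0:
            raise ValueError('missing or negative column identifier')
        if col.identifier in col_ids:
            raise ValueError('duplicate column identifier')
        col_ids.add(col.identifier)
        max_col_id = max(max_col_id, col.identifier)
    max_row_id, row_ids = -1, set()
    for row in rows:
        if row.identifier is None or row.identifier < 0:
            raise ValueError('missing or negative row identifier')
        if row.identifier in row_ids:
            raise ValueError('duplicate row identifier')
        if len(row.values) != len(columns):
            raise ValueError('schema violation for row ' + str(row.identifier))
        row_ids.add(row.identifier)
        max_row_id = max(max_row_id, row.identifier)
    return max_col_id, max_row_id
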
Example #15
 def dataset_column_index(self):
     """Test the column by id index of the dataset handle."""
     self.setup_fileserver()
     ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
      # Ensure that the loaded dataset has the expected three columns
     self.assertEqual(ds.column_by_id(0).name.upper(), 'NAME')
     self.assertEqual(ds.column_by_id(1).name.upper(), 'AGE')
     self.assertEqual(ds.column_by_id(2).name.upper(), 'SALARY')
     with self.assertRaises(ValueError):
         ds.column_by_id(5)
     ds.columns.append(DatasetColumn(identifier=5, name='NEWNAME'))
     self.assertEqual(ds.column_by_id(5).name.upper(), 'NEWNAME')
     with self.assertRaises(ValueError):
         ds.column_by_id(4)
Example #16
    def rename_column(self, identifier: str, column_id: int, name: str,
                      datastore: Datastore) -> VizualApiResult:
        """Rename column in a given dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column is unknown, or if the given column name is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        column_id: int
            Unique column identifier
        name: string
            New column name
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Raise ValueError if given column name is invalid
        if not is_valid_name(name):
            raise ValueError('invalid column name \'' + name + '\'')
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Get the specified column that is to be renamed and set the column name
        # to the new name
        col_idx = dataset.get_index(column_id)
        if col_idx is None:
            raise ValueError('unknown column identifier \'' + str(column_id) +
                             '\'')
        # Only create a new dataset version if the new name actually differs
        # from the current column name (comparison is case-insensitive)
        if dataset.columns[col_idx].name.lower() != name.lower():
            columns = list(dataset.columns)
            col = columns[col_idx]
            columns[col_idx] = DatasetColumn(identifier=col.identifier,
                                             name=name,
                                             data_type=col.data_type)
            # Store updated dataset to get new identifier
            ds = datastore.create_dataset(columns=columns,
                                          rows=dataset.fetch_rows(),
                                          properties={})
            return VizualApiResult(ds)
        else:
            return VizualApiResult(dataset)
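
A hedged usage sketch; the api instance, the datastore, and the dataset attribute on VizualApiResult are assumptions:

result = api.rename_column(identifier=ds.identifier, column_id=0,
                           name='Name', datastore=datastore)
# Renaming a column to its current name (in any letter case) returns the
# input dataset unchanged; otherwise a new dataset version is created.
new_ds = result.dataset  # assumed attribute
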
Example #17
    def create_dataset(self, name, dataset, backend_options=[]):
        """Create a new dataset with given name.

        Raises ValueError if a dataset with given name already exists.

        Parameters
        ----------
        name : string
            Unique dataset name
        dataset : vizier.datastore.client.DatasetClient
            Dataset object

        Returns
        -------
        vizier.datastore.client.DatasetClient
        """
        # Raise an exception if a dataset with the given name already exists or
        # if the name is not valid
        if name.lower() in self.datasets:
            # Record access to the datasets
            raise ValueError('dataset \'' + name + '\' already exists')
        if not is_valid_name(name):
            raise ValueError('invalid dataset name \'' + name + '\'')
        # Create list of columns for new dataset. Ensure that every column has
        # a non-negative identifier
        columns = list()
        if len(dataset.columns) > 0:
            column_counter = max(
                max([col.identifier for col in dataset.columns]) + 1, 0)
            for col in dataset.columns:
                if col.identifier < 0:
                    col.identifier = column_counter
                    column_counter += 1
                columns.append(
                    DatasetColumn(identifier=col.identifier,
                                  name=col.name,
                                  data_type=col.data_type))
        rows = dataset.rows
        # Write dataset to datastore and add new dataset to context
        ds = self.datastore.create_dataset(columns=columns,
                                           rows=rows,
                                           properties=dataset.properties,
                                           human_readable_name=name,
                                           backend_options=backend_options)
        self.datasets[name.lower()] = ds
        self.write.add(name.lower())
        return DatasetClient(dataset=self.datastore.get_dataset(ds.identifier),
                             client=self,
                             existing_name=name.lower())
Example #18
    def insert_column(self, identifier: str, position: int, name: str,
                      datastore: Datastore) -> VizualApiResult:
        """Insert column with given name at given position in dataset.

        Raises ValueError if no dataset with given identifier exists, if the
        specified column position is outside of the current schema bounds, or if
        the column name is invalid.

        Parameters
        ----------
        identifier: string
            Unique dataset identifier
        position: int
            Index position at which the column will be inserted
        name: string
            New column name
        datastore : vizier.datastore.fs.base.FileSystemDatastore
            Datastore to retrieve and update datasets

        Returns
        -------
        vizier.engine.packages.vizual.api.VizualApiResult
        """
        # Raise ValueError if given column name is invalid
        if name is not None and not is_valid_name(name):
            raise ValueError('invalid column name \'' + name + '\'')
        # Get dataset. Raise exception if dataset is unknown
        dataset = datastore.get_dataset(identifier)
        if dataset is None:
            raise ValueError('unknown dataset \'' + identifier + '\'')
        # Make sure that position is a valid column index in the new dataset
        if position < 0 or position > len(dataset.columns):
            raise ValueError('invalid column index \'' + str(position) + '\'')
        # Insert new column into dataset
        columns = list(dataset.columns)
        rows = dataset.fetch_rows()
        columns.insert(
            position,
            DatasetColumn(identifier=dataset.max_column_id() + 1,
                          name=name if name is not None else ''))
        # Add a null value to each row for the new column
        for row in rows:
            row.values.insert(position, None)
        # Store updated dataset to get new identifier
        ds = datastore.create_dataset(columns=columns,
                                      rows=rows,
                                      properties={})
        return VizualApiResult(ds)
Example #19
def DATASET_COLUMNS(obj):
    """Convert a list of dictionaries into a list of dataset columns.

    Parameters
    ----------
    obj: list
        List of dataset columns in default serialization format

    Returns
    -------
    list
    """
    return [
        DatasetColumn(identifier=col[labels.ID],
                      name=col[labels.NAME],
                      data_type=col[labels.DATATYPE]) for col in obj
    ]
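
For illustration, input of the shape DATASET_COLUMNS expects; the concrete key strings behind labels.ID, labels.NAME, and labels.DATATYPE are assumptions here:

cols = DATASET_COLUMNS([
    {'id': 0, 'name': 'Name', 'datatype': 'varchar'},  # hypothetical keys
    {'id': 1, 'name': 'Age', 'datatype': 'int'}
])
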
Example #20
 def test_update_annotations(self):
     """Test updating annotations via the datastore."""
     store = FileSystemDatastore(STORE_DIR)
     ds = store.create_dataset(
         columns=[
             DatasetColumn(identifier=0, name='A'),
             DatasetColumn(identifier=1, name='B')
         ],
         rows=[DatasetRow(identifier=0, values=['a', 'b'])],
         annotations=DatasetMetadata(
             cells=[
                 DatasetAnnotation(column_id=0, row_id=0, key='X', value=1),
                 DatasetAnnotation(column_id=0, row_id=0, key='X', value=2),
                 DatasetAnnotation(column_id=1, row_id=0, key='X', value=3),
                 DatasetAnnotation(column_id=0, row_id=0, key='Y', value=1)
             ],
             columns=[
                 DatasetAnnotation(column_id=0, key='A', value='x'),
                 DatasetAnnotation(column_id=1, key='A', value='x')
                 ],
             rows=[
                 DatasetAnnotation(row_id=0, key='E', value=100)
             ]
         )
     )
      # INSERT row annotations
     store.update_annotation(
         ds.identifier,
         key='D',
         row_id=0,
         new_value=200
     )
     annos = store.get_annotations(ds.identifier, row_id=0)
     self.assertEqual(len(annos.rows), 2)
     for key in ['D', 'E']:
         self.assertTrue(key in [a.key for a in annos.rows])
     for val in [100, 200]:
         self.assertTrue(val in [a.value for a in annos.rows])
     # UPDATE column annotation
     store.update_annotation(
         ds.identifier,
         key='A',
         column_id=1,
         old_value='x',
         new_value='y'
     )
     annos = store.get_annotations(ds.identifier, column_id=1)
     self.assertEqual(annos.columns[0].key, 'A')
     self.assertEqual(annos.columns[0].value, 'y')
     # DELETE cell annotation
     store.update_annotation(
         ds.identifier,
         key='X',
         column_id=0,
         row_id=0,
         old_value=2,
     )
     annos = store.get_annotations(ds.identifier, column_id=0, row_id=0)
     self.assertEqual(len(annos.cells), 2)
     for a in annos.cells:
         self.assertNotEqual(a.value, 2)
     result = store.update_annotation(
         ds.identifier,
         key='X',
         column_id=1,
         row_id=0,
         old_value=3,
     )
     self.assertTrue(result)
     annos = store.get_annotations(ds.identifier, column_id=1, row_id=0)
     self.assertEqual(len(annos.cells), 0)
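
The three update_annotation calls above exercise what looks like an upsert contract: passing only new_value inserts, passing both old_value and new_value replaces the matching annotation, and passing only old_value deletes it. A minimal sketch of that dispatch over a plain list (assumed semantics, not the store's code):

def apply_update(annos, key, old_value=None, new_value=None, **ids):
    # ids carries the optional column_id/row_id scoping arguments.
    if old_value is not None:
        # UPDATE or DELETE: remove the annotation(s) matching old_value.
        annos = [a for a in annos
                 if not (a.key == key and a.value == old_value)]
    if new_value is not None:
        # INSERT or UPDATE: add an annotation carrying new_value.
        annos.append(DatasetAnnotation(key=key, value=new_value, **ids))
    return annos
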
Example #21
    def load_module(identifier,
                    module_path,
                    prev_state=None,
                    object_store=None):
        """Load module from given object store.

        Parameters
        ----------
        identifier: string
            Unique module identifier
        module_path: string
            Resource path for module object
        prev_state: dict(string: vizier.datastore.dataset.DatasetDescriptor)
            Dataset descriptors keyed by the user-provided name that exist in
            the database state of the previous module (in sequence of occurrence
            in the workflow)
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources

        Returns
        -------
        vizier.viztrail.objectstore.module.OSModuleHandle
        """
        # Make sure the object store is not None
        if object_store is None:
            object_store = DefaultObjectStore()
        # Read object from store. This may raise a ValueError to indicate that
        # the module does not exist (in a system error condition). In this
        # case we return a new module that is in an error state.
        try:
            obj = object_store.read_object(object_path=module_path)
        except ValueError:
            return OSModuleHandle(
                identifier=identifier,
                command=ModuleCommand(package_id=UNKNOWN_ID,
                                      command_id=UNKNOWN_ID),
                external_form='fatal error: object not found',
                module_path=module_path,
                state=mstate.MODULE_ERROR,
                object_store=object_store)
        # Create module command
        command = ModuleCommand(package_id=obj[KEY_COMMAND][KEY_PACKAGE_ID],
                                command_id=obj[KEY_COMMAND][KEY_COMMAND_ID],
                                arguments=obj[KEY_COMMAND][KEY_ARGUMENTS])
        # Create module timestamps
        created_at = to_datetime(obj[KEY_TIMESTAMP][KEY_CREATED_AT])
        if KEY_STARTED_AT in obj[KEY_TIMESTAMP]:
            started_at = to_datetime(obj[KEY_TIMESTAMP][KEY_STARTED_AT])
        else:
            started_at = None
        if KEY_FINISHED_AT in obj[KEY_TIMESTAMP]:
            finished_at = to_datetime(obj[KEY_TIMESTAMP][KEY_FINISHED_AT])
        else:
            finished_at = None
        timestamp = ModuleTimestamp(created_at=created_at,
                                    started_at=started_at,
                                    finished_at=finished_at)
        # Create module output streams.
        outputs = ModuleOutputs(
            stdout=get_output_stream(obj[KEY_OUTPUTS][KEY_STDOUT]),
            stderr=get_output_stream(obj[KEY_OUTPUTS][KEY_STDERR]))
        # Create module provenance information
        read_prov = None
        if KEY_PROVENANCE_READ in obj[KEY_PROVENANCE]:
            read_prov = dict()
            for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_READ]:
                read_prov[ds[KEY_DATASET_NAME]] = ds[KEY_DATASET_ID]
        write_prov = None
        if KEY_PROVENANCE_WRITE in obj[KEY_PROVENANCE]:
            write_prov = dict()
            for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_WRITE]:
                descriptor = DatasetDescriptor(
                    identifier=ds[KEY_DATASET_ID],
                    columns=[
                        DatasetColumn(identifier=col[KEY_COLUMN_ID],
                                      name=col[KEY_COLUMN_NAME],
                                      data_type=col[KEY_COLUMN_TYPE])
                        for col in ds[KEY_DATASET_COLUMNS]
                    ],
                    row_count=ds[KEY_DATASET_ROWCOUNT])
                write_prov[ds[KEY_DATASET_NAME]] = descriptor
        delete_prov = None
        if KEY_PROVENANCE_DELETE in obj[KEY_PROVENANCE]:
            delete_prov = obj[KEY_PROVENANCE][KEY_PROVENANCE_DELETE]
        res_prov = None
        if KEY_PROVENANCE_RESOURCES in obj[KEY_PROVENANCE]:
            res_prov = obj[KEY_PROVENANCE][KEY_PROVENANCE_RESOURCES]
        charts_prov = None
        if KEY_PROVENANCE_CHARTS in obj[KEY_PROVENANCE]:
            charts_prov = [
                ChartViewHandle.from_dict(c)
                for c in obj[KEY_PROVENANCE][KEY_PROVENANCE_CHARTS]
            ]
        provenance = ModuleProvenance(read=read_prov,
                                      write=write_prov,
                                      delete=delete_prov,
                                      resources=res_prov,
                                      charts=charts_prov)
        # Create dictionary of dataset descriptors only if previous state is
        # given and the module is in SUCCESS state. Otherwise, the database
        # state is empty.
        if obj[KEY_STATE] == mstate.MODULE_SUCCESS and prev_state is not None:
            datasets = provenance.get_database_state(prev_state)
        else:
            datasets = dict()
        # Return module handle
        return OSModuleHandle(identifier=identifier,
                              command=command,
                              external_form=obj[KEY_EXTERNAL_FORM],
                              module_path=module_path,
                              state=obj[KEY_STATE],
                              timestamp=timestamp,
                              datasets=datasets,
                              outputs=outputs,
                              provenance=provenance,
                              object_store=object_store)
Example #22
    def load_module(
            identifier: str, 
            module_path: str, 
            prev_state: Optional[Dict[str, ArtifactDescriptor]] = None, 
            object_store: ObjectStore = DefaultObjectStore()
        ) -> "OSModuleHandle":
        """Load module from given object store.

        Parameters
        ----------
        identifier: string
            Unique module identifier
        module_path: string
            Resource path for module object
        prev_state: dict(string: vizier.datastore.dataset.DatasetDescriptor)
            Dataset descriptors keyed by the user-provided name that exist in
            the database state of the previous module (in sequence of occurrence
            in the workflow)
        object_store: vizier.core.io.base.ObjectStore, optional
            Object store implementation to access and maintain resources

        Returns
        -------
        vizier.viztrail.objectstore.module.OSModuleHandle
        """
        # Read object from store. This may raise a ValueError to indicate that
        # the module does not exist (in a system error condition). In this
        # case we return a new module that is in an error state.
        try:
            obj = cast(Dict[str, Any], object_store.read_object(object_path=module_path))
        except ValueError:
            return OSModuleHandle(
                identifier=identifier,
                command=ModuleCommand(
                    package_id=UNKNOWN_ID,
                    command_id=UNKNOWN_ID,
                    arguments=list(),
                    packages=None
                ),
                external_form='fatal error: object not found',
                module_path=module_path,
                state=mstate.MODULE_ERROR,
                object_store=object_store
            )
        # Create module command
        command = ModuleCommand(
            package_id=obj[KEY_COMMAND][KEY_PACKAGE_ID],
            command_id=obj[KEY_COMMAND][KEY_COMMAND_ID],
            arguments=obj[KEY_COMMAND][KEY_ARGUMENTS],
            packages=None
        )
        # Create module timestamps
        created_at = to_datetime(obj[KEY_TIMESTAMP][KEY_CREATED_AT])
        if KEY_STARTED_AT in obj[KEY_TIMESTAMP]:
            started_at: Optional[datetime] = to_datetime(obj[KEY_TIMESTAMP][KEY_STARTED_AT])
        else:
            started_at = None
        if KEY_FINISHED_AT in obj[KEY_TIMESTAMP]:
            finished_at: Optional[datetime] = to_datetime(obj[KEY_TIMESTAMP][KEY_FINISHED_AT])
        else:
            finished_at = None
        timestamp = ModuleTimestamp(
            created_at=created_at,
            started_at=started_at,
            finished_at=finished_at
        )
        # Create module output streams.
        outputs = ModuleOutputs(
            stdout=get_output_stream(obj[KEY_OUTPUTS][KEY_STDOUT]),
            stderr=get_output_stream(obj[KEY_OUTPUTS][KEY_STDERR])
        )
        # Create module provenance information
        read_prov = None
        if KEY_PROVENANCE_READ in obj[KEY_PROVENANCE]:
            read_prov = dict()
            for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_READ]:
                read_prov[ds[KEY_DATASET_NAME]] = ds[KEY_DATASET_ID]
        write_prov = None
        if KEY_PROVENANCE_WRITE in obj[KEY_PROVENANCE]:
            write_prov = dict()
            for ds in obj[KEY_PROVENANCE][KEY_PROVENANCE_WRITE]:
                if KEY_DATAOBJECT_TYPE in ds:
                    descriptor = ArtifactDescriptor(
                        identifier=ds[KEY_DATAOBJECT_ID],
                        name=ds[KEY_DATAOBJECT_NAME],
                        artifact_type=ds[KEY_DATAOBJECT_TYPE])
                else: 
                    descriptor = DatasetDescriptor(
                        identifier=ds[KEY_DATASET_ID],
                        name=ds[KEY_DATASET_NAME],
                        columns=[
                            DatasetColumn(
                                identifier=col[KEY_COLUMN_ID],
                                name=col[KEY_COLUMN_NAME],
                                data_type=col[KEY_COLUMN_TYPE]
                            ) for col in ds[KEY_DATASET_COLUMNS]
                        ]
                    )
                write_prov[ds[KEY_DATASET_NAME]] = descriptor
        if KEY_PROVENANCE_DELETE in obj[KEY_PROVENANCE]:
            delete_prov = set(obj[KEY_PROVENANCE][KEY_PROVENANCE_DELETE])
        else:
            delete_prov = set()
        if KEY_PROVENANCE_RESOURCES in obj[KEY_PROVENANCE]:
            res_prov = cast(Dict[str, Any], obj[KEY_PROVENANCE][KEY_PROVENANCE_RESOURCES])
        else:
            res_prov = dict()
        if KEY_PROVENANCE_CHARTS in obj[KEY_PROVENANCE]:
            charts_prov = [
                (c[0], ChartViewHandle.from_dict(c[1]))  # type: ignore[no-untyped-call]
                if isinstance(c, list)
                else ("Chart", ChartViewHandle.from_dict(c))
                for c in obj[KEY_PROVENANCE][KEY_PROVENANCE_CHARTS]
            ]
        else:
            charts_prov = list()
        provenance = ModuleProvenance(
            read=read_prov,
            write=write_prov,
            delete=delete_prov,
            resources=res_prov,
            charts=charts_prov
        )
        # Return module handle
        return OSModuleHandle(
            identifier=identifier,
            command=command,
            external_form=obj[KEY_EXTERNAL_FORM],
            module_path=module_path,
            state=obj[KEY_STATE],
            timestamp=timestamp,
            outputs=outputs,
            provenance=provenance,
            object_store=object_store,
        )
Example #23
from vizier.viztrail.module.output import ModuleOutputs, OutputObject, TextOutput
from vizier.viztrail.module.provenance import ModuleProvenance
from vizier.viztrail.module.timestamp import ModuleTimestamp
from vizier.engine.packages.plot.command import create_plot
from vizier.engine.packages.pycell.command import python_cell

MODULE_DIR = './.temp'

DATASETS = {
    'DS1':
    DatasetDescriptor(identifier='ID1'),
    'DS2':
    DatasetDescriptor(identifier='ID2',
                      columns=[
                          DatasetColumn(identifier=0,
                                        name='ABC',
                                        data_type='int'),
                          DatasetColumn(identifier=1,
                                        name='xyz',
                                        data_type='real')
                      ],
                      row_count=100)
}


class TestOSModuleIO(unittest.TestCase):
    def setUp(self):
        """Create an empty directory."""
        if os.path.isdir(MODULE_DIR):
            shutil.rmtree(MODULE_DIR)
        os.makedirs(MODULE_DIR)
from atexit import register as at_exit

URLS = UrlFactory(base_url='http://localhost:5000/vizier-db/api/v1')

api = VizierApiClient(URLS)
PROJECT_ID = api.create_project({"name": "Test Client Datastore"}).identifier

at_exit(api.delete_project, PROJECT_ID)

# We're just doing some unit testing on the fields specific to DatastoreClient, so
# ignore complaints about instantiating an abstract class
store = DatastoreClient(  # type: ignore[abstract]
    urls=DatastoreClientUrlFactory(urls=URLS, project_id=PROJECT_ID))

ds = store.create_dataset(
    columns=[
        DatasetColumn(identifier=0, name='Name'),
        DatasetColumn(identifier=1, name='Age', data_type="int")
    ],
    rows=[
        DatasetRow(identifier=0, values=['Alice', 32]),
        DatasetRow(identifier=1, values=['Bob', 23])
    ],
    properties={"example_property": "foo"})

# print(ds)
# print([col.identifier for col in ds.columns])
# print([col.name for col in ds.columns])

dh = store.get_dataset(ds.identifier)
assert dh is not None
for row in dh.fetch_rows():
    print([row.identifier] + row.values)

from vizier.engine.packages.mimir.command import mimir_geocode
from vizier.engine.packages.mimir.command import mimir_key_repair, mimir_missing_key
from vizier.engine.packages.mimir.command import mimir_missing_value, mimir_picker
from vizier.datastore.dataset import DatasetColumn, DatasetDescriptor

import vizier.engine.packages.base as pckg
import vizier.engine.packages.mimir.base as mimir
import vizier.viztrail.command as md

DATASETS = {
    'ds':
    DatasetDescriptor(identifier='0000',
                      name='ds',
                      columns=[
                          DatasetColumn(identifier=2, name='Some Name'),
                          DatasetColumn(identifier=1, name='Street')
                      ])
}
PACKAGE = pckg.PackageIndex(mimir.MIMIR_LENSES)


class TestValidateMimir(unittest.TestCase):
    def test_mimir_geocode(self):
        """Test validation of Mimir geocode lens."""
        cmd = mimir_geocode(dataset_name='ds',
                            geocoder='GOOGLE',
                            street=1,
                            city=2,
                            materialize_input=False,
                            validate=True).to_external_form(
                                # reconstructed arguments; the original call
                                # is truncated in the source:
                                command=PACKAGE.get(mimir.MIMIR_GEOCODE),
                                datasets=DATASETS)


def print_annotations(annotations):
    # Reconstructed header for a truncated snippet (assumed shape): render
    # each annotation as '[<column_id>, <row_id>]: <key> = <value>'.
    for anno in annotations.columns + annotations.rows + annotations.cells:
        line = '['
        if anno.column_id is not None:
            line += str(anno.column_id) + ', '
        if anno.row_id is not None:
            line += str(anno.row_id)
        line += ']: ' + anno.key + ' = ' + str(anno.value)
        print(line)


store = DatastoreClient(
    urls=DatastoreClientUrlFactory(
        urls=UrlFactory(
            base_url='http://localhost:5000/vizier-db/api/v1'
        ),
        project_id=PROJECT_ID
    )
)

ds = store.create_dataset(
    columns=[DatasetColumn(identifier=0, name='Name'), DatasetColumn(identifier=1, name='Age')],
    rows=[DatasetRow(identifier=0, values=['Alice', 32]), DatasetRow(identifier=1, values=['Bob', 23])],
    annotations=DatasetMetadata(rows=[DatasetAnnotation(row_id=1, key='user:comment', value='Needs cleaning')])
)

print(ds)
print([col.identifier for col in ds.columns])
print([col.name for col in ds.columns])

dh = store.get_dataset(ds.identifier)
for row in dh.fetch_rows():
    print([row.identifier] + row.values)

annotations = dh.get_annotations()
print_annotations(annotations)
Example #27
 def test_column_index(self):
     """Test access to columns based on identifier and name."""
     ds = DatasetDescriptor(identifier='0',
                            columns=[
                                DatasetColumn(identifier=0, name='ABC'),
                                DatasetColumn(identifier=1, name='A'),
                                DatasetColumn(identifier=2, name='ABC'),
                                DatasetColumn(identifier=3, name='DEF'),
                                DatasetColumn(identifier=4, name='xyz'),
                            ])
     # Get column by identifier
     self.assertEqual(ds.column_by_id(0).name, 'ABC')
     self.assertEqual(ds.column_by_id(1).name, 'A')
     self.assertEqual(ds.column_by_id(2).name, 'ABC')
     self.assertEqual(ds.column_by_id(3).name, 'DEF')
     self.assertEqual(ds.column_by_id(4).name, 'xyz')
     with self.assertRaises(ValueError):
         ds.column_by_id(6)
     with self.assertRaises(ValueError):
         ds.column_by_id(-1)
     # Get column by name
     self.assertEqual(ds.column_by_name('ABC').identifier, 0)
     self.assertEqual(ds.column_by_name('A').identifier, 1)
     self.assertEqual(
         ds.column_by_name('abc', ignore_case=True).identifier, 0)
     self.assertEqual(
         ds.column_by_name('XYZ', ignore_case=True).identifier, 4)
     self.assertIsNone(ds.column_by_name('4'))
     # Get column index
     self.assertEqual(ds.column_index(0), 0)
     self.assertEqual(ds.column_index(1), 1)
     self.assertEqual(ds.column_index('DEF'), 3)
     self.assertEqual(ds.column_index('XYZ'), 4)
     self.assertEqual(ds.column_index('A'), 1)
     self.assertEqual(ds.column_index('B'), 1)
     self.assertEqual(ds.column_index('C'), 2)
     self.assertEqual(ds.column_index('D'), 3)
     self.assertEqual(ds.column_index('E'), 4)
     for i in range(len(ds.columns)):
         self.assertEqual(ds.get_index(i), i)
     with self.assertRaises(ValueError):
         ds.column_index('ABC')
     with self.assertRaises(ValueError):
         ds.column_index('abc')
     # Create a descriptor when column identifier does not match the index
     # position in the schema
     ds = DatasetDescriptor(identifier='0',
                            columns=[
                                DatasetColumn(identifier=4, name='ABC'),
                                DatasetColumn(identifier=2, name='A'),
                                DatasetColumn(identifier=3, name='ABC'),
                                DatasetColumn(identifier=0, name='DEF'),
                                DatasetColumn(identifier=1, name='xyz'),
                            ])
     self.assertEqual(ds.column_by_id(0).name, 'DEF')
     self.assertEqual(ds.column_by_id(1).name, 'xyz')
     self.assertEqual(ds.column_by_id(2).name, 'A')
     self.assertEqual(ds.column_by_id(3).name, 'ABC')
     self.assertEqual(ds.column_by_id(4).name, 'ABC')
     self.assertEqual(ds.column_index(0), 0)
     self.assertEqual(ds.column_index(1), 1)
     self.assertEqual(ds.column_index('DEF'), 3)
     self.assertEqual(ds.column_index('XYZ'), 4)
     self.assertEqual(ds.column_index('A'), 1)
     self.assertEqual(ds.column_index('B'), 1)
     self.assertEqual(ds.column_index('C'), 2)
     self.assertEqual(ds.column_index('D'), 3)
     self.assertEqual(ds.column_index('E'), 4)
     self.assertEqual(ds.get_index(0), 3)
     self.assertEqual(ds.get_index(1), 4)
     self.assertEqual(ds.get_index(2), 1)
     self.assertEqual(ds.get_index(3), 2)
     self.assertEqual(ds.get_index(4), 0)
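
Taken together, these assertions pin down column_index's lookup rules: an integer argument is returned as the index position itself (note column_index(0) == 0 even though get_index(0) == 3 in the scrambled schema), a unique column name resolves case-insensitively to its position, an ambiguous name such as 'ABC' raises ValueError, and a name matching no column is decoded as a spreadsheet-style label ('A' = 0, 'B' = 1, ..., 'AA' = 26). A minimal sketch of the label decoding, assumed rather than taken from the library:

def label_to_index(label):
    # Decode a spreadsheet-style column label into a 0-based index.
    index = 0
    for ch in label.upper():
        index = index * 26 + (ord(ch) - ord('A') + 1)
    return index - 1

assert label_to_index('B') == 1 and label_to_index('E') == 4
assert label_to_index('AA') == 26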