def __init__(self, identifier, columns=None, row_count=None, annotations=None, name=None):
    """Initialize the dataset. Raises ValueError if dataset columns or rows
    do not have unique identifiers.

    Parameters
    ----------
    identifier: string
        Unique dataset identifier.
    columns: list(DatasetColumn), optional
        List of columns. It is expected that each column has a unique
        identifier.
    row_count: int, optional
        Number of rows in the dataset
    annotations: vizier.datastore.annotation.dataset.DatasetMetadata, optional
        Annotations for dataset components
    name: string, optional
        Dataset name (passed through to the superclass)
    """
    super(DatasetHandle, self).__init__(
        identifier=identifier,
        columns=columns,
        row_count=row_count,
        name=name
    )
    # Default to an empty metadata set so self.annotations is never None.
    # Fixed anti-idiom: 'x is not None' instead of 'not x is None' (PEP 8).
    self.annotations = annotations if annotations is not None else DatasetMetadata()
def create_dataset():
    """Create a new dataset in the datastore for the project. The dataset
    schema and rows are given in the request body. Dataset annotations are
    optional. The expected request body format is:

    {
      "columns": [
        {
          "id": 0,
          "name": "string",
          "type": "string"
        }
      ],
      "rows": [
        {
          "id": 0,
          "values": [
            "string"
          ]
        }
      ],
      "annotations": [
        {
          "columnId": 0,
          "rowId": 0,
          "key": "string",
          "value": "string"
        }
      ]
    }
    """
    # Validate the request body against the expected schema.
    body = srv.validate_json_request(
        request,
        required=[labels.COLUMNS, labels.ROWS],
        optional=[labels.ANNOTATIONS]
    )
    columns = deserialize.DATASET_COLUMNS(body[labels.COLUMNS])
    rows = [deserialize.DATASET_ROW(r) for r in body[labels.ROWS]]
    # Annotations are optional. Each deserialized annotation is routed to
    # the rows, columns, or cells list depending on which identifiers it
    # carries.
    annotations = None
    if labels.ANNOTATIONS in body:
        annotations = DatasetMetadata()
        for doc in body[labels.ANNOTATIONS]:
            anno = deserialize.ANNOTATION(doc)
            if anno.column_id is None:
                target = annotations.rows
            elif anno.row_id is None:
                target = annotations.columns
            else:
                target = annotations.cells
            target.append(anno)
    try:
        dataset = api.datasets.create_dataset(
            project_id=config.project_id,
            columns=columns,
            rows=rows,
            annotations=annotations
        )
        return jsonify(dataset)
    except ValueError as ex:
        # Surface datastore validation errors as HTTP 400 responses.
        raise srv.InvalidRequest(str(ex))
def test_deduplicate_annotations(self):
    """Test removing duplicated annotations."""
    store = FileSystemDatastore(STORE_DIR)
    # Cell annotations contain deliberate duplicates (second half repeats
    # the first four entries) to exercise de-duplication on read.
    cell_specs = [
        (0, 0, 'X', 1), (0, 0, 'X', 2), (1, 0, 'X', 3), (1, 1, 'X', 3),
        (0, 0, 'Y', 1),
        (0, 0, 'X', 1), (0, 0, 'X', 2), (1, 0, 'X', 3), (1, 1, 'X', 3)
    ]
    ds = store.create_dataset(
        columns=[
            DatasetColumn(identifier=0, name='A'),
            DatasetColumn(identifier=1, name='B')
        ],
        rows=[DatasetRow(identifier=0, values=['a', 'b'])],
        annotations=DatasetMetadata(
            cells=[
                DatasetAnnotation(column_id=c, row_id=r, key=k, value=v)
                for c, r, k, v in cell_specs
            ],
            # Four identical copies of the same two column annotations.
            columns=[
                DatasetAnnotation(column_id=c, key='A', value='x')
                for c in [0, 1] * 4
            ],
            # The same row annotation twice.
            rows=[
                DatasetAnnotation(row_id=0, key='E', value=100)
                for _ in range(2)
            ]
        )
    )
    ds = store.get_dataset(ds.identifier)
    self.assertEqual(len(ds.annotations.cells), 4)
    self.assertEqual(len(ds.annotations.columns), 2)
    self.assertEqual(len(ds.annotations.rows), 1)
    annos = ds.annotations.for_cell(column_id=0, row_id=0)
    self.assertEqual(len(annos), 3)
    cell_values = [a.value for a in annos]
    self.assertTrue(1 in cell_values)
    self.assertTrue(2 in cell_values)
    self.assertFalse(3 in cell_values)
    self.assertEqual(len(ds.annotations.find_all(values=annos, key='X')), 2)
    # find_one must fail when more than one annotation matches.
    with self.assertRaises(ValueError):
        ds.annotations.find_one(values=annos, key='X')
    self.assertEqual(len(ds.annotations.for_column(column_id=0)), 1)
    self.assertEqual(len(ds.annotations.for_row(row_id=0)), 1)
    annotations = ds.annotations.filter(columns=[1])
    self.assertEqual(len(annotations.cells), 1)
    self.assertEqual(len(annotations.columns), 1)
    self.assertEqual(len(annotations.rows), 1)
def annotations(self):
    """Get all dataset annotations.

    Returns
    -------
    vizier.datastore.annotation.dataset.DatasetMetadata
    """
    # Lazily fetch the annotations from the underlying dataset handle on
    # first access and cache the result for subsequent calls.
    if self._annotations is None:
        raw = self.dataset.get_annotations()
        self._annotations = DatasetMetadata.from_list(raw)
    return self._annotations
def get_annotations(self, identifier, column_id=None, row_id=None):
    """Get list of annotations for a resources of a given dataset. If only
    the column id is provided annotations for the identifier column will be
    returned. If only the row identifier is given all annotations for the
    specified row are returned. Otherwise, all annotations for the
    specified cell are returned. If both identifier are None all
    annotations for the dataset are returned.

    Parameters
    ----------
    identifier : string
        Unique dataset identifier
    column_id: int, optional
        Unique column identifier
    row_id: int, optional
        Unique row identifier

    Returns
    -------
    vizier.datastore.annotation.dataset.DatasetMetadata
    """
    # Test if a subfolder for the given dataset identifier exists. If not
    # return None.
    dataset_dir = self.get_dataset_dir(identifier)
    if not os.path.isdir(dataset_dir):
        return None
    metadata = DatasetMetadata.from_file(
        self.get_metadata_filename(identifier)
    )
    has_col = column_id is not None
    has_row = row_id is not None
    # No resource given: return everything.
    if not has_col and not has_row:
        return metadata
    # Both given: restrict to the single cell's annotations.
    if has_col and has_row:
        return DatasetMetadata(cells=metadata.cells).filter(
            columns=[column_id], rows=[row_id]
        )
    # Column only.
    if has_col:
        return DatasetMetadata(columns=metadata.columns).filter(
            columns=[column_id]
        )
    # Row only.
    return DatasetMetadata(rows=metadata.rows).filter(rows=[row_id])
def DATASET_ANNOTATIONS(obj):
    """Convert dictionary serialization into a dataset metadata object.

    Parameters
    ----------
    obj: dict
        Default serialization for dataset metadata

    Returns
    -------
    vizier.datastore.annotation.dataset.DatasetMetadata
    """
    # Deserialize each of the three annotation lists and pass them to the
    # metadata constructor as keyword arguments.
    parts = {
        part: [ANNOTATION(a) for a in obj[part]]
        for part in ('columns', 'rows', 'cells')
    }
    return DatasetMetadata(**parts)
def test_query_annotations(self):
    """Test retrieving annotations via the datastore."""
    store = FileSystemDatastore(STORE_DIR)
    ds = store.create_dataset(
        columns=[
            DatasetColumn(identifier=0, name='A'),
            DatasetColumn(identifier=1, name='B')
        ],
        rows=[DatasetRow(identifier=0, values=['a', 'b'])],
        annotations=DatasetMetadata(
            cells=[
                DatasetAnnotation(column_id=c, row_id=r, key=k, value=v)
                for c, r, k, v in [
                    (0, 0, 'X', 1), (0, 0, 'X', 2),
                    (1, 0, 'X', 3), (0, 0, 'Y', 1)
                ]
            ],
            columns=[
                DatasetAnnotation(column_id=c, key='A', value='x')
                for c in (0, 1)
            ],
            rows=[DatasetAnnotation(row_id=0, key='E', value=100)]
        )
    )
    # Each case: (column_id, row_id, expected columns/rows/cells counts).
    cases = [
        (1, None, 1, 0, 0),
        (0, None, 1, 0, 0),
        (None, 0, 0, 1, 0),
        (1, 0, 0, 0, 1),
        (0, 0, 0, 0, 3)
    ]
    for col, row, n_cols, n_rows, n_cells in cases:
        annos = store.get_annotations(
            ds.identifier, column_id=col, row_id=row
        )
        self.assertEqual(len(annos.columns), n_cols)
        self.assertEqual(len(annos.rows), n_rows)
        self.assertEqual(len(annos.cells), n_cells)
def get_dataset(self, identifier):
    """Read a full dataset from the data store. Returns None if no dataset
    with the given identifier exists.

    Parameters
    ----------
    identifier : string
        Unique dataset identifier

    Returns
    -------
    vizier.datastore.mimir.dataset.MimirDatasetHandle
    """
    dataset_file = self.get_dataset_file(identifier)
    # Unknown dataset: no descriptor file on disk.
    if os.path.isfile(dataset_file):
        metadata = DatasetMetadata.from_file(
            self.get_metadata_filename(identifier)
        )
        return MimirDatasetHandle.from_file(
            dataset_file,
            annotations=metadata
        )
    return None
def get_dataset(self, identifier):
    """Read a full dataset from the data store. Returns None if no dataset
    with the given identifier exists.

    Parameters
    ----------
    identifier : string
        Unique dataset identifier

    Returns
    -------
    vizier.datastore.fs.dataset.FileSystemDatasetHandle
    """
    # Each dataset lives in its own subfolder; a missing folder means the
    # dataset is unknown.
    dataset_dir = self.get_dataset_dir(identifier)
    if not os.path.isdir(dataset_dir):
        return None
    descriptor_file = os.path.join(dataset_dir, DESCRIPTOR_FILE)
    data_file = os.path.join(dataset_dir, DATA_FILE)
    metadata = DatasetMetadata.from_file(
        self.get_metadata_filename(identifier)
    )
    return FileSystemDatasetHandle.from_file(
        descriptor_file=descriptor_file,
        data_file=data_file,
        annotations=metadata
    )
def __init__(self, dataset=None):
    """Initialize the client for a given dataset. Raises ValueError if
    dataset columns or rows do not have unique identifiers.

    Parameters
    ----------
    dataset: vizier.datastore.base.DatasetHandle, optional
        Handle to the dataset for which this is a client. If None this is a
        new dataset.
    """
    self.dataset = dataset
    # Fixed anti-idiom: 'x is not None' instead of 'not x is None' (PEP 8).
    if dataset is not None:
        self.identifier = dataset.identifier
        self.columns = dataset.columns
        # Delay fetching rows and dataset annotations until first access.
        self._annotations = None
        self._rows = None
    else:
        # New, empty dataset: start with no identifier, columns or rows.
        self.identifier = None
        self.columns = []
        self._annotations = DatasetMetadata()
        self._rows = []
print(line) store = DatastoreClient( urls=DatastoreClientUrlFactory( urls=UrlFactory( base_url='http://localhost:5000/vizier-db/api/v1' ), project_id=PROJECT_ID ) ) ds = store.create_dataset( columns=[DatasetColumn(identifier=0, name='Name'), DatasetColumn(identifier=1, name='Age')], rows=[DatasetRow(identifier=0, values=['Alice', 32]), DatasetRow(identifier=1, values=['Bob', 23])], annotations=DatasetMetadata(rows=[DatasetAnnotation(row_id=1, key='user:comment', value='Needs cleaning')]) ) print(ds) print([col.identifier for col in ds.columns]) print([col.name for col in ds.columns]) dh = store.get_dataset(ds.identifier) for row in dh.fetch_rows(): print([row.identifier] + row.values) annotations = dh.get_annotations() print_annotations(annotations) store.update_annotation( identifier=dh.identifier,
def update_annotation(self, identifier, key, old_value=None, new_value=None, column_id=None, row_id=None):
    """Update the annotations for a component of the dataset with the
    given identifier.

    The combination of old_value and new_value selects the operation:
    only new_value -> INSERT, only old_value -> DELETE, both -> UPDATE.
    Returns True on success, False if no matching annotation was found
    for a DELETE or UPDATE, and None if the dataset does not exist.

    Parameters
    ----------
    identifier : string
        Unique dataset identifier
    key: string
        Annotation key
    old_value: string, optional
        Previous annotation value when updating an existing annotation.
    new_value: string, optional
        Updated annotation value
    column_id: int, optional
        Unique column identifier
    row_id: int, optional
        Unique row identifier

    Returns
    -------
    bool

    Raises
    ------
    ValueError
        If both column_id and row_id are None, or if neither old_value nor
        new_value is given.
    """
    # Raise ValueError if column id and row id are both None
    if column_id is None and row_id is None:
        raise ValueError('invalid dataset resource identifier')
    # Return None if the dataset is unknown
    dataset_dir = self.get_dataset_dir(identifier)
    if not os.path.isdir(dataset_dir):
        return None
    # Read annotations from file, evaluate the update statement and write
    # the result back to file.
    metadata_filename = self.get_metadata_filename(identifier)
    annotations = DatasetMetadata.from_file(metadata_filename)
    # Select the annotation list for the referenced dataset component.
    if column_id is None:
        elements = annotations.rows
    elif row_id is None:
        elements = annotations.columns
    else:
        elements = annotations.cells
    # Identify the type of operation: INSERT, DELETE or UPDATE
    if old_value is None and new_value is not None:
        # INSERT a new annotation for the component.
        elements.append(
            DatasetAnnotation(
                key=key,
                value=new_value,
                column_id=column_id,
                row_id=row_id
            )
        )
    elif old_value is not None and new_value is None:
        # DELETE the first annotation matching component, key and value.
        del_index = None
        for i, a in enumerate(elements):
            if a.column_id == column_id and a.row_id == row_id:
                if a.key == key and a.value == old_value:
                    del_index = i
                    break
        if del_index is None:
            return False
        del elements[del_index]
    elif old_value is not None and new_value is not None:
        # UPDATE the first matching annotation in place.
        anno = None
        for a in elements:
            if a.column_id == column_id and a.row_id == row_id:
                if a.key == key and a.value == old_value:
                    anno = a
                    break
        if anno is None:
            return False
        anno.value = new_value
    else:
        # Neither old nor new value was given.
        raise ValueError('invalid modification operation')
    # Write modified annotations to file
    annotations.to_file(metadata_filename)
    return True
def test_update_annotations(self):
    """Test updating annotations via the datastore."""
    store = FileSystemDatastore(STORE_DIR)
    ds = store.create_dataset(
        columns=[
            DatasetColumn(identifier=0, name='A'),
            DatasetColumn(identifier=1, name='B')
        ],
        rows=[DatasetRow(identifier=0, values=['a', 'b'])],
        annotations=DatasetMetadata(
            cells=[
                DatasetAnnotation(column_id=c, row_id=r, key=k, value=v)
                for c, r, k, v in [
                    (0, 0, 'X', 1), (0, 0, 'X', 2),
                    (1, 0, 'X', 3), (0, 0, 'Y', 1)
                ]
            ],
            columns=[
                DatasetAnnotation(column_id=c, key='A', value='x')
                for c in (0, 1)
            ],
            rows=[DatasetAnnotation(row_id=0, key='E', value=100)]
        )
    )
    # INSERT a new row annotation.
    store.update_annotation(ds.identifier, key='D', row_id=0, new_value=200)
    annos = store.get_annotations(ds.identifier, row_id=0)
    self.assertEqual(len(annos.rows), 2)
    row_keys = [a.key for a in annos.rows]
    row_values = [a.value for a in annos.rows]
    for key in ['D', 'E']:
        self.assertTrue(key in row_keys)
    for val in [100, 200]:
        self.assertTrue(val in row_values)
    # UPDATE an existing column annotation.
    store.update_annotation(
        ds.identifier,
        key='A',
        column_id=1,
        old_value='x',
        new_value='y'
    )
    annos = store.get_annotations(ds.identifier, column_id=1)
    self.assertEqual(annos.columns[0].key, 'A')
    self.assertEqual(annos.columns[0].value, 'y')
    # DELETE one cell annotation; two remain for the cell.
    store.update_annotation(
        ds.identifier,
        key='X',
        column_id=0,
        row_id=0,
        old_value=2
    )
    annos = store.get_annotations(ds.identifier, column_id=0, row_id=0)
    self.assertEqual(len(annos.cells), 2)
    for a in annos.cells:
        self.assertNotEqual(a.value, 2)
    # DELETE the only annotation for cell (1, 0).
    result = store.update_annotation(
        ds.identifier,
        key='X',
        column_id=1,
        row_id=0,
        old_value=3
    )
    self.assertTrue(result)
    annos = store.get_annotations(ds.identifier, column_id=1, row_id=0)
    self.assertEqual(len(annos.cells), 0)
def test_add_and_delete_metadata(self):
    """Test functionality to add and delete annotations."""
    annotations = DatasetMetadata()
    # Column annotations (add() allows duplicate entries).
    for col, key, value in [
        (0, 'A', 0), (0, 'A', 1), (0, 'A', 0), (1, 'A', 0), (1, 'A', 1)
    ]:
        annotations.add(column_id=col, key=key, value=value)
    self.assertEqual(len(annotations.columns), 5)
    annotations.remove(column_id=0, value=1)
    self.assertEqual(len(annotations.columns), 4)
    annos = annotations.for_column(column_id=0)
    self.assertEqual(len(annos), 2)
    for a in annos:
        self.assertEqual(a.key, 'A')
        self.assertEqual(a.value, 0)
    # Row annotations.
    for row, key, value in [
        (0, 'A', 0), (0, 'B', 1), (0, 'A', 0), (1, 'A', 0), (1, 'A', 1)
    ]:
        annotations.add(row_id=row, key=key, value=value)
    self.assertEqual(len(annotations.rows), 5)
    annotations.remove(row_id=0, key='A')
    self.assertEqual(len(annotations.rows), 3)
    # Removing row annotations must leave column annotations untouched.
    self.assertEqual(len(annotations.columns), 4)
    annos = annotations.for_row(row_id=0)
    self.assertEqual(len(annos), 1)
    self.assertEqual(annos[0].key, 'B')
    self.assertEqual(annos[0].value, 1)
    # Cell annotations.
    for col, row, key, value in [
        (0, 0, 'A', 0), (1, 0, 'B', 1), (1, 0, 'A', 0),
        (1, 1, 'A', 0), (1, 0, 'A', 1)
    ]:
        annotations.add(column_id=col, row_id=row, key=key, value=value)
    self.assertEqual(len(annotations.cells), 5)
    annotations.remove(row_id=0, column_id=1)
    self.assertEqual(len(annotations.cells), 2)