    def test_annotations(self):
        """Test creating and loading a dataset with cell, column, and row
        annotations.
        """
        store = FileSystemDatastore(STORE_DIR)
        ds = store.create_dataset(
            columns=[
                DatasetColumn(identifier=0, name='A'),
                DatasetColumn(identifier=1, name='B')
            ],
            rows=[DatasetRow(identifier=0, values=['a', 'b'])],
            annotations=DatasetMetadata(
                cells=[
                    DatasetAnnotation(column_id=0, row_id=0, key='X', value=1),
                    DatasetAnnotation(column_id=0, row_id=0, key='X', value=2),
                    DatasetAnnotation(column_id=1, row_id=0, key='X', value=3),
                    DatasetAnnotation(column_id=1, row_id=1, key='X', value=3),
                    DatasetAnnotation(column_id=0, row_id=0, key='Y', value=1)
                ],
                columns=[
                    DatasetAnnotation(column_id=0, key='A', value='x'),
                    DatasetAnnotation(column_id=2, key='A', value='x')
                ],
                rows=[
                    DatasetAnnotation(row_id=0, key='E', value=100)
                ]
            )
        )
        ds = store.get_dataset(ds.identifier)
        # Annotations that reference a non-existing row (row_id=1) or column
        # (column_id=2) are dropped, leaving four cell annotations and one
        # column annotation.
        self.assertEqual(len(ds.annotations.cells), 4)
        self.assertEqual(len(ds.annotations.columns), 1)
        self.assertEqual(len(ds.annotations.rows), 1)
        annos = ds.annotations.for_cell(column_id=0, row_id=0)
        self.assertEqual(len(annos), 3)
        self.assertTrue(1 in [a.value for a in annos])
        self.assertTrue(2 in [a.value for a in annos])
        self.assertFalse(3 in [a.value for a in annos])
        self.assertEqual(len(ds.annotations.find_all(values=annos, key='X')), 2)
        # find_one raises an error if multiple annotations match the key
        with self.assertRaises(ValueError):
            ds.annotations.find_one(values=annos, key='X')
        self.assertEqual(len(ds.annotations.for_column(column_id=0)), 1)
        self.assertEqual(len(ds.annotations.for_row(row_id=0)), 1)
        annotations = ds.annotations.filter(columns=[1])
        self.assertEqual(len(annotations.cells), 1)
        self.assertEqual(len(annotations.columns), 0)
        self.assertEqual(len(annotations.rows), 1)
        # Reload the datastore to ensure that annotations were persisted
        store = FileSystemDatastore(STORE_DIR)
        ds = store.get_dataset(ds.identifier)
        self.assertEqual(len(ds.annotations.cells), 4)
        self.assertEqual(len(ds.annotations.columns), 1)
        self.assertEqual(len(ds.annotations.rows), 1)

    def test_properties(self):
        """Test creating and loading a dataset with properties."""
        store = FileSystemDatastore(STORE_DIR)
        ds = store.create_dataset(
            columns=[
                DatasetColumn(identifier=0, name='A'),
                DatasetColumn(identifier=1, name='B')
            ],
            rows=[DatasetRow(identifier=0, values=[1, 2])],
            properties=EXAMPLE_PROPERTIES
        )
        ds = store.get_dataset(ds.identifier)
        column_props = ds.properties['columns']
        self.assertEqual(len(column_props), 2)
        self.assertTrue('A' in [prop['name'] for prop in column_props])
        # Reload the datastore to ensure that properties were persisted
        store = FileSystemDatastore(STORE_DIR)
        ds = store.get_dataset(ds.identifier)
        column_props = ds.properties['columns']
        self.assertEqual(len(column_props), 2)
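
    # EXAMPLE_PROPERTIES is a module-level fixture that is not shown in this
    # section. A minimal sketch that would satisfy the assertions in
    # test_properties above (any key other than 'name' would be an
    # assumption on our part) looks like this:
    #
    #     EXAMPLE_PROPERTIES = {
    #         'columns': [
    #             {'name': 'A'},
    #             {'name': 'B'}
    #         ]
    #     }
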
    def test_get_dataset(self):
        """Test accessing dataset handle and descriptor."""
        # None for non-existing dataset
        store = FileSystemDatastore(STORE_DIR)
        self.assertIsNone(store.get_dataset('0000'))
        ds_id = store.load_dataset(f_handle=FILE).identifier
        self.assertIsNotNone(store.get_dataset(ds_id))
        self.assertIsNone(store.get_dataset('0000'))
        # Reload store to ensure the dataset still exists
        store = FileSystemDatastore(STORE_DIR)
        self.assertIsNotNone(store.get_dataset(ds_id))
        self.assertIsNone(store.get_dataset('0000'))
        self.validate_class_size_dataset(store.get_dataset(ds_id))
        # Load a second dataset
        ds_id_2 = store.load_dataset(f_handle=FILE).identifier
        self.assertIsNotNone(store.get_dataset(ds_id))
        self.assertIsNotNone(store.get_dataset(ds_id_2))
        # Reload store to ensure that both datasets still exist
        store = FileSystemDatastore(STORE_DIR)
        self.assertIsNotNone(store.get_dataset(ds_id))
        self.assertIsNotNone(store.get_dataset(ds_id_2))
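
    # FILE is another module-level fixture that is not shown in this section.
    # The validate_class_size_dataset() helper, also defined outside this
    # section, suggests that FILE is the handle of a small class-size CSV
    # file shared by the datastore tests; its exact contents are not
    # reproduced here.
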
    def test_create_dataset(self):
        """Test creating a dataset from a given list of columns and rows."""
        store = FileSystemDatastore(STORE_DIR)
        ds = store.create_dataset(
            columns=[
                DatasetColumn(identifier=0, name='A'),
                DatasetColumn(identifier=1, name='B')
            ],
            rows=[DatasetRow(identifier=0, values=['a', 'b'])]
        )
        ds = store.get_dataset(ds.identifier)
        column_ids = [col.identifier for col in ds.columns]
        self.assertEqual(len(ds.columns), 2)
        for id in [0, 1]:
            self.assertTrue(id in column_ids)
        column_names = [col.name for col in ds.columns]
        for name in ['A', 'B']:
            self.assertTrue(name in column_names)
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0].values, ['a', 'b'])
        self.assertEqual(len(ds.annotations.cells), 0)
        self.assertEqual(len(ds.annotations.columns), 0)
        self.assertEqual(len(ds.annotations.rows), 0)
        # Reload the datastore and repeat all checks
        store = FileSystemDatastore(STORE_DIR)
        ds = store.get_dataset(ds.identifier)
        column_ids = [col.identifier for col in ds.columns]
        self.assertEqual(len(ds.columns), 2)
        for id in [0, 1]:
            self.assertTrue(id in column_ids)
        column_names = [col.name for col in ds.columns]
        for name in ['A', 'B']:
            self.assertTrue(name in column_names)
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0].values, ['a', 'b'])
        self.assertEqual(len(ds.annotations.cells), 0)
        self.assertEqual(len(ds.annotations.columns), 0)
        self.assertEqual(len(ds.annotations.rows), 0)

    def test_delete_dataset(self):
        """Test deleting datasets."""
        store = FileSystemDatastore(STORE_DIR)
        ds_id = store.load_dataset(f_handle=FILE).identifier
        ds_id_2 = store.load_dataset(f_handle=FILE).identifier
        self.assertIsNotNone(store.get_dataset(ds_id))
        self.assertIsNotNone(store.get_dataset(ds_id_2))
        # Delete the first dataset
        store.delete_dataset(ds_id)
        self.assertIsNone(store.get_dataset(ds_id))
        self.assertIsNotNone(store.get_dataset(ds_id_2))
        # Reload store to ensure that only one dataset still exists
        store = FileSystemDatastore(STORE_DIR)
        self.assertIsNone(store.get_dataset(ds_id))
        self.assertIsNotNone(store.get_dataset(ds_id_2))
        # Delete the second dataset
        store.delete_dataset(ds_id_2)
        store = FileSystemDatastore(STORE_DIR)
        self.assertIsNone(store.get_dataset(ds_id))
        self.assertIsNone(store.get_dataset(ds_id_2))
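
# The processor and API tests below upload CSV_FILE, a module-level fixture
# path that is defined outside this section. From the assertions that follow,
# the file must contain a small people dataset of this shape (reconstructed
# from the tests, not copied from the fixture itself):
#
#     Name,Age,Salary
#     Alice,23,35K
#     Bob,32,30K
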
class TestDefaultVizualProcessor(unittest.TestCase):
    def setUp(self):
        """Create an instance of the default vizual processor for an empty
        server directory.
        """
        # Drop the directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.processor = VizualTaskProcessor(api=DefaultVizualApi())
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_create_api_from_dictionary(self):
        """Test creating the processor instance with the properties parameter
        instead of the api parameter.
        """
        processor = VizualTaskProcessor(
            properties={
                PROPERTY_API: ClassLoader.to_dict(
                    module_name='vizier.engine.packages.vizual.api.fs',
                    class_name='DefaultVizualApi'
                )
            }
        )
        fh = self.filestore.upload_file(CSV_FILE)
        cmd = vizual.load_dataset(
            dataset_name=DATASET_NAME,
            file={pckg.FILE_ID: fh.identifier},
            validate=True
        )
        result = processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={}
            )
        )
        self.assertIsNotNone(result.provenance.write)
        self.assertTrue(DATASET_NAME in result.provenance.write)
        dataset_id = result.provenance.write[DATASET_NAME].identifier
        self.assertTrue(
            result.provenance.read is None or len(result.provenance.read) == 0
        )
        self.assertIsNotNone(result.provenance.resources)
        self.assertEqual(result.provenance.resources[RESOURCE_DATASET], dataset_id)

    def load_dataset(self):
        """Load a single dataset and return the resulting database state."""
        fh = self.filestore.upload_file(CSV_FILE)
        cmd = vizual.load_dataset(
            dataset_name=DATASET_NAME,
            file={pckg.FILE_ID: fh.identifier},
            validate=True
        )
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={}
            )
        )
        return result.provenance.write

    def test_delete_column(self):
        """Test functionality to delete a column."""
        cmd = vizual.delete_column(
            dataset_name=DATASET_NAME,
            column=1,
            validate=True
        )
        self.validate_command(cmd)

    def test_delete_row(self):
        """Test functionality to delete a row."""
        cmd = vizual.delete_row(dataset_name=DATASET_NAME, row=1, validate=True)
        self.validate_command(cmd)

    def test_drop_dataset(self):
        """Test functionality to drop a dataset."""
        cmd = vizual.drop_dataset(dataset_name=DATASET_NAME, validate=True)
        datasets = self.load_dataset()
        dataset_id = datasets[DATASET_NAME]
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={DATASET_NAME: dataset_id}
            )
        )
        self.assertFalse(DATASET_NAME in result.provenance.read)
        self.assertTrue(DATASET_NAME in result.provenance.delete)
        self.assertFalse(DATASET_NAME in result.provenance.write)

    def test_filter_columns(self):
        """Test projection of a dataset."""
        cmd = vizual.projection(
            dataset_name=DATASET_NAME,
            columns=[
                {'column': 1},
                {'column': 2, 'name': 'MyName'}
            ],
            validate=True
        )
        self.validate_command(cmd)

    def test_insert_column(self):
        """Test functionality to insert a column."""
        cmd = vizual.insert_column(
            dataset_name=DATASET_NAME,
            position=1,
            name='My Col',
            validate=True
        )
        self.validate_command(cmd)

    def test_insert_row(self):
        """Test functionality to insert a row."""
        cmd = vizual.insert_row(
            dataset_name=DATASET_NAME,
            position=1,
            validate=True
        )
        self.validate_command(cmd)

    def test_load_dataset(self):
        """Test functionality to load a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        cmd = vizual.load_dataset(
            dataset_name='ABC',
            file={pckg.FILE_ID: fh.identifier},
            validate=True
        )
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={}
            )
        )
        self.assertIsNotNone(result.provenance.write)
        # Dataset names are normalized to lower case
        self.assertTrue('abc' in result.provenance.write)
        dataset_id = result.provenance.write['abc'].identifier
        self.assertTrue(
            result.provenance.read is None or len(result.provenance.read) == 0
        )
        self.assertIsNotNone(result.provenance.resources)
        self.assertEqual(result.provenance.resources[RESOURCE_DATASET], dataset_id)
        # Running load again will not change the dataset identifier
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={},
                resources=result.provenance.resources
            )
        )
        self.assertEqual(result.provenance.write['abc'].identifier, dataset_id)
        self.assertEqual(result.provenance.resources[RESOURCE_DATASET], dataset_id)

    def test_move_column(self):
        """Test functionality to move a column."""
        cmd = vizual.move_column(
            dataset_name=DATASET_NAME,
            column=0,
            position=1,
            validate=True
        )
        self.validate_command(cmd)

    def test_move_row(self):
        """Test functionality to move a row."""
        cmd = vizual.move_row(
            dataset_name=DATASET_NAME,
            row=0,
            position=1,
            validate=True
        )
        self.validate_command(cmd)

    def test_rename_column(self):
        """Test functionality to rename a column."""
        cmd = vizual.rename_column(
            dataset_name=DATASET_NAME,
            column=1,
            name='The col',
            validate=True
        )
        self.validate_command(cmd)

    def test_rename_dataset(self):
        """Test functionality to rename a dataset."""
        cmd = vizual.rename_dataset(
            dataset_name=DATASET_NAME,
            new_name='XYZ',
            validate=True
        )
        datasets = self.load_dataset()
        dataset_id = datasets[DATASET_NAME]
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={DATASET_NAME: dataset_id}
            )
        )
        # The old name is deleted; the new (lower-cased) name is written
        self.assertFalse(DATASET_NAME in result.provenance.read)
        self.assertTrue(DATASET_NAME in result.provenance.delete)
        self.assertFalse(DATASET_NAME in result.provenance.write)
        self.assertTrue('xyz' in result.provenance.write)

    def test_sort_dataset(self):
        """Test sorting a dataset."""
        cmd = vizual.sort_dataset(
            dataset_name=DATASET_NAME,
            columns=[
                {'column': 1, 'order': 'Z-A'},
                {'column': 2, 'order': 'A-Z'}
            ],
            validate=True
        )
        self.validate_command(cmd)

    def test_update_cell(self):
        """Test functionality to update a dataset cell."""
        # Load a dataset to get a valid row identifier
        datasets = self.load_dataset()
        dataset = self.datastore.get_dataset(datasets[DATASET_NAME].identifier)
        row_ids = [row.identifier for row in dataset.fetch_rows()]
        cmd = vizual.update_cell(
            dataset_name=DATASET_NAME,
            column=1,
            row=row_ids[0],
            value=9,
            validate=True
        )
        self.validate_command(cmd, dataset=dataset)

    def validate_command(self, cmd, dataset=None):
        """Validate execution of the given command."""
        if dataset is None:
            datasets = self.load_dataset()
            dataset = datasets[DATASET_NAME]
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={DATASET_NAME: dataset}
            )
        )
        # The command produces a new version of the dataset
        self.assertNotEqual(
            result.provenance.write[DATASET_NAME].identifier,
            dataset.identifier
        )
        self.assertIsNotNone(result.provenance.read)
        self.assertEqual(result.provenance.read[DATASET_NAME], dataset.identifier)
        self.assertIsNotNone(result.provenance.write)
        # Re-running the command without the dataset in the context raises an
        # error
        with self.assertRaises(ValueError):
            self.processor.compute(
                command_id=cmd.command_id,
                arguments=cmd.arguments,
                context=TaskContext(
                    project_id=5,
                    datastore=self.datastore,
                    filestore=self.filestore,
                    artifacts={}
                )
            )
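
# For reference, ClassLoader.to_dict(...) in test_create_api_from_dictionary
# above serializes the module and class name into a plain dictionary that
# VizualTaskProcessor can use to instantiate the API lazily. A sketch of the
# resulting configuration (the exact dictionary keys are an implementation
# detail of ClassLoader and an assumption here):
#
#     {
#         PROPERTY_API: {
#             'moduleName': 'vizier.engine.packages.vizual.api.fs',
#             'className': 'DefaultVizualApi'
#         }
#     }
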
class TestDefaultVizualApi(unittest.TestCase):
    def setUp(self):
        """Create an instance of the default vizual API for an empty server
        directory.
        """
        # Drop the directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.api = DefaultVizualApi()
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_delete_column(self):
        """Test functionality to delete a column."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier
        ).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Delete the Age column
        col_id = ds.column_by_name('AGE').identifier
        result = self.api.delete_column(ds.identifier, col_id, self.datastore)
        # Resulting dataset should differ from the previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve the modified dataset and ensure that it contains the
        # following data:
        #
        # Name, Salary
        # ------------
        # Alice, 35K
        # Bob, 30K
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Schema is Name, Salary
        self.assertEqual(len(ds.columns), 2)
        self.assertEqual(ds.columns[0].name.upper(), 'NAME')
        self.assertEqual(ds.columns[1].name.upper(), 'SALARY')
        # Make sure the remaining column identifiers haven't changed
        del col_ids[1]
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].identifier, col_ids[i])
        # Make sure that all rows only have two values
        row = ds_rows[0]
        self.assertEqual(len(row.values), 2)
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(row.values[1], '35K')
        row = ds_rows[1]
        self.assertEqual(len(row.values), 2)
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(row.values[1], '30K')
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Ensure an exception is thrown if the dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.api.delete_column('unknown:uri', 0, self.datastore)
        # Ensure an exception is thrown if the column identifier is unknown
        # (col_id was already deleted from the current dataset version)
        with self.assertRaises(ValueError):
            self.api.delete_column(ds.identifier, col_id, self.datastore)

    def test_delete_row(self):
        """Test functionality to delete a row."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier
        ).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Delete the second row
        result = self.api.delete_row(ds.identifier, 1, self.datastore)
        del row_ids[1]
        # Resulting dataset should differ from the previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve the modified dataset and ensure that it contains the
        # following data:
        #
        # Name, Age, Salary
        # -----------------
        # Alice, 23, 35K
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Schema is still Name, Age, Salary
        col_names = ['Name', 'Age', 'Salary']
        self.assertEqual(len(ds.columns), len(col_names))
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].name.upper(), col_names[i].upper())
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].identifier, col_ids[i])
        # There should only be one row
        self.assertEqual(len(ds_rows), 1)
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Ensure an exception is thrown if the dataset is unknown
        with self.assertRaises(ValueError):
            self.api.delete_row('unknown:uri', 0, self.datastore)
        # Ensure an exception is thrown if the row index is out of bounds
        with self.assertRaises(ValueError):
            self.api.delete_row(ds.identifier, 100, self.datastore)

    def test_filter_columns(self):
        """Test projection of a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier
        ).dataset
        # Project on columns 2 and 0; rename column 2 to 'BD'
        result = self.api.filter_columns(ds.identifier, [2, 0], ['BD', None],
                                         self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        self.assertEqual(len(ds.columns), 2)
        self.assertEqual(ds.columns[0].identifier, 2)
        self.assertEqual(ds.columns[0].name.upper(), 'BD')
        self.assertEqual(ds.columns[1].identifier, 0)
        self.assertEqual(ds.columns[1].name.upper(), 'NAME')
        rows = ds.fetch_rows()
        self.assertEqual(rows[0].values, ['35K', 'Alice'])
        self.assertEqual(rows[1].values, ['30K', 'Bob'])
        # Column 1 no longer exists in the projected dataset
        with self.assertRaises(ValueError):
            self.api.filter_columns(ds.identifier, [0, 1], ['BD', None],
                                    self.datastore)

    def test_insert_column(self):
        """Test functionality to insert a column."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier
        ).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Insert a column at position 1
        col_ids.insert(1, ds.max_column_id() + 1)
        result = self.api.insert_column(ds.identifier, 1, 'Height',
                                        self.datastore)
        # Resulting dataset should differ from the previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve the dataset and ensure that it has the following schema:
        # Name, Height, Age, Salary
        ds = self.datastore.get_dataset(result.dataset.identifier)
        col_names = ['Name', 'Height', 'Age', 'Salary']
        # Ensure that there are four columns
        self.assertEqual(len(ds.columns), len(col_names))
        for i in range(len(col_names)):
            col = ds.columns[i]
            self.assertEqual(col.identifier, col_ids[i])
            self.assertEqual(col.name.upper(), col_names[i].upper())
        # Insert a column at the last position
        col_ids.append(ds.max_column_id() + 1)
        col_names.append('Weight')
        result = self.api.insert_column(ds.identifier, 4, 'Weight',
                                        self.datastore)
        # Resulting dataset should differ from the previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve the dataset and ensure that it has the following schema:
        # Name, Height, Age, Salary, Weight
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Ensure that there are five columns
        self.assertEqual(len(ds.columns), len(col_names))
        for i in range(len(col_names)):
            col = ds.columns[i]
            self.assertEqual(col.identifier, col_ids[i])
            self.assertEqual(col.name.upper(), col_names[i].upper())
        # The cell values for the new columns are None; all other values are
        # not None
        for row in ds_rows:
            for i in range(len(ds.columns)):
                if i == 1 or i == 4:
                    self.assertIsNone(row.values[i])
                else:
                    self.assertTrue(row.values[i])
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Ensure an exception is thrown if the dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.api.insert_column('unknown:uri', 1, 'Height', self.datastore)
        # Ensure an exception is thrown if the column name is invalid
        self.api.insert_column(ds.identifier, 1, 'Height from ground',
                               self.datastore)
        with self.assertRaises(ValueError):
            self.api.insert_column(ds.identifier, 1, 'Height from ground!@#',
                                   self.datastore)
        # Ensure an exception is thrown if the column position is out of
        # bounds
        with self.assertRaises(ValueError):
            self.api.insert_column(ds.identifier, 100, 'Height',
                                   self.datastore)

    def test_insert_row(self):
        """Test functionality to insert a row."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier
        ).dataset
        # Keep track of column and row identifiers
        ds_rows = ds.fetch_rows()
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Insert a row at index position 1. The identifier of the resulting
        # dataset should differ from the identifier of the original dataset.
        row_ids.insert(1, ds.max_row_id() + 1)
        result = self.api.insert_row(ds.identifier, 1, self.datastore)
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        # Retrieve the modified dataset
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Ensure that there are three rows
        self.assertEqual(len(ds_rows), 3)
        # The second row has empty values for each column
        row = ds_rows[1]
        self.assertEqual(len(row.values), len(ds.columns))
        for i in range(len(ds.columns)):
            self.assertIsNone(row.values[i])
        # Append a row at the end of the current dataset
        row_ids.append(ds.max_row_id() + 1)
        result = self.api.insert_row(ds.identifier, 3, self.datastore)
        # Resulting dataset should differ from the previous one
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        # Ensure that there are four rows
        self.assertEqual(len(ds_rows), 4)
        # The next to last row has non-empty values for each column
        row = ds_rows[2]
        self.assertEqual(len(row.values), len(ds.columns))
        for i in range(len(ds.columns)):
            self.assertIsNotNone(row.values[i])
        # The last row has empty values for each column
        row = ds_rows[3]
        self.assertEqual(len(row.values), len(ds.columns))
        for i in range(len(ds.columns)):
            self.assertIsNone(row.values[i])
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, str(row_ids[i]))
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].identifier, col_ids[i])
        # Ensure an exception is thrown if the dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.api.insert_row('unknown:uri', 1, self.datastore)
        # Ensure an exception is thrown if the row index is out of bounds
        with self.assertRaises(ValueError):
            self.api.insert_row(ds.identifier, 5, self.datastore)
        # Appending at the current end of the dataset raises no exception
        self.api.insert_row(ds.identifier, 4, self.datastore)

    def test_load_dataset(self):
        """Test functionality to load a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        result = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier
        )
        ds = result.dataset
        resources = result.resources
        ds_rows = ds.fetch_rows()
        self.assertEqual(len(ds.columns), 3)
        self.assertEqual(len(ds_rows), 2)
        for row in ds_rows:
            self.assertTrue(isinstance(row.values[1], int))
        self.assertIsNotNone(resources)
        self.assertEqual(resources[RESOURCE_FILEID], fh.identifier)
        self.assertEqual(resources[RESOURCE_DATASET], ds.identifier)
        # Delete the file handle and run load_dataset again with the returned
        # resource information. This should not raise an error since the file
        # is not accessed but the previous dataset is reused.
        self.filestore.delete_file(fh.identifier)
        result = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier,
            resources=resources
        )
        self.assertEqual(result.dataset.identifier, ds.identifier)
        # Doing the same without the resources should raise an exception
        with self.assertRaises(ValueError):
            self.api.load_dataset(
                datastore=self.datastore,
                filestore=self.filestore,
                file_id=fh.identifier
            )
        # Ensure an exception is thrown if the file identifier is unknown
        with self.assertRaises(ValueError):
            self.api.load_dataset(
                datastore=self.datastore,
                filestore=self.filestore,
                file_id='unknown:uri'
            )
        # Test loading a file from an external resource. Skip the test if
        # DOWNLOAD_URL is None.
        if DOWNLOAD_URL is None:
            print('Skipping download test')
            return
        result = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            url=DOWNLOAD_URL
        )
        ds = result.dataset
        resources = result.resources
        ds_rows = ds.fetch_rows()
        self.assertEqual(len(ds.columns), 4)
        self.assertEqual(len(ds_rows), 54)
        self.assertIsNotNone(resources)
        self.assertEqual(resources[RESOURCE_URL], DOWNLOAD_URL)
        self.assertEqual(resources[RESOURCE_DATASET], ds.identifier)
        # Attempt to simulate re-running without downloading again. Set the
        # URL to some fake URL that would raise an exception if an attempt
        # was made to download; the previous dataset should be reused.
        url = 'some fake uri'
        resources[RESOURCE_URL] = url
        prev_id = ds.identifier
        result = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            url=url,
            resources=resources
        )
        self.assertEqual(result.dataset.identifier, prev_id)
        # If we re-run with the reload flag set to True a new dataset should
        # be returned
        resources[RESOURCE_URL] = DOWNLOAD_URL
        result = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            url=DOWNLOAD_URL,
            resources=resources,
            reload=True
        )
        self.assertNotEqual(result.dataset.identifier, prev_id)

    def test_move_column(self):
        """Test functionality to move a column."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier
        ).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Swap the first two columns
        c = col_ids[0]
        del col_ids[0]
        col_ids.insert(1, c)
        result = self.api.move_column(
            ds.identifier,
            ds.column_by_name('Name').identifier,
            1,
            self.datastore
        )
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'AGE')
        self.assertEqual(ds.columns[1].name.upper(), 'NAME')
        self.assertEqual(ds.columns[2].name.upper(), 'SALARY')
        row = ds_rows[0]
        self.assertEqual(row.values[0], 23)
        self.assertEqual(row.values[1], 'Alice')
        self.assertEqual(row.values[2], '35K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 32)
        self.assertEqual(row.values[1], 'Bob')
        self.assertEqual(row.values[2], '30K')
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].identifier, col_ids[i])
        # Swap the last two columns
        c = col_ids[1]
        del col_ids[1]
        col_ids.append(c)
        result = self.api.move_column(
            ds.identifier,
            ds.column_by_name('Salary').identifier,
            1,
            self.datastore
        )
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'AGE')
        self.assertEqual(ds.columns[1].name.upper(), 'SALARY')
        self.assertEqual(ds.columns[2].name.upper(), 'NAME')
        row = ds_rows[0]
        self.assertEqual(row.values[0], 23)
        self.assertEqual(row.values[1], '35K')
        self.assertEqual(row.values[2], 'Alice')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 32)
        self.assertEqual(row.values[1], '30K')
        self.assertEqual(row.values[2], 'Bob')
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].identifier, col_ids[i])
        # No changes if the source and target positions are the same
        result = self.api.move_column(
            ds.identifier,
            ds.columns[1].identifier,
            1,
            self.datastore
        )
        self.assertEqual(ds.identifier, result.dataset.identifier)
        # Ensure an exception is thrown if the dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.api.move_column('unknown:uri', 0, 1, self.datastore)
        # Raise an error if the source column is out of bounds
        with self.assertRaises(ValueError):
            self.api.move_column(ds.identifier, 40, 1, self.datastore)
        # Raise an error if the target position is out of bounds
        with self.assertRaises(ValueError):
            self.api.move_column(
                ds.identifier,
                ds.column_by_name('Name').identifier,
                -1,
                self.datastore
            )
        with self.assertRaises(ValueError):
            self.api.move_column(
                ds.identifier,
                ds.column_by_name('Name').identifier,
                4,
                self.datastore
            )

    def test_move_row(self):
        """Test functionality to move a row."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier
        ).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Swap the first two rows
        row_ids = [row for row in reversed(row_ids)]
        result = self.api.move_row(ds.identifier, 0, 1, self.datastore)
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'NAME')
        self.assertEqual(ds.columns[1].name.upper(), 'AGE')
        self.assertEqual(ds.columns[2].name.upper(), 'SALARY')
        row = ds_rows[0]
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(row.values[1], 32)
        self.assertEqual(row.values[2], '30K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(row.values[1], 23)
        self.assertEqual(row.values[2], '35K')
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].identifier, col_ids[i])
        # Swap the last two rows
        row_ids = [row for row in reversed(row_ids)]
        result = self.api.move_row(ds.identifier, 1, 0, self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'NAME')
        self.assertEqual(ds.columns[1].name.upper(), 'AGE')
        self.assertEqual(ds.columns[2].name.upper(), 'SALARY')
        row = ds_rows[0]
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(row.values[1], 23)
        self.assertEqual(row.values[2], '35K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(row.values[1], 32)
        self.assertEqual(row.values[2], '30K')
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].identifier, col_ids[i])
        # Move the first row to the end
        result = self.api.move_row(ds.identifier, 0, 2, self.datastore)
        row_ids = [row for row in reversed(row_ids)]
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        row = ds_rows[0]
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(row.values[1], 32)
        self.assertEqual(row.values[2], '30K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(row.values[1], 23)
        self.assertEqual(row.values[2], '35K')
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].identifier, col_ids[i])
        # No changes if the source and target positions are the same
        result = self.api.move_row(ds.identifier, 1, 1, self.datastore)
        self.assertEqual(ds.identifier, result.dataset.identifier)
        # Ensure an exception is thrown if the dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.api.move_row('unknown:uri', 0, 1, self.datastore)
        # Raise an error if the source row is out of bounds
        with self.assertRaises(ValueError):
            self.api.move_row(ds.identifier, 3, 1, self.datastore)
        # Raise an error if the target position is out of bounds
        with self.assertRaises(ValueError):
            self.api.move_row(ds.identifier, 0, -1, self.datastore)
        with self.assertRaises(ValueError):
            self.api.move_row(ds.identifier, 1, 4, self.datastore)

    def test_rename_column(self):
        """Test functionality to rename a column."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier
        ).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Rename the first column to Firstname
        result = self.api.rename_column(
            ds.identifier,
            ds.column_by_name('Name').identifier,
            'Firstname',
            self.datastore
        )
        self.assertNotEqual(result.dataset.identifier, ds.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        self.assertEqual(ds.columns[0].name.upper(), 'FIRSTNAME')
        self.assertEqual(ds.columns[1].name.upper(), 'AGE')
        self.assertEqual(ds.columns[2].name.upper(), 'SALARY')
        result = self.api.rename_column(
            ds.identifier,
            ds.column_by_name('Age').identifier,
            'BDate',
            self.datastore
        )
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds.columns[0].name.upper(), 'FIRSTNAME')
        self.assertEqual(ds.columns[1].name, 'BDate')
        self.assertEqual(ds.columns[2].name.upper(), 'SALARY')
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].identifier, col_ids[i])
        # No changes if the old and new column name are the same (except for
        # upper and lower case)
        result = self.api.rename_column(
            ds.identifier,
            ds.column_by_name('BDate').identifier,
            'BDate',
            self.datastore
        )
        self.assertEqual(ds.identifier, result.dataset.identifier)
        # Ensure an exception is thrown if the dataset identifier is unknown
        with self.assertRaises(ValueError):
            self.api.rename_column('unknown:uri', 0, 'Firstname',
                                   self.datastore)
        # Ensure an exception is thrown for an invalid column identifier
        with self.assertRaises(ValueError):
            self.api.rename_column(ds.identifier, 500, 'BDate',
                                   self.datastore)

    def test_sequence_of_steps(self):
        """Test sequence of calls that modify a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier
        ).dataset
        ds = self.api.insert_row(ds.identifier, 1, self.datastore).dataset
        ds = self.api.insert_column(ds.identifier, 3, 'HDate',
                                    self.datastore).dataset
        ds = self.api.update_cell(
            ds.identifier, ds.column_by_name('HDate').identifier, 0, '180',
            self.datastore
        ).dataset
        ds = self.api.update_cell(
            ds.identifier, ds.column_by_name('HDate').identifier, 2, '160',
            self.datastore
        ).dataset
        ds = self.api.rename_column(
            ds.identifier, ds.column_by_name('HDate').identifier, 'Height',
            self.datastore
        ).dataset
        ds = self.api.update_cell(
            ds.identifier, ds.column_by_name('Height').identifier, 1, '170',
            self.datastore
        ).dataset
        ds = self.api.move_row(ds.identifier, 1, 2, self.datastore).dataset
        ds = self.api.update_cell(
            ds.identifier, ds.column_by_name('Name').identifier, 2, 'Carla',
            self.datastore
        ).dataset
        ds = self.api.update_cell(
            ds.identifier, ds.column_by_name('Age').identifier, 2, '45',
            self.datastore
        ).dataset
        ds = self.api.update_cell(
            ds.identifier, ds.column_by_name('Salary').identifier, 2, '56K',
            self.datastore
        ).dataset
        ds = self.api.move_column(
            ds.identifier, ds.column_by_name('Salary').identifier, 4,
            self.datastore
        ).dataset
        ds = self.api.delete_column(
            ds.identifier, ds.column_by_name('Age').identifier,
            self.datastore
        ).dataset
        ds = self.api.delete_row(ds.identifier, 0, self.datastore).dataset
        ds = self.api.delete_row(ds.identifier, 0, self.datastore).dataset
        ds = self.datastore.get_dataset(ds.identifier)
        ds_rows = ds.fetch_rows()
        names = ['Name', 'Height', 'Salary']
        self.assertEqual(len(ds.columns), len(names))
        for i in range(len(names)):
            col = ds.columns[i]
            self.assertEqual(col.name.upper(), names[i].upper())
        self.assertEqual([col.identifier for col in ds.columns], [0, 3, 2])
        self.assertEqual(len(ds_rows), 1)
        self.assertEqual(ds_rows[0].values, ['Carla', '160', '56K'])
        self.assertEqual(ds_rows[0].identifier, '2')

    def test_sort_dataset(self):
        """Test sorting a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(SORT_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier
        ).dataset
        result = self.api.sort_dataset(ds.identifier, [1, 2, 0],
                                       [False, False, True], self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        rows = ds.fetch_rows()
        names = ['Alice', 'Bob', 'Dave', 'Gertrud', 'Frank']
        result = list()
        for row in rows:
            name = row.values[0]
            if name in names:
                result.append(name)
        for i in range(len(names)):
            self.assertEqual(names[i], result[i])
        result = self.api.sort_dataset(ds.identifier, [2, 1, 0],
                                       [True, False, True], self.datastore)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        rows = ds.fetch_rows()
        names = ['Gertrud', 'Frank', 'Bob', 'Alice', 'Dave']
        result = list()
        for row in rows:
            name = row.values[0]
            if name in names:
                result.append(name)
        for i in range(len(names)):
            self.assertEqual(names[i], result[i])
        # Raises an error for an invalid column identifier
        with self.assertRaises(ValueError):
            self.api.sort_dataset(ds.identifier, [2, 10, 0],
                                  [True, False, True], self.datastore)

    def test_update_cell(self):
        """Test functionality to update a dataset cell."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.api.load_dataset(
            datastore=self.datastore,
            filestore=self.filestore,
            file_id=fh.identifier
        ).dataset
        ds_rows = ds.fetch_rows()
        # Keep track of column and row identifiers
        col_ids = [col.identifier for col in ds.columns]
        row_ids = [row.identifier for row in ds_rows]
        # Update cell [0, 0]. Ensure that one row was updated and that a new
        # identifier was generated. Also ensure that the resulting dataset
        # has the new value in cell [0, 0].
        result = self.api.update_cell(ds.identifier, 0, 0, 'MyValue',
                                      self.datastore)
        self.assertNotEqual(ds.identifier, result.dataset.identifier)
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds_rows[0].values[0], 'MyValue')
        result = self.api.update_cell(
            ds.identifier,
            ds.column_by_name('Name').identifier,
            0,
            'AValue',
            self.datastore
        )
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(ds_rows[0].values[0], 'AValue')
        self.assertEqual(ds_rows[0].values[ds.column_index('Name')], 'AValue')
        # Ensure that row identifiers haven't changed
        for i in range(len(ds_rows)):
            self.assertEqual(ds_rows[i].identifier, row_ids[i])
        # Make sure column identifiers haven't changed
        for i in range(len(ds.columns)):
            self.assertEqual(ds.columns[i].identifier, col_ids[i])
        # Set a cell value to None
        result = self.api.update_cell(
            ds.identifier,
            ds.column_by_name('Name').identifier,
            0,
            None,
            self.datastore
        )
        ds = self.datastore.get_dataset(result.dataset.identifier)
        ds_rows = ds.fetch_rows()
        self.assertIsNone(ds_rows[0].values[0])
        self.assertIsNone(ds_rows[0].values[ds.column_index('Name')])
        # Ensure an exception is thrown if the dataset is unknown
        with self.assertRaises(ValueError):
            self.api.update_cell('unknown:uri', 0, 0, 'MyValue',
                                 self.datastore)
        # Ensure an exception is thrown if the column is unknown
        with self.assertRaises(ValueError):
            self.api.update_cell(ds.identifier, 100, 0, 'MyValue',
                                 self.datastore)
        # Ensure an exception is thrown if the row index is out of bounds
        with self.assertRaises(ValueError):
            self.api.update_cell(ds.identifier, 0, 100, 'MyValue',
                                 self.datastore)
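
# Standard unittest entry point, added on the assumption that this section is
# the end of the test module.
if __name__ == '__main__':
    unittest.main()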