def setUp(self): """Create an empty work trails repository.""" # Create fresh set of directories if os.path.isdir(SERVER_DIR): shutil.rmtree(SERVER_DIR) os.mkdir(SERVER_DIR) self.filestore = FileSystemFilestore(FILESERVER_DIR)
def test_download_dataset(self):
    """Test loading a dataset from a URL. Note that this test depends on the
    accessed web service being available. It will fail otherwise.
    """
    # Skip test if DOWNLOAD_URL is None
    if DOWNLOAD_URL is None:
        print('Skipping download test')
        return
    store = FileSystemDatastore(STORE_DIR)
    ds = store.download_dataset(url=DOWNLOAD_URL)
    dataset_dir = os.path.join(STORE_DIR, ds.identifier)
    self.assertTrue(os.path.isdir(dataset_dir))
    self.assertTrue(os.path.isfile(os.path.join(dataset_dir, DATA_FILE)))
    self.assertTrue(os.path.isfile(os.path.join(dataset_dir, DESCRIPTOR_FILE)))
    self.assertFalse(os.path.isfile(os.path.join(dataset_dir, METADATA_FILE)))
    self.validate_class_size_dataset(ds)
    # Download file into a given filestore
    fs = FileSystemFilestore(FSSTORE_DIR)
    ds, fh = store.download_dataset(
        url=DOWNLOAD_URL,
        filestore=fs
    )
    self.validate_class_size_dataset(ds)
    self.assertEqual(len(fs.list_files()), 1)
    self.assertIsNotNone(fh)
    self.assertIsNotNone(fs.get_file(fh.identifier))
def setUp(self): """Create empty server directory.""" if os.path.isdir(SERVER_DIR): shutil.rmtree(SERVER_DIR) os.mkdir(SERVER_DIR) self.fileserver = FileSystemFilestore(FILESERVER_DIR) self.db = MimirDatastore(DATASTORE_DIRECTORY)
def setUp(self): """Create instances of the default datastore and filestore.""" # Drop directory if it exists if os.path.isdir(SERVER_DIR): shutil.rmtree(SERVER_DIR) os.makedirs(SERVER_DIR) self.datastore = FileSystemDatastore(DATASTORE_DIR) self.filestore = FileSystemFilestore(FILESTORE_DIR)
def test_upload_stream(self):
    """Test file upload from an open file object."""
    db = FileSystemFilestore(SERVER_DIR)
    file = FileStorage(filename=CSV_FILE)
    fh = db.upload_stream(file=file, file_name=os.path.basename(CSV_FILE))
    self.assertEqual(fh.file_name, os.path.basename(CSV_FILE))
    self.assertEqual(fh.mimetype, fs.FORMAT_CSV)
    self.assertTrue(os.path.isfile(fh.filepath))
    self.assertEqual(fh.identifier, db.get_file(fh.identifier).identifier)
def setUp(self): """Create an instance of the Mimir processor for an empty server directory. """ # Drop directory if it exists if os.path.isdir(SERVER_DIR): shutil.rmtree(SERVER_DIR) os.makedirs(SERVER_DIR) self.datastore = MimirDatastore(DATASTORE_DIR) self.filestore = FileSystemFilestore(FILESTORE_DIR)
def setUp(self): """Create an instance of the default vizier processor for an empty server directory. """ # Drop directory if it exists if os.path.isdir(SERVER_DIR): shutil.rmtree(SERVER_DIR) os.makedirs(SERVER_DIR) self.processor = VizualTaskProcessor(api=DefaultVizualApi()) self.datastore = FileSystemDatastore(DATASTORE_DIR) self.filestore = FileSystemFilestore(FILESTORE_DIR)
def set_up(self, engine):
    """Create an empty file server repository."""
    if os.path.isdir(SERVER_DIR):
        shutil.rmtree(SERVER_DIR)
    os.mkdir(SERVER_DIR)
    # Setup file server
    self.fs = FileSystemFilestore(FILESERVER_DIR)
    # Setup the respective datastore and Vizual engine
    if engine == ENGINEENV_DEFAULT:
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
    elif engine == ENGINEENV_MIMIR:
        self.datastore = MimirDatastore(DATASTORE_DIR)
def setUp(self): """Create an instance of the Mimir processor for an empty server directory. """ # Drop directory if it exists if os.path.isdir(SERVER_DIR): shutil.rmtree(SERVER_DIR) os.makedirs(SERVER_DIR) self.processor = MimirProcessor() self.datastore = MimirDatastore(DATASTORE_DIR) self.filestore = FileSystemFilestore(FILESTORE_DIR) self.available_lenses = set(mimir.getAvailableLensTypes())
def test_get_file(self):
    """Test file get method."""
    db = FileSystemFilestore(SERVER_DIR)
    fh1 = db.upload_file(CSV_FILE)
    fh2 = db.get_file(fh1.identifier)
    self.assertEqual(fh1.identifier, fh2.identifier)
    self.assertEqual(fh1.filepath, fh2.filepath)
    self.assertEqual(fh1.mimetype, fh2.mimetype)
    # Ensure that the file parses as a CSV file
    with fh1.open() as csvfile:
        rows = 0
        for row in csv.reader(csvfile, delimiter=fh1.delimiter):
            rows += 1
    self.assertEqual(rows, 3)
class TestMimirDatasetAnnotations(unittest.TestCase):

    def setUp(self):
        """Create empty server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        self.fileserver = FileSystemFilestore(FILESERVER_DIR)
        self.db = MimirDatastore(DATASTORE_DIRECTORY)

    def tearDown(self):
        """Delete server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_dataset_annotations(self):
        """Run test for Mimir datastore."""
        dh = self.db.load_dataset(
            f_handle=self.fileserver.upload_file(DATA_FILE))
        ds = self.db.get_dataset(dh.identifier)
        rows = ds.fetch_rows()
        print(ds.row_ids)
        for row in rows:
            print(str(row.identifier) + '\t' + str(row.values))
        for row_id in ds.row_ids:
            for anno in ds.get_annotations(column_id=1, row_id=row_id):
                print(str(row_id) + '\t' + anno.key + '=' + str(anno.value))
class TestUnicodeHandling(unittest.TestCase):

    def setUp(self):
        """Create an empty work trails repository."""
        # Create fresh set of directories
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        self.filestore = FileSystemFilestore(FILESERVER_DIR)

    def tearDown(self):
        """Clean-up by dropping the MongoDB collection used by the engine."""
        # Delete directories
        # if os.path.isdir(SERVER_DIR):
        #     shutil.rmtree(SERVER_DIR)

    @unittest.skip("FS Datastore doesn't like funky unicode")
    def test_default_config(self):
        """Run workflow with default configuration."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        self.run_workflow(FileSystemDatastore(DATASTORE_DIR))

    @unittest.skip("Blocked on https://github.com/UBOdin/mimir-api/issues/9")
    def test_mimir_config(self):
        """Run workflows for Mimir configurations."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        import vizier.mimir as mimir
        self.run_workflow(MimirDatastore(DATASTORE_DIR))

    def run_workflow(self, datastore):
        """Test functionality to execute a Python script that creates a
        dataset containing unicode characters.
        """
        f_handle = self.filestore.upload_file(CSV_FILE)
        ds = datastore.load_dataset(f_handle)
        # Run the Python script
        cmd = python_cell(
            source=PYTHON_SCRIPT,
            validate=True
        )
        result = PyCellTaskProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                datastore=datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}
            )
        )
        self.assertTrue(result.is_success)
        # print(wf.modules[-1].stdout[0]['data'])
        ds = datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        names = set(c.name.upper().replace('_', ' ') for c in ds.columns)
        self.assertEqual(len(names), 4)
        for name in ['DATE', 'IMO CODE', 'PORT OF DEPARTURE', 'PLACE OF RECEIPT']:
            self.assertTrue(name in names)
class TestSQLProcessor(unittest.TestCase):

    def setUp(self):
        """Create an instance of the Mimir processor for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_run_sql_query(self):
        """Test running a SQL query without materializing the result."""
        f_handle = self.filestore.upload_file(CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        cmd = sql_cell(
            source='SELECT grade_or_service_category FROM ' + DATASET_NAME +
                   ' WHERE program = \'GENERAL EDUCATION\'',
            validate=True)
        result = SQLTaskProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                datasets={DATASET_NAME: ds.identifier},
                datastore=self.datastore,
                filestore=self.filestore))
        self.assertTrue(result.is_success)
        self.assertIsNone(result.provenance.read)
        self.assertIsNone(result.provenance.write)
        self.assertTrue(len(result.outputs.stdout) > 0)
        self.assertEqual(len(result.outputs.stderr), 0)
        # Materialize the result
        cmd = sql_cell(
            source='SELECT grade_or_service_category FROM ' + DATASET_NAME +
                   ' WHERE program = \'GENERAL EDUCATION\'',
            output_dataset='ge',
            validate=True)
        result = SQLTaskProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                datasets={DATASET_NAME: ds.identifier},
                datastore=self.datastore,
                filestore=self.filestore))
        self.assertTrue(result.is_success)
        self.assertIsNone(result.provenance.read)
        self.assertIsNotNone(result.provenance.write)
        self.assertTrue('ge' in result.provenance.write)
        self.assertTrue(len(result.outputs.stdout) > 0)
        self.assertEqual(len(result.outputs.stderr), 0)
def test_load_dataset(self):
    """Test validation of load dataset command."""
    db = FileSystemFilestore(SERVER_DIR)
    fh = db.upload_file(CSV_FILE)
    cmd = load_dataset(
        dataset_name='ds',
        file={
            pckg.FILE_ID: fh.identifier,
            pckg.FILE_NAME: fh.file_name
        },
        validate=True
    ).to_external_form(
        command=PACKAGE.get(vizual.VIZUAL_LOAD),
        datasets=DATASETS)
    self.assertEqual(cmd, 'LOAD DATASET ds FROM ' + fh.file_name)
    cmd = load_dataset(
        dataset_name='ds',
        file={
            pckg.FILE_URL: 'http://some.file.url'
        },
        validate=True
    ).to_external_form(
        command=PACKAGE.get(vizual.VIZUAL_LOAD),
        datasets=DATASETS)
    self.assertEqual(cmd, 'LOAD DATASET ds FROM http://some.file.url')
    cmd = load_dataset(
        dataset_name='ds',
        file={
            pckg.FILE_ID: fh.identifier,
            pckg.FILE_URL: 'http://some.file.url'
        },
        validate=True
    ).to_external_form(
        command=PACKAGE.get(vizual.VIZUAL_LOAD),
        datasets=DATASETS)
    self.assertEqual(cmd, 'LOAD DATASET ds FROM http://some.file.url')
    cmd = load_dataset(
        dataset_name='ds',
        file={
            pckg.FILE_ID: 'Some File'
        },
        validate=True
    ).to_external_form(
        command=PACKAGE.get(vizual.VIZUAL_LOAD),
        datasets=DATASETS)
    self.assertEqual(cmd, 'LOAD DATASET ds FROM \'Some File\'')
def get_filestore(self, identifier):
    """Get the filestore instance for the project with the given identifier.

    Parameters
    ----------
    identifier: string
        Unique identifier for filestore

    Returns
    -------
    vizier.filestore.fs.base.FileSystemFilestore
    """
    filestore_dir = os.path.join(self.base_path, identifier)
    return FileSystemFilestore(filestore_dir)
def test_delete_file(self):
    """Test delete file method."""
    db = FileSystemFilestore(SERVER_DIR)
    f = db.upload_file(CSV_FILE)
    f = db.get_file(f.identifier)
    self.assertIsNotNone(f)
    self.assertTrue(db.delete_file(f.identifier))
    f = db.get_file(f.identifier)
    self.assertIsNone(f)
class TestDefaultVizualProcessor(unittest.TestCase):

    def setUp(self):
        """Create an instance of the default vizier processor for an empty
        server directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def count_non_null_values(self, data, column_index):
        """Return the number of values in a column that are not None."""
        count = 0
        for row in data:
            if row[column_index] is not None:
                count += 1
        return count

    def test_query(self):
        """Test running a query for simple chart plots."""
        ds = self.datastore.load_dataset(self.filestore.upload_file(LOAD_FILE))
        view = ChartViewHandle(dataset_name='ABC', x_axis=2)
        view.add_series(1, range_start=25, range_end=30)
        view.add_series(0, range_start=25, range_end=30)
        view.add_series(3, range_start=25, range_end=30)
        data = ChartQuery().exec_query(dataset=ds, view=view)[0]
        self.assertEqual(len(data), 6)
        for row in data:
            self.assertEqual(len(row), 3)
        self.assertTrue(isinstance(data[0][0], int))
        self.assertTrue(isinstance(data[0][1], float))
        # Remove interval end for one series. This should return all rows
        # starting from index 25
        view = ChartViewHandle(dataset_name='ABC', x_axis=2)
        view.add_series(1, range_start=25, range_end=30)
        view.add_series(0, range_start=25)
        view.add_series(3, range_start=25, range_end=30)
        data = ChartQuery().exec_query(dataset=ds, view=view)[0]
        self.assertEqual(len(data), 29)
        self.assertIsNone(data[28][0])
        self.assertIsNotNone(data[28][1])
        self.assertIsNone(data[28][2])
        for row in data:
            self.assertEqual(len(row), 3)
        # Remove interval start for another series. The first series will
        # contain 31 values, the second 29, and the third 6
        view = ChartViewHandle(dataset_name='ABC', x_axis=2)
        view.add_series(1, range_end=30)
        view.add_series(0, range_start=25)
        view.add_series(3, range_start=25, range_end=30)
        data = ChartQuery().exec_query(dataset=ds, view=view)[0]
        self.assertEqual(len(data), 31)
        self.assertEqual(self.count_non_null_values(data, 0), 31)
        self.assertEqual(self.count_non_null_values(data, 1), 29)
        self.assertEqual(self.count_non_null_values(data, 2), 6)
        for row in data:
            self.assertEqual(len(row), 3)
        # Without any range constraints the result should contain all 54 rows
        view = ChartViewHandle(dataset_name='ABC', x_axis=2)
        view.add_series(1, label='A')
        view.add_series(0, label='B')
        view.add_series(3)
        data = ChartQuery().exec_query(dataset=ds, view=view)[0]
        self.assertEqual(len(data), 54)
class TestVizierClient(unittest.TestCase):

    def setUp(self):
        """Create instances of the default datastore and filestore."""
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_create_new_dataset(self):
        """Test creating and updating a new dataset via the client."""
        client = VizierDBClient(datastore=self.datastore, datasets=dict())
        ds = DatasetClient()
        ds.insert_column('Name')
        ds.insert_column('Age')
        ds.insert_row(['Alice', '23'])
        ds.insert_row(['Bob', '25'])
        rows = ds.rows
        ds = client.create_dataset('MyDataset', ds)
        # Ensure the returned dataset contains the input data
        self.assertEqual([c.name for c in ds.columns], ['Name', 'Age'])
        self.assertEqual([str(v) for v in ds.rows[0].values], ['Alice', '23'])
        self.assertEqual([str(v) for v in ds.rows[1].values], ['Bob', '25'])
        # Modify the reference to the original rows to ensure that the rows in
        # the loaded datasets are not affected
        self.assertEqual([str(v) for v in rows[0].values], ['Alice', '23'])
        rows[0].set_value(0, 'Jane')
        self.assertEqual([str(v) for v in rows[0].values], ['Jane', '23'])
        self.assertEqual([str(v) for v in ds.rows[0].values], ['Alice', '23'])
        # Update dataset
        ds.rows[1].set_value('Age', '26')
        client.update_dataset('MyDataset', ds)
        ds = client.get_dataset('MyDataset')
        self.assertEqual([str(v) for v in ds.rows[1].values], ['Bob', '26'])
        # Value error when creating dataset with existing name
        with self.assertRaises(ValueError):
            client.create_dataset('MyDataset', ds)
        # Value error when retrieving unknown dataset
        with self.assertRaises(ValueError):
            client.get_dataset('SomeDataset')
        # Ensure the returned dataset contains the modified data
        client.rename_dataset('MyDataset', 'SomeDataset')
        ds = client.get_dataset('SomeDataset')
        # Ensure that access to unknown datasets is recorded
        with self.assertRaises(ValueError):
            client.get_dataset('ThisIsNotADataset')
        for name in ['somedataset', 'mydataset']:
            self.assertTrue(name in client.read)
            self.assertTrue(name in client.write)
        self.assertTrue('thisisnotadataset' in client.read)
        self.assertFalse('thisisnotadataset' in client.write)

    def test_dataset_annotations(self):
        """Test creating and updating annotations of an existing dataset via
        the client.
        """
        ds = self.datastore.load_dataset(self.filestore.upload_file(CSV_FILE))
        client = VizierDBClient(
            datastore=self.datastore,
            datasets={DATASET_NAME: ds.identifier})
        ds = client.get_dataset(DATASET_NAME)
        annotations = ds.annotations
        annotations.add(key='comment', value='Good', column_id=0, row_id=1)
        annotations.add(key='comment', value='Good', column_id=1, row_id=1)
        annotations.add(key='quality', value='Nice', column_id=0, row_id=1)
        ds = client.update_dataset(name=DATASET_NAME, dataset=ds)
        self.assertEqual(len(ds.annotations.cells), 3)
        ds = client.get_dataset(DATASET_NAME)
        self.assertEqual(len(ds.annotations.cells), 3)
        row = ds.rows[1]
        annotations = row.annotations(0)
        for key in ['comment', 'quality']:
            self.assertTrue(key in list(annotations.keys()))
        annotations = row.annotations(1)
        self.assertTrue('comment' in list(annotations.keys()))
        self.assertFalse('quality' in list(annotations.keys()))
        row.set_value(0, 'New Value', clear_annotations=True)
        self.assertEqual(len(ds.annotations.cells), 1)
        ds = client.update_dataset(name=DATASET_NAME, dataset=ds)
        self.assertEqual(len(ds.annotations.cells), 1)
        ds = client.get_dataset(DATASET_NAME)
        self.assertEqual(len(ds.annotations.cells), 1)

    def test_update_existing_dataset(self):
        """Test creating and updating an existing dataset via the client."""
        # Move columns around
        ds = self.datastore.load_dataset(self.filestore.upload_file(CSV_FILE))
        client = VizierDBClient(
            datastore=self.datastore,
            datasets={DATASET_NAME: ds.identifier})
        ds = client.get_dataset(DATASET_NAME)
        col_1 = [row.get_value(1) for row in ds.rows]
        ds.insert_column('empty', 2)
        ds = client.update_dataset(DATASET_NAME, ds)
        col_2 = [row.get_value(2) for row in ds.rows]
        ds.move_column('empty', 1)
        ds = client.update_dataset(DATASET_NAME, ds)
        for i in range(len(ds.rows)):
            row = ds.rows[i]
            self.assertEqual(row.values[1], col_2[i])
            self.assertEqual(row.values[2], col_1[i])
        # Rename
        ds.columns[1].name = 'allnone'
        ds = client.update_dataset(DATASET_NAME, ds)
        for i in range(len(ds.rows)):
            row = ds.rows[i]
            self.assertEqual(row.get_value('allnone'), col_2[i])
            self.assertEqual(row.values[2], col_1[i])
        # Insert row
        row = ds.insert_row()
        row.set_value('Name', 'Zoe')
        ds = client.create_dataset('upd', ds)
        self.assertEqual(len(ds.rows), 3)
        r2 = ds.rows[2]
        self.assertEqual(r2.identifier, 2)
        self.assertEqual(r2.values, ['Zoe', None, None, None])
        # Annotations
        ds = client.get_dataset(DATASET_NAME)
        col = ds.get_column('Age')
        row = ds.rows[0]
        ds.annotations.add(
            column_id=col.identifier,
            row_id=row.identifier,
            key='user:comment',
            value='My Comment')
        ds = client.update_dataset(DATASET_NAME, ds)
        annotations = ds.rows[0].annotations('Age').find_all('user:comment')
        self.assertEqual(len(annotations), 1)
        anno = annotations[0]
        self.assertEqual(anno.key, 'user:comment')
        self.assertEqual(anno.value, 'My Comment')
        ds.annotations.add(
            column_id=col.identifier,
            row_id=row.identifier,
            key='user:comment',
            value='Another Comment')
        ds = client.update_dataset(DATASET_NAME, ds)
        annotations = ds.rows[0].annotations('Age').find_all('user:comment')
        self.assertEqual(len(annotations), 2)
        self.assertEqual(
            list(ds.rows[0].annotations('Age').keys()), ['user:comment'])
        values = [a.value for a in annotations]
        for val in ['My Comment', 'Another Comment']:
            self.assertTrue(val in values)
        anno = ds.rows[0].annotations('Age').find_one('user:comment')
        anno.key = 'user:issue'
        anno.value = 'Some Issue'
        ds = client.update_dataset(DATASET_NAME, ds)
        annotations = ds.rows[0].annotations('Age').find_all('user:comment')
        self.assertEqual(len(annotations), 1)
        keys = list(ds.rows[0].annotations('Age').keys())
        for key in ['user:comment', 'user:issue']:
            self.assertTrue(key in keys)
        values = [
            a.value
            for a in ds.rows[0].annotations('Age').find_all('user:issue')
        ]
        for val in ['Some Issue']:
            self.assertTrue(val in values)
        ds.annotations.remove(
            column_id=col.identifier,
            row_id=row.identifier,
            key='user:issue',
        )
        ds = client.update_dataset(DATASET_NAME, ds)
        annotations = ds.rows[0].annotations('Age').find_all('user:issue')
        self.assertEqual(len(annotations), 0)
        annotations = ds.rows[0].annotations('Age').find_all('user:comment')
        self.assertEqual(len(annotations), 1)
        # Delete column
        ds = client.get_dataset(DATASET_NAME)
        ds.delete_column('Age')
        client.update_dataset(DATASET_NAME, ds)
        ds = client.get_dataset(DATASET_NAME)
        names = [col.name.upper() for col in ds.columns]
        self.assertTrue('NAME' in names)
        self.assertFalse('AGE' in names)
        self.assertTrue(DATASET_NAME in client.read)
        self.assertTrue(DATASET_NAME in client.write)
        self.assertFalse('upd' in client.read)
        self.assertTrue('upd' in client.write)
class TestMimirProcessor(unittest.TestCase):
    """Individual tests for Mimir lenses. Run separately since each test has
    to initialize and shut down the Mimir gateway.
    """
    def setUp(self):
        """Create an instance of the Mimir processor for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.processor = MimirProcessor()
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)
        self.available_lenses = set(mimir.getAvailableLensTypes())

    def tearDown(self):
        """Clean-up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def compute_lens_result(self, ds, command):
        return self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                project_id=1,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={DATASET_NAME: ds}))

    def test_geocode_lens(self):
        """Test GEOCODE lens."""
        if lens_types.MIMIR_GEOCODE not in self.available_lenses:
            self.skipTest("Mimir Geocoding Lens not initialized.")
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(GEO_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Geocode Lens
        command = cmd.mimir_geocode(
            DATASET_NAME,
            'GOOGLE',
            house_nr=ds.column_by_name('STRNUMBER').identifier,
            street=ds.column_by_name('STRNAME').identifier,
            city=ds.column_by_name('CITY').identifier,
            state=ds.column_by_name('STATE').identifier)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 6)
        self.assertTrue('LATITUDE' in columns)
        self.assertTrue('LONGITUDE' in columns)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 8)
        self.assertTrue('LATITUDE_1' in columns)
        self.assertTrue('LONGITUDE_1' in columns)
        self.assertEqual(len(ds.columns), 8)

    def test_key_repair_lens(self):
        """Test KEY REPAIR lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(KEY_REPAIR_FILE)
        ds1 = self.datastore.load_dataset(f_handle=f_handle)
        # Key Repair Lens
        command = cmd.mimir_key_repair(
            DATASET_NAME, ds1.column_by_name('Empid').identifier)
        result = self.compute_lens_result(ds1, command)
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 4)
        self.assertEqual(ds.row_count, 2)
        names = set()
        empids = set()
        for row in ds.fetch_rows():
            empids.add(int(row.values[0]))
            names.add(row.values[1])
        self.assertTrue(1 in empids)
        self.assertTrue('Alice' in names or 'Bob' in names)
        self.assertFalse('Alice' in names and 'Bob' in names)
        self.assertTrue('Carla' in names)
        # Test error case and command text
        with self.assertRaises(ValueError):
            command = cmd.mimir_key_repair('MY DS', 'MY COL')
            result = self.compute_lens_result(ds, command)

    def test_missing_value_lens(self):
        """Test MISSING_VALUE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier
            }])
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        self.assertNotEqual(rows[2].values[ds.column_index('Age')], '')
        # MISSING VALUE Lens with value constraint
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier,
                'constraint': '> 30'
            }],
        )
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        print(rows[2].values)
        # We shouldn't be imputing a value lower than the minimum value in
        # the dataset
        self.assertTrue(rows[2].values[ds.column_index('Age')] >= 23)

    def test_missing_key_lens(self):
        """Test MISSING_KEY lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Key Lens
        age_col = ds.column_by_name('Age').identifier
        command = cmd.mimir_missing_key(DATASET_NAME, age_col)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        # Depending on the implementation this could be either 22 or 24, as
        # there are two rows with missing values for the key column.
        # Currently, Mimir discards such rows, but if this suddenly turns
        # into a 24, that's not incorrect either.
        self.assertEqual(len(rows), 22)
        command = cmd.mimir_missing_key(
            DATASET_NAME, ds.column_by_name('Salary').identifier)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 31)

    def test_picker_lens(self):
        """Test PICKER lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(PICKER_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        command = cmd.mimir_picker(
            DATASET_NAME,
            [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('Salary').identifier
            }])
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        result_ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in result_ds.columns]
        # print(columns)
        self.assertEqual(len(result_ds.columns), 3)
        self.assertTrue('AGE_1' in columns)
        # Pick another column, this time with custom name
        command = cmd.mimir_picker(
            DATASET_NAME,
            [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('Salary').identifier
            }],
            pick_as='My_Column')
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        result_ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in result_ds.columns]
        self.assertEqual(len(result_ds.columns), 3)
        self.assertTrue('MY_COLUMN' in columns)

    def test_type_inference_lens(self):
        """Test TYPE INFERENCE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Infer type
        command = cmd.mimir_type_inference(DATASET_NAME, 0.6)
        result = self.compute_lens_result(ds, command)
        self.assertTrue(result.is_success)
        # Get dataset
        ds2 = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds2.columns), 3)
        self.assertEqual(ds2.row_count, 7)
        ds1_rows = ds.fetch_rows()
        ds2_rows = ds2.fetch_rows()
        for i in range(ds2.row_count):
            self.assertEqual(ds1_rows[i].values, ds2_rows[i].values)
def setup_fileserver(self):
    """Create a fresh file server."""
    if os.path.isdir(FILESERVER_DIR):
        shutil.rmtree(FILESERVER_DIR)
    os.mkdir(FILESERVER_DIR)
    self.fileserver = FileSystemFilestore(FILESERVER_DIR)
class TestMimirDatastore(unittest.TestCase):

    def setup_fileserver(self):
        """Create a fresh file server."""
        if os.path.isdir(FILESERVER_DIR):
            shutil.rmtree(FILESERVER_DIR)
        os.mkdir(FILESERVER_DIR)
        self.fileserver = FileSystemFilestore(FILESERVER_DIR)

    def set_up(self):
        """Create empty data store directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        self.db = MimirDatastore(DATASTORE_DIR)

    def tear_down(self):
        """Delete data store directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_mimir_datastore(self):
        """Run test for Mimir datastore."""
        self.set_up()
        self.dataset_load()
        self.tear_down()
        self.set_up()
        self.datastore_init()
        self.tear_down()
        self.set_up()
        self.dataset_read()
        self.tear_down()
        self.set_up()
        self.dataset_column_index()
        self.tear_down()

    def datastore_init(self):
        """Test initializing a datastore with existing datasets."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        self.db = MimirDatastore(DATASTORE_DIR)

    def dataset_column_index(self):
        """Test the column by id index of the dataset handle."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        # Ensure that the project data has three columns and two rows
        self.assertEqual(ds.column_by_id(0).name.upper(), 'NAME')
        self.assertEqual(ds.column_by_id(1).name.upper(), 'AGE')
        self.assertEqual(ds.column_by_id(2).name.upper(), 'SALARY')
        with self.assertRaises(ValueError):
            ds.column_by_id(5)
        ds.columns.append(DatasetColumn(identifier=5, name='NEWNAME'))
        self.assertEqual(ds.column_by_id(5).name.upper(), 'NEWNAME')
        with self.assertRaises(ValueError):
            ds.column_by_id(4)

    def dataset_load(self):
        """Test create and delete dataset."""
        self.setup_fileserver()
        ds = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        # Ensure that the project data has three columns and two rows
        self.assertEqual(len(ds.columns), 3)
        self.assertEqual(len(ds.fetch_rows()), 2)
        self.assertEqual(ds.row_count, 2)

    def dataset_read(self):
        """Test reading a dataset."""
        self.setup_fileserver()
        dh = self.db.load_dataset(self.fileserver.upload_file(CSV_FILE))
        ds = self.db.get_dataset(dh.identifier)
        ds_rows = ds.fetch_rows()
        self.assertEqual(dh.identifier, ds.identifier)
        self.assertEqual(len(dh.columns), len(ds.columns))
        self.assertEqual(len(dh.fetch_rows()), len(ds_rows))
        self.assertEqual(dh.row_count, len(ds_rows))
        # Name,Age,Salary
        # Alice,23,35K
        # Bob,32,30K
        self.assertEqual(ds.column_index('Name'), 0)
        self.assertEqual(ds.column_index('Age'), 1)
        self.assertEqual(ds.column_index('Salary'), 2)
        row = ds_rows[0]
        self.assertEqual(row.values[0], 'Alice')
        self.assertEqual(int(row.values[1]), 23)
        self.assertEqual(row.values[2], '35K')
        row = ds_rows[1]
        self.assertEqual(row.values[0], 'Bob')
        self.assertEqual(int(row.values[1]), 32)
        self.assertEqual(row.values[2], '30K')
class TestMimirProcessor(unittest.TestCase):
    """Individual tests for Mimir lenses. Run separately since each test has
    to initialize and shut down the Mimir gateway.
    """
    def setUp(self):
        """Create an instance of the Mimir processor for an empty server
        directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.processor = MimirProcessor()
        self.datastore = MimirDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_domain_lens(self):
        """Test DOMAIN lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        col_age = ds.column_by_name('Age')
        command = cmd.mimir_domain(DATASET_NAME, col_age.identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        self.assertNotEqual(rows[2].values[ds.column_index('Age')], '')
        # Introduce an error. Make sure command formatting is correct
        command = cmd.mimir_domain('MY DS', 'MY COL')
        with self.assertRaises(ValueError):
            result = self.processor.compute(
                command_id=command.command_id,
                arguments=command.arguments,
                context=TaskContext(
                    datastore=self.datastore,
                    filestore=self.filestore,
                    datasets={DATASET_NAME: ds.identifier}))

    def test_geocode_lens(self):
        """Test GEOCODE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(GEO_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Geocode Lens
        command = cmd.mimir_geocode(
            DATASET_NAME,
            'GOOGLE',
            house_nr=ds.column_by_name('STRNUMBER').identifier,
            street=ds.column_by_name('STRNAME').identifier,
            city=ds.column_by_name('CITY').identifier,
            state=ds.column_by_name('STATE').identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 6)
        self.assertTrue('LATITUDE' in columns)
        self.assertTrue('LONGITUDE' in columns)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(columns), 8)
        self.assertTrue('LATITUDE_1' in columns)
        self.assertTrue('LONGITUDE_1' in columns)
        self.assertEqual(len(ds.columns), 8)

    def test_key_repair_lens(self):
        """Test KEY REPAIR lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(KEY_REPAIR_FILE)
        ds1 = self.datastore.load_dataset(f_handle=f_handle)
        # Key Repair Lens
        command = cmd.mimir_key_repair(
            DATASET_NAME, ds1.column_by_name('Empid').identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds1.identifier}))
        self.assertTrue(result.is_success)
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 4)
        self.assertEqual(ds.row_count, 3)
        names = set()
        empids = set()
        rowids = set()
        for row in ds.fetch_rows():
            rowids.add(row.identifier)
            empids.add(int(row.get_value('empid')))
            names.add(row.get_value('name'))
        self.assertTrue(1 in empids)
        self.assertTrue(2 in rowids)
        self.assertTrue('Alice' in names)
        self.assertTrue('Carla' in names)
        # Test error case and command text
        command = cmd.mimir_key_repair('MY DS', 'MY COL')
        with self.assertRaises(ValueError):
            self.processor.compute(
                command_id=command.command_id,
                arguments=command.arguments,
                context=TaskContext(
                    datastore=self.datastore,
                    filestore=self.filestore,
                    datasets={DATASET_NAME: ds.identifier}))

    def test_missing_value_lens(self):
        """Test MISSING_VALUE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Value Lens
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier
            }])
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        self.assertNotEqual(rows[2].values[ds.column_index('Age')], '')
        # MISSING VALUE Lens with value constraint
        command = cmd.mimir_missing_value(
            DATASET_NAME,
            columns=[{
                'column': ds.column_by_name('AGE').identifier,
                'constraint': '> 30'
            }],
        )
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        rows = ds.fetch_rows()
        for row in rows:
            self.assertIsNotNone(row.values[1])
        self.assertTrue(rows[2].values[ds.column_index('Age')] > 30)

    def test_missing_key_lens(self):
        """Test MISSING_KEY lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Missing Key Lens
        age_col = ds.column_by_name('Age').identifier
        command = cmd.mimir_missing_key(DATASET_NAME, age_col)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 24)
        command = cmd.mimir_missing_key(
            DATASET_NAME, ds.column_by_name('Salary').identifier)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 3)
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 55)

    def test_picker_lens(self):
        """Test PICKER lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(PICKER_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        command = cmd.mimir_picker(
            DATASET_NAME,
            [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('Salary').identifier
            }])
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        print(columns)
        self.assertEqual(len(ds.columns), 5)
        self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
        # Pick another column, this time with custom name
        command = cmd.mimir_picker(
            DATASET_NAME,
            [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('Salary').identifier
            }],
            pick_as='My_Column')
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertEqual(len(ds.columns), 6)
        self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
        self.assertTrue('MY_COLUMN' in columns)
        # Pick from a picked column
        command = cmd.mimir_picker(
            DATASET_NAME,
            [{
                'pickFrom': ds.column_by_name('Age').identifier
            }, {
                'pickFrom': ds.column_by_name('PICK_ONE_AGE_SALARY').identifier
            }],
            pick_as='My_Next_Column')
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        columns = [c.name for c in ds.columns]
        self.assertTrue('MY_NEXT_COLUMN' in columns)

    def test_schema_matching_lens(self):
        """Test SCHEMA_MATCHING lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Schema Matching Lens
        command = cmd.mimir_schema_matching(
            DATASET_NAME,
            [{
                'column': 'BDate',
                'type': 'int'
            }, {
                'column': 'PName',
                'type': 'varchar'
            }],
            'new_' + DATASET_NAME)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds = self.datastore.get_dataset(
            result.provenance.write['new_' + DATASET_NAME].identifier)
        self.assertEqual(len(ds.columns), 2)
        self.assertEqual(ds.row_count, 2)

    def test_type_inference_lens(self):
        """Test TYPE INFERENCE lens."""
        # Create new work trail and retrieve the HEAD workflow of the default
        # branch
        f_handle = self.filestore.upload_file(INCOMPLETE_CSV_FILE)
        ds = self.datastore.load_dataset(f_handle=f_handle)
        # Infer type
        command = cmd.mimir_type_inference(DATASET_NAME, 0.6)
        result = self.processor.compute(
            command_id=command.command_id,
            arguments=command.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                datasets={DATASET_NAME: ds.identifier}))
        self.assertTrue(result.is_success)
        # Get dataset
        ds2 = self.datastore.get_dataset(
            result.provenance.write[DATASET_NAME].identifier)
        self.assertEqual(len(ds2.columns), 3)
        self.assertEqual(ds2.row_count, 7)
        ds1_rows = ds.fetch_rows()
        ds2_rows = ds2.fetch_rows()
        for i in range(ds2.row_count):
            self.assertEqual(ds1_rows[i].values, ds2_rows[i].values)
class TestDatasetPaginationReader(unittest.TestCase):

    def set_up(self, engine):
        """Create an empty file server repository."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.mkdir(SERVER_DIR)
        # Setup file server
        self.fs = FileSystemFilestore(FILESERVER_DIR)
        # Setup the respective datastore and Vizual engine
        if engine == ENGINEENV_DEFAULT:
            self.datastore = FileSystemDatastore(DATASTORE_DIR)
        elif engine == ENGINEENV_MIMIR:
            self.datastore = MimirDatastore(DATASTORE_DIR)

    def tear_down(self, engine):
        """Clean-up by dropping file server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_default_engine(self):
        """Test functionality for the default setup."""
        self.run_tests(ENGINEENV_DEFAULT)

    def test_mimir_engine(self):
        """Test functionality for the Mimir setup."""
        import vizier.mimir as mimir  # noqa: F401
        self.run_tests(ENGINEENV_MIMIR)

    def run_tests(self, engine):
        """Run sequence of tests for given configuration."""
        self.set_up(engine)
        ds = self.datastore.load_dataset(self.fs.upload_file(CSV_FILE_1))
        rows = ds.fetch_rows()
        self.assertEqual(len(rows), 7)
        rows = ds.fetch_rows(offset=1)
        self.assertEqual(len(rows), 6)
        self.assertEqual(rows[0].values[0], 'Bob')
        self.assertEqual(rows[5].values[0], 'Gertrud')
        rows = ds.fetch_rows(limit=2)
        self.assertEqual(len(rows), 2)
        self.assertEqual(rows[0].values[0], 'Alice')
        self.assertEqual(rows[1].values[0], 'Bob')
        rows = ds.fetch_rows(offset=4, limit=3)
        self.assertEqual(len(rows), 3)
        self.assertEqual(rows[0].values[0], 'Eileen')
        self.assertEqual(rows[2].values[0], 'Gertrud')
        rows = ds.fetch_rows(offset=5, limit=3)
        self.assertEqual(len(rows), 2)
        self.assertEqual(rows[0].values[0], 'Frank')
        self.assertEqual(rows[1].values[0], 'Gertrud')
        rows = ds.fetch_rows(offset=6, limit=3)
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0].values[0], 'Gertrud')
        # Test larger dataset with deletes
        ds = self.datastore.load_dataset(self.fs.upload_file(CSV_FILE_2))
        rows = ds.fetch_rows(offset=0, limit=10)
        self.assertEqual(len(rows), 10)
        rows = ds.fetch_rows(offset=10, limit=20)
        self.assertEqual(len(rows), 20)
        rows = ds.fetch_rows(offset=60, limit=10)
        self.assertEqual(len(rows), 3)
        self.tear_down(engine)
def test_upload_file(self):
    """Test file upload."""
    db = FileSystemFilestore(SERVER_DIR)
    fh = db.upload_file(CSV_FILE)
    self.assertEqual(fh.file_name, os.path.basename(CSV_FILE))
    self.assertEqual(fh.mimetype, fs.FORMAT_CSV)
    self.assertEqual(fh.identifier, db.get_file(fh.identifier).identifier)
    self.assertTrue(
        os.path.isfile(
            os.path.join(SERVER_DIR, fh.identifier, METADATA_FILENAME)))
    self.assertTrue(os.path.isfile(fh.filepath))
    self.assertTrue(fh.is_tabular)
    # Re-load the repository
    db = FileSystemFilestore(SERVER_DIR)
    fh = db.get_file(fh.identifier)
    self.assertEqual(fh.file_name, os.path.basename(CSV_FILE))
    self.assertEqual(fh.mimetype, fs.FORMAT_CSV)
    self.assertEqual(fh.identifier, db.get_file(fh.identifier).identifier)
    # Add files with other valid suffixes
    fh = db.upload_file(CSV_FILE)
    self.assertFalse(fh.compressed)
    self.assertEqual(fh.delimiter, ',')
    fh = db.upload_file(GZIP_CSV_FILE)
    self.assertTrue(fh.compressed)
    self.assertEqual(fh.delimiter, ',')
    fh = db.upload_file(TSV_FILE)
    self.assertFalse(fh.compressed)
    self.assertEqual(fh.delimiter, '\t')
    fh = db.upload_file(GZIP_TSV_FILE)
    self.assertTrue(fh.compressed)
    self.assertEqual(fh.delimiter, '\t')
    # Re-load the repository
    db = FileSystemFilestore(SERVER_DIR)
    self.assertEqual(len(db.list_files()), 5)
    fh = db.upload_file(TEXT_FILE)
    self.assertFalse(fh.is_tabular)
def test_list_file(self):
    """Test list files method."""
    db = FileSystemFilestore(SERVER_DIR)
    db.upload_file(CSV_FILE)
    db.upload_file(GZIP_CSV_FILE)
    db.upload_file(TSV_FILE)
    db.upload_file(GZIP_TSV_FILE)
    files = db.list_files()
    self.assertEqual(len(files), 4)
    db.upload_file(CSV_FILE)
    db.upload_file(GZIP_CSV_FILE)
    db.upload_file(TSV_FILE)
    db.upload_file(GZIP_TSV_FILE)
    files = db.list_files()
    self.assertEqual(len(files), 8)
class TestVizierClient(unittest.TestCase):

    def setUp(self):
        """Create instances of the default datastore and filestore."""
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_create_new_dataset(self):
        """Test creating and updating a new dataset via the client."""
        client = VizierDBClient(
            datastore=self.datastore,
            datasets=dict(),
            dataobjects=dict(),
            source="",
            project_id=7
        )
        ds = DatasetClient()
        ds.insert_column('Name')
        ds.insert_column('Age')
        ds.insert_row(['Alice', '23'])
        ds.insert_row(['Bob', '25'])
        rows = ds.rows
        ds = client.create_dataset('MyDataset', ds)
        # Ensure the returned dataset contains the input data
        self.assertEqual([c.name for c in ds.columns], ['Name', 'Age'])
        self.assertEqual([str(v) for v in ds.rows[0].values], ['Alice', '23'])
        self.assertEqual([str(v) for v in ds.rows[1].values], ['Bob', '25'])
        # Modify the reference to the original rows to ensure that the rows in
        # the loaded datasets are not affected
        self.assertEqual([str(v) for v in rows[0].values], ['Alice', '23'])
        rows[0].set_value(0, 'Jane')
        self.assertEqual([str(v) for v in rows[0].values], ['Jane', '23'])
        self.assertEqual([str(v) for v in ds.rows[0].values], ['Alice', '23'])
        # Update dataset
        ds.rows[1].set_value('Age', '26')
        ds.save()
        ds = client.get_dataset('MyDataset')
        self.assertEqual([str(v) for v in ds.rows[1].values], ['Bob', '26'])
        # Value error when creating dataset with existing name
        with self.assertRaises(ValueError):
            client.create_dataset('MyDataset', ds)
        # Value error when retrieving unknown dataset
        with self.assertRaises(ValueError):
            client.get_dataset('SomeDataset')
        # Ensure the returned dataset contains the modified data
        client.rename_dataset('MyDataset', 'SomeDataset')
        ds = client.get_dataset('SomeDataset')
        # Ensure that access to unknown datasets is recorded
        with self.assertRaises(ValueError):
            client.get_dataset('ThisIsNotADataset')
        self.assertTrue('mydataset' in client.write)
        self.assertTrue('somedataset' in client.write)
        self.assertTrue('thisisnotadataset' in client.read)
        self.assertFalse('thisisnotadataset' in client.write)

    def test_update_existing_dataset(self):
        """Test creating and updating an existing dataset via the client."""
        # Move columns around
        ds = self.datastore.load_dataset(self.filestore.upload_file(CSV_FILE))
        client = VizierDBClient(
            datastore=self.datastore,
            datasets={DATASET_NAME: ds},
            dataobjects=dict(),
            source="",
            project_id=7
        )
        ds = client.get_dataset(DATASET_NAME)
        col_1 = [row.get_value(1) for row in ds.rows]
        ds.insert_column('empty', 3)
        ds = client.update_dataset(DATASET_NAME, ds)
        col_2 = [row.get_value(2) for row in ds.rows]
        ds.move_column('empty', 1)
        ds = client.update_dataset(DATASET_NAME, ds)
        for i in range(len(ds.rows)):
            row = ds.rows[i]
            self.assertEqual(row.values[3], col_2[i])
            self.assertEqual(row.values[2], col_1[i])
        # Rename
        ds.columns[1].name = 'allnone'
        ds = client.update_dataset(DATASET_NAME, ds)
        for i in range(len(ds.rows)):
            row = ds.rows[i]
            self.assertEqual(row.get_value('allnone'), None)
            self.assertEqual(row.values[2], col_1[i])
        # Insert row
        row = ds.insert_row()
        row.set_value('Name', 'Zoe')
        ds = client.create_dataset('upd', ds)
        self.assertEqual(len(ds.rows), 3)
        r2 = ds.rows[2]
        self.assertEqual(r2.values, ['Zoe', None, None, None])
        # Delete column
        ds = client.get_dataset(DATASET_NAME)
        ds.delete_column('Age')
        client.update_dataset(DATASET_NAME, ds)
        ds = client.get_dataset(DATASET_NAME)
        names = [col.name.upper() for col in ds.columns]
        self.assertTrue('NAME' in names)
        self.assertFalse('AGE' in names)
        self.assertTrue(DATASET_NAME in client.read)
        self.assertTrue(DATASET_NAME in client.write)
        self.assertFalse('upd' in client.read)
        self.assertTrue('upd' in client.write)
class TestDefaultPlotProcessor(unittest.TestCase):

    def setUp(self):
        """Create instances of the default datastore and filestore."""
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_advanced_plot(self):
        """Test running the simple plot command with a more advanced chart
        definition.
        """
        fh = self.filestore.upload_file(TSV_FILE)
        ds = self.datastore.load_dataset(fh)
        cmd = create_plot(
            dataset_name=DATASET_NAME,
            chart_name='My Chart',
            series=[
                {'column': 1, 'range': '25:30', 'label': 'A'},
                {'column': 0, 'range': '25:30'}
            ],
            validate=True
        )
        result = PlotProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=0,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={DATASET_NAME: ds}
            )
        )
        chart = result.outputs.stdout[0].value
        self.assertEqual(chart['data']['data'][0]['label'], 'A')
        self.assertEqual(chart['data']['data'][1]['label'], 'average_class_size')
        self.assertEqual(chart['result']['series'][0]['label'], 'A')
        self.assertEqual(chart['result']['series'][1]['label'], 'average_class_size')
        self.assertEqual(len(chart['result']['series'][0]['data']), 6)
        self.assertEqual(len(chart['result']['series'][1]['data']), 6)

    def test_simple_plot(self):
        """Test running the simple plot command."""
        fh = self.filestore.upload_file(CSV_FILE)
        ds = self.datastore.load_dataset(fh)
        cmd = create_plot(
            dataset_name=DATASET_NAME,
            chart_name='My Chart',
            series=[{'column': 1}],
            validate=True
        )
        result = PlotProcessor().compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=0,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={DATASET_NAME: ds}
            )
        )
class TestDefaultVizualProcessor(unittest.TestCase):

    def setUp(self):
        """Create an instance of the default vizier processor for an empty
        server directory.
        """
        # Drop directory if it exists
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)
        os.makedirs(SERVER_DIR)
        self.processor = VizualTaskProcessor(api=DefaultVizualApi())
        self.datastore = FileSystemDatastore(DATASTORE_DIR)
        self.filestore = FileSystemFilestore(FILESTORE_DIR)

    def tearDown(self):
        """Clean-up by dropping the server directory."""
        if os.path.isdir(SERVER_DIR):
            shutil.rmtree(SERVER_DIR)

    def test_create_api_from_dictionary(self):
        """Test creating the processor instance with the properties parameter
        instead of api.
        """
        processor = VizualTaskProcessor(
            properties={
                PROPERTY_API: ClassLoader.to_dict(
                    module_name='vizier.engine.packages.vizual.api.fs',
                    class_name='DefaultVizualApi')
            })
        fh = self.filestore.upload_file(CSV_FILE)
        cmd = vizual.load_dataset(
            dataset_name=DATASET_NAME,
            file={pckg.FILE_ID: fh.identifier},
            validate=True)
        result = processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={}))
        self.assertIsNotNone(result.provenance.write)
        self.assertTrue(DATASET_NAME in result.provenance.write)
        dataset_id = result.provenance.write[DATASET_NAME].identifier
        self.assertTrue(
            result.provenance.read is None or len(result.provenance.read) == 0)
        self.assertIsNotNone(result.provenance.resources)
        self.assertEqual(result.provenance.resources[RESOURCE_DATASET], dataset_id)

    def load_dataset(self):
        """Load a single dataset and return the resulting database state."""
        fh = self.filestore.upload_file(CSV_FILE)
        cmd = vizual.load_dataset(
            dataset_name=DATASET_NAME,
            file={pckg.FILE_ID: fh.identifier},
            validate=True)
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={}))
        return result.provenance.write

    def test_delete_column(self):
        """Test functionality to delete a column."""
        cmd = vizual.delete_column(
            dataset_name=DATASET_NAME, column=1, validate=True)
        self.validate_command(cmd)

    def test_delete_row(self):
        """Test functionality to delete a row."""
        cmd = vizual.delete_row(dataset_name=DATASET_NAME, row=1, validate=True)
        self.validate_command(cmd)

    def test_drop_dataset(self):
        """Test functionality to drop a dataset."""
        cmd = vizual.drop_dataset(dataset_name=DATASET_NAME, validate=True)
        datasets = self.load_dataset()
        dataset_id = datasets[DATASET_NAME]
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={DATASET_NAME: dataset_id}))
        self.assertFalse(DATASET_NAME in result.provenance.read)
        self.assertTrue(DATASET_NAME in result.provenance.delete)
        self.assertFalse(DATASET_NAME in result.provenance.write)

    def test_filter_columns(self):
        """Test projection of a dataset."""
        # Create a new dataset
        cmd = vizual.projection(
            dataset_name=DATASET_NAME,
            columns=[{
                'column': 1
            }, {
                'column': 2,
                'name': 'MyName'
            }],
            validate=True)
        self.validate_command(cmd)

    def test_insert_column(self):
        """Test functionality to insert a column."""
        cmd = vizual.insert_column(
            dataset_name=DATASET_NAME,
            position=1,
            name='My Col',
            validate=True)
        self.validate_command(cmd)

    def test_insert_row(self):
        """Test functionality to insert a row."""
        # Create a new dataset
        cmd = vizual.insert_row(
            dataset_name=DATASET_NAME, position=1, validate=True)
        self.validate_command(cmd)

    def test_load_dataset(self):
        """Test functionality to load a dataset."""
        # Create a new dataset
        fh = self.filestore.upload_file(CSV_FILE)
        cmd = vizual.load_dataset(
            dataset_name='ABC',
            file={pckg.FILE_ID: fh.identifier},
            validate=True)
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={}))
        self.assertIsNotNone(result.provenance.write)
        self.assertTrue('abc' in result.provenance.write)
        dataset_id = result.provenance.write['abc'].identifier
        self.assertTrue(
            result.provenance.read is None or len(result.provenance.read) == 0)
        self.assertIsNotNone(result.provenance.resources)
        self.assertEqual(result.provenance.resources[RESOURCE_DATASET], dataset_id)
        # Running load again will not change the dataset identifier
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={},
                resources=result.provenance.resources))
        self.assertEqual(result.provenance.write['abc'].identifier, dataset_id)
        self.assertEqual(result.provenance.resources[RESOURCE_DATASET], dataset_id)

    def test_move_column(self):
        """Test functionality to move a column."""
        cmd = vizual.move_column(
            dataset_name=DATASET_NAME, column=0, position=1, validate=True)
        self.validate_command(cmd)

    def test_move_row(self):
        """Test functionality to move a row."""
        cmd = vizual.move_row(
            dataset_name=DATASET_NAME, row=0, position=1, validate=True)
        self.validate_command(cmd)

    def test_rename_column(self):
        """Test functionality to rename a column."""
        cmd = vizual.rename_column(
            dataset_name=DATASET_NAME, column=1, name='The col', validate=True)
        self.validate_command(cmd)

    def test_rename_dataset(self):
        """Test functionality to rename a dataset."""
        cmd = vizual.rename_dataset(
            dataset_name=DATASET_NAME, new_name='XYZ', validate=True)
        datasets = self.load_dataset()
        dataset_id = datasets[DATASET_NAME]
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={DATASET_NAME: dataset_id}))
        self.assertFalse(DATASET_NAME in result.provenance.write)
        self.assertFalse(DATASET_NAME in result.provenance.read)
        self.assertTrue(DATASET_NAME in result.provenance.delete)
        self.assertTrue('xyz' in result.provenance.write)

    def test_sort_dataset(self):
        """Test sorting a dataset."""
        cmd = vizual.sort_dataset(
            dataset_name=DATASET_NAME,
            columns=[{
                'column': 1,
                'order': 'Z-A'
            }, {
                'column': 2,
                'order': 'A-Z'
            }],
            validate=True)
        self.validate_command(cmd)

    def test_update_cell(self):
        """Test functionality to update a dataset cell."""
        # Create a new dataset
        datasets = self.load_dataset()
        dataset = self.datastore.get_dataset(datasets[DATASET_NAME].identifier)
        row_ids = [row.identifier for row in dataset.fetch_rows()]
        cmd = vizual.update_cell(
            dataset_name=DATASET_NAME,
            column=1,
            row=row_ids[0],
            value=9,
            validate=True)
        self.validate_command(cmd, dataset=dataset)

    def validate_command(self, cmd, dataset=None):
        """Validate execution of the given command."""
        if dataset is None:
            datasets = self.load_dataset()
            dataset = datasets[DATASET_NAME]
        result = self.processor.compute(
            command_id=cmd.command_id,
            arguments=cmd.arguments,
            context=TaskContext(
                project_id=5,
                datastore=self.datastore,
                filestore=self.filestore,
                artifacts={DATASET_NAME: dataset}))
        self.assertNotEqual(
            result.provenance.write[DATASET_NAME].identifier,
            dataset.identifier)
        self.assertIsNotNone(result.provenance.read)
        self.assertEqual(result.provenance.read[DATASET_NAME], dataset.identifier)
        self.assertIsNotNone(result.provenance.write)
        with self.assertRaises(ValueError):
            result = self.processor.compute(
                command_id=cmd.command_id,
                arguments=cmd.arguments,
                context=TaskContext(
                    project_id=5,
                    datastore=self.datastore,
                    filestore=self.filestore,
                    artifacts={}))
class TestDefaultVizualApi(unittest.TestCase): api: MimirVizualApi def setUp(self): """Create an instance of the Mimir vizual API for an empty server directory. """ # Drop directory if it exists if os.path.isdir(SERVER_DIR): shutil.rmtree(SERVER_DIR) os.makedirs(SERVER_DIR) self.api = MimirVizualApi() self.datastore = MimirDatastore(DATASTORE_DIR) self.filestore = FileSystemFilestore(FILESTORE_DIR) def tearDown(self): """Clean-up by dropping the server directory. """ if os.path.isdir(SERVER_DIR): shutil.rmtree(SERVER_DIR) def test_api(self): """Run all tests after we initialize mimir. Make sure to create a fresh environment after each test. """ self.delete_column() self.setUp() self.delete_row() self.setUp() self.filter_columns() self.setUp() self.insert_column() self.setUp() self.insert_row() self.setUp() self.load_dataset() self.setUp() self.move_column() self.setUp() self.move_row() self.setUp() self.rename_column() self.setUp() self.sequence_of_steps() self.setUp() self.sort_dataset() self.setUp() self.update_cell() def delete_column(self): """Test functionality to delete a column.""" # Create a new dataset fh = self.filestore.upload_file(CSV_FILE) ds = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id=fh.identifier).dataset ds_rows = ds.fetch_rows() # Keep track of the row identifiers row_ids = [row.identifier for row in ds_rows] # Delete Age column col_id = ds.column_by_name('AGE').identifier result = self.api.delete_column(ds.identifier, col_id, self.datastore) # Resulting dataset should differ from previous one self.assertNotEqual(result.dataset.identifier, ds.identifier) # Retrieve the modified dataset and ensure that it contains the following # # Name, Salary # ------------ # Alice, 35K # Bob, 30K ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() # Schema is Name, Salary self.assertEqual(len(ds.columns), 2) self.assertEqual(ds.columns[0].name.upper(), 'NAME') self.assertEqual(ds.columns[1].name.upper(), 'SALARY') # Make sure that all rows only have two columns row = ds_rows[0] self.assertEqual(len(row.values), 2) self.assertEqual(row.values[0], 'Alice') self.assertEqual(row.values[1], '35K') row = ds_rows[1] self.assertEqual(len(row.values), 2) self.assertEqual(row.values[0], 'Bob') self.assertEqual(row.values[1], '30K') # Ensure that row identifiers haven't changed for i in range(len(ds_rows)): self.assertEqual(ds_rows[i].identifier, row_ids[i]) # Ensure exception is thrown if dataset identifier is unknown with self.assertRaises(MimirError): self.api.delete_column('unknown:uri', 0, self.datastore) # Ensure exception is thrown if column identifier is unknown with self.assertRaises(ValueError): self.api.delete_column(ds.identifier, 100, self.datastore) def delete_row(self): """Test functionality to delete a row.""" # Create a new dataset fh = self.filestore.upload_file(CSV_FILE) ds = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id=fh.identifier).dataset ds_rows = ds.fetch_rows() # Keep track of column and row identifiers col_ids = [col.identifier for col in ds.columns] row_ids = [row.identifier for row in ds_rows] # Delete second row result = self.api.delete_row(ds.identifier, row_ids[1], self.datastore) del row_ids[1] # Resulting dataset should differ from previous one self.assertNotEqual(result.dataset.identifier, ds.identifier) # Retrieve modified dataset and ensure that it contains the following # data:
# # Name, Age, Salary # ------------ # Alice, 23, 35K ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() # Schema is Name, Age, Salary col_names = ['Name', 'Age', 'Salary'] self.assertEqual(len(ds.columns), len(col_names)) for i in range(len(ds.columns)): self.assertEqual(ds.columns[i].name.upper(), col_names[i].upper()) # Make sure column identifiers haven't changed for i in range(len(ds.columns)): self.assertEqual(ds.columns[i].identifier, col_ids[i]) # There should only be one row self.assertEqual(len(ds_rows), 1) # Ensure that row identifiers haven't changed for i in range(len(ds_rows)): self.assertEqual(ds_rows[i].identifier, row_ids[i]) # Ensure exception is thrown if dataset is unknown with self.assertRaises(MimirError): self.api.delete_row('unknown:uri', 0, self.datastore) def filter_columns(self): """Test projection of a dataset.""" # Create a new dataset fh = self.filestore.upload_file(CSV_FILE) ds = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id=fh.identifier).dataset result = self.api.filter_columns(ds.identifier, [2, 0], ['BD', None], self.datastore) ds = self.datastore.get_dataset(result.dataset.identifier) self.assertEqual(len(ds.columns), 2) self.assertEqual(ds.columns[0].name.upper(), 'BD') self.assertEqual(ds.columns[1].name.upper(), 'NAME') rows = ds.fetch_rows() self.assertEqual(rows[0].values, ['35K', 'Alice']) self.assertEqual(rows[1].values, ['30K', 'Bob']) def insert_column(self): """Test functionality to insert a column.""" # Create a new dataset fh = self.filestore.upload_file(CSV_FILE) ds = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id=fh.identifier).dataset ds_rows = ds.fetch_rows() # Keep track of column and row identifiers col_ids = [col.identifier for col in ds.columns] row_ids = [row.identifier for row in ds_rows] # Insert column at position 1 col_ids.insert(1, ds.max_column_id() + 1) result = self.api.insert_column(ds.identifier, 1, 'Height', self.datastore) # Resulting dataset should differ from previous one self.assertNotEqual(result.dataset.identifier, ds.identifier) # Retrieve dataset and ensure that it has the following schema: # Name, Height, Age, Salary ds = self.datastore.get_dataset(result.dataset.identifier) col_names = ['Name', 'Height', 'Age', 'Salary'] # Ensure that there are four columns self.assertEqual(len(ds.columns), len(col_names)) for i in range(len(col_names)): col = ds.columns[i] self.assertEqual(col.name.upper(), col_names[i].upper()) # Insert column at last position col_ids.append(ds.max_column_id() + 1) col_names.append('Weight') result = self.api.insert_column(ds.identifier, 4, 'Weight', self.datastore) # Resulting dataset should differ from previous one self.assertNotEqual(result.dataset.identifier, ds.identifier) # Retrieve dataset and ensure that it has the following schema: # Name, Height, Age, Salary, Weight ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() # Ensure that there are five columns self.assertEqual(len(ds.columns), len(col_names)) for i in range(len(col_names)): col = ds.columns[i] self.assertEqual(col.name.upper(), col_names[i].upper()) # The cell values for the new columns are None; all other values are not None for row in ds_rows: for i in range(len(ds.columns)): if i == 1 or i == 4: self.assertIsNone(row.values[i]) else: self.assertTrue(row.values[i]) # Ensure that row identifiers haven't changed for i in range(len(ds_rows)): self.assertEqual(ds_rows[i].identifier,
row_ids[i]) # Ensure exception is thrown if dataset identifier is unknown with self.assertRaises(MimirError): self.api.insert_column('unknown:uri', 1, 'Height', self.datastore) # A valid column name is accepted; an invalid column name raises an exception self.api.insert_column(ds.identifier, 1, 'Height_from_ground', self.datastore) with self.assertRaises(ValueError): self.api.insert_column(ds.identifier, 1, 'Height from ground!@#', self.datastore) # Ensure exception is thrown if column position is out of bounds with self.assertRaises(ValueError): self.api.insert_column(ds.identifier, 100, 'Height', self.datastore) def insert_row(self): """Test functionality to insert a row.""" fh = self.filestore.upload_file(CSV_FILE) ds = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id=fh.identifier).dataset # Keep track of the row identifiers ds_rows = ds.fetch_rows() row_ids = [row.identifier for row in ds_rows] # Insert row at index position 1 row_ids.insert(1, None) # Result should indicate that one row was inserted. The identifier of # the resulting dataset should differ from the identifier of the # original dataset result = self.api.insert_row(ds.identifier, 1, self.datastore) # Resulting dataset should differ from previous one self.assertNotEqual(result.dataset.identifier, ds.identifier) # Retrieve modified dataset ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() # Ensure that there are three rows self.assertEqual(len(ds_rows), 3) # The second row has empty values for each column row = ds_rows[1] self.assertEqual(len(row.values), len(ds.columns)) for i in range(len(ds.columns)): self.assertIsNone(row.values[i]) # Append row at the end of the current dataset row_ids.append(None) result = self.api.insert_row(ds.identifier, 3, self.datastore) # Resulting dataset should differ from previous one self.assertNotEqual(result.dataset.identifier, ds.identifier) ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() # Ensure that there are four rows self.assertEqual(len(ds_rows), 4) # The next to last row has non-empty values for each column row = ds_rows[2] self.assertEqual(len(row.values), len(ds.columns)) for i in range(len(ds.columns)): self.assertIsNotNone(row.values[i]) # The last row has empty values for each column row = ds_rows[3] self.assertEqual(len(row.values), len(ds.columns)) for i in range(len(ds.columns)): self.assertIsNone(row.values[i]) # Ensure that row ids haven't changed # ## July 16, 2020 by OK: Bug in mimir that is going to take a bunch of # ## heavy lifting to fix: https://github.com/UBOdin/mimir-api/issues/11 # for i in range(len(ds_rows)): # if row_ids[i] is not None: # self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i])) # Ensure exception is thrown if dataset identifier is unknown with self.assertRaises(MimirError): self.api.insert_row('unknown:uri', 1, self.datastore) # Ensure no exception is raised self.api.insert_row(ds.identifier, 4, self.datastore) def load_dataset(self) -> None: """Test functionality to load a dataset.""" # Create a new dataset fh = self.filestore.upload_file(CSV_FILE) result = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id=fh.identifier) ds = result.dataset resources = result.resources assert isinstance(ds, DatasetHandle) ds_rows = ds.fetch_rows() self.assertEqual(len(ds.columns), 3) self.assertEqual(len(ds_rows), 2) for row in ds_rows: self.assertTrue(isinstance(row.values[1], int)) self.assertIsNotNone(resources)
self.assertEqual(resources[RESOURCE_FILEID], fh.identifier) self.assertEqual(resources[RESOURCE_DATASET], ds.identifier) # Delete the file handle; loading the same file again should raise an exception self.filestore.delete_file(fh.identifier) with self.assertRaises(ValueError): self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id=fh.identifier) # Ensure exception is thrown if the file identifier is unknown with self.assertRaises(ValueError): self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id='unknown:uri') # Test loading file from external resource. Skip if DOWNLOAD_URL is None if DOWNLOAD_URL is None: print('Skipping download test') return result = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, url=DOWNLOAD_URL, options=[{ 'delimiter': '\t' }]) ds = result.dataset resources = result.resources ds_rows = ds.fetch_rows() self.assertEqual(len(ds.columns), 4) self.assertEqual(len(ds_rows), 54) self.assertIsNotNone(resources) self.assertEqual(resources[RESOURCE_URL], DOWNLOAD_URL) self.assertEqual(resources[RESOURCE_DATASET], ds.identifier) # Simulate re-running without downloading again. Set the URL to a fake # value that would raise an exception if an attempt were made to download it url = 'some fake uri' resources[RESOURCE_URL] = url result = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, url=url, resources=resources) self.assertEqual(result.dataset.identifier, ds.identifier) prev_id = result.dataset.identifier # If we re-run with the reload flag set to True, a new dataset should be returned resources[RESOURCE_URL] = DOWNLOAD_URL result = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, url=DOWNLOAD_URL, resources=resources, reload=True, options=[{ 'delimiter': '\t' }]) self.assertNotEqual(result.dataset.identifier, prev_id) def move_column(self): """Test functionality to move a column.""" # Create a new dataset fh = self.filestore.upload_file(CSV_FILE) ds = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id=fh.identifier).dataset ds_rows = ds.fetch_rows() # Keep track of column and row identifiers col_ids = [col.identifier for col in ds.columns] row_ids = [row.identifier for row in ds_rows] # Swap first two columns c = col_ids[0] del col_ids[0] col_ids.insert(1, c) result = self.api.move_column(ds.identifier, ds.column_by_name('Name').identifier, 1, self.datastore) self.assertNotEqual(result.dataset.identifier, ds.identifier) ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() self.assertEqual(ds.columns[0].name.upper(), 'Age'.upper()) self.assertEqual(ds.columns[1].name.upper(), 'Name'.upper()) self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper()) row = ds_rows[0] self.assertEqual(row.values[0], 23) self.assertEqual(row.values[1], 'Alice') self.assertEqual(row.values[2], '35K') row = ds_rows[1] self.assertEqual(row.values[0], 32) self.assertEqual(row.values[1], 'Bob') self.assertEqual(row.values[2], '30K') # Ensure that row ids haven't changed for i in range(len(ds_rows)): self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i])) # Swap last two columns c = col_ids[1] del col_ids[1] col_ids.append(c) result = self.api.move_column(ds.identifier, ds.column_by_name('Salary').identifier, 1, self.datastore) ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() self.assertEqual(ds.columns[0].name.upper(), 'Age'.upper()) self.assertEqual(ds.columns[1].name.upper(),
'Salary'.upper()) self.assertEqual(ds.columns[2].name.upper(), 'Name'.upper()) row = ds_rows[0] self.assertEqual(row.values[0], 23) self.assertEqual(row.values[1], '35K') self.assertEqual(row.values[2], 'Alice') row = ds_rows[1] self.assertEqual(row.values[0], 32) self.assertEqual(row.values[1], '30K') self.assertEqual(row.values[2], 'Bob') # Ensure that row ids haven't changed for i in range(len(ds_rows)): self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i])) # No changes if source and target position are the same result = self.api.move_column(ds.identifier, ds.columns[1].identifier, 1, self.datastore) self.assertEqual(ds.identifier, result.dataset.identifier) # Ensure exception is thrown if dataset identifier is unknown with self.assertRaises(MimirError): self.api.move_column('unknown:uri', 0, 1, self.datastore) # Raise error if source column is out of bounds with self.assertRaises(ValueError): self.api.move_column(ds.identifier, 40, 1, self.datastore) # Raise error if target position is out of bounds with self.assertRaises(ValueError): self.api.move_column(ds.identifier, ds.column_by_name('Name').identifier, -1, self.datastore) with self.assertRaises(ValueError): self.api.move_column(ds.identifier, ds.column_by_name('Name').identifier, 4, self.datastore) def move_row(self): """Test functionality to move a row.""" # Create a new dataset fh = self.filestore.upload_file(CSV_FILE) ds = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id=fh.identifier).dataset ds_rows = ds.fetch_rows() # Keep track of column and row identifier row_ids = [row.identifier for row in ds_rows] # Swap first two rows result = self.api.move_row(ds.identifier, row_ids[0], 1, self.datastore) row_ids = [row for row in reversed(row_ids)] self.assertNotEqual(result.dataset.identifier, ds.identifier) ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() self.assertEqual(ds.columns[0].name.upper(), 'Name'.upper()) self.assertEqual(ds.columns[1].name.upper(), 'Age'.upper()) self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper()) row = ds_rows[0] self.assertEqual(row.values[0], 'Bob') self.assertEqual(row.values[1], 32) self.assertEqual(row.values[2], '30K') row = ds_rows[1] self.assertEqual(row.values[0], 'Alice') self.assertEqual(row.values[1], 23) self.assertEqual(row.values[2], '35K') # Ensure that row ids haven't changed for i in range(len(ds_rows)): self.assertEqual(ds_rows[i].identifier, row_ids[i]) # Swap last two rows result = self.api.move_row(ds.identifier, row_ids[1], 0, self.datastore) row_ids = [row for row in reversed(row_ids)] ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() self.assertEqual(ds.columns[0].name.upper(), 'Name'.upper()) self.assertEqual(ds.columns[1].name.upper(), 'Age'.upper()) self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper()) row = ds_rows[0] self.assertEqual(row.values[0], 'Alice') self.assertEqual(row.values[1], 23) self.assertEqual(row.values[2], '35K') row = ds_rows[1] self.assertEqual(row.values[0], 'Bob') self.assertEqual(row.values[1], 32) self.assertEqual(row.values[2], '30K') # Ensure that row ids haven't changed for i in range(len(ds_rows)): self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i])) # Move first row to the end result = self.api.move_row(ds.identifier, row_ids[0], 2, self.datastore) row_ids = [row for row in reversed(row_ids)] ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() row = ds_rows[0] 
self.assertEqual(row.values[0], 'Bob') self.assertEqual(row.values[1], 32) self.assertEqual(row.values[2], '30K') row = ds_rows[1] self.assertEqual(row.values[0], 'Alice') self.assertEqual(row.values[1], 23) self.assertEqual(row.values[2], '35K') # Ensure that row ids haven't changed # ## July 16, 2020 by OK: Bug in mimir that is going to take a bunch of # ## heavy lifting to fix: https://github.com/UBOdin/mimir-api/issues/11 # for i in range(len(ds_rows)): # self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i])) # No changes if source and target position are the same result = self.api.move_row(ds.identifier, row_ids[1], 1, self.datastore) # ## July 21, 2020 by OK: It would be fantastic if we could easily detect # no-op vizual, but for now skip this check #self.assertEqual(ds.identifier, result.dataset.identifier) # Ensure exception is thrown if dataset identifier is unknown with self.assertRaises(MimirError): self.api.move_row('unknown:uri', 0, 1, self.datastore) # Raise error if target position is out of bounds # ## July 21, 2020 by OK: Skipping this check for now # with self.assertRaises(ValueError): # self.api.move_row(ds.identifier, 0, -1, self.datastore) # with self.assertRaises(ValueError): # self.api.move_row(ds.identifier, 1, 4, self.datastore) def rename_column(self): """Test functionality to rename a column.""" # Create a new dataset fh = self.filestore.upload_file(CSV_FILE) ds = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id=fh.identifier).dataset ds_rows = ds.fetch_rows() # Keep track of column and row identifier row_ids = [row.identifier for row in ds_rows] # Rename first column to Firstname result = self.api.rename_column(ds.identifier, ds.column_by_name('Name').identifier, 'Firstname', self.datastore) self.assertNotEqual(result.dataset.identifier, ds.identifier) ds = self.datastore.get_dataset(result.dataset.identifier) self.assertEqual(ds.columns[0].name.upper(), 'Firstname'.upper()) self.assertEqual(ds.columns[1].name.upper(), 'Age'.upper()) self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper()) result = self.api.rename_column(ds.identifier, ds.column_by_name('Age').identifier, 'BDate', self.datastore) ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() self.assertEqual(ds.columns[0].name.upper(), 'Firstname'.upper()) self.assertEqual(ds.columns[1].name, 'BDate') self.assertEqual(ds.columns[2].name.upper(), 'Salary'.upper()) # Ensure that row ids haven't changed for i in range(len(ds_rows)): self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i])) # No changes if the old and new column name are the same (with exception # to upper and lower cases). 
result = self.api.rename_column(ds.identifier, ds.column_by_name('BDate').identifier, 'BDate', self.datastore) # ## July 21, 2020 by OK: It would be fantastic if we could easily detect # no-op vizual, but for now skip this check # self.assertEqual(ds.identifier, result.dataset.identifier) # Ensure exception is thrown if dataset identifier is unknown with self.assertRaises(MimirError): self.api.rename_column('unknown:uri', 0, 'Firstname', self.datastore) # Ensure exception is thrown for invalid column id with self.assertRaises(ValueError): self.api.rename_column(ds.identifier, 500, 'BDate', self.datastore) def sequence_of_steps(self): """Test sequence of calls that modify a dataset.""" # Create a new dataset fh = self.filestore.upload_file(CSV_FILE) ds = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id=fh.identifier).dataset ds = self.api.insert_row(ds.identifier, 1, self.datastore).dataset row_ids = [row.identifier for row in ds.fetch_rows()] row0 = row_ids[0] row1 = row_ids[1] row2 = row_ids[2] ds = self.api.insert_column(ds.identifier, 3, 'HDate', self.datastore).dataset ds = self.api.update_cell(ds.identifier, ds.column_by_name('HDate').identifier, row0, '180', self.datastore).dataset ds = self.api.update_cell(ds.identifier, ds.column_by_name('HDate').identifier, row2, '160', self.datastore).dataset ds = self.api.rename_column(ds.identifier, ds.column_by_name('HDate').identifier, 'Height', self.datastore).dataset ds = self.api.update_cell(ds.identifier, ds.column_by_name('Height').identifier, row1, '170', self.datastore).dataset ds = self.api.move_row(ds.identifier, row1, 2, self.datastore).dataset ds = self.api.update_cell(ds.identifier, ds.column_by_name('Name').identifier, row2, 'Carla', self.datastore).dataset ds = self.api.update_cell(ds.identifier, ds.column_by_name('Age').identifier, row2, '45', self.datastore).dataset ds = self.api.update_cell(ds.identifier, ds.column_by_name('Salary').identifier, row2, '56K', self.datastore).dataset ds = self.api.move_column(ds.identifier, ds.column_by_name('Salary').identifier, 4, self.datastore).dataset ds = self.api.delete_column(ds.identifier, ds.column_by_name('Age').identifier, self.datastore).dataset ds = self.api.delete_row(ds.identifier, row0, self.datastore).dataset ds = self.api.delete_row(ds.identifier, row1, self.datastore).dataset ds = self.datastore.get_dataset(ds.identifier) ds_rows = ds.fetch_rows() names = ['Name', 'Height', 'Salary'] self.assertEqual(len(ds.columns), len(names)) for i in range(len(names)): col = ds.columns[i] self.assertEqual(col.name.upper(), names[i].upper()) self.assertEqual(len(ds_rows), 1) self.assertEqual(ds_rows[0].values, ['Carla', '160', '56K']) def sort_dataset(self): """Test sorting a dataset.""" # Create a new dataset fh = self.filestore.upload_file(SORT_FILE) ds = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id=fh.identifier).dataset result = self.api.sort_dataset(ds.identifier, [1, 2, 0], [False, False, True], self.datastore) ds = self.datastore.get_dataset(result.dataset.identifier) rows = ds.fetch_rows() names = ['Alice', 'Bob', 'Dave', 'Gertrud', 'Frank'] result = list() for row in rows: name = row.values[0] if name in names: result.append(name) for i in range(len(names)): self.assertEqual(names[i], result[i]) result = self.api.sort_dataset(ds.identifier, [2, 1, 0], [True, False, True], self.datastore) ds = self.datastore.get_dataset(result.dataset.identifier) rows = ds.fetch_rows() names = ['Gertrud', 'Frank', 'Bob', 'Alice', 
'Dave'] result = list() for row in rows: name = row.values[0] if name in names: result.append(name) for i in range(len(names)): self.assertEqual(names[i], result[i]) # Raises error for invalid column identifier with self.assertRaises(ValueError): self.api.sort_dataset(ds.identifier, [2, 10, 0], [True, False, True], self.datastore) def update_cell(self): """Test functionality to update a dataset cell.""" # Create a new dataset fh = self.filestore.upload_file(CSV_FILE) ds = self.api.load_dataset(datastore=self.datastore, filestore=self.filestore, file_id=fh.identifier).dataset ds_rows = ds.fetch_rows() # Keep track of column and row identifier row_ids = [row.identifier for row in ds_rows] # Update cell [0, 0]. Ensure that one row was updated and a new # identifier is generated. Also ensure that the resulting datasets # has the new value in cell [0, 0] row_id = row_ids[0] result = self.api.update_cell(ds.identifier, 0, row_id, 'MyValue', self.datastore) self.assertNotEqual(ds.identifier, result.dataset.identifier) ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() row = None for r in ds.fetch_rows(): if r.identifier == row_id: row = r break self.assertEqual(row.values[0], 'MyValue') result = self.api.update_cell(ds.identifier, ds.column_by_name('Name').identifier, row_id, 'AValue', self.datastore) ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() row = None for r in ds.fetch_rows(): if r.identifier == row_id: row = r break self.assertEqual(row.values[0], 'AValue') self.assertEqual(row.values[ds.column_index('Name')], 'AValue') # Ensure that row ids haven't changed for i in range(len(ds_rows)): self.assertEqual(int(ds_rows[i].identifier), int(row_ids[i])) # Set value to None result = self.api.update_cell(ds.identifier, ds.column_by_name('Name').identifier, row_id, None, self.datastore) ds = self.datastore.get_dataset(result.dataset.identifier) ds_rows = ds.fetch_rows() row = None for r in ds.fetch_rows(): if r.identifier == row_id: row = r break self.assertIsNone(row.values[0]) self.assertIsNone(row.values[ds.column_index('Name')]) # Ensure exception is thrown if dataset is unknown with self.assertRaises(MimirError): self.api.update_cell('unknown:uri', 0, 0, 'MyValue', self.datastore)
class TestDefaultPyCellProcessor(unittest.TestCase): def setUp(self): """Create instances of the default datastore and filestore.""" # Drop directory if it exists if os.path.isdir(SERVER_DIR): shutil.rmtree(SERVER_DIR) os.makedirs(SERVER_DIR) self.datastore = FileSystemDatastore(DATASTORE_DIR) self.filestore = FileSystemFilestore(FILESTORE_DIR) def tearDown(self): """Clean-up by dropping the server directory. """ if os.path.isdir(SERVER_DIR): shutil.rmtree(SERVER_DIR) def test_create_dataset_script(self): """Test running a script that creates a new dataset.""" cmd = python_cell(source=CREATE_DATASET_PY, validate=True) result = PyCellTaskProcessor().compute(command_id=cmd.command_id, arguments=cmd.arguments, context=TaskContext( datastore=self.datastore, filestore=self.filestore)) self.assertTrue(result.is_success) self.assertIsNotNone(result.provenance.read) self.assertIsNotNone(result.provenance.write) self.assertEqual(len(result.provenance.read), 0) self.assertEqual(len(result.provenance.write), 1) self.assertTrue('people' in result.provenance.write) self.assertIsNotNone(result.provenance.write['people']) self.assertEqual(len(result.outputs.stdout), 1) self.assertEqual(len(result.outputs.stderr), 0) self.assertEqual(result.outputs.stdout[0].value, 'Alice\nBob') def test_print_dataset_script(self): """Test running a script that prints the rows of an existing dataset.""" fh = self.filestore.upload_file(CSV_FILE) ds = self.datastore.load_dataset(fh) cmd = python_cell(source=PRINT_DATASET_PY, validate=True) result = PyCellTaskProcessor().compute( command_id=cmd.command_id, arguments=cmd.arguments, context=TaskContext(datastore=self.datastore, filestore=self.filestore, datasets={'people': ds.identifier})) self.assertTrue(result.is_success) self.assertIsNotNone(result.provenance.read) self.assertIsNotNone(result.provenance.write) self.assertEqual(len(result.provenance.read), 1) self.assertEqual(len(result.provenance.write), 0) self.assertTrue('people' in result.provenance.read) self.assertIsNotNone(result.provenance.read['people']) self.assertEqual(len(result.outputs.stdout), 1) self.assertEqual(len(result.outputs.stderr), 0) self.assertEqual(result.outputs.stdout[0].value, 'Alice\nBob') def test_simple_script(self): """Test running a simple Python script.""" cmd = python_cell(source='print(2+2)', validate=True) result = PyCellTaskProcessor().compute(command_id=cmd.command_id, arguments=cmd.arguments, context=TaskContext( datastore=self.datastore, filestore=self.filestore, datasets=dict())) self.assertTrue(result.is_success) self.assertEqual(result.outputs.stdout[0].value, '4') def test_unknown_dataset_script(self): """Test running a script that accesses an unknown dataset.""" fh = self.filestore.upload_file(CSV_FILE) ds = self.datastore.load_dataset(fh) cmd = python_cell(source=PRINT_UNKNOWN_DATASET_PY, validate=True) result = PyCellTaskProcessor().compute( command_id=cmd.command_id, arguments=cmd.arguments, context=TaskContext(datastore=self.datastore, filestore=self.filestore, datasets={'people': ds.identifier})) self.assertFalse(result.is_success) self.assertIsNone(result.provenance.read) self.assertIsNone(result.provenance.write) self.assertEqual(len(result.outputs.stdout), 0) self.assertEqual(len(result.outputs.stderr), 1) # Running a similar script that catches the error should be a success # and the access to the dataset should be recorded in the resulting # read provenance cmd = python_cell(source=PRINT_UNKNOWN_DATASET_PY_WITH_TRY_CATCH, validate=True) result =
PyCellTaskProcessor().compute( command_id=cmd.command_id, arguments=cmd.arguments, context=TaskContext(datastore=self.datastore, filestore=self.filestore, datasets={'people': ds.identifier})) self.assertTrue(result.is_success) self.assertIsNotNone(result.provenance.read) self.assertIsNotNone(result.provenance.write) self.assertEqual(len(result.provenance.read), 1) self.assertEqual(len(result.provenance.write), 0) self.assertTrue('employees' in result.provenance.read) self.assertIsNone(result.provenance.read['employees']) self.assertEqual(len(result.outputs.stdout), 1) self.assertEqual(len(result.outputs.stderr), 0)
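# A minimal sketch of a standard unittest entry point, added on the assumption that these test classes are meant to be runnable directly as a module; drop it if the original file already defines one outside this excerpt. if __name__ == '__main__': unittest.main()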