def test_viztrail_repository(self):
    """Test basic functionality of managing viztrails in the FS repository.

    Creates viztrails for the default and the Mimir environment, verifies
    the registered command packages, and checks that all state survives a
    repository re-load and that deletion behaves as expected.
    """
    repo = FileSystemViztrailRepository(VIZTRAIL_DIR, repos)
    self.assertEqual(len(repo.list_viztrails()), 0)
    # Create two viztrails: the default environment has no Mimir package,
    # the Mimir environment has all three packages.
    vt1 = repo.create_viztrail(ENGINEENV_DEFAULT, {'name': 'Name A'})
    self.assertEqual(vt1.properties['name'], 'Name A')
    self.assertIn(PACKAGE_VIZUAL, vt1.command_repository)
    self.assertIn(PACKAGE_PYTHON, vt1.command_repository)
    self.assertNotIn(PACKAGE_MIMIR, vt1.command_repository)
    vt2 = repo.create_viztrail(ENGINEENV_MIMIR, {'name': 'Name B'})
    self.assertEqual(vt2.properties['name'], 'Name B')
    self.assertIn(PACKAGE_VIZUAL, vt2.command_repository)
    self.assertIn(PACKAGE_PYTHON, vt2.command_repository)
    self.assertIn(PACKAGE_MIMIR, vt2.command_repository)
    self.assertEqual(len(repo.list_viztrails()), 2)
    # Re-load the repository to ensure that all information was persisted
    repo = FileSystemViztrailRepository(VIZTRAIL_DIR, repos)
    self.assertEqual(len(repo.list_viztrails()), 2)
    vt1 = repo.get_viztrail(vt1.identifier)
    self.assertEqual(vt1.properties['name'], 'Name A')
    self.assertIn(PACKAGE_VIZUAL, vt1.command_repository)
    self.assertIn(PACKAGE_PYTHON, vt1.command_repository)
    self.assertNotIn(PACKAGE_MIMIR, vt1.command_repository)
    vt2 = repo.get_viztrail(vt2.identifier)
    self.assertEqual(vt2.properties['name'], 'Name B')
    self.assertIn(PACKAGE_VIZUAL, vt2.command_repository)
    self.assertIn(PACKAGE_PYTHON, vt2.command_repository)
    self.assertIn(PACKAGE_MIMIR, vt2.command_repository)
    # Delete the first viztrail
    self.assertTrue(repo.delete_viztrail(vt1.identifier))
    # Re-load the repository; only the second viztrail should remain
    repo = FileSystemViztrailRepository(VIZTRAIL_DIR, repos)
    self.assertEqual(len(repo.list_viztrails()), 1)
    self.assertIsNone(repo.get_viztrail(vt1.identifier))
    self.assertIsNotNone(repo.get_viztrail(vt2.identifier))
    vt2 = repo.list_viztrails()[0]
    self.assertEqual(vt2.properties['name'], 'Name B')
    self.assertIn(PACKAGE_VIZUAL, vt2.command_repository)
    self.assertIn(PACKAGE_PYTHON, vt2.command_repository)
    self.assertIn(PACKAGE_MIMIR, vt2.command_repository)
    # Deleting an already-deleted viztrail returns False
    self.assertFalse(repo.delete_viztrail(vt1.identifier))
def test_viztrail_workflow(self):
    """Test basic functionality of retrieving a workflow.

    The default branch of a new viztrail has an empty workflow; lookups
    with unknown branch, viztrail, or workflow version return None. All
    assertions are repeated after re-loading the repository.
    """
    repo = FileSystemViztrailRepository(VIZTRAIL_DIR, repos)
    viztrail = repo.create_viztrail(ENGINEENV_DEFAULT, {'name': 'Name A'})
    self.assertEqual(
        len(repo.get_workflow(viztrail.identifier, DEFAULT_BRANCH).modules), 0)
    # Unknown branch / viztrail / workflow version all yield None
    self.assertIsNone(repo.get_workflow(viztrail.identifier, 'unknown'))
    self.assertIsNone(repo.get_workflow('unknown', DEFAULT_BRANCH))
    self.assertIsNone(repo.get_workflow(
        viztrail_id=viztrail.identifier,
        branch_id=DEFAULT_BRANCH,
        workflow_version=10))
    # Re-load repository and repeat previous assertions
    repo = FileSystemViztrailRepository(VIZTRAIL_DIR, repos)
    self.assertEqual(
        len(repo.get_workflow(viztrail.identifier, DEFAULT_BRANCH).modules), 0)
    self.assertIsNone(repo.get_workflow(viztrail.identifier, 'unknown'))
    self.assertIsNone(repo.get_workflow('unknown', DEFAULT_BRANCH))
    self.assertIsNone(repo.get_workflow(
        viztrail_id=viztrail.identifier,
        branch_id=DEFAULT_BRANCH,
        workflow_version=10))
class TestMimirAnnotations(unittest.TestCase):
    """Test cell annotations produced by Mimir lenses."""

    def setUp(self):
        """Create an empty work trails repository."""
        # Create a fresh set of directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)
        self.datastore = MimirDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        # NOTE(review): the vizual engine is constructed but never referenced
        # afterwards — confirm whether the constructor has required side
        # effects before removing this line.
        vizual = MimirVizualEngine(self.datastore, self.fileserver)
        self.db = FileSystemViztrailRepository(
            VIZTRAILS_DIR,
            {ENV.identifier: ENV}
        )

    def tearDown(self):
        """Clean-up by deleting all directories created by setUp."""
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def test_annotations(self):
        """Test cell annotations created by the missing value lens."""
        # Create new work trail and create dataset from CSV file
        mimir.initialize()
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(ENGINE_ID, {'name': 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        # Missing Value Lens on the AGE column
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.mimir_missing_value(
                DS_NAME,
                ds.column_by_name('AGE').identifier
            )
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        # The imputed cell is expected to carry two uncertainty annotations
        annos = ds.get_annotations(column_id=1, row_id=2)
        self.assertEqual(len(annos), 2)
        for anno in annos:
            self.assertEqual(anno.key, ANNO_UNCERTAIN)
        mimir.finalize()
def test_viztrail_branches(self):
    """Test basic functionality of creating a branch.

    Branching an unknown viztrail returns None, branching an empty branch
    raises ValueError, and the master branch provenance carries no source
    information.
    """
    repo = FileSystemViztrailRepository(VIZTRAIL_DIR, repos)
    viztrail = repo.create_viztrail(ENGINEENV_DEFAULT, {'name': 'Name A'})
    # Branching of an unknown viztrail will return None
    self.assertIsNone(
        repo.create_branch('unknown', DEFAULT_BRANCH, {'name': 'My Branch'}))
    # Branching of an empty branch raises a ValueError
    with self.assertRaises(ValueError):
        repo.create_branch(viztrail.identifier, DEFAULT_BRANCH, {'name': 'My Branch'})
    # Re-load repository and repeat previous assertions
    repo = FileSystemViztrailRepository(VIZTRAIL_DIR, repos)
    self.assertIsNone(
        repo.create_branch('unknown', DEFAULT_BRANCH, {'name': 'My Branch'}))
    with self.assertRaises(ValueError):
        repo.create_branch(viztrail.identifier, DEFAULT_BRANCH, {'name': 'My Branch'})
    # The master branch provenance does not contain any information
    prov = repo.get_viztrail(viztrail.identifier).branches[DEFAULT_BRANCH].provenance
    self.assertIsNone(prov.source_branch)
    self.assertLess(prov.workflow_version, 0)
    self.assertLess(prov.module_id, 0)
class TestUnicodeHandling(unittest.TestCase):
    """Run a workflow that creates a dataset containing unicode characters,
    once per execution environment (default and Mimir)."""

    def tearDown(self):
        """Clean-up by deleting all created directories."""
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def set_up(self):
        """Create an empty work trails repository."""
        # Create a fresh set of directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)

    def set_up_default(self):
        """Setup configuration using default Vizual engine."""
        env = ExecEnv(
            FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
            packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON]
        ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
        self.ENGINE_ID = env.identifier
        self.set_up()
        self.datastore = FileSystemDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(
            VIZTRAILS_DIR,
            {env.identifier: env}
        )

    def set_up_mimir(self):
        """Setup configuration using Mimir engine."""
        env = ExecEnv(
            FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
            identifier=ENGINEENV_MIMIR,
            packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON, PACKAGE_MIMIR]
        ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
        self.ENGINE_ID = env.identifier
        self.set_up()
        self.datastore = MimirDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(
            VIZTRAILS_DIR,
            {env.identifier: env}
        )

    def test_vt_default(self):
        """Run workflow with default configuration."""
        self.set_up_default()
        self.run_workflow()

    def test_vt_mimir(self):
        """Run workflow with the Mimir configuration."""
        mimir.initialize()
        self.set_up_mimir()
        self.run_workflow()
        mimir.finalize()

    def run_workflow(self):
        """Execute a Python script that creates a dataset containing unicode
        characters and verify the resulting column names."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        # LOAD DATASET
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        # RUN Python Script
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(PYTHON_SCRIPT)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        if wf.has_error:
            # Dump the module error to help debug failures
            print(wf.modules[-1].stderr)
        self.assertFalse(wf.has_error)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        names = set(c.name.upper().replace('_', ' ') for c in ds.columns)
        # BUG FIX: the original `assertTrue(len(names), 4)` always passed
        # because the second argument of assertTrue is the failure message.
        self.assertEqual(len(names), 4)
        for name in ['DATE', 'IMO CODE', 'PORT OF DEPARTURE', 'PLACE OF RECEIPT']:
            self.assertIn(name, names)
class TestWorkflowUpdates(unittest.TestCase):
    """Verify that chart view urls track workflow modifications."""

    def setUp(self):
        """Create an empty work trails repository and the web service API."""
        self.config = AppConfig()
        env = ExecEnv(
            FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
            packages=[PACKAGE_VIZUAL, PACKAGE_PLOT]
        ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
        self.ENGINE_ID = env.identifier
        self.config.envs[self.ENGINE_ID] = env
        self.config.fileserver = env.fileserver
        # Create a fresh set of directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)
        self.datastore = FileSystemDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(
            VIZTRAILS_DIR,
            {env.identifier: env}
        )
        self.api = VizierWebService(
            self.db,
            self.datastore,
            self.fileserver,
            self.config
        )

    def tearDown(self):
        """Clean-up by deleting all created directories."""
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def test_view_urls(self):
        """Ensure that the urls for workflow views get updated correctly when
        the workflow is modified."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        # (1) CREATE DATASET
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        # (2) PLOT: the chart view url points at workflow 1 / module 1
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.create_plot(
                DS_NAME,
                CHART_NAME,
                series=[{'series_column': 2}]
            )
        )
        url = self.api.get_workflow(
            vt.identifier,
            DEFAULT_BRANCH
        )['state']['charts'][0]['links'][0]['href']
        self.assertIn('master/workflows/1/modules/1/views', url)
        # (3) UPDATE CELL: the url is expected to advance to workflow 2 /
        # module 2
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.update_cell(DS_NAME, 0, 0, '28')
        )
        url = self.api.get_workflow(
            vt.identifier,
            DEFAULT_BRANCH
        )['state']['charts'][0]['links'][0]['href']
        self.assertIn('master/workflows/2/modules/2/views', url)
class TestWorkflows(unittest.TestCase):
    """Run a battery of workflow scenarios (python cells, mixed commands,
    module deletion, errors, dataset rename/drop) against the default and
    the Mimir execution environments."""

    def tearDown(self):
        """Clean-up by deleting all created directories."""
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)

    def set_up(self):
        """Create an empty work trails repository."""
        # Create a fresh set of directories
        for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]:
            if os.path.isdir(d):
                shutil.rmtree(d)
            os.mkdir(d)

    def set_up_default(self):
        """Setup configuration using default Vizual engine."""
        env = ExecEnv(
            FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
            packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON]
        ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
        self.ENGINE_ID = env.identifier
        self.set_up()
        self.datastore = FileSystemDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(
            VIZTRAILS_DIR,
            {env.identifier: env}
        )

    def set_up_mimir(self):
        """Setup configuration using Mimir engine."""
        env = ExecEnv(
            FileServerConfig().from_dict({'directory': FILESERVER_DIR}),
            identifier=ENGINEENV_MIMIR,
            packages=[PACKAGE_VIZUAL, PACKAGE_PYTHON, PACKAGE_MIMIR]
        ).from_dict({'datastore': {'directory': DATASTORE_DIR}})
        self.ENGINE_ID = env.identifier
        self.set_up()
        self.datastore = MimirDataStore(DATASTORE_DIR)
        self.fileserver = DefaultFileServer(FILESERVER_DIR)
        self.db = FileSystemViztrailRepository(
            VIZTRAILS_DIR,
            {env.identifier: env}
        )

    def test_vt_default(self):
        """Run workflows with default configuration."""
        # Each scenario runs against a freshly initialized repository
        self.set_up_default()
        self.run_python_workflow()
        self.set_up_default()
        self.run_mixed_workflow()
        self.set_up_default()
        self.run_delete_modules()
        self.set_up_default()
        self.run_erroneous_workflow()
        self.set_up_default()
        self.run_update_datasets()

    def test_vt_mimir(self):
        """Run workflows for Mimir configurations."""
        mimir.initialize()
        self.set_up_mimir()
        self.run_python_workflow()
        self.set_up_mimir()
        self.run_mixed_workflow()
        self.set_up_mimir()
        self.run_delete_modules()
        self.set_up_mimir()
        self.run_erroneous_workflow()
        mimir.finalize()

    def _assert_module_ids(self, wf, prev_modules=None):
        """Assert that all modules in wf have valid, unique identifiers.

        If prev_modules is given, also verify that the identifiers of
        previously existing modules did not change. Returns the set of
        module identifiers in wf.
        """
        modules = set()
        for m in wf.modules:
            self.assertNotEqual(m.identifier, -1)
            self.assertNotIn(m.identifier, modules)
            modules.add(m.identifier)
        if prev_modules is not None:
            for m_id in prev_modules:
                self.assertIn(m_id, modules)
        return modules

    def run_delete_modules(self):
        """Test deletion of modules."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        # (1) CREATE DATASET
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        col_age = ds.column_by_name('Age')
        # Two cell updates on the Age column
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.update_cell(DS_NAME, col_age.identifier, 0, '28')
        )
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.update_cell(DS_NAME, col_age.identifier, 1, '42')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        ds = DatasetClient(
            self.datastore.get_dataset(wf.modules[-1].datasets['people']))
        self.assertEqual(int(ds.rows[0].get_value('Age')), 28)
        self.assertEqual(int(ds.rows[1].get_value('Age')), 42)
        # DELETE the first UPDATE CELL; the original value reappears
        self.db.delete_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[1].identifier
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        ds = DatasetClient(
            self.datastore.get_dataset(wf.modules[-1].datasets['people']))
        self.assertEqual(int(ds.rows[0].get_value('Age')), 23)
        # DELETE LOAD (will introduce error)
        self.db.delete_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[0].identifier
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        # DELETE last remaining module; empty workflow has no error
        self.db.delete_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[0].identifier
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)

    def run_erroneous_workflow(self):
        """Test workflow that has errors."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        # (1) CREATE DATASET
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        col_age = ds.column_by_name('Age')
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.update_cell(DS_NAME, col_age.identifier, 0, '28')
        )
        # This should create an error because of the invalid column name
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.rename_column(DS_NAME, col_age.identifier, '')
        )
        # This should not have any effect
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.update_cell(DS_NAME, col_age.identifier, 0, '29')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        # Make sure that all workflow modules have a non-negative identifier
        # and that they are all unique
        seen = set()
        for m in wf.modules:
            self.assertGreaterEqual(m.identifier, 0)
            self.assertNotIn(m.identifier, seen)
            seen.add(m.identifier)

    def run_mixed_workflow(self):
        """Test execution of a workflow that mixes vizual commands and
        Python cells."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        # (1) CREATE DATASET
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEqual(cmd_text, 'LOAD DATASET people FROM FILE dataset.csv')
        # (2) INSERT ROW
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.insert_row(DS_NAME, 1)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEqual(cmd_text, 'INSERT ROW INTO people AT POSITION 1')
        # (3) Set name to Bobby and set variables
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(SET_VARIABLES_PY)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEqual(cmd_text, SET_VARIABLES_PY)
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        # (4) Set age to 28
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.update_cell(
                DS_NAME, ds.column_by_name('Age').identifier, 1, '28')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEqual(cmd_text.upper(), 'UPDATE PEOPLE SET [AGE,1] = 28')
        ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
        # (5) Change Alice to Bob
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.update_cell(
                DS_NAME, ds.column_by_name('Name').identifier, 0, 'Bob')
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        cmd_text = wf.modules[-1].command_text
        self.assertEqual(cmd_text.upper(), 'UPDATE PEOPLE SET [NAME,0] = \'BOB\'')
        # (6) UPDATE DATASET WITH FILTER
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(UPDATE_DATASET_WITH_FILTER_PY)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        cmd_text = wf.modules[-1].command_text
        self.assertEqual(cmd_text, UPDATE_DATASET_WITH_FILTER_PY)
        self.assertFalse(wf.has_error)
        # Ensure that all names are Bobby and the ages are as expected
        ds = DatasetClient(
            self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]))
        age = [23, 28, 32]
        for i, row in enumerate(ds.rows):
            self.assertEqual(row.get_value('Name'), 'Bobby')
            self.assertEqual(int(row.get_value('Age')), age[i])

    def run_python_workflow(self):
        """Test execution of a workflow of Python cell modules, including
        replace, insert-before, and delete operations."""
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        # (1) CREATE DATASET
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(CREATE_DATASET_PY)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        modules = self._assert_module_ids(wf)
        self.assertEqual(wf.version, 0)
        self.assertEqual(len(wf.modules), 1)
        self.assertEqual(len(wf.modules[0].stdout), 0)
        self.assertEqual(len(wf.modules[0].stderr), 0)
        self.assertEqual(len(wf.modules[0].datasets), 1)
        self.assertIn(DS_NAME, wf.modules[0].datasets)
        # (2) PRINT DATASET
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(PRINT_DATASET_PY)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        modules = self._assert_module_ids(wf, prev_modules=modules)
        self.assertEqual(wf.version, 1)
        self.assertEqual(len(wf.modules), 2)
        self.assertEqual(len(wf.modules[0].stdout), 0)
        self.assertEqual(len(wf.modules[0].stderr), 0)
        self.assertEqual(len(wf.modules[0].datasets), 1)
        self.assertIn(DS_NAME, wf.modules[0].datasets)
        self.assertEqual(len(wf.modules[1].stdout), 1)
        self.assertEqual(len(wf.modules[1].stderr), 0)
        self.assertEqual(wf.modules[1].stdout[0]['data'], 'Alice\nBob')
        self.assertEqual(len(wf.modules[1].datasets), 1)
        self.assertIn(DS_NAME, wf.modules[1].datasets)
        ds_id = wf.modules[1].datasets[DS_NAME]
        # (3) UPDATE DATASET: produces a new dataset version for the last
        # module only
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(UPDATE_DATASET_PY)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        modules = self._assert_module_ids(wf, prev_modules=modules)
        self.assertFalse(wf.has_error)
        self.assertEqual(wf.version, 2)
        self.assertEqual(len(wf.modules), 3)
        self.assertEqual(len(wf.modules[0].stdout), 0)
        self.assertEqual(len(wf.modules[0].stderr), 0)
        self.assertEqual(len(wf.modules[0].datasets), 1)
        self.assertIn(DS_NAME, wf.modules[0].datasets)
        self.assertEqual(wf.modules[0].datasets[DS_NAME], ds_id)
        self.assertEqual(len(wf.modules[1].stdout), 1)
        self.assertEqual(len(wf.modules[1].stderr), 0)
        self.assertEqual(wf.modules[1].stdout[0]['data'], 'Alice\nBob')
        self.assertEqual(len(wf.modules[1].datasets), 1)
        self.assertIn(DS_NAME, wf.modules[1].datasets)
        self.assertEqual(wf.modules[1].datasets[DS_NAME], ds_id)
        self.assertEqual(len(wf.modules[2].stdout), 0)
        self.assertEqual(len(wf.modules[2].stderr), 0)
        self.assertEqual(len(wf.modules[2].datasets), 1)
        self.assertIn(DS_NAME, wf.modules[2].datasets)
        self.assertNotEqual(wf.modules[2].datasets[DS_NAME], ds_id)
        # (4) PRINT DATASET
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(PRINT_DATASET_PY)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        modules = self._assert_module_ids(wf, prev_modules=modules)
        self.assertEqual(wf.version, 3)
        self.assertEqual(len(wf.modules), 4)
        self.assertEqual(wf.modules[1].stdout[0]['data'], 'Alice\nBob')
        self.assertEqual(len(wf.modules[3].stdout), 1)
        self.assertEqual(len(wf.modules[3].stderr), 0)
        self.assertEqual(wf.modules[3].stdout[0]['data'], 'NoName\nNoName')
        # (5) UPDATE DATASET WITH FILTER: replacing module 2 introduces an
        # error (filter references a variable not yet defined)
        self.db.replace_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[2].identifier,
            command=cmd.python_cell(UPDATE_DATASET_WITH_FILTER_PY)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        modules = self._assert_module_ids(wf, prev_modules=modules)
        self.assertTrue(wf.has_error)
        self.assertEqual(wf.version, 4)
        self.assertEqual(len(wf.modules), 4)
        # (6) INSERT SET VARIABLES BEFORE UPDATE: fixes the error
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.python_cell(SET_VARIABLES_ONLY_PY),
            before_id=wf.modules[2].identifier
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEqual(wf.modules[4].stdout[0]['data'], 'Alice\nBobby')
        # (7) INTRODUCE ERROR
        self.db.replace_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[1].identifier,
            command=cmd.python_cell(PRINT_UNKNOWN_DATASET_PY)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        modules = self._assert_module_ids(wf, prev_modules=modules)
        self.assertTrue(wf.has_error)
        # Ensure that the second module has output to stderr
        self.assertNotEqual(len(wf.modules[1].stderr), 0)
        # Ensure that the modules after the error have no output (either to
        # STDOUT or STDERR)
        for m in wf.modules[2:]:
            self.assertEqual(len(m.stdout), 0)
            self.assertEqual(len(m.stderr), 0)
        # (8) FIX ERROR
        self.db.replace_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[1].identifier,
            command=cmd.python_cell(PRINT_DATASET_PY)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        modules = self._assert_module_ids(wf, prev_modules=modules)
        # (9) DELETE MODULE UPDATE_DATASET_WITH_FILTER_PY
        self.db.delete_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[3].identifier
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEqual(wf.modules[3].stdout[0]['data'], 'Alice\nBob')

    def run_update_datasets(self):
        """Test dropping and renaming of datasets."""
        f_handle = self.fileserver.upload_file(CSV_FILE)
        vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.load_dataset(f_handle.identifier, DS_NAME)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertIn(DS_NAME, wf.modules[-1].datasets)
        # RENAME: the old name stays visible in the first module only
        new_name = DS_NAME + '_renamed'
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.rename_dataset(DS_NAME, new_name)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertIn(DS_NAME, wf.modules[0].datasets)
        self.assertNotIn(new_name, wf.modules[0].datasets)
        self.assertNotIn(DS_NAME, wf.modules[-1].datasets)
        self.assertIn(new_name, wf.modules[-1].datasets)
        # DROP the renamed dataset
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.drop_dataset(new_name)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertNotIn(new_name, wf.modules[-1].datasets)
        # Dropping the same dataset again introduces an error
        self.db.append_workflow_module(
            viztrail_id=vt.identifier,
            command=cmd.drop_dataset(new_name)
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertTrue(wf.has_error)
        # Delete the Drop Dataset that failed and replace the first drop with
        # a Python module that prints names
        self.db.delete_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[-1].identifier
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.db.replace_workflow_module(
            viztrail_id=vt.identifier,
            module_id=wf.modules[-1].identifier,
            command=cmd.python_cell("""
for row in vizierdb.get_dataset('""" + new_name + """').rows:
    print row.get_value('Name')
""")
        )
        wf = self.db.get_workflow(viztrail_id=vt.identifier)
        self.assertFalse(wf.has_error)
        self.assertEqual(wf.modules[-1].stdout[0]['data'], 'Alice\nBob')
        self.assertNotIn(DS_NAME, wf.modules[-1].datasets)
        self.assertIn(new_name, wf.modules[-1].datasets)
class TestFileSystemViztrailRepository(unittest.TestCase): def setUp(self): """Create an empty work trails repository.""" # Clear VisTrails directory if os.path.isdir(VIZTRAILS_DIRECTORY): shutil.rmtree(VIZTRAILS_DIRECTORY) # Setup project repository self.db = FileSystemViztrailRepository(VIZTRAILS_DIRECTORY, {ENV.identifier: ENV}) def tearDown(self): """Clean-up by dropping viztrails directory. """ shutil.rmtree(VIZTRAILS_DIRECTORY) def test_append_module(self): """Test appending modules.""" # Create new viztrail. vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'}) self.db.append_workflow_module(viztrail_id=vt.identifier, command=python_cell('abc')) self.db.append_workflow_module(viztrail_id=vt.identifier, command=load_dataset('file', 'name')) # The default branch should have two versions. The first versions contains # one module and the second version contains two modules self.assertEquals(len(vt.branches[DEFAULT_BRANCH].workflows), 2) v1 = self.db.get_workflow( viztrail_id=vt.identifier, workflow_version=vt.branches[DEFAULT_BRANCH].workflows[0].version) v2 = self.db.get_workflow( viztrail_id=vt.identifier, workflow_version=vt.branches[DEFAULT_BRANCH].workflows[1].version) head = self.db.get_workflow(viztrail_id=vt.identifier, branch_id=DEFAULT_BRANCH) self.assertEquals(len(v1.modules), 1) self.assertEquals(len(v2.modules), 2) self.assertEquals(len(head.modules), 2) # Ensure that all modules have non-negative identifier for m in head.modules: self.assertTrue(m.identifier >= 0) self.assertEquals(head.modules[0].command.module_type, PACKAGE_PYTHON) self.assertEquals(head.modules[1].command.module_type, PACKAGE_VIZUAL) self.assertEquals(head.version, 1) # Re-load the viztrails to ensure that all information has been persisted properly self.db = FileSystemViztrailRepository(VIZTRAILS_DIRECTORY, {ENV.identifier: ENV}) vt = self.db.get_viztrail(vt.identifier) self.assertEquals(len(vt.branches[DEFAULT_BRANCH].workflows), 2) v1 = self.db.get_workflow( 
viztrail_id=vt.identifier, workflow_version=vt.branches[DEFAULT_BRANCH].workflows[0].version) v2 = self.db.get_workflow( viztrail_id=vt.identifier, workflow_version=vt.branches[DEFAULT_BRANCH].workflows[1].version) head = self.db.get_workflow(viztrail_id=vt.identifier, branch_id=DEFAULT_BRANCH) self.assertEquals(len(v1.modules), 1) self.assertEquals(len(v2.modules), 2) self.assertEquals(len(head.modules), 2) # Ensure that all modules have non-negative identifier for m in head.modules: self.assertTrue(m.identifier >= 0) self.assertEquals(head.modules[0].command.module_type, PACKAGE_PYTHON) self.assertEquals(head.modules[1].command.module_type, PACKAGE_VIZUAL) self.assertEquals(head.version, 1) # Append a third moduel to the head of the default branch self.db.append_workflow_module(viztrail_id=vt.identifier, command=python_cell('def')) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertEquals(len(wf.modules), 3) for m in wf.modules: self.assertTrue(m.identifier >= 0) self.assertEquals(m.stdout[0]['data'], 'SUCCESS ' + str(m.identifier)) self.assertEquals(wf.modules[0].command.module_type, PACKAGE_PYTHON) self.assertEquals(wf.modules[1].command.module_type, PACKAGE_VIZUAL) self.assertEquals(wf.modules[2].command.module_type, PACKAGE_PYTHON) self.assertEquals(wf.version, 2) # Append a module to the first version in the branch. The resulting new # branch HEAD is expected to contain only two modules then. 
self.db.append_workflow_module(viztrail_id=vt.identifier, workflow_version=0, command=python_cell('def')) self.db = FileSystemViztrailRepository(VIZTRAILS_DIRECTORY, {ENV.identifier: ENV}) vt = self.db.get_viztrail(vt.identifier) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertEquals(len(wf.modules), 2) for m in wf.modules: self.assertTrue(m.identifier >= 0) self.assertEquals(m.stdout[0]['data'], 'SUCCESS ' + str(m.identifier)) self.assertEquals(wf.modules[0].command.module_type, PACKAGE_PYTHON) self.assertEquals(wf.modules[1].command.module_type, PACKAGE_PYTHON) self.assertEquals(wf.version, 3) def test_branching(self): """Test functionality to execute a workflow module.""" # Create new viztrail and ensure that it contains exactly one branch vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'}) self.assertEquals(len(vt.branches), 1) self.assertTrue(DEFAULT_BRANCH in vt.branches) self.assertEquals(vt.branches[DEFAULT_BRANCH].identifier, DEFAULT_BRANCH) # Append two modules to the defaukt branch self.db.append_workflow_module(viztrail_id=vt.identifier, command=python_cell('abc')) self.db.append_workflow_module(viztrail_id=vt.identifier, command=load_dataset('file', 'name')) # Create a branch at the end of the default branch. 
The new branch # contains one workflow with two modules the version number is 2 newbranch = self.db.create_branch(viztrail_id=vt.identifier, properties={'name': 'New Branch'}) self.assertEquals(len(newbranch.workflows), 1) self.assertEquals(newbranch.workflows[-1].version, 2) wf = vt.get_workflow(branch_id=newbranch.identifier) self.assertEquals(wf.version, 2) self.assertEquals(len(wf.modules), 2) self.assertTrue(newbranch.identifier in vt.branches) # Ensure that everything has been persisted properly self.db = FileSystemViztrailRepository(VIZTRAILS_DIRECTORY, {ENV.identifier: ENV}) vt = self.db.get_viztrail(vt.identifier) newbranch = vt.branches[newbranch.identifier] self.assertEquals(len(newbranch.workflows), 1) self.assertEquals(newbranch.workflows[-1].version, 2) wf = vt.get_workflow(branch_id=newbranch.identifier) self.assertEquals(wf.version, 2) self.assertEquals(len(wf.modules), 2) self.assertTrue(newbranch.identifier in vt.branches) self.assertEquals(newbranch.properties.get_properties()['name'], 'New Branch') # Create a third branch from the start of the master branch thirdbranch = self.db.create_branch(viztrail_id=vt.identifier, properties={'name': 'Next Branch'}, module_id=0) wf = vt.get_workflow(branch_id=thirdbranch.identifier) self.assertEquals(wf.version, 3) self.assertEquals(len(wf.modules), 1) # Append modules at end of master and at beginning of thirdbranch self.db.append_workflow_module(viztrail_id=vt.identifier, command=python_cell('abc')) self.db.append_workflow_module(viztrail_id=vt.identifier, branch_id=thirdbranch.identifier, command=python_cell('def'), before_id=0) master_head = vt.get_workflow() self.assertEquals(len(master_head.modules), 3) self.assertEquals(master_head.modules[0].command.module_type, PACKAGE_PYTHON) self.assertEquals(master_head.modules[1].command.module_type, PACKAGE_VIZUAL) self.assertEquals(master_head.modules[2].command.module_type, PACKAGE_PYTHON) b2_head = vt.get_workflow(branch_id=newbranch.identifier) 
self.assertEquals(len(b2_head.modules), 2) self.assertEquals(b2_head.modules[0].command.module_type, PACKAGE_PYTHON) self.assertEquals(b2_head.modules[1].command.module_type, PACKAGE_VIZUAL) b3_head = vt.get_workflow(branch_id=thirdbranch.identifier) self.assertEquals(len(b3_head.modules), 2) self.assertEquals(b3_head.modules[0].command.module_type, PACKAGE_PYTHON) self.assertEquals(b3_head.modules[1].command.module_type, PACKAGE_PYTHON) # Replace second module of third branch self.db.replace_workflow_module( viztrail_id=vt.identifier, branch_id=thirdbranch.identifier, module_id=b3_head.modules[1].identifier, command=load_dataset('file', 'name')) b3_head = vt.get_workflow(branch_id=thirdbranch.identifier) self.assertEquals(len(b3_head.modules), 2) self.assertEquals(b3_head.modules[0].command.module_type, PACKAGE_PYTHON) self.assertEquals(b3_head.modules[1].command.module_type, PACKAGE_VIZUAL) master_head = vt.get_workflow() self.assertEquals(len(master_head.modules), 3) self.assertEquals(master_head.modules[0].command.module_type, PACKAGE_PYTHON) self.assertEquals(master_head.modules[1].command.module_type, PACKAGE_VIZUAL) self.assertEquals(master_head.modules[2].command.module_type, PACKAGE_PYTHON) b2_head = vt.get_workflow(branch_id=newbranch.identifier) self.assertEquals(len(b2_head.modules), 2) self.assertEquals(b2_head.modules[0].command.module_type, PACKAGE_PYTHON) self.assertEquals(b2_head.modules[1].command.module_type, PACKAGE_VIZUAL) # Ensure there are exceptions raised when branching of an unknown branch # or module with self.assertRaises(ValueError): self.db.create_branch(viztrail_id=vt.identifier, source_branch='unknonw-branch', properties={'name': 'New Branch'}) with self.assertRaises(ValueError): self.db.create_branch(viztrail_id=vt.identifier, properties={'name': 'New Branch'}, module_id=100) with self.assertRaises(ValueError): self.db.create_branch(viztrail_id=vt.identifier) # Test branch provenance 
self.assertEquals(newbranch.provenance.source_branch, DEFAULT_BRANCH) self.assertEquals(newbranch.provenance.workflow_version, 1) self.assertEquals(newbranch.provenance.module_id, 1) self.assertEquals(thirdbranch.provenance.source_branch, DEFAULT_BRANCH) self.assertEquals(thirdbranch.provenance.workflow_version, 1) self.assertEquals(thirdbranch.provenance.module_id, 0) def test_eval_command(self): """Test functionality to execute a workflow module.""" # Create new work trail, append a module and retrieve the resulting # workflow from default branch HEAD. vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'}) self.db.append_workflow_module(viztrail_id=vt.identifier, command=python_cell('abc')) wf = vt.get_workflow() self.assertEquals(wf.version, 0) self.assertEquals(len(wf.modules), 1) self.db.append_workflow_module(viztrail_id=vt.identifier, command=python_cell('def')) wf = vt.get_workflow(branch_id=DEFAULT_BRANCH) self.assertEquals(wf.version, 1) self.assertEquals(len(wf.modules), 2) self.assertEquals(len(wf.modules[0].stdout), 1) self.assertEquals(wf.modules[0].command.module_type, PACKAGE_PYTHON) self.assertEquals(wf.modules[0].command.command_identifier, PYTHON_CODE) self.assertEquals(wf.modules[0].command.arguments[PYTHON_SOURCE], 'abc') self.assertEquals(len(wf.modules[1].stdout), 1) self.assertEquals(wf.modules[1].command.module_type, PACKAGE_PYTHON) self.assertEquals(wf.modules[1].command.command_identifier, PYTHON_CODE) self.assertEquals(wf.modules[1].command.arguments[PYTHON_SOURCE], 'def') self.db.replace_workflow_module(viztrail_id=vt.identifier, module_id=0, command=load_dataset('file', 'ds')) wf = vt.get_workflow() self.assertEquals(wf.version, 2) self.assertEquals(len(wf.modules), 2) self.assertEquals(len(wf.modules[0].stdout), 1) self.assertEquals(wf.modules[0].command.module_type, PACKAGE_VIZUAL) self.assertEquals(wf.modules[0].command.command_identifier, VIZUAL_LOAD) 
self.assertEquals(wf.modules[0].command.arguments[PARA_FILE]['fileid'], 'file') self.assertEquals(wf.modules[0].command.arguments[PARA_NAME], 'ds') self.assertEquals(len(wf.modules[1].stdout), 2) self.assertEquals(wf.modules[1].command.module_type, PACKAGE_PYTHON) self.assertEquals(wf.modules[1].command.command_identifier, PYTHON_CODE) self.assertEquals(wf.modules[1].command.arguments[PYTHON_SOURCE], 'def') def test_workflow_life_cycle(self): """Test functionality to execute a workflow module.""" # Create new work trail. vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'}) # Append two modules self.db.append_workflow_module(viztrail_id=vt.identifier, command=python_cell('abc')) self.db.append_workflow_module(viztrail_id=vt.identifier, command=load_dataset('file', 'name')) # Create a branch at the end of the default branch newbranch = self.db.create_branch(viztrail_id=vt.identifier, properties={'name': 'New Branch'}) # Append modules at end ofnew branch self.db.append_workflow_module(viztrail_id=vt.identifier, branch_id=newbranch.identifier, command=python_cell('xyz')) self.db.append_workflow_module(viztrail_id=vt.identifier, branch_id=newbranch.identifier, command=load_dataset('file', 'myname'), before_id=0) # Ensure that all version files exist self.check_files(vt.identifier, vt.branches[DEFAULT_BRANCH].workflows, True) new_versions = vt.branches[newbranch.identifier].workflows self.check_files(vt.identifier, new_versions, True) # Delete new branch. 
Ensure that only the master versions exist self.assertTrue( self.db.delete_branch(viztrail_id=vt.identifier, branch_id=newbranch.identifier)) self.check_files(vt.identifier, vt.branches[DEFAULT_BRANCH].workflows, True) self.check_files(vt.identifier, new_versions, False) # Deleting a non-existing branch should return False self.assertFalse( self.db.delete_branch(viztrail_id=vt.identifier, branch_id=newbranch.identifier)) self.assertFalse( self.db.delete_branch(viztrail_id=vt.identifier, branch_id='unknown')) # Deleting master branch should raise exception with self.assertRaises(ValueError): self.db.delete_branch(viztrail_id=vt.identifier, branch_id=DEFAULT_BRANCH) def test_viztrail_life_cycle(self): """Test API methods to create and delete work trails.""" # Create work trail and ensure that deleting it returns True vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'}) # Ensure that the viztrail has property name = 'My Project' self.assertEquals(vt.properties.get_properties()['name'], 'My Project') self.assertEquals(len(self.db.list_viztrails()), 1) self.assertTrue(self.db.delete_viztrail(vt.identifier)) self.assertEquals(len(self.db.list_viztrails()), 0) # Multiple deletes should return False self.assertFalse(self.db.delete_viztrail(vt.identifier)) # Deleting an unknown work trail should return False self.assertFalse(self.db.delete_viztrail('invalid id')) self.assertFalse(self.db.delete_viztrail('f0f0f0f0f0f0f0f0f0f0f0f0')) # Cannot create viztrail for unknown engine with self.assertRaises(ValueError): self.db.create_viztrail('UNKNOWN', {'name': 'My Project'}) def check_files(self, viztrail_id, versions, check_exists): for wf_desc in versions: filename = os.path.join(VIZTRAILS_DIRECTORY, viztrail_id, str(wf_desc.version) + '.yaml') self.assertEquals(os.path.isfile(filename), check_exists)
class TestMimirLenses(unittest.TestCase): def setUp(self): """Create an empty work trails repository.""" # Create fresh set of directories for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]: if os.path.isdir(d): shutil.rmtree(d) os.mkdir(d) self.datastore = MimirDataStore(DATASTORE_DIR) self.fileserver = DefaultFileServer(FILESERVER_DIR) vizual = MimirVizualEngine(self.datastore, self.fileserver) self.db = FileSystemViztrailRepository( VIZTRAILS_DIR, {ENV.identifier: ENV} ) def tearDown(self): """Clean-up by dropping the MongoDB colelction used by the engine. """ # Delete directories for d in [DATASTORE_DIR, FILESERVER_DIR, VIZTRAILS_DIR]: if os.path.isdir(d): shutil.rmtree(d) def test_domain_lens(self): """Test DOMAIN lens.""" # Create new work trail and retrieve the HEAD workflow of the default # branch mimir.initialize() f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE) vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'}) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) col_age = ds.column_by_name('Age') self.assertFalse(wf.has_error) # Missing Value Lens self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_domain(DS_NAME, col_age.identifier) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) if wf.has_error: print wf.modules[-1].stderr[0] self.assertEquals(wf.modules[-1].command_text.upper(), 'DOMAIN FOR AGE IN PEOPLE') self.assertFalse(wf.has_error) self.assertEquals(len(wf.modules), 2) # Get dataset ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) rows = ds.fetch_rows() self.assertNotEquals(rows[2].values[ds.column_index('Age')], '') # Introduce an error. 
Make sure command formating is correct self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_domain('MY DS', 'MY COL') ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertTrue(wf.has_error) self.assertEquals(wf.modules[-1].command_text.upper(), 'DOMAIN FOR \'MY COL\' IN \'MY DS\'') mimir.finalize() def test_geocode_lens(self): """Test GEOCODE lens.""" # Create new work trail and retrieve the HEAD workflow of the default # branch mimir.initialize() f_handle = self.fileserver.upload_file(GEO_FILE) vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'}) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertFalse(wf.has_error) # Geocode Lens self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_geocode( DS_NAME, 'GOOGLE', house_nr=ds.column_by_name('STRNUMBER').identifier, street=ds.column_by_name('STRNAME').identifier, city=ds.column_by_name('CITY').identifier, state=ds.column_by_name('STATE').identifier ) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) if wf.has_error: print wf.modules[-1].stderr[0] self.assertEquals(wf.modules[-1].command_text.upper(), 'GEOCODE HOUSE_NUMBER=STRNUMBER,STREET=STRNAME,CITY=CITY,STATE=STATE PEOPLE USING GOOGLE') self.assertFalse(wf.has_error) self.assertEquals(len(wf.modules), 2) # Get dataset ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertEquals(len(ds.columns), 6) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_geocode( DS_NAME, 'GOOGLE' ) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) if wf.has_error: print wf.modules[-1].stderr[0] self.assertEquals(wf.modules[-1].command_text.upper(), 'GEOCODE PEOPLE USING GOOGLE') self.assertFalse(wf.has_error) self.assertEquals(len(wf.modules), 3) # Get dataset ds = 
self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertEquals(len(ds.columns), 8) mimir.finalize() def test_key_repair_lens(self): """Test KEY REPAIR lens.""" # Create new work trail and retrieve the HEAD workflow of the default # branch mimir.initialize() f_handle = self.fileserver.upload_file(KEY_REPAIR_FILE) vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'}) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) ds1 = self.datastore.get_dataset(wf.modules[0].datasets[DS_NAME]) # Missing Value Lens self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_key_repair(DS_NAME, ds1.column_by_name('Empid').identifier) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) self.assertEquals(wf.modules[-1].command_text.upper(), 'KEY REPAIR FOR EMPID IN ' + DS_NAME.upper()) # Get dataset ds2 = self.datastore.get_dataset(wf.modules[0].datasets[DS_NAME]) self.assertEquals(ds1.row_count, ds2.row_count) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertEquals(len(ds.columns), 4) self.assertEquals(ds.row_count, 2) names = set() empids = set() rowids = set() for row in DatasetClient(dataset=ds).rows: rowids.add(row.identifier) empids.add(int(row.get_value('empid'))) names.add(row.get_value('name')) self.assertTrue(1 in empids) self.assertTrue(2 in rowids) self.assertTrue('Alice' in names) self.assertTrue('Carla' in names) # Test error case and command text self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_key_repair('MY DS', 'MY COL') ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertTrue(wf.has_error) self.assertEquals(wf.modules[-1].command_text.upper(), 'KEY REPAIR FOR \'MY COL\' IN \'MY DS\'') mimir.finalize() def test_missing_value_lens(self): """Test MISSING_VALUE lens.""" # 
Create new work trail and retrieve the HEAD workflow of the default # branch mimir.initialize() f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE) vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'}) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertFalse(wf.has_error) # Missing Value Lens self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_missing_value(DS_NAME, ds.column_by_name('AGE').identifier) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING VALUES FOR AGE IN ' + DS_NAME.upper()) self.assertEquals(len(wf.modules), 2) # Get dataset ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) rows = ds.fetch_rows() self.assertNotEquals(rows[2].values[ds.column_index('Age')], '') # Annotations annotations = ds.get_annotations(column_id=1, row_id=4) self.assertEquals(len(annotations), 2) # MISSING VALUE Lens with value constraint vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'New Project'}) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_missing_value( DS_NAME, ds.column_by_name('AGE').identifier, constraint='> 30') ) wf = self.db.get_workflow(viztrail_id=vt.identifier) if wf.has_error: print wf.modules[-1].stderr[0] self.assertFalse(wf.has_error) self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING VALUES FOR AGE IN ' + DS_NAME.upper() + ' WITH CONSTRAINT > 30') #self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING VALUES FOR AGE IN ' + DS_NAME.upper()) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) rows = ds.fetch_rows() 
self.assertTrue(rows[2].values[ds.column_index('Age')] > 30) # Command text in case of error self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_missing_value('MY DS', '?', constraint='A B') ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertTrue(wf.has_error) cmd_text = wf.modules[-1].command_text.upper() expected_text = 'MISSING VALUES FOR ? IN \'MY DS\'' + ' WITH CONSTRAINT A B' self.assertEquals(cmd_text, expected_text) mimir.finalize() def test_missing_key_lens(self): """Test MISSING_KEY lens.""" # Create new work trail and retrieve the HEAD workflow of the default # branch mimir.initialize() f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE) vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'}) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) # Missing Value Lens age_col = ds.columns[ds.column_index('Age')].identifier self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_missing_key(DS_NAME, age_col, missing_only=True) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING KEYS FOR AGE IN ' + DS_NAME.upper()) self.assertFalse(wf.has_error) # Get dataset ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertEquals(len(ds.columns), 3) rows = ds.fetch_rows() self.assertEquals(len(rows), 24) #self.db.append_workflow_module( # viztrail_id=vt.identifier, # command=cmd.load_dataset(f_handle.identifier, DS_NAME + '2') #) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_missing_key( DS_NAME, ds.columns[ds.column_index('Salary')].identifier, missing_only=True ) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) # Get dataset ds 
= self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertEquals(len(ds.columns), 3) rows = ds.fetch_rows() self.assertEquals(len(rows), 55) mimir.finalize() def test_picker_lens(self): """Test PICKER lens.""" # Create new work trail and retrieve the HEAD workflow of the default # branch mimir.initialize() f_handle = self.fileserver.upload_file(PICKER_FILE) vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'}) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) # Missing Value Lens ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_picker(DS_NAME, [ {'pickFrom': ds.column_by_name('Age').identifier}, {'pickFrom': ds.column_by_name('Salary').identifier} ]) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) if wf.modules[-1].has_error: print wf.modules[-1].stderr self.assertFalse(wf.has_error) self.assertEquals(wf.modules[-1].command_text.upper(), 'PICK FROM AGE,SALARY IN ' + DS_NAME.upper()) # Get dataset self.assertEquals(len(wf.modules[-1].datasets), 1) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) columns = [c.name for c in ds.columns] self.assertEquals(len(ds.columns), 5) self.assertTrue('PICK_ONE_AGE_SALARY' in columns) # Pick another column, this time with custom name self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_picker(DS_NAME, [ {'pickFrom': ds.column_by_name('Age').identifier}, {'pickFrom': ds.column_by_name('Salary').identifier} ], pick_as='My Column') ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) # Get dataset self.assertEquals(len(wf.modules[-1].datasets), 1) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) columns = [c.name for c in ds.columns] self.assertEquals(len(ds.columns), 6) 
self.assertTrue('PICK_ONE_AGE_SALARY' in columns) self.assertTrue('My Column' in columns) # Pick from a picked column self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_picker(DS_NAME, [ {'pickFrom': ds.column_by_name('Age').identifier}, {'pickFrom': ds.column_by_name('PICK_ONE_AGE_SALARY').identifier} ], pick_as='My Column') ) wf = self.db.get_workflow(viztrail_id=vt.identifier) if wf.modules[-1].has_error: print wf.modules[-1].stderr self.assertFalse(wf.has_error) self.assertEquals(wf.modules[-1].command_text.upper(), 'PICK FROM AGE,PICK_ONE_AGE_SALARY AS \'MY COLUMN\' IN ' + DS_NAME.upper()) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) mimir.finalize() def test_schema_matching_lens(self): """Test SCHEMA_MATCHING lens.""" # Create new work trail and retrieve the HEAD workflow of the default # branch mimir.initialize() f_handle = self.fileserver.upload_file(CSV_FILE) vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'}) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) # Missing Value Lens self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_schema_matching(DS_NAME, [ {'column': 'BDate', 'type': 'int'}, {'column': 'PName', 'type': 'varchar'} ], 'new_' + DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) self.assertEquals(wf.modules[-1].command_text.upper(), 'SCHEMA MATCHING PEOPLE (BDATE INT, PNAME VARCHAR) AS NEW_' + DS_NAME.upper()) # Get dataset self.assertEquals(len(wf.modules[-1].datasets), 2) ds = self.datastore.get_dataset(wf.modules[-1].datasets['new_' + DS_NAME]) self.assertEquals(len(ds.columns), 2) self.assertEquals(ds.row_count, 2) # Error if adding an existing dataset self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_schema_matching( DS_NAME, [{'column': 
'BDate', 'type': 'int'}], 'new_' + DS_NAME ) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertTrue(wf.has_error) self.db.replace_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_schema_matching( DS_NAME, [{'column': 'BDate', 'type': 'int'}], 'a_new_' + DS_NAME ), module_id=wf.modules[-1].identifier, ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) self.assertEquals(wf.modules[-1].command_text.upper(), 'SCHEMA MATCHING PEOPLE (BDATE INT) AS A_NEW_' + DS_NAME.upper()) # Error when adding a dataset with an invalid name self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_schema_matching( DS_NAME, [{'column': 'BDate', 'type': 'int'}], 'SOME NAME' ) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertTrue(wf.has_error) self.assertEquals(wf.modules[-1].command_text.upper(), 'SCHEMA MATCHING PEOPLE (BDATE INT) AS \'SOME NAME\'') mimir.finalize() def test_type_inference_lens(self): """Test TYPE INFERENCE lens.""" # Create new work trail and retrieve the HEAD workflow of the default # branch mimir.initialize() f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE) vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'}) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) ds1 = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertFalse(wf.has_error) # Infer type self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_type_inference(DS_NAME, 0.6) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) print wf.modules[-1].command_text.upper() self.assertEquals(wf.modules[-1].command_text.upper(), 'TYPE INFERENCE FOR COLUMNS IN ' + DS_NAME.upper() + ' WITH PERCENT_CONFORM = 0.6') # Get dataset ds2 = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) 
self.assertEquals(len(ds2.columns), 3) self.assertEquals(ds2.row_count, 7) ds1_rows = ds1.fetch_rows() ds2_rows = ds2.fetch_rows() for i in range(ds2.row_count): self.assertEquals(ds1_rows[i].values, ds2_rows[i].values) mimir.finalize()
cleanUp() ENV = ExecEnv(FileServerConfig().from_dict({'directory': FILESERVER_DIR}), identifier=ENGINEENV_MIMIR).from_dict( {'datastore': { 'directory': DATASTORE_DIR }}) datastore = MimirDataStore(DATASTORE_DIR) fileserver = DefaultFileServer(FILESERVER_DIR) vizual = MimirVizualEngine(datastore, fileserver) db = FileSystemViztrailRepository(VIZTRAILS_DIR, {ENV.identifier: ENV}) mimir.initialize() vt = db.create_viztrail(ENV.identifier, {'name': 'My Project'}) # # LOAD DATASET # f_handle = fileserver.upload_file(CSV_FILE) db.append_workflow_module(viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME)) wf = db.get_workflow(viztrail_id=vt.identifier) ds = datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) print_dataset(ds) """ # # PICKER LENS #