def run_workflow(self): """Test functionality to execute a Python script that creates a dataset containing unicode characters.""" f_handle = self.fileserver.upload_file(CSV_FILE) vt = self.db.create_viztrail(self.ENGINE_ID, {'name' : 'My Project'}) # LOAD DATASET self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) # RUN Python Script self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.python_cell(PYTHON_SCRIPT) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) if wf.has_error: print wf.modules[-1].stderr self.assertFalse(wf.has_error) #print wf.modules[-1].stdout[0]['data'] ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) names = set(c.name.upper().replace('_', ' ') for c in ds.columns) self.assertTrue(len(names), 4) for name in ['DATE', 'IMO CODE', 'PORT OF DEPARTURE', 'PLACE OF RECEIPT']: self.assertTrue(name in names)
def run_erroneous_workflow(self):
    """Test workflow that has errors.

    Builds a workflow whose third module fails (invalid column rename) and
    verifies that the workflow is flagged as erroneous and that all module
    identifiers are unique and non-negative.
    """
    f_handle = self.fileserver.upload_file(CSV_FILE)
    vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
    #print '(1) CREATE DATASET'
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.load_dataset(f_handle.identifier, DS_NAME)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    col_age = ds.column_by_name('Age')
    # A valid cell update that should succeed
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.update_cell(DS_NAME, col_age.identifier, 0, '28')
    )
    # This should create an error because of the invalid column name
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.rename_column(DS_NAME, col_age.identifier, '')
    )
    # This should not have any effect
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.update_cell(DS_NAME, col_age.identifier, 0, '29')
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertTrue(wf.has_error)
    # Make sure that all workflow modules have a non-negative identifier
    # and that they are all unique
    identifier = set()
    for m in wf.modules:
        self.assertTrue(m.identifier >= 0)
        self.assertTrue(not m.identifier in identifier)
        identifier.add(m.identifier)
def test_type_inference_lens(self):
    """Test TYPE INFERENCE lens.

    Applies the type inference lens to a loaded dataset and verifies the
    rendered command text and that schema size and cell values are unchanged.
    """
    # Create new work trail and retrieve the HEAD workflow of the default
    # branch
    mimir.initialize()
    f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE)
    vt = self.db.create_viztrail(ENGINE_ID, {'name': 'My Project'})
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.load_dataset(f_handle.identifier, DS_NAME)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    ds1 = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    self.assertFalse(wf.has_error)
    # Infer type
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.mimir_type_inference(DS_NAME, 0.6)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    # FIX: removed a stray debug print of the command text that unconditionally
    # polluted the test output; the assertion below covers the same value.
    self.assertEquals(
        wf.modules[-1].command_text.upper(),
        'TYPE INFERENCE FOR COLUMNS IN ' + DS_NAME.upper() + ' WITH PERCENT_CONFORM = 0.6'
    )
    # Get dataset. Type inference should not change the number of columns or
    # rows, nor any of the cell values.
    ds2 = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    self.assertEquals(len(ds2.columns), 3)
    self.assertEquals(ds2.row_count, 7)
    ds1_rows = ds1.fetch_rows()
    ds2_rows = ds2.fetch_rows()
    for i in range(ds2.row_count):
        self.assertEquals(ds1_rows[i].values, ds2_rows[i].values)
    mimir.finalize()
def test_view_urls(self):
    """Ensure that the urls for workflow views get updated correctly when the
    workflow is modified.
    """
    def chart_url():
        # Href of the first link of the first chart in the HEAD workflow
        # state of the default branch.
        state = self.api.get_workflow(vt.identifier, DEFAULT_BRANCH)['state']
        return state['charts'][0]['links'][0]['href']
    f_handle = self.fileserver.upload_file(CSV_FILE)
    vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
    # Step 1: load the dataset
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.load_dataset(f_handle.identifier, DS_NAME)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    # Step 2: add a chart view over the dataset and check the view url
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.create_plot(
            DS_NAME,
            CHART_NAME,
            series=[{'series_column': 2}]
        )
    )
    self.assertTrue('master/workflows/1/modules/1/views' in chart_url())
    # Step 3: update a cell; the chart url should now reference the new
    # workflow version and module
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.update_cell(DS_NAME, 0, 0, '28')
    )
    self.assertTrue('master/workflows/2/modules/2/views' in chart_url())
def test_workflow_life_cycle(self): """Test functionality to execute a workflow module.""" # Create new work trail. vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'}) # Append two modules self.db.append_workflow_module(viztrail_id=vt.identifier, command=python_cell('abc')) self.db.append_workflow_module(viztrail_id=vt.identifier, command=load_dataset('file', 'name')) # Create a branch at the end of the default branch newbranch = self.db.create_branch(viztrail_id=vt.identifier, properties={'name': 'New Branch'}) # Append modules at end ofnew branch self.db.append_workflow_module(viztrail_id=vt.identifier, branch_id=newbranch.identifier, command=python_cell('xyz')) self.db.append_workflow_module(viztrail_id=vt.identifier, branch_id=newbranch.identifier, command=load_dataset('file', 'myname'), before_id=0) # Ensure that all version files exist self.check_files(vt.identifier, vt.branches[DEFAULT_BRANCH].workflows, True) new_versions = vt.branches[newbranch.identifier].workflows self.check_files(vt.identifier, new_versions, True) # Delete new branch. Ensure that only the master versions exist self.assertTrue( self.db.delete_branch(viztrail_id=vt.identifier, branch_id=newbranch.identifier)) self.check_files(vt.identifier, vt.branches[DEFAULT_BRANCH].workflows, True) self.check_files(vt.identifier, new_versions, False) # Deleting a non-existing branch should return False self.assertFalse( self.db.delete_branch(viztrail_id=vt.identifier, branch_id=newbranch.identifier)) self.assertFalse( self.db.delete_branch(viztrail_id=vt.identifier, branch_id='unknown')) # Deleting master branch should raise exception with self.assertRaises(ValueError): self.db.delete_branch(viztrail_id=vt.identifier, branch_id=DEFAULT_BRANCH)
def test_validate_vizual(self): """Test validation ofVizUAL cell command specifications.""" # DELETE COLUMN obj = cmd.delete_column('dataset', 'column') cmd.validate_command(self.command_repository, obj) obj.arguments['row'] = 'row' with self.assertRaises(ValueError): cmd.validate_command(self.command_repository, obj) obj = cmd.delete_column('dataset', 'column') del obj.arguments['dataset'] with self.assertRaises(ValueError): cmd.validate_command(self.command_repository, obj) obj = cmd.delete_column('dataset', 'column') obj.arguments['row'] = 'row' with self.assertRaises(ValueError): cmd.validate_command(self.command_repository, obj) # DELETE ROW obj = cmd.delete_row('dataset', 'row') cmd.validate_command(self.command_repository, obj) # INSERT COLUMN obj = cmd.insert_column('dataset', 1, 'A') cmd.validate_command(self.command_repository, obj) # INSERT ROW obj = cmd.insert_row('dataset', 1) cmd.validate_command(self.command_repository, obj) # LOAD DATASET obj = cmd.load_dataset('file', 'dataset', filename='My File') cmd.validate_command(self.command_repository, obj) # MOVE COLUMN obj = cmd.move_column('dataset', 'A', 2) cmd.validate_command(self.command_repository, obj) # MOVE ROW obj = cmd.move_row('dataset', 1, 2) cmd.validate_command(self.command_repository, obj) # RENAME COLUMN obj = cmd.rename_column('dataset', 'A', 'B') cmd.validate_command(self.command_repository, obj) # UPDATE CELL obj = cmd.update_cell('dataset', 'A', 1, 'X') cmd.validate_command(self.command_repository, obj) # Unknown VizUAL Command obj = { 'name': 'unknown', 'arguments': { 'dataset': '1', 'name': '2', 'position': '3' } } with self.assertRaises(ValueError): cmd.validate_command( self.command_repository, ModuleSpecification(cmd.PACKAGE_VIZUAL, 'unknown', obj))
def test_geocode_lens(self): """Test GEOCODE lens.""" # Create new work trail and retrieve the HEAD workflow of the default # branch mimir.initialize() f_handle = self.fileserver.upload_file(GEO_FILE) vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'}) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertFalse(wf.has_error) # Geocode Lens self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_geocode( DS_NAME, 'GOOGLE', house_nr=ds.column_by_name('STRNUMBER').identifier, street=ds.column_by_name('STRNAME').identifier, city=ds.column_by_name('CITY').identifier, state=ds.column_by_name('STATE').identifier ) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) if wf.has_error: print wf.modules[-1].stderr[0] self.assertEquals(wf.modules[-1].command_text.upper(), 'GEOCODE HOUSE_NUMBER=STRNUMBER,STREET=STRNAME,CITY=CITY,STATE=STATE PEOPLE USING GOOGLE') self.assertFalse(wf.has_error) self.assertEquals(len(wf.modules), 2) # Get dataset ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertEquals(len(ds.columns), 6) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_geocode( DS_NAME, 'GOOGLE' ) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) if wf.has_error: print wf.modules[-1].stderr[0] self.assertEquals(wf.modules[-1].command_text.upper(), 'GEOCODE PEOPLE USING GOOGLE') self.assertFalse(wf.has_error) self.assertEquals(len(wf.modules), 3) # Get dataset ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertEquals(len(ds.columns), 8) mimir.finalize()
def test_missing_key_lens(self): """Test MISSING_KEY lens.""" # Create new work trail and retrieve the HEAD workflow of the default # branch mimir.initialize() f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE) vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'}) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) # Missing Value Lens age_col = ds.columns[ds.column_index('Age')].identifier self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_missing_key(DS_NAME, age_col, missing_only=True) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING KEYS FOR AGE IN ' + DS_NAME.upper()) self.assertFalse(wf.has_error) # Get dataset ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertEquals(len(ds.columns), 3) rows = ds.fetch_rows() self.assertEquals(len(rows), 24) #self.db.append_workflow_module( # viztrail_id=vt.identifier, # command=cmd.load_dataset(f_handle.identifier, DS_NAME + '2') #) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_missing_key( DS_NAME, ds.columns[ds.column_index('Salary')].identifier, missing_only=True ) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) # Get dataset ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertEquals(len(ds.columns), 3) rows = ds.fetch_rows() self.assertEquals(len(rows), 55) mimir.finalize()
def test_key_repair_lens(self): """Test KEY REPAIR lens.""" # Create new work trail and retrieve the HEAD workflow of the default # branch mimir.initialize() f_handle = self.fileserver.upload_file(KEY_REPAIR_FILE) vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'}) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) ds1 = self.datastore.get_dataset(wf.modules[0].datasets[DS_NAME]) # Missing Value Lens self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_key_repair(DS_NAME, ds1.column_by_name('Empid').identifier) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertFalse(wf.has_error) self.assertEquals(wf.modules[-1].command_text.upper(), 'KEY REPAIR FOR EMPID IN ' + DS_NAME.upper()) # Get dataset ds2 = self.datastore.get_dataset(wf.modules[0].datasets[DS_NAME]) self.assertEquals(ds1.row_count, ds2.row_count) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) self.assertEquals(len(ds.columns), 4) self.assertEquals(ds.row_count, 2) names = set() empids = set() rowids = set() for row in DatasetClient(dataset=ds).rows: rowids.add(row.identifier) empids.add(int(row.get_value('empid'))) names.add(row.get_value('name')) self.assertTrue(1 in empids) self.assertTrue(2 in rowids) self.assertTrue('Alice' in names) self.assertTrue('Carla' in names) # Test error case and command text self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_key_repair('MY DS', 'MY COL') ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertTrue(wf.has_error) self.assertEquals(wf.modules[-1].command_text.upper(), 'KEY REPAIR FOR \'MY COL\' IN \'MY DS\'') mimir.finalize()
def run_update_datasets(self):
    """Test dropping and renaming of datasets.

    Renames a loaded dataset, drops it, verifies that dropping it a second
    time fails, then deletes the failing module and replaces the drop with a
    Python cell that reads the renamed dataset.
    """
    f_handle = self.fileserver.upload_file(CSV_FILE)
    vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.load_dataset(f_handle.identifier, DS_NAME)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    self.assertTrue(DS_NAME in wf.modules[-1].datasets)
    # RENAME DATASET: the old name remains visible at the first module but
    # only the new name is visible after the rename module
    new_name = DS_NAME + '_renamed'
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.rename_dataset(DS_NAME, new_name)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    self.assertTrue(DS_NAME in wf.modules[0].datasets)
    self.assertFalse(new_name in wf.modules[0].datasets)
    self.assertFalse(DS_NAME in wf.modules[-1].datasets)
    self.assertTrue(new_name in wf.modules[-1].datasets)
    # DROP DATASET
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.drop_dataset(new_name)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    self.assertFalse(new_name in wf.modules[-1].datasets)
    # Dropping the same dataset again must fail
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.drop_dataset(new_name)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertTrue(wf.has_error)
    # Delete the Drop Dataset that failed and replace the first drop with
    # a Python module that prints names
    self.db.delete_workflow_module(
        viztrail_id=vt.identifier,
        module_id=wf.modules[-1].identifier
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    self.db.replace_workflow_module(
        viztrail_id=vt.identifier,
        module_id=wf.modules[-1].identifier,
        command=cmd.python_cell("""
for row in vizierdb.get_dataset('""" + new_name + """').rows:
    print row.get_value('Name')
""")
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    # The Python cell printed one name per row of the renamed dataset
    self.assertEquals(wf.modules[-1].stdout[0]['data'], 'Alice\nBob')
    self.assertFalse(DS_NAME in wf.modules[-1].datasets)
    self.assertTrue(new_name in wf.modules[-1].datasets)
def test_eval_command(self): """Test functionality to execute a workflow module.""" # Create new work trail, append a module and retrieve the resulting # workflow from default branch HEAD. vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'}) self.db.append_workflow_module(viztrail_id=vt.identifier, command=python_cell('abc')) wf = vt.get_workflow() self.assertEquals(wf.version, 0) self.assertEquals(len(wf.modules), 1) self.db.append_workflow_module(viztrail_id=vt.identifier, command=python_cell('def')) wf = vt.get_workflow(branch_id=DEFAULT_BRANCH) self.assertEquals(wf.version, 1) self.assertEquals(len(wf.modules), 2) self.assertEquals(len(wf.modules[0].stdout), 1) self.assertEquals(wf.modules[0].command.module_type, PACKAGE_PYTHON) self.assertEquals(wf.modules[0].command.command_identifier, PYTHON_CODE) self.assertEquals(wf.modules[0].command.arguments[PYTHON_SOURCE], 'abc') self.assertEquals(len(wf.modules[1].stdout), 1) self.assertEquals(wf.modules[1].command.module_type, PACKAGE_PYTHON) self.assertEquals(wf.modules[1].command.command_identifier, PYTHON_CODE) self.assertEquals(wf.modules[1].command.arguments[PYTHON_SOURCE], 'def') self.db.replace_workflow_module(viztrail_id=vt.identifier, module_id=0, command=load_dataset('file', 'ds')) wf = vt.get_workflow() self.assertEquals(wf.version, 2) self.assertEquals(len(wf.modules), 2) self.assertEquals(len(wf.modules[0].stdout), 1) self.assertEquals(wf.modules[0].command.module_type, PACKAGE_VIZUAL) self.assertEquals(wf.modules[0].command.command_identifier, VIZUAL_LOAD) self.assertEquals(wf.modules[0].command.arguments[PARA_FILE]['fileid'], 'file') self.assertEquals(wf.modules[0].command.arguments[PARA_NAME], 'ds') self.assertEquals(len(wf.modules[1].stdout), 2) self.assertEquals(wf.modules[1].command.module_type, PACKAGE_PYTHON) self.assertEquals(wf.modules[1].command.command_identifier, PYTHON_CODE) self.assertEquals(wf.modules[1].command.arguments[PYTHON_SOURCE], 'def')
def run_delete_modules(self):
    """Test deletion of modules.

    Deletes an intermediate UPDATE CELL module (the workflow re-executes and
    the cell reverts), then deletes the LOAD module (introducing an error),
    and finally deletes the last remaining module.
    """
    f_handle = self.fileserver.upload_file(CSV_FILE)
    vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
    #print '(1) CREATE DATASET'
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.load_dataset(f_handle.identifier, DS_NAME)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    col_age = ds.column_by_name('Age')
    # Two cell updates on the Age column
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.update_cell(DS_NAME, col_age.identifier, 0, '28')
    )
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.update_cell(DS_NAME, col_age.identifier, 1, '42')
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    ds = DatasetClient(
        self.datastore.get_dataset(wf.modules[-1].datasets['people']))
    self.assertEquals(int(ds.rows[0].get_value('Age')), 28)
    self.assertEquals(int(ds.rows[1].get_value('Age')), 42)
    # DELETE UPDATE CELL: row 0 reverts to its pre-update value
    self.db.delete_workflow_module(
        viztrail_id=vt.identifier,
        module_id=wf.modules[1].identifier
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    ds = DatasetClient(
        self.datastore.get_dataset(wf.modules[-1].datasets['people']))
    # 23 is presumably the original value from the CSV file
    self.assertEquals(int(ds.rows[0].get_value('Age')), 23)
    # DELETE LOAD (will introduce error)
    self.db.delete_workflow_module(
        viztrail_id=vt.identifier,
        module_id=wf.modules[0].identifier
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertTrue(wf.has_error)
    # DELETE last remaining module: an empty workflow has no error
    self.db.delete_workflow_module(
        viztrail_id=vt.identifier,
        module_id=wf.modules[0].identifier
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
def test_domain_lens(self): """Test DOMAIN lens.""" # Create new work trail and retrieve the HEAD workflow of the default # branch mimir.initialize() f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE) vt = self.db.create_viztrail(ENGINE_ID, {'name' : 'My Project'}) self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) col_age = ds.column_by_name('Age') self.assertFalse(wf.has_error) # Missing Value Lens self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_domain(DS_NAME, col_age.identifier) ) wf = self.db.get_workflow(viztrail_id=vt.identifier) if wf.has_error: print wf.modules[-1].stderr[0] self.assertEquals(wf.modules[-1].command_text.upper(), 'DOMAIN FOR AGE IN PEOPLE') self.assertFalse(wf.has_error) self.assertEquals(len(wf.modules), 2) # Get dataset ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) rows = ds.fetch_rows() self.assertNotEquals(rows[2].values[ds.column_index('Age')], '') # Introduce an error. Make sure command formating is correct self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_domain('MY DS', 'MY COL') ) wf = self.db.get_workflow(viztrail_id=vt.identifier) self.assertTrue(wf.has_error) self.assertEquals(wf.modules[-1].command_text.upper(), 'DOMAIN FOR \'MY COL\' IN \'MY DS\'') mimir.finalize()
def test_spreadsheet(self):
    """Ensure that the includeDataset option is working for spreadsheet
    updates.
    """
    # Upload file and create the project to work with
    fh = self.fileserver.upload_file(CSV_FILE)
    ph = self.api.create_project(self.ENV.identifier, {'name': 'My Project'})
    pid = ph['id']
    # Load dataset. Without includeDataset the result contains the modules
    # and datasets listings. Note: 'command' avoids shadowing the cmd module
    # name used elsewhere in this file.
    DS_NAME = 'myDS'
    command = load_dataset(fh.identifier, DS_NAME)
    result = self.api.append_module(pid, DEFAULT_BRANCH, -1, command)
    self.validate_keys(result, ['workflow', 'modules', 'datasets'])
    # Update a cell and request the dataset to be included in the result
    command = update_cell(DS_NAME, 0, 0, 'A')
    result = self.api.append_module(
        pid,
        DEFAULT_BRANCH,
        -1,
        command,
        includeDataset={'name': DS_NAME, 'offset': 0}
    )
    self.validate_keys(result, ['workflow', 'dataset'])
    self.validate_dataset_handle(result['dataset'])
    # In case of an error the result contains the modules instead of the
    # requested dataset
    command = update_cell(DS_NAME, 100, 0, 'A')
    result = self.api.append_module(
        pid,
        DEFAULT_BRANCH,
        -1,
        command,
        includeDataset={'name': DS_NAME, 'offset': 0}
    )
    self.validate_keys(result, ['workflow', 'modules', 'datasets'])
def test_annotations(self): """Test DOMAIN lens.""" # Create new work trail and create dataset from CSV file mimir.initialize() f_handle = self.fileserver.upload_file(CSV_FILE) vt = self.db.create_viztrail(ENGINE_ID, {'name': 'My Project'}) self.db.append_workflow_module(viztrail_id=vt.identifier, command=cmd.load_dataset( f_handle.identifier, DS_NAME)) wf = self.db.get_workflow(viztrail_id=vt.identifier) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) # Missing Value Lens self.db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_missing_value( DS_NAME, ds.column_by_name('AGE').identifier)) wf = self.db.get_workflow(viztrail_id=vt.identifier) ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) annos = ds.get_annotations(column_id=1, row_id=2) self.assertEquals(len(annos), 2) for anno in annos: self.assertEquals(anno.key, ANNO_UNCERTAIN) mimir.finalize()
datastore = MimirDataStore(DATASTORE_DIR) fileserver = DefaultFileServer(FILESERVER_DIR) vizual = MimirVizualEngine(datastore, fileserver) db = FileSystemViztrailRepository(VIZTRAILS_DIR, {ENV.identifier: ENV}) mimir.initialize() vt = db.create_viztrail(ENV.identifier, {'name': 'My Project'}) # # LOAD DATASET # f_handle = fileserver.upload_file(CSV_FILE) db.append_workflow_module(viztrail_id=vt.identifier, command=cmd.load_dataset(f_handle.identifier, DS_NAME)) wf = db.get_workflow(viztrail_id=vt.identifier) ds = datastore.get_dataset(wf.modules[-1].datasets[DS_NAME]) print_dataset(ds) """ # # PICKER LENS # db.append_workflow_module( viztrail_id=vt.identifier, command=cmd.mimir_picker( DS_NAME, [ {'pickFrom': 'A'}, {'pickFrom': 'B'} ],
def test_missing_value_lens(self):
    """Test MISSING_VALUE lens.

    Applies the missing value lens with and without a value constraint and
    verifies command text, repaired cells, annotations, and the command-text
    rendering for a failing invocation.
    """
    # Create new work trail and retrieve the HEAD workflow of the default
    # branch
    mimir.initialize()
    f_handle = self.fileserver.upload_file(INCOMPLETE_CSV_FILE)
    vt = self.db.create_viztrail(ENGINE_ID, {'name': 'My Project'})
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.load_dataset(f_handle.identifier, DS_NAME)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    self.assertFalse(wf.has_error)
    # Missing Value Lens
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.mimir_missing_value(DS_NAME, ds.column_by_name('AGE').identifier)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING VALUES FOR AGE IN ' + DS_NAME.upper())
    self.assertEquals(len(wf.modules), 2)
    # Get dataset: the previously empty Age cell must now have a value
    ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    rows = ds.fetch_rows()
    self.assertNotEquals(rows[2].values[ds.column_index('Age')], '')
    # Annotations on the repaired cell
    annotations = ds.get_annotations(column_id=1, row_id=4)
    self.assertEquals(len(annotations), 2)
    # MISSING VALUE Lens with value constraint (fresh viztrail)
    vt = self.db.create_viztrail(ENGINE_ID, {'name': 'New Project'})
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.load_dataset(f_handle.identifier, DS_NAME)
    )
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.mimir_missing_value(
            DS_NAME,
            ds.column_by_name('AGE').identifier,
            constraint='> 30')
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    if wf.has_error:
        # Dump stderr to aid debugging before the assertion fails
        print wf.modules[-1].stderr[0]
    self.assertFalse(wf.has_error)
    self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING VALUES FOR AGE IN ' + DS_NAME.upper() + ' WITH CONSTRAINT > 30')
    #self.assertEquals(wf.modules[-1].command_text.upper(), 'MISSING VALUES FOR AGE IN ' + DS_NAME.upper())
    # The constraint forces the repaired value to satisfy > 30
    ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    rows = ds.fetch_rows()
    self.assertTrue(rows[2].values[ds.column_index('Age')] > 30)
    # Command text in case of error
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.mimir_missing_value('MY DS', '?', constraint='A B')
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertTrue(wf.has_error)
    cmd_text = wf.modules[-1].command_text.upper()
    expected_text = 'MISSING VALUES FOR ? IN \'MY DS\'' + ' WITH CONSTRAINT A B'
    self.assertEquals(cmd_text, expected_text)
    mimir.finalize()
def test_picker_lens(self):
    """Test PICKER lens.

    Applies the picker lens three times: with the default result column
    name, with a custom name, and picking from a previously picked column.
    """
    # Create new work trail and retrieve the HEAD workflow of the default
    # branch
    mimir.initialize()
    f_handle = self.fileserver.upload_file(PICKER_FILE)
    vt = self.db.create_viztrail(ENGINE_ID, {'name': 'My Project'})
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.load_dataset(f_handle.identifier, DS_NAME)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    # Picker Lens over the Age and Salary columns
    ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.mimir_picker(DS_NAME, [
            {'pickFrom': ds.column_by_name('Age').identifier},
            {'pickFrom': ds.column_by_name('Salary').identifier}
        ])
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    if wf.modules[-1].has_error:
        # Dump stderr to aid debugging before the assertion fails
        print wf.modules[-1].stderr
    self.assertFalse(wf.has_error)
    self.assertEquals(wf.modules[-1].command_text.upper(), 'PICK FROM AGE,SALARY IN ' + DS_NAME.upper())
    # Get dataset: the lens appended a default-named result column
    self.assertEquals(len(wf.modules[-1].datasets), 1)
    ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    columns = [c.name for c in ds.columns]
    self.assertEquals(len(ds.columns), 5)
    self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
    # Pick another column, this time with custom name
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.mimir_picker(DS_NAME, [
            {'pickFrom': ds.column_by_name('Age').identifier},
            {'pickFrom': ds.column_by_name('Salary').identifier}
        ], pick_as='My Column')
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    # Get dataset: both the default-named and the custom-named columns exist
    self.assertEquals(len(wf.modules[-1].datasets), 1)
    ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    columns = [c.name for c in ds.columns]
    self.assertEquals(len(ds.columns), 6)
    self.assertTrue('PICK_ONE_AGE_SALARY' in columns)
    self.assertTrue('My Column' in columns)
    # Pick from a picked column
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.mimir_picker(DS_NAME, [
            {'pickFrom': ds.column_by_name('Age').identifier},
            {'pickFrom': ds.column_by_name('PICK_ONE_AGE_SALARY').identifier}
        ], pick_as='My Column')
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    if wf.modules[-1].has_error:
        print wf.modules[-1].stderr
    self.assertFalse(wf.has_error)
    self.assertEquals(wf.modules[-1].command_text.upper(), 'PICK FROM AGE,PICK_ONE_AGE_SALARY AS \'MY COLUMN\' IN ' + DS_NAME.upper())
    ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    mimir.finalize()
def test_schema_matching_lens(self):
    """Test SCHEMA_MATCHING lens.

    Matches a loaded dataset against a target schema into a new dataset and
    checks the error cases: duplicate result name and invalid result name.
    """
    # Create new work trail and retrieve the HEAD workflow of the default
    # branch
    mimir.initialize()
    f_handle = self.fileserver.upload_file(CSV_FILE)
    vt = self.db.create_viztrail(ENGINE_ID, {'name': 'My Project'})
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.load_dataset(f_handle.identifier, DS_NAME)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    # Schema Matching Lens into a new dataset
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.mimir_schema_matching(DS_NAME, [
            {'column': 'BDate', 'type': 'int'},
            {'column': 'PName', 'type': 'varchar'}
        ], 'new_' + DS_NAME)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    self.assertEquals(wf.modules[-1].command_text.upper(), 'SCHEMA MATCHING PEOPLE (BDATE INT, PNAME VARCHAR) AS NEW_' + DS_NAME.upper())
    # Get dataset: both the source and the new dataset are now visible
    self.assertEquals(len(wf.modules[-1].datasets), 2)
    ds = self.datastore.get_dataset(wf.modules[-1].datasets['new_' + DS_NAME])
    self.assertEquals(len(ds.columns), 2)
    self.assertEquals(ds.row_count, 2)
    # Error if adding an existing dataset
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.mimir_schema_matching(
            DS_NAME,
            [{'column': 'BDate', 'type': 'int'}],
            'new_' + DS_NAME
        )
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertTrue(wf.has_error)
    # Replacing the failing module with a fresh result name clears the error
    self.db.replace_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.mimir_schema_matching(
            DS_NAME,
            [{'column': 'BDate', 'type': 'int'}],
            'a_new_' + DS_NAME
        ),
        module_id=wf.modules[-1].identifier,
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    self.assertEquals(wf.modules[-1].command_text.upper(), 'SCHEMA MATCHING PEOPLE (BDATE INT) AS A_NEW_' + DS_NAME.upper())
    # Error when adding a dataset with an invalid name; the rendered command
    # text quotes the invalid name
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.mimir_schema_matching(
            DS_NAME,
            [{'column': 'BDate', 'type': 'int'}],
            'SOME NAME'
        )
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertTrue(wf.has_error)
    self.assertEquals(wf.modules[-1].command_text.upper(), 'SCHEMA MATCHING PEOPLE (BDATE INT) AS \'SOME NAME\'')
    mimir.finalize()
def run_mixed_workflow(self):
    """Test functionality to execute a workflow module.

    Runs a workflow that mixes VizUAL commands (load, insert row, update
    cell) with Python cells, checking the rendered command text after
    every step and the final dataset content.
    """
    f_handle = self.fileserver.upload_file(CSV_FILE)
    vt = self.db.create_viztrail(self.ENGINE_ID, {'name': 'My Project'})
    # (1) Create dataset
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.load_dataset(f_handle.identifier, DS_NAME)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    cmd_text = wf.modules[-1].command_text
    # assertEqual (assertEquals is a deprecated alias)
    self.assertEqual(cmd_text, 'LOAD DATASET people FROM FILE dataset.csv')
    # (2) Insert row
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.insert_row(DS_NAME, 1)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    cmd_text = wf.modules[-1].command_text
    self.assertEqual(cmd_text, 'INSERT ROW INTO people AT POSITION 1')
    # (3) Set name to Bobby and set variables via a Python cell
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.python_cell(SET_VARIABLES_PY)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    cmd_text = wf.modules[-1].command_text
    self.assertEqual(cmd_text, SET_VARIABLES_PY)
    ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    # (4) Set age to 28
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.update_cell(
            DS_NAME, ds.column_by_name('Age').identifier, 1, '28'
        )
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    cmd_text = wf.modules[-1].command_text
    self.assertEqual(cmd_text.upper(), 'UPDATE PEOPLE SET [AGE,1] = 28')
    ds = self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    # (5) Change Alice to Bob
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.update_cell(
            DS_NAME, ds.column_by_name('Name').identifier, 0, 'Bob'
        )
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertFalse(wf.has_error)
    cmd_text = wf.modules[-1].command_text
    self.assertEqual(cmd_text.upper(), 'UPDATE PEOPLE SET [NAME,0] = \'BOB\'')
    # (6) Update dataset with filter
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        command=cmd.python_cell(UPDATE_DATASET_WITH_FILTER_PY)
    )
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    cmd_text = wf.modules[-1].command_text
    self.assertEqual(cmd_text, UPDATE_DATASET_WITH_FILTER_PY)
    self.assertFalse(wf.has_error)
    # Ensure that all names are Bobby and the ages match. Assert the row
    # count explicitly so the zip below cannot silently truncate.
    ds = DatasetClient(
        self.datastore.get_dataset(wf.modules[-1].datasets[DS_NAME])
    )
    expected_ages = [23, 28, 32]
    self.assertEqual(len(ds.rows), len(expected_ages))
    for row, expected_age in zip(ds.rows, expected_ages):
        self.assertEqual(row.get_value('Name'), 'Bobby')
        self.assertEqual(int(row.get_value('Age')), expected_age)
def test_branching(self):
    """Test functionality to execute a workflow module.

    Exercises branch creation (at HEAD and at an arbitrary module),
    persistence of branches across repository reloads, appending and
    replacing modules on different branches, error cases for invalid
    branch sources, and branch provenance information.
    """
    # Create new viztrail and ensure that it contains exactly one branch
    vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'})
    # assertEqual (assertEquals is a deprecated alias)
    self.assertEqual(len(vt.branches), 1)
    self.assertTrue(DEFAULT_BRANCH in vt.branches)
    self.assertEqual(vt.branches[DEFAULT_BRANCH].identifier, DEFAULT_BRANCH)
    # Append two modules to the default branch
    self.db.append_workflow_module(
        viztrail_id=vt.identifier, command=python_cell('abc'))
    self.db.append_workflow_module(
        viztrail_id=vt.identifier, command=load_dataset('file', 'name'))
    # Create a branch at the end of the default branch. The new branch
    # contains one workflow with two modules; the version number is 2
    newbranch = self.db.create_branch(
        viztrail_id=vt.identifier, properties={'name': 'New Branch'})
    self.assertEqual(len(newbranch.workflows), 1)
    self.assertEqual(newbranch.workflows[-1].version, 2)
    wf = vt.get_workflow(branch_id=newbranch.identifier)
    self.assertEqual(wf.version, 2)
    self.assertEqual(len(wf.modules), 2)
    self.assertTrue(newbranch.identifier in vt.branches)
    # Ensure that everything has been persisted properly
    self.db = FileSystemViztrailRepository(
        VIZTRAILS_DIRECTORY, {ENV.identifier: ENV})
    vt = self.db.get_viztrail(vt.identifier)
    newbranch = vt.branches[newbranch.identifier]
    self.assertEqual(len(newbranch.workflows), 1)
    self.assertEqual(newbranch.workflows[-1].version, 2)
    wf = vt.get_workflow(branch_id=newbranch.identifier)
    self.assertEqual(wf.version, 2)
    self.assertEqual(len(wf.modules), 2)
    self.assertTrue(newbranch.identifier in vt.branches)
    self.assertEqual(
        newbranch.properties.get_properties()['name'], 'New Branch')
    # Create a third branch from the start of the master branch
    thirdbranch = self.db.create_branch(
        viztrail_id=vt.identifier,
        properties={'name': 'Next Branch'},
        module_id=0)
    wf = vt.get_workflow(branch_id=thirdbranch.identifier)
    self.assertEqual(wf.version, 3)
    self.assertEqual(len(wf.modules), 1)
    # Append modules at end of master and at beginning of thirdbranch
    self.db.append_workflow_module(
        viztrail_id=vt.identifier, command=python_cell('abc'))
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        branch_id=thirdbranch.identifier,
        command=python_cell('def'),
        before_id=0)
    master_head = vt.get_workflow()
    self.assertEqual(len(master_head.modules), 3)
    self.assertEqual(master_head.modules[0].command.module_type, PACKAGE_PYTHON)
    self.assertEqual(master_head.modules[1].command.module_type, PACKAGE_VIZUAL)
    self.assertEqual(master_head.modules[2].command.module_type, PACKAGE_PYTHON)
    b2_head = vt.get_workflow(branch_id=newbranch.identifier)
    self.assertEqual(len(b2_head.modules), 2)
    self.assertEqual(b2_head.modules[0].command.module_type, PACKAGE_PYTHON)
    self.assertEqual(b2_head.modules[1].command.module_type, PACKAGE_VIZUAL)
    b3_head = vt.get_workflow(branch_id=thirdbranch.identifier)
    self.assertEqual(len(b3_head.modules), 2)
    self.assertEqual(b3_head.modules[0].command.module_type, PACKAGE_PYTHON)
    self.assertEqual(b3_head.modules[1].command.module_type, PACKAGE_PYTHON)
    # Replace second module of third branch
    self.db.replace_workflow_module(
        viztrail_id=vt.identifier,
        branch_id=thirdbranch.identifier,
        module_id=b3_head.modules[1].identifier,
        command=load_dataset('file', 'name'))
    b3_head = vt.get_workflow(branch_id=thirdbranch.identifier)
    self.assertEqual(len(b3_head.modules), 2)
    self.assertEqual(b3_head.modules[0].command.module_type, PACKAGE_PYTHON)
    self.assertEqual(b3_head.modules[1].command.module_type, PACKAGE_VIZUAL)
    # Replacing on the third branch must not affect the other branches
    master_head = vt.get_workflow()
    self.assertEqual(len(master_head.modules), 3)
    self.assertEqual(master_head.modules[0].command.module_type, PACKAGE_PYTHON)
    self.assertEqual(master_head.modules[1].command.module_type, PACKAGE_VIZUAL)
    self.assertEqual(master_head.modules[2].command.module_type, PACKAGE_PYTHON)
    b2_head = vt.get_workflow(branch_id=newbranch.identifier)
    self.assertEqual(len(b2_head.modules), 2)
    self.assertEqual(b2_head.modules[0].command.module_type, PACKAGE_PYTHON)
    self.assertEqual(b2_head.modules[1].command.module_type, PACKAGE_VIZUAL)
    # Ensure there are exceptions raised when branching of an unknown branch
    # or module
    with self.assertRaises(ValueError):
        self.db.create_branch(
            viztrail_id=vt.identifier,
            source_branch='unknonw-branch',
            properties={'name': 'New Branch'})
    with self.assertRaises(ValueError):
        self.db.create_branch(
            viztrail_id=vt.identifier,
            properties={'name': 'New Branch'},
            module_id=100)
    with self.assertRaises(ValueError):
        self.db.create_branch(viztrail_id=vt.identifier)
    # Test branch provenance
    self.assertEqual(newbranch.provenance.source_branch, DEFAULT_BRANCH)
    self.assertEqual(newbranch.provenance.workflow_version, 1)
    self.assertEqual(newbranch.provenance.module_id, 1)
    self.assertEqual(thirdbranch.provenance.source_branch, DEFAULT_BRANCH)
    self.assertEqual(thirdbranch.provenance.workflow_version, 1)
    self.assertEqual(thirdbranch.provenance.module_id, 0)
def test_append_module(self):
    """Test appending modules.

    Verifies that appending modules creates new workflow versions, that
    the versions are persisted across repository reloads, and that
    appending to an older version truncates subsequent modules in the
    new branch HEAD.
    """
    # Create new viztrail.
    vt = self.db.create_viztrail(ENV.identifier, {'name': 'My Project'})
    self.db.append_workflow_module(
        viztrail_id=vt.identifier, command=python_cell('abc'))
    self.db.append_workflow_module(
        viztrail_id=vt.identifier, command=load_dataset('file', 'name'))
    # The default branch should have two versions. The first version contains
    # one module and the second version contains two modules
    # assertEqual (assertEquals is a deprecated alias)
    self.assertEqual(len(vt.branches[DEFAULT_BRANCH].workflows), 2)
    v1 = self.db.get_workflow(
        viztrail_id=vt.identifier,
        workflow_version=vt.branches[DEFAULT_BRANCH].workflows[0].version)
    v2 = self.db.get_workflow(
        viztrail_id=vt.identifier,
        workflow_version=vt.branches[DEFAULT_BRANCH].workflows[1].version)
    head = self.db.get_workflow(
        viztrail_id=vt.identifier, branch_id=DEFAULT_BRANCH)
    self.assertEqual(len(v1.modules), 1)
    self.assertEqual(len(v2.modules), 2)
    self.assertEqual(len(head.modules), 2)
    # Ensure that all modules have non-negative identifier
    for m in head.modules:
        self.assertTrue(m.identifier >= 0)
    self.assertEqual(head.modules[0].command.module_type, PACKAGE_PYTHON)
    self.assertEqual(head.modules[1].command.module_type, PACKAGE_VIZUAL)
    self.assertEqual(head.version, 1)
    # Re-load the viztrails to ensure that all information has been
    # persisted properly
    self.db = FileSystemViztrailRepository(
        VIZTRAILS_DIRECTORY, {ENV.identifier: ENV})
    vt = self.db.get_viztrail(vt.identifier)
    self.assertEqual(len(vt.branches[DEFAULT_BRANCH].workflows), 2)
    v1 = self.db.get_workflow(
        viztrail_id=vt.identifier,
        workflow_version=vt.branches[DEFAULT_BRANCH].workflows[0].version)
    v2 = self.db.get_workflow(
        viztrail_id=vt.identifier,
        workflow_version=vt.branches[DEFAULT_BRANCH].workflows[1].version)
    head = self.db.get_workflow(
        viztrail_id=vt.identifier, branch_id=DEFAULT_BRANCH)
    self.assertEqual(len(v1.modules), 1)
    self.assertEqual(len(v2.modules), 2)
    self.assertEqual(len(head.modules), 2)
    # Ensure that all modules have non-negative identifier
    for m in head.modules:
        self.assertTrue(m.identifier >= 0)
    self.assertEqual(head.modules[0].command.module_type, PACKAGE_PYTHON)
    self.assertEqual(head.modules[1].command.module_type, PACKAGE_VIZUAL)
    self.assertEqual(head.version, 1)
    # Append a third module to the head of the default branch
    self.db.append_workflow_module(
        viztrail_id=vt.identifier, command=python_cell('def'))
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertEqual(len(wf.modules), 3)
    for m in wf.modules:
        self.assertTrue(m.identifier >= 0)
        self.assertEqual(
            m.stdout[0]['data'], 'SUCCESS ' + str(m.identifier))
    self.assertEqual(wf.modules[0].command.module_type, PACKAGE_PYTHON)
    self.assertEqual(wf.modules[1].command.module_type, PACKAGE_VIZUAL)
    self.assertEqual(wf.modules[2].command.module_type, PACKAGE_PYTHON)
    self.assertEqual(wf.version, 2)
    # Append a module to the first version in the branch. The resulting new
    # branch HEAD is expected to contain only two modules then.
    self.db.append_workflow_module(
        viztrail_id=vt.identifier,
        workflow_version=0,
        command=python_cell('def'))
    self.db = FileSystemViztrailRepository(
        VIZTRAILS_DIRECTORY, {ENV.identifier: ENV})
    vt = self.db.get_viztrail(vt.identifier)
    wf = self.db.get_workflow(viztrail_id=vt.identifier)
    self.assertEqual(len(wf.modules), 2)
    for m in wf.modules:
        self.assertTrue(m.identifier >= 0)
        self.assertEqual(
            m.stdout[0]['data'], 'SUCCESS ' + str(m.identifier))
    self.assertEqual(wf.modules[0].command.module_type, PACKAGE_PYTHON)
    self.assertEqual(wf.modules[1].command.module_type, PACKAGE_PYTHON)
    self.assertEqual(wf.version, 3)