def test_workflow(self):
    """Run a small workflow end-to-end and verify the datasets it produces.

    The test appends a load-dataset module and a Python cell to the default
    branch, then inserts a second Python cell in front of the first module.
    After each change it waits for the branch head to become inactive and
    expects every module to have finished successfully. Finally it walks the
    module provenance to resolve the datasets and checks the first two rows
    of each one.
    """
    project = self.engine.projects.create_project()
    project_id = project.identifier
    branch_id = project.viztrail.default_branch.identifier

    # Module 1: load the CSV file as the people dataset.
    uploaded = project.filestore.upload_file(CSV_FILE)
    file_argument = {
        pckg.FILE_ID: uploaded.identifier,
        pckg.FILE_NAME: os.path.basename(CSV_FILE)
    }
    load_cmd = load_dataset(
        dataset_name=DATASET_PEOPLE,
        file=file_argument,
        infer_types=True
    )
    self.engine.append_workflow_module(
        project_id=project_id,
        branch_id=branch_id,
        command=load_cmd
    )

    # Module 2: a Python cell running the PY_ADD_ONE script.
    self.engine.append_workflow_module(
        project_id=project_id,
        branch_id=branch_id,
        command=python_cell(PY_ADD_ONE)
    )

    # The head handle is taken before polling for completion.
    workflow = project.viztrail.default_branch.head
    while project.viztrail.default_branch.head.is_active:
        time.sleep(0.1)
    for module in workflow.modules:
        print(module)
        self.assertTrue(module.is_success)

    # Insert a dataset-creating cell in front of the current first module.
    self.engine.insert_workflow_module(
        project_id=project_id,
        branch_id=branch_id,
        before_module_id=workflow.modules[0].identifier,
        command=python_cell(CREATE_DATASET_PY)
    )
    workflow = project.viztrail.default_branch.head
    while project.viztrail.default_branch.head.is_active:
        time.sleep(0.1)
    for module in workflow.modules:
        print(module)
        self.assertTrue(module.is_success)

    # The inserted module only writes the friends dataset.
    state = workflow.modules[0].provenance.write
    self.assertTrue(DATASET_FRIENDS in state)
    self.assertFalse(DATASET_PEOPLE in state)
    # Every later module sees both datasets in its database state.
    for module in workflow.modules[1:]:
        state = module.provenance.get_database_state(state)
        self.assertTrue(DATASET_FRIENDS in state)
        self.assertTrue(DATASET_PEOPLE in state)

    people_id = state[DATASET_PEOPLE].identifier
    people_rows = project.datastore.get_dataset(people_id).fetch_rows()
    self.assertEqual(people_rows[0].values, ['Alice', 24])
    self.assertEqual(people_rows[1].values, ['Bob', 32])

    friends_id = state[DATASET_FRIENDS].identifier
    friends_rows = project.datastore.get_dataset(friends_id).fetch_rows()
    self.assertEqual(friends_rows[0].values, ['Yonder', 23])
    self.assertEqual(friends_rows[1].values, ['Zoe', 34])
def test_skip_modules(self):
    """Test replacing a module in a workflow where some cells do not need to
    be re-executed because they access a different dataset.

    The workflow consists of two load-dataset modules followed by ten Python
    cells that alternate between the PY_ADD_ONE and PY_ADD_SECOND scripts.
    After replacing the module at index 4, every module from that index on
    is expected to carry a new identifier for the first dataset and an
    unchanged identifier for the second dataset.
    """
    project = self.engine.projects.create_project()
    branch_id = project.get_default_branch().identifier
    fh1 = project.filestore.upload_file(CSV_FILE)
    fh2 = project.filestore.upload_file(CSV_FILE)
    self.engine.append_workflow_module(
        project_id=project.identifier,
        branch_id=branch_id,
        command=load_dataset(
            dataset_name=DATASET_NAME,
            file={pckg.FILE_ID: fh1.identifier}
        )
    )
    self.engine.append_workflow_module(
        project_id=project.identifier,
        branch_id=branch_id,
        command=load_dataset(
            dataset_name=SECOND_DATASET_NAME,
            file={pckg.FILE_ID: fh2.identifier}
        )
    )
    # Even positions run PY_ADD_ONE, odd positions run PY_ADD_SECOND.
    for i in range(10):
        source = PY_ADD_ONE if i % 2 == 0 else PY_ADD_SECOND
        self.engine.append_workflow_module(
            project_id=project.identifier,
            branch_id=branch_id,
            command=python_cell(source)
        )
    while project.viztrail.default_branch.head.is_active:
        time.sleep(0.1)
    wf = project.viztrail.default_branch.head
    self.assertTrue(wf.get_state().is_success)
    # Remember the dataset mapping of every module from index 4 onwards so
    # it can be compared against the workflow that results from the replace.
    datasets_before = [module.datasets for module in wf.modules[4:]]
    self.assert_module_count_is(project, 12)
    # Replace a module that updates the first dataset. All modules that
    # access the second dataset should remain unchanged.
    self.engine.replace_workflow_module(
        project_id=project.identifier,
        branch_id=branch_id,
        module_id=wf.modules[4].identifier,
        command=python_cell(PY_ADD_TEN)
    )
    while project.viztrail.default_branch.head.is_active:
        time.sleep(0.1)
    wf = project.viztrail.default_branch.head
    self.assertTrue(wf.get_state().is_success)
    for idx, module in enumerate(wf.modules[4:]):
        before = datasets_before[idx]
        self.assertNotEqual(
            before[DATASET_NAME].identifier,
            module.datasets[DATASET_NAME].identifier
        )
        self.assertEqual(
            before[SECOND_DATASET_NAME].identifier,
            module.datasets[SECOND_DATASET_NAME].identifier
        )
def test_replace(self):
    """Test replacing a module.

    Covers three cases: replacing a module in the middle of a completed
    workflow, attempting a replace in an unknown branch (expected to return
    None), and replacing the first module, after which the first module is
    expected to be in error state and all following modules canceled.
    """
    project = self.engine.projects.create_project()
    branch_id = self.create_workflow(project)
    wf = project.viztrail.default_branch.head
    # Keep track of datasets in the completed workflow
    datasets = [m.datasets[DATASET_NAME].identifier for m in wf.modules]
    # Replace the module in the middle of the workflow
    cmd = python_cell(PY_ADD_TEN)
    result = self.engine.replace_workflow_module(
        project_id=project.identifier,
        branch_id=branch_id,
        module_id=wf.modules[5].identifier,
        command=cmd
    )
    self.assertEqual(len(result), 6)
    while project.viztrail.default_branch.head.is_active:
        time.sleep(0.1)
    self.assert_module_count_is(project, 11)
    self.assert_value_is(project, 42)
    wf = project.viztrail.default_branch.head
    # Modules in front of the replaced one keep their dataset identifiers
    for i in range(5):
        self.assertEqual(
            datasets[i],
            wf.modules[i].datasets[DATASET_NAME].identifier
        )
    # The replaced module and all that follow reference new datasets
    for module in wf.modules[5:]:
        self.assertNotIn(module.datasets[DATASET_NAME].identifier, datasets)
    # Ensure that None is returned when attempting to replace a module in
    # an unknown branch
    result = self.engine.replace_workflow_module(
        project_id=project.identifier,
        branch_id='null',
        module_id=wf.modules[0].identifier,
        command=python_cell('print 2+2')
    )
    self.assertIsNone(result)
    # Replace at the start will leave all modules in error or canceled
    # state
    result = self.engine.replace_workflow_module(
        project_id=project.identifier,
        branch_id=branch_id,
        module_id=wf.modules[0].identifier,
        command=cmd
    )
    self.assertEqual(len(result), 11)
    while project.viztrail.default_branch.head.is_active:
        time.sleep(0.1)
    self.assert_module_count_is(project, 11)
    wf = project.viztrail.default_branch.head
    self.assertTrue(wf.modules[0].is_error)
    for module in wf.modules[1:]:
        self.assertTrue(module.is_canceled)
def create_workflow(self, project):
    """Create a completed workflow by loading the data file and appending
    ten Python cells that run the PY_ADD_ONE script.

    Blocks until the head of the default branch is no longer active and
    then asserts that every module succeeded and lists DATASET_NAME in its
    write provenance.

    Parameters
    ----------
    project:
        Project handle whose default branch receives the modules.

    Returns
    -------
    Identifier of the project's default branch.
    """
    branch_id = project.viztrail.default_branch.identifier
    fh = project.filestore.upload_file(CSV_FILE)
    cmd = load_dataset(
        dataset_name=DATASET_NAME,
        file={pckg.FILE_ID: fh.identifier}
    )
    self.engine.append_workflow_module(
        project_id=project.identifier,
        branch_id=branch_id,
        command=cmd
    )
    for _ in range(10):
        cmd = python_cell(PY_ADD_ONE)
        self.engine.append_workflow_module(
            project_id=project.identifier,
            branch_id=branch_id,
            command=cmd
        )
    while project.viztrail.default_branch.head.is_active:
        time.sleep(0.1)
    for module in project.viztrail.default_branch.head.modules:
        # Print the outputs of a failed module before the assertion fires
        # so the cause shows up in the test log.
        if not module.is_success:
            print(module.outputs)
        self.assertTrue(module.is_success)
        self.assertIn(DATASET_NAME, module.provenance.write)
    return branch_id
def test_simple_script(self):
    """Test running the simple python script.

    The cell prints the result of ``2+2`` and the test expects a successful
    run whose first standard-output entry is ``'4'``.
    """
    # Use the parenthesised print form: it produces the same output under
    # Python 2 and is the only valid form under Python 3.
    cmd = python_cell(source='print(2+2)', validate=True)
    result = PyCellTaskProcessor().compute(
        command_id=cmd.command_id,
        arguments=cmd.arguments,
        context=TaskContext(
            datastore=self.datastore,
            filestore=self.filestore,
            datasets=dict()
        )
    )
    self.assertTrue(result.is_success)
    self.assertEqual(result.outputs.stdout[0].value, '4')
def test_python_cell(self):
    """Check validation of python cell commands and their argument lists.

    A string source must be accepted and a list source rejected with a
    ValueError. The serialized argument list of a valid command must yield a
    valid ModuleCommand, while a list with the first element removed or with
    an unknown argument appended must be rejected with a ValueError.
    """
    def build_command(arguments):
        # Construct a python-code module command from a serialized
        # argument list.
        return ModuleCommand(
            package_id=pycell.PACKAGE_PYTHON,
            command_id=pycell.PYTHON_CODE,
            arguments=arguments,
            packages=PACKAGES
        )

    python_cell(source='ABC', validate=True)
    # A source value of the wrong data type is expected to raise an error.
    with self.assertRaises(ValueError):
        python_cell(source=[], validate=True)

    # Round trip: the serialized arguments form a valid command.
    serialized = python_cell(source='ABC', validate=True).arguments.to_list()
    build_command(serialized)

    # Dropping the only mandatory element makes validation fail.
    del serialized[0]
    with self.assertRaises(ValueError):
        build_command(serialized)

    # An additional, unknown argument makes validation fail as well.
    serialized = python_cell(source='ABC', validate=True).arguments.to_list()
    serialized.append(ARG(id='someUnknownLabel', value=''))
    with self.assertRaises(ValueError):
        build_command(serialized)
def test_unknown_dataset_script(self):
    """Run scripts that access a dataset which is not part of the context.

    The plain script is expected to fail with empty provenance and a single
    error output. The variant that catches the error is expected to succeed
    and to record the access to 'employees' in its read provenance with a
    None value.
    """
    file_handle = self.filestore.upload_file(CSV_FILE)
    people = self.datastore.load_dataset(file_handle)

    def run_cell(source):
        # Execute the given source with only the 'people' artifact known.
        cell = python_cell(source=source, validate=True)
        return PyCellTaskProcessor().compute(
            command_id=cell.command_id,
            arguments=cell.arguments,
            context=TaskContext(
                datastore=self.datastore,
                filestore=self.filestore,
                project_id=6,
                artifacts={'people': people}
            )
        )

    failed = run_cell(PRINT_UNKNOWN_DATASET_PY)
    self.assertFalse(failed.is_success)
    self.assertTrue(failed.provenance.read == {})
    self.assertTrue(failed.provenance.write == {})
    self.assertEqual(len(failed.outputs.stdout), 0)
    self.assertEqual(len(failed.outputs.stderr), 1)

    # Running a similar script that catches the error should be a success
    # and the access to the dataset should be recorded in the resulting
    # read provenance.
    caught = run_cell(PRINT_UNKNOWN_DATASET_PY_WITH_TRY_CATCH)
    self.assertTrue(caught.is_success)
    self.assertIsNotNone(caught.provenance.read)
    self.assertIsNotNone(caught.provenance.write)
    self.assertEqual(len(caught.provenance.read), 1)
    self.assertEqual(len(caught.provenance.write), 0)
    self.assertTrue('employees' in caught.provenance.read)
    self.assertIsNone(caught.provenance.read['employees'])
    self.assertEqual(len(caught.outputs.stdout), 1)
    self.assertEqual(len(caught.outputs.stderr), 0)
def test_execute(self):
    """Verify the state of the initial workflow and appending to an unknown
    branch.
    """
    project = self.engine.projects.create_project()
    self.create_workflow(project)
    head = project.viztrail.default_branch.head
    self.assertFalse(head.is_active)
    self.assert_module_count_is(project, 11)
    self.assert_value_is(project, 33)
    # Appending a module to a branch that does not exist is expected to
    # return None.
    outcome = self.engine.append_workflow_module(
        project_id=project.identifier,
        branch_id='null',
        command=python_cell('print 2+2')
    )
    self.assertIsNone(outcome)
def test_execute(self):
    """Run a single python cell asynchronously and check the reported result.

    The fake controller is expected to have received task '000' in state
    'SUCCESS' with '4' as the first standard-output value.
    """
    artifacts = dict()
    cell = pycell.python_cell(source='print(2+2)', validate=True)
    controller = FakeWorkflowController()
    task = TaskHandle(
        task_id='000',
        project_id=self.PROJECT_ID,
        controller=controller
    )
    self.backend.execute_async(task=task, command=cell, artifacts=artifacts)
    # Give the backend a fixed amount of time to finish the task.
    time.sleep(3)
    self.assertEqual(controller.task_id, '000')
    self.assertEqual(controller.state, 'SUCCESS')
    self.assertEqual(controller.outputs.stdout[0].value, '4')
def test_completed_append(self):
    """Append completed workflows to a branch and then a delete workflow.

    Ten workflows are appended, each extending the previous head by one
    successful module. A final workflow drops the last module with a delete
    action. The resulting head is checked both on the live branch and after
    reloading the viztrail from disk.
    """
    def assert_delete_head(workflow):
        # The head holds nine modules and describes the delete action.
        self.assertEqual(len(workflow.modules), 9)
        self.assertEqual(workflow.descriptor.identifier, '0000000A')
        self.assertEqual(workflow.descriptor.action, ACTION_DELETE)
        self.assertEqual(workflow.descriptor.package_id, PACKAGE_PYTHON)
        self.assertEqual(workflow.descriptor.command_id, PYTHON_CODE)

    base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
    os.makedirs(base_path)
    vt = OSViztrailHandle.create_viztrail(
        identifier='ABC',
        properties=None,
        base_path=base_path
    )
    branch = vt.get_default_branch()
    for i in range(10):
        ts = get_current_time()
        source = 'print ' + str(i) + '+' + str(i)
        command = python_cell(source=source)
        module = OSModuleHandle.create_module(
            command=command,
            external_form=source,
            state=MODULE_SUCCESS,
            datasets=dict(),
            outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
            provenance=ModuleProvenance(),
            timestamp=ModuleTimestamp(
                created_at=ts,
                started_at=ts,
                finished_at=ts
            ),
            module_folder=vt.modules_folder,
            object_store=vt.object_store
        )
        # The first workflow consists of the new module only; later ones
        # extend the module list of the current head.
        if branch.head is None:
            modules = [module]
        else:
            modules = branch.head.modules + [module]
        branch.append_workflow(
            modules=modules,
            action=ACTION_INSERT,
            command=command
        )

    # Remove the last module from the head with a delete action.
    head_modules = branch.get_head().modules
    wf = branch.append_workflow(
        modules=head_modules[:-1],
        action=ACTION_DELETE,
        command=head_modules[-1].command
    )
    assert_delete_head(wf)

    # Reload from disk and make sure history and head were persisted.
    vt = OSViztrailHandle.load_viztrail(base_path)
    branch = vt.get_default_branch()
    history = branch.get_history()
    self.assertEqual(len(history), 11)
    assert_delete_head(branch.get_head())
def test_running(self):
    """Move a module from pending to running and verify the stored state.

    After the transition the module is expected to have a start timestamp,
    no datasets and no outputs, while its provenance components remain set.
    The same is checked after reloading the module from the object store,
    and again for a call that passes an explicit start time and external
    form.
    """
    def assert_running(handle):
        # State expected right after a module was set to running.
        self.assertTrue(handle.is_running)
        self.assertIsNotNone(handle.timestamp.started_at)
        self.assertEqual(len(handle.datasets), 0)
        self.assertEqual(len(handle.outputs.stderr), 0)
        self.assertEqual(len(handle.outputs.stdout), 0)
        self.assertIsNotNone(handle.provenance.read)
        self.assertIsNotNone(handle.provenance.write)
        self.assertIsNotNone(handle.provenance.resources)

    def assert_explicit_values(handle):
        # Start time and external form match the values that were passed.
        self.assertEqual(
            handle.timestamp.started_at,
            handle.timestamp.created_at
        )
        self.assertEqual(handle.external_form, 'Some form')

    # Create the original module in pending state.
    provenance = ModuleProvenance(
        read={'DS1': 'ID1'},
        write={'DS1': DatasetDescriptor(identifier='ID2')},
        resources={'fileid': '0123456789'}
    )
    module = OSModuleHandle.create_module(
        command=python_cell(source='print 2+2'),
        external_form='TEST MODULE',
        state=MODULE_PENDING,
        module_folder=MODULE_DIR,
        timestamp=ModuleTimestamp(),
        datasets={'DS1': DS1},
        outputs=ModuleOutputs(stdout=[TextOutput('ABC')]),
        provenance=provenance
    )
    self.assertTrue(module.is_pending)

    module.set_running(external_form='TEST MODULE')
    assert_running(module)
    # Read the module from the object store and ensure that all changes
    # have been materialized properly.
    module = OSModuleHandle.load_module(
        identifier=module.identifier,
        module_path=module.module_path
    )
    assert_running(module)

    # Set running with all optional parameters.
    module.set_running(
        started_at=module.timestamp.created_at,
        external_form='Some form'
    )
    assert_explicit_values(module)
    module = OSModuleHandle.load_module(
        identifier=module.identifier,
        module_path=module.module_path
    )
    assert_explicit_values(module)
def test_cancel(self) -> None:
    """Cancel a task while it is running and check the controller state.

    The cell sleeps for five seconds. It is canceled after one second and
    the test then waits another five seconds before asserting that the
    controller's task_id and state are both None.
    """
    artifacts: Dict[str, ArtifactDescriptor] = dict()
    cell = pycell.python_cell(
        source='import time\ntime.sleep(5)',
        validate=True
    )
    controller = FakeWorkflowController()
    task = TaskHandle(
        task_id='000',
        project_id=self.PROJECT_ID,
        controller=controller
    )
    self.backend.execute_async(task=task, command=cell, artifacts=artifacts)
    # Let the task start before requesting cancellation.
    time.sleep(1)
    self.backend.cancel_task('000')
    # Wait longer than the remaining run time of the cell.
    time.sleep(5)
    self.assertIsNone(controller.task_id)
    self.assertIsNone(controller.state)
def test_create_dataset_script(self):
    """Run a script that creates a dataset and inspect the task result.

    The run is expected to succeed, read nothing, write exactly the 'people'
    dataset, and print the two names on a single standard-output entry.
    """
    cell = python_cell(source=CREATE_DATASET_PY, validate=True)
    context = TaskContext(
        datastore=self.datastore,
        filestore=self.filestore
    )
    result = PyCellTaskProcessor().compute(
        command_id=cell.command_id,
        arguments=cell.arguments,
        context=context
    )
    self.assertTrue(result.is_success)

    provenance = result.provenance
    self.assertIsNotNone(provenance.read)
    self.assertIsNotNone(provenance.write)
    self.assertEqual(len(provenance.read), 0)
    self.assertEqual(len(provenance.write), 1)
    self.assertTrue('people' in provenance.write)
    self.assertIsNotNone(provenance.write['people'])

    outputs = result.outputs
    self.assertEqual(len(outputs.stdout), 1)
    self.assertEqual(len(outputs.stderr), 0)
    self.assertEqual(outputs.stdout[0].value, 'Alice\nBob')
def test_multi_append(self):
    """Append ten workflows to a branch and verify the persisted history.

    Each appended workflow extends the previous head by one successful
    module. After reloading the viztrail from disk, the i-th workflow in the
    history is expected to contain i + 1 modules with the external form and
    output they were created with.
    """
    base_path = os.path.join(os.path.abspath(REPO_DIR), 'ABC')
    os.makedirs(base_path)
    vt = OSViztrailHandle.create_viztrail(
        identifier='ABC',
        properties=None,
        base_path=base_path
    )
    branch = vt.get_default_branch()
    # Append ten modules, one workflow version per module.
    for i in range(10):
        ts = get_current_time()
        source = 'print ' + str(i) + '+' + str(i)
        command = python_cell(source=source)
        module = OSModuleHandle.create_module(
            command=command,
            external_form=source,
            state=MODULE_SUCCESS,
            datasets=dict(),
            outputs=ModuleOutputs(stdout=[TextOutput(str(i + i))]),
            provenance=ModuleProvenance(),
            timestamp=ModuleTimestamp(
                created_at=ts,
                started_at=ts,
                finished_at=ts
            ),
            module_folder=vt.modules_folder,
            object_store=vt.object_store
        )
        if branch.head is None:
            modules = [module]
        else:
            modules = branch.head.modules + [module]
        branch.append_workflow(
            modules=modules,
            action=ACTION_INSERT,
            command=command
        )
        self.assertEqual(len(branch.get_history()), (i + 1))

    # Reload from disk and check every workflow version in the history.
    vt = OSViztrailHandle.load_viztrail(base_path)
    branch = vt.get_default_branch()
    history = branch.get_history()
    self.assertEqual(len(history), 10)
    for i in range(10):
        wf = branch.get_workflow(history[i].identifier)
        self.assertEqual(len(wf.modules), (i + 1))
        # The length check above guarantees exactly i + 1 modules here.
        for m, module in enumerate(wf.modules):
            expected_form = 'print ' + str(m) + '+' + str(m)
            self.assertEqual(module.external_form, expected_form)
            self.assertEqual(module.outputs.stdout[-1].value, str(m + m))
def create_workflow(self, project):
    """Build an eleven-module workflow in the project's default branch.

    Uploads the CSV file, appends a load-dataset module followed by ten
    Python cells running the PY_ADD_ONE script, and blocks until the branch
    head is no longer active. Returns the identifier of the default branch.
    """
    branch_id = project.viztrail.default_branch.identifier
    uploaded = project.filestore.upload_file(CSV_FILE)
    load_cmd = load_dataset(
        dataset_name=DATASET_NAME,
        file={pckg.FILE_ID: uploaded.identifier}
    )
    self.engine.append_workflow_module(
        project_id=project.identifier,
        branch_id=branch_id,
        command=load_cmd
    )
    for _ in range(10):
        self.engine.append_workflow_module(
            project_id=project.identifier,
            branch_id=branch_id,
            command=python_cell(PY_ADD_ONE)
        )
    # Poll until all modules have finished executing.
    while project.viztrail.default_branch.head.is_active:
        time.sleep(0.1)
    return branch_id