def test_altered_workflow(self):
    """
    Tests whether data is regenerated if the pipeline workflows are
    altered
    """
    study_name = 'add_node'
    # Test vanilla study
    study = self.create_study(
        TestProvStudy,
        study_name,
        inputs=STUDY_INPUTS)
    self.assertEqual(
        study.data('derived_field2').value(*self.SESSION), 156.0)
    # Rerun with the altered study class (extra node added)
    study = self.create_study(
        TestProvStudyAddNode,
        study_name,
        processor=SingleProc(self.work_dir, reprocess=True),
        inputs=STUDY_INPUTS)
    self.assertEqual(
        study.data('derived_field2').value(*self.SESSION), 1252.0)
    study_name = 'add_connect'
    # Test vanilla study
    study = self.create_study(
        TestProvStudy,
        study_name,
        inputs=STUDY_INPUTS)
    self.assertEqual(
        study.data('derived_field2').value(*self.SESSION), 156.0)
    # Rerun with the altered study class (extra connection added)
    study = self.create_study(
        TestProvStudyAddConnect,
        study_name,
        processor=SingleProc(self.work_dir, reprocess=True),
        inputs=STUDY_INPUTS)
    self.assertEqual(
        study.data('derived_field2').value(*self.SESSION), 170.0)
def test_process_dialation(self):
    """
    Tests that reprocessing is dilated out to the other sessions that
    higher-frequency derivatives depend on when parameters are altered
    """
    study_name = 'process_dialation'
    new_value = -101
    study = self.create_study(
        TestDialationStudy,
        study_name,
        inputs=self.STUDY_INPUTS)
    study.data('derived_field5')

    def values_equal(field_name, values):
        for subj_i in range(self.NUM_SUBJECTS):
            for vis_i in range(self.NUM_VISITS):
                sess = study.tree.session(subj_i, vis_i)
                field = sess.field(field_name, from_study=study_name)
                self.assertEqual(field.value,
                                 values[(str(subj_i), str(vis_i))])

    # Test generated values
    values_equal('derived_field5', self.DEFAULT_FIELD5_VALUES)
    # Tag the field 1 value so we can detect if it gets regenerated
    orig_field1_values = {}
    orig_field3_values = {}
    for vis_i in range(self.NUM_VISITS):
        for subj_i in range(self.NUM_SUBJECTS):
            sess = study.tree.session(subj_i, vis_i)
            field1 = sess.field('derived_field1', from_study=study_name)
            orig_field1_values[(str(subj_i), str(vis_i))] = field1.value
            change_value_w_prov(field1, new_value)
            field3 = study.tree.visit(vis_i).field(
                'derived_field3', from_study=study_name)
            orig_field3_values[str(vis_i)] = field3.value
    # Rerun analysis with new parameters
    study = self.create_study(
        TestDialationStudy,
        study_name,
        inputs=self.STUDY_INPUTS,
        processor=SingleProc(self.work_dir, reprocess=True),
        parameters={'pipeline3_op': 'mul'})
    study.data('derived_field3', subject_id='0', visit_id='0')
    values_equal('derived_field1',
                 {k: new_value for k in orig_field1_values})
    self.assertEqual(
        study.tree.visit('0').field('derived_field3',
                                    from_study=study_name).value,
        10201)
    self.assertEqual(
        study.tree.visit('1').field('derived_field3',
                                    from_study=study_name).value,
        orig_field3_values['1'])
    study = self.create_study(
        TestDialationStudy,
        study_name,
        inputs=self.STUDY_INPUTS,
        processor=SingleProc(self.work_dir, reprocess=True),
        parameters={'increment': 2})
    study.data('derived_field5', subject_id='0', visit_id='0')
    values_equal('derived_field1',
                 {k: v + 1 for k, v in orig_field1_values.items()})
def test_protect_manually(self):
    """Protect manually altered files and fields from overwrite"""
    analysis_name = 'manual_protect'
    protected_derived_field4_value = -99.0
    protected_derived_fileset1_value = -999.0
    # Test vanilla analysis
    analysis = self.create_analysis(
        TestProvAnalysis,
        analysis_name,
        inputs=STUDY_INPUTS)
    derived_fileset1_slice, derived_field4_slice = analysis.data(
        ('derived_fileset1', 'derived_field4'), derive=True)
    self.assertContentsEqual(derived_fileset1_slice, 154.0)
    self.assertEqual(derived_field4_slice.value(*self.SESSION), 155.0)
    # Rerun with new parameters
    analysis = self.create_analysis(
        TestProvAnalysis,
        analysis_name,
        inputs=STUDY_INPUTS,
        processor=SingleProc(self.work_dir, reprocess=True),
        parameters={'multiplier': 100.0})
    derived_fileset1_slice, derived_field4_slice = analysis.data(
        ('derived_fileset1', 'derived_field4'), derive=True)
    self.assertContentsEqual(derived_fileset1_slice, 1414.0)
    derived_field4 = derived_field4_slice.item(*self.SESSION)
    self.assertEqual(derived_field4.value, 1415.0)
    # Manually changing the value (or file contents) of a derivative
    # (without also altering the saved provenance record) means that the
    # new value/file will be "protected" from reprocessing, and will
    # need to be deleted in order to be regenerated
    derived_field4.value = protected_derived_field4_value
    # Since derived_fileset1 needs to be reprocessed but derived_field4
    # (an output of the same pipeline) is now protected, deriving both
    # should raise a conflict error
    analysis = self.create_analysis(
        TestProvAnalysis,
        analysis_name,
        processor=SingleProc(self.work_dir, reprocess=True),
        inputs=STUDY_INPUTS,
        parameters={'multiplier': 1000.0})
    # Check that a protected-output conflict error is raised if only one
    # of derived field4/fileset1 is protected
    self.assertRaises(
        ArcanaProtectedOutputConflictError,
        analysis.derive,
        ('derived_fileset1', 'derived_field4'))
    with open(derived_fileset1_slice.path(*self.SESSION), 'w') as f:
        f.write(str(protected_derived_fileset1_value))
    analysis.clear_caches()
    # Protect the output of derived_fileset1 as well and it should
    # return the protected values
    derived_fileset1_slice, derived_field4_slice = analysis.data(
        ('derived_fileset1', 'derived_field4'), derive=True)
    self.assertContentsEqual(derived_fileset1_slice,
                             protected_derived_fileset1_value)
    self.assertEqual(derived_field4_slice.value(*self.SESSION),
                     protected_derived_field4_value)
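# --- Editor's illustrative sketch (not part of the original tests) ------
# The protection rule exercised above reduces to a checksum comparison:
# a derivative whose current content no longer matches the checksums in
# its saved provenance record is treated as manually modified and must
# not be silently overwritten. '_looks_protected', 'item.checksums' and
# 'record.outputs' are assumed names for illustration, not the actual
# Arcana API.
def _looks_protected(item, record):
    # A mismatch between stored provenance and current content implies a
    # manual edit that reprocessing must not overwrite
    return record.outputs[item.name] != item.checksums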
def test_fields_roundtrip(self):
    repository = XnatRepo(server=SERVER, cache_dir=self.cache_dir)
    dataset = repository.dataset(self.project)
    analysis = DummyAnalysis(
        self.STUDY_NAME,
        dataset=dataset,
        processor=SingleProc('a_dir'),
        inputs=[FilesetFilter('source1', 'source1', text_format)])
    fields = ['field{}'.format(i) for i in range(1, 4)]
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    sink = pe.Node(
        RepositorySink(
            (analysis.bound_spec(f).slice for f in fields),
            dummy_pipeline),
        name='fields_sink')
    sink.inputs.field1_field = field1 = 1
    sink.inputs.field2_field = field2 = 2.0
    sink.inputs.field3_field = field3 = '3'
    sink.inputs.subject_id = self.SUBJECT
    sink.inputs.visit_id = self.VISIT
    sink.inputs.desc = "Test sink of fields"
    sink.inputs.name = 'test_sink'
    sink.run()
    source = pe.Node(
        RepositorySource(
            analysis.bound_spec(f).slice for f in fields),
        name='fields_source')
    source.inputs.visit_id = self.VISIT
    source.inputs.subject_id = self.SUBJECT
    source.inputs.desc = "Test source of fields"
    source.inputs.name = 'test_source'
    results = source.run()
    self.assertEqual(results.outputs.field1_field, field1)
    self.assertEqual(results.outputs.field2_field, field2)
    self.assertEqual(results.outputs.field3_field, field3)
def test_fields_roundtrip(self):
    STUDY_NAME = 'fields_roundtrip'
    study = DummyStudy(
        STUDY_NAME,
        self.repository,
        processor=SingleProc('a_dir'),
        inputs=[])
    dummy_pipeline = study.dummy_pipeline()
    dummy_pipeline.cap()
    sink = pe.Node(
        RepositorySink(
            (study.bound_spec(f).collection
             for f in ['field1', 'field2', 'field3']),
            dummy_pipeline),
        name='fields_sink')
    sink.inputs.field1_field = field1 = 1
    sink.inputs.field2_field = field2 = 2.0
    sink.inputs.field3_field = field3 = '3'
    sink.inputs.subject_id = self.SUBJECT
    sink.inputs.visit_id = self.VISIT
    sink.inputs.desc = "Test sink of fields"
    sink.inputs.name = 'test_sink'
    sink.run()
    source = pe.Node(
        RepositorySource(
            study.bound_spec(f).collection
            for f in ['field1', 'field2', 'field3']),
        name='fields_source')
    source.inputs.visit_id = self.VISIT
    source.inputs.subject_id = self.SUBJECT
    source.inputs.desc = "Test source of fields"
    source.inputs.name = 'test_source'
    results = source.run()
    self.assertEqual(results.outputs.field1_field, field1)
    self.assertEqual(results.outputs.field2_field, field2)
    self.assertEqual(results.outputs.field3_field, field3)
def test_input_change(self):
    analysis_name = 'input_change_analysis'
    analysis = self.create_analysis(
        TestProvAnalysis,
        analysis_name,
        inputs=STUDY_INPUTS)
    self.assertEqual(
        analysis.data('derived_field2',
                      derive=True).value(*self.SESSION),
        156.0)
    # Change acquired file contents, which should cause the checksum
    # check to fail
    with open(analysis.data('acquired_fileset1',
                            derive=True).path(*self.SESSION), 'w') as f:
        f.write('99.9')
    # Should detect that the input has changed and throw an error
    self.assertRaises(
        ArcanaReprocessException,
        analysis.derive,
        'derived_field2')
    new_analysis = self.create_analysis(
        TestProvAnalysis,
        analysis_name,
        processor=SingleProc(self.work_dir, reprocess=True),
        inputs=STUDY_INPUTS)
    self.assertEqual(
        new_analysis.data('derived_field2',
                          derive=True).value(*self.SESSION),
        1145.0)
def test_bids_fmri(self):
    analysis = BoldAnalysis(
        'test_fmri',
        repository=BidsRepo(op.join(self.BIDS_EXAMPLES_PATH, 'ds114')),
        processor=SingleProc(self.work_dir),
        environment=TEST_ENV,
        bids_task='covertverbgeneration')
    analysis.pipeline('single_subject_melodic_pipeline')
def test_bids_dwi(self):
    analysis = DwiAnalysis(
        'test_dwi',
        repository=BidsRepo(op.join(self.BIDS_EXAMPLES_PATH, 'ds114')),
        processor=SingleProc(self.work_dir),
        environment=TEST_ENV,
        parameters={'preproc_pe_dir': 'RL'})
    analysis.pipeline('global_tracking_pipeline')
def test_bids_fmri(self):
    study = BoldStudy(
        'test_fmri',
        repository=self.repo,
        processor=SingleProc(
            self.work_dir,
            prov_ignore=SingleProc.DEFAULT_PROV_IGNORE + [
                'workflow/nodes/.*/requirements/.*/version']),
        bids_task='covertverbgeneration')
    study.data('melodic_ica')
def test_bids_dwi(self):
    study = DwiStudy(
        'test_dwi',
        repository=self.repo,
        processor=SingleProc(
            self.work_dir,
            prov_ignore=SingleProc.DEFAULT_PROV_IGNORE + [
                'workflow/nodes/.*/requirements/.*/version'],
            reprocess=True),
        parameters={'preproc_pe_dir': 'RL'})
    study.data('tensor')
def test_repository_roundtrip(self):
    analysis = DummyAnalysis(
        self.STUDY_NAME,
        self.dataset,
        processor=SingleProc('a_dir'),
        inputs=[FilesetFilter('source1', 'source1', text_format),
                FilesetFilter('source2', 'source2', text_format),
                FilesetFilter('source3', 'source3', text_format),
                FilesetFilter('source4', 'source4', text_format)])
    # TODO: Should test out other file formats as well.
    source_files = ('source1', 'source2', 'source3', 'source4')
    sink_files = ('sink1', 'sink3', 'sink4')
    inputnode = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                        'inputnode')
    inputnode.inputs.subject_id = self.SUBJECT
    inputnode.inputs.visit_id = self.VISIT
    source = pe.Node(
        RepositorySource(
            analysis.bound_spec(f).slice for f in source_files),
        name='source')
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    sink = pe.Node(
        RepositorySink(
            (analysis.bound_spec(f).slice for f in sink_files),
            dummy_pipeline),
        name='sink')
    sink.inputs.name = 'repository_sink'
    sink.inputs.desc = (
        "A test session created by repository roundtrip unittest")
    # Create workflow connecting them together
    workflow = pe.Workflow('source_sink_unit_test',
                           base_dir=self.work_dir)
    workflow.add_nodes((source, sink))
    workflow.connect(inputnode, 'subject_id', source, 'subject_id')
    workflow.connect(inputnode, 'visit_id', source, 'visit_id')
    workflow.connect(inputnode, 'subject_id', sink, 'subject_id')
    workflow.connect(inputnode, 'visit_id', sink, 'visit_id')
    for source_name in source_files:
        if not source_name.endswith('2'):
            sink_name = source_name.replace('source', 'sink')
            workflow.connect(source, source_name + PATH_SUFFIX,
                             sink, sink_name + PATH_SUFFIX)
    workflow.run()
    # Check local directory was created properly
    outputs = [
        f for f in sorted(os.listdir(
            self.get_session_dir(from_analysis=self.STUDY_NAME)))
        if f not in (LocalFileSystemRepo.FIELDS_FNAME,
                     LocalFileSystemRepo.PROV_DIR)]
    self.assertEqual(outputs, ['sink1.txt', 'sink3.txt', 'sink4.txt'])
def test_dicom_match(self):
    analysis = test_data.TestMatchAnalysis(
        name='test_dicom',
        dataset=XnatRepo(
            server=SERVER,
            cache_dir=tempfile.mkdtemp()).dataset(self.project),
        processor=SingleProc(self.work_dir),
        inputs=test_data.TestDicomTagMatch.DICOM_MATCH)
    phase = list(analysis.data('gre_phase', derive=True))[0]
    mag = list(analysis.data('gre_mag', derive=True))[0]
    self.assertEqual(phase.name, 'gre_field_mapping_3mm_phase')
    self.assertEqual(mag.name, 'gre_field_mapping_3mm_mag')
def test_dicom_match(self):
    study = test_data.TestMatchStudy(
        name='test_dicom',
        repository=XnatRepo(
            project_id=self.project,
            server=SERVER,
            cache_dir=tempfile.mkdtemp()),
        processor=SingleProc(self.work_dir),
        inputs=test_data.TestDicomTagMatch.DICOM_MATCH)
    phase = list(study.data('gre_phase'))[0]
    mag = list(study.data('gre_mag'))[0]
    self.assertEqual(phase.name, 'gre_field_mapping_3mm_phase')
    self.assertEqual(mag.name, 'gre_field_mapping_3mm_mag')
def test_altered_workflow(self):
    """
    Tests whether data is regenerated if the pipeline workflows are
    altered
    """
    analysis_name = 'add_node'
    # Test vanilla analysis
    analysis = self.create_analysis(
        TestProvAnalysis,
        analysis_name,
        inputs=STUDY_INPUTS)
    self.assertEqual(
        analysis.data('derived_field2',
                      derive=True).value(*self.SESSION),
        156.0)
    # Rerun with the altered analysis class (extra node added)
    analysis = self.create_analysis(
        TestProvAnalysisAddNode,
        analysis_name,
        processor=SingleProc(self.work_dir, reprocess=True),
        inputs=STUDY_INPUTS)
    self.assertEqual(
        analysis.data('derived_field2',
                      derive=True).value(*self.SESSION),
        1252.0)
    analysis_name = 'add_connect'
    # Test vanilla analysis
    analysis = self.create_analysis(
        TestProvAnalysis,
        analysis_name,
        inputs=STUDY_INPUTS)
    self.assertEqual(
        analysis.data('derived_field2',
                      derive=True).value(*self.SESSION),
        156.0)
    # Rerun with the altered analysis class (extra connection added)
    analysis = self.create_analysis(
        TestProvAnalysisAddConnect,
        analysis_name,
        processor=SingleProc(self.work_dir, reprocess=True),
        inputs=STUDY_INPUTS)
    self.assertEqual(
        analysis.data('derived_field2',
                      derive=True).value(*self.SESSION),
        170.0)
def test_id_match(self):
    study = test_data.TestMatchStudy(
        name='test_dicom',
        repository=XnatRepo(
            project_id=self.project,
            server=SERVER,
            cache_dir=tempfile.mkdtemp()),
        processor=SingleProc(self.work_dir),
        inputs=[
            InputFilesets('gre_phase', valid_formats=dicom_format, id=7),
            InputFilesets('gre_mag', valid_formats=dicom_format, id=6)])
    phase = list(study.data('gre_phase'))[0]
    mag = list(study.data('gre_mag'))[0]
    self.assertEqual(phase.name, 'gre_field_mapping_3mm_phase')
    self.assertEqual(mag.name, 'gre_field_mapping_3mm_mag')
def test_dialation_protection(self):
    """
    Tests that derivatives are not re-derived unless they need to be
    """
    analysis_name = 'dialation_protection'
    analysis = self.create_analysis(
        TestDialationAnalysis,
        analysis_name,
        inputs=self.STUDY_INPUTS)
    field5 = analysis.data('derived_field5', derive=True)
    for item in field5:
        self.assertEqual(
            item.value,
            self.DEFAULT_FIELD5_VALUES[(item.subject_id,
                                        item.visit_id)])
    field1 = analysis.data('derived_field1', derive=True)
    field2 = analysis.data('derived_field2', derive=True)
    field1.item(subject_id='0', visit_id='1').value = 1000000
    field1.item(subject_id='1', visit_id='1').value = 2000000
    # Manually change value of field 2
    field2.item(subject_id='0').value = -1000
    analysis = self.create_analysis(
        TestDialationAnalysis,
        analysis_name,
        processor=SingleProc(self.work_dir, reprocess=True),
        inputs=self.STUDY_INPUTS,
        parameters={'increment': 2})
    analysis.dataset.clear_cache()
    # Recalculate value of field5 with new field2 value
    field1, field2, field3, field4, field5 = analysis.data(
        ['derived_field1', 'derived_field2', 'derived_field3',
         'derived_field4', 'derived_field5'], derive=True)
    self.assertEqual(field1.value(subject_id='0', visit_id='0'), 2)
    self.assertEqual(field1.value(subject_id='0', visit_id='1'), 1000000)
    self.assertEqual(field1.value(subject_id='1', visit_id='0'), 12)
    self.assertEqual(field1.value(subject_id='1', visit_id='1'), 2000000)
    self.assertEqual(field2.value(subject_id='0'), -1000)
    self.assertEqual(field2.value(subject_id='1'), 2000012)
    self.assertEqual(field3.value(visit_id='0'), 14)
    self.assertEqual(field3.value(visit_id='1'), 3000000)
    self.assertEqual(field4.value(), 3000014)
    self.assertEqual(field5.value(subject_id='0', visit_id='0'),
                     -1000 + 14 + 3000014)
    self.assertEqual(field5.value(subject_id='0', visit_id='1'),
                     -1000 + 3000000 + 3000014)
    self.assertEqual(field5.value(subject_id='1', visit_id='0'),
                     2000012 + 14 + 3000014)
    self.assertEqual(field5.value(subject_id='1', visit_id='1'),
                     2000012 + 3000000 + 3000014)
def test_id_match(self):
    analysis = test_data.TestMatchAnalysis(
        name='test_dicom',
        dataset=XnatRepo(
            server=SERVER,
            cache_dir=tempfile.mkdtemp()).dataset(self.project),
        processor=SingleProc(self.work_dir),
        inputs=[
            FilesetFilter('gre_phase', valid_formats=dicom_format, id=7),
            FilesetFilter('gre_mag', valid_formats=dicom_format, id=6)])
    phase = list(analysis.data('gre_phase', derive=True))[0]
    mag = list(analysis.data('gre_mag', derive=True))[0]
    self.assertEqual(phase.name, 'gre_field_mapping_3mm_phase')
    self.assertEqual(mag.name, 'gre_field_mapping_3mm_mag')
def test_dialation_protection(self):
    study_name = 'dialation_protection'
    study = self.create_study(
        TestDialationStudy,
        study_name,
        inputs=self.STUDY_INPUTS)
    field5 = study.data('derived_field5')
    for item in field5:
        self.assertEqual(
            item.value,
            self.DEFAULT_FIELD5_VALUES[(item.subject_id,
                                        item.visit_id)])
    field1 = study.data('derived_field1')
    field2 = study.data('derived_field2')
    field1.item(subject_id='0', visit_id='1').value = 1000000
    field1.item(subject_id='1', visit_id='1').value = 2000000
    # Manually change value of field 2
    field2.item(subject_id='0').value = -1000
    study = self.create_study(
        TestDialationStudy,
        study_name,
        processor=SingleProc(self.work_dir, reprocess=True),
        inputs=self.STUDY_INPUTS,
        parameters={'increment': 2})
    # Recalculate value of field5 with new field2 value
    field1, field2, field3, field4, field5 = study.data(
        ['derived_field1', 'derived_field2', 'derived_field3',
         'derived_field4', 'derived_field5'])
    self.assertEqual(field1.value(subject_id='0', visit_id='0'), 2)
    self.assertEqual(field1.value(subject_id='0', visit_id='1'), 1000000)
    self.assertEqual(field1.value(subject_id='1', visit_id='0'), 12)
    self.assertEqual(field1.value(subject_id='1', visit_id='1'), 2000000)
    self.assertEqual(field2.value(subject_id='0'), -1000)
    self.assertEqual(field2.value(subject_id='1'), 2000012)
    self.assertEqual(field3.value(visit_id='0'), 14)
    self.assertEqual(field3.value(visit_id='1'), 3000000)
    self.assertEqual(field4.value(), 3000014)
    self.assertEqual(field5.value(subject_id='0', visit_id='0'),
                     -1000 + 14 + 3000014)
    self.assertEqual(field5.value(subject_id='0', visit_id='1'),
                     -1000 + 3000000 + 3000014)
    self.assertEqual(field5.value(subject_id='1', visit_id='0'),
                     2000012 + 14 + 3000014)
    self.assertEqual(field5.value(subject_id='1', visit_id='1'),
                     2000012 + 3000000 + 3000014)
def test_summary(self):
    study = DummyStudy(
        self.SUMMARY_STUDY_NAME,
        self.repository,
        SingleProc('ad'),
        inputs=[InputFilesets('source1', 'source1', text_format),
                InputFilesets('source2', 'source2', text_format),
                InputFilesets('source3', 'source3', text_format)])
    # TODO: Should test out other file formats as well.
    source_files = ['source1', 'source2', 'source3']
    inputnode = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                        'inputnode')
    inputnode.inputs.subject_id = self.SUBJECT
    inputnode.inputs.visit_id = self.VISIT
    source = pe.Node(
        RepositorySource(
            study.bound_spec(f).collection for f in source_files),
        name='source')
    # Test subject sink
    subject_sink_files = ['subject_sink']
    dummy_pipeline = study.dummy_pipeline()
    dummy_pipeline.cap()
    subject_sink = pe.Node(
        RepositorySink(
            (study.bound_spec(f).collection
             for f in subject_sink_files),
            dummy_pipeline),
        name='subject_sink')
    subject_sink.inputs.name = 'subject_summary'
    subject_sink.inputs.desc = (
        "Tests the sinking of subject-wide filesets")
    # Test visit sink
    visit_sink_files = ['visit_sink']
    visit_sink = pe.Node(
        RepositorySink(
            (study.bound_spec(f).collection for f in visit_sink_files),
            dummy_pipeline),
        name='visit_sink')
    visit_sink.inputs.name = 'visit_summary'
    visit_sink.inputs.desc = (
        "Tests the sinking of visit-wide filesets")
    # Test project sink
    study_sink_files = ['study_sink']
    study_sink = pe.Node(
        RepositorySink(
            (study.bound_spec(f).collection for f in study_sink_files),
            dummy_pipeline),
        name='study_sink')
    study_sink.inputs.name = 'project_summary'
    study_sink.inputs.desc = (
        "Tests the sinking of project-wide filesets")
    # Create workflow connecting them together
    workflow = pe.Workflow('summary_unittest', base_dir=self.work_dir)
    workflow.add_nodes((source, subject_sink, visit_sink, study_sink))
    workflow.connect(inputnode, 'subject_id', source, 'subject_id')
    workflow.connect(inputnode, 'visit_id', source, 'visit_id')
    workflow.connect(inputnode, 'subject_id', subject_sink, 'subject_id')
    workflow.connect(inputnode, 'visit_id', visit_sink, 'visit_id')
    workflow.connect(source, 'source1' + PATH_SUFFIX,
                     subject_sink, 'subject_sink' + PATH_SUFFIX)
    workflow.connect(source, 'source2' + PATH_SUFFIX,
                     visit_sink, 'visit_sink' + PATH_SUFFIX)
    workflow.connect(source, 'source3' + PATH_SUFFIX,
                     study_sink, 'study_sink' + PATH_SUFFIX)
    workflow.run()
    # Check local summary directories were created properly
    subject_dir = self.get_session_dir(
        frequency='per_subject',
        from_study=self.SUMMARY_STUDY_NAME)
    self.assertEqual(sorted(os.listdir(subject_dir)),
                     [BasicRepo.PROV_DIR, 'subject_sink.txt'])
    visit_dir = self.get_session_dir(
        frequency='per_visit',
        from_study=self.SUMMARY_STUDY_NAME)
    self.assertEqual(sorted(os.listdir(visit_dir)),
                     [BasicRepo.PROV_DIR, 'visit_sink.txt'])
    project_dir = self.get_session_dir(
        frequency='per_study',
        from_study=self.SUMMARY_STUDY_NAME)
    self.assertEqual(sorted(os.listdir(project_dir)),
                     [BasicRepo.PROV_DIR, 'study_sink.txt'])
    # Reload the data from the summary directories
    reloadinputnode = pe.Node(
        IdentityInterface(['subject_id', 'visit_id']),
        name='reload_inputnode')
    reloadinputnode.inputs.subject_id = self.SUBJECT
    reloadinputnode.inputs.visit_id = self.VISIT
    reloadsource_per_subject = pe.Node(
        RepositorySource(
            study.bound_spec(f).collection
            for f in subject_sink_files),
        name='reload_source_per_subject')
    reloadsource_per_visit = pe.Node(
        RepositorySource(
            study.bound_spec(f).collection for f in visit_sink_files),
        name='reload_source_per_visit')
    reloadsource_per_study = pe.Node(
        RepositorySource(
            study.bound_spec(f).collection for f in study_sink_files),
        name='reload_source_per_study')
    reloadsink = pe.Node(
        RepositorySink(
            (study.bound_spec(f).collection
             for f in ['resink1', 'resink2', 'resink3']),
            dummy_pipeline),
        name='reload_sink')
    reloadsink.inputs.name = 'reload_summary'
    reloadsink.inputs.desc = (
        "Tests the reloading of subject and project summary filesets")
    reloadworkflow = pe.Workflow('reload_summary_unittest',
                                 base_dir=self.work_dir)
    for node in (reloadsource_per_subject, reloadsource_per_visit,
                 reloadsource_per_study, reloadsink):
        for iterator in ('subject_id', 'visit_id'):
            reloadworkflow.connect(reloadinputnode, iterator,
                                   node, iterator)
    reloadworkflow.connect(reloadsource_per_subject,
                           'subject_sink' + PATH_SUFFIX,
                           reloadsink,
                           'resink1' + PATH_SUFFIX)
    reloadworkflow.connect(reloadsource_per_visit,
                           'visit_sink' + PATH_SUFFIX,
                           reloadsink,
                           'resink2' + PATH_SUFFIX)
    reloadworkflow.connect(reloadsource_per_study,
                           'study_sink' + PATH_SUFFIX,
                           reloadsink,
                           'resink3' + PATH_SUFFIX)
    reloadworkflow.run()
    outputs = [
        f for f in sorted(os.listdir(
            self.get_session_dir(from_study=self.SUMMARY_STUDY_NAME)))
        if f not in (BasicRepo.FIELDS_FNAME, BasicRepo.PROV_DIR)]
    self.assertEqual(outputs,
                     ['resink1.txt', 'resink2.txt', 'resink3.txt'])
def __init__(self, name, repository, processor, inputs,
             environment=None, parameters=None, subject_ids=None,
             visit_ids=None, enforce_inputs=True, fill_tree=False,
             clear_caches=True):
    try:
        # This works for PY3 as the metaclass inserts it itself if
        # it isn't provided
        metaclass = type(self).__dict__['__metaclass__']
        if not issubclass(metaclass, StudyMetaClass):
            raise KeyError
    except KeyError:
        raise ArcanaUsageError(
            "Need to have StudyMetaClass (or a sub-class) as "
            "the metaclass of all classes derived from Study")
    if isinstance(repository, basestring):
        repository = BasicRepo(repository, depth=None)
    if isinstance(processor, basestring):
        processor = SingleProc(processor)
    if environment is None:
        environment = StaticEnv()
    self._name = name
    self._repository = repository
    self._processor = processor.bind(self)
    self._environment = environment
    self._inputs = {}
    self._subject_ids = (tuple(subject_ids)
                         if subject_ids is not None else None)
    self._visit_ids = (tuple(visit_ids)
                       if visit_ids is not None else None)
    self._fill_tree = fill_tree
    # Initialise caches for data collection and pipeline objects
    if clear_caches:
        self.clear_caches()
    # Set parameters
    if parameters is None:
        parameters = {}
    elif not isinstance(parameters, dict):
        # Convert list of parameters into dictionary
        parameters = {o.name: o for o in parameters}
    self._parameters = {}
    for param_name, param in list(parameters.items()):
        if not isinstance(param, Parameter):
            param = Parameter(param_name, param)
        try:
            param_spec = self._param_specs[param_name]
        except KeyError:
            raise ArcanaNameError(
                param_name,
                "Provided parameter '{}' is not present in the "
                "allowable parameters for {} classes ('{}')".format(
                    param_name, type(self).__name__,
                    "', '".join(self.param_spec_names())))
        param_spec.check_valid(
            param,
            context=' {}(name={})'.format(type(self).__name__, name))
        self._parameters[param_name] = param
    # Convert inputs to a dictionary if passed in as a list/tuple
    if not isinstance(inputs, dict):
        inputs = {i.name: i for i in inputs}
    else:
        # Convert string patterns into Input objects
        for inpt_name, inpt in list(inputs.items()):
            if isinstance(inpt, basestring):
                spec = self.data_spec(inpt_name)
                if spec.is_fileset:
                    inpt = InputFilesets(inpt_name, pattern=inpt,
                                         is_regex=True)
                else:
                    inpt = InputFields(inpt_name, pattern=inpt,
                                       dtype=spec.dtype, is_regex=True)
                inputs[inpt_name] = inpt
    # Check validity of study inputs
    for inpt_name, inpt in inputs.items():
        try:
            spec = self.data_spec(inpt_name)
        except ArcanaNameError:
            raise ArcanaNameError(
                inpt.name,
                "Input name '{}' isn't in data specs of {} "
                "('{}')".format(
                    inpt.name, self.__class__.__name__,
                    "', '".join(self._data_specs)))
        else:
            if spec.is_fileset:
                if inpt.is_field:
                    raise ArcanaUsageError(
                        "Passed field ({}) as input to fileset spec "
                        "{}".format(inpt, spec))
            elif not inpt.is_field:
                raise ArcanaUsageError(
                    "Passed fileset ({}) as input to field spec "
                    "{}".format(inpt, spec))
    # "Bind" input selectors to the current study object, and attempt
    # to match with data in the repository
    input_errors = []
    with self.repository:
        if not self.subject_ids:
            raise ArcanaUsageError(
                "No subject IDs provided and destination repository "
                "is empty")
        if not self.visit_ids:
            raise ArcanaUsageError(
                "No visit IDs provided and destination repository "
                "is empty")
        for inpt_name, inpt in list(inputs.items()):
            try:
                try:
                    self._inputs[inpt_name] = bound_inpt = inpt.bind(
                        self, spec_name=inpt_name)
                except ArcanaInputMissingMatchError as e:
                    if not inpt.drop_if_missing:
                        raise e
                else:
                    spec = self.data_spec(inpt_name)
                    if spec.is_fileset:
                        if spec.derived:
                            try:
                                spec.format.converter_from(
                                    bound_inpt.format)
                            except ArcanaNoConverterError as e:
                                e.msg += (
                                    ", which is required to convert:\n"
                                    "{} to\n{}.").format(
                                        bound_inpt, spec)
                                raise e
                        else:
                            if bound_inpt.format not in spec.valid_formats:
                                raise ArcanaUsageError(
                                    "Cannot pass {} as an input to {} "
                                    "as it is not in one of the valid "
                                    "formats ('{}')".format(
                                        bound_inpt, spec,
                                        "', '".join(
                                            f.name
                                            for f in spec.valid_formats)))
            except ArcanaInputError as e:
                # Collate errors across all inputs into a single error
                # message
                input_errors.append(e)
    if input_errors:
        raise ArcanaInputError(
            '\n'.join(str(e) for e in input_errors))
    # Check remaining specs are optional or have default values
    for spec in self.data_specs():
        if spec.name not in self.input_names:
            if not spec.derived and spec.default is None:
                # Warn (or raise) if an acquired spec has not been
                # supplied an input
                msg = (" input fileset '{}' was not provided to "
                       "{}.".format(spec.name, self))
                if spec.optional:
                    logger.info('Optional' + msg)
                else:
                    if enforce_inputs:
                        raise ArcanaMissingInputError(
                            'Non-optional' + msg + " Pipelines "
                            "depending on this fileset will not run")
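# --- Editor's illustrative sketch (not part of the original source) -----
# How a Study subclass built with StudyMetaClass might be constructed,
# mirroring the DummyStudy usage in the tests in this section. The
# repository and working-directory paths are placeholders; note that
# __init__ above also accepts plain path strings for 'repository' and
# 'processor' and wraps them in BasicRepo/SingleProc itself.
def _example_study_construction(repo_path, work_dir):
    return DummyStudy(
        'example_study',       # name
        BasicRepo(repo_path),  # repository (a plain str would also work)
        processor=SingleProc(work_dir),
        inputs=[InputFilesets('source1', 'source1', text_format)])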
def test_repository_roundtrip(self):
    # Create the XNAT repository and the analysis to source/sink from it
    repository = XnatRepo(server=SERVER, cache_dir=self.cache_dir)
    dataset = repository.dataset(self.project)
    analysis = DummyAnalysis(
        self.STUDY_NAME,
        dataset=dataset,
        processor=SingleProc('a_dir'),
        inputs=[FilesetFilter('source1', 'source1', text_format),
                FilesetFilter('source2', 'source2', text_format),
                FilesetFilter('source3', 'source3', text_format),
                FilesetFilter('source4', 'source4', text_format)])
    # TODO: Should test out other file formats as well.
    source_files = ['source1', 'source2', 'source3', 'source4']
    sink_files = ['sink1', 'sink3', 'sink4']
    inputnode = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                        'inputnode')
    inputnode.inputs.subject_id = str(self.SUBJECT)
    inputnode.inputs.visit_id = str(self.VISIT)
    source = pe.Node(
        RepositorySource(
            analysis.bound_spec(f).slice for f in source_files),
        name='source')
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    sink = pe.Node(
        RepositorySink(
            (analysis.bound_spec(f).slice for f in sink_files),
            dummy_pipeline),
        name='sink')
    sink.inputs.name = 'repository-roundtrip-unittest'
    sink.inputs.desc = (
        "A test session created by repository roundtrip unittest")
    # Create workflow connecting them together
    workflow = pe.Workflow('source-sink-unit-test',
                           base_dir=self.work_dir)
    workflow.add_nodes((source, sink))
    workflow.connect(inputnode, 'subject_id', source, 'subject_id')
    workflow.connect(inputnode, 'visit_id', source, 'visit_id')
    workflow.connect(inputnode, 'subject_id', sink, 'subject_id')
    workflow.connect(inputnode, 'visit_id', sink, 'visit_id')
    for source_name in source_files:
        if source_name != 'source2':
            sink_name = source_name.replace('source', 'sink')
            workflow.connect(source, source_name + PATH_SUFFIX,
                             sink, sink_name + PATH_SUFFIX)
    workflow.run()
    # Check cache was created properly
    self.assertEqual(
        filter_scans(os.listdir(self.session_cache())),
        ['source1-source1', 'source2-source2',
         'source3-source3', 'source4-source4'])
    expected_sink_filesets = ['sink1', 'sink3', 'sink4']
    self.assertEqual(
        filter_scans(os.listdir(
            self.session_cache(from_analysis=self.STUDY_NAME))),
        [(e + '-' + e) for e in expected_sink_filesets])
    with self._connect() as login:
        fileset_names = filter_scans(
            login.experiments[self.session_label(
                from_analysis=self.STUDY_NAME)].scans.keys())
        self.assertEqual(fileset_names, expected_sink_filesets)
def test_checksums(self):
    """
    Tests the check of downloaded checksums to determine whether a file
    needs to be redownloaded
    """
    cache_dir = op.join(self.work_dir, 'cache-checksum-check')
    DATASET_NAME = 'source1'
    STUDY_NAME = 'checksum_check_analysis'
    fileset_fname = DATASET_NAME + text_format.extension
    source_target_path = op.join(self.session_cache(cache_dir),
                                 DATASET_NAME + '-' + DATASET_NAME)
    md5_path = source_target_path + XnatRepo.MD5_SUFFIX
    source_target_fpath = op.join(source_target_path, fileset_fname)
    shutil.rmtree(cache_dir, ignore_errors=True)
    os.makedirs(cache_dir)
    source_repository = XnatRepo(server=SERVER, cache_dir=cache_dir)
    source_dataset = source_repository.dataset(self.project)
    sink_repository = XnatRepo(server=SERVER, cache_dir=cache_dir)
    sink_dataset = sink_repository.dataset(
        self.checksum_sink_project,
        subject_ids=['SUBJECT'],
        visit_ids=['VISIT'],
        fill_tree=True)
    analysis = DummyAnalysis(
        STUDY_NAME,
        dataset=sink_dataset,
        processor=SingleProc('ad'),
        inputs=[FilesetFilter(DATASET_NAME, DATASET_NAME, text_format,
                              dataset=source_dataset)])
    source = pe.Node(
        RepositorySource([analysis.bound_spec(DATASET_NAME).slice]),
        name='checksum_check_source')
    source.inputs.subject_id = self.SUBJECT
    source.inputs.visit_id = self.VISIT
    source.run()
    self.assertTrue(op.exists(md5_path))
    self.assertTrue(op.exists(source_target_fpath))
    with open(md5_path) as f:
        checksums = json.load(f)
    # Stash the downloaded file in a new location and create a dummy
    # file instead
    stash_path = source_target_path + '.stash'
    shutil.move(source_target_path, stash_path)
    os.mkdir(source_target_path)
    with open(source_target_fpath, 'w') as f:
        f.write('dummy')
    # Rerun the source, which shouldn't redownload as the cached
    # checksums still match those on the server
    source.run()
    with open(source_target_fpath) as f:
        d = f.read()
    self.assertEqual(d, 'dummy')
    # Replace the checksum with a dummy
    os.remove(md5_path)
    checksums['.'] = 'dummy_checksum'
    with open(md5_path, 'w', **JSON_ENCODING) as f:
        json.dump(checksums, f, indent=2)
    # Retry the download, which should now redownload since the
    # checksums differ
    source.run()
    with open(source_target_fpath) as f:
        d = f.read()
    with open(op.join(stash_path, fileset_fname)) as f:
        e = f.read()
    self.assertEqual(d, e)
    # Resink the source file and check that the generated MD5 checksum
    # is stored in identical format
    DATASET_NAME = 'sink1'
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    sink = pe.Node(
        RepositorySink([analysis.bound_spec(DATASET_NAME).slice],
                       dummy_pipeline),
        name='checksum_check_sink')
    sink.inputs.name = 'checksum_check_sink'
    sink.inputs.desc = "Tests the generation of MD5 checksums"
    sink.inputs.subject_id = self.SUBJECT
    sink.inputs.visit_id = self.VISIT
    sink.inputs.sink1_path = source_target_fpath
    sink_target_path = op.join(
        self.session_cache(cache_dir,
                           project=self.checksum_sink_project,
                           subject=self.SUBJECT,
                           from_analysis=STUDY_NAME),
        DATASET_NAME + '-' + DATASET_NAME)
    sink_md5_path = sink_target_path + XnatRepo.MD5_SUFFIX
    sink.run()
    with open(md5_path) as f:
        source_checksums = json.load(f)
    with open(sink_md5_path) as f:
        sink_checksums = json.load(f)
    self.assertEqual(
        source_checksums, sink_checksums,
        "Source checksum ({}) did not equal sink checksum ({})".format(
            source_checksums, sink_checksums))
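# --- Editor's illustrative sketch (hypothetical helper, not the actual
# XnatRepo implementation) -----------------------------------------------
# The test above compares the JSON checksum manifests written next to
# each cached fileset (note that the real manifests key the primary file
# as '.', as the checksums['.'] assignment above shows; keying by
# filename here is a simplification). A standard-library-only sketch of
# producing such a manifest for a downloaded fileset directory:
def _md5_manifest(fileset_dir):
    import hashlib
    import json
    import os
    import os.path as op
    checksums = {}
    for fname in sorted(os.listdir(fileset_dir)):
        with open(op.join(fileset_dir, fname), 'rb') as f:
            # MD5 is used here for cache validation, not security
            checksums[fname] = hashlib.md5(f.read()).hexdigest()
    return json.dumps(checksums, indent=2)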
def test_delayed_download(self):
    """
    Tests handling of race conditions where separate processes attempt
    to cache the same fileset
    """
    cache_dir = op.join(self.work_dir, 'cache-delayed-download')
    DATASET_NAME = 'source1'
    target_path = op.join(self.session_cache(cache_dir), DATASET_NAME,
                          DATASET_NAME + text_format.extension)
    tmp_dir = target_path + '.download'
    shutil.rmtree(cache_dir, ignore_errors=True)
    os.makedirs(cache_dir)
    repository = XnatRepo(server=SERVER, cache_dir=cache_dir)
    dataset = repository.dataset(self.project)
    analysis = DummyAnalysis(
        self.STUDY_NAME,
        dataset,
        SingleProc('ad'),
        inputs=[FilesetFilter(DATASET_NAME, DATASET_NAME, text_format)])
    source = pe.Node(
        RepositorySource([analysis.bound_spec(DATASET_NAME).slice]),
        name='delayed_source')
    source.inputs.subject_id = self.SUBJECT
    source.inputs.visit_id = self.VISIT
    result1 = source.run()
    source1_path = result1.outputs.source1_path
    self.assertTrue(op.exists(source1_path))
    self.assertEqual(
        source1_path, target_path,
        "Output file path '{}' not equal to target path '{}'".format(
            source1_path, target_path))
    # Clear cache to start again
    shutil.rmtree(cache_dir, ignore_errors=True)
    # Create tmp_dir before running the interface. The source should
    # wait for 1 second, see that the session hasn't been created, then
    # clear the stale directory and redownload the fileset.
    os.makedirs(tmp_dir)
    source.inputs.race_cond_delay = 1
    result2 = source.run()
    source1_path = result2.outputs.source1_path
    # Clear cache to start again
    shutil.rmtree(cache_dir, ignore_errors=True)
    # This time, simulate an ongoing download in a separate process that
    # keeps modifying the temp dir; the source should keep waiting and
    # then use the completed download instead of redownloading.
    internal_dir = op.join(tmp_dir, 'internal')
    deleted_tmp_dir = tmp_dir + '.deleted'

    def simulate_download():
        "Simulates a download in a separate process"
        os.makedirs(internal_dir)
        time.sleep(5)
        # Modify a file in the temp dir to make the source download
        # keep waiting
        logger.info('Updating simulated download directory')
        with open(op.join(internal_dir, 'download'), 'a') as f:
            f.write('downloading')
        time.sleep(10)
        # Simulate the finalising of the download by writing the target
        # file into place and deleting the temp dir.
        logger.info('Finalising simulated download')
        with open(target_path, 'a') as f:
            f.write('simulated')
        shutil.move(tmp_dir, deleted_tmp_dir)

    source.inputs.race_cond_delay = 10
    p = Process(target=simulate_download)
    p.start()  # Start the simulated download in separate process
    time.sleep(1)
    source.run()  # Run the local download
    p.join()
    with open(op.join(deleted_tmp_dir, 'internal', 'download')) as f:
        d = f.read()
    self.assertEqual(d, 'downloading')
    with open(target_path) as f:
        d = f.read()
    self.assertEqual(d, 'simulated')
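# --- Editor's illustrative sketch (hypothetical helper, not the actual
# RepositorySource implementation) ---------------------------------------
# The race-condition handling exercised above: when another process's
# in-progress '<target>.download' directory exists, keep waiting while
# anything inside it is still being modified, and only clear it and take
# over once it has been idle for longer than 'race_cond_delay' seconds.
def _wait_for_stale_download(tmp_dir, race_cond_delay, poll_interval=1.0):
    import os
    import shutil
    import time
    while os.path.exists(tmp_dir):
        # Newest modification time of anything under the temp dir
        mtimes = [os.path.getmtime(os.path.join(root, fname))
                  for root, _, fnames in os.walk(tmp_dir)
                  for fname in fnames]
        newest = max(mtimes, default=os.path.getmtime(tmp_dir))
        if time.time() - newest > race_cond_delay:
            # Download appears abandoned: clear it and take over
            shutil.rmtree(tmp_dir, ignore_errors=True)
            break
        time.sleep(poll_interval)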
def processor(self):
    return SingleProc(self.work_dir)
def test_summary(self):
    # Create the XNAT repository and the analysis to source/sink from it
    repository = XnatRepo(server=SERVER, cache_dir=self.cache_dir)
    analysis = DummyAnalysis(
        self.SUMMARY_STUDY_NAME,
        repository.dataset(self.project),
        SingleProc('ad'),
        inputs=[FilesetFilter('source1', 'source1', text_format),
                FilesetFilter('source2', 'source2', text_format),
                FilesetFilter('source3', 'source3', text_format)])
    # TODO: Should test out other file formats as well.
    source_files = ['source1', 'source2', 'source3']
    inputnode = pe.Node(IdentityInterface(['subject_id', 'visit_id']),
                        'inputnode')
    inputnode.inputs.subject_id = self.SUBJECT
    inputnode.inputs.visit_id = self.VISIT
    source = pe.Node(
        RepositorySource(
            [analysis.bound_spec(f).slice for f in source_files]),
        name='source')
    # Test subject sink
    subject_sink_files = ['subject_sink']
    dummy_pipeline = analysis.dummy_pipeline()
    dummy_pipeline.cap()
    subject_sink = pe.Node(
        RepositorySink(
            [analysis.bound_spec(f).slice for f in subject_sink_files],
            dummy_pipeline),
        name='subject_sink')
    subject_sink.inputs.name = 'subject_summary'
    subject_sink.inputs.desc = (
        "Tests the sinking of subject-wide filesets")
    # Test visit sink
    visit_sink_files = ['visit_sink']
    visit_sink = pe.Node(
        RepositorySink(
            [analysis.bound_spec(f).slice for f in visit_sink_files],
            dummy_pipeline),
        name='visit_sink')
    visit_sink.inputs.name = 'visit_summary'
    visit_sink.inputs.desc = (
        "Tests the sinking of visit-wide filesets")
    # Test project sink
    analysis_sink_files = ['analysis_sink']
    analysis_sink = pe.Node(
        RepositorySink(
            [analysis.bound_spec(f).slice for f in analysis_sink_files],
            dummy_pipeline),
        name='analysis_sink')
    analysis_sink.inputs.name = 'project_summary'
    analysis_sink.inputs.desc = (
        "Tests the sinking of project-wide filesets")
    # Create workflow connecting them together
    workflow = pe.Workflow('summary_unittest', base_dir=self.work_dir)
    workflow.add_nodes((source, subject_sink, visit_sink,
                        analysis_sink))
    workflow.connect(inputnode, 'subject_id', source, 'subject_id')
    workflow.connect(inputnode, 'visit_id', source, 'visit_id')
    workflow.connect(inputnode, 'subject_id', subject_sink, 'subject_id')
    workflow.connect(inputnode, 'visit_id', visit_sink, 'visit_id')
    workflow.connect(source, 'source1' + PATH_SUFFIX,
                     subject_sink, 'subject_sink' + PATH_SUFFIX)
    workflow.connect(source, 'source2' + PATH_SUFFIX,
                     visit_sink, 'visit_sink' + PATH_SUFFIX)
    workflow.connect(source, 'source3' + PATH_SUFFIX,
                     analysis_sink, 'analysis_sink' + PATH_SUFFIX)
    workflow.run()
    analysis.clear_caches()  # Refresh the cached repository tree object
    with self._connect() as login:
        # Check subject summary directories were created properly in
        # cache
        expected_subj_filesets = ['subject_sink']
        subject_dir = self.session_cache(
            visit=XnatRepo.SUMMARY_NAME,
            from_analysis=self.SUMMARY_STUDY_NAME)
        self.assertEqual(
            filter_scans(os.listdir(subject_dir)),
            [(e + '-' + e) for e in expected_subj_filesets])
        # and on XNAT
        subject_fileset_names = filter_scans(
            login.projects[self.project].experiments[
                self.session_label(
                    visit=XnatRepo.SUMMARY_NAME,
                    from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
        self.assertEqual(expected_subj_filesets, subject_fileset_names)
        # Check visit summary directories were created properly in
        # cache
        expected_visit_filesets = ['visit_sink']
        visit_dir = self.session_cache(
            subject=XnatRepo.SUMMARY_NAME,
            from_analysis=self.SUMMARY_STUDY_NAME)
        self.assertEqual(
            filter_scans(os.listdir(visit_dir)),
            [(e + '-' + e) for e in expected_visit_filesets])
        # and on XNAT
        visit_fileset_names = filter_scans(
            login.projects[self.project].experiments[
                self.session_label(
                    subject=XnatRepo.SUMMARY_NAME,
                    from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
        self.assertEqual(expected_visit_filesets, visit_fileset_names)
        # Check project summary directories were created properly in
        # cache
        expected_proj_filesets = ['analysis_sink']
        project_dir = self.session_cache(
            subject=XnatRepo.SUMMARY_NAME,
            visit=XnatRepo.SUMMARY_NAME,
            from_analysis=self.SUMMARY_STUDY_NAME)
        self.assertEqual(
            filter_scans(os.listdir(project_dir)),
            [(e + '-' + e) for e in expected_proj_filesets])
        # and on XNAT
        project_fileset_names = filter_scans(
            login.projects[self.project].experiments[
                self.session_label(
                    subject=XnatRepo.SUMMARY_NAME,
                    visit=XnatRepo.SUMMARY_NAME,
                    from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
        self.assertEqual(expected_proj_filesets, project_fileset_names)
    # Reload the data from the summary directories
    reloadinputnode = pe.Node(
        IdentityInterface(['subject_id', 'visit_id']),
        'reload_inputnode')
    reloadinputnode.inputs.subject_id = self.SUBJECT
    reloadinputnode.inputs.visit_id = self.VISIT
    reloadsource_per_subject = pe.Node(
        RepositorySource(
            analysis.bound_spec(f).slice for f in subject_sink_files),
        name='reload_source_per_subject')
    reloadsource_per_visit = pe.Node(
        RepositorySource(
            analysis.bound_spec(f).slice for f in visit_sink_files),
        name='reload_source_per_visit')
    reloadsource_per_dataset = pe.Node(
        RepositorySource(
            analysis.bound_spec(f).slice for f in analysis_sink_files),
        name='reload_source_per_dataset')
    reloadsink = pe.Node(
        RepositorySink(
            (analysis.bound_spec(f).slice
             for f in ['resink1', 'resink2', 'resink3']),
            dummy_pipeline),
        name='reload_sink')
    reloadsink.inputs.name = 'reload_summary'
    reloadsink.inputs.desc = (
        "Tests the reloading of subject and project summary filesets")
    reloadworkflow = pe.Workflow('reload_summary_unittest',
                                 base_dir=self.work_dir)
    for node in (reloadsource_per_subject, reloadsource_per_visit,
                 reloadsource_per_dataset, reloadsink):
        for iterator in ('subject_id', 'visit_id'):
            reloadworkflow.connect(reloadinputnode, iterator,
                                   node, iterator)
    reloadworkflow.connect(reloadsource_per_subject,
                           'subject_sink' + PATH_SUFFIX,
                           reloadsink,
                           'resink1' + PATH_SUFFIX)
    reloadworkflow.connect(reloadsource_per_visit,
                           'visit_sink' + PATH_SUFFIX,
                           reloadsink,
                           'resink2' + PATH_SUFFIX)
    reloadworkflow.connect(reloadsource_per_dataset,
                           'analysis_sink' + PATH_SUFFIX,
                           reloadsink,
                           'resink3' + PATH_SUFFIX)
    reloadworkflow.run()
    # Check that the filesets were resunk into the analysis cache
    self.assertEqual(
        filter_scans(os.listdir(
            self.session_cache(from_analysis=self.SUMMARY_STUDY_NAME))),
        ['resink1-resink1', 'resink2-resink2', 'resink3-resink3'])
    # and on XNAT
    with self._connect() as login:
        resinked_fileset_names = filter_scans(
            login.projects[self.project].experiments[
                self.session_label(
                    from_analysis=self.SUMMARY_STUDY_NAME)].scans.keys())
        self.assertEqual(sorted(resinked_fileset_names),
                         ['resink1', 'resink2', 'resink3'])