def test_difference_when_diffs_i_vs_h(self):
    """
    Compares iRODS metadata against header metadata whose sample name differs.
    The expected difference contains only the iRODS sample name 'S1'.
    """
    irods_samples = {
        'name': {'S1'},
        'accession_number': {'EGA1'},
        'internal_id': {'1'},
    }
    irods_libraries = {'name': {'123'}, 'internal_id': {'123'}}
    irods_studies = {
        'name': {"Crohns disease"},
        'accession_number': {'EGAS4'},
        'internal_id': {'4'},
    }
    irods_metadata = IrodsSeqFileMetadata(
        '/seq/123.bam',
        samples=irods_samples,
        libraries=irods_libraries,
        studies=irods_studies,
    )

    header_samples = {
        'name': {'S100'},
        'accession_number': set(),
        'internal_id': set(),
    }
    header_metadata = SAMFileHeaderMetadata(
        '/seq/123.bam',
        samples=header_samples,
        libraries={'internal_id': {'123'}},
        studies={},
    )

    differences = irods_metadata.difference(header_metadata)
    self.assertDictEqual(differences, {'samples': {'name': {'S1'}}})
def test_difference_when_no_diffs_i_vs_h(self):
    """
    Compares iRODS metadata against header metadata holding the same sample data.
    The expected difference is an empty dict.
    """
    irods_metadata = IrodsSeqFileMetadata(
        '/seq/123.bam',
        samples={'name': {'S1'}, 'accession_number': set(), 'internal_id': set()},
        libraries={},
        studies={},
    )
    header_metadata = SAMFileHeaderMetadata(
        '/seq/123.bam',
        samples={'name': {'S1'}, 'accession_number': set(), 'internal_id': set()},
        libraries={},
        studies={},
    )

    differences = irods_metadata.difference(header_metadata)
    self.assertDictEqual(differences, {})
def test_validate_fields_2(self):
    """
    Validates the fields of metadata built from a file path only.
    Five check results are expected: the checksum comparison check has result None,
    every other check has result FAILURE.
    """
    irods_metadata = IrodsSeqFileMetadata(fpath='/seq/1234/1234_5#6.bam')
    check_results = irods_metadata.validate_fields()
    self.assertEqual(len(check_results), 5)
    for check in check_results:
        is_checksum_comparison = (
            check.check_name
            == CHECK_NAMES.check_by_comparison_checksum_in_meta_with_checksum_at_upload
        )
        expected = None if is_checksum_comparison else RESULT.FAILURE
        self.assertEqual(check.result, expected)
def test_validate_fields_when_wrong_npg_qc(self):
    """
    Validates the fields of metadata that has matching checksums, npg_qc 'aaAAA' and no target.
    Five check results are expected: the target and npg_qc checks have result FAILURE,
    every other check has result SUCCESS.
    """
    irods_metadata = IrodsSeqFileMetadata(
        fpath='/seq/1234/1234_5#6.bam',
        npg_qc='aaAAA',
        checksum_at_upload='123abc',
        checksum_in_meta='123abc',
    )
    check_results = irods_metadata.validate_fields()
    self.assertEqual(len(check_results), 5)
    for check in check_results:
        should_fail = check.check_name in (
            CHECK_NAMES.check_target_field,
            CHECK_NAMES.check_npg_qc_field,
        )
        expected = RESULT.FAILURE if should_fail else RESULT.SUCCESS
        self.assertEqual(check.result, expected)
def test_difference_when_diffs_i_vs_h(self):
    """
    iRODS metadata vs. header metadata with a different sample name:
    difference() is expected to return only the iRODS sample name 'S1'.
    """
    fpath = '/seq/123.bam'

    # iRODS side: fully populated samples, libraries and studies.
    irods_metadata = IrodsSeqFileMetadata(
        fpath,
        samples={
            'name': {'S1'},
            'accession_number': {'EGA1'},
            'internal_id': {'1'},
        },
        libraries={'name': {'123'}, 'internal_id': {'123'}},
        studies={
            'name': {"Crohns disease"},
            'accession_number': {'EGAS4'},
            'internal_id': {'4'},
        },
    )

    # Header side: another sample name, one library id, no studies.
    header_metadata = SAMFileHeaderMetadata(
        fpath,
        samples={
            'name': {'S100'},
            'accession_number': set(),
            'internal_id': set(),
        },
        libraries={'internal_id': {'123'}},
        studies={},
    )

    diff = irods_metadata.difference(header_metadata)
    self.assertDictEqual(diff, {'samples': {'name': {'S1'}}})
def fetch_and_preprocess_irods_metadata_by_metadata(
        search_criteria, irods_zone, issues_dict, reference):
    """
    Retrieves from iRODS the metadata of all the files matching the given criteria,
    converts each entry into an IrodsSeqFileMetadata and runs its metadata checks.
    The check results are appended to the issues_dict entry of the corresponding file path.
    If the retrieval raises, the exception is printed and the program exits with status 1.
    :param search_criteria: a dict formed of key = attr name, val = attr value. The operator is by default =.
    :param irods_zone: the irods zone where to search for the data matching the criteria given
    :param issues_dict: an existing dictionary of issues (key: fpath, value: list), which this function
        updates with the check results of each file found
    :param reference: passed unchanged to check_metadata() of every file found
    :return: a dict of key: fpath, value: the iRODS metadata for that path
    """
    metadata_by_path = {}
    try:
        raw_metadata_list = iRODSMetadataProvider.retrieve_raw_files_metadata_by_metadata(
            search_criteria, irods_zone)
    except Exception as e:
        print(e)
        sys.exit(1)
    else:
        for raw_file_metadata in raw_metadata_list:
            seq_metadata = IrodsSeqFileMetadata.from_raw_metadata(raw_file_metadata)
            found_issues = list(seq_metadata.check_metadata(reference))
            metadata_by_path[raw_file_metadata.fpath] = seq_metadata
            issues_dict[raw_file_metadata.fpath].extend(found_issues)
    return metadata_by_path
def check_metadata_given_as_json_stream(reference=None):
    """
    Checks the files whose iRODS metadata is given as json data on stdin.
    The whole of stdin is read and converted to baton objects; every object is turned into an
    IrodsSeqFileMetadata on which the metadata checks are run. Afterwards the header and the
    Seqscape metadata are fetched and the three sources are checked against each other.
    If the input yields no iRODS metadata, a message is printed and the program exits with status 1.
    :param reference: string that contains the name of the genome reference => one wants to check
        if the data has this reference as metadata
    :return: dict of key = string file path, value = list[CheckResult]
    """
    check_results_by_path = defaultdict(list)
    baton_objs = convert_json_to_baton_objs(sys.stdin.read())

    irods_metadata_dict = {}
    for baton_obj in baton_objs:
        file_metadata = IrodsSeqFileMetadata.from_baton_wrapper(baton_obj)
        file_check_results = file_metadata.check_metadata(reference)
        check_results_by_path[file_metadata.fpath].extend(file_check_results)
        irods_metadata_dict[file_metadata.fpath] = file_metadata

    if not irods_metadata_dict:
        print("No irods metadata found. No checks performed.")
        sys.exit(1)

    header_metadata_dict = MetadataSelfChecks.fetch_and_preprocess_header_metadata(
        irods_metadata_dict.keys(),
        check_results_by_path,
    )
    seqscape_metadata_dict = MetadataSelfChecks.fetch_and_preprocess_seqscape_metadata(
        irods_metadata_dict,
        check_results_by_path,
    )
    FileMetadataComparison.check_metadata_across_different_sources(
        irods_metadata_dict,
        header_metadata_dict,
        seqscape_metadata_dict,
        check_results_by_path,
    )
    return check_results_by_path
def fetch_and_preprocess_irods_metadata_by_path(irods_fpaths, issues_dict, reference):
    """
    Fetches the iRODS metadata of each of the given file paths, converts it into an
    IrodsSeqFileMetadata and runs its metadata checks. The check results are appended
    to the issues_dict entry of the corresponding path.
    If fetching the metadata of a path raises, the exception is printed and the program
    exits with status 1.
    :param irods_fpaths: iterable of iRODS file paths to fetch the metadata for
    :param issues_dict: dict of key: fpath, value: list, updated with the check results of each file
    :param reference: passed unchanged to check_metadata() of every file
    :return: a mapping of key: fpath, value: the iRODS metadata for that path
    """
    # NOTE(review): the values stored here are metadata objects, not lists, so a plain dict
    # looks like what was intended; the defaultdict(list) is kept so that the returned type
    # stays the same for the callers -- confirm before changing.
    metadata_by_path = defaultdict(list)
    for irods_fpath in irods_fpaths:
        try:
            raw_file_metadata = iRODSMetadataProvider.fetch_raw_file_metadata_by_path(irods_fpath)
        except Exception as e:
            print(e)
            sys.exit(1)
        else:
            seq_metadata = IrodsSeqFileMetadata.from_raw_metadata(raw_file_metadata)
            found_issues = list(seq_metadata.check_metadata(reference))
            metadata_by_path[irods_fpath] = seq_metadata
            issues_dict[irods_fpath].extend(found_issues)
    return metadata_by_path
def test_difference_when_not_the_right_type(self):
    """
    Calls difference() with a list instead of a metadata object and expects a TypeError.
    """
    irods_metadata = IrodsSeqFileMetadata(
        '/seq/123.bam',
        samples={'name': {'S1'}, 'accession_number': set(), 'internal_id': set()},
        libraries={},
        studies={},
    )
    with self.assertRaises(TypeError):
        irods_metadata.difference([1, 2, 3])
def test_difference_when_no_diffs_i_vs_h(self):
    """
    iRODS metadata vs. header metadata with identical sample data:
    difference() is expected to return an empty dict.
    """
    fpath = '/seq/123.bam'
    irods_samples = {
        'name': {'S1'},
        'accession_number': set(),
        'internal_id': set(),
    }
    header_samples = {
        'name': {'S1'},
        'accession_number': set(),
        'internal_id': set(),
    }
    irods_metadata = IrodsSeqFileMetadata(
        fpath, samples=irods_samples, libraries={}, studies={})
    header_metadata = SAMFileHeaderMetadata(
        fpath, samples=header_samples, libraries={}, studies={})

    diff = irods_metadata.difference(header_metadata)
    self.assertDictEqual(diff, {})
def test_from_raw_metadata_only_replicas(self):
    """
    Builds the seq metadata from raw metadata that carries only two replicas.
    Samples and libraries are expected to hold empty sets for every id type,
    and checksum_in_meta is expected to be an empty set.
    """
    first_replica = baton_models.DataObjectReplica(number=1, checksum="123abc")
    second_replica = baton_models.DataObjectReplica(number=2, checksum="abc")
    raw_metadata = IrodsRawFileMetadata(
        fpath='/seq/123.bam',
        file_replicas=[first_replica, second_replica],
    )

    seq_metadata = IrodsSeqFileMetadata.from_raw_metadata(raw_metadata)

    empty_ids = {'name': set(), 'accession_number': set(), 'internal_id': set()}
    self.assertEqual(seq_metadata.samples, empty_ids)
    self.assertEqual(seq_metadata.libraries, empty_ids)
    self.assertEqual(seq_metadata.checksum_in_meta, set())
def convert_data_object(data_object: DataObject) -> IrodsSeqFileMetadata:
    """
    Parses the given data object from iRODS into the representation used internally.

    The path, the checksum of the original replica, the references, one target value and the
    values stored under the library id property (or its legacy equivalent) are carried over.
    A target attribute that is present but holds no values yields a target of `None`.
    :param data_object: data object from iRODS, retrieved via baton wrapper
    :return: internal representation of iRODS metadata
    """
    path = data_object.path

    checksum_at_upload = None
    if data_object.replicas is not None:
        # Assuming that replica number `IRODS_ORIGINAL_REPLICA_NUMBER` is the first replica that is created
        original_replica = data_object.replicas.get_by_number(IRODS_ORIGINAL_REPLICA_NUMBER)
        if original_replica is not None:
            checksum_at_upload = original_replica.checksum

    metadata = data_object.metadata
    if metadata is None:
        # Nothing else to convert: only the path and the upload checksum are known
        return IrodsSeqFileMetadata(path, checksum_at_upload=checksum_at_upload)

    references = metadata.get(IRODS_METADATA_REFERENCE_PROPERTY)

    # Take the first value in iteration order, as before; an empty collection of values
    # now gives `None` instead of raising an IndexError.
    # NOTE(review): if several target values are present, which one is picked depends on
    # the iteration order of the collection -- confirm that a single value is expected.
    target_values = metadata.get(IRODS_METADATA_TARGET_PROPERTY, default={None})
    target = next(iter(target_values), None)

    # TODO: Add other conversions

    if IRODS_METADATA_LIBRARY_ID_PROPERTY in metadata:
        libraries = metadata[IRODS_METADATA_LIBRARY_ID_PROPERTY]
    elif IRODS_METADATA_LEGACY_LIBRARY_ID_PROPERTY in metadata:
        libraries = metadata[IRODS_METADATA_LEGACY_LIBRARY_ID_PROPERTY]
    else:
        libraries = None

    return IrodsSeqFileMetadata(
        path,
        references=references,
        libraries=libraries,
        checksum_at_upload=checksum_at_upload,
        target=target,
    )
def test_mdata_from_diff_srcs_when_different_id_types(self):
    """
    Checks the metadata across iRODS, header and Seqscape when all three share the sample name
    and only iRODS additionally holds an accession number.
    Four check results are expected for the file, all of them SUCCESS.
    """
    dict_key = '/seq/213.bam'
    irods_metadata = IrodsSeqFileMetadata(
        '/seq/123.bam',
        samples={'name': {'S1'}, 'accession_number': {'EGA1'}, 'internal_id': set()},
        libraries={},
        studies={},
    )
    header_metadata = SAMFileHeaderMetadata(
        '/seq/123.bam',
        samples={'name': {'S1'}},
        libraries={},
        studies={},
    )
    seqscape_metadata = SeqscapeMetadata(
        samples={'name': {'S1'}},
        libraries={},
        studies={},
    )
    issues_dict = defaultdict(list)

    FileMetadataComparison.check_metadata_across_different_sources(
        {dict_key: irods_metadata},
        {dict_key: header_metadata},
        {dict_key: seqscape_metadata},
        issues_dict,
    )

    file_check_results = issues_dict[dict_key]
    self.assertEqual(4, len(file_check_results))
    distinct_results = set(check.result for check in file_check_results)
    self.assertSetEqual(distinct_results, {RESULT.SUCCESS})
def check_metadata_given_as_json_stream(reference=None):
    """
    Runs the metadata checks for the files described by the json data read from stdin.
    Each baton object obtained from the input is converted to an IrodsSeqFileMetadata and checked;
    the header and Seqscape metadata are then fetched and compared with the iRODS metadata.
    When no iRODS metadata results from the input, a message is printed and the program exits
    with status 1.
    :param reference: string that contains the name of the genome reference => one wants to check
        if the data has this reference as metadata
    :return: dict of key = string file path, value = list[CheckResult]
    """
    results_by_path = defaultdict(list)
    stdin_json = sys.stdin.read()
    data_objects = convert_json_to_baton_objs(stdin_json)

    # Per-file self checks on the iRODS metadata
    irods_by_path = {}
    for data_object in data_objects:
        irods_meta = IrodsSeqFileMetadata.from_baton_wrapper(data_object)
        results_by_path[irods_meta.fpath].extend(irods_meta.check_metadata(reference))
        irods_by_path[irods_meta.fpath] = irods_meta

    if not irods_by_path:
        print("No irods metadata found. No checks performed.")
        sys.exit(1)

    # Metadata from the other two sources, then the cross-source comparison
    header_by_path = MetadataSelfChecks.fetch_and_preprocess_header_metadata(
        irods_by_path.keys(), results_by_path)
    seqscape_by_path = MetadataSelfChecks.fetch_and_preprocess_seqscape_metadata(
        irods_by_path, results_by_path)
    FileMetadataComparison.check_metadata_across_different_sources(
        irods_by_path, header_by_path, seqscape_by_path, results_by_path)
    return results_by_path
def test_is_npg_qc_valid_4(self):
    """Expects _is_npg_qc_valid to return a true value for the string "0"."""
    self.assertTrue(IrodsSeqFileMetadata._is_npg_qc_valid("0"))
def test_is_npg_qc_valid_6(self):
    """Expects _is_npg_qc_valid to return a false value for the string "mamba"."""
    self.assertFalse(IrodsSeqFileMetadata._is_npg_qc_valid("mamba"))
def test_extract_reference_name_from_ref_path3(self):
    """Expects 'Homo_sapiens.GRCh38_15' to be extracted from a path ending in 'Homo_sapiens.GRCh38_15.fa'."""
    reference_path = (
        '/lustre/scratch109/srpipe/references/Homo_sapiens/GRCh38_15/all/bwa0_6/Homo_sapiens.GRCh38_15.fa'
    )
    reference_name = IrodsSeqFileMetadata.extract_reference_name_from_ref_path(reference_path)
    self.assertEqual(reference_name, 'Homo_sapiens.GRCh38_15')
def test_extract_reference_name_from_ref_path2(self):
    """Expects 'human_g1k_v37' to be extracted from a path ending in 'human_g1k_v37.fasta'."""
    reference_path = (
        '/lustre/scratch110/srpipe/references/Homo_sapiens/1000Genomes/all/bwa/human_g1k_v37.fasta'
    )
    reference_name = IrodsSeqFileMetadata.extract_reference_name_from_ref_path(reference_path)
    self.assertEqual(reference_name, 'human_g1k_v37')
def test_check_reference_when_ok(self):
    """
    Checks the reference 'hs37d5' on metadata whose only reference is '/lustre/hs37d5.fa'
    and expects the check result to be SUCCESS.
    """
    irods_metadata = IrodsSeqFileMetadata(
        fpath='/seq/1234/1234_5#6.cram',
        references=['/lustre/hs37d5.fa'],
    )
    check_result = irods_metadata.check_reference('hs37d5')
    self.assertEqual(check_result.result, RESULT.SUCCESS)
def test_is_target_valid_when_empty(self):
    """Expects _is_target_valid to return a false value for the empty string."""
    self.assertFalse(IrodsSeqFileMetadata._is_target_valid(''))
def test_is_target_valid_when_invalid(self):
    """Expects _is_target_valid to return a false value for 'somethingelse'."""
    self.assertFalse(IrodsSeqFileMetadata._is_target_valid('somethingelse'))
def test_is_target_valid_when_valid_library(self):
    """Expects _is_target_valid to return a true value for 'library'."""
    self.assertTrue(IrodsSeqFileMetadata._is_target_valid('library'))
def test_check_checksum_in_meta_present(self):
    """
    Runs the checksum-in-metadata presence check on metadata built with checksum_in_meta='aaa'
    and expects the check result to be SUCCESS.
    """
    irods_metadata = IrodsSeqFileMetadata(
        fpath='/seq/1234/1234_5#6.bam',
        checksum_in_meta='aaa',
    )
    check_result = irods_metadata.check_checksum_in_meta_present()
    self.assertEqual(check_result.result, RESULT.SUCCESS)
def test_extract_reference_name_from_ref_path4(self):
    """
    Expects 'Homo_sapiens.GRCh37.dna.all' to be extracted from a path ending in
    'Homo_sapiens.GRCh37.dna.all.fa'.
    """
    reference_path = (
        '/lustre/scratch110/srpipe/references/Homo_sapiens/GRCh37_53/all/bwa/Homo_sapiens.GRCh37.dna.all.fa'
    )
    reference_name = IrodsSeqFileMetadata.extract_reference_name_from_ref_path(reference_path)
    self.assertEqual(reference_name, 'Homo_sapiens.GRCh37.dna.all')
def test_check_checksum_in_meta_present_when_absent(self):
    """
    Runs the checksum-in-metadata presence check on metadata built from a file path only
    and expects the check result to be FAILURE.
    """
    irods_metadata = IrodsSeqFileMetadata(fpath='/seq/1234/1234_5#6.bam')
    check_result = irods_metadata.check_checksum_in_meta_present()
    self.assertEqual(check_result.result, RESULT.FAILURE)
def test_check_reference_2(self):
    """
    Checks an empty reference name on metadata that has a reference.
    The check result is expected to have result None and executed False.
    """
    irods_metadata = IrodsSeqFileMetadata(
        fpath='/seq/1234/1234_5#6.cram',
        references=['/lustre/hs37d5.fa'],
    )
    check_result = irods_metadata.check_reference('')
    self.assertEqual(check_result.result, None)
    self.assertEqual(check_result.executed, False)
def test_is_npg_qc_valid_7(self):
    """Expects _is_npg_qc_valid to return a false value for the boolean True."""
    npg_qc_value = True
    is_valid = IrodsSeqFileMetadata._is_npg_qc_valid(npg_qc_value)
    self.assertFalse(is_valid)
def test_check_reference_3(self):
    """
    Checks an empty reference name on metadata built from a file path only.
    The check result is expected to have result None and executed False.
    """
    irods_metadata = IrodsSeqFileMetadata(fpath='/seq/1234/1234_5#6.cram')
    check_result = irods_metadata.check_reference('')
    self.assertEqual(check_result.result, None)
    self.assertEqual(check_result.executed, False)
def test_extract_reference_name_from_ref_path1(self):
    """Expects 'hs37d5' to be extracted from a path ending in 'hs37d5.fa'."""
    reference_path = (
        '/lustre/scratch109/srpipe/references/Homo_sapiens/1000Genomes_hs37d5/all/bwa/hs37d5.fa'
    )
    reference_name = IrodsSeqFileMetadata.extract_reference_name_from_ref_path(reference_path)
    self.assertEqual(reference_name, 'hs37d5')