def test_metadata_for_library_file(self):
    """
    Check a text file whose metadata mimics that of a library cram.

    The fixture carries the same metadata as a library cram, apart from a few
    fields that metacheck does not use anyway. Because it is a txt file, it
    has no header metadata.
    """
    irods_fpath = "/humgen/projects/serapis_staging/test-metacheck/test_metadata.txt"
    result = api.check_metadata_fetched_by_path(irods_fpaths=[irods_fpath])
    print("Comparisong checks: %s" % self.comparison_checks)

    # Checks that must have run and failed for this fixture.
    expected_failures = (
        CHECK_NAMES.check_ss_irods_group_read_permission,
        CHECK_NAMES.check_there_is_ss_irods_group,
        CHECK_NAMES.check_studies_in_irods_with_studies_in_seqscape_fetched_by_samples,
        CHECK_NAMES.check_samples_in_irods_same_as_samples_fetched_by_study_from_seqscape,
    )

    for _, outcomes in result.items():
        # Exactly the mandatory checks must be reported for the file.
        reported_names = {outcome.check_name for outcome in outcomes}
        self.assertSetEqual(reported_names, set(CHECK_NAMES.get_only_mandatory_check_names()))

        for outcome in outcomes:
            if outcome.check_name in self.comparison_checks:
                # Comparison checks are expected not to have been executed.
                self.assertFalse(outcome.executed)
            elif outcome.check_name in expected_failures:
                self.assertTrue(outcome.executed)
                self.assertEqual(outcome.result, RESULT.FAILURE)
def test_metadata_when_metadata_ok_with_wrong_reference(self):
    """
    Request reference 'hs37d5' for the ok-metadata fixture and expect the
    desired-reference check to fail. No other check is asserted on.
    """
    irods_fpath = "/humgen/projects/serapis_staging/test-metacheck/test_ok_metadata.out"
    result = api.check_metadata_fetched_by_path(irods_fpaths=[irods_fpath], reference='hs37d5')
    for _, outcomes in result.items():
        # Only the desired-reference outcome(s) matter for this test.
        reference_outcomes = [
            outcome for outcome in outcomes
            if outcome.check_name == CHECK_NAMES.check_desired_reference
        ]
        for outcome in reference_outcomes:
            self.assertEqual(outcome.result, RESULT.FAILURE)
def test_fetch_study_metadata_vs_stream_study_metadata(self, stdin):
    """
    Check that metadata streamed in as JSON produces the same check results
    as metadata fetched by iRODS path for the same file.

    :param stdin: object whose ``return_value`` is set to the JSON text;
        presumably a mock injected by a patch decorator that is not visible
        here (TODO confirm).
    """
    fpath = "/nfs/users/nfs_i/ic4/Projects/python3/meta-check/16006_5.json"
    # Read the fixture inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(fpath) as json_file:
        stdin.return_value = json_file.read()

    result_fetch_by_metadata = api.check_metadata_fetched_by_path(irods_fpaths=['/seq/16006/16006_5.cram'])
    result_stream_metadata = api.check_metadata_given_as_json_stream()

    # Both strategies must report on the same set of files...
    self.assertSetEqual(set(result_stream_metadata.keys()), set(result_fetch_by_metadata.keys()))
    print()
    # ...and produce the same set of check results for each of them.
    for fpath, results in result_fetch_by_metadata.items():
        self.assertSetEqual(set(results), set(result_stream_metadata[fpath]))
def test_metadata_when_metadata_ok_with_wrong_reference_and_one_replica(self):
    """
    With reference 'grch38' on the missing-md5 fixture, the desired-reference
    check must not have been executed, while the replica-count and
    ss-irods-group checks must fail. Other checks are not asserted on.
    """
    irods_fpath = "/humgen/projects/serapis_staging/test-metacheck/test_metadata_missing_md5.out"
    result = api.check_metadata_fetched_by_path(irods_fpaths=[irods_fpath], reference='grch38')

    expected_failures = (
        CHECK_NAMES.check_more_than_one_replica,
        CHECK_NAMES.check_ss_irods_group_read_permission,
        CHECK_NAMES.check_there_is_ss_irods_group,
    )

    for _, outcomes in result.items():
        for outcome in outcomes:
            name = outcome.check_name
            if name == CHECK_NAMES.check_desired_reference:
                self.assertEqual(outcome.executed, False)
            elif name in expected_failures:
                self.assertEqual(outcome.result, RESULT.FAILURE)
def test_fetch_study_metadata_vs_stream_study_metadata(self, stdin):
    """
    Compare the results of checking metadata given as a JSON stream with the
    results of checking metadata fetched by iRODS path for the same file.

    :param stdin: object whose ``return_value`` receives the JSON text;
        presumably a mock supplied by a patch decorator outside this view
        (TODO confirm).
    """
    fpath = "/nfs/users/nfs_i/ic4/Projects/python3/meta-check/16006_5.json"
    # Use a context manager: the original ``open(fpath).read()`` left the
    # file handle open until garbage collection.
    with open(fpath) as json_file:
        stdin.return_value = json_file.read()

    result_fetch_by_metadata = api.check_metadata_fetched_by_path(
        irods_fpaths=['/seq/16006/16006_5.cram'])
    result_stream_metadata = api.check_metadata_given_as_json_stream()

    # The same files must be reported by both strategies.
    self.assertSetEqual(set(result_stream_metadata.keys()),
                        set(result_fetch_by_metadata.keys()))
    print()
    # Per file, the two strategies must yield the same set of check results.
    for fpath, results in result_fetch_by_metadata.items():
        self.assertSetEqual(set(results), set(result_stream_metadata[fpath]))
def test_metadata_when_metadata_ok_with_wrong_reference_and_one_replica(self):
    """
    Check the missing-md5 fixture against reference 'grch38'.

    The desired-reference check is expected to be reported as not executed;
    the replica-count check and both ss-irods-group checks are expected to
    fail. Nothing is asserted about the remaining checks.
    """
    irods_fpath = "/humgen/projects/serapis_staging/test-metacheck/test_metadata_missing_md5.out"
    result = api.check_metadata_fetched_by_path(irods_fpaths=[irods_fpath], reference='grch38')

    must_fail = (
        CHECK_NAMES.check_more_than_one_replica,
        CHECK_NAMES.check_ss_irods_group_read_permission,
        CHECK_NAMES.check_there_is_ss_irods_group,
    )

    for _, file_checks in result.items():
        for check in file_checks:
            if check.check_name == CHECK_NAMES.check_desired_reference:
                self.assertEqual(check.executed, False)
            elif check.check_name in must_fail:
                self.assertEqual(check.result, RESULT.FAILURE)
def test_metadata_when_study_and_samples_dont_match(self):
    """
    Check a cram whose samples were given the wrong study: the study/sample
    consistency checks, the ss-irods-group checks and the attribute-count
    check must fail, and every other check must succeed.
    """
    irods_fpath = "/humgen/projects/serapis_staging/test-metacheck/test_samples_given_wrong_study.cram"
    result = api.check_metadata_fetched_by_path(irods_fpaths=[irods_fpath], reference='grch38')

    expected_failures = (
        CHECK_NAMES.check_studies_in_irods_with_studies_in_seqscape_fetched_by_samples,
        CHECK_NAMES.check_for_samples_in_more_studies,
        CHECK_NAMES.check_samples_in_irods_same_as_samples_fetched_by_study_from_seqscape,
        CHECK_NAMES.check_there_is_ss_irods_group,
        CHECK_NAMES.check_ss_irods_group_read_permission,
        CHECK_NAMES.check_attribute_count,
    )

    for _, outcomes in result.items():
        for outcome in outcomes:
            # Every check is either a listed failure or must have succeeded.
            if outcome.check_name in expected_failures:
                expected = RESULT.FAILURE
            else:
                expected = RESULT.SUCCESS
            self.assertEqual(outcome.result, expected)
def test_metadata_when_metadata_ok(self):
    """
    Check the ok-metadata fixture without a desired reference.

    Exactly the mandatory checks must be reported. Comparison checks must
    not have been executed, the ss-irods-group and samples-in-more-studies
    checks must fail, and all remaining checks must succeed.
    """
    irods_fpath = "/humgen/projects/serapis_staging/test-metacheck/test_ok_metadata.out"
    result = api.check_metadata_fetched_by_path(irods_fpaths=[irods_fpath])

    expected_failures = (
        CHECK_NAMES.check_there_is_ss_irods_group,
        CHECK_NAMES.check_ss_irods_group_read_permission,
        CHECK_NAMES.check_for_samples_in_more_studies,
    )

    for _, outcomes in result.items():
        reported_names = {outcome.check_name for outcome in outcomes}
        self.assertSetEqual(reported_names, set(CHECK_NAMES.get_only_mandatory_check_names()))

        for outcome in outcomes:
            if outcome.check_name in self.comparison_checks:
                self.assertFalse(outcome.executed)
                continue
            if outcome.check_name in expected_failures:
                expected = RESULT.FAILURE
            else:
                expected = RESULT.SUCCESS
            self.assertEqual(expected, outcome.result)
def test_metadata_when_header_doesnt_match_irods(self):
    """
    Check a cram with a wrong header: the header/iRODS/seqscape id
    comparison checks, the ss-irods-group checks, the samples-in-more-studies
    check and the attribute-count check must fail; every other check must
    succeed.
    """
    irods_fpath = "/humgen/projects/serapis_staging/test-metacheck/test_wrong_header.cram"
    result = api.check_metadata_fetched_by_path(irods_fpaths=[irods_fpath], reference='grch38')

    expected_failures = (
        CHECK_NAMES.check_there_is_ss_irods_group,
        CHECK_NAMES.check_ss_irods_group_read_permission,
        CHECK_NAMES.check_irods_ids_compared_to_header_ids,
        CHECK_NAMES.check_header_ids_compared_to_irods_ids,
        CHECK_NAMES.check_header_ids_compared_to_seqscape_ids,
        CHECK_NAMES.check_seqscape_ids_compared_to_header_ids,
        CHECK_NAMES.check_for_samples_in_more_studies,
        CHECK_NAMES.check_attribute_count,
    )

    for _, outcomes in result.items():
        for outcome in outcomes:
            if outcome.check_name in expected_failures:
                expected = RESULT.FAILURE
            else:
                expected = RESULT.SUCCESS
            self.assertEqual(outcome.result, expected)
def test_same_check_results_by_path_and_by_metadata(self):
    """
    Check that one file gets equal check results whether its metadata is
    fetched by metadata query or by iRODS path.
    """
    fpath = '/humgen/projects/serapis_staging/test-metacheck/test_metadata_comparison.cram'
    check_results_by_metadata = api.check_metadata_fetched_by_metadata(reference='GRCh38', study_name='GDAP_XTEN', irods_zone='humgen')
    check_results_by_path = api.check_metadata_fetched_by_path(irods_fpaths=[fpath], reference='GRCh38')
    results_from_meta = check_results_by_metadata[fpath]
    results_from_path = check_results_by_path[fpath]

    for meta_result in results_from_meta:
        # First by-path result with the same check name, or None if absent.
        path_result = next(
            (candidate for candidate in results_from_path
             if candidate.check_name == meta_result.check_name),
            None,
        )
        self.assertEqual(meta_result, path_result)

    # Neither strategy may report checks the other one lacks.
    self.assertEqual(len(results_from_path), len(results_from_meta))
def test_metadata_when_metadata_ok(self):
    """
    Run the checks on the ok-metadata fixture with no desired reference.

    The reported checks must be exactly the mandatory ones. Comparison
    checks must be marked as not executed; the two ss-irods-group checks
    and the samples-in-more-studies check must fail; anything else must
    succeed.
    """
    irods_fpath = "/humgen/projects/serapis_staging/test-metacheck/test_ok_metadata.out"
    result = api.check_metadata_fetched_by_path(irods_fpaths=[irods_fpath])

    must_fail = (
        CHECK_NAMES.check_there_is_ss_irods_group,
        CHECK_NAMES.check_ss_irods_group_read_permission,
        CHECK_NAMES.check_for_samples_in_more_studies,
    )

    for _, file_checks in result.items():
        names_seen = {check.check_name for check in file_checks}
        self.assertSetEqual(
            names_seen,
            set(CHECK_NAMES.get_only_mandatory_check_names()))

        for check in file_checks:
            if check.check_name in self.comparison_checks:
                self.assertFalse(check.executed)
                continue
            wanted = RESULT.FAILURE if check.check_name in must_fail else RESULT.SUCCESS
            self.assertEqual(wanted, check.result)
def test_when_md5_is_wrong(self):
    """
    Check a file with a wrong md5: the checksum and replica checks, the
    ss-irods-group checks, the samples-in-more-studies check and the
    attribute-count check must fail. Any other check must succeed if it was
    executed, and must have no result if it was not.
    """
    irods_fpath = "/humgen/projects/serapis_staging/test-metacheck/test_wrong_md5.out"
    result = api.check_metadata_fetched_by_path(irods_fpaths=[irods_fpath], reference='grch38')

    expected_failures = (
        CHECK_NAMES.check_there_is_ss_irods_group,
        CHECK_NAMES.check_ss_irods_group_read_permission,
        CHECK_NAMES.check_for_samples_in_more_studies,
        CHECK_NAMES.check_replica_checksum_valid,
        CHECK_NAMES.check_more_than_one_replica,
        CHECK_NAMES.check_by_comparison_checksum_in_meta_with_checksum_at_upload,
        CHECK_NAMES.check_attribute_count,
    )

    for _, outcomes in result.items():
        for outcome in outcomes:
            if outcome.check_name in expected_failures:
                self.assertEqual(outcome.result, RESULT.FAILURE)
            elif outcome.executed:
                self.assertEqual(outcome.result, RESULT.SUCCESS)
            else:
                # A check that did not run carries no result.
                self.assertIsNone(outcome.result)
def test_when_md5_is_wrong(self):
    """
    Run the checks on the wrong-md5 fixture with reference 'grch38'.

    The listed checksum, replica, ss-irods-group, samples-in-more-studies
    and attribute-count checks must fail. Every other check must either have
    succeeded (when executed) or have a None result (when not executed).
    """
    irods_fpath = "/humgen/projects/serapis_staging/test-metacheck/test_wrong_md5.out"
    result = api.check_metadata_fetched_by_path(irods_fpaths=[irods_fpath], reference='grch38')

    must_fail = (
        CHECK_NAMES.check_there_is_ss_irods_group,
        CHECK_NAMES.check_ss_irods_group_read_permission,
        CHECK_NAMES.check_for_samples_in_more_studies,
        CHECK_NAMES.check_replica_checksum_valid,
        CHECK_NAMES.check_more_than_one_replica,
        CHECK_NAMES.check_by_comparison_checksum_in_meta_with_checksum_at_upload,
        CHECK_NAMES.check_attribute_count,
    )

    for _, file_checks in result.items():
        for check in file_checks:
            if check.check_name in must_fail:
                self.assertEqual(check.result, RESULT.FAILURE)
            elif check.executed:
                self.assertEqual(check.result, RESULT.SUCCESS)
            else:
                self.assertIsNone(check.result)
def test_metadata_for_library_file(self):
    """
    Run the checks on a txt file carrying library-cram-like metadata.

    The file's metadata matches that of a library cram except for some
    fields metacheck does not use anyway. As a txt file it has no header
    metadata.
    """
    irods_fpath = "/humgen/projects/serapis_staging/test-metacheck/test_metadata.txt"
    result = api.check_metadata_fetched_by_path(irods_fpaths=[irods_fpath])
    print("Comparisong checks: %s" % self.comparison_checks)

    # These checks are expected to have been executed and to have failed.
    must_fail = (
        CHECK_NAMES.check_ss_irods_group_read_permission,
        CHECK_NAMES.check_there_is_ss_irods_group,
        CHECK_NAMES.check_studies_in_irods_with_studies_in_seqscape_fetched_by_samples,
        CHECK_NAMES.check_samples_in_irods_same_as_samples_fetched_by_study_from_seqscape,
    )

    for _, file_checks in result.items():
        names_seen = {check.check_name for check in file_checks}
        self.assertSetEqual(names_seen, set(CHECK_NAMES.get_only_mandatory_check_names()))

        for check in file_checks:
            if check.check_name in self.comparison_checks:
                self.assertFalse(check.executed)
            elif check.check_name in must_fail:
                self.assertTrue(check.executed)
                self.assertEqual(check.result, RESULT.FAILURE)
def test_same_check_results_by_path_and_by_metadata(self):
    """
    Fetch the metadata of one cram both by metadata query and by iRODS path,
    and require the two runs to produce equal check results.
    """
    fpath = '/humgen/projects/serapis_staging/test-metacheck/test_metadata_comparison.cram'
    check_results_by_metadata = api.check_metadata_fetched_by_metadata(
        reference='GRCh38', study_name='GDAP_XTEN', irods_zone='humgen')
    check_results_by_path = api.check_metadata_fetched_by_path(
        irods_fpaths=[fpath], reference='GRCh38')
    by_meta = check_results_by_metadata[fpath]
    by_path = check_results_by_path[fpath]

    for expected in by_meta:
        # Look up the first by-path result for the same check; None if missing.
        matching = next(
            (found for found in by_path if found.check_name == expected.check_name),
            None,
        )
        self.assertEqual(expected, matching)

    # Equal lengths rule out extra checks on the by-path side.
    self.assertEqual(len(by_path), len(by_meta))
def main():
    """
    Command-line entry point: run the metadata checks and exit.

    Parses the command line with ``arg_parser``, runs the checks using the
    strategy named by ``args.metadata_fetching_strategy`` ('fetch_by_metadata',
    'fetch_by_path' or 'given_at_stdin'), prints the results as JSON when
    ``args.json_output`` is set and as TSV otherwise, then terminates the
    process with the status returned by ``decide_exit_status``.

    Options that are absent from the parsed namespace are treated as None.
    For 'fetch_by_metadata', a warning is printed for each of the file type,
    target and manual_qc filters that was not supplied.

    :raises ValueError: if the fetching strategy is not one of the three
        supported values.
    """
    # Imported locally so this function does not depend on the module's
    # import block; sys.exit() replaces the site-provided exit() helper.
    import sys

    args = arg_parser.parse_args()

    # getattr with a default replaces nine identical try/except
    # AttributeError blocks: a missing option simply becomes None.
    filter_npg_qc = getattr(args, 'filter_npg_qc', None)
    filter_target = getattr(args, 'filter_target', None)
    file_types = getattr(args, 'file_types', None)
    study_name = getattr(args, 'study_name', None)
    study_acc_nr = getattr(args, 'study_acc_nr', None)
    study_internal_id = getattr(args, 'study_internal_id', None)
    irods_fpaths = getattr(args, 'irods_fpaths', None)
    irods_zone = getattr(args, 'irods_zone', None)
    reference = getattr(args, 'desired_reference', None)

    strategy = args.metadata_fetching_strategy
    if strategy == 'fetch_by_metadata':
        # Warn about missing filters before running the (broader) query.
        if not file_types:
            print(
                "WARNING! You haven't filtered on file type. The result will contain both BAMs and CRAMs, possibly other types of file as well.")
        if not filter_target:
            print(
                "WARNING! You haven't filtered by target field. You will get back the report from checking all the data, "
                "no matter if it is the target or not, hence possibly also PhiX")
        if not filter_npg_qc:
            print(
                "WARNING! You haven't filtered on manual_qc field. You will get the report from checking all the data, "
                "no matter if qc pass of fail.")
        check_results_by_fpath = check_metadata_fetched_by_metadata(
            filter_npg_qc, filter_target, file_types, study_name,
            study_acc_nr, study_internal_id, irods_zone, reference)
    elif strategy == 'fetch_by_path':
        check_results_by_fpath = check_metadata_fetched_by_path(irods_fpaths, reference)
    elif strategy == 'given_at_stdin':
        check_results_by_fpath = check_metadata_given_as_json_stream(reference)
    else:
        raise ValueError("Fetching strategy not supported")

    if args.json_output:
        check_results_as_json = format_output_as_json(check_results_by_fpath)
        print(check_results_as_json)
    else:
        result_as_tsv_string = format_output_as_tsv(check_results_by_fpath)
        print(result_as_tsv_string)

    sys.exit(decide_exit_status(check_results_by_fpath))