def test_genetics_evidence_string(self):
    """Verifies expected genetics evidence string generation.

    Serialises a ``CTTVGeneticsEvidenceString`` built from ``self.test_args``
    as JSON with sorted keys and a two-space indent, then compares it
    verbatim with the contents of the file at
    ``config.expected_genetics_evidence_string``.
    """
    evidence_string = json.dumps(
        evidence_strings.CTTVGeneticsEvidenceString(*self.test_args),
        sort_keys=True, indent=2)
    # Read the expected output inside a context manager so the file handle
    # is released deterministically instead of being left to the garbage
    # collector.
    with open(config.expected_genetics_evidence_string) as expected_file:
        expected_evidence_string = expected_file.read()
    self.assertEqual(evidence_string, expected_evidence_string)
def clinvar_to_evidence_strings(allowed_clinical_significance, mappings, json_file, ot_schema):
    """Build evidence strings from Cellbase/ClinVar records and collect them in a report.

    For every record, every measure of that record, and every combination of
    consequence type, trait and converted allele origin, an evidence string
    is created (germline or somatic flavour) and handed to
    ``report.add_evidence_string`` together with the parsed schema.

    :param allowed_clinical_significance: forwarded unchanged to ``skip_record``.
    :param mappings: object providing ``trait_2_efo`` and ``consequence_type_dict``.
    :param json_file: forwarded to ``cellbase_records.CellbaseRecords``.
    :param ot_schema: path of a JSON file; its parsed contents are passed to
        ``report.add_evidence_string``.
    :return: the populated ``Report`` instance.
    :raises AssertionError: if a converted allele origin is neither
        ``'germline'`` nor ``'somatic'``.
    """
    report = Report(trait_mappings=mappings.trait_2_efo)
    cell_recs = cellbase_records.CellbaseRecords(json_file=json_file)
    # Context manager so the schema file handle is closed as soon as it is parsed.
    with open(ot_schema) as ot_schema_file:
        ot_schema_contents = json.loads(ot_schema_file.read())

    for cellbase_record in cell_recs:
        report.counters["record_counter"] += 1
        if report.counters["record_counter"] % 1000 == 0:
            logger.info("{} records processed".format(
                report.counters["record_counter"]))

        n_ev_strings_per_record = 0
        clinvar_record = clinvar.ClinvarRecord(cellbase_record['clinvarSet'])

        for clinvar_record_measure in clinvar_record.measures:
            # Booleans are added as 0/1 to the counters.
            report.counters["n_nsvs"] += (clinvar_record_measure.nsv_id is not None)
            append_nsv(report.nsv_list, clinvar_record_measure)
            report.counters["n_multiple_allele_origin"] += (
                len(clinvar_record.allele_origins) > 1)
            traits = create_traits(clinvar_record.traits, mappings.trait_2_efo, report)
            converted_allele_origins = convert_allele_origins(
                clinvar_record.allele_origins)

            for consequence_type, trait, allele_origin in itertools.product(
                    get_consequence_types(clinvar_record_measure,
                                          mappings.consequence_type_dict),
                    traits,
                    converted_allele_origins):

                if skip_record(clinvar_record, clinvar_record_measure,
                               consequence_type, allele_origin,
                               allowed_clinical_significance, report):
                    continue

                if allele_origin == 'germline':
                    evidence_string = evidence_strings.CTTVGeneticsEvidenceString(
                        clinvar_record, clinvar_record_measure, report, trait,
                        consequence_type)
                elif allele_origin == 'somatic':
                    evidence_string = evidence_strings.CTTVSomaticEvidenceString(
                        clinvar_record, clinvar_record_measure, report, trait,
                        consequence_type)
                else:
                    # Without this branch an unexpected origin would either hit
                    # an unbound ``evidence_string`` or silently re-submit the
                    # one built in the previous iteration.
                    raise AssertionError(
                        'Unknown allele_origin present in the data: {}'.format(
                            allele_origin))

                report.add_evidence_string(evidence_string, clinvar_record, trait,
                                           consequence_type.ensembl_gene_id,
                                           ot_schema_contents)
                report.evidence_list.append([
                    clinvar_record.accession,
                    clinvar_record_measure.rs_id,
                    trait.clinvar_name,
                    trait.ontology_id,
                ])
                report.counters["n_valid_rs_and_nsv"] += (
                    clinvar_record_measure.nsv_id is not None)
                report.traits.add(trait.ontology_id)
                report.remove_trait_mapping(trait.clinvar_name)
                report.ensembl_gene_id_uris.add(
                    evidence_strings.get_ensembl_gene_id_uri(
                        consequence_type.ensembl_gene_id))

                n_ev_strings_per_record += 1

            # NOTE(review): the nesting level of this tally (inside the measure
            # loop vs. once per record) should be confirmed against the
            # project's history.
            if n_ev_strings_per_record > 0:
                report.counters["n_processed_clinvar_records"] += 1
                if n_ev_strings_per_record > 1:
                    report.counters["n_multiple_evidence_strings"] += 1

    return report
def setUp(self):
    """Prepare the shared fixtures used by the tests.

    Stores the evidence-string input data in ``self.test_args``, a
    ``CTTVGeneticsEvidenceString`` built from it in ``self.test_ges``, and
    the JSON schema parsed from the gzip file at
    ``config.open_targets_schema_gz`` in ``self.ot_schema_contents``.
    """
    self.test_args = get_input_data_for_evidence_string_generation()
    self.test_ges = evidence_strings.CTTVGeneticsEvidenceString(
        *self.test_args)
    # setUp runs before every test, so close the gzip handle explicitly
    # rather than leaking one per test.
    with gzip.open(config.open_targets_schema_gz) as schema_file:
        self.ot_schema_contents = json.loads(
            schema_file.read().decode('utf-8'))
def test_validate(self):
    """Assert that a freshly built genetics evidence string passes validation.

    The evidence string is validated against ``self.ot_schema_contents``.
    """
    input_data = get_input_data_for_evidence_string_generation()
    genetics_evidence = evidence_strings.CTTVGeneticsEvidenceString(*input_data)
    is_valid = genetics_evidence.validate(self.ot_schema_contents)
    self.assertTrue(is_valid)
def clinvar_to_evidence_strings(allowed_clinical_significance, mappings, json_file, ot_schema,
                                output_evidence_strings):
    """Build evidence strings from Cellbase/ClinVar records and stream them to a file.

    For every record, every measure of that record, and every combination of
    consequence type, trait and converted allele origin, an evidence string
    is created (germline or somatic flavour), validated via
    ``validate_evidence_string`` and written immediately as one JSON document
    per line, so the full set is never held in memory.

    :param allowed_clinical_significance: forwarded unchanged to ``skip_record``.
    :param mappings: object providing ``trait_2_efo`` and ``consequence_type_dict``.
    :param json_file: forwarded to ``cellbase_records.CellbaseRecords``.
    :param ot_schema: path of a JSON file; its parsed contents are passed to
        ``validate_evidence_string``.
    :param output_evidence_strings: destination passed to
        ``utilities.open_file`` in ``'wt'`` mode.
    :return: the populated ``Report`` instance.
    :raises AssertionError: if a converted allele origin is neither
        ``'germline'`` nor ``'somatic'``.
    """
    report = Report(trait_mappings=mappings.trait_2_efo)
    cell_recs = cellbase_records.CellbaseRecords(json_file=json_file)
    # Context manager so the schema file handle is closed as soon as it is parsed.
    with open(ot_schema) as ot_schema_file:
        ot_schema_contents = json.loads(ot_schema_file.read())

    output_evidence_strings_file = utilities.open_file(output_evidence_strings, 'wt')
    # try/finally guarantees the output file is closed (and thus flushed) even
    # when processing aborts, e.g. on the AssertionError below.
    try:
        for cellbase_record in cell_recs:
            report.counters["record_counter"] += 1
            if report.counters["record_counter"] % 1000 == 0:
                logger.info("{} records processed".format(
                    report.counters["record_counter"]))

            n_ev_strings_per_record = 0
            clinvar_record = clinvar.ClinvarRecord(cellbase_record['clinvarSet'])

            for clinvar_record_measure in clinvar_record.measures:
                # Booleans are added as 0/1 to the counters.
                report.counters["n_nsvs"] += (clinvar_record_measure.nsv_id is not None)
                append_nsv(report.nsv_list, clinvar_record_measure)
                report.counters["n_multiple_allele_origin"] += (
                    len(clinvar_record.allele_origins) > 1)
                traits = create_traits(clinvar_record.traits, mappings.trait_2_efo,
                                       report)
                converted_allele_origins = convert_allele_origins(
                    clinvar_record.allele_origins)

                for consequence_type, trait, allele_origin in itertools.product(
                        get_consequence_types(clinvar_record_measure,
                                              mappings.consequence_type_dict),
                        traits,
                        converted_allele_origins):

                    if skip_record(clinvar_record, clinvar_record_measure,
                                   consequence_type, allele_origin,
                                   allowed_clinical_significance, report):
                        continue

                    if allele_origin == 'germline':
                        evidence_string = evidence_strings.CTTVGeneticsEvidenceString(
                            clinvar_record, clinvar_record_measure, report, trait,
                            consequence_type)
                    elif allele_origin == 'somatic':
                        evidence_string = evidence_strings.CTTVSomaticEvidenceString(
                            clinvar_record, clinvar_record_measure, report, trait,
                            consequence_type)
                    else:
                        raise AssertionError(
                            'Unknown allele_origin present in the data: {}'.format(
                                allele_origin))

                    # Validate and immediately output the evidence string
                    # (not keeping everything in memory)
                    validate_evidence_string(evidence_string, clinvar_record, trait,
                                             consequence_type.ensembl_gene_id,
                                             ot_schema_contents)
                    output_evidence_strings_file.write(
                        json.dumps(evidence_string) + '\n')
                    report.evidence_string_count += 1

                    report.evidence_list.append([
                        clinvar_record.accession,
                        clinvar_record_measure.rs_id,
                        trait.clinvar_name,
                        trait.ontology_id,
                    ])
                    report.counters["n_valid_rs_and_nsv"] += (
                        clinvar_record_measure.nsv_id is not None)
                    report.traits.add(trait.ontology_id)
                    report.remove_trait_mapping(trait.clinvar_name)
                    report.ensembl_gene_id_uris.add(
                        evidence_strings.get_ensembl_gene_id_uri(
                            consequence_type.ensembl_gene_id))

                    n_ev_strings_per_record += 1

                # NOTE(review): the nesting level of this tally (inside the
                # measure loop vs. once per record) should be confirmed
                # against the project's history.
                if n_ev_strings_per_record > 0:
                    report.counters["n_processed_clinvar_records"] += 1
                    if n_ev_strings_per_record > 1:
                        report.counters["n_multiple_evidence_strings"] += 1
    finally:
        output_evidence_strings_file.close()

    return report