Exemplo n.º 1
0
def get_test_record():
    """Load the JSON test fixture and wrap it in a ``clinvar.ClinvarRecord``.

    The fixture is read from ``resources/test_clinvar_record.json``, resolved relative to the
    directory containing this module.
    """
    resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
    record_path = os.path.join(resources_dir, 'test_clinvar_record.json')
    with utilities.open_file(record_path, "rt") as record_file:
        raw_record = json.load(record_file)
    return clinvar.ClinvarRecord(raw_record)
def get_input_data_for_evidence_string_generation():
    """Prepares mock input data necessary for the evidence string generation.

    Returns a 5-tuple ``(clinvar_record, measure, report, trait, consequence_type)`` where
    ``measure`` is the first entry of ``clinvar_record.measures``, ``report`` is a fresh
    ``clinvar_to_evidence_strings.Report`` and ``trait`` is a hand-built stand-in namespace.
    """
    # Read the fixture inside a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(config.test_clinvar_record_file) as record_file:
        record_dict = json.load(record_file)
    clinvar_record = clinvar.ClinvarRecord(record_dict)
    report = clinvar_to_evidence_strings.Report()

    # Minimal trait stand-in carrying the four attributes set up for the tests.
    trait = SimpleNamespace(
        trait_counter=0,
        clinvar_name='',
        ontology_id='http://www.orpha.net/ORDO/Orphanet_88991',
        ontology_label=None,
    )

    # First consequence type registered in the test mappings for this variant key.
    consequence_type = test_clinvar_to_evidence_strings.MAPPINGS.consequence_type_dict[
        '14:67729209:A:G'][0]
    return clinvar_record, clinvar_record.measures[0], report, trait, consequence_type
Exemplo n.º 3
0
def clinvar_to_evidence_strings(allowed_clinical_significance, mappings,
                                json_file, ot_schema):
    """Build evidence strings for every record of a CellBase JSON file and collect a report.

    Each record yielded by ``cellbase_records.CellbaseRecords`` is wrapped in a
    ``clinvar.ClinvarRecord``. For every measure of the record, one evidence string is created
    per combination of consequence type, trait and converted allele origin, unless
    ``skip_record`` rejects the combination. Each evidence string is handed to
    ``report.add_evidence_string`` together with the parsed schema.

    :param allowed_clinical_significance: forwarded unchanged to ``skip_record``.
    :param mappings: object providing ``trait_2_efo`` and ``consequence_type_dict``.
    :param json_file: forwarded to ``cellbase_records.CellbaseRecords`` as ``json_file``.
    :param ot_schema: path to the JSON schema file; its parsed contents are passed to
        ``report.add_evidence_string``.
    :return: the populated ``Report`` instance.
    :raises AssertionError: if a converted allele origin is neither ``'germline'`` nor
        ``'somatic'``.
    """
    report = Report(trait_mappings=mappings.trait_2_efo)
    cell_recs = cellbase_records.CellbaseRecords(json_file=json_file)
    # Parse the schema inside a context manager so the file handle is not leaked.
    with open(ot_schema) as ot_schema_file:
        ot_schema_contents = json.load(ot_schema_file)

    for cellbase_record in cell_recs:
        report.counters["record_counter"] += 1
        if report.counters["record_counter"] % 1000 == 0:
            logger.info("{} records processed".format(
                report.counters["record_counter"]))

        n_ev_strings_per_record = 0
        clinvar_record = clinvar.ClinvarRecord(cellbase_record['clinvarSet'])

        for clinvar_record_measure in clinvar_record.measures:
            # Adding a bool increments the counter by 1 when the condition holds, else by 0.
            report.counters["n_nsvs"] += (clinvar_record_measure.nsv_id
                                          is not None)
            append_nsv(report.nsv_list, clinvar_record_measure)
            report.counters["n_multiple_allele_origin"] += (len(
                clinvar_record.allele_origins) > 1)
            traits = create_traits(clinvar_record.traits, mappings.trait_2_efo,
                                   report)
            converted_allele_origins = convert_allele_origins(
                clinvar_record.allele_origins)

            for consequence_type, trait, allele_origin in itertools.product(
                    get_consequence_types(clinvar_record_measure,
                                          mappings.consequence_type_dict),
                    traits, converted_allele_origins):

                if skip_record(clinvar_record, clinvar_record_measure,
                               consequence_type, allele_origin,
                               allowed_clinical_significance, report):
                    continue

                if allele_origin == 'germline':
                    evidence_string = evidence_strings.CTTVGeneticsEvidenceString(
                        clinvar_record, clinvar_record_measure, report, trait,
                        consequence_type)
                elif allele_origin == 'somatic':
                    evidence_string = evidence_strings.CTTVSomaticEvidenceString(
                        clinvar_record, clinvar_record_measure, report, trait,
                        consequence_type)
                else:
                    # Without this branch an unexpected origin would either hit an unbound
                    # variable or silently reuse the previous iteration's evidence string.
                    raise AssertionError(
                        'Unknown allele_origin present in the data: {}'.format(
                            allele_origin))

                report.add_evidence_string(evidence_string, clinvar_record,
                                           trait,
                                           consequence_type.ensembl_gene_id,
                                           ot_schema_contents)
                report.evidence_list.append([
                    clinvar_record.accession, clinvar_record_measure.rs_id,
                    trait.clinvar_name, trait.ontology_id
                ])
                report.counters["n_valid_rs_and_nsv"] += (
                    clinvar_record_measure.nsv_id is not None)
                report.traits.add(trait.ontology_id)
                report.remove_trait_mapping(trait.clinvar_name)
                report.ensembl_gene_id_uris.add(
                    evidence_strings.get_ensembl_gene_id_uri(
                        consequence_type.ensembl_gene_id))

                n_ev_strings_per_record += 1

            # NOTE(review): n_ev_strings_per_record is reset once per record, but this check
            # runs once per measure, so a record with several measures can bump these
            # counters more than once. Left as-is; confirm whether that is intended.
            if n_ev_strings_per_record > 0:
                report.counters["n_processed_clinvar_records"] += 1
                if n_ev_strings_per_record > 1:
                    report.counters["n_multiple_evidence_strings"] += 1

    return report
Exemplo n.º 4
0
def clinvar_to_evidence_strings(allowed_clinical_significance, mappings,
                                json_file, ot_schema, output_evidence_strings):
    """Build, validate and stream evidence strings for every record of a CellBase JSON file.

    Each record yielded by ``cellbase_records.CellbaseRecords`` is wrapped in a
    ``clinvar.ClinvarRecord``. For every measure of the record, one evidence string is created
    per combination of consequence type, trait and converted allele origin, unless
    ``skip_record`` rejects the combination. Each evidence string is passed to
    ``validate_evidence_string`` and then written immediately to the output file as one JSON
    document per line.

    :param allowed_clinical_significance: forwarded unchanged to ``skip_record``.
    :param mappings: object providing ``trait_2_efo`` and ``consequence_type_dict``.
    :param json_file: forwarded to ``cellbase_records.CellbaseRecords`` as ``json_file``.
    :param ot_schema: path to the JSON schema file; its parsed contents are passed to
        ``validate_evidence_string``.
    :param output_evidence_strings: path opened in ``'wt'`` mode via ``utilities.open_file``
        to receive the evidence strings.
    :return: the populated ``Report`` instance.
    :raises AssertionError: if a converted allele origin is neither ``'germline'`` nor
        ``'somatic'``.
    """
    report = Report(trait_mappings=mappings.trait_2_efo)
    cell_recs = cellbase_records.CellbaseRecords(json_file=json_file)
    # Parse the schema inside a context manager so the file handle is not leaked.
    with open(ot_schema) as ot_schema_file:
        ot_schema_contents = json.load(ot_schema_file)

    # The context manager closes the output file even when processing fails part-way
    # (e.g. on the AssertionError below), instead of only on the success path.
    with utilities.open_file(output_evidence_strings, 'wt') as output_evidence_strings_file:
        for cellbase_record in cell_recs:
            report.counters["record_counter"] += 1
            if report.counters["record_counter"] % 1000 == 0:
                logger.info("{} records processed".format(
                    report.counters["record_counter"]))

            n_ev_strings_per_record = 0
            clinvar_record = clinvar.ClinvarRecord(cellbase_record['clinvarSet'])

            for clinvar_record_measure in clinvar_record.measures:
                # Adding a bool increments the counter by 1 when the condition holds, else 0.
                report.counters["n_nsvs"] += (clinvar_record_measure.nsv_id
                                              is not None)
                append_nsv(report.nsv_list, clinvar_record_measure)
                report.counters["n_multiple_allele_origin"] += (len(
                    clinvar_record.allele_origins) > 1)
                traits = create_traits(clinvar_record.traits, mappings.trait_2_efo,
                                       report)
                converted_allele_origins = convert_allele_origins(
                    clinvar_record.allele_origins)

                for consequence_type, trait, allele_origin in itertools.product(
                        get_consequence_types(clinvar_record_measure,
                                              mappings.consequence_type_dict),
                        traits, converted_allele_origins):

                    if skip_record(clinvar_record, clinvar_record_measure,
                                   consequence_type, allele_origin,
                                   allowed_clinical_significance, report):
                        continue

                    if allele_origin == 'germline':
                        evidence_string = evidence_strings.CTTVGeneticsEvidenceString(
                            clinvar_record, clinvar_record_measure, report, trait,
                            consequence_type)
                    elif allele_origin == 'somatic':
                        evidence_string = evidence_strings.CTTVSomaticEvidenceString(
                            clinvar_record, clinvar_record_measure, report, trait,
                            consequence_type)
                    else:
                        raise AssertionError(
                            'Unknown allele_origin present in the data: {}'.format(
                                allele_origin))

                    # Validate and immediately output the evidence string
                    # (not keeping everything in memory).
                    validate_evidence_string(evidence_string, clinvar_record,
                                             trait,
                                             consequence_type.ensembl_gene_id,
                                             ot_schema_contents)
                    output_evidence_strings_file.write(
                        json.dumps(evidence_string) + '\n')
                    report.evidence_string_count += 1

                    report.evidence_list.append([
                        clinvar_record.accession, clinvar_record_measure.rs_id,
                        trait.clinvar_name, trait.ontology_id
                    ])
                    report.counters["n_valid_rs_and_nsv"] += (
                        clinvar_record_measure.nsv_id is not None)
                    report.traits.add(trait.ontology_id)
                    report.remove_trait_mapping(trait.clinvar_name)
                    report.ensembl_gene_id_uris.add(
                        evidence_strings.get_ensembl_gene_id_uri(
                            consequence_type.ensembl_gene_id))

                    n_ev_strings_per_record += 1

                # NOTE(review): n_ev_strings_per_record is reset once per record, but this
                # check runs once per measure, so a record with several measures can bump
                # these counters more than once. Left as-is; confirm whether intended.
                if n_ev_strings_per_record > 0:
                    report.counters["n_processed_clinvar_records"] += 1
                    if n_ev_strings_per_record > 1:
                        report.counters["n_multiple_evidence_strings"] += 1

    return report