Пример #1
0
    def test_output_trait_mapping(self):
        """Check that finished mappings are written as one TSV row per ontology entry.

        A trait with two finished mappings is passed to ``output.output_trait_mapping`` and the
        resulting file is read back; each data row is expected to be
        ``[trait name, ontology URI, ontology label]``, in the order the mappings were supplied.
        """
        import os  # local import: only needed for temp-file housekeeping in this test

        # mkstemp() returns an already-open OS-level descriptor alongside the path. Close it
        # straight away so it is not leaked, and remove the file once the test is over.
        file_descriptor, tempfile_path = tempfile.mkstemp()
        os.close(file_descriptor)
        self.addCleanup(os.remove, tempfile_path)

        with open(tempfile_path, "w", newline='') as mapping_file:
            mapping_writer = csv.writer(mapping_file, delimiter="\t")
            mapping_writer.writerow(["#clinvar_trait_name", "uri", "label"])

            test_trait = Trait('aprt deficiency, japanese type', 11)

            # Normally a set, but changed to a list for predictable output order in test
            test_trait.finished_mapping_set = [
                OntologyEntry('http://www.orpha.net/ORDO/Orphanet_976',
                              'Adenine phosphoribosyltransferase deficiency'),
                OntologyEntry('http://www.orpha.net/ORDO/Orphanet_977',
                              'Adenine phosphoribosyltransferase deficiency type A')
            ]

            output.output_trait_mapping(test_trait, mapping_writer)

        with open(tempfile_path, "rt", newline='') as mapping_file:
            mapping_reader = csv.reader(mapping_file, delimiter="\t")
            # Skip the header row written above
            next(mapping_reader)
            self.assertEqual(['aprt deficiency, japanese type',
                              'http://www.orpha.net/ORDO/Orphanet_976',
                              'Adenine phosphoribosyltransferase deficiency'],
                             next(mapping_reader))

            self.assertEqual(['aprt deficiency, japanese type',
                              'http://www.orpha.net/ORDO/Orphanet_977',
                              'Adenine phosphoribosyltransferase deficiency type A'],
                             next(mapping_reader))
Пример #2
0
    def test_output_for_curation(self):
        """Check the curation TSV row produced for a trait that only has an OxO mapping.

        The trait carries a single OxO result with one mapping flagged as both ``in_efo`` and
        ``is_current``. The row read back is expected to hold the trait name, its frequency and
        one pipe-separated mapping field (URI, label, distance, query id).
        """
        import os  # local import: only needed for temp-file housekeeping in this test

        # mkstemp() returns an already-open OS-level descriptor alongside the path. Close it
        # straight away so it is not leaked, and remove the file once the test is over.
        file_descriptor, tempfile_path = tempfile.mkstemp()
        os.close(file_descriptor)
        self.addCleanup(os.remove, tempfile_path)

        with open(tempfile_path, "wt") as curation_file:
            curation_writer = csv.writer(curation_file, delimiter="\t")

            test_trait = Trait("transitional cell carcinoma of the bladder", 276)

            test_oxo_result = OxOResult("HP:0006740", "Transitional cell carcinoma of the bladder",
                                        "HP:0006740")
            test_oxo_mapping = OxOMapping("bladder transitional cell carcinoma", "EFO:0006544", 2,
                                          "HP:0006740")
            test_oxo_mapping.in_efo = test_oxo_mapping.is_current = True
            test_oxo_mapping.ontology_label = "bladder transitional cell carcinoma"
            test_oxo_result.mapping_list = [test_oxo_mapping]

            test_trait.oxo_result_list = [test_oxo_result]

            output.output_for_curation(test_trait, curation_writer)

        with open(tempfile_path, "rt") as curation_file:
            curation_reader = csv.reader(curation_file, delimiter="\t")

            self.assertEqual(["transitional cell carcinoma of the bladder", "276",
                              "http://www.ebi.ac.uk/efo/EFO_0006544|bladder transitional cell carcinoma|2|HP:0006740"],
                             next(curation_reader))
Пример #3
0
    def test_output_trait_mapping(self):
        """Verify the mapping file rows written by ``output.output_trait_mapping``.

        Two finished ontology mappings are attached to one trait; after writing, the file is
        re-read and each data row must equal ``[trait name, ontology URI, ontology label]`` in
        the same order as the mappings were listed.
        """
        import os  # local import: only needed for temp-file housekeeping in this test

        # Keep the descriptor returned by mkstemp() so it can be closed instead of leaked, and
        # register removal of the temporary file for when the test finishes.
        file_descriptor, tempfile_path = tempfile.mkstemp()
        os.close(file_descriptor)
        self.addCleanup(os.remove, tempfile_path)

        with open(tempfile_path, "w", newline='') as mapping_file:
            mapping_writer = csv.writer(mapping_file, delimiter="\t")
            mapping_writer.writerow(["#clinvar_trait_name", "uri", "label"])

            test_trait = Trait('aprt deficiency, japanese type', 11)

            # Normally a set, but changed to a list for predictable output order in test
            test_trait.finished_mapping_set = [
                OntologyEntry('http://www.orpha.net/ORDO/Orphanet_976',
                              'Adenine phosphoribosyltransferase deficiency'),
                OntologyEntry(
                    'http://www.orpha.net/ORDO/Orphanet_977',
                    'Adenine phosphoribosyltransferase deficiency type A')
            ]

            output.output_trait_mapping(test_trait, mapping_writer)

        with open(tempfile_path, "rt", newline='') as mapping_file:
            mapping_reader = csv.reader(mapping_file, delimiter="\t")
            # Skip the header row written above
            next(mapping_reader)
            self.assertEqual([
                'aprt deficiency, japanese type',
                'http://www.orpha.net/ORDO/Orphanet_976',
                'Adenine phosphoribosyltransferase deficiency'
            ], next(mapping_reader))

            self.assertEqual([
                'aprt deficiency, japanese type',
                'http://www.orpha.net/ORDO/Orphanet_977',
                'Adenine phosphoribosyltransferase deficiency type A'
            ], next(mapping_reader))
Пример #4
0
    def test_output_for_curation(self):
        """Verify the curation row written by ``output.output_for_curation``.

        The trait has one OxO result holding a single mapping flagged as ``in_efo`` and
        ``is_current``. The expected row is the trait name, its frequency, and one pipe-separated
        mapping field ending in the ``EFO_CURRENT`` marker.
        """
        import os  # local import: only needed for temp-file housekeeping in this test

        # Keep the descriptor returned by mkstemp() so it can be closed instead of leaked, and
        # register removal of the temporary file for when the test finishes.
        file_descriptor, tempfile_path = tempfile.mkstemp()
        os.close(file_descriptor)
        self.addCleanup(os.remove, tempfile_path)

        with open(tempfile_path, "wt") as curation_file:
            curation_writer = csv.writer(curation_file, delimiter="\t")

            test_trait = Trait("transitional cell carcinoma of the bladder",
                               276)

            test_oxo_result = OxOResult(
                "HP:0006740", "Transitional cell carcinoma of the bladder",
                "HP:0006740")
            test_oxo_mapping = OxOMapping(
                "bladder transitional cell carcinoma", "EFO:0006544", 2,
                "HP:0006740")
            test_oxo_mapping.in_efo = test_oxo_mapping.is_current = True
            test_oxo_mapping.ontology_label = "bladder transitional cell carcinoma"
            test_oxo_result.mapping_list = [test_oxo_mapping]

            test_trait.oxo_result_list = [test_oxo_result]

            output.output_for_curation(test_trait, curation_writer)

        with open(tempfile_path, "rt") as curation_file:
            curation_reader = csv.reader(curation_file, delimiter="\t")
            expected_record = [
                "transitional cell carcinoma of the bladder", "276",
                "http://www.ebi.ac.uk/efo/EFO_0006544|bladder transitional cell carcinoma|2|HP:0006740|EFO_CURRENT"
            ]
            self.assertEqual(expected_record, next(curation_reader))
Пример #5
0
def main(input_filepath, output_mappings_filepath, output_curation_filepath,
         filters, zooma_host, oxo_target_list, oxo_distance):
    """
    Parse trait names, process each one in a pool of worker processes and write the results.

    :param input_filepath: Path passed to `parse_trait_names` to obtain the trait names.
    :param output_mappings_filepath: Path of the TSV file which receives the header row
                                     `#clinvar_trait_name, uri, label`; it is handed to
                                     `output_trait` through a csv writer.
    :param output_curation_filepath: Path of the TSV file handed to `output_trait` through a
                                     second csv writer.
    :param filters: Passed through unchanged to `process_trait`.
    :param zooma_host: Passed through unchanged to `process_trait`.
    :param oxo_target_list: Passed through unchanged to `process_trait`.
    :param oxo_distance: Passed through unchanged to `process_trait`.
    """
    logger.info('Started parsing trait names')
    trait_names_list = parse_trait_names(input_filepath)
    trait_names_counter = Counter(trait_names_list)
    logger.info("Loaded {} trait names".format(len(trait_names_counter)))

    with open(output_mappings_filepath, "w", newline='') as mapping_file, \
            open(output_curation_filepath, "wt") as curation_file:
        mapping_writer = csv.writer(mapping_file, delimiter="\t")
        mapping_writer.writerow(["#clinvar_trait_name", "uri", "label"])
        curation_writer = csv.writer(curation_file, delimiter="\t")

        logger.info('Processing trait names in parallel')
        process_trait_arguments = [
            (Trait(trait_name, freq), filters, zooma_host, oxo_target_list, oxo_distance)
            for trait_name, freq in trait_names_counter.items()
        ]

        # Pool.apply() blocks until each individual call returns, which made the previous
        # implementation effectively serial. starmap() distributes all calls across the workers
        # and still returns the results in input order. The context manager makes sure the
        # worker processes are shut down even if processing raises.
        with multiprocessing.Pool(processes=12) as trait_process_pool:
            processed_trait_list = trait_process_pool.starmap(process_trait,
                                                              process_trait_arguments)

        # Writing stays in the parent process so that rows are emitted in a stable order
        for trait in processed_trait_list:
            output_trait(trait, mapping_writer, curation_writer)

    logger.info('Finished processing trait names')
Пример #6
0
def parse_trait_names(filepath: str) -> list:
    """
    For a file containing ClinVar records in the TSV format, return a list of Traits for the records in the file. Each
    Trait object contains trait name, how many times it occurs in the input file, and whether it is linked to an NT
    expansion variant.

    Trait occurrence count is calculated based on all unique (trait name, RCV, AlleleID) tuples in the input file. This
    is because each such tuple will, generally speaking, correspond to one output evidence string. So if we want to
    gauge which trait names are more important to curate, we need to consider how many such tuples it appears in.

    The reason we need to keep track of only *unique* tuples is because some (most) alleles will appear twice in the
    document with coordinates for GRCh37 and GRCh38, and we don't want to count them twice.

    Traits which are implicated in "NT expansion" variants are marked using a special field, because their curation is
    of highest importance even if the number of records which they are linked to is low.

    Records whose trait name is '-' are reported on stdout and left out of the result. Trait names are lowercased
    when the Trait objects are created.

    :param filepath: Path to a gzipped file containing ClinVar TSV summary.
    :return: A list of Trait objects.
    """

    # Tracks unique (trait name, RCV, AlleleID) tuples
    unique_association_tuples = set()

    # Tracks all traits which are at least once implicated in "NT expansion", or nucleotide repeat expansion, variants.
    # Their curation is of highest importance regardless of how many records they are actually associated with.
    nt_expansion_traits = set()

    with gzip.open(filepath, "rt") as clinvar_summary:
        header = clinvar_summary.readline().rstrip().split('\t')
        for line in clinvar_summary:
            values = line.rstrip().split('\t')
            data = dict(zip(header, values))

            # Extract relevant fields
            is_nt_expansion_variant = data['Type'] == 'NT expansion'
            allele_id = data['#AlleleID']
            # Keep both fields as ordered lists. Converting them to sets before zipping (as was done previously)
            # paired traits with RCV ids in arbitrary, run-dependent order and shortened the pairing whenever a
            # value was repeated. Duplicated tuples are still collapsed by `unique_association_tuples` below.
            # NOTE(review): assumes the i-th PhenotypeList entry belongs to the i-th RCVaccession entry, which is
            # what pairing them with zip() implies - confirm against the ClinVar file format.
            trait_names_in_record = data['PhenotypeList'].split(';')
            rcv_ids_in_record = data['RCVaccession'].split(';')

            # Process all (trait, rcv) records
            for trait, rcv_id in zip(trait_names_in_record, rcv_ids_in_record):
                unique_association_tuples.add((trait, rcv_id, allele_id))
                if is_nt_expansion_variant:
                    nt_expansion_traits.add(trait)

    # Count trait occurrences
    trait_names = [t[0] for t in unique_association_tuples]
    traits = []
    for trait_name, trait_frequency in Counter(trait_names).items():
        if trait_name == '-':
            print('Skipped {} missing trait names'.format(trait_frequency))
            continue
        associated_with_nt_expansion = trait_name in nt_expansion_traits
        traits.append(
            Trait(name=trait_name.lower(),
                  frequency=trait_frequency,
                  associated_with_nt_expansion=associated_with_nt_expansion))

    return traits
Пример #7
0
def process_trait(trait: Trait, filters: dict, zooma_host: str,
                  oxo_target_list: list, oxo_distance: int) -> Trait:
    """
    Run the Zooma lookup for one trait and, when that is not sufficient, the OxO lookup.

    The trait is returned right after the Zooma step if it is already finished, if Zooma returned
    nothing, or if at least one Zooma mapping entry is flagged as current. Otherwise the URIs
    selected by `get_uris_for_oxo` are converted to OxO identifiers and queried; if no URIs are
    selected the trait is returned without an OxO query.

    :param trait: The trait to be processed; it is updated in place.
    :param filters: A dictionary of filters to use for querying Zooma.
    :param zooma_host: A string with the hostname to use for querying Zooma.
    :param oxo_target_list: A list of strings, each being an OxO ID for an ontology. Used to specify
                            which ontologies should be queried using OxO.
    :param oxo_distance: int specifying the maximum number of steps to use to query OxO, i.e. OxO's
                         "distance" parameter.
    :return: The same trait object, carrying whatever Zooma and OxO results were found.
    """
    trait.zooma_result_list = get_zooma_results(trait.name, filters, zooma_host)
    trait.process_zooma_results()

    # Guard clauses, checked in the same order as the original combined condition
    if trait.is_finished:
        return trait
    if len(trait.zooma_result_list) == 0:
        return trait
    current_flags = [
        entry.is_current
        for zooma_result in trait.zooma_result_list
        for entry in zooma_result.mapping_list
    ]
    if any(current_flags):
        return trait

    uris_for_oxo_set = get_uris_for_oxo(trait.zooma_result_list)
    if len(uris_for_oxo_set) == 0:
        return trait

    trait.oxo_result_list = get_oxo_results(uris_to_oxo_format(uris_for_oxo_set),
                                            oxo_target_list, oxo_distance)
    trait.process_oxo_mappings()
    return trait
Пример #8
0
def process_trait(trait: Trait, filters: dict, zooma_host: str, oxo_target_list: list,
                  oxo_distance: int) -> Trait:
    """
    Query Zooma for a single trait and fall back to OxO when Zooma alone is not enough.

    OxO is only consulted when the trait is not finished after the Zooma step, Zooma returned at
    least one result, and none of the Zooma mapping entries is flagged as current. Even then the
    OxO query is skipped if `get_uris_for_oxo` selects no URIs.

    :param trait: The trait to be processed; it is updated in place.
    :param filters: A dictionary of filters to use for querying Zooma.
    :param zooma_host: A string with the hostname to use for querying Zooma.
    :param oxo_target_list: A list of strings, each being an OxO ID for an ontology. Used to specify
                            which ontologies should be queried using OxO.
    :param oxo_distance: int specifying the maximum number of steps to use to query OxO, i.e. OxO's
                         "distance" parameter.
    :return: The same trait object, carrying whatever Zooma and OxO results were found.
    """
    trait.zooma_result_list = get_zooma_results(trait.name, filters, zooma_host)
    trait.process_zooma_results()

    # Negation of the original early-return condition; evaluation order is unchanged
    oxo_lookup_needed = (
        not trait.is_finished
        and len(trait.zooma_result_list) != 0
        and not any([candidate.is_current
                     for zooma_result in trait.zooma_result_list
                     for candidate in zooma_result.mapping_list])
    )
    if oxo_lookup_needed:
        candidate_uris = get_uris_for_oxo(trait.zooma_result_list)
        if len(candidate_uris) != 0:
            oxo_ids = uris_to_oxo_format(candidate_uris)
            trait.oxo_result_list = get_oxo_results(oxo_ids, oxo_target_list, oxo_distance)
            trait.process_oxo_mappings()

    return trait
def parse_trait_names(filepath: str) -> list:
    """Build the list of Traits found in a ClinVar dataset.

    Every record read through `clinvar_xml_utils.ClinVarDataset` contributes the lowercased names of its traits with
    valid names. Each distinct name is counted once per record, so a trait's frequency is the number of records it
    appears in.

    All trait names of a record whose measure is flagged as a repeat expansion variant are remembered, and the
    resulting Trait objects are marked accordingly, so they can be prioritised for curation even when their
    frequency is low.

    Names equal to '-' are reported on stdout and left out of the result.

    :param filepath: Path handed to `clinvar_xml_utils.ClinVarDataset`.
    :return: A list of Trait objects."""

    # Number of records each trait name appears in
    occurrences = Counter()

    # Names seen at least once on a record with a repeat expansion variant
    repeat_expansion_names = set()

    for record in clinvar_xml_utils.ClinVarDataset(filepath):
        names_in_record = {
            valid_trait.preferred_or_other_valid_name.lower()
            for valid_trait in record.traits_with_valid_names
        }
        # Counting from a set adds exactly one per distinct name in this record
        occurrences.update(names_in_record)
        if record.measure and record.measure.is_repeat_expansion_variant:
            repeat_expansion_names.update(names_in_record)

    # Turn the counts into Trait objects
    collected = []
    for name, count in occurrences.items():
        if name != '-':
            collected.append(
                Trait(name=name,
                      frequency=count,
                      associated_with_nt_expansion=name in repeat_expansion_names))
        else:
            print('Skipped {} missing trait names'.format(count))

    return collected
Пример #10
0
def main(input_filepath, output_mappings_filepath, output_curation_filepath,
         filters, zooma_host, oxo_target_list, oxo_distance):
    """
    Parse trait names, process them one after another and write the results as they are produced.

    A progress bar showing an adaptive ETA is displayed while the traits are processed.

    :param input_filepath: Path passed to `parse_trait_names` to obtain the trait names.
    :param output_mappings_filepath: Path of the TSV file which receives the header row
                                     `#clinvar_trait_name, uri, label`; it is handed to
                                     `output_trait` through a csv writer.
    :param output_curation_filepath: Path of the TSV file handed to `output_trait` through a
                                     second csv writer.
    :param filters: Passed through unchanged to `process_trait`.
    :param zooma_host: Passed through unchanged to `process_trait`.
    :param oxo_target_list: Passed through unchanged to `process_trait`.
    :param oxo_distance: Passed through unchanged to `process_trait`.
    """
    name_frequencies = Counter(parse_trait_names(input_filepath))

    with open(output_mappings_filepath, "w", newline='') as mapping_file, \
            open(output_curation_filepath, "wt") as curation_file:
        mapping_writer = csv.writer(mapping_file, delimiter="\t")
        mapping_writer.writerow(["#clinvar_trait_name", "uri", "label"])
        curation_writer = csv.writer(curation_file, delimiter="\t")

        eta_widget = progressbar.AdaptiveETA(samples=1000)
        progress = progressbar.ProgressBar(max_value=len(name_frequencies),
                                           widgets=[eta_widget])

        for name, frequency in progress(name_frequencies.items()):
            processed = process_trait(Trait(name, frequency), filters, zooma_host,
                                      oxo_target_list, oxo_distance)
            output_trait(processed, mapping_writer, curation_writer)