Exemplo n.º 1
0
    def _make_pheno_assoc(self, graph, gene_id, disorder_num, disorder_label,
                          phene_key):
        """
        From the docs:
        Brackets, "[ ]", indicate "nondiseases," mainly genetic variations
        that lead to apparently abnormal laboratory test values
        (e.g., dysalbuminemic euthyroidal hyperthyroxinemia).

        Braces, "{ }", indicate mutations that contribute to susceptibility
        to multifactorial disorders (e.g., diabetes, asthma) or to
        susceptibility to infection (e.g., malaria).

        A question mark, "?", before the phenotype name indicates that the
        relationship between the phenotype and gene is provisional.
        More details about this relationship are provided in the comment
        field of the map and in the gene and phenotype OMIM entries.

        Phene key:
        The number in parentheses after the name of each disorder indicates
        the following:
          (1) the disorder was positioned by mapping of the wildtype gene;
          (2) the disease phenotype itself was mapped;
          (3) the molecular basis of the disorder is known;
          (4) the disorder is a chromosome deletion or duplication syndrome.

        reference: https://omim.org/help/faq#1_6

        :param graph: graph object of type dipper.graph.Graph
        :param gene_id: str, gene id as curie
        :param gene_symbol: str, symbol
        :param disorder_num: str, disorder id
        :param disorder_label: str, disorder label
        :param phene_key: int or str, 1-4, see docstring
        :return:
        """

        disorder_id = ':'.join(('OMIM', disorder_num))
        rel_label = 'causes condition'
        rel_id = self.globaltt[rel_label]

        if disorder_label.startswith('['):
            rel_id = self.globaltt['is marker for']
            # rel_label = 'is a marker for'
        elif disorder_label.startswith('{'):
            rel_id = self.globaltt['contributes to']
            # rel_label = 'contributes to'
        elif disorder_label.startswith('?'):
            # this is a questionable mapping!  skip?
            rel_id = self.globaltt['contributes to']

        # Note: this is actually a G2D association;
        # see https://github.com/monarch-initiative/dipper/issues/748
        assoc = G2PAssoc(graph, self.name, gene_id, disorder_id, rel_id)

        if phene_key is not None:
            evidence = self.resolve(phene_key, False)
            if evidence != phene_key:
                assoc.add_evidence(evidence)  # evidence is Found

        assoc.add_association_to_graph()
Exemplo n.º 2
0
    def _make_association(self, subject_id, object_id, rel_id, pubmed_ids):
        """
        Make a reified association given an array of pubmed identifiers.

        Args:
            :param subject_id  id of the subject of the association (gene/chem)
            :param object_id  id of the object of the association (disease)
            :param rel_id  relationship id
            :param pubmed_ids an array of pubmed identifiers
        Returns:
            :return None

        """

        # TODO pass in the relevant Assoc class rather than relying on G2P
        assoc = G2PAssoc(self.graph, self.name, subject_id, object_id, rel_id)
        if pubmed_ids is not None and len(pubmed_ids) > 0:
            for pmid in pubmed_ids:
                ref = Reference(self.graph, pmid,
                                self.globaltt['journal article'])
                ref.addRefToGraph()
                assoc.add_source(pmid)
                assoc.add_evidence(self.globaltt['traceable author statement'])

        assoc.add_association_to_graph()
Exemplo n.º 3
0
    def _build_gene_disease_model(self,
                                  gene_id,
                                  relation_id,
                                  disease_id,
                                  variant_label,
                                  consequence_predicate=None,
                                  consequence_id=None,
                                  allelic_requirement=None,
                                  pmids=None):
        """
        Builds gene variant disease model

        :return: None
        """
        model = Model(self.graph)
        geno = Genotype(self.graph)

        pmids = [] if pmids is None else pmids

        is_variant = False
        variant_or_gene = gene_id

        variant_id_string = variant_label
        variant_bnode = self.make_id(variant_id_string, "_")

        if consequence_predicate is not None \
                and consequence_id is not None:
            is_variant = True
            model.addTriple(variant_bnode, consequence_predicate,
                            consequence_id)
            # Hack to add labels to terms that
            # don't exist in an ontology
            if consequence_id.startswith(':'):
                model.addLabel(consequence_id,
                               consequence_id.strip(':').replace('_', ' '))

        if is_variant:
            variant_or_gene = variant_bnode
            # Typically we would type the variant using the
            # molecular consequence, but these are not specific
            # enough for us to make mappings (see translation table)
            model.addIndividualToGraph(variant_bnode, variant_label,
                                       self.globaltt['variant_locus'])
            geno.addAffectedLocus(variant_bnode, gene_id)
            model.addBlankNodeAnnotation(variant_bnode)

        assoc = G2PAssoc(self.graph, self.name, variant_or_gene, disease_id,
                         relation_id)
        assoc.source = pmids
        assoc.add_association_to_graph()

        if allelic_requirement is not None and is_variant is False:
            model.addTriple(assoc.assoc_id,
                            self.globaltt['has_allelic_requirement'],
                            allelic_requirement)
            if allelic_requirement.startswith(':'):
                model.addLabel(
                    allelic_requirement,
                    allelic_requirement.strip(':').replace('_', ' '))
Exemplo n.º 4
0
    def _add_variant_trait_association(self,
                                       variant_id,
                                       mapped_trait_uri,
                                       mapped_trait,
                                       mondo_data,
                                       pubmed_id,
                                       description=None):
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)

        # Use mapped traits for labels, hope that labels do not contain commas

        mapped_traits = [trait.strip() for trait in mapped_trait.split(',')]
        mapped_trait_uris = [
            iri.strip() for iri in mapped_trait_uri.split(',')
        ]
        if mapped_trait_uris:
            for index, trait in enumerate(mapped_trait_uris):
                trait_curie = trait.replace("http://www.ebi.ac.uk/efo/EFO_",
                                            "EFO:")

                if not DipperUtil.is_id_in_mondo(trait_curie, mondo_data):
                    if re.match(r'^EFO', trait_curie):
                        model.addClassToGraph(
                            trait_curie,
                            mapped_traits[index],
                            self.globaltt['phenotype'],
                            class_category=blv.terms['PhenotypicFeature'])
                    LOG.debug("{} not in mondo".format(trait_curie))
                else:
                    LOG.debug("{} in mondo".format(trait_curie))

                pubmed_curie = 'PMID:' + pubmed_id

                ref = Reference(graph, pubmed_curie,
                                self.globaltt['journal article'])
                ref.addRefToGraph()

                if trait_curie is not None and trait_curie != '' and \
                        variant_id is not None and variant_id != '':
                    assoc = G2PAssoc(
                        graph, self.name, variant_id, trait_curie,
                        model.globaltt['contributes to condition'])
                    assoc.add_source(pubmed_curie)

                    assoc.add_evidence(self.globaltt[
                        'combinatorial evidence used in automatic assertion'])

                    if description is not None:
                        assoc.set_description(description)

                    # FIXME score should get added to provenance/study
                    # assoc.set_score(pvalue)

                    assoc.add_association_to_graph()
Exemplo n.º 5
0
    def parse(self, limit=None):

        count = 0
        for num in range(10, 100):
            fuzzy_gene = "MGI:{0}*".format(num)
            gene = "MGI:{0}".format(num)
            service = Service("http://www.mousemine.org/mousemine/service")
            logging.getLogger('Model').setLevel(logging.CRITICAL)
            logging.getLogger('JSONIterator').setLevel(logging.CRITICAL)
            query = service.new_query("OntologyAnnotation")
            query.add_constraint("subject", "SequenceFeature")
            query.add_constraint("ontologyTerm", "MPTerm")
            query.add_view("subject.primaryIdentifier", "subject.symbol",
                           "subject.sequenceOntologyTerm.name",
                           "ontologyTerm.identifier", "ontologyTerm.name",
                           "evidence.publications.pubMedId",
                           "evidence.comments.type",
                           "evidence.comments.description")
            query.add_sort_order("OntologyAnnotation.ontologyTerm.name", "ASC")
            query.add_constraint("subject.organism.taxonId",
                                 "=",
                                 "10090",
                                 code="A")
            query.add_constraint("subject", "LOOKUP", fuzzy_gene, code="B")
            query.add_constraint("subject.primaryIdentifier",
                                 "CONTAINS",
                                 gene,
                                 code="C")
            query.outerjoin("evidence.comments")

            for row in query.rows():
                mgi_curie = row["subject.primaryIdentifier"]
                mp_curie = row["ontologyTerm.identifier"]
                pub_curie = "PMID:{0}".format(
                    row["evidence.publications.pubMedId"])
                assoc = G2PAssoc(self.graph, self.name, mgi_curie, mp_curie)
                if row["evidence.publications.pubMedId"]:
                    reference = Reference(
                        self.graph, pub_curie,
                        Reference.ref_types['journal_article'])
                    reference.addRefToGraph()
                    assoc.add_source(pub_curie)

                assoc.add_evidence('ECO:0000059')
                assoc.add_association_to_graph()

            if not count % 10 and count != 0:
                count_from = count - 10
                logger.info(
                    "{0} processed ids from MGI:{1}* to MGI:{2}*".format(
                        datetime.datetime.now(), count_from, count))

            count += 1
            if limit and count >= limit:
                break

        return
Exemplo n.º 6
0
    def process_disease_association(self, limit):

        raw = '/'.join((self.rawdir, self.files['disease_assoc']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        logger.info("Processing disease models")
        geno = Genotype(g)
        line_counter = 0
        worm_taxon = 'NCBITaxon:6239'
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, disease_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                # WB	WBGene00000001	aap-1		DOID:2583	PMID:19029536	IEA	ENSEMBL:ENSG00000145675|OMIM:615214	D		Y110A7A.10	gene	taxon:6239	20150612	WB
                gene_id = 'WormBase:' + gene_num

                # make a variant of the gene
                vl = '_:' + '-'.join((gene_num, 'unspecified'))
                vl_label = 'some variant of ' + gene_symbol
                geno.addAffectedLocus(vl, gene_id)
                model.addBlankNodeAnnotation(vl)
                animal_id = geno.make_experimental_model_with_genotype(
                    vl, vl_label, worm_taxon, 'worm')

                assoc = G2PAssoc(g, self.name, animal_id, disease_id,
                                 model.object_properties['model_of'])
                ref = re.sub(r'WB_REF:', 'WormBase:', ref)
                if ref != '':
                    assoc.add_source(ref)
                eco_id = None
                if eco_symbol == 'IEA':
                    eco_id = 'ECO:0000501'  # IEA is this now
                if eco_id is not None:
                    assoc.add_evidence(eco_id)

                assoc.add_association_to_graph()

        return
Exemplo n.º 7
0
    def parse(self, limit=None):
        zfin_parser = ZFIN(self.graph_type, self.are_bnodes_skized)
        model = Model(self.graph)
        zp_file = '/'.join((self.rawdir, self.files['zpmap']['file']))
        g2p_file = '/'.join((self.rawdir, self.files['g2p_clean']['file']))
        zfin_parser.zp_map = zfin_parser._load_zp_mappings(zp_file)

        with open(g2p_file, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                (internal_id,
                 symbol,
                 gene_id,
                 subterm1_id,
                 subterm1_label,
                 pc_rel_id,
                 pc_rel_label,
                 superterm1_id,
                 superterm1_label,
                 quality_id,
                 quality_name,
                 modifier,
                 subterm2_id,
                 subterm2_label,
                 pc_rel2_id,
                 pc_rel2_label,
                 superterm2_id,
                 superterm2_label,
                 fish_id,
                 fish_label,
                 start_stage,
                 end_stage,
                 environment,
                 pub_id,
                 figure_id
                 # ,unknown_field   # just leave this here for next time
                ) = row

                zp_id = zfin_parser._map_sextuple_to_phenotype(
                    superterm1_id, subterm1_id, quality_id, superterm2_id,
                    subterm2_id, modifier)

                gene_curie = "ZFIN:{0}".format(gene_id)
                model.makeLeader(gene_curie)
                pub_curie = "ZFIN:{0}".format(pub_id)
                if zp_id:
                    assoc = G2PAssoc(self.graph, self.name, gene_curie, zp_id)
                    if pub_id:
                        reference = Reference(self.graph, pub_curie,
                                              self.globaltt['document'])
                        reference.addRefToGraph()
                        assoc.add_source(pub_curie)

                    assoc.add_evidence(
                        self.globaltt['experimental phenotypic evidence'])
                    assoc.add_association_to_graph()
Exemplo n.º 8
0
    def _add_variant_trait_association(self,
                                       variant_id,
                                       mapped_trait_uri,
                                       efo_ontology,
                                       pubmed_id,
                                       description=None):
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        # make associations to the EFO terms; there can be >1
        if mapped_trait_uri.strip() != '':
            for trait in re.split(r',', mapped_trait_uri):
                trait = trait.strip()

                trait_curie = trait.replace("http://www.ebi.ac.uk/efo/EFO_",
                                            "EFO:")

                phenotype_query = """
                    SELECT ?trait
                    WHERE {{
                        <{0}> rdfs:subClassOf+ <http://www.ebi.ac.uk/efo/EFO_0000651> .
                        <{0}> rdfs:label ?trait .
                    }}
                """.format(trait)

                query_result = efo_ontology.query(phenotype_query)
                if len(list(query_result)) > 0:
                    if re.match(r'^EFO', trait_curie):
                        model.addClassToGraph(trait_curie,
                                              list(query_result)[0][0],
                                              self.globaltt['Phenotype'])

                pubmed_curie = 'PMID:' + pubmed_id

                ref = Reference(graph, pubmed_curie,
                                self.globaltt['journal article'])
                ref.addRefToGraph()

                assoc = G2PAssoc(graph, self.name, variant_id, trait_curie,
                                 model.globaltt['contributes to condition'])
                assoc.add_source(pubmed_curie)

                assoc.add_evidence(self.globaltt[
                    'combinatorial evidence used in automatic assertion'])

                if description is not None:
                    assoc.set_description(description)

                # FIXME score should get added to provenance/study
                # assoc.set_score(pvalue)
                if trait_curie is not None:
                    assoc.add_association_to_graph()
Exemplo n.º 9
0
    def _make_pheno_assoc(self, g, gene_id, gene_symbol, disorder_num,
                          disorder_label, phene_key):

        geno = Genotype(g)
        model = Model(g)
        disorder_id = ':'.join(('OMIM', disorder_num))
        rel_id = model.object_properties['has_phenotype']  # default
        rel_label = 'causes'
        if re.match(r'\[', disorder_label):
            rel_id = model.object_properties['is_marker_for']
            rel_label = 'is a marker for'
        elif re.match(r'\{', disorder_label):
            rel_id = model.object_properties['contributes_to']
            rel_label = 'contributes to'
        elif re.match(r'\?', disorder_label):
            # this is a questionable mapping!  skip?
            rel_id = model.object_properties['contributes_to']
            rel_label = 'contributes to'

        evidence = self._map_phene_mapping_code_to_eco(phene_key)

        # we actually want the association between the gene and the disease
        # to be via an alternate locus not the "wildtype" gene itself.
        # so we make an anonymous alternate locus,
        # and put that in the association.
        # but we only need to do that in the cases when it's not an NCBIGene
        # (as that is a sequence feature itself)
        if re.match(r'OMIM:', gene_id):
            alt_locus = '_:' + re.sub(r':', '',
                                      gene_id) + '-' + disorder_num + 'VL'
            alt_label = gene_symbol.strip()
            if alt_label is not None and alt_label != '':
                alt_label = \
                    ' '.join(('some variant of', alt_label,
                              'that', rel_label, disorder_label))
            else:
                alt_label = None

            model.addIndividualToGraph(alt_locus, alt_label,
                                       Genotype.genoparts['variant_locus'])
            geno.addAffectedLocus(alt_locus, gene_id)
            model.addBlankNodeAnnotation(alt_locus)

        else:
            # assume it's already been added
            alt_locus = gene_id

        assoc = G2PAssoc(g, self.name, alt_locus, disorder_id, rel_id)
        assoc.add_evidence(evidence)
        assoc.add_association_to_graph()

        return
Exemplo n.º 10
0
    def test_therapeutic_relationship(self):
        from dipper.utils.TestUtils import TestUtils
        from dipper.models.Model import Model

        # Make testutils object and load ttl
        test_query = TestUtils(self.source.graph)
        test_query.load_testgraph_from_turtle(self.source)
        graph = self.source.graph
        model = Model(graph)

        # Expected structure
        # TODO can this be unified OBAN and the Annot models
        # to be automatically generated?
        sparql_query = """
                       SELECT ?assoc ?disease ?rel ?chemical
                       WHERE {
                           ?assoc a OBAN:association ;
                           OBAN:association_has_object ?disease ;
                           OBAN:association_has_predicate ?rel ;
                           OBAN:association_has_subject ?chemical .}
                       """

        # SPARQL variables to check
        chem_id = 'MESH:D009538'
        chem_uri = graph._getNode(chem_id)
        disease_id = 'OMIM:188890'
        disease_uri = graph._getNode(disease_id)
        rel_id = model.object_properties['substance_that_treats']
        rel_uri = graph._getNode(rel_id)
        # TODO unused
        # pubmed_id = 'PMID:16785264'
        # pubmed_uri = gu.getNode(pubmed_id)
        # eco = 'ECO:0000033'

        assoc = G2PAssoc(graph, self.source.name, chem_id, disease_id, rel_id)
        assoc_id = assoc.make_g2p_id()
        assoc_uri = self.source.graph._getNode(assoc_id)

        # One of the expected outputs from query
        expected_output = [assoc_uri, disease_uri, rel_uri, chem_uri]

        # Query graph
        sparql_output = test_query.query_graph(sparql_query)

        self.assertTrue(
            expected_output in sparql_output,
            "did not find expected association: " + str(expected_output) +
            " found " + str(len(sparql_output)) + " others:\n" +
            str(sparql_output))

        logger.info("Test query data finished.")
Exemplo n.º 11
0
    def process_disease_association(self, limit):
        src_key = 'disease_assoc'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        col = self.files[src_key]['columns']
        graph = self.graph
        LOG.info("Processing disease models")
        with open(raw, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            row = next(reader)
            if row[0] != '!gaf-version: 2.0':
                LOG.error('Not a vlaid gaf v2.0 formatted file: %s', raw)
                # raise
            for row in reader:
                if row[0][0] == '!':
                    continue
                # db = row[col.index('DB')]
                gene_num = row[col.index('DB Object ID')]
                # gene_symbol = row[col.index('DB Object Symbol')]
                is_not = row[col.index('Qualifier')]
                disease_id = row[col.index('GO ID')]
                ref = row[col.index('DB:Reference (|DB:Reference)')].strip()
                eco_symbol = row[col.index('Evidence Code')]
                # with_or_from = row[col.index('With (or) From')]
                # aspect = row[col.index('Aspect')]
                # gene_name = row[col.index('DB Object Name')]
                # gene_synonym = row[col.index('DB Object Synonym (|Synonym)')]
                # gene_class = row[col.index('DB Object Type')]
                # taxon = row[col.index('Taxon(|taxon)')]
                # date = row[col.index('Date')]
                # assigned_by = row[col.index('Assigned By')]
                # blank = row[col.index('Annotation Extension')]
                # blank2 = row[col.index('Gene Product Form ID')]

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                # WB	WBGene00000001	aap-1		DOID:2583	PMID:19029536	IEA	ENSEMBL:ENSG00000145675|OMIM:615214	D		Y110A7A.10	gene	taxon:6239	20150612	WB
                gene_id = 'WormBase:' + gene_num

                assoc = G2PAssoc(graph, self.name, gene_id, disease_id,
                                 self.globaltt['is model of'])
                ref = re.sub(r'WB_REF:', 'WormBase:', ref)
                if ref != '':
                    assoc.add_source(ref)
                assoc.add_evidence(self.gaf_eco[eco_symbol])
                assoc.add_association_to_graph()
Exemplo n.º 12
0
    def _process_breed_phene_row(self, row):
        model = Model(self.g)
        # Linking disorders/characteristic to breeds
        # breed_id, phene_id, added_by
        breed_id = self.id_hash['breed'].get(row['breed_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        # get the omia id
        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if (self.testMode and not (
                omia_id in self.test_ids['disease'] and
                int(row['breed_id']) in self.test_ids['breed']) or
                breed_id is None or phene_id is None):
            return

        # FIXME we want a different relationship here
        assoc = G2PAssoc(
            self.g, self.name, breed_id, phene_id,
            model.object_properties['has_phenotype'])
        assoc.add_association_to_graph()

        # add that the breed is a model of the human disease
        # use the omia-omim mappings for this
        # we assume that we have already scrubbed out the genes
        # from the omim list, so we can make the model associations here

        omim_ids = self.omia_omim_map.get(omia_id)
        eco_id = "ECO:0000214"   # biological aspect of descendant evidence
        if omim_ids is not None and len(omim_ids) > 0:
            if len(omim_ids) > 1:
                logger.info(
                    "There's 1:many omia:omim mapping: %s, %s",
                    omia_id, str(omim_ids))
            for i in omim_ids:
                assoc = G2PAssoc(
                    self.g, self.name, breed_id, i,
                    model.object_properties['model_of'])
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph()
                aid = assoc.get_association_id()

                breed_label = self.label_hash.get(breed_id)
                if breed_label is None:
                    breed_label = "this breed"

                m = re.search(r'\((.*)\)', breed_label)
                if m:
                    sp_label = m.group(1)
                else:
                    sp_label = ''

                phene_label = self.label_hash.get(phene_id)
                if phene_label is None:
                    phene_label = "phenotype"
                elif phene_label.endswith(sp_label):
                    # some of the labels we made already include the species;
                    # remove it to make a cleaner desc
                    phene_label = re.sub(r' in '+sp_label, '', phene_label)
                desc = ' '.join(
                    ("High incidence of", phene_label, "in", breed_label,
                     "suggests it to be a model of disease", i + "."))
                model.addDescription(aid, desc)
        return
Exemplo n.º 13
0
    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        logger.info("Processing Allele phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_id = None
                if eco_symbol == 'IMP':
                    eco_id = 'ECO:0000015'
                elif eco_symbol.strip() != '':
                    logger.warning("Encountered an ECO code we don't have: %s",
                                   eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    logger.error(
                        "Missing alleles from phenotype assoc at line %d",
                        line_counter)
                    continue
                else:
                    for a in allele_list:
                        allele_num = re.sub(r'WB:', '', a.strip())
                        allele_id = 'WormBase:' + allele_num
                        gene_id = 'WormBase:' + gene_num

                        if re.search(r'WBRNAi', allele_id):
                            # make the reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            rnai_num = re.sub(r'WormBase:', '', allele_id)
                            rnai_id = allele_id
                            rtg_id = self.make_reagent_targeted_gene_id(
                                gene_num, rnai_num)
                            geno.addReagentTargetedGene(
                                rnai_id, 'WormBase:' + gene_num, rtg_id)
                            geno.addGeneTargetingReagent(
                                rnai_id, None, geno.genoparts['RNAi_reagent'],
                                gene_id)
                            allele_id = rtg_id
                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations

                            # the public name will come from elsewhere
                            geno.addSequenceAlteration(allele_id, None)
                            vl_id = '_:' + '-'.join((gene_num, allele_num))
                            geno.addSequenceAlterationToVariantLocus(
                                allele_id, vl_id)
                            geno.addAlleleOfGene(vl_id, gene_id)
                        else:
                            logger.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(g, self.name, allele_id, phenotype_id)

                        if eco_id is not None:
                            assoc.add_evidence(eco_id)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            reference = Reference(g, ref)
                            if re.search(r'Person', ref):
                                reference.setType(
                                    reference.ref_types['person'])
                                # also add
                                # inferred from background scientific knowledge
                                assoc.add_evidence('ECO:0000001')
                            reference.addRefToGraph()
                            assoc.add_source(ref)

                        assoc.add_association_to_graph()

                        # finish looping through all alleles

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Exemplo n.º 14
0
    def _process_disease_model(self, limit):
        """
        Make associations between a disease and fly alleles
        Adds triples to self.graph

        Pulls FBrf to pmid eqs from _flyref_to_pmid
        for 2019_02 maps all but FlyBase:FBrf0211649

        Right now only model_of is processed from DOID_qualifier
        As of 2019_02 release there are also:
        ameliorates
        exacerbates
        DOES NOT ameliorate
        DOES NOT exacerbate
        DOES NOT model
        DOID_qualifier

        :param limit: number of rows to process
        :return: None

        """
        graph = self.graph
        pub_map = self._flyref_to_pmid()
        src_key = 'disease_model'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info("processing disease models")
        col = self.files[src_key]['columns']

        transgenic_alleles = self._get_foreign_transgenic_alleles()

        with gzip.open(raw, 'rt') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            # skip first four lines
            for _ in range(0, 4):
                # file name, generated datetime, db info, blank line
                next(reader)

            row = next(reader)  # headers
            row[0] = row[0][3:]  # uncomment

            self.check_fileheader(col, row)

            for row in reader:
                # File ends with a blank line and a final comment
                if ''.join(row).startswith('#') or not ''.join(row).strip():
                    continue

                allele_id = row[col.index('Allele used in model (FBal ID)')]
                flybase_ref = row[col.index('Reference (FBrf ID)')]
                evidence_or_allele = row[col.index(
                    'Evidence/interacting alleles')]
                doid_id = row[col.index('DO ID')]
                doid_qualifier = row[col.index('DO qualifier')]

                # Skip any foreign transgenic alleles
                if allele_id in transgenic_alleles:
                    continue

                # Rows without an allele id may be "potential models through orthology"
                # We skip these because we can already make that inference
                # http://flybase.org/reports/FBgn0000022
                if not allele_id:
                    continue

                allele_curie = 'FlyBase:' + allele_id
                if doid_qualifier == 'model of':
                    relation = self.globaltt['is model of']
                else:
                    # amelorates, exacerbates, and DOES NOT *
                    continue

                assoc = G2PAssoc(graph, self.name, allele_curie, doid_id,
                                 relation)
                if flybase_ref != '':
                    ref_curie = None
                    try:
                        ref_curie = pub_map[flybase_ref]
                    except KeyError:
                        ref_curie = 'FlyBase:' + flybase_ref

                    assoc.add_source(ref_curie)
                if evidence_or_allele == 'inferred from mutant phenotype':
                    evidence_id = self.globaltt['mutant phenotype evidence']
                    assoc.add_evidence(evidence_id)
                else:
                    assoc.set_description(evidence_or_allele)

                assoc.add_association_to_graph()

                if limit is not None and reader.line_num > limit:
                    break
Exemplo n.º 15
0
    def _process_phenotype_data(self, limit):
        """
        NOTE: If a Strain carries more than one mutation,
        then each Mutation description,
        i.e., the set: (
            Mutation Type - Chromosome - Gene Symbol -
            Gene Name - Allele Symbol - Allele Name)
        will require a separate line.

        Note that MMRRC curates phenotypes to alleles,
        even though they distribute only one file with the
        phenotypes appearing to be associated with a strain.

        So, here we process the allele-to-phenotype relationships separately
        from the strain-to-allele relationships.

        :param limit:
        :return:

        """

        src_key = 'catalog'
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        fname = '/'.join((self.rawdir, self.files[src_key]['file']))

        self.strain_hash = {}
        self.id_label_hash = {}
        genes_with_no_ids = set()
        stem_cell_class = self.globaltt['stem cell']
        mouse_taxon = self.globaltt['Mus musculus']
        geno = Genotype(graph)
        with open(fname, 'r', encoding="utf8") as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            # First line is header not date/version info. This changed recently,
            # apparently as of Sep 2019. Also, 3rd line is no longer blank.
            row = [x.strip() for x in next(reader)]  # messy messy
            col = self.files['catalog']['columns']
            strain_missing_allele = []  # to count the ones w/insufficent info
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                strain_id = row[col.index('STRAIN/STOCK_ID')].strip()
                strain_label = row[col.index('STRAIN/STOCK_DESIGNATION')]
                # strain_type_symbol = row[col.index('STRAIN_TYPE')]
                strain_state = row[col.index('STATE')]
                mgi_allele_id = row[col.index(
                    'MGI_ALLELE_ACCESSION_ID')].strip()
                mgi_allele_symbol = row[col.index('ALLELE_SYMBOL')]
                # mgi_allele_name = row[col.index('ALLELE_NAME')]
                # mutation_type = row[col.index('MUTATION_TYPE')]
                # chrom = row[col.index('CHROMOSOME')]
                mgi_gene_id = row[col.index('MGI_GENE_ACCESSION_ID')].strip()
                mgi_gene_symbol = row[col.index('GENE_SYMBOL')].strip()
                mgi_gene_name = row[col.index('GENE_NAME')]
                # sds_url = row[col.index('SDS_URL')]
                # accepted_date = row[col.index('ACCEPTED_DATE')]
                mpt_ids = row[col.index('MPT_IDS')].strip()
                pubmed_nums = row[col.index('PUBMED_IDS')].strip()
                research_areas = row[col.index('RESEARCH_AREAS')].strip()

                if self.test_mode and (strain_id not in self.test_ids) \
                        or mgi_gene_name == 'withdrawn':
                    continue

                # strip off stuff after the dash -
                # is the holding center important?
                # MMRRC:00001-UNC --> MMRRC:00001
                strain_id = re.sub(r'-\w+$', '', strain_id)

                self.id_label_hash[strain_id] = strain_label

                # get the variant or gene to save for later building of
                # the genotype
                if strain_id not in self.strain_hash:
                    self.strain_hash[strain_id] = {
                        'variants': set(),
                        'genes': set()
                    }

                # flag bad ones
                if mgi_allele_id[:4] != 'MGI:' and mgi_allele_id != '':
                    LOG.error("Erroneous MGI allele id: %s", mgi_allele_id)
                    if mgi_allele_id[:3] == 'MG:':
                        mgi_allele_id = 'MGI:' + mgi_allele_id[3:]
                    else:
                        mgi_allele_id = ''

                if mgi_allele_id != '':
                    self.strain_hash[strain_id]['variants'].add(mgi_allele_id)
                    self.id_label_hash[mgi_allele_id] = mgi_allele_symbol

                    # use the following if needing to add the sequence alteration types
                    # var_type = self.localtt[mutation_type]
                    # make a sequence alteration for this variant locus,
                    # and link the variation type to it
                    # sa_id = '_'+re.sub(r':','',mgi_allele_id)+'SA'
                    # if self.nobnodes:
                    #     sa_id = ':'+sa_id
                    # gu.addIndividualToGraph(g, sa_id, None, var_type)
                    # geno.addSequenceAlterationToVariantLocus(sa_id, mgi_allele_id)

                # scrub out any spaces, fix known issues
                mgi_gene_id = re.sub(r'\s+', '', mgi_gene_id)
                if mgi_gene_id == 'NULL':
                    mgi_gene_id = ''
                elif mgi_gene_id[:7] == 'GeneID:':
                    mgi_gene_id = 'NCBIGene:' + mgi_gene_id[7:]

                if mgi_gene_id != '':
                    try:
                        [curie, localid] = mgi_gene_id.split(':')
                    except ValueError as verror:
                        LOG.warning(
                            "Problem parsing mgi_gene_id %s from file %s: %s",
                            mgi_gene_id, fname, verror)
                    if curie not in ['MGI', 'NCBIGene']:
                        LOG.info("MGI Gene id not recognized: %s", mgi_gene_id)
                    self.strain_hash[strain_id]['genes'].add(mgi_gene_id)
                    self.id_label_hash[mgi_gene_id] = mgi_gene_symbol

                # catch some errors - too many. report summary at the end
                # some things have gene labels, but no identifiers - report
                if mgi_gene_symbol != '' and mgi_gene_id == '':
                    # LOG.error(
                    #    "Gene label with no MGI identifier for strain %s: %s",
                    #    strain_id, mgi_gene_symbol)
                    genes_with_no_ids.add(mgi_gene_symbol)
                    # make a temp id for genes that aren't identified ... err wow.
                    # tmp_gene_id = '_' + mgi_gene_symbol
                    # self.id_label_hash[tmp_gene_id.strip()] = mgi_gene_symbol
                    # self.strain_hash[strain_id]['genes'].add(tmp_gene_id)

                # split apart the mp ids
                # ataxia [MP:0001393] ,hypoactivity [MP:0001402] ...
                # mpt_ids are a comma delimited list
                # labels with MP terms following in brackets
                phenotype_ids = []
                if mpt_ids != '':
                    for lb_mp in mpt_ids.split(r','):
                        lb_mp = lb_mp.strip()
                        if lb_mp[-1:] == ']' and lb_mp[-12:-8] == '[MP:':
                            phenotype_ids.append(lb_mp[-11:-2])

                # pubmed ids are space delimited
                pubmed_ids = []
                if pubmed_nums != '':
                    for pm_num in re.split(r'\s+', pubmed_nums):
                        pmid = 'PMID:' + pm_num.strip()
                        pubmed_ids.append(pmid)
                        ref = Reference(graph, pmid,
                                        self.globaltt['journal article'])
                        ref.addRefToGraph()

                # https://www.mmrrc.org/catalog/sds.php?mmrrc_id=00001
                # is a good example of 4 genotype parts

                model.addClassToGraph(mouse_taxon, None)
                if research_areas == '':
                    research_areas = None
                else:
                    research_areas = 'Research Areas: ' + research_areas
                strain_type = mouse_taxon
                if strain_state == 'ES':
                    strain_type = stem_cell_class
                model.addIndividualToGraph(  # an inst of mouse??
                    strain_id, strain_label, strain_type, research_areas)
                model.makeLeader(strain_id)

                # phenotypes are associated with the alleles
                for pid in phenotype_ids:
                    # assume the phenotype label is in some ontology
                    model.addClassToGraph(pid, None)
                    if mgi_allele_id is not None and mgi_allele_id != '':
                        assoc = G2PAssoc(graph, self.name, mgi_allele_id, pid,
                                         self.globaltt['has phenotype'])
                        for p in pubmed_ids:
                            assoc.add_source(p)
                        assoc.add_association_to_graph()
                    else:
                        # too chatty here. report aggregate
                        # LOG.info("Phenotypes and no allele for %s", strain_id)
                        strain_missing_allele.append(strain_id)

                if not self.test_mode and (limit is not None
                                           and reader.line_num > limit):
                    break

            # report misses
            if strain_missing_allele:
                LOG.info("Phenotypes and no allele for %i strains",
                         len(strain_missing_allele))

            # now that we've collected all of the variant information, build it
            # we don't know their zygosities
            for s in self.strain_hash:
                h = self.strain_hash.get(s)
                variants = h['variants']
                genes = h['genes']
                vl_set = set()
                # make variant loci for each gene
                if variants:
                    for var in variants:
                        vl_id = var.strip()
                        vl_symbol = self.id_label_hash[vl_id]
                        geno.addAllele(vl_id, vl_symbol,
                                       self.globaltt['variant_locus'])
                        vl_set.add(vl_id)
                        if len(variants) == 1 and len(genes) == 1:
                            for gene in genes:
                                geno.addAlleleOfGene(vl_id, gene)
                        else:
                            geno.addAllele(vl_id, vl_symbol)
                else:  # len(vars) == 0
                    # it's just anonymous variants in some gene
                    for gene in genes:
                        vl_id = '_:' + re.sub(r':', '', gene) + '-VL'
                        vl_symbol = self.id_label_hash[gene] + '<?>'
                        self.id_label_hash[vl_id] = vl_symbol
                        geno.addAllele(vl_id, vl_symbol,
                                       self.globaltt['variant_locus'])
                        geno.addGene(gene, self.id_label_hash[gene])
                        geno.addAlleleOfGene(vl_id, gene)
                        vl_set.add(vl_id)

                # make the vslcs
                vl_list = sorted(vl_set)
                vslc_list = []
                for vl in vl_list:
                    # for unknown zygosity
                    vslc_id = re.sub(r'^_', '', vl) + 'U'
                    vslc_id = re.sub(r':', '', vslc_id)
                    vslc_id = '_:' + vslc_id
                    vslc_label = self.id_label_hash[vl] + '/?'
                    self.id_label_hash[vslc_id] = vslc_label
                    vslc_list.append(vslc_id)
                    geno.addPartsToVSLC(vslc_id, vl, None,
                                        self.globaltt['indeterminate'],
                                        self.globaltt['has_variant_part'],
                                        None)
                    model.addIndividualToGraph(
                        vslc_id, vslc_label,
                        self.globaltt['variant single locus complement'])
                if vslc_list:
                    if len(vslc_list) > 1:
                        gvc_id = '-'.join(vslc_list)
                        gvc_id = re.sub(r'_|:', '', gvc_id)
                        gvc_id = '_:' + gvc_id
                        gvc_label = '; '.join(self.id_label_hash[v]
                                              for v in vslc_list)
                        model.addIndividualToGraph(
                            gvc_id, gvc_label,
                            self.globaltt['genomic_variation_complement'])
                        for vslc_id in vslc_list:
                            geno.addVSLCtoParent(vslc_id, gvc_id)
                    else:
                        # the GVC == VSLC, so don't have to make an extra piece
                        gvc_id = vslc_list.pop()
                        gvc_label = self.id_label_hash[gvc_id]

                    genotype_label = gvc_label + ' [n.s.]'
                    bkgd_id = re.sub(
                        r':', '', '-'.join(
                            (self.globaltt['unspecified_genomic_background'],
                             s)))
                    genotype_id = '-'.join((gvc_id, bkgd_id))
                    bkgd_id = '_:' + bkgd_id
                    geno.addTaxon(mouse_taxon, bkgd_id)
                    geno.addGenomicBackground(
                        bkgd_id, 'unspecified (' + s + ')',
                        self.globaltt['unspecified_genomic_background'],
                        "A placeholder for the unspecified genetic background for "
                        + s)
                    geno.addGenomicBackgroundToGenotype(
                        bkgd_id, genotype_id,
                        self.globaltt['unspecified_genomic_background'])
                    geno.addParts(gvc_id, genotype_id,
                                  self.globaltt['has_variant_part'])
                    geno.addGenotype(genotype_id, genotype_label)
                    graph.addTriple(s, self.globaltt['has_genotype'],
                                    genotype_id)
                else:
                    # LOG.debug(
                    #   "Strain %s is not making a proper genotype.", s)
                    pass

            LOG.warning(
                "The following gene symbols did not list identifiers: %s",
                str(sorted(list(genes_with_no_ids))))
            LOG.error('%i symbols given are missing their gene identifiers',
                      len(genes_with_no_ids))

        return
Exemplo n.º 16
0
    def process_gaf(self, gaffile, limit, id_map=None, eco_map=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", gaffile)
        uniprot_hit = 0
        uniprot_miss = 0
        col = self.gaf_columns

        with gzip.open(gaffile, 'rb') as csvfile:
            reader = csv.reader(
                io.TextIOWrapper(csvfile, newline=""), delimiter='\t', quotechar='\"')
            for row in reader:
                # comments start with exclamation
                if row[0][0] == '!':
                    continue

                if len(row) != len(col):
                    LOG.error(
                        "Wrong number of columns %i, expected ... got:\n\t%s",
                        len(col), row)
                    exit(1)

                dbase = row[col.index('DB')].strip()
                gene_num = row[col.index('DB_Object_ID')].strip()
                gene_symbol = row[col.index('DB_Object_Symbol')].strip()
                qualifier = row[col.index('Qualifier')]
                go_id = row[col.index('GO_ID')].strip()
                ref = row[col.index('DB:Reference')].strip()
                eco_symbol = row[col.index('Evidence Code')].strip()
                with_or_from = row[col.index('With (or) From')]
                aspect = row[col.index('Aspect')].strip()
                gene_name = row[col.index('DB_Object_Name')]
                gene_synonym = row[col.index('DB_Object_Synonym')]
                # object_type = row[col.index('DB_Object_Type')].strip()
                taxon = row[col.index('Taxon and Interacting taxon')].strip()
                # date = row[col.index('Date')].strip()
                # assigned_by = row[col.index('Assigned_By')].strip()
                # annotation_extension = row[col.index('Annotation_Extension')]
                # gene_product_form_id = row[col.index('Gene_Product_Form_ID')]

                # test for required fields
                if '' in [row[:10], row[12]]:
                    LOG.error(
                        "Missing required part of annotation on row %i:\n%s",
                        reader.line_num, str(row[:-4]))
                    continue

                # (Don't) deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and gene_id[:9] != 'NCBIGene:' and\
                        gene_num not in self.test_ids:
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        syn = syn.strip()
                        if syn[:10] == 'UniProtKB:':
                            model.addTriple(
                                gene_id, self.globaltt['has gene product'], syn)
                        elif re.fullmatch(graph.curie_regexp, syn) is not None:
                            LOG.warning(
                                'possible curie "%s" as a literal synomym for %s',
                                syn, gene_id)
                            model.addSynonym(gene_id, syn)
                        else:
                            model.addSynonym(gene_id, syn)

                for txid in taxon.split('|'):
                    tax_curie = re.sub(r'taxon:', 'NCBITaxon:', txid)
                    geno.addTaxon(tax_curie, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to', qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n", str(assoc))

                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                ########################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = with_or_from.split('|')
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for itm in withitems:
                        if itm == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', itm):
                            LOG.warning(
                                "Skipping  %s from or with %s", uniprotid, itm)
                            continue
                        itm = re.sub(r'MGI\:MGI\:', 'MGI:', itm)
                        itm = re.sub(r'WB:', 'WormBase:', itm)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', itm):
                            targeted_gene_id = self.zfin.make_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(itm, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', itm):
                            targeted_gene_id = self.wbase.make_reagent_targeted_gene_id(
                                gene_id, itm)
                            geno.addReagentTargetedGene(itm, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(
                                graph, self.name, targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, itm, phenotypeid)
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[0]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(
                                    self.globaltt['experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be the evidence for the GO assoc?

                if not self.test_mode and limit is not None and \
                        reader.line_num > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
Exemplo n.º 17
0
    def _process_allele_phenotype(self, limit):
        """
        Make allele to phenotype associations using derived_pheno_class
        and derived_pheno_manifest cvterm in the flybase db, an example entry is:

        FBal0257663    @FBcv0000351:lethal@ | @FBcv0000308:female limited@,
                       with @FBal0130657:Scer\GAL4<up>dome-PG14</up>@

        The first term is the phenotype, and all follow up terms are qualifiers,
        self.globaltt['has_qualifier'])

        Our previous approach was to use the genotype id associated with
        FBal0257663/FBal0130657 , however, this required us to create blank
        nodes and was considered unnecessarily granular

        Note that sometimes identifiers do not exist for a term, eg
        @:heat sensitive | tetracycline conditional@

        derived_pheno_class - FBcv terms, these are phenotypes
        derived_pheno_manifest -  Anatomy terms FBbt, we currently
        make phenotype IRI equivalents that end up in UPheno, but
        this is being developed and updated, see
        https://github.com/monarch-initiative/dipper/issues/770

        Adds triples to self.graph

        :param limit: number of rows to process
        :return: None

        """
        model = Model(self.graph)
        src_key = 'allele_phenotype'
        raw = '/'.join((self.rawdir, self.queries[src_key]['file']))
        LOG.info("processing allele phenotype associations")
        col = self.queries[src_key]['columns']

        transgenic_alleles = self._get_foreign_transgenic_alleles()

        # flybase terms - terms we prefix with FlyBase:
        fly_prefixes = ['FBal', 'FBti', 'FBab', 'FBba', 'FBtp']

        # a alphanumeric id followed by a colon then
        # any character but a colon bordered by @s
        term_regex = re.compile(r'@([\w]*):([^:@]*)@')
        id_regex = re.compile(r'([a-zA-Z]+)(\d+)')

        with open(raw, 'r') as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            row = next(reader)  # headers
            self.check_fileheader(col, row)

            for row in reader:
                allele_id = row[col.index('allele_id')]
                pheno_desc = row[col.index('pheno_desc')]
                pheno_type = row[col.index('pheno_type')]
                pub_id = row[col.index('pub_id')]
                pub_title = row[col.index('pub_title')]
                pmid_id = row[col.index('pmid_id')]

                # Don't get phenotypes for transgenic alleles
                if allele_id in transgenic_alleles:
                    continue

                allele_curie = 'FlyBase:' + allele_id

                terms = re.findall(term_regex, pheno_desc)

                if not terms:
                    LOG.warning('Could not @terms@ in description: %s',
                                pheno_desc)
                    continue

                term_ids, term_labels = zip(*terms)
                id_match = re.match(id_regex, term_ids[0])

                if id_match is not None:
                    prefix, reference = id_match.group(1, 2)
                else:
                    raise ValueError("Could not parse id {}".format(
                        term_ids[0]))

                # derived_pheno_class should all start with a FBcv term
                if pheno_type == 'derived_pheno_class' and prefix != 'FBcv':
                    LOG.warning(
                        'derived_pheno_class does not '
                        'start with FBcv: %s', pheno_desc)
                    continue

                # Create phenotype curie
                if pheno_type == 'derived_pheno_class':
                    phenotype_curie = prefix + ':' + reference
                elif pheno_type == 'derived_pheno_manifest':
                    # These are not proper FBcv phenotype terms
                    # but rather anatomical entities, go terms, sometimes free text
                    # skip parsing for now
                    continue
                else:
                    raise ValueError(
                        "Unexpected phenotype type: {}".format(pheno_type))

                if pmid_id:
                    ref_curie = 'PMID:' + pmid_id
                else:
                    ref_curie = 'FlyBase:' + pub_id
                    reference = Reference(self.graph, ref_curie)
                    reference.setTitle(pub_title)
                    reference.addRefToGraph()

                assoc = G2PAssoc(self.graph, self.name, allele_curie,
                                 phenotype_curie,
                                 self.globaltt['has phenotype'])
                assoc.add_source(ref_curie)
                # Associations need to be disambiguated via their qualifiers
                # see http://flybase.org/reports/FBal0207398 as an example
                assoc.set_association_id(
                    assoc.make_association_id(self.name, allele_curie,
                                              self.globaltt['has phenotype'],
                                              phenotype_curie, term_ids[1:]))
                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                # add the rest as qualifiers
                for term in term_ids[1:]:
                    if term:
                        # FBal, GO, FBti, FBab ...
                        id_match = re.match(id_regex, term)
                        if id_match is not None:
                            prefix, reference = id_match.group(1, 2)
                            if prefix in fly_prefixes:
                                term_curie = 'FlyBase:' + term
                            else:
                                term_curie = prefix + ':' + reference
                        else:
                            raise ValueError(
                                "Could not parse id {}".format(term))

                    else:
                        # There is not an id for a term,
                        # eg @:heat sensitive | tetracycline conditional@
                        continue

                    self.graph.addTriple(assoc_id,
                                         self.globaltt['has_qualifier'],
                                         term_curie)

                if limit is not None and reader.line_num > limit:
                    break
Exemplo n.º 18
0
    def _process_kegg_disease2gene(self, limit=None):
        """
        This method creates an association between diseases and
        their associated genes. We are being conservative here, and only
        processing those diseases for which there is no mapping to OMIM.

        Triples created:
        <alternate_locus> is an Individual
        <alternate_locus> has type <variant_locus>
        <alternate_locus> is an allele of  <gene_id>

        <assoc_id> has subject <disease_id>
        <assoc_id> has object <gene_id>
        :param limit:
        :return:

        """

        LOG.info("Processing KEGG disease to gene")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        line_counter = 0
        geno = Genotype(graph)
        rel = self.globaltt['is marker for']
        noomimset = set()
        raw = '/'.join((self.rawdir, self.files['disease_gene']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (gene_id, disease_id) = row

                if self.test_mode and gene_id not in self.test_ids['genes']:
                    continue

                gene_id = 'KEGG-' + gene_id.strip()
                disease_id = 'KEGG-' + disease_id.strip()

                # only add diseases for which
                # there is no omim id and not a grouping class
                if disease_id not in self.kegg_disease_hash:
                    # add as a class
                    disease_label = None
                    if disease_id in self.label_hash:
                        disease_label = self.label_hash[disease_id]
                    if re.search(r'includ', str(disease_label)):
                        # they use 'including' when it's a grouping class
                        LOG.info(
                            "Skipping this association because " +
                            "it's a grouping class: %s",
                            disease_label)
                        continue
                    # type this disease_id as a disease
                    model.addClassToGraph(
                        disease_id, disease_label, self.globaltt['disease'])
                    noomimset.add(disease_id)
                    alt_locus_id = self._make_variant_locus_id(gene_id, disease_id)
                    alt_label = self.label_hash[alt_locus_id]
                    model.addIndividualToGraph(
                        alt_locus_id, alt_label, self.globaltt['variant_locus'])
                    geno.addAffectedLocus(alt_locus_id, gene_id)
                    model.addBlankNodeAnnotation(alt_locus_id)
                    # Add the disease to gene relationship.
                    assoc = G2PAssoc(graph, self.name, alt_locus_id, disease_id, rel)
                    assoc.add_association_to_graph()

                if (not self.test_mode) and (limit is not None and line_counter > limit):
                    break

        LOG.info("Done with KEGG disease to gene")
        LOG.info("Found %d diseases with no omim id", len(noomimset))

        return
Exemplo n.º 19
0
    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """
        src_key = 'allele_pheno'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        col = self.files[src_key]['columns']

        graph = self.graph
        model = Model(self.graph)

        LOG.info("Processing Allele phenotype associations")
        geno = Genotype(graph)
        with open(raw, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            row = next(reader)
            if row[0] != '!gaf-version: 2.0':
                LOG.error('Not a vlaid gaf v2.0 formatted file: %s', raw)
                # raise
            for row in reader:
                if row[0][0] == '!':
                    continue
                # db = row[col.index('DB')]
                gene_num = row[col.index('DB Object ID')]
                # gene_symbol = row[col.index('DB Object Symbol')]
                is_not = row[col.index('Qualifier')]
                phenotype_id = row[col.index('GO ID')]
                ref = row[col.index('DB:Reference (|DB:Reference)')].strip()
                eco_symbol = row[col.index('Evidence Code')]
                with_or_from = row[col.index('With (or) From')]
                # aspect = row[col.index('Aspect')]
                # gene_name = row[col.index('DB Object Name')]
                # gene_synonym = row[col.index('DB Object Synonym (|Synonym)')]
                # gene_class = row[col.index('DB Object Type')]
                # taxon = row[col.index('Taxon(|taxon)')]
                # date = row[col.index('Date')]
                # assigned_by = row[col.index('Assigned By')]
                # blank = row[col.index('Annotation Extension')]
                # blank2 = row[col.index('Gene Product Form ID')]

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_symbol = eco_symbol.strip()
                eco_curie = None
                if eco_symbol.strip() != '' and eco_symbol in self.gaf_eco:
                    eco_curie = self.gaf_eco[eco_symbol]
                else:
                    LOG.warning(
                        'Evidence code %s is not found in the (gaf) gaf_eco',
                        eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    LOG.error(
                        "Missing alleles from phenotype assoc at line %d",
                        reader.line_num)
                    continue
                else:
                    for allele in allele_list:
                        allele_num = re.sub(r'WB:', '', allele.strip())
                        allele_id = 'WormBase:' + allele_num
                        gene_id = 'WormBase:' + gene_num

                        if re.search(r'WBRNAi', allele_id):

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling
                            # make the WormBase:WBRNAi* id
                            # a self.globaltt['reagent_targeted_gene'], and attach
                            # phenotype to this ID

                            # Previous model - make a bnode reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            # rnai_num = re.sub(r'WormBase:', '', allele_id)
                            # rnai_id = allele_id
                            # rtg_id = self.make_reagent_targeted_gene_id(
                            #    gene_num, rnai_num)
                            # geno.addReagentTargetedGene(
                            #    rnai_id, 'WormBase:' + gene_num, rtg_id)
                            # allele_id = rtg_id

                            # Could type the IRI as both the reagant and reagant
                            # targeted gene but not sure if this needed
                            # geno.addGeneTargetingReagent(
                            #   allele_id, None, self.globaltt['RNAi_reagent'], gene_id)

                            model.addIndividualToGraph(
                                allele_id, None,
                                self.globaltt['reagent_targeted_gene'])

                            self.graph.addTriple(
                                allele_id,
                                self.globaltt['is_expression_variant_of'],
                                gene_id)

                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations
                            # the public name will come from elsewhere

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling, treat variant
                            # like an allele
                            # vl_id = '_:'+'-'.join((gene_num, allele_num))
                            # geno.addSequenceAlterationToVariantLocus(
                            #    allele_id, vl_id)
                            # geno.addAlleleOfGene(vl_id, gene_id)

                            geno.addSequenceAlteration(allele_id, None)
                            geno.addAlleleOfGene(allele_id, gene_id)
                        else:
                            LOG.warning(
                                "Some kind of allele I don't recognize: %s",
                                allele_num)
                            continue
                        assoc = G2PAssoc(graph, self.name, allele_id,
                                         phenotype_id)

                        if eco_curie is not None:
                            assoc.add_evidence(eco_curie)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            reference = Reference(graph, ref)
                            if re.search(r'Person', ref):
                                reference.setType(self.globaltt['person'])
                                assoc.add_evidence(self.globaltt[
                                    'inference from background scientific knowledge']
                                                   )
                            reference.addRefToGraph()
                            assoc.add_source(ref)

                        assoc.add_association_to_graph()

                        # finish looping through all alleles

                if limit is not None and reader.line_num > limit:
                    break
Exemplo n.º 20
0
    def _process_omim2gene(self, limit=None):
        """
        This method maps the OMIM IDs and KEGG gene ID.
        Currently split based on the link_type field.
        Equivalent link types are mapped as gene XRefs.
        Reverse link types are mapped as disease to gene associations.
        Original link types are currently skipped.

        Triples created:
        <kegg_gene_id> is a Gene
        <omim_gene_id> is a Gene
        <kegg_gene_id>> hasXref <omim_gene_id>

        <assoc_id> has subject <omim_disease_id>
        <assoc_id> has object <kegg_gene_id>
        :param limit:

        :return:
        """

        LOG.info("Processing OMIM to KEGG gene")
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        line_counter = 0
        geno = Genotype(graph)
        raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (kegg_gene_id, omim_id, link_type) = row

                if self.test_mode and kegg_gene_id not in self.test_ids['genes']:
                    continue

                kegg_gene_id = 'KEGG-' + kegg_gene_id.strip()
                omim_id = re.sub(r'omim', 'OMIM', omim_id)
                if link_type == 'equivalent':
                    # these are genes!
                    # so add them as a class then make equivalence
                    model.addClassToGraph(omim_id, None)
                    geno.addGene(kegg_gene_id, None)
                    if not DipperUtil.is_omim_disease(omim_id):
                        model.addEquivalentClass(kegg_gene_id, omim_id)
                elif link_type == 'reverse':
                    # make an association between an OMIM ID & the KEGG gene ID
                    # we do this with omim ids because
                    # they are more atomic than KEGG ids

                    alt_locus_id = self._make_variant_locus_id(kegg_gene_id, omim_id)
                    alt_label = self.label_hash[alt_locus_id]
                    model.addIndividualToGraph(
                        alt_locus_id, alt_label, self.globaltt['variant_locus'])
                    geno.addAffectedLocus(alt_locus_id, kegg_gene_id)
                    model.addBlankNodeAnnotation(alt_locus_id)

                    # Add the disease to gene relationship.
                    rel = self.globaltt['is marker for']
                    assoc = G2PAssoc(graph, self.name, alt_locus_id, omim_id, rel)
                    assoc.add_association_to_graph()

                elif link_type == 'original':
                    # these are sometimes a gene, and sometimes a disease
                    LOG.info(
                        'Unable to handle original link for %s-%s',
                        kegg_gene_id, omim_id)
                else:
                    # don't know what these are
                    LOG.warning(
                        'Unhandled link type for %s-%s: %s',
                        kegg_gene_id, omim_id, link_type)

                if (not self.test_mode) and (
                        limit is not None and line_counter > limit):
                    break

        LOG.info("Done with OMIM to KEGG gene")

        return
Exemplo n.º 21
0
    def _process_data(self, source, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

        line_id a CL_0000057,  #fibroblast line
            derives_from patient_id
            part_of :NIGMSrepository
            RO:model_of OMIM:disease_id

        patient id a foaf:person,
            label: "fibroblast from patient 12345 with disease X"
            member_of family_id  #what is the right thing here?
            SIO:race EFO:caucasian  #subclass of EFO:0001799
            in_taxon NCBITaxon:9606
            dc:description Literal(remark)
            RO:has_phenotype OMIM:disease_id
            GENO:has_genotype genotype_id

        family_id a owl:NamedIndividual
            foaf:page
             "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

        genotype_id a intrinsic_genotype
            GENO:has_alternate_part allelic_variant_id
            we don't necessarily know much about the genotype,
            other than the allelic variant. also there's the sex here

        pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files[source]['file']))

        LOG.info("Processing Data from %s", raw)

        if self.testMode:  # set the graph to build
            graph = self.testgraph
        else:
            graph = self.graph

        family = Family(graph)
        model = Model(graph)

        line_counter = 1
        geno = Genotype(graph)
        diputil = DipperUtil()
        col = self.files[source]['columns']
        # affords access with
        # x = row[col.index('x')].strip()

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
            # we can keep a close watch on changing file formats
            fileheader = next(filereader, None)
            fileheader = [c.lower() for c in fileheader]
            if col != fileheader:  # assert
                LOG.error('Expected  %s to have columns: %s', raw, col)
                LOG.error('But Found %s to have columns: %s', raw, fileheader)
                raise AssertionError('Incomming data headers have changed.')

            for row in filereader:
                line_counter += 1
                if len(row) != len(col):
                    LOG.warning('Expected %i values but find %i in  row %i',
                                len(col), len(row), line_counter)
                    continue

                # (catalog_id, description, omim_number, sample_type,
                # cell_line_available, dna_in_stock, dna_ref, gender, age,
                # race, ethnicity, affected, karyotype, relprob, mutation,
                # gene, family_id, collection, url, cat_remark, pubmed_ids,
                # family_member, variant_id, dbsnp_id, species) = row

                # example:
                # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
                #       ,Female,26 YR,Caucasian,,,,
                # parent,,,39,NIGMS Human Genetic Cell Repository,
                # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                # 46;XX; clinically normal mother of a child with Hurler syndrome;
                #       proband not in Repository,,
                # 2,,18343,H**o sapiens

                catalog_id = row[col.index('catalog_id')].strip()

                if self.testMode and catalog_id not in self.test_lines:
                    # skip rows not in our test lines, when in test mode
                    continue

                # ###########    BUILD REQUIRED VARIABLES    ###########

                # Make the cell line ID
                cell_line_id = 'Coriell:' + catalog_id
                # Map the cell/sample type
                cell_type = self.resolve(row[col.index('sample_type')].strip())
                # on fail cell_type = self.globaltt['cell'] ?

                # Make a cell line label
                collection = row[col.index('collection')].strip()
                line_label = collection.partition(' ')[0] + '-' + catalog_id

                # Map the repository/collection
                repository = self.localtt[collection]

                # patients are uniquely identified by one of:
                # dbsnp id (which is == an individual haplotype)
                # family id + family member (if present) OR
                # probands are usually family member zero
                # cell line id
                # since some patients have >1 cell line derived from them,
                # we must make sure that the genotype is attached to
                # the patient, and can be inferred to the cell line
                # examples of repeated patients are:
                #   famid=1159, member=1; fam=152,member=1

                # Make the patient ID

                # make an anonymous patient
                patient_id = '_:person'
                fam_id = row[col.index('fam')].strip()
                fammember = row[col.index('fammember')].strip()
                if fam_id != '':
                    patient_id = '-'.join((patient_id, fam_id, fammember))
                else:
                    # make an anonymous patient
                    patient_id = '-'.join((patient_id, catalog_id))

                # properties of the individual patients:  sex, family id,
                # member/relproband, description descriptions are
                # really long and ugly SCREAMING text, so need to clean up
                # the control cases are so odd with this labeling scheme;
                # but we'll deal with it as-is for now.
                description = row[col.index('description')].strip()
                short_desc = (description.split(';')[0]).capitalize()

                gender = row[col.index('gender')].strip().lower()
                affected = row[col.index('affected')].strip()
                relprob = row[col.index('relprob')].strip()

                if affected == '':
                    affected = 'unspecified'
                elif affected in self.localtt:
                    affected = self.localtt[affected]
                else:
                    LOG.warning('Novel Affected status  %s at row: %i of %s',
                                affected, line_counter, raw)
                patient_label = ' '.join((affected, gender, relprob))
                if relprob == 'proband':
                    patient_label = ' '.join(
                        (patient_label.strip(), 'with', short_desc))
                else:
                    patient_label = ' '.join(
                        (patient_label.strip(), 'of proband with', short_desc))

                # #############    BUILD THE CELL LINE    #############

                # Adding the cell line as a typed individual.
                cell_line_reagent_id = self.globaltt['cell line']

                model.addIndividualToGraph(cell_line_id, line_label,
                                           cell_line_reagent_id)

                # add the equivalent id == dna_ref
                dna_ref = row[col.index('dna_ref')].strip()
                if dna_ref != '' and dna_ref != catalog_id:
                    equiv_cell_line = 'Coriell:' + dna_ref
                    # some of the equivalent ids are not defined
                    # in the source data; so add them
                    model.addIndividualToGraph(equiv_cell_line, None,
                                               cell_line_reagent_id)
                    model.addSameIndividual(cell_line_id, equiv_cell_line)

                # Cell line derives from patient
                geno.addDerivesFrom(cell_line_id, patient_id)
                geno.addDerivesFrom(cell_line_id, cell_type)

                # Cell line a member of repository
                family.addMember(repository, cell_line_id)

                cat_remark = row[col.index('cat_remark')].strip()

                if cat_remark != '':
                    model.addDescription(cell_line_id, cat_remark)

                # Cell age_at_sampling
                # TODO add the age nodes when modeled properly in #78
                # if (age != ''):
                # this would give a BNode that is an instance of Age.
                # but i don't know how to connect
                # the age node to the cell line? we need to ask @mbrush
                # age_id = '_'+re.sub('\s+','_',age)
                # gu.addIndividualToGraph(
                #   graph,age_id,age,self.globaltt['age'])
                # gu.addTriple(
                #   graph,age_id,self.globaltt['has measurement value'],age,
                #   True)

                # #############    BUILD THE PATIENT    #############

                # Add the patient ID as an individual.
                model.addPerson(patient_id, patient_label)
                # TODO map relationship to proband as a class
                # (what ontology?)

                # Add race of patient
                # FIXME: Adjust for subcategories based on ethnicity field
                # EDIT: There are 743 different entries for ethnicity...
                # Too many to map?
                # Add ethnicity as literal in addition to the mapped race?
                # Adjust the ethnicity txt (if using)
                # to initial capitalization to remove ALLCAPS

                # TODO race should go into the individual's background
                # and abstracted out to the Genotype class punting for now.
                # if race != '':
                #    mapped_race = self.resolve(race)
                #    if mapped_race is not None:
                #        gu.addTriple(
                #           g,patient_id,self.globaltt['race'], mapped_race)
                #        model.addSubClass(
                #           mapped_race,self.globaltt['ethnic_group'])

                # #############    BUILD THE FAMILY    #############

                # Add triples for family_id, if present.
                if fam_id != '':
                    family_comp_id = 'CoriellFamily:' + fam_id

                    family_label = ' '.join(
                        ('Family of proband with', short_desc))

                    # Add the family ID as a named individual
                    model.addIndividualToGraph(family_comp_id, family_label,
                                               self.globaltt['family'])

                    # Add the patient as a member of the family
                    family.addMemberOf(patient_id, family_comp_id)

                # #############    BUILD THE GENOTYPE   #############

                # the important things to pay attention to here are:
                # karyotype = chr rearrangements  (somatic?)
                # mutation = protein-level mutation as a label,
                # often from omim
                # gene = gene symbol - TODO get id
                # variant_id = omim variant ids (; delimited)
                # dbsnp_id = snp individual ids = full genotype?

                # note GM00633 is a good example of chromosomal variation
                # - do we have enough to capture this?
                # GM00325 has both abnormal karyotype and variation

                # make an assumption that if the taxon is blank,
                # that it is human!
                species = row[col.index('species')].strip()
                if species is None or species == '':
                    species = 'H**o sapiens'
                taxon = self.resolve(species)

                # if there's a dbSNP id,
                # this is actually the individual's genotype
                genotype_id = None
                genotype_label = None

                dbsnp_id = row[col.index('dbsnp_id')].strip()
                if dbsnp_id != '':
                    genotype_id = 'dbSNPIndividual:' + dbsnp_id

                omim_map = {}
                gvc_id = None

                # some of the karyotypes are encoded
                # with terrible hidden codes. remove them here
                # i've seen a <98> character
                karyotype = row[col.index('karyotype')].strip()
                karyotype = diputil.remove_control_characters(karyotype)
                karyotype_id = None
                if karyotype.strip() != '':
                    karyotype_id = '_:' + re.sub('MONARCH:', '',
                                                 self.make_id(karyotype))
                    # add karyotype as karyotype_variation_complement
                    model.addIndividualToGraph(
                        karyotype_id, karyotype,
                        self.globaltt['karyotype_variation_complement'])
                    # TODO break down the karyotype into parts
                    # and map into GENO. depends on #77

                    # place the karyotype in a location(s).
                    karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                        karyotype)
                    for chrom in karyo_chrs:
                        chr_id = makeChromID(chrom, taxon, 'CHR')
                        # add an anonymous sequence feature,
                        # each located on chr
                        karyotype_feature_id = '-'.join((karyotype_id, chrom))
                        karyotype_feature_label = \
                            'some karyotype alteration on chr' + str(chrom)
                        feat = Feature(graph, karyotype_feature_id,
                                       karyotype_feature_label,
                                       self.globaltt['sequence_alteration'])
                        feat.addFeatureStartLocation(None, chr_id)
                        feat.addFeatureToGraph()
                        geno.addParts(karyotype_feature_id, karyotype_id,
                                      self.globaltt['has_variant_part'])

                gene = row[col.index('gene')].strip()
                mutation = row[col.index('mutation')].strip()
                if gene != '':
                    vl = gene + '(' + mutation + ')'

                # fix the variant_id so it's always in the same order
                variant_id = row[col.index('variant_id')].strip()
                vids = variant_id.split(';')
                variant_id = ';'.join(sorted(list(set(vids))))

                if karyotype.strip() != '' and not self._is_normal_karyotype(
                        karyotype):

                    gvc_id = karyotype_id
                    if variant_id != '':
                        gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                            + re.sub(r'\w*:', '', karyotype_id)
                    if mutation.strip() != '':
                        gvc_label = '; '.join((vl, karyotype))
                    else:
                        gvc_label = karyotype
                elif variant_id.strip() != '':
                    gvc_id = '_:' + variant_id.replace(';', '-')
                    gvc_label = vl
                else:
                    # wildtype?
                    pass

                # add the karyotype to the gvc.
                # use reference if normal karyotype
                karyo_rel = self.globaltt['has_variant_part']
                if self._is_normal_karyotype(karyotype):
                    karyo_rel = self.globaltt['has_reference_part']
                if karyotype_id is not None \
                        and not self._is_normal_karyotype(karyotype) \
                        and gvc_id is not None and karyotype_id != gvc_id:
                    geno.addParts(karyotype_id, gvc_id, karyo_rel)

                if variant_id.strip() != '':
                    # split the variants & add them as part of the genotype
                    # we don't necessarily know their zygosity,
                    # just that they are part of the genotype variant ids
                    # are from OMIM, so prefix as such we assume that the
                    # sequence alts will be defined in OMIM not here
                    # TODO sort the variant_id list, if the omim prefix is
                    # the same, then assume it's the locus make a hashmap
                    # of the omim id to variant id list;
                    # then build the genotype hashmap is also useful for
                    # removing the "genes" from the list of "phenotypes"

                    # will hold gene/locus id to variant list
                    omim_map = {}

                    locus_num = None
                    for var in variant_id.split(';'):
                        # handle omim-style and odd var ids
                        # like 610661.p.R401X
                        mch = re.match(r'(\d+)\.+(.*)', var.strip())
                        if mch is not None and len(mch.groups()) == 2:
                            (locus_num, var_num) = mch.groups()

                        if locus_num is not None and locus_num not in omim_map:
                            omim_map[locus_num] = [var_num]
                        else:
                            omim_map[locus_num] += [var_num]

                    for omim in omim_map:
                        # gene_id = 'OMIM:' + omim  # TODO unused
                        vslc_id = '_:' + '-'.join(
                            [omim + '.' + a for a in omim_map.get(omim)])
                        vslc_label = vl
                        # we don't really know the zygosity of
                        # the alleles at all.
                        # so the vslcs are just a pot of them
                        model.addIndividualToGraph(
                            vslc_id, vslc_label,
                            self.globaltt['variant single locus complement'])
                        for var in omim_map.get(omim):
                            # this is actually a sequence alt
                            allele1_id = 'OMIM:' + omim + '.' + var
                            geno.addSequenceAlteration(allele1_id, None)

                            # assume that the sa -> var_loc -> gene
                            # is taken care of in OMIM
                            geno.addPartsToVSLC(
                                vslc_id, allele1_id, None,
                                self.globaltt['indeterminate'],
                                self.globaltt['has_variant_part'])

                        if vslc_id != gvc_id:
                            geno.addVSLCtoParent(vslc_id, gvc_id)

                if affected == 'unaffected':
                    # let's just say that this person is wildtype
                    model.addType(patient_id, self.globaltt['wildtype'])
                elif genotype_id is None:
                    # make an anonymous genotype id (aka blank node)
                    genotype_id = '_:geno' + catalog_id.strip()

                # add the gvc
                if gvc_id is not None:
                    model.addIndividualToGraph(
                        gvc_id, gvc_label,
                        self.globaltt['genomic_variation_complement'])

                    # add the gvc to the genotype
                    if genotype_id is not None:
                        if affected == 'unaffected':
                            rel = self.globaltt['has_reference_part']
                        else:
                            rel = self.globaltt['has_variant_part']
                        geno.addParts(gvc_id, genotype_id, rel)

                    if karyotype_id is not None \
                            and self._is_normal_karyotype(karyotype):
                        if gvc_label is not None and gvc_label != '':
                            genotype_label = '; '.join((gvc_label, karyotype))
                        elif karyotype is not None:
                            genotype_label = karyotype
                        if genotype_id is None:
                            genotype_id = karyotype_id
                        else:
                            geno.addParts(karyotype_id, genotype_id,
                                          self.globaltt['has_reference_part'])
                    else:
                        genotype_label = gvc_label
                        # use the catalog id as the background
                    genotype_label += ' [' + catalog_id.strip() + ']'

                if genotype_id is not None and gvc_id is not None:
                    # only add the genotype if it has some parts
                    geno.addGenotype(genotype_id, genotype_label,
                                     self.globaltt['intrinsic_genotype'])
                    geno.addTaxon(taxon, genotype_id)
                    # add that the patient has the genotype
                    # TODO check if the genotype belongs to
                    # the cell line or to the patient
                    graph.addTriple(patient_id, self.globaltt['has_genotype'],
                                    genotype_id)
                else:
                    geno.addTaxon(taxon, patient_id)

                # TODO: Add sex/gender  (as part of the karyotype?)
                # = row[col.index('')].strip()
                # #############    DEAL WITH THE DISEASES   #############
                omim_num = row[col.index('omim_num')].strip()

                # we associate the disease to the patient
                if affected == 'affected' and omim_num != '':
                    for d in omim_num.split(';'):
                        if d is not None and d != '':
                            # if the omim number is in omim_map,
                            # then it is a gene not a pheno

                            # TEC - another place to use the mimTitle omim
                            # classifier omia & genereviews are using

                            if d not in omim_map:
                                disease_id = 'OMIM:' + d.strip()
                                # assume the label is taken care of in OMIM
                                model.addClassToGraph(disease_id, None)

                                # add the association:
                                #   the patient has the disease
                                assoc = G2PAssoc(graph, self.name, patient_id,
                                                 disease_id)
                                assoc.add_association_to_graph()

                                # this line is a model of this disease
                                # TODO abstract out model into
                                # it's own association class?
                                graph.addTriple(cell_line_id,
                                                self.globaltt['is model of'],
                                                disease_id)
                            else:
                                LOG.info('drop gene %s from disease list', d)

                # #############    ADD PUBLICATIONS   #############
                pubmed_ids = row[col.index('pubmed_ids')].strip()
                if pubmed_ids != '':
                    for s in pubmed_ids.split(';'):
                        pubmed_id = 'PMID:' + s.strip()
                        ref = Reference(graph, pubmed_id)
                        ref.setType(self.globaltt['journal article'])
                        ref.addRefToGraph()
                        graph.addTriple(pubmed_id, self.globaltt['mentions'],
                                        cell_line_id)

                if not self.testMode and (limit is not None
                                          and line_counter > limit):
                    break
        return
Exemplo n.º 22
0
    def _transform_entry(self, e, graph):
        g = graph
        model = Model(g)
        geno = Genotype(graph)

        tax_num = '9606'
        tax_id = 'NCBITaxon:9606'
        tax_label = 'Human'
        build_num = "GRCh38"
        build_id = "NCBIGenome:"+build_num

        # get the numbers, labels, and descriptions
        omimnum = e['entry']['mimNumber']
        titles = e['entry']['titles']
        label = titles['preferredTitle']

        other_labels = []
        if 'alternativeTitles' in titles:
            other_labels += self._get_alt_labels(titles['alternativeTitles'])
        if 'includedTitles' in titles:
            other_labels += self._get_alt_labels(titles['includedTitles'])

        # add synonyms of alternate labels
        # preferredTitle": "PFEIFFER SYNDROME",
        # "alternativeTitles":
        #   "ACROCEPHALOSYNDACTYLY, TYPE V; ACS5;;\nACS V;;\nNOACK SYNDROME",
        # "includedTitles":
        #   "CRANIOFACIAL-SKELETAL-DERMATOLOGIC DYSPLASIA, INCLUDED"

        # remove the abbreviation (comes after the ;) from the preferredTitle,
        # and add it as a synonym
        abbrev = None
        if len(re.split(r';', label)) > 1:
            abbrev = (re.split(r';', label)[1].strip())
        newlabel = self._cleanup_label(label)

        description = self._get_description(e['entry'])
        omimid = 'OMIM:'+str(omimnum)

        if e['entry']['status'] == 'removed':
            model.addDeprecatedClass(omimid)
        else:
            omimtype = self._get_omimtype(e['entry'])
            nodelabel = newlabel
            # this uses our cleaned-up label
            if omimtype == Genotype.genoparts['heritable_phenotypic_marker']:
                if abbrev is not None:
                    nodelabel = abbrev
                # in this special case,
                # make it a disease by not declaring it as a gene/marker
                model.addClassToGraph(omimid, nodelabel, None, newlabel)
            elif omimtype == Genotype.genoparts['gene']:
                if abbrev is not None:
                    nodelabel = abbrev
                model.addClassToGraph(omimid, nodelabel, omimtype, newlabel)
            else:
                model.addClassToGraph(omimid, newlabel, omimtype)

            # add the original screaming-caps OMIM label as a synonym
            model.addSynonym(omimid, label)

            # add the alternate labels and includes as synonyms
            for l in other_labels:
                model.addSynonym(omimid, l, 'OIO:hasRelatedSynonym')

            # for OMIM, we're adding the description as a definition
            model.addDefinition(omimid, description)
            if abbrev is not None:
                model.addSynonym(omimid, abbrev, 'OIO:hasRelatedSynonym')

            # if this is a genetic locus (but not sequenced)
            #   then add the chrom loc info
            # but add it to the ncbi gene identifier,
            # not to the omim id (we reserve the omim id to be the phenotype)
            feature_id = None
            feature_label = None
            if 'geneMapExists' in e['entry'] and e['entry']['geneMapExists']:
                genemap = e['entry']['geneMap']
                is_gene = False

                if omimtype == \
                        Genotype.genoparts['heritable_phenotypic_marker']:
                    # get the ncbigene ids
                    ncbifeature = self._get_mapped_gene_ids(e['entry'], g)
                    if len(ncbifeature) == 1:
                        feature_id = 'NCBIGene:'+str(ncbifeature[0])
                        # add this feature as a cause for the omim disease
                        # TODO SHOULD I EVEN DO THIS HERE?
                        assoc = G2PAssoc(g, self.name, feature_id, omimid)
                        assoc.add_association_to_graph()

                    elif len(ncbifeature) > 1:
                        logger.info(
                            "Its ambiguous when %s maps to >1 gene id: %s",
                            omimid, str(ncbifeature))
                    else:  # no ncbi feature, make an anonymous one
                        feature_id = self._make_anonymous_feature(str(omimnum))
                        feature_label = abbrev

                elif omimtype == Genotype.genoparts['gene']:
                    feature_id = omimid
                    is_gene = True
                else:
                    # 158900 falls into this category
                    feature_id = self._make_anonymous_feature(str(omimnum))
                    if abbrev is not None:
                        feature_label = abbrev
                    omimtype = \
                        Genotype.genoparts[
                            'heritable_phenotypic_marker']

                if feature_id is not None:
                    if 'comments' in genemap:
                        # add a comment to this feature
                        comment = genemap['comments']
                        if comment.strip() != '':
                            model.addDescription(feature_id, comment)
                    if 'cytoLocation' in genemap:
                        cytoloc = genemap['cytoLocation']
                        # parse the cytoloc.
                        # add this omim thing as
                        # a subsequence of the cytofeature
                        # 18p11.3-p11.2
                        # FIXME
                        # add the other end of the range,
                        # but not sure how to do that
                        # not sure if saying subsequence of feature
                        # is the right relationship

                        f = Feature(g, feature_id, feature_label, omimtype)
                        if 'chromosomeSymbol' in genemap:
                            chrom_num = str(genemap['chromosomeSymbol'])
                            chrom = makeChromID(chrom_num, tax_num, 'CHR')
                            geno.addChromosomeClass(
                                chrom_num, tax_id, tax_label)

                            # add the positional information, if available
                            fstart = fend = -1
                            if 'chromosomeLocationStart' in genemap:
                                fstart = genemap['chromosomeLocationStart']
                            if 'chromosomeLocationEnd' in genemap:
                                fend = genemap['chromosomeLocationEnd']
                            if fstart >= 0:
                                # make the build-specific chromosome
                                chrom_in_build = makeChromID(chrom_num,
                                                             build_num,
                                                             'MONARCH')
                                # then, add the chromosome instance
                                # (from the given build)
                                geno.addChromosomeInstance(
                                    chrom_num, build_id, build_num, chrom)
                                if omimtype == \
                                        Genotype.genoparts[
                                            'heritable_phenotypic_marker']:
                                    postypes = [Feature.types['FuzzyPosition']]
                                else:
                                    postypes = None
                                # NOTE that no strand information
                                # is available in the API
                                f.addFeatureStartLocation(
                                    fstart, chrom_in_build, None, postypes)
                                if fend >= 0:
                                    f.addFeatureEndLocation(
                                        fend, chrom_in_build, None, postypes)
                                if fstart > fend:
                                    logger.info(
                                        "start>end (%d>%d) for %s",
                                        fstart, fend, omimid)
                            # add the cytogenic location too
                            # for now, just take the first one
                            cytoloc = cytoloc.split('-')[0]
                            loc = makeChromID(cytoloc, tax_num, 'CHR')
                            model.addClassToGraph(loc, None)
                            f.addSubsequenceOfFeature(loc)
                            f.addFeatureToGraph(True, None, is_gene)

                # end adding causative genes/features

            # check if moved, if so,
            # make it deprecated and
            # replaced consider class to the other thing(s)
            # some entries have been moved to multiple other entries and
            # use the joining raw word "and"
            # 612479 is movedto:  "603075 and 603029"  OR
            # others use a comma-delimited list, like:
            # 610402 is movedto: "609122,300870"
            if e['entry']['status'] == 'moved':
                if re.search(r'and', str(e['entry']['movedTo'])):
                    # split the movedTo entry on 'and'
                    newids = re.split(r'and', str(e['entry']['movedTo']))
                elif len(str(e['entry']['movedTo']).split(',')) > 0:
                    # split on the comma
                    newids = str(e['entry']['movedTo']).split(',')
                else:
                    # make a list of one
                    newids = [str(e['entry']['movedTo'])]
                # cleanup whitespace and add OMIM prefix to numeric portion
                fixedids = []
                for i in newids:
                    fixedids.append('OMIM:'+i.strip())

                model.addDeprecatedClass(omimid, fixedids)

            self._get_phenotypicseries_parents(e['entry'], g)
            self._get_mappedids(e['entry'], g)
            self._get_mapped_gene_ids(e['entry'], g)

            self._get_pubs(e['entry'], g)

            self._get_process_allelic_variants(e['entry'], g)  # temp gag

        return
Exemplo n.º 23
0
    def process_rnai_phenotypes(self, limit=None):

        raw = '/'.join((self.rawdir, self.files['rnai_pheno']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        logger.info("Processing RNAi phenotype associations")
        line_counter = 0
        geno = Genotype(g)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (gene_num, gene_alt_symbol, phenotype_label, phenotype_id,
                 rnai_and_refs) = row
                # WBGene00001908	F17E9.9	locomotion variant	WBPhenotype:0000643	WBRNAi00025129|WBPaper00006395 WBRNAi00025631|WBPaper00006395
                # WBGene00001908	F17E9.9	avoids bacterial lawn	WBPhenotype:0000402	WBRNAi00095640|WBPaper00040984
                # WBGene00001908	F17E9.9	RAB-11 recycling endosome localization variant	WBPhenotype:0002107	WBRNAi00090830|WBPaper00041129

                if self.testMode and gene_num not in self.test_ids['gene']:
                    continue

                gene_id = 'WormBase:' + gene_num
                # refs = list()  # TODO unused

                # the rnai_and_refs has this so that
                # WBRNAi00008687|WBPaper00005654 WBRNAi00025197|WBPaper00006395 WBRNAi00045381|WBPaper00025054
                # space delimited between RNAi sets;
                # then each RNAi should have a paper

                rnai_sets = re.split(r' ', rnai_and_refs)

                for s in rnai_sets:

                    # get the rnai_id
                    (rnai_num, ref_num) = re.split(r'\|', s)
                    if len(re.split(r'\|', s)) > 2:
                        logger.warning(
                            "There's an unexpected number of items in %s", s)
                    if rnai_num not in self.rnai_gene_map:
                        self.rnai_gene_map[rnai_num] = set()

                    # to use for looking up later
                    self.rnai_gene_map[rnai_num].add(gene_num)

                    rnai_id = 'WormBase:' + rnai_num
                    geno.addGeneTargetingReagent(
                        rnai_id, None, geno.genoparts['RNAi_reagent'], gene_id)

                    # make the "allele" of the gene
                    # that is targeted by the reagent
                    allele_id = self.make_reagent_targeted_gene_id(
                        gene_num, rnai_num)
                    allele_label = gene_alt_symbol + '<' + rnai_num + '>'
                    geno.addReagentTargetedGene(rnai_id, gene_id, allele_id,
                                                allele_label)

                    assoc = G2PAssoc(g, self.name, allele_id, phenotype_id)
                    assoc.add_source('WormBase:' + ref_num)
                    # eco_id = 'ECO:0000019'  # RNAi evidence  # TODO unused
                    assoc.add_association_to_graph()

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return
Exemplo n.º 24
0
    def process_allele_phenotype(self, limit=None):
        """
        This file compactly lists variant to phenotype associations,
        such that in a single row, there may be >1 variant listed
        per phenotype and paper.  This indicates that each variant is
        individually assocated with the given phenotype,
        as listed in 1+ papers.
        (Not that the combination of variants is producing the phenotype.)
        :param limit:
        :return:

        """
        raw = '/'.join((self.rawdir, self.files['allele_pheno']['file']))

        graph = self.graph
        model = Model(self.graph)

        LOG.info("Processing Allele phenotype associations")
        line_counter = 0
        geno = Genotype(graph)
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                if re.match(r'!', ''.join(row)):  # header
                    continue
                line_counter += 1
                (db, gene_num, gene_symbol, is_not, phenotype_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 gene_class, taxon, date, assigned_by, blank, blank2) = row

                # TODO add NOT phenotypes
                if is_not == 'NOT':
                    continue

                eco_symbol = eco_symbol.strip()
                eco_id = None
                if eco_symbol.strip() != '':
                    eco_id = self.resolve(eco_symbol)

                # according to the GOA spec, persons are not allowed to be
                # in the reference column, therefore they the variant and
                # persons are swapped between the reference and with column.
                # we unswitch them here.
                temp_var = temp_ref = None
                if re.search(r'WBVar|WBRNAi', ref):
                    temp_var = ref
                    # move the paper from the with column into the ref
                if re.search(r'WBPerson', with_or_from):
                    temp_ref = with_or_from
                if temp_var is not None or temp_ref is not None:
                    with_or_from = temp_var
                    ref = temp_ref

                allele_list = re.split(r'\|', with_or_from)
                if len(allele_list) == 0:
                    LOG.error(
                        "Missing alleles from phenotype assoc at line %d",
                        line_counter)
                    continue
                else:
                    for allele in allele_list:
                        allele_num = re.sub(r'WB:', '', allele.strip())
                        allele_id = 'WormBase:' + allele_num
                        gene_id = 'WormBase:' + gene_num

                        if re.search(r'WBRNAi', allele_id):

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling
                            # make the WormBase:WBRNAi* id
                            # a self.globaltt['reagent_targeted_gene'], and attach
                            # phenotype to this ID

                            # Previous model - make a bnode reagent-targeted gene,
                            # & annotate that instead of the RNAi item directly
                            #rnai_num = re.sub(r'WormBase:', '', allele_id)
                            #rnai_id = allele_id
                            #rtg_id = self.make_reagent_targeted_gene_id(
                            #    gene_num, rnai_num)
                            #geno.addReagentTargetedGene(
                            #    rnai_id, 'WormBase:' + gene_num, rtg_id)
                            # allele_id = rtg_id

                            # Could type the IRI as both the reagant and reagant
                            # targeted gene but not sure if this needed
                            # geno.addGeneTargetingReagent(
                            #   allele_id, None, self.globaltt['RNAi_reagent'], gene_id)

                            model.addIndividualToGraph(
                                allele_id, None,
                                self.globaltt['reagent_targeted_gene'])

                            self.graph.addTriple(
                                allele_id, self.globaltt['is_expression_variant_of'],
                                gene_id)

                        elif re.search(r'WBVar', allele_id):
                            # this may become deprecated by using wormmine
                            # make the allele to gene relationship
                            # the WBVars are really sequence alterations
                            # the public name will come from elsewhere

                            # @kshefchek - removing this blank node
                            # in favor of simpler modeling, treat variant
                            # like an allele
                            #vl_id = '_:'+'-'.join((gene_num, allele_num))
                            #geno.addSequenceAlterationToVariantLocus(
                            #    allele_id, vl_id)
                            #geno.addAlleleOfGene(vl_id, gene_id)

                            geno.addSequenceAlteration(allele_id, None)
                            geno.addAlleleOfGene(allele_id, gene_id)
                        else:
                            LOG.warning(
                                "Some kind of allele I don't recognize: %s", allele_num)
                            continue
                        assoc = G2PAssoc(graph, self.name, allele_id, phenotype_id)

                        if eco_id is not None:
                            assoc.add_evidence(eco_id)

                        if ref is not None and ref != '':
                            ref = re.sub(r'(WB:|WB_REF:)', 'WormBase:', ref)
                            reference = Reference(graph, ref)
                            if re.search(r'Person', ref):
                                reference.setType(self.globaltt['person'])
                                assoc.add_evidence(
                                    self.globaltt[
                                        'inference from background scientific knowledge'
                                    ])
                            reference.addRefToGraph()
                            assoc.add_source(ref)

                        assoc.add_association_to_graph()

                        # finish looping through all alleles

                if limit is not None and line_counter > limit:
                    break

        return
Exemplo n.º 25
0
    def _process_data(self, raw, limit=None):
        LOG.info("Processing Data from %s", raw)

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        geno = Genotype(graph)

        # Add the taxon as a class
        taxon_id = self.globaltt['Mus musculus']
        model.addClassToGraph(taxon_id, None)

        # with open(raw, 'r', encoding="utf8") as csvfile:
        col = self.files['all']['columns']
        with gzip.open(raw, 'rt') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            row = next(reader)  # presumed header
            if not self.check_fileheader(col, row):
                pass

            for row in reader:
                # | head -1 | tr ',' '\n' | sed "s|\(.*\)|# \1 = row[col.index('\1')]|g"
                marker_accession_id = row[col.index('marker_accession_id')].strip()
                marker_symbol = row[col.index('marker_symbol')].strip()
                phenotyping_center = row[col.index('phenotyping_center')].strip()
                colony_raw = row[col.index('colony_id')].strip()
                sex = row[col.index('sex')].strip()
                zygosity = row[col.index('zygosity')].strip()
                allele_accession_id = row[col.index('allele_accession_id')].strip()
                allele_symbol = row[col.index('allele_symbol')].strip()
                # allele_name = row[col.index('allele_name')]
                strain_accession_id = row[col.index('strain_accession_id')].strip()
                strain_name = row[col.index('strain_name')].strip()
                # project_name = row[col.index('project_name')]
                project_fullname = row[col.index('project_fullname')].strip()
                pipeline_name = row[col.index('pipeline_name')].strip()
                pipeline_stable_id = row[col.index('pipeline_stable_id')].strip()
                procedure_stable_id = row[col.index('procedure_stable_id')].strip()
                procedure_name = row[col.index('procedure_name')].strip()
                parameter_stable_id = row[col.index('parameter_stable_id')].strip()
                parameter_name = row[col.index('parameter_name')].strip()
                # top_level_mp_term_id = row[col.index('top_level_mp_term_id')]
                # top_level_mp_term_name = row[col.index('top_level_mp_term_name')]
                mp_term_id = row[col.index('mp_term_id')].strip()
                mp_term_name = row[col.index('mp_term_name')].strip()
                p_value = row[col.index('p_value')].strip()
                percentage_change = row[col.index('percentage_change')].strip()
                effect_size = row[col.index('effect_size')].strip()
                statistical_method = row[col.index('statistical_method')].strip()
                resource_name = row[col.index('resource_name')].strip()

                if self.test_mode and marker_accession_id not in self.gene_ids:
                    continue

                # ##### cleanup some of the identifiers ######
                zygosity = zygosity.strip()
                zygosity_id = self.resolve(zygosity)
                if zygosity_id == zygosity:
                    LOG.warning(
                        "Zygosity '%s' unmapped. detting to indeterminate", zygosity)
                    zygosity_id = self.globaltt['indeterminate']

                # colony ids sometimes have <> in them, spaces,
                # or other non-alphanumerics and break our system;
                # replace these with underscores
                colony_id = '_:' + re.sub(r'\W+', '_', colony_raw)

                if not re.match(r'MGI', allele_accession_id):
                    allele_accession_id = '_:IMPC-'+re.sub(
                        r':', '', allele_accession_id)

                if re.search(r'EUROCURATE', strain_accession_id):
                    # the eurocurate links don't resolve at IMPC
                    # TODO blank nodes do not maintain identifiers
                    strain_accession_id = '_:' + strain_accession_id

                elif not re.match(r'MGI', strain_accession_id):
                    LOG.info(
                        "Found a strange strain accession...%s", strain_accession_id)
                    strain_accession_id = 'IMPC:'+strain_accession_id

                ######################
                # first, add the marker and variant to the graph as with MGI,
                # the allele is the variant locus.  IF the marker is not known,
                # we will call it a sequence alteration.  otherwise,
                # we will create a BNode for the sequence alteration.
                sequence_alteration_id = variant_locus_id = None
                variant_locus_name = sequence_alteration_name = None

                # extract out what's within the <> to get the symbol
                if re.match(r'.*<.*>', allele_symbol):
                    sequence_alteration_name = re.match(
                        r'.*<(.*)>', allele_symbol)
                    if sequence_alteration_name is not None:
                        sequence_alteration_name = sequence_alteration_name.group(1)
                else:
                    sequence_alteration_name = allele_symbol

                if marker_accession_id is not None and marker_accession_id == '':
                    LOG.warning("Marker unspecified on row %d", reader.line_num)
                    marker_accession_id = None

                if marker_accession_id is not None:
                    variant_locus_id = allele_accession_id
                    variant_locus_name = allele_symbol
                    variant_locus_type = self.globaltt['variant_locus']
                    geno.addGene(
                        marker_accession_id, marker_symbol, self.globaltt['gene'])

                    geno.addAllele(
                        variant_locus_id, variant_locus_name, variant_locus_type, None)
                    geno.addAlleleOfGene(variant_locus_id, marker_accession_id)

                    # TAG bnode
                    sequence_alteration_id = '_:seqalt' + re.sub(
                        r':', '', allele_accession_id)
                    geno.addSequenceAlterationToVariantLocus(
                        sequence_alteration_id, variant_locus_id)

                else:
                    sequence_alteration_id = allele_accession_id

                # IMPC contains targeted mutations with either gene traps,
                # knockouts, insertion/intragenic deletions.
                # but I don't really know what the SeqAlt is here,
                # so I don't add it.
                geno.addSequenceAlteration(
                    sequence_alteration_id, sequence_alteration_name)

                # #############    BUILD THE COLONY    #############
                # First, let's describe the colony that the animals come from
                # The Colony ID refers to the ES cell clone
                #   used to generate a mouse strain.
                # Terry sez: we use this clone ID to track
                #   ES cell -> mouse strain -> mouse phenotyping.
                # The same ES clone maybe used at multiple centers,
                # so we have to concatenate the two to have a unique ID.
                # some useful reading about generating mice from ES cells:
                # http://ki.mit.edu/sbc/escell/services/details

                # here, we'll make a genotype
                # that derives from an ES cell with a given allele.
                # the strain is not really attached to the colony.

                # the colony/clone is reflective of the allele,  with unknown zygosity

                stem_cell_class = self.globaltt['embryonic stem cell line']

                if colony_id is None:
                    print(colony_raw, stem_cell_class, "\nline:\t", reader.line_num)
                model.addIndividualToGraph(colony_id, colony_raw, stem_cell_class)

                # vslc of the colony has unknown zygosity
                # note that we will define the allele
                # (and it's relationship to the marker, etc.) later
                # FIXME is it really necessary to create this vslc
                # when we always know it's unknown zygosity?
                vslc_colony = '_:'+re.sub(
                    r':', '', allele_accession_id + self.globaltt['indeterminate'])
                vslc_colony_label = allele_symbol + '/<?>'
                # for ease of reading, we make the colony genotype variables.
                # in the future, it might be desired to keep the vslcs
                colony_genotype_id = vslc_colony
                colony_genotype_label = vslc_colony_label
                geno.addGenotype(colony_genotype_id, colony_genotype_label)
                geno.addParts(
                    allele_accession_id, colony_genotype_id,
                    self.globaltt['has_variant_part'])

                geno.addPartsToVSLC(
                    vslc_colony, allele_accession_id, None,
                    self.globaltt['indeterminate'], self.globaltt['has_variant_part'])
                graph.addTriple(
                    colony_id, self.globaltt['has_genotype'], colony_genotype_id)

                # ##########    BUILD THE ANNOTATED GENOTYPE    ##########
                # now, we'll build the genotype of the individual that derives
                # from the colony/clone genotype that is attached to
                # phenotype = colony_id + strain + zygosity + sex
                # (and is derived from a colony)

                # this is a sex-agnostic genotype
                genotype_id = self.make_id(
                    (colony_id + phenotyping_center + zygosity + strain_accession_id))
                geno.addSequenceDerivesFrom(genotype_id, colony_id)

                # build the VSLC of the sex-agnostic genotype
                # based on the zygosity
                allele1_id = allele_accession_id
                allele2_id = allele2_rel = None
                allele1_label = allele_symbol
                allele2_label = '<?>'
                # Making VSLC labels from the various parts,
                # can change later if desired.
                if zygosity == 'heterozygote':
                    allele2_label = re.sub(r'<.*', '<+>', allele1_label)
                    allele2_id = None
                elif zygosity == 'homozygote':
                    allele2_label = allele1_label
                    allele2_id = allele1_id
                    allele2_rel = self.globaltt['has_variant_part']
                elif zygosity == 'hemizygote':
                    allele2_label = re.sub(r'<.*', '<0>', allele1_label)
                    allele2_id = None
                elif zygosity == 'not_applicable':
                    allele2_label = re.sub(r'<.*', '<?>', allele1_label)
                    allele2_id = None
                else:
                    LOG.warning("found unknown zygosity %s", zygosity)
                    break
                vslc_name = '/'.join((allele1_label, allele2_label))

                # Add the VSLC
                vslc_id = '-'.join(
                    (marker_accession_id, allele_accession_id, zygosity))
                vslc_id = re.sub(r':', '', vslc_id)
                vslc_id = '_:'+vslc_id
                model.addIndividualToGraph(
                    vslc_id, vslc_name,
                    self.globaltt['variant single locus complement'])
                geno.addPartsToVSLC(
                    vslc_id, allele1_id, allele2_id, zygosity_id,
                    self.globaltt['has_variant_part'], allele2_rel)

                # add vslc to genotype
                geno.addVSLCtoParent(vslc_id, genotype_id)

                # note that the vslc is also the gvc
                model.addType(vslc_id, self.globaltt['genomic_variation_complement'])

                # Add the genomic background
                # create the genomic background id and name
                if strain_accession_id != '':
                    genomic_background_id = strain_accession_id
                else:
                    genomic_background_id = None

                genotype_name = vslc_name
                if genomic_background_id is not None:
                    geno.addGenotype(
                        genomic_background_id, strain_name,
                        self.globaltt['genomic_background'])

                    # make a phenotyping-center-specific strain
                    # to use as the background
                    pheno_center_strain_label = strain_name + '-' + phenotyping_center \
                        + '-' + colony_raw
                    pheno_center_strain_id = '-'.join((
                        re.sub(r':', '', genomic_background_id),
                        re.sub(r'\s', '_', phenotyping_center),
                        re.sub(r'\W+', '', colony_raw)))
                    if not re.match(r'^_', pheno_center_strain_id):
                        # Tag bnode
                        pheno_center_strain_id = '_:' + pheno_center_strain_id

                    geno.addGenotype(
                        pheno_center_strain_id, pheno_center_strain_label,
                        self.globaltt['genomic_background'])
                    geno.addSequenceDerivesFrom(
                        pheno_center_strain_id, genomic_background_id)

                    # Making genotype labels from the various parts,
                    # can change later if desired.
                    # since the genotype is reflective of the place
                    # it got made, should put that in to disambiguate
                    genotype_name = \
                        genotype_name + ' [' + pheno_center_strain_label + ']'
                    geno.addGenomicBackgroundToGenotype(
                        pheno_center_strain_id, genotype_id)
                    geno.addTaxon(taxon_id, pheno_center_strain_id)
                # this is redundant, but i'll keep in in for now
                geno.addSequenceDerivesFrom(genotype_id, colony_id)
                geno.addGenotype(genotype_id, genotype_name)

                # Make the sex-qualified genotype,
                # which is what the phenotype is associated with
                sex_qualified_genotype_id = \
                    self.make_id((
                        colony_id + phenotyping_center + zygosity +
                        strain_accession_id + sex))
                sex_qualified_genotype_label = genotype_name + ' (' + sex + ')'

                sq_type_id = self.resolve(sex, False)

                if sq_type_id == sex:
                    sq_type_id = self.globaltt['intrinsic_genotype']
                    LOG.warning(
                        "Unknown sex qualifier %s, adding as intrinsic_genotype",
                        sex)

                geno.addGenotype(
                    sex_qualified_genotype_id, sex_qualified_genotype_label, sq_type_id)
                geno.addParts(
                    genotype_id, sex_qualified_genotype_id,
                    self.globaltt['has_variant_part'])

                if genomic_background_id is not None and genomic_background_id != '':
                    # Add the taxon to the genomic_background_id
                    geno.addTaxon(taxon_id, genomic_background_id)
                else:
                    # add it as the genomic background
                    geno.addTaxon(taxon_id, genotype_id)

                # #############    BUILD THE G2P ASSOC    #############
                # from an old email dated July 23 2014:
                # Phenotypes associations are made to
                # imits colony_id+center+zygosity+gender

                # sometimes phenotype ids are missing.  (about 711 early 2020)
                if mp_term_id is None or mp_term_id == '':
                    LOG.warning(
                        "No phenotype id specified for row %d", reader.line_num)
                    continue
                # hard coded ECO code
                eco_id = self.globaltt['mutant phenotype evidence']

                # the association comes as a result of a g2p from
                # a procedure in a pipeline at a center and parameter tested

                assoc = G2PAssoc(
                    graph, self.name, sex_qualified_genotype_id, mp_term_id)
                assoc.add_evidence(eco_id)
                # assoc.set_score(float(p_value))

                # TODO add evidence instance using
                # pipeline_stable_id +
                # procedure_stable_id +
                # parameter_stable_id

                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()

                model._addSexSpecificity(assoc_id, self.resolve(sex))

                # add a free-text description
                try:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of',
                        str(round(float(effect_size), 5)),
                        '(p =', "{:.4e}".format(float(p_value)), ').'))
                except ValueError:
                    description = ' '.join((
                        mp_term_name, 'phenotype determined by', phenotyping_center,
                        'in an', procedure_name, 'assay where', parameter_name.strip(),
                        'was measured with an effect_size of', str(effect_size),
                        '(p =', "{0}".format(p_value), ').'))

                study_bnode = self._add_study_provenance(
                    phenotyping_center, colony_raw, project_fullname, pipeline_name,
                    pipeline_stable_id, procedure_stable_id, procedure_name,
                    parameter_stable_id, parameter_name, statistical_method,
                    resource_name)

                evidence_line_bnode = self._add_evidence(
                    assoc_id, eco_id, p_value, percentage_change, effect_size,
                    study_bnode)

                self._add_assertion_provenance(assoc_id, evidence_line_bnode)

                model.addDescription(evidence_line_bnode, description)

                # resource_id = resource_name
                # assoc.addSource(graph, assoc_id, resource_id)

                if not self.test_mode and limit is not None and reader.line_num > limit:
                    break
Exemplo n.º 26
0
    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:

        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)

        geno = Genotype(g)
        f = Feature(g, None, None, None)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:' + tax_num
        tax_label = 'Human'
        model.addClassToGraph(tax_id, None)
        geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.
                # if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 29
                if num_cols != expected_numcols:
                    logger.error(
                        "Unexpected number of columns in raw file " +
                        "(%d actual vs %d expected)", num_cols,
                        expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol,
                 clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
                 tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
                 stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
                 number_of_submitters, last_eval, guidelines, other_ids,
                 variant_num, reference_allele, alternate_allele, categories,
                 ChromosomeAccession) = line.split('\t')

                # ###set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and\
                #            (int(tax_num) not in self.tax_ids)) or\
                #            (self.filter == 'geneids' and\
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                    pheno_list = re.split(r'[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids
                    # and these phenotype_ids
                    intersect = \
                        list(
                            set([str(i)
                                for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and\
                            int(variant_num) not in self.variant_ids and\
                            len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct
                # assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the apx location
                        # oddly, they still put an assembly number even when
                        # there's no numeric location
                        if not re.search(r'-', str(cytogenetic_loc)):
                            band_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)), tax_num,
                                'CHR')
                            geno.addChromosomeInstance(cytogenetic_loc,
                                                       build_id, assembly,
                                                       band_id)
                            bandinbuild_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)), assembly,
                                'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph,
                    # and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(str(chr), build_id, assembly,
                                               chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None

                # they use -1 to indicate unknown gene
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                    if re.match(r'^Gene:', gene_num):
                        gene_num = "NCBI" + gene_num
                    else:
                        gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing
                # the xml for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype
                # in the csv data so the dbsnp or dbvar
                # should probably be primary,
                # and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # TODO clinical significance needs to be mapped to
                # a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph()
                f.addTaxonToFeature(tax_id)
                # make the ClinVarVariant the clique leader
                model.makeLeader(seqalt_id)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(bandinbuild_id)

                # CHECK - this makes the assumption that there is
                # only one affected chromosome per variant what happens with
                # chromosomal rearrangement variants?
                # shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs' + str(dbsnp_num)
                    model.addIndividualToGraph(dbsnp_id, None)
                    model.addSameIndividual(seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:' + dbvar_num
                    model.addIndividualToGraph(dbvar_id, None)
                    model.addSameIndividual(seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(r';', rcv_nums):
                        rcv_id = 'ClinVar:' + rcv_num
                        model.addIndividualToGraph(rcv_id, None)
                        model.addXref(seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    model.addClassToGraph(gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_' + gene_num + '-' + variant_num
                    if self.nobnodes:
                        vl_id = ':' + vl_id
                    vl_label = allele_name
                    model.addIndividualToGraph(vl_id, vl_label,
                                               geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search(r'\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info(
                            "Gene found in allele label, but no id provided: %s",
                            gmatch.group(1))
                    elif re.match(r'more than 10', gene_symbol):
                        logger.info(
                            "More than 10 genes found; "
                            "need to process XML to fetch (variant=%d)",
                            int(variant_num))
                    else:
                        logger.info("No gene listed for variant %d",
                                    int(variant_num))

                # parse the list of "phenotypes" which are diseases.
                # add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited,
                # but i don't know why! some are bad, like:
                # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for phenotype in pheno_list:
                        m = re.match(r"(Orphanet:ORPHA(?:\s*ORPHA)?)",
                                     phenotype)
                        if m is not None and len(m.groups()) > 0:
                            phenotype = re.sub(m.group(1), 'Orphanet:',
                                               phenotype.strip())
                        elif re.match(r'ORPHA:\d+', phenotype):
                            phenotype = re.sub(r'^ORPHA', 'Orphanet',
                                               phenotype.strip())
                        elif re.match(r'Human Phenotype Ontology', phenotype):
                            phenotype = re.sub(r'^Human Phenotype Ontology',
                                               '', phenotype.strip())
                        elif re.match(r'SNOMED CT:\s?', phenotype):
                            phenotype = re.sub(r'SNOMED CT:\s?', 'SNOMED:',
                                               phenotype.strip())
                        elif re.match(r'^Gene:', phenotype):
                            continue

                        assoc = G2PAssoc(g, self.name, seqalt_id,
                                         phenotype.strip())
                        assoc.add_association_to_graph()

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids" ex:
                    # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:' + xrefid.split(':')[1]
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'dbVar' \
                                and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search(r'\s', prefix):
                            pass
                            # logger.debug(
                            #   'xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs
                            # because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.info("Finished parsing variants")

        return
Exemplo n.º 27
0
    def process_gaf(self, file, limit, id_map=None, eco_map=None):

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph

        model = Model(graph)
        geno = Genotype(graph)
        LOG.info("Processing Gene Associations from %s", file)
        line_counter = 0
        uniprot_hit = 0
        uniprot_miss = 0
        if '7955' in self.tax_ids:
            zfin = ZFIN(self.graph_type, self.are_bnodes_skized)
        if '6239' in self.tax_ids:
            wbase = WormBase(self.graph_type, self.are_bnodes_skized)

        with gzip.open(file, 'rb') as csvfile:
            filereader = csv.reader(io.TextIOWrapper(csvfile, newline=""),
                                    delimiter='\t',
                                    quotechar='\"')
            for row in filereader:
                line_counter += 1
                # comments start with exclamation
                if re.match(r'!', ''.join(row)):
                    continue

                if len(row) > 17 or len(row) < 15:
                    LOG.warning(
                        "Wrong number of columns %i, expected 15 or 17\n%s",
                        len(row), row)
                    continue

                if 17 > len(row) >= 15:
                    row += [""] * (17 - len(row))

                (dbase, gene_num, gene_symbol, qualifier, go_id, ref,
                 eco_symbol, with_or_from, aspect, gene_name, gene_synonym,
                 object_type, taxon, date, assigned_by, annotation_extension,
                 gene_product_form_id) = row

                # test for required fields
                if (dbase == '' or gene_num == '' or gene_symbol == ''
                        or go_id == '' or ref == '' or eco_symbol == ''
                        or aspect == '' or object_type == '' or taxon == ''
                        or date == '' or assigned_by == ''):
                    LOG.error(
                        "Missing required part of annotation on row %d:\n" +
                        '\t'.join(row), line_counter)
                    continue

                # deal with qualifier NOT, contributes_to, colocalizes_with
                if re.search(r'NOT', qualifier):
                    continue

                if dbase in self.localtt:
                    dbase = self.localtt[dbase]
                uniprotid = None
                gene_id = None
                if dbase == 'UniProtKB':
                    if id_map is not None and gene_num in id_map:
                        gene_id = id_map[gene_num]
                        uniprotid = ':'.join((dbase, gene_num))
                        (dbase, gene_num) = gene_id.split(':')
                        uniprot_hit += 1
                    else:
                        # LOG.warning(
                        #   "UniProt id %s  is without a 1:1 mapping to entrez/ensembl",
                        #    gene_num)
                        uniprot_miss += 1
                        continue
                else:
                    gene_num = gene_num.split(':')[-1]  # last
                    gene_id = ':'.join((dbase, gene_num))

                if self.test_mode and not (re.match(r'NCBIGene', gene_id)
                                           and int(gene_num) in self.test_ids):
                    continue

                model.addClassToGraph(gene_id, gene_symbol)
                if gene_name != '':
                    model.addDescription(gene_id, gene_name)
                if gene_synonym != '':
                    for syn in re.split(r'\|', gene_synonym):
                        model.addSynonym(gene_id, syn.strip())
                if re.search(r'\|', taxon):
                    # TODO add annotations with >1 taxon
                    LOG.info(">1 taxon (%s) on line %d.  skipping", taxon,
                             line_counter)
                else:
                    tax_id = re.sub(r'taxon:', 'NCBITaxon:', taxon)
                    geno.addTaxon(tax_id, gene_id)

                assoc = Assoc(graph, self.name)
                assoc.set_subject(gene_id)
                assoc.set_object(go_id)

                try:
                    eco_id = eco_map[eco_symbol]
                    assoc.add_evidence(eco_id)
                except KeyError:
                    LOG.error("Evidence code (%s) not mapped", eco_symbol)

                refs = re.split(r'\|', ref)
                for ref in refs:
                    ref = ref.strip()
                    if ref != '':
                        prefix = ref.split(':')[0]  # sidestep 'MGI:MGI:'
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]
                        ref = ':'.join((prefix, ref.split(':')[-1]))
                        refg = Reference(graph, ref)
                        if prefix == 'PMID':
                            ref_type = self.globaltt['journal article']
                            refg.setType(ref_type)
                        refg.addRefToGraph()
                        assoc.add_source(ref)

                # TODO add the source of the annotations from assigned by?

                rel = self.resolve(aspect, mandatory=False)
                if rel is not None and aspect == rel:
                    if aspect == 'F' and re.search(r'contributes_to',
                                                   qualifier):
                        assoc.set_relationship(self.globaltt['contributes to'])
                    else:
                        LOG.error(
                            "Aspect: %s with qualifier: %s  is not recognized",
                            aspect, qualifier)
                elif rel is not None:
                    assoc.set_relationship(rel)
                    assoc.add_association_to_graph()
                else:
                    LOG.warning("No predicate for association \n%s\n",
                                str(assoc))

                if uniprotid is not None:
                    assoc.set_description('Mapped from ' + uniprotid)
                # object_type should be one of:
                # protein_complex; protein; transcript; ncRNA; rRNA; tRNA;
                # snRNA; snoRNA; any subtype of ncRNA in the Sequence Ontology.
                # If the precise product type is unknown,
                # gene_product should be used
                #######################################################################

                # Derive G2P Associations from IMP annotations
                # in version 2.1 Pipe will indicate 'OR'
                # and Comma will indicate 'AND'.
                # in version 2.0, multiple values are separated by pipes
                # where the pipe has been used to mean 'AND'
                if eco_symbol == 'IMP' and with_or_from != '':
                    withitems = re.split(r'\|', with_or_from)
                    phenotypeid = go_id + 'PHENOTYPE'
                    # create phenotype associations
                    for i in withitems:
                        if i == '' or re.match(
                                r'(UniProtKB|WBPhenotype|InterPro|HGNC)', i):
                            LOG.warning(
                                "Don't know what having a uniprot id " +
                                "in the 'with' column means of %s", uniprotid)
                            continue
                        i = re.sub(r'MGI\:MGI\:', 'MGI:', i)
                        i = re.sub(r'WB:', 'WormBase:', i)

                        # for worms and fish, they might give a RNAi or MORPH
                        # in these cases make a reagent-targeted gene
                        if re.search('MRPHLNO|CRISPR|TALEN', i):
                            targeted_gene_id = zfin.make_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            # TODO PYLINT why is this needed?
                            # Redefinition of assoc type from
                            # dipper.models.assoc.Association.Assoc to
                            # dipper.models.assoc.G2PAssoc.G2PAssoc
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        elif re.search(r'WBRNAi', i):
                            targeted_gene_id = wbase.make_reagent_targeted_gene_id(
                                gene_id, i)
                            geno.addReagentTargetedGene(
                                i, gene_id, targeted_gene_id)
                            assoc = G2PAssoc(graph, self.name,
                                             targeted_gene_id, phenotypeid)
                        else:
                            assoc = G2PAssoc(graph, self.name, i, phenotypeid)
                        for ref in refs:
                            ref = ref.strip()
                            if ref != '':
                                prefix = ref.split(':')[0]
                                if prefix in self.localtt:
                                    prefix = self.localtt[prefix]
                                ref = ':'.join((prefix, ref.split(':')[-1]))
                                assoc.add_source(ref)
                                # experimental phenotypic evidence
                                assoc.add_evidence(self.globaltt[
                                    'experimental phenotypic evidence'])
                        assoc.add_association_to_graph()
                        # TODO should the G2PAssoc be
                        # the evidence for the GO assoc?

                if not self.test_mode and limit is not None and line_counter > limit:
                    break
            uniprot_tot = (uniprot_hit + uniprot_miss)
            uniprot_per = 0.0
            if uniprot_tot != 0:
                uniprot_per = 100.0 * uniprot_hit / uniprot_tot
            LOG.info(
                "Uniprot: %.2f%% of %i benefited from the 1/4 day id mapping download",
                uniprot_per, uniprot_tot)
        return
Exemplo n.º 28
0
    def _process_diseasegene(self, limit):
        """
        :param limit:
        :return:
        """
        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        line_counter = 0

        model = Model(graph)

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        for event, elem in ET.iterparse(myfile):
            if elem.tag == 'Disorder':
                # get the element name and id, ignore element name
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find('OrphaNumber').text
                disorder_id = 'ORPHA:' + str(disorder_num)

                if self.test_mode and disorder_id not in self.all_test_ids[
                        'disease']:
                    continue
                disorder_label = elem.find('Name').text

                # assuming that these are in the ontology (...any particular one?)
                model.addClassToGraph(disorder_id, disorder_label)
                assoc_list = elem.find('DisorderGeneAssociationList')
                expected_genes = assoc_list.get('count')
                LOG.info('Expecting %s genes associated with disorder %s.',
                         expected_genes, disorder_id)
                processed_genes = 0
                for assoc in assoc_list.findall('DisorderGeneAssociation'):
                    processed_genes += 1
                    gene = assoc.find('Gene')

                    # get gene's curie  HGNC or Ensembl ...

                    lclid = gene.find('OrphaNumber').text
                    gene_curie = 'ORPHA:' + lclid
                    gene_set = {'ORPHA': lclid}
                    for gene_ref in gene.findall(
                            './ExternalReferenceList/ExternalReference'):
                        gene_set[gene_ref.find('Source').text] = \
                            gene_ref.find('Reference').text

                    # set priority (clique leader if available) but default to OPRHA
                    for pfx in ('HGNC', 'Ensembl', 'SwissProt'):
                        if pfx in gene_set:
                            if pfx in self.localtt:
                                pfx = self.localtt[pfx]
                            gene_curie = pfx + ':' + gene_set[pfx]
                            gene_set.pop(pfx)
                            model.addClassToGraph(gene_curie, None)
                            break

                    # TEC have reservations w.r.t aggerator links being gene classes
                    for prefix in gene_set:
                        lclid = gene_set[prefix]
                        if prefix in self.localtt:
                            prefix = self.localtt[prefix]

                        dbxref = prefix + ':' + lclid

                        if gene_curie != dbxref:
                            model.addClassToGraph(dbxref, None)
                            model.addEquivalentClass(gene_curie, dbxref)

                    # TEC. would prefer this not happen here. let HGNC handle it
                    # except there are some w/o explicit external links ...

                    gene_symbol = gene.find('Symbol').text

                    syn_list = gene.find('./SynonymList')
                    if int(syn_list.get('count')) > 0:
                        for syn in syn_list.findall('./Synonym'):
                            model.addSynonym(gene_curie, syn.text)

                    dg_label = assoc.find(
                        './DisorderGeneAssociationType/Name').text

                    # use dg association status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    eco_id = self.resolve(
                        assoc.find('DisorderGeneAssociationStatus/Name').text)

                    rel_id = self.resolve(dg_label)

                    g2p_assoc = G2PAssoc(self.graph, self.name, gene_curie,
                                         disorder_id, rel_id)
                    g2p_assoc.add_evidence(eco_id)
                    g2p_assoc.add_association_to_graph()

                elem.clear()  # empty the element
                if int(expected_genes) != processed_genes:
                    LOG.warning(
                        '% expected %s associated genes but we processed %i',
                        disorder_id, expected_genes, processed_genes)

            if self.test_mode and limit is not None and line_counter > limit:
                return

        return
Exemplo n.º 29
0
    def _add_g2p_assoc(self, graph, strain_id, sex, assay_id, phenotypes,
                       comment):
        """
        Create an association between a sex-specific strain id
        and each of the phenotypes.
        Here, we create a genotype from the strain,
        and a sex-specific genotype.
        Each of those genotypes are created as anonymous nodes.

        The evidence code is hardcoded to be:
            ECO:experimental_phenotypic_evidence.

        :param g:
        :param strain_id:
        :param sex:
        :param assay_id:
        :param phenotypes: a list of phenotypes to association with the strain
        :param comment:
        :return:

        """
        geno = Genotype(graph)
        model = Model(graph)
        eco_id = self.globaltt['experimental phenotypic evidence']
        strain_label = self.idlabel_hash.get(strain_id)
        # strain genotype
        genotype_id = '_' + '-'.join((re.sub(r':', '', strain_id), 'genotype'))
        genotype_label = '[' + strain_label + ']'

        sex_specific_genotype_id = '_' + '-'.join(
            (re.sub(r':', '', strain_id), sex, 'genotype'))
        if strain_label is not None:
            sex_specific_genotype_label = strain_label + ' (' + sex + ')'
        else:
            sex_specific_genotype_label = strain_id + '(' + sex + ')'

        genotype_type = self.globaltt['sex_qualified_genotype']
        if sex == 'm':
            genotype_type = self.globaltt['male_genotype']
        elif sex == 'f':
            genotype_type = self.globaltt['female_genotype']

        # add the genotype to strain connection
        geno.addGenotype(genotype_id, genotype_label,
                         self.globaltt['genomic_background'])
        graph.addTriple(strain_id, self.globaltt['has_genotype'], genotype_id)

        geno.addGenotype(sex_specific_genotype_id, sex_specific_genotype_label,
                         genotype_type)

        # add the strain as the background for the genotype
        graph.addTriple(sex_specific_genotype_id,
                        self.globaltt['has_sex_agnostic_part'], genotype_id)

        # #############    BUILD THE G2P ASSOC    #############
        # TODO add more provenance info when that model is completed

        if phenotypes is not None:
            for phenotype_id in phenotypes:
                assoc = G2PAssoc(graph, self.name, sex_specific_genotype_id,
                                 phenotype_id)
                assoc.add_evidence(assay_id)
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph()
                assoc_id = assoc.get_association_id()
                model.addComment(assoc_id, comment)
                model._addSexSpecificity(assoc_id, self.resolve(sex))

        return
Exemplo n.º 30
0
Arquivo: OMIA.py Projeto: sgml/dipper
    def _process_breed_phene_row(self, row):
        model = Model(self.graph)
        # Linking disorders/characteristic to breeds
        # breed_id, phene_id, added_by
        breed_id = self.id_hash['breed'].get(row['breed_id'])
        phene_id = self.id_hash['phene'].get(row['phene_id'])

        # get the omia id
        omia_id = self._get_omia_id_from_phene_id(phene_id)

        if breed_id is None or phene_id is None or (
                self.test_mode and
            (omia_id not in self.test_ids['disease']
             or row['breed_id'] not in self.test_ids['breed'])):
            return

        # FIXME we want a different relationship here
        assoc = G2PAssoc(self.graph, self.name, breed_id, phene_id,
                         self.globaltt['has phenotype'])
        assoc.add_association_to_graph()

        # add that the breed is a model of the human disease
        # use the omia-omim mappings for this
        # we assume that we have already scrubbed out the genes
        # from the omim list, so we can make the model associations here

        omim_ids = self.omia_omim_map.get(omia_id)
        eco_id = self.globaltt['biological aspect of descendant evidence']
        if omim_ids is not None and omim_ids:
            # if len(omim_ids) > 1:
            #    LOG.info(
            #        "There's 1:many omia:omim mapping: %s, %s", omia_id, str(omim_ids))
            # else:
            #    oid = list(omim_ids)[0]
            #    LOG.info("OMIA %s is mapped to OMIM %s", omia_id, oid)

            for oid in omim_ids:
                assoc = G2PAssoc(self.graph, self.name, breed_id, oid,
                                 self.globaltt['is model of'])
                assoc.add_evidence(eco_id)
                assoc.add_association_to_graph()
                aid = assoc.get_association_id()

                breed_label = self.label_hash.get(breed_id)
                if breed_label is None:  # get taxon label?
                    breed_label = "this breed"

                mch = re.search(r'\((.*)\)', breed_label)
                if mch:
                    sp_label = mch.group(1)
                else:
                    sp_label = ''

                phene_label = self.label_hash.get(phene_id)
                if phene_label is None:
                    phene_label = "phenotype"
                elif phene_label.endswith(sp_label):
                    # some of the labels we made already include the species;
                    # remove it to make a cleaner desc
                    phene_label = re.sub(r' in ' + sp_label, '', phene_label)
                desc = ' '.join(
                    ("High incidence of", phene_label, "in", breed_label,
                     "suggests it to be a model of disease", oid + "."))
                model.addDescription(aid, desc)
        else:
            LOG.warning("No OMIM Disease associated with %s", omia_id)