Пример #1
0
    def _map_eom_terms(self, raw, limit=None):
        """
        This table contains the HP ID mappings from the local tsv file.
        Triples:
            <eom id> owl:equivalentClass <hp id>
        :param raw:
        :param limit:
        :return:
        """

        gu = GraphUtils(curie_map.get())

        line_counter = 0
        with open(raw, 'r') as f1:
            f1.readline()  # read the header row; skip
            for line in f1:
                line_counter += 1

                (morphology_term_id, morphology_term_label, hp_id, hp_label, notes) = line.split('\t')

                # Sub out the underscores for colons.
                hp_id = re.sub('_', ':', hp_id)
                if re.match(".*HP:.*", hp_id):
                    # add the HP term as a class
                    gu.addClassToGraph(self.graph, hp_id, None)
                    # Add the HP ID as an equivalent class
                    gu.addEquivalentClass(self.graph, morphology_term_id, hp_id)
                else:
                    logger.warning('No matching HP term for %s', morphology_term_label)

                if limit is not None and line_counter > limit:
                    break

        return
Пример #2
0
    def _get_process_allelic_variants(self, entry, g):
        gu = GraphUtils(curie_map.get())
        geno = Genotype(g)
        du = DipperUtil()
        if entry is not None:
            publist = {}  # to hold the entry-specific publication mentions for the allelic variants
            entry_num = entry['mimNumber']

            # process the ref list just to get the pmids
            ref_to_pmid = self._get_pubs(entry, g)

            if 'allelicVariantList' in entry:
                allelicVariantList = entry['allelicVariantList']
                for al in allelicVariantList:
                    al_num = al['allelicVariant']['number']
                    al_id = 'OMIM:'+str(entry_num)+'.'+str(al_num).zfill(4)
                    al_label = None
                    al_description = None
                    if al['allelicVariant']['status'] == 'live':
                        publist[al_id] = set()
                        if 'mutations' in al['allelicVariant']:
                            al_label = al['allelicVariant']['mutations']
                        if 'text' in al['allelicVariant']:
                            al_description = al['allelicVariant']['text']
                            m = re.findall('\{(\d+)\:', al_description)
                            publist[al_id] = set(m)
                        geno.addAllele(al_id, al_label, geno.genoparts['variant_locus'], al_description)
                        geno.addAlleleOfGene(al_id, 'OMIM:'+str(entry_num),
                                             geno.object_properties['is_sequence_variant_instance_of'])
                        for r in publist[al_id]:
                            pmid = ref_to_pmid[int(r)]
                            gu.addTriple(g, pmid, gu.object_properties['is_about'], al_id)
                        # look up the pubmed id in the list of references
                        if 'dbSnps' in al['allelicVariant']:
                            dbsnp_ids = re.split(',', al['allelicVariant']['dbSnps'])
                            for dnum in dbsnp_ids:
                                did = 'dbSNP:'+dnum.strip()
                                gu.addIndividualToGraph(g, did, None)
                                gu.addEquivalentClass(g, al_id, did)
                        if 'clinvarAccessions' in al['allelicVariant']:
                            # clinvarAccessions triple semicolon delimited, each lik eRCV000020059;;1
                            rcv_ids = re.split(';;;', al['allelicVariant']['clinvarAccessions'])
                            rcv_ids = [(re.match('(RCV\d+)\;\;', r)).group(1) for r in rcv_ids]
                            for rnum in rcv_ids:
                                rid = 'ClinVar:'+rnum
                                gu.addXref(g, al_id, rid)
                        gu.addPage(g, al_id, "http://omim.org/entry/"+str(entry_num)+"#"+str(al_num).zfill(4))
                    elif re.search('moved', al['allelicVariant']['status']):
                        # for both 'moved' and 'removed'
                        moved_ids = None
                        if 'movedTo' in al['allelicVariant']:
                            moved_id = 'OMIM:'+al['allelicVariant']['movedTo']
                            moved_ids = [moved_id]
                        gu.addDeprecatedIndividual(g, al_id, moved_ids)
                    else:
                        logger.error('Uncaught alleleic variant status %s', al['allelicVariant']['status'])
                # end loop allelicVariantList

        return
Пример #3
0
    def _process_genes(self, taxid, limit=None):
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)

        raw = '/'.join((self.rawdir, self.files[taxid]['file']))
        line_counter = 0
        logger.info("Processing Ensembl genes for tax %s", taxid)
        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t')
            for row in filereader:
                if len(row) < 4:
                    logger.error("Data error for file %s", raw)
                    return
                (ensembl_gene_id, external_gene_name, description,
                 gene_biotype, entrezgene) = row[0:5]

                # in the case of human genes, we also get the hgnc id,
                # and is the last col
                if taxid == '9606':
                    hgnc_id = row[5]
                else:
                    hgnc_id = None

                if self.testMode and entrezgene != '' \
                        and int(entrezgene) not in self.gene_ids:
                    continue

                line_counter += 1
                gene_id = 'ENSEMBL:'+ensembl_gene_id
                if description == '':
                    description = None
                gene_type_id = self._get_gene_type(gene_biotype)
                gene_type_id = None
                gu.addClassToGraph(
                    g, gene_id, external_gene_name, gene_type_id, description)

                if entrezgene != '':
                    gu.addEquivalentClass(g, gene_id, 'NCBIGene:'+entrezgene)
                if hgnc_id is not None and hgnc_id != '':
                    gu.addEquivalentClass(g, gene_id, hgnc_id)
                geno.addTaxon('NCBITaxon:'+taxid, gene_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
        gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
        gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        return
Пример #4
0
    def _process_genes_kegg2ncbi(self, limit=None):
        """
        This method maps the KEGG human gene IDs
            to the corresponding NCBI Gene IDs.

        Triples created:
        <kegg_gene_id> is a class
        <ncbi_gene_id> is a class
        <kegg_gene_id> equivalentClass <ncbi_gene_id>
        :param limit:
        :return:

        """

        logger.info("Processing KEGG gene IDs to NCBI gene IDs")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0

        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['ncbi']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (kegg_gene_id, ncbi_gene_id, link_type) = row

                if self.testMode and \
                        kegg_gene_id not in self.test_ids['genes']:
                    continue

                # Adjust the NCBI gene ID prefix.
                ncbi_gene_id = re.sub(r'ncbi-geneid', 'NCBIGene', ncbi_gene_id)
                kegg_gene_id = 'KEGG-'+kegg_gene_id

                # Adding the KEGG gene ID to the graph here is redundant,
                # unless there happens to be additional gene IDs in this table
                # not present in the genes table.
                gu.addClassToGraph(g, kegg_gene_id, None)
                gu.addClassToGraph(g, ncbi_gene_id, None)
                gu.addEquivalentClass(g, kegg_gene_id, ncbi_gene_id)

                if (not self.testMode) and (
                        limit is not None and line_counter > limit):
                    break

        logger.info("Done with KEGG gene IDs to NCBI gene IDs")
        return
Пример #5
0
    def _process_pathway_pathway(self, limit):
        """
        There are "map" and "ko" identifiers for pathways.
        This makes equivalence mapping between them, where they exist.
        :param limit:
        :return:

        """
        logger.info("Processing KEGG pathways to other ids")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0

        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['pathway_pathway']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (pathway_id_1, pathway_id_2) = row

                if self.testMode and \
                        pathway_id_1 not in self.test_ids['pathway']:
                    continue

                pathway_id_1 = 'KEGG-'+pathway_id_1
                # will look like KEGG-path:map04130 or KEGG-path:ko04130
                pathway_id_2 = 'KEGG-'+pathway_id_2

                if pathway_id_1 != pathway_id_2:
                    gu.addEquivalentClass(g, pathway_id_1, pathway_id_2)

                if not self.testMode and \
                        limit is not None and line_counter > limit:
                    break

        return
Пример #6
0
    def _get_mappedids(self, entry, g):
        """
        Extract the Orphanet and UMLS ids as equivalences from the entry
        :param entry:
        :return:
        """
        # umlsIDs
        gu = GraphUtils(curie_map.get())
        omimid = 'OMIM:'+str(entry['mimNumber'])
        orpha_mappings = []
        if 'externalLinks' in entry:
            links = entry['externalLinks']
            if 'orphanetDiseases' in links:
                # triple semi-colon delimited list of double semi-colon delimited orphanet ID/disease pairs
                # 2970;;566;;Prune belly syndrome
                items = links['orphanetDiseases'].split(';;;')
                for i in items:
                    (orpha_num, internal_num, orpha_label) = i.split(';;')
                    orpha_id = 'Orphanet:'+orpha_num.strip()
                    orpha_mappings.append(orpha_id)
                    gu.addClassToGraph(g, orpha_id, orpha_label.strip())
                    gu.addXref(g, omimid, orpha_id)

            if 'umlsIDs' in links:
                umls_mappings = links['umlsIDs'].split(',')
                for i in umls_mappings:
                    umls_id = 'UMLS:'+i
                    gu.addClassToGraph(g, umls_id, None)
                    gu.addXref(g, omimid, umls_id)

            if self._get_omimtype(entry) == Genotype.genoparts['gene'] and 'geneIDs' in links:
                entrez_mappings = links['geneIDs']
                for i in entrez_mappings.split(','):
                    gu.addEquivalentClass(g, omimid, 'NCBIGene:'+str(i))

        return
Пример #7
0
    def _process_diseasegene(self, limit):
        """
        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        gu = GraphUtils(curie_map.get())

        myfile = "/".join((self.rawdir, self.files["disease-gene"]["file"]))

        for event, elem in ET.iterparse(myfile):
            if elem.tag == "Disorder":
                # get the element name and id
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find("OrphaNumber").text

                disorder_id = "Orphanet:" + str(disorder_num)

                if self.testMode and disorder_id not in config.get_config()["test_ids"]["disease"]:
                    continue

                disorder_label = elem.find("Name").text

                # make a hash of internal gene id to type for later lookup
                gene_iid_to_type = {}
                gene_list = elem.find("GeneList")
                for gene in gene_list.findall("Gene"):
                    gene_iid = gene.get("id")
                    gene_type = gene.find("GeneType").get("id")
                    gene_iid_to_type[gene_iid] = gene_type

                gu.addClassToGraph(g, disorder_id, disorder_label)  # assuming that these are in the ontology

                assoc_list = elem.find("DisorderGeneAssociationList")
                for a in assoc_list.findall("DisorderGeneAssociation"):
                    gene_iid = a.find(".//Gene").get("id")
                    gene_name = a.find(".//Gene/Name").text
                    gene_symbol = a.find(".//Gene/Symbol").text
                    gene_num = a.find("./Gene/OrphaNumber").text
                    gene_id = "Orphanet:" + str(gene_num)
                    gene_type_id = self._map_gene_type_id(gene_iid_to_type[gene_iid])
                    gu.addClassToGraph(g, gene_id, gene_symbol, gene_type_id, gene_name)
                    syn_list = a.find("./Gene/SynonymList")
                    if int(syn_list.get("count")) > 0:
                        for s in syn_list.findall("./Synonym"):
                            gu.addSynonym(g, gene_id, s.text)

                    dgtype = a.find("DisorderGeneAssociationType").get("id")
                    rel_id = self._map_rel_id(dgtype)
                    dg_label = a.find("./DisorderGeneAssociationType/Name").text
                    if rel_id is None:
                        logger.warn(
                            "Cannot map association type (%s) to RO for association (%s | %s).  Skipping.",
                            dg_label,
                            disorder_label,
                            gene_symbol,
                        )
                        continue

                    alt_locus_id = "_" + gene_num + "-" + disorder_num + "VL"
                    alt_label = " ".join(
                        ("some variant of", gene_symbol.strip(), "that is a", dg_label.lower(), disorder_label)
                    )
                    if self.nobnodes:
                        alt_locus_id = ":" + alt_locus_id
                    gu.addIndividualToGraph(g, alt_locus_id, alt_label, geno.genoparts["variant_locus"])
                    geno.addAlleleOfGene(alt_locus_id, gene_id)

                    # consider typing the gain/loss-of-function variants like:
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                    # use "assessed" status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    status_code = a.find("DisorderGeneAssociationStatus").get("id")
                    eco_id = "ECO:0000323"  # imported automatically asserted information used in automatic assertion
                    if status_code == "17991":  # Assessed  # TODO are these internal ids stable between releases?
                        eco_id = "ECO:0000322"  # imported manually asserted information used in automatic assertion
                    # Non-traceable author statement ECO_0000034
                    # imported information in automatic assertion ECO_0000313

                    assoc = G2PAssoc(self.name, alt_locus_id, disorder_id, rel_id)
                    assoc.add_evidence(eco_id)
                    assoc.add_association_to_graph(g)

                    rlist = a.find("./Gene/ExternalReferenceList")
                    eqid = None

                    for r in rlist.findall("ExternalReference"):
                        if r.find("Source").text == "Ensembl":
                            eqid = "ENSEMBL:" + r.find("Reference").text
                        elif r.find("Source").text == "HGNC":
                            eqid = "HGNC:" + r.find("Reference").text
                        elif r.find("Source").text == "OMIM":
                            eqid = "OMIM:" + r.find("Reference").text
                        else:
                            pass  # skip the others for now
                        if eqid is not None:
                            gu.addClassToGraph(g, eqid, None)
                            gu.addEquivalentClass(g, gene_id, eqid)
                            pass
                elem.clear()  # discard the element

            if self.testMode and limit is not None and line_counter > limit:
                return

        gu.loadProperties(g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
        gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
        gu.loadAllProperties(g)

        return
Пример #8
0
    def _get_gene_info(self, limit):
        """
        Currently loops through the gene_info file and creates the genes as classes, typed with SO.  It will add their
        label, any alternate labels as synonyms, alternate ids as equivlaent classes.  HPRDs get added as
        protein products.  The chromosome and chr band get added as blank node regions, and the gene is faldo:located
        on the chr band.
        :param limit:
        :return:
        """
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)

        # not unzipping the file
        logger.info("Processing Gene records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['gene_info']['file']))
        logger.info("FILE: %s", myfile)

        # Add taxa and genome classes for those in our filter
        for tax_num in self.tax_ids:
            tax_id = ':'.join(('NCBITaxon', str(tax_num)))
            geno.addGenome(tax_id, str(tax_num))   # tax label can get added elsewhere
            gu.addClassToGraph(g, tax_id, None)   # label added elsewhere
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match('^#', line):
                    continue
                (tax_num, gene_num, symbol, locustag,
                 synonyms, xrefs, chr, map_loc, desc,
                 gtype, authority_symbol, name,
                 nomenclature_status, other_designations, modification_date) = line.split('\t')

                ##### set filter=None in init if you don't want to have a filter
                #if self.filter is not None:
                #    if ((self.filter == 'taxids' and (int(tax_num) not in self.tax_ids))
                #            or (self.filter == 'geneids' and (int(gene_num) not in self.gene_ids))):
                #        continue
                ##### end filter

                if self.testMode and int(gene_num) not in self.gene_ids:
                    continue

                if int(tax_num) not in self.tax_ids:
                    continue

                line_counter += 1

                gene_id = ':'.join(('NCBIGene', gene_num))
                tax_id = ':'.join(('NCBITaxon', tax_num))
                gene_type_id = self._map_type_of_gene(gtype)

                if symbol == 'NEWENTRY':
                    label = None
                else:
                    label = symbol

                # TODO might have to figure out if things aren't genes, and make them individuals
                gu.addClassToGraph(g, gene_id, label, gene_type_id, desc)

                # we have to do special things here for genes, because they're classes not individuals
                # f = Feature(gene_id,label,gene_type_id,desc)

                if name != '-':
                    gu.addSynonym(g, gene_id, name)
                if synonyms.strip() != '-':
                    for s in synonyms.split('|'):
                        gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])
                if other_designations.strip() != '-':
                    for s in other_designations.split('|'):
                        gu.addSynonym(g, gene_id, s.strip(), Assoc.annotation_properties['hasRelatedSynonym'])

                # deal with the xrefs
                # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
                if xrefs.strip() != '-':
                    for r in xrefs.strip().split('|'):
                        fixedr = self._cleanup_id(r)
                        if fixedr is not None and fixedr.strip() != '':
                            if re.match('HPRD', fixedr):
                                # proteins are not == genes.
                                gu.addTriple(g, gene_id, self.properties['has_gene_product'], fixedr)
                            else:
                                # skip some of these for now
                                if fixedr.split(':')[0] not in ['Vega', 'IMGT/GENE-DB']:
                                    gu.addEquivalentClass(g, gene_id, fixedr)

                # edge cases of id | symbol | chr | map_loc:
                # 263     AMD1P2    X|Y  with   Xq28 and Yq12
                # 438     ASMT      X|Y  with   Xp22.3 or Yp11.3    # in PAR
                # 419     ART3      4    with   4q21.1|4p15.1-p14   # no idea why there's two bands listed - possibly 2 assemblies
                # 28227   PPP2R3B   X|Y  Xp22.33; Yp11.3            # in PAR
                # 619538  OMS     10|19|3 10q26.3;19q13.42-q13.43;3p25.3   #this is of "unknown" type == susceptibility
                # 101928066       LOC101928066    1|Un    -         # unlocated scaffold
                # 11435   Chrna1  2       2 C3|2 43.76 cM           # mouse --> 2C3
                # 11548   Adra1b  11      11 B1.1|11 25.81 cM       # mouse --> 11B1.1
                # 11717   Ampd3   7       7 57.85 cM|7 E2-E3        # mouse
                # 14421   B4galnt1        10      10 D3|10 74.5 cM  # mouse
                # 323212  wu:fb92e12      19|20   -                 # fish
                # 323368  ints10  6|18    -                         # fish
                # 323666  wu:fc06e02      11|23   -                 # fish

                # feel that the chr placement can't be trusted in this table when there is > 1 listed
                # with the exception of human X|Y, i will only take those that align to one chr

                # FIXME remove the chr mapping below when we pull in the genomic coords
                if str(chr) != '-' and str(chr) != '':
                    if re.search('\|', str(chr)) and str(chr) not in ['X|Y','X; Y']:
                        # this means that there's uncertainty in the mapping.  skip it
                        # TODO we'll need to figure out how to deal with >1 loc mapping
                        logger.info('%s is non-uniquely mapped to %s.  Skipping for now.', gene_id, str(chr))
                        continue
                        # X|Y	Xp22.33;Yp11.3

                    # if (not re.match('(\d+|(MT)|[XY]|(Un)$',str(chr).strip())):
                    #    print('odd chr=',str(chr))
                    if str(chr) == 'X; Y':
                        chr = 'X|Y'  # rewrite the PAR regions for processing
                    # do this in a loop to allow PAR regions like X|Y
                    for c in re.split('\|',str(chr)) :
                        geno.addChromosomeClass(c, tax_id, None)  # assume that the chromosome label will get added elsewhere
                        mychrom = makeChromID(c, tax_num, 'CHR')
                        mychrom_syn = makeChromLabel(c, tax_num)  # temporarily use the taxnum for the disambiguating label
                        gu.addSynonym(g, mychrom,  mychrom_syn)
                        band_match = re.match('[0-9A-Z]+[pq](\d+)?(\.\d+)?$', map_loc)
                        if band_match is not None and len(band_match.groups()) > 0:
                            # if tax_num != '9606':
                            #     continue
                            # this matches the regular kind of chrs, so make that kind of band
                            # not sure why this matches? chrX|Y or 10090chr12|Un"
                            # TODO we probably need a different regex per organism
                            # the maploc_id already has the numeric chromosome in it, strip it first
                            bid = re.sub('^'+c, '', map_loc)
                            maploc_id = makeChromID(c+bid, tax_num, 'CHR')  # the generic location (no coordinates)
                            # print(map_loc,'-->',bid,'-->',maploc_id)
                            band = Feature(maploc_id, None, None)  # Assume it's type will be added elsewhere
                            band.addFeatureToGraph(g)
                            # add the band as the containing feature
                            gu.addTriple(g, gene_id, Feature.object_properties['is_subsequence_of'], maploc_id)
                        else:
                            # TODO handle these cases
                            # examples are: 15q11-q22, Xp21.2-p11.23, 15q22-qter, 10q11.1-q24,
                            ## 12p13.3-p13.2|12p13-p12, 1p13.3|1p21.3-p13.1,  12cen-q21, 22q13.3|22q13.3
                            logger.debug('not regular band pattern for %s: %s', gene_id, map_loc)
                            # add the gene as a subsequence of the chromosome
                            gu.addTriple(g, gene_id, Feature.object_properties['is_subsequence_of'], mychrom)

                geno.addTaxon(tax_id, gene_id)

                if not self.testMode and limit is not None and line_counter > limit:
                    break

            gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
            gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
            gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
            gu.loadAllProperties(g)

        return
Пример #9
0
    def _get_identifiers(self, limit):
        """
        This will process the id mapping file provided by Biogrid.
        The file has a very large header, which we scan past,
        then pull the identifiers, and make equivalence axioms

        :param limit:
        :return:

        """

        logger.info("getting identifier mapping")
        line_counter = 0
        f = '/'.join((self.rawdir, self.files['identifiers']['file']))
        myzip = ZipFile(f, 'r')
        # assume that the first entry is the item
        fname = myzip.namelist()[0]
        foundheader = False

        gu = GraphUtils(curie_map.get())

        # TODO align this species filter with the one above
        # speciesfilters = 'H**o sapiens,Mus musculus,Drosophila melanogaster,
        # Danio rerio, Caenorhabditis elegans,Xenopus laevis'.split(',')

        speciesfilters = 'H**o sapiens,Mus musculus'.split(',')
        with myzip.open(fname, 'r') as csvfile:
            for line in csvfile:
                # skip header lines
                if not foundheader:
                    if re.match(r'BIOGRID_ID', line.decode()):
                        foundheader = True
                    continue

                line = line.decode().strip()
                # BIOGRID_ID
                # IDENTIFIER_VALUE
                # IDENTIFIER_TYPE
                # ORGANISM_OFFICIAL_NAME
                # 1	814566	ENTREZ_GENE	Arabidopsis thaliana
                (biogrid_num, id_num, id_type,
                 organism_label) = line.split('\t')

                if self.testMode:
                    g = self.testgraph
                    # skip any genes that don't match our test set
                    if int(biogrid_num) not in self.biogrid_ids:
                        continue
                else:
                    g = self.graph

                # for each one of these,
                # create the node and add equivalent classes
                biogrid_id = 'BIOGRID:'+biogrid_num
                prefix = self._map_idtype_to_prefix(id_type)

                # TODO make these filters available as commandline options
                # geneidtypefilters='NCBIGene,OMIM,MGI,FlyBase,ZFIN,MGI,HGNC,
                #                   WormBase,XenBase,ENSEMBL,miRBase'.split(',')
                geneidtypefilters = 'NCBIGene,MGI,ENSEMBL,ZFIN,HGNC'.split(',')
                # proteinidtypefilters='HPRD,Swiss-Prot,NCBIProtein'
                if (speciesfilters is not None) \
                        and (organism_label.strip() in speciesfilters):
                    line_counter += 1
                    if (geneidtypefilters is not None) \
                            and (prefix in geneidtypefilters):
                        mapped_id = ':'.join((prefix, id_num))
                        gu.addEquivalentClass(g, biogrid_id, mapped_id)
                    # this symbol will only get attached to the biogrid class
                    elif id_type == 'OFFICIAL_SYMBOL':
                        gu.addClassToGraph(g, biogrid_id, id_num)
                    # elif (id_type == 'SYNONYM'):
                    #   FIXME - i am not sure these are synonyms, altids?
                    #   gu.addSynonym(g,biogrid_id,id_num)

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        myzip.close()

        return
Пример #10
0
    def _process_omim2disease(self, limit=None):
        """
        This method maps the KEGG disease IDs to the corresponding OMIM disease IDs.
        Currently this only maps KEGG diseases and OMIM diseases that have a 1:1 mapping.

        Triples created:
        <kegg_disease_id> is a class
        <omim_disease_id> is a class
        <kegg_disease_id> hasXref <omim_disease_id>
        :param limit:
        :return:
        """

        logger.info("Processing 1:1 KEGG disease to OMIM disease mappings")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['omim2disease']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                (omim_disease_id, kegg_disease_id, link_type) = row

                kegg_disease_id = 'KEGG-'+kegg_disease_id.strip()
                omim_disease_id = re.sub('omim', 'OMIM', omim_disease_id)

                # Create hash for the links from OMIM ID -> KEGG ID
                if omim_disease_id not in self.omim_disease_hash:
                    self.omim_disease_hash[omim_disease_id] = [kegg_disease_id]
                else:
                    self.omim_disease_hash[omim_disease_id].append(kegg_disease_id)

                # Create hash for the links from KEGG ID -> OMIM ID
                if kegg_disease_id not in self.kegg_disease_hash:
                    self.kegg_disease_hash[kegg_disease_id] = [omim_disease_id]
                else:
                    self.kegg_disease_hash[kegg_disease_id].append(omim_disease_id)

        # Now process the disease hashes and only process 1:1 omim disease:KEGG disease entries.
        for omim_disease_id in self.omim_disease_hash:
            if self.testMode and omim_disease_id not in self.test_ids['disease']:
                continue

            if (not self.testMode) and (limit is not None and line_counter > limit):
                break
            line_counter += 1

            if len(self.omim_disease_hash[omim_disease_id]) == 1:
                kegg_disease_id = ''.join(self.omim_disease_hash.get(omim_disease_id))
                if len(self.kegg_disease_hash[kegg_disease_id]) == 1:
                    # add ids, and deal with the labels separately
                    gu.addClassToGraph(g, kegg_disease_id, None)
                    gu.addClassToGraph(g, omim_disease_id, None)
                    gu.addEquivalentClass(g, kegg_disease_id, omim_disease_id)  # safe?
                    # gu.addXref(g, kegg_disease_id, omim_disease_id)

        logger.info("Done with KEGG disease to OMIM disease mappings.")
        return
Пример #11
0
    def _process_omim2gene(self, limit=None):
        """
        This method maps the OMIM IDs and KEGG gene ID. Currently split based on the link_type field.
        Equivalent link types are mapped as gene XRefs.
        Reverse link types are mapped as disease to gene associations.
        Original link types are currently skipped.

        Triples created:
        <kegg_gene_id> is a Gene
        <omim_gene_id> is a Gene
        <kegg_gene_id>> hasXref <omim_gene_id>

        <assoc_id> has subject <omim_disease_id>
        <assoc_id> has object <kegg_gene_id>
        :param limit:
        :return:
        """

        logger.info("Processing OMIM to KEGG gene")
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        gu = GraphUtils(curie_map.get())
        raw = '/'.join((self.rawdir, self.files['omim2gene']['file']))
        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (kegg_gene_id, omim_id, link_type) = row

                if self.testMode and kegg_gene_id not in self.test_ids['genes']:
                    continue

                kegg_gene_id = 'KEGG-'+kegg_gene_id.strip()
                omim_id = re.sub('omim', 'OMIM', omim_id)
                if link_type == 'equivalent':
                    # these are genes!  so add them as a class then make equivalence
                    gu.addClassToGraph(g, omim_id, None)
                    geno.addGene(kegg_gene_id, None)
                    gu.addEquivalentClass(g, kegg_gene_id, omim_id)
                elif link_type == 'reverse':
                    # make an association between an OMIM ID and the KEGG gene ID
                    # we do this with omim ids because they are more atomic than KEGG ids

                    alt_locus_id = self._make_variant_locus_id(kegg_gene_id, omim_id)
                    alt_label = self.label_hash[alt_locus_id]
                    gu.addIndividualToGraph(g, alt_locus_id, alt_label, geno.genoparts['variant_locus'])
                    geno.addAlleleOfGene(alt_locus_id, kegg_gene_id)

                    # Add the disease to gene relationship.
                    rel = gu.object_properties['is_marker_for']
                    assoc = G2PAssoc(self.name, alt_locus_id, omim_id, rel)
                    assoc.add_association_to_graph(g)

                elif link_type == 'original':
                    # these are sometimes a gene, and sometimes a disease
                    logger.info('Unable to handle original link for %s-%s', kegg_gene_id, omim_id)
                else:
                    # don't know what these are
                    logger.warn('Unhandled link type for %s-%s: %s', kegg_gene_id, omim_id, link_type)

                if (not self.testMode) and (limit is not None and line_counter > limit):
                    break

        logger.info("Done with OMIM to KEGG gene")
        gu.loadProperties(g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
        gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)

        return
Пример #12
0
    def _process_genes(self, limit=None):
        gu = GraphUtils(curie_map.get())

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        geno = Genotype(g)

        raw = '/'.join((self.rawdir, self.files['genes']['file']))
        line_counter = 0
        logger.info("Processing HGNC genes")

        with open(raw, 'r', encoding="utf8") as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                (hgnc_id, symbol, name, locus_group, locus_type, status,
                 location, location_sortable, alias_symbol, alias_name,
                 prev_symbol, prev_name, gene_family, gene_family_id,
                 date_approved_reserved, date_symbol_changed,
                 date_name_changed, date_modified, entrez_id, ensembl_gene_id,
                 vega_id, ucsc_id, ena, refseq_accession, ccds_id, uniprot_ids,
                 pubmed_id, mgd_id, rgd_id, lsdb, cosmic, omim_id, mirbase,
                 homeodb, snornabase, bioparadigms_slc, orphanet,
                 pseudogene_org, horde_id, merops, imgt, iuphar,
                 kznf_gene_catalog, mamit_trnadb, cd, lncrnadb, enzyme_id,
                 intermediate_filament_db) = row

                line_counter += 1

                # skip header
                if line_counter <= 1:
                    continue

                if self.testMode and entrez_id != '' \
                        and int(entrez_id) not in self.gene_ids:
                    continue

                if name == '':
                    name = None
                gene_type_id = self._get_gene_type(locus_type)
                gu.addClassToGraph(g, hgnc_id, symbol, gene_type_id, name)
                if locus_type == 'withdrawn':
                    gu.addDeprecatedClass(g, hgnc_id)
                if entrez_id != '':
                    gu.addEquivalentClass(
                        g, hgnc_id, 'NCBIGene:' + entrez_id)
                if ensembl_gene_id != '':
                    gu.addEquivalentClass(
                        g, hgnc_id, 'ENSEMBL:' + ensembl_gene_id)
                geno.addTaxon('NCBITaxon:9606', hgnc_id)

                # add pubs as "is about"
                if pubmed_id != '':
                    for p in re.split(r'\|', pubmed_id.strip()):
                        if str(p) != '':
                            gu.addTriple(
                                g, 'PMID:' + str(p.strip()),
                                gu.object_properties['is_about'], hgnc_id)

                # add chr location
                # sometimes two are listed, like: 10p11.2 or 17q25
                # -- there are only 2 of these FRA10A and MPFD
                # sometimes listed like "1 not on reference assembly"
                # sometimes listed like 10q24.1-q24.3
                # sometimes like 11q11 alternate reference locus
                band = chrom = None
                chr_pattern = r'(\d+|X|Y|Z|W|MT)[pq$]'
                chr_match = re.match(chr_pattern, location)
                if chr_match is not None and len(chr_match.groups()) > 0:
                    chrom = chr_match.group(1)
                    chrom_id = makeChromID(chrom, 'NCBITaxon:9606', 'CHR')
                    band_pattern = r'([pq][A-H\d]?\d?(?:\.\d+)?)'
                    band_match = re.search(band_pattern, location)
                    f = Feature(hgnc_id, None, None)
                    if band_match is not None and len(band_match.groups()) > 0:
                        band = band_match.group(1)
                        band = chrom + band
                        # add the chr band as the parent to this gene
                        # as a feature but assume that the band is created
                        # as a class with properties elsewhere in Monochrom
                        # TEC Monoch? Monarchdom??
                        band_id = makeChromID(band, 'NCBITaxon:9606', 'CHR')
                        gu.addClassToGraph(g, band_id, None)
                        f.addSubsequenceOfFeature(g, band_id)
                    else:
                        gu.addClassToGraph(g, chrom_id, None)
                        f.addSubsequenceOfFeature(g, chrom_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

            # end loop through file

        gu.loadProperties(g, Feature.object_properties, gu.OBJPROP)
        gu.loadProperties(g, Feature.data_properties, gu.DATAPROP)
        gu.loadProperties(g, Genotype.object_properties, gu.OBJPROP)
        gu.loadAllProperties(g)

        return
Пример #13
0
    def _process_trait_mappings(self, raw, limit=None):
        """
        This method

        Triples created:

        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0

        gu = GraphUtils(curie_map.get())

        # with open(raw, 'r') as csvfile:
        #     filereader = csv.reader(csvfile, delimiter=',')
        #     row_count = sum(1 for row in filereader)
        #     row_count = row_count - 1

        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip header line
            for row in filereader:
                line_counter += 1
                # need to skip the last line
                if len(row) < 8:
                    logger.info("skipping line %d: %s", line_counter, '\t'.join(row))
                    continue
                (vto_id, pto_id, cmo_id, ato_column, species, trait_class, trait_type, qtl_count) = row

                ato_id = re.sub('ATO #', 'AQTLTrait:', re.sub('\].*', '', re.sub('\[', '', ato_column)))

                ato_label = re.sub('.*\]\s*', '', ato_column)
                # if species == 'Cattle':
                #     ato_id = re.sub('ATO:', 'AQTLTraitCattle:', ato_id)
                # elif species == 'Chicken':
                #     ato_id = re.sub('ATO:', 'AQTLTraitChicken:', ato_id)
                # elif species == 'Sheep':
                #     ato_id = re.sub('ATO:', 'AQTLTraitSheep:', ato_id)
                # elif species == 'Horse':
                #     ato_id = re.sub('ATO:', 'AQTLTraitHorse:', ato_id)
                # elif species == 'Pig':
                #     ato_id = re.sub('ATO:', 'AQTLTraitPig:', ato_id)
                # elif species == 'Rainbow trout':
                #     ato_id = re.sub('ATO:', 'AQTLTraitRainbowTrout:', ato_id)
                # else:
                #     logger.warn(' Unknown species %s found in trait mapping file.', species)
                #     continue
                #print(ato_label)

                gu.addClassToGraph(g, ato_id, ato_label.strip())

                if re.match('VT:.*', vto_id):
                    gu.addClassToGraph(g, vto_id, None)
                    gu.addEquivalentClass(g, ato_id, vto_id)
                if re.match('PT:.*', pto_id):
                    gu.addClassToGraph(g, pto_id, None)
                    gu.addEquivalentClass(g, ato_id, pto_id)
                if re.match('CMO:.*', cmo_id):
                    gu.addClassToGraph(g, cmo_id, None)
                    gu.addXref(g, ato_id, cmo_id)

        logger.info("Done with trait mappings")
        return
Пример #14
0
    def _process_diseasegene(self, limit):
        """
        :param limit:
        :return:
        """
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        line_counter = 0
        geno = Genotype(g)
        gu = GraphUtils(curie_map.get())

        myfile = '/'.join((self.rawdir, self.files['disease-gene']['file']))

        # PYLINT complains iterparse deprecated,
        # but as of py 3.4 only the optional & unsupplied parse arg is.
        for event, elem in ET.iterparse(myfile):
            if elem.tag == 'Disorder':
                # get the element name and id, ignoreS element name
                # id = elem.get('id') # some internal identifier
                disorder_num = elem.find('OrphaNumber').text

                disorder_id = 'Orphanet:'+str(disorder_num)

                if self.testMode and \
                        disorder_id not in \
                        config.get_config()['test_ids']['disease']:
                    continue

                disorder_label = elem.find('Name').text

                # make a hash of internal gene id to type for later lookup
                gene_iid_to_type = {}
                gene_list = elem.find('GeneList')
                for gene in gene_list.findall('Gene'):
                    gene_iid = gene.get('id')
                    gene_type = gene.find('GeneType').get('id')
                    gene_iid_to_type[gene_iid] = gene_type

                # assuming that these are in the ontology
                gu.addClassToGraph(g, disorder_id, disorder_label)

                assoc_list = elem.find('DisorderGeneAssociationList')
                for a in assoc_list.findall('DisorderGeneAssociation'):
                    gene_iid = a.find('.//Gene').get('id')
                    gene_name = a.find('.//Gene/Name').text
                    gene_symbol = a.find('.//Gene/Symbol').text
                    gene_num = a.find('./Gene/OrphaNumber').text
                    gene_id = 'Orphanet:'+str(gene_num)
                    gene_type_id = \
                        self._map_gene_type_id(gene_iid_to_type[gene_iid])
                    gu.addClassToGraph(
                        g, gene_id, gene_symbol, gene_type_id, gene_name)
                    syn_list = a.find('./Gene/SynonymList')
                    if int(syn_list.get('count')) > 0:
                        for s in syn_list.findall('./Synonym'):
                            gu.addSynonym(g, gene_id, s.text)

                    dgtype = a.find('DisorderGeneAssociationType').get('id')
                    rel_id = self._map_rel_id(dgtype)
                    dg_label = \
                        a.find('./DisorderGeneAssociationType/Name').text
                    if rel_id is None:
                        logger.warning(
                            "Cannot map association type (%s) to RO " +
                            "for association (%s | %s).  Skipping.",
                            dg_label, disorder_label, gene_symbol)
                        continue

                    alt_locus_id = '_'+gene_num+'-'+disorder_num+'VL'
                    alt_label = \
                        ' '.join(('some variant of', gene_symbol.strip(),
                                  'that is a', dg_label.lower(),
                                  disorder_label))
                    if self.nobnodes:
                        alt_locus_id = ':'+alt_locus_id
                    gu.addIndividualToGraph(g, alt_locus_id, alt_label,
                                            geno.genoparts['variant_locus'])
                    geno.addAlleleOfGene(alt_locus_id, gene_id)

                    # consider typing the gain/loss-of-function variants like:
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002054
                    # http://sequenceontology.org/browser/current_svn/term/SO:0002053

                    # use "assessed" status to issue an evidence code
                    # FIXME I think that these codes are sub-optimal
                    status_code = \
                        a.find('DisorderGeneAssociationStatus').get('id')
                    # imported automatically asserted information
                    # used in automatic assertion
                    eco_id = 'ECO:0000323'
                    # Assessed
                    # TODO are these internal ids stable between releases?
                    if status_code == '17991':
                        # imported manually asserted information
                        # used in automatic assertion
                        eco_id = 'ECO:0000322'
                    # Non-traceable author statement ECO_0000034
                    # imported information in automatic assertion ECO_0000313

                    assoc = G2PAssoc(self.name, alt_locus_id,
                                     disorder_id, rel_id)
                    assoc.add_evidence(eco_id)
                    assoc.add_association_to_graph(g)

                    rlist = a.find('./Gene/ExternalReferenceList')
                    eqid = None

                    for r in rlist.findall('ExternalReference'):
                        if r.find('Source').text == 'Ensembl':
                            eqid = 'ENSEMBL:'+r.find('Reference').text
                        elif r.find('Source').text == 'HGNC':
                            eqid = 'HGNC:'+r.find('Reference').text
                        elif r.find('Source').text == 'OMIM':
                            eqid = 'OMIM:'+r.find('Reference').text
                        else:
                            pass  # skip the others for now
                        if eqid is not None:
                            gu.addClassToGraph(g, eqid, None)
                            gu.addEquivalentClass(g, gene_id, eqid)
                elem.clear()  # discard the element

            if self.testMode and limit is not None and line_counter > limit:
                return

        gu.loadProperties(
            g, G2PAssoc.annotation_properties, G2PAssoc.ANNOTPROP)
        gu.loadProperties(g, G2PAssoc.datatype_properties, G2PAssoc.DATAPROP)
        gu.loadProperties(g, G2PAssoc.object_properties, G2PAssoc.OBJECTPROP)
        gu.loadAllProperties(g)

        return