Exemplo n.º 1
0
    def __init__(self, gobo, fileName=fileLocation + "/hpp12_hp_xref"):
        """Load the cross-reference table and the gene ontology.

        gobo is handed unchanged to GeneOntology. fileName is parsed with
        DataFrame.parseFromFile and every row is stored in self.infos under
        its 'GeneIdentity.GENE_NAME' value. When a sibling file named
        fileName + "_add" exists, it is parsed as well and its GOID values
        are grouped per XREF in self.add_infos.
        """

        self.df = DataFrame.parseFromFile(fileName)

        # Row lookup by gene name; populated further down.
        self.infos = {}
        # XREF -> list of GOIDs; stays empty when no "_add" file exists.
        self.add_infos = defaultdict(list)
        # Short label -> identifier string (presumably the matching column
        # name of the table -- confirm against the users of self.xrefs).
        self.xrefs = dict(
            Uniprot='GeneIdentity.UNIPROT',
            GO='GeneIdentity.GO_ID',
            Pfam='GeneIdentity.PFAM',
            Interpro='GeneIdentity.INTERPRO',
        )

        self.go = GeneOntology(gobo)

        # A later row with the same gene name replaces the earlier one.
        self.infos.update(
            (entry['GeneIdentity.GENE_NAME'], entry) for entry in self.df)

        extraFile = fileName + "_add"
        if os.path.exists(extraFile):
            self.df_add = DataFrame.parseFromFile(extraFile)
            for entry in self.df_add:
                self.add_infos[entry['XREF']].append(entry['GOID'])
Exemplo n.º 2
0
    def loadFromFile(cls, filepath, ltype='gene', rtype='mirna',normGeneSymbols=None):
        """Build a MiRNADiseaseDB from a mir2disease table.

        Every row whose miRNA harmonizes to organism 'hsa' or 'mmu' becomes
        one MiRNADiseaseEntry. The left-hand entity id is always the empty
        string, the right-hand id is the harmonized miRNA name.
        normGeneSymbols is accepted but not used by this loader.

        Returns the populated MiRNADiseaseDB.
        """

        db = MiRNADiseaseDB(ltype, rtype)
        sourceName = os.path.basename(filepath)

        table = DataFrame.parseFromFile(filepath, bConvertTextToNumber=False)

        # Column layout of the input, with one sample row:
        # mirna   disease effect  measurement     year    title   pmid    doid
        # hsa-let-7f-2    kidney cancer   up-regulated    microarray      2007    Micro-RNA profiling in kidney and bladder cancers.      17826655        DOID:263

        for rowIdx, record in enumerate(table):

            rawMirna = record['mirna']
            descr = record['disease']
            doid = record['doid']
            pmid = record['pmid']
            effect = record['effect']
            measure = record['measurement']

            org, rid = cls.harmonizeMIRNA(rawMirna)

            # Only human and mouse miRNAs are kept.
            if org not in ('hsa', 'mmu'):
                continue

            # The data id is the file name plus the running row number.
            entry = MiRNADiseaseEntry(
                ("", ltype), (rid, rtype), doid, descr, pmid, effect,
                measure, {org}, "mir2disease", sourceName + "_" + str(rowIdx))

            db.ltype2rel[""].add(entry)
            db.rtype2rel[rid].add(entry)

            db.all_ltypes.add("")
            db.all_rtypes.add(rid)

        return db
Exemplo n.º 3
0
    def loadFromFile(cls,
                     filepath,
                     symbol2ens,
                     ltype='mirna',
                     rtype='lncrna',
                     org="mmu"):
        """Build a MirandaRelDB from a miranda alignment table.

        Args:
            filepath: table parsed with DataFrame.parseFromFile.
            symbol2ens: object offering get_symbol_from_ens(org, id); it is
                only used to decide whether a target id is known.
            ltype: entity type stored for the miRNA side.
            rtype: entity type stored for the target side.
            org: kept for interface compatibility. The value is replaced per
                row by the organism that harmonizeMIRNA derives from the
                miRNA name, so the argument itself has no effect.

        Returns:
            The populated MirandaRelDB.
        """

        ret = MirandaRelDB(ltype, rtype)

        mirandaEvidences = DataFrame.parseFromFile(filepath,
                                                   bConvertTextToNumber=False)

        for mirtEntry in mirandaEvidences:

            # Rebinds the `org` parameter (see docstring).
            org, lid = cls.harmonizeMIRNA(mirtEntry['Name_miRNA'])

            # Keep only the part before the first "." of the target id
            # (presumably a version suffix -- confirm with the data).
            rid = mirtEntry['Name_gene'].split(".", 1)[0]

            if org not in ('hsa', 'mmu'):
                continue

            # Targets are kept when the lookup knows them or when the id
            # starts with 'LNC'.
            # NOTE(review): the lookup result itself was always discarded
            # and the raw id used instead; that behaviour is preserved
            # here. Confirm whether the resolved symbol should be used.
            known = symbol2ens.get_symbol_from_ens(org, rid)
            if known is None and not rid.startswith('LNC'):
                continue

            # Previously `orgs` was a second element of the list given to
            # set(), which raises "TypeError: unhashable type: 'list'".
            # NOTE(review): it is now the final constructor argument, as in
            # MirTarBaseRel -- confirm against MirandaRel.__init__.
            rel = MirandaRel(
                (lid, ltype), (rid, rtype), 'miranda',
                mirtEntry['Name_transcript'],
                mirtEntry['align_score'],
                mirtEntry['energy'],
                mirtEntry['mirna_start'],
                mirtEntry['mirna_end'],
                mirtEntry['lnc_start'],
                mirtEntry['lnc_end'],
                mirtEntry['align_len'],
                mirtEntry['mirna_iden'],
                mirtEntry['lncrna_iden'],
                mirtEntry['mirna_alignment'],
                mirtEntry['alignment'],
                mirtEntry['lncrna_alignment'],
                [org])

            ret.ltype2rel[lid].add(rel)
            ret.rtype2rel[rid].add(rel)

            ret.all_ltypes.add(lid)
            ret.all_rtypes.add(rid)

        return ret
Exemplo n.º 4
0
    def loadFromFile(cls, filepath, dbtype='pmid', normGeneSymbols=None):
        """Build a DIANATarbaseDB from a DIANA table.

        Two helper files are read from the directory of filepath:
        "celllines.all.syn" (loaded through Synfile) and "celllines.index"
        (tab separated: word, then a "x:<synonym index>" reference).

        Args:
            filepath: table parsed with DataFrame.parseFromFile.
            dbtype: accepted for interface compatibility; not used here.
            normGeneSymbols: optional mapping from upper-cased gene symbol
                to its normalized form. None means "no normalization".

        Returns:
            A tuple (db, foundCellInfos). foundCellInfos holds one dict per
            (row, cell-line term) match with the keys 'docid', 'termid',
            'termname' and 'evidences'.
        """

        # The None default used to crash on `geneID in normGeneSymbols`.
        if normGeneSymbols is None:
            normGeneSymbols = {}

        baseDir = os.path.dirname(filepath)
        syns = Synfile(baseDir + "/celllines.all.syn")

        # Cell-line word -> ids of the synonym entries it refers to.
        cellline2obo = defaultdict(set)

        with open(baseDir + "/celllines.index", 'r') as fin:

            for line in fin:
                line = line.strip()

                # A blank (e.g. trailing) line has no second column.
                if not line:
                    continue

                fields = line.split("\t")

                word = fields[0].strip()
                synIndex = int(fields[1].split(":")[1])

                cellline2obo[word].add(syns.get(synIndex).id)

        ret = DIANATarbaseDB("gene", "mirna")

        # The first key used to be a censored 'H**o sapiens', which cannot
        # match a species name, so human rows were skipped.
        species2org = {'Homo sapiens': "hsa", 'Mus musculus': "mmu"}

        dianaData = DataFrame.parseFromFile(filepath)

        foundCellInfos = []
        # Raw miRNA name -> (org, harmonized name); avoids re-harmonizing.
        seenMirnas = {}

        for idx, row in enumerate(dianaData):
            # Column layout of the input, with one sample row:
            # geneId  geneName        mirna   species cell_line       tissue  category        method  positive_negative       direct_indirect up_down condition
            # 0910001A06Rik   0910001A06Rik   mmu-miR-124-3p  Mus musculus    NA      NA      NA      Microarrays     POSITIVE        INDIRECT        UP      NA

            species = row['species']

            if species not in species2org:
                continue

            # "NA" marks a missing value in these four columns.
            cellline, tissue, method, measure = (
                None if value == "NA" else value
                for value in (row['cell_line'], row['tissue'],
                              row['method'], row['direct_indirect']))

            # 'up_down' is passed on unchanged, including a literal "NA".
            direction = row['up_down']

            rawMirna = row['mirna']

            if rawMirna not in seenMirnas:
                org, mirna = cls.harmonizeMIRNA(rawMirna)
                seenMirnas[rawMirna] = (org, mirna)

            org, mirna = seenMirnas[rawMirna]

            # Organisms come from the miRNA name and the species column.
            docOrgs = {species2org[species]}

            if org is not None:
                docOrgs.add(org)

            geneID = row['geneName'].upper()
            if geneID in normGeneSymbols:
                geneID = normGeneSymbols[geneID]

            docID = "DIANA:" + str(idx)

            # A cell line of None never matches, so this is skipped then.
            if cellline in cellline2obo:

                for oboID in cellline2obo[cellline]:
                    foundCellInfos.append({
                        'docid': docID,
                        'termid': oboID,
                        'termname': cellline,
                        'evidences': []
                    })

            # Sorted so the tuple does not depend on set iteration order.
            entry = DIANATarbaseEntry(
                (tuple(sorted(docOrgs)), cellline, tissue, method, measure,
                 direction),
                docID, (geneID, "gene"), (mirna, "mirna"), "DIANA", idx)

            ret.ltype2rel[geneID].add(entry)
            ret.rtype2rel[mirna].add(entry)

            ret.all_ltypes.add(geneID)
            ret.all_rtypes.add(mirna)

        return ret, foundCellInfos
Exemplo n.º 5
0
    def loadFromFile(cls, basePath):
        """Build a SymbolEnsemblDB for mouse ('mmu') and human ('hsa').

        Reads three files below basePath:
          * MRK_Sequence.rpt        -- mouse marker symbols and their
                                       Ensembl transcript ids
          * mmu_gene_transcript.txt -- tab separated gene id, transcript id
          * hgnc_ext.tsv            -- human symbols and Ensembl gene ids

        Symbols are upper-cased. For each organism the symbol -> gene ids
        relation is stored in ret.org2convert and the gene id -> symbol
        relation in ret.org2ens2symbol.

        Returns the populated SymbolEnsemblDB.
        """

        ret = SymbolEnsemblDB()

        # ---- mouse ----------------------------------------------------
        MGIdata = DataFrame.parseFromFile(basePath + "/MRK_Sequence.rpt")

        transcript2gene = {}

        with open(basePath + "/mmu_gene_transcript.txt") as fin:

            for line in fin:
                aline = line.strip().split("\t")

                # Blank or truncated lines carry no gene/transcript pair.
                if len(aline) < 2:
                    continue

                transcript2gene[aline[1]] = aline[0]

        for row in MGIdata:

            symbol = row['Marker Symbol']
            transcriptField = row['Ensembl transcript IDs']

            # str.split never returns an empty list, so the old
            # `len(...) == 0` guard could not fire; a missing cell (None)
            # crashed instead. Skip such rows explicitly.
            if symbol is None or transcriptField is None:
                continue

            symbol = symbol.upper()

            # Transcripts without a known gene are ignored.
            egenes = {
                transcript2gene[etran]
                for etran in transcriptField.split("|")
                if etran in transcript2gene
            }

            for geneid in egenes:
                ret.org2convert['mmu'][symbol].add(geneid)
                ret.org2ens2symbol['mmu'][geneid] = symbol

        # ---- human ----------------------------------------------------
        hgncData = DataFrame.parseFromFile(basePath + "/hgnc_ext.tsv")

        for row in hgncData:

            symbol = row['Approved Symbol']
            ensemblGeneID = row['Ensembl ID(supplied by Ensembl)']

            if symbol is None or ensemblGeneID is None:
                continue

            symbol = symbol.upper()

            ret.org2convert['hsa'][symbol].add(ensemblGeneID)
            ret.org2ens2symbol['hsa'][ensemblGeneID] = symbol

        return ret
Exemplo n.º 6
0
from collections import Counter
import sys, os

sys.path.append(os.path.dirname(__file__) + "/../")

from utils.idutils import loadExludeWords
from utils.DataFrame import DataFrame

from utils.idutils import printToFile
from synonymes.Synonym import Synonym
from synonymes.SynonymUtils import handleCommonExcludeWords

# Script entry point: loads the MGI marker report and prepares the column
# indices and empty containers used by the rest of the script (the code that
# fills them is not part of this excerpt).
if __name__ == '__main__':

    # NOTE(review): absolute, machine-specific input path.
    MGIdata = DataFrame.parseFromFile(
        "/mnt/d/owncloud/data/miRExplore/MRK_Sequence.rpt")

    # Column positions, resolved by header name.
    mgiID = MGIdata.getColumnIndex("MGI Marker Accession ID")
    mgiSym = MGIdata.getColumnIndex("Marker Symbol")
    #mgiSyn = MGIdata.getColumnIndex("Marker Synonyms (pipe-separated)")
    mgiUniprot = MGIdata.getColumnIndex("UniProt IDs")

    # Hard-coded position instead of a header lookup -- presumably the
    # gene/feature type column; TODO confirm against the report layout.
    mgiGeneType = 20

    # Empty accumulators; nothing in this excerpt writes to them.
    MGIinfo = {}
    foundUniprotIDs = set()
    uniprotID2MGI = {}

    locID2sym = {}

    mirIDs = set()
Exemplo n.º 7
0
    def loadFromFile(cls,
                     filepath,
                     ltype='gene',
                     rtype='mirna',
                     normGeneSymbols=None,
                     getDocs=False):
        """Build a MirTarBaseDB from a miRTarBase table.

        Args:
            filepath: table parsed with DataFrame.parseFromFile (text is
                not converted to numbers).
            ltype: entity type stored for the target gene.
            rtype: entity type stored for the miRNA.
            normGeneSymbols: optional mapping from upper-cased gene symbol
                to its normalized form. None means "no normalization".
            getDocs: when true, also return the set of referenced
                document ids.

        Returns:
            The populated MirTarBaseDB, or (db, docs) when getDocs is set.
        """

        # The None default used to crash on `lid in normGeneSymbols`.
        if normGeneSymbols is None:
            normGeneSymbols = {}

        ret = MirTarBaseDB(ltype, rtype)

        mirtarbaseEvidences = DataFrame.parseFromFile(
            filepath, bConvertTextToNumber=False)

        # Raw miRNA name -> (org, harmonized name); avoids re-harmonizing.
        seenMirnas = {}
        geneSymbolsNormalized = 0

        docs = set()

        for mirtEntry in mirtarbaseEvidences:

            lid = mirtEntry['Target Gene'].upper()

            if lid in normGeneSymbols:
                lid = normGeneSymbols[lid]
                # Was `= 0`, so the reported count could never leave zero.
                geneSymbolsNormalized += 1

            rawMirna = mirtEntry['miRNA']

            if rawMirna not in seenMirnas:
                org, rid = cls.harmonizeMIRNA(rawMirna)
                seenMirnas[rawMirna] = (org, rid)

            org, rid = seenMirnas[rawMirna]

            if org not in ('hsa', 'mmu'):
                continue

            # Organisms come from the miRNA name and the species column.
            orgs = {org}

            organism = mirtEntry['Species (miRNA)']

            # The first literal used to be a censored 'H**o sapiens',
            # which cannot match a species name.
            if organism == 'Homo sapiens':
                orgs.add('hsa')
            elif organism == 'Mus musculus':
                orgs.add('mmu')

            dataID = mirtEntry['miRTarBase ID']

            docID = mirtEntry['References (PMID)']
            # Several experiments are separated by "//" in one cell.
            expSupport = mirtEntry['Experiments'].split("//")
            supType = mirtEntry['Support Type']

            # Missing or empty references are not collected.
            if docID:
                docs.add(docID)

            rel = MirTarBaseRel((lid, ltype), (rid, rtype), 'miRTarBase',
                                dataID, expSupport, supType, docID, orgs)

            ret.ltype2rel[lid].add(rel)
            ret.rtype2rel[rid].add(rel)

            ret.all_ltypes.add(lid)
            ret.all_rtypes.add(rid)

        print("Gene Symbols Normalized", geneSymbolsNormalized)

        if getDocs:
            return ret, docs

        return ret
Exemplo n.º 8
0
    def loadFromFolder(
            cls,
            baseFolder="/mnt/t/ownCloud/data/miRExplore/obodir/map_ncit_syms/"):
        """Build an NcitTermSymbolDB from the mapping files in a folder.

        The chain followed is NCIt code -> SwissProt id -> human Ensembl
        gene -> HGNC symbol, and from the human Ensembl gene through the
        orthologue table to mouse Ensembl gene -> MGI symbol.

        Args:
            baseFolder: directory holding NCIt-SwissProt_Mapping.txt,
                human_mouse_orthologues_ensembl.tsv,
                ensembl_hgnc_uniprot.txt, ensembl_mgi_hgnc.tsv and
                add_swissprot_ensembl. Defaults to the previously
                hard-coded location.

        Returns:
            The populated NcitTermSymbolDB; symbols are stored in
            org_term2symbol['hsa'] and org_term2symbol['mmu'].

        The table headers and the number of terms with both a human and a
        mouse symbol are printed as a side effect.
        """

        # File names are appended directly, so a separator is required.
        if baseFolder and not baseFolder.endswith(("/", "\\")):
            baseFolder += "/"

        ncit2swissprotDF = DataFrame.parseFromFile(
            baseFolder + "NCIt-SwissProt_Mapping.txt")
        hsa_mmu_orthologuesDF = DataFrame.parseFromFile(
            baseFolder + "human_mouse_orthologues_ensembl.tsv")
        ensembl2hgncDF = DataFrame.parseFromFile(baseFolder +
                                                 "ensembl_hgnc_uniprot.txt")
        ensembl2mgiDF = DataFrame.parseFromFile(baseFolder +
                                                "ensembl_mgi_hgnc.tsv")

        addUniprotEnsemblDF = DataFrame.parseFromFile(baseFolder +
                                                      "add_swissprot_ensembl")

        print("ncit2swissprot")
        print(ncit2swissprotDF.getHeader())

        print("hsa_mmu_orthologues")
        print(hsa_mmu_orthologuesDF.getHeader())

        print("ensembl2hgnc")
        print(ensembl2hgncDF.getHeader())

        print("ensembl2mgi")
        print(ensembl2mgiDF.getHeader())

        # NCIt code -> SwissProt id (a later row replaces an earlier one).
        ncit2swissprot = {
            row['NCIt Code']: row['SwissProt ID']
            for row in ncit2swissprotDF
        }

        # One pass over the HGNC table fills both the SwissProt -> Ensembl
        # index and the Ensembl -> HGNC symbol index.
        swissprot2ensembl = defaultdict(set)
        hsaEnsembl2Sym = {}

        for row in ensembl2hgncDF:
            ensemblGeneID = row['Gene stable ID']

            swissprot2ensembl[row['UniProtKB/Swiss-Prot ID']].add(
                ensemblGeneID)

            hgncSymbol = row['HGNC symbol']
            if hgncSymbol:
                hsaEnsembl2Sym[ensemblGeneID] = hgncSymbol

        # Additional pairs; here 'From' is the Ensembl id and 'To' the
        # SwissProt id.
        for row in addUniprotEnsemblDF:
            swissprot2ensembl[row['To']].add(row['From'])

        mmuEnsembl2Sym = {}
        for row in ensembl2mgiDF:
            mgiSymbol = row['MGI symbol']

            if mgiSymbol:
                mmuEnsembl2Sym[row['Gene stable ID']] = mgiSymbol

        hsaEnsembl2mmuEnsembl = defaultdict(set)
        for row in hsa_mmu_orthologuesDF:
            hsaEnsembl2mmuEnsembl[row['Gene stable ID']].add(
                row['Mouse gene stable ID'])

        retDB = NcitTermSymbolDB()

        for ncitID, swissprotID in ncit2swissprot.items():

            hsaEnsembls = swissprot2ensembl.get(swissprotID)

            # No Ensembl gene known for this SwissProt id.
            if hsaEnsembls is None:
                continue

            for hsaEnsembl in hsaEnsembls:
                hsaSym = hsaEnsembl2Sym.get(hsaEnsembl)

                # Without a human symbol the gene is skipped entirely,
                # including its mouse orthologues.
                if hsaSym is None:
                    continue

                retDB.org_term2symbol['hsa'][ncitID].add(hsaSym)

                for mmuID in hsaEnsembl2mmuEnsembl.get(hsaEnsembl, ()):
                    mmuSym = mmuEnsembl2Sym.get(mmuID)

                    if mmuSym is None:
                        continue

                    retDB.org_term2symbol['mmu'][ncitID].add(mmuSym)

        # Count the table rows whose term has symbols for both organisms.
        wellAnnotated = 0
        for row in ncit2swissprotDF:
            ncitID = row['NCIt Code']

            hsaRes = retDB.org_term2symbol['hsa'].get(ncitID, None)
            mmuRes = retDB.org_term2symbol['mmu'].get(ncitID, None)

            if hsaRes is not None and mmuRes is not None:
                wellAnnotated += 1

        print("well annotated:", wellAnnotated)

        return retDB