예제 #1
0
def __addGene(i, geneSym, geneTrait, __ENABLE_GENE_VERIFICATION, __ENABLE_GENE_UPDATES, updatedGeneSet, invalidGeneSet):
    if __ENABLE_GENE_UPDATES:
        parent = geneDB.findUpdatedSymbol(geneSym)
        if parent != None:
            if __DEBUG > 1:
                print "Replacing",geneSym,"with",parent
            updatedGeneSet.add(geneSym)
            geneSym = parent
    
    if not __ENABLE_GENE_VERIFICATION or geneDB.isApproved(geneSym):
        __geneSet.add(geneSym)
        
        try:
            __studyGenes[i].add(geneSym)
        except KeyError:
            __studyGenes[i] = set([geneSym])
        
        try:
            __traitDict[geneSym].add(geneTrait)
        except KeyError:
            __traitDict[geneSym] = set([geneTrait])
            
        try:
            __studyByGene[geneSym].add(i)
        except KeyError:
            __studyByGene[geneSym] = set([i])
    elif __ENABLE_GENE_VERIFICATION:
       invalidGeneSet.add(geneSym)
예제 #2
0
def removeInvalidGenes(geneSet):
    remove = []
    for gene in geneSet:
        if not geneDB.isApproved(gene):
            if __DEBUG>2:
                print "Gene:", gene,"is not valid..."
            remove.append(gene)
            
    invalid_file = open(os.sep.join(["results","log","invalid_genelist.txt"]),'w')
    for r in remove:
        geneSet.remove(r)
        invalid_file.write(r + "\n")
    invalid_file.close()
        
    if __DEBUG>0:
        print "\n----------------------------"
        print "Gene Set Correction Results:"
        print "Removed:     ", len(remove)
        print "----------------------------\n"
예제 #3
0
def removeInvalidGenes(geneSet):
    remove = []
    for gene in geneSet:
        if not geneDB.isApproved(gene):
            if __DEBUG > 2:
                print "Gene:", gene, "is not valid..."
            remove.append(gene)

    invalid_file = open(
        os.sep.join(["results", "log", "invalid_genelist.txt"]), 'w')
    for r in remove:
        geneSet.remove(r)
        invalid_file.write(r + "\n")
    invalid_file.close()

    if __DEBUG > 0:
        print "\n----------------------------"
        print "Gene Set Correction Results:"
        print "Removed:     ", len(remove)
        print "----------------------------\n"
def mapTargetNames(targets_file,
                   __ENABLE_GENE_UPDATES=1,
                   __ENABLE_GENE_VERIFICATION=1):
    global __targets_unnamed, __targets, __target_names, __geneSet

    __targetCatalogue = pyCSV()
    __targetCatalogue.load(targets_file)
    rejectednames = 0

    representedids = set([])

    for r in xrange(1, __targetCatalogue.rows + 1):
        targetId = int(__targetCatalogue.get(r, 0))
        targetGene = geneUtils.formatGeneSymbol(__targetCatalogue.get(r, 2))

        representedids.add(targetId)

        if targetGene != None:
            parentSym = geneDB.findUpdatedSymbol(targetGene)
            if __ENABLE_GENE_UPDATES and parentSym != None:
                if __DEBUG > 1:
                    print "Updated:", targetGene, "to:", parentSym
                targetGene = parentSym
            if __ENABLE_GENE_VERIFICATION and not geneDB.isApproved(
                    targetGene):
                if __DEBUG > 1:
                    print "Rejected:", targetGene
                rejectednames += 1
                continue

            __geneSet.add(targetGene)
            __target_names[targetId] = targetGene

    for targetId in __targets:
        if targetId not in __target_names.keys():
            __targets_unnamed += 1

    if __DEBUG > 0:
        print "Rejected:        ", rejectednames
        print "Unrepresented:   ", len(set(__targets.keys()) - representedids)

    __drugSet = set([])
    for drugbankid in __drugs:
        drug = __drugs[drugbankid]

        for target in drug['targets']:
            targetId = target['partner']
            if targetId in __target_names:
                targetGene = __target_names[targetId]

                __drugSet.add(drugbankid)
                try:
                    __drugDict[targetGene].add(drugbankid)
                except KeyError:
                    __drugDict[targetGene] = set([drugbankid])

    removable_drugs = set([])
    for drugbankid in __drugs:
        if drugbankid not in __drugSet:
            removable_drugs.add(drugbankid)

    for drugbankid in removable_drugs:
        del __drugs[drugbankid]

    lenbefore = len(__geneSet)
    __geneSet = __geneSet & set(__drugDict.keys())
    lenafter = len(__geneSet)
    if __DEBUG > 0:
        print "Removed", (lenafter - lenbefore), "untargeted gene names"

    print "Total drugs with targets:   ", len(__drugSet), len(__drugs)
    print "Total geneset size:         ", len(__geneSet)
def mapTargetNames(targets_file,__ENABLE_GENE_UPDATES=1,__ENABLE_GENE_VERIFICATION=1):
    global __targets_unnamed, __targets, __target_names, __geneSet
    
    __targetCatalogue = pyCSV()
    __targetCatalogue.load(targets_file)
    rejectednames = 0

    representedids = set([])

    for r in xrange(1, __targetCatalogue.rows+1):
        targetId   = int(__targetCatalogue.get(r,0))
        targetGene = geneUtils.formatGeneSymbol(__targetCatalogue.get(r,2))
        
        representedids.add(targetId)

        if targetGene != None:
            parentSym = geneDB.findUpdatedSymbol(targetGene)
            if __ENABLE_GENE_UPDATES and parentSym != None:
                if __DEBUG>1:
                    print "Updated:", targetGene, "to:", parentSym
                targetGene = parentSym
            if __ENABLE_GENE_VERIFICATION and not geneDB.isApproved(targetGene):
                if __DEBUG>1:
                    print "Rejected:", targetGene
                rejectednames+=1
                continue

            __geneSet.add(targetGene)
            __target_names[targetId] = targetGene

    
    for targetId in __targets:
        if targetId not in __target_names.keys():
            __targets_unnamed += 1

    if __DEBUG>0:
        print "Rejected:        ", rejectednames
        print "Unrepresented:   ", len(set(__targets.keys()) - representedids)
    
    __drugSet = set([])
    for drugbankid in __drugs:
        drug = __drugs[drugbankid]

        for target in drug['targets']:
            targetId = target['partner']
            if targetId in __target_names:
                targetGene = __target_names[targetId]
                
                __drugSet.add(drugbankid)
                try:
                    __drugDict[targetGene].add(drugbankid)
                except KeyError:
                    __drugDict[targetGene] = set([drugbankid])
    
    removable_drugs = set([])
    for drugbankid in __drugs:
        if drugbankid not in __drugSet:
            removable_drugs.add(drugbankid)

    for drugbankid in removable_drugs:
        del __drugs[drugbankid]

    lenbefore = len(__geneSet)
    __geneSet = __geneSet & set(__drugDict.keys())
    lenafter = len(__geneSet)
    if __DEBUG>0:
        print "Removed", (lenafter - lenbefore), "untargeted gene names"
    
    print "Total drugs with targets:   ", len(__drugSet), len(__drugs)
    print "Total geneset size:         ", len(__geneSet)
예제 #6
0
def loadEvolutionaryGenes(filename, __ENABLE_GENE_VERIFICATION=0,
        __ENABLE_GENE_UPDATES=0, __CROSS_MATCH_LEVEL = 1, include_studies = [1,2,3,4,5]):
    global __DEBUG
    
    genesTSV = pyCSV()
    genesTSV.load(filename, "\t")
    
    bustamante = []
    vamathevan_human = []
    kosiol_human = []

    if 1 in include_studies:
        bustamante          = [item.lower() for item in geneUtils.columnToList(genesTSV, 1, 2)]
    if 2 in include_studies:
        vamathevan_human    = [item.lower() for item in geneUtils.columnToList(genesTSV, 3, 2)]
    if 3 in include_studies:
        kosiol_human        = [item.lower() for item in geneUtils.columnToList(genesTSV, 8, 2)]
    
    conflicts = []
    
    bakewell = []
    nielsen = []
    
    if 4 in include_studies:
        bakewell, c         = geneUtils.mergeColumns(genesTSV, 12, 13, 2)
        bakewell = [item.lower() for item in bakewell]
        conflicts.extend(c)

    if 5 in include_studies:
        nielsen, c          = geneUtils.mergeColumns(genesTSV, 17, 18, 2)
        nielsen = [item.lower() for item in nielsen]
        conflicts.extend(c)
    
    # verify gene symbols
    
    duplicates = 0
    
    geneCounts = {}
    geneUtils.geneFrequency(geneCounts, bustamante)
    geneUtils.geneFrequency(geneCounts, vamathevan_human)
    geneUtils.geneFrequency(geneCounts, kosiol_human)
    geneUtils.geneFrequency(geneCounts, bakewell)
    geneUtils.geneFrequency(geneCounts, nielsen)
    
    
    
    #duplicates+=geneUtils.addAll(geneSet, bustamante)
    #duplicates+=geneUtils.addAll(geneSet, vamathevan_human)
    #duplicates+=geneUtils.addAll(geneSet, kosiol_human)
    #duplicates+=geneUtils.addAll(geneSet, bakewell)
    #duplicates+=geneUtils.addAll(geneSet, nielsen)
    
    
    
    if __ENABLE_GENE_VERIFICATION:
        for pair in conflicts:
            g1 = pair[0].lower()
            g2 = pair[1].lower()
            
            if __ENABLE_GENE_UPDATES:
                g1parent = geneDB.findUpdatedSymbol(g1)
                g2parent = geneDB.findUpdatedSymbol(g2)
                
                if g1parent != None:
                    g1 = g1parent
                if g2parent != None:
                    g2 = g2parent
            
            if geneDB.isApproved(g1):
                if __DEBUG>1:
                    print "Gene", g2, "not approved, but",g1,"is fine"
                try:
                    geneCounts[g1]+=1
                except KeyError:
                    geneCounts[g1] = 1
                    
            elif geneDB.isApproved(g2):
                if __DEBUG>1:
                    print "Gene", g1, "not approved, but",g2,"is fine"
                try:
                    geneCounts[g2]+=1
                except KeyError:
                    geneCounts[g2] = 1
                    
            else:
                if __DEBUG>1:
                    print "Neither",g1,"nor",g2,"are valid"
        
    
    else:
        for pair in conflicts:
            g1 = pair[0].lower()
            g2 = pair[1].lower()
            
            try:
                geneCounts[g2]+=1
            except KeyError:
                geneCounts[g2] = 1
            
    duplicates, geneSet = geneUtils.addFilterFrequency(geneCounts, __CROSS_MATCH_LEVEL)
    ofile = open(os.sep.join(["results","log","geneSetDuplicateFrequency.txt"]),'w')
    
    glist = []
    
    for gene in geneCounts:
        glist.append((gene, geneCounts[gene]))
        
    for item in sorted(glist, key=lambda item: -item[1]):
        ofile.write("%-20s%d\n" % item)
    
    ofile.close()
    
    geneSet = set([geneUtils.formatGeneSymbol(geneSym) for geneSym in geneSet])
            
    if __ENABLE_GENE_UPDATES:
        geneUtils.updateGeneSet(geneSet)
    
    if __ENABLE_GENE_VERIFICATION:        
        geneUtils.removeInvalidGenes(geneSet)
    
    if __DEBUG>0:
        print "\n-----------------------------"
        print "Total Duplicates:      ", duplicates
        print "Total Name Conflicts:  ", len(conflicts)
        print "Total Genes Remaining: ", len(geneSet)
        print "-----------------------------\n"
        
    log_file = open(os.sep.join(["results","log","loaded_genelist.txt"]),'w')
    for gene in geneSet:
        log_file.write(gene+"\n")
    log_file.close()
    return geneSet
예제 #7
0
def initTargets(targets_file, protein_file,__ENABLE_GENE_VERIFICATION=0, __ENABLE_GENE_UPDATES=0):
    global __DEBUG, __targetCatalogue, __geneSet, __geneNames, __drugDict
    
    __targetCatalogue.load(targets_file)
    
    rejectedSet = set([])
    updatedSet = set([])
    
    for r in xrange(1, __targetCatalogue.rows+1):
        geneId = int(__targetCatalogue.get(r,0))
        geneName = geneUtils.formatGeneSymbol(__targetCatalogue.get(r,2))
        
        if geneName != None:
            if __ENABLE_GENE_UPDATES:
                parentSym = geneDB.findUpdatedSymbol(geneName)
                if parentSym != None:
                    updatedSet.add(geneName)
                    geneName = parentSym
                
            if __ENABLE_GENE_VERIFICATION and not geneDB.isApproved(geneName):
                if __DEBUG>2 and geneName != "" or geneName == "papc":
                    print "Rejected:", geneName
                rejectedSet.add(geneName)
                continue
                
            __geneNames[geneId] = geneName
            __geneSet.add(geneName)
            __drugDict[geneName] = set([])
    
    invalid_file = open(os.sep.join(["results","log","invalid_drugbank.txt"]),'w')
    for geneName in rejectedSet:
        invalid_file.write(geneName+"\n")
    invalid_file.close()            
    
    proteins = parseFASTA(protein_file)
    
    __drugSet = set([])
    empty_gene_drug_targets = 0
    for fasta in proteins:
        items = fasta[1].split()
        geneId = int(items[0])
        
        if geneId in __geneNames:
            parenthetical = fasta[1][fasta[1].rfind("(")+1 : fasta[1].rfind(")")]
            
            drugs = parenthetical.split(";")
            
            for drug in drugs:
                drugbankid = drug.strip()
                __drugDict[__geneNames[geneId]].add(drugbankid)
                __drugSet.add(drugbankid)
    
    removable = set([])
    for gene in __geneSet:
        if gene not in __drugDict or len(__drugDict[gene]) == 0:
            removable.add(gene)

            empty_gene_drug_targets += 1
    __geneSet -= removable
    
    removable_drugs = set([])
    for drugbankid in __drugs:
        if drugbankid not in __drugSet:
            removable_drugs.add(drugbankid)
    
    for drugbankid in __drugSet:
        if drugbankid not in __drugs:
            __drugs[drugbankid] = {'name':drugbankid}

    for drugbankid in removable_drugs:
        del __drugs[drugbankid]

    if __DEBUG>0:
        print "\n------------------------------------------"
        print "Invalid Drug Target Gene Symbols:   ", len(rejectedSet)
        print "Updated Drug Target Gene Symbols:   ", len(updatedSet)
        print "Remaining Drug Target Gene Symbols: ", len(__geneSet)
        print "Drugs with targets:                 ", len(__drugSet), len(__drugs)
        print "Removed:", empty_gene_drug_targets, "genes without targeting drugs"
        print "------------------------------------------\n"