def __addGene(i, geneSym, geneTrait, __ENABLE_GENE_VERIFICATION, __ENABLE_GENE_UPDATES, updatedGeneSet, invalidGeneSet): if __ENABLE_GENE_UPDATES: parent = geneDB.findUpdatedSymbol(geneSym) if parent != None: if __DEBUG > 1: print "Replacing",geneSym,"with",parent updatedGeneSet.add(geneSym) geneSym = parent if not __ENABLE_GENE_VERIFICATION or geneDB.isApproved(geneSym): __geneSet.add(geneSym) try: __studyGenes[i].add(geneSym) except KeyError: __studyGenes[i] = set([geneSym]) try: __traitDict[geneSym].add(geneTrait) except KeyError: __traitDict[geneSym] = set([geneTrait]) try: __studyByGene[geneSym].add(i) except KeyError: __studyByGene[geneSym] = set([i]) elif __ENABLE_GENE_VERIFICATION: invalidGeneSet.add(geneSym)
def removeInvalidGenes(geneSet): remove = [] for gene in geneSet: if not geneDB.isApproved(gene): if __DEBUG>2: print "Gene:", gene,"is not valid..." remove.append(gene) invalid_file = open(os.sep.join(["results","log","invalid_genelist.txt"]),'w') for r in remove: geneSet.remove(r) invalid_file.write(r + "\n") invalid_file.close() if __DEBUG>0: print "\n----------------------------" print "Gene Set Correction Results:" print "Removed: ", len(remove) print "----------------------------\n"
def removeInvalidGenes(geneSet): remove = [] for gene in geneSet: if not geneDB.isApproved(gene): if __DEBUG > 2: print "Gene:", gene, "is not valid..." remove.append(gene) invalid_file = open( os.sep.join(["results", "log", "invalid_genelist.txt"]), 'w') for r in remove: geneSet.remove(r) invalid_file.write(r + "\n") invalid_file.close() if __DEBUG > 0: print "\n----------------------------" print "Gene Set Correction Results:" print "Removed: ", len(remove) print "----------------------------\n"
def mapTargetNames(targets_file, __ENABLE_GENE_UPDATES=1, __ENABLE_GENE_VERIFICATION=1): global __targets_unnamed, __targets, __target_names, __geneSet __targetCatalogue = pyCSV() __targetCatalogue.load(targets_file) rejectednames = 0 representedids = set([]) for r in xrange(1, __targetCatalogue.rows + 1): targetId = int(__targetCatalogue.get(r, 0)) targetGene = geneUtils.formatGeneSymbol(__targetCatalogue.get(r, 2)) representedids.add(targetId) if targetGene != None: parentSym = geneDB.findUpdatedSymbol(targetGene) if __ENABLE_GENE_UPDATES and parentSym != None: if __DEBUG > 1: print "Updated:", targetGene, "to:", parentSym targetGene = parentSym if __ENABLE_GENE_VERIFICATION and not geneDB.isApproved( targetGene): if __DEBUG > 1: print "Rejected:", targetGene rejectednames += 1 continue __geneSet.add(targetGene) __target_names[targetId] = targetGene for targetId in __targets: if targetId not in __target_names.keys(): __targets_unnamed += 1 if __DEBUG > 0: print "Rejected: ", rejectednames print "Unrepresented: ", len(set(__targets.keys()) - representedids) __drugSet = set([]) for drugbankid in __drugs: drug = __drugs[drugbankid] for target in drug['targets']: targetId = target['partner'] if targetId in __target_names: targetGene = __target_names[targetId] __drugSet.add(drugbankid) try: __drugDict[targetGene].add(drugbankid) except KeyError: __drugDict[targetGene] = set([drugbankid]) removable_drugs = set([]) for drugbankid in __drugs: if drugbankid not in __drugSet: removable_drugs.add(drugbankid) for drugbankid in removable_drugs: del __drugs[drugbankid] lenbefore = len(__geneSet) __geneSet = __geneSet & set(__drugDict.keys()) lenafter = len(__geneSet) if __DEBUG > 0: print "Removed", (lenafter - lenbefore), "untargeted gene names" print "Total drugs with targets: ", len(__drugSet), len(__drugs) print "Total geneset size: ", len(__geneSet)
def mapTargetNames(targets_file,__ENABLE_GENE_UPDATES=1,__ENABLE_GENE_VERIFICATION=1): global __targets_unnamed, __targets, __target_names, __geneSet __targetCatalogue = pyCSV() __targetCatalogue.load(targets_file) rejectednames = 0 representedids = set([]) for r in xrange(1, __targetCatalogue.rows+1): targetId = int(__targetCatalogue.get(r,0)) targetGene = geneUtils.formatGeneSymbol(__targetCatalogue.get(r,2)) representedids.add(targetId) if targetGene != None: parentSym = geneDB.findUpdatedSymbol(targetGene) if __ENABLE_GENE_UPDATES and parentSym != None: if __DEBUG>1: print "Updated:", targetGene, "to:", parentSym targetGene = parentSym if __ENABLE_GENE_VERIFICATION and not geneDB.isApproved(targetGene): if __DEBUG>1: print "Rejected:", targetGene rejectednames+=1 continue __geneSet.add(targetGene) __target_names[targetId] = targetGene for targetId in __targets: if targetId not in __target_names.keys(): __targets_unnamed += 1 if __DEBUG>0: print "Rejected: ", rejectednames print "Unrepresented: ", len(set(__targets.keys()) - representedids) __drugSet = set([]) for drugbankid in __drugs: drug = __drugs[drugbankid] for target in drug['targets']: targetId = target['partner'] if targetId in __target_names: targetGene = __target_names[targetId] __drugSet.add(drugbankid) try: __drugDict[targetGene].add(drugbankid) except KeyError: __drugDict[targetGene] = set([drugbankid]) removable_drugs = set([]) for drugbankid in __drugs: if drugbankid not in __drugSet: removable_drugs.add(drugbankid) for drugbankid in removable_drugs: del __drugs[drugbankid] lenbefore = len(__geneSet) __geneSet = __geneSet & set(__drugDict.keys()) lenafter = len(__geneSet) if __DEBUG>0: print "Removed", (lenafter - lenbefore), "untargeted gene names" print "Total drugs with targets: ", len(__drugSet), len(__drugs) print "Total geneset size: ", len(__geneSet)
def loadEvolutionaryGenes(filename, __ENABLE_GENE_VERIFICATION=0, __ENABLE_GENE_UPDATES=0, __CROSS_MATCH_LEVEL = 1, include_studies = [1,2,3,4,5]): global __DEBUG genesTSV = pyCSV() genesTSV.load(filename, "\t") bustamante = [] vamathevan_human = [] kosiol_human = [] if 1 in include_studies: bustamante = [item.lower() for item in geneUtils.columnToList(genesTSV, 1, 2)] if 2 in include_studies: vamathevan_human = [item.lower() for item in geneUtils.columnToList(genesTSV, 3, 2)] if 3 in include_studies: kosiol_human = [item.lower() for item in geneUtils.columnToList(genesTSV, 8, 2)] conflicts = [] bakewell = [] nielsen = [] if 4 in include_studies: bakewell, c = geneUtils.mergeColumns(genesTSV, 12, 13, 2) bakewell = [item.lower() for item in bakewell] conflicts.extend(c) if 5 in include_studies: nielsen, c = geneUtils.mergeColumns(genesTSV, 17, 18, 2) nielsen = [item.lower() for item in nielsen] conflicts.extend(c) # verify gene symbols duplicates = 0 geneCounts = {} geneUtils.geneFrequency(geneCounts, bustamante) geneUtils.geneFrequency(geneCounts, vamathevan_human) geneUtils.geneFrequency(geneCounts, kosiol_human) geneUtils.geneFrequency(geneCounts, bakewell) geneUtils.geneFrequency(geneCounts, nielsen) #duplicates+=geneUtils.addAll(geneSet, bustamante) #duplicates+=geneUtils.addAll(geneSet, vamathevan_human) #duplicates+=geneUtils.addAll(geneSet, kosiol_human) #duplicates+=geneUtils.addAll(geneSet, bakewell) #duplicates+=geneUtils.addAll(geneSet, nielsen) if __ENABLE_GENE_VERIFICATION: for pair in conflicts: g1 = pair[0].lower() g2 = pair[1].lower() if __ENABLE_GENE_UPDATES: g1parent = geneDB.findUpdatedSymbol(g1) g2parent = geneDB.findUpdatedSymbol(g2) if g1parent != None: g1 = g1parent if g2parent != None: g2 = g2parent if geneDB.isApproved(g1): if __DEBUG>1: print "Gene", g2, "not approved, but",g1,"is fine" try: geneCounts[g1]+=1 except KeyError: geneCounts[g1] = 1 elif geneDB.isApproved(g2): if __DEBUG>1: print "Gene", g1, "not approved, but",g2,"is fine" try: geneCounts[g2]+=1 except KeyError: geneCounts[g2] = 1 else: if __DEBUG>1: print "Neither",g1,"nor",g2,"are valid" else: for pair in conflicts: g1 = pair[0].lower() g2 = pair[1].lower() try: geneCounts[g2]+=1 except KeyError: geneCounts[g2] = 1 duplicates, geneSet = geneUtils.addFilterFrequency(geneCounts, __CROSS_MATCH_LEVEL) ofile = open(os.sep.join(["results","log","geneSetDuplicateFrequency.txt"]),'w') glist = [] for gene in geneCounts: glist.append((gene, geneCounts[gene])) for item in sorted(glist, key=lambda item: -item[1]): ofile.write("%-20s%d\n" % item) ofile.close() geneSet = set([geneUtils.formatGeneSymbol(geneSym) for geneSym in geneSet]) if __ENABLE_GENE_UPDATES: geneUtils.updateGeneSet(geneSet) if __ENABLE_GENE_VERIFICATION: geneUtils.removeInvalidGenes(geneSet) if __DEBUG>0: print "\n-----------------------------" print "Total Duplicates: ", duplicates print "Total Name Conflicts: ", len(conflicts) print "Total Genes Remaining: ", len(geneSet) print "-----------------------------\n" log_file = open(os.sep.join(["results","log","loaded_genelist.txt"]),'w') for gene in geneSet: log_file.write(gene+"\n") log_file.close() return geneSet
def initTargets(targets_file, protein_file,__ENABLE_GENE_VERIFICATION=0, __ENABLE_GENE_UPDATES=0): global __DEBUG, __targetCatalogue, __geneSet, __geneNames, __drugDict __targetCatalogue.load(targets_file) rejectedSet = set([]) updatedSet = set([]) for r in xrange(1, __targetCatalogue.rows+1): geneId = int(__targetCatalogue.get(r,0)) geneName = geneUtils.formatGeneSymbol(__targetCatalogue.get(r,2)) if geneName != None: if __ENABLE_GENE_UPDATES: parentSym = geneDB.findUpdatedSymbol(geneName) if parentSym != None: updatedSet.add(geneName) geneName = parentSym if __ENABLE_GENE_VERIFICATION and not geneDB.isApproved(geneName): if __DEBUG>2 and geneName != "" or geneName == "papc": print "Rejected:", geneName rejectedSet.add(geneName) continue __geneNames[geneId] = geneName __geneSet.add(geneName) __drugDict[geneName] = set([]) invalid_file = open(os.sep.join(["results","log","invalid_drugbank.txt"]),'w') for geneName in rejectedSet: invalid_file.write(geneName+"\n") invalid_file.close() proteins = parseFASTA(protein_file) __drugSet = set([]) empty_gene_drug_targets = 0 for fasta in proteins: items = fasta[1].split() geneId = int(items[0]) if geneId in __geneNames: parenthetical = fasta[1][fasta[1].rfind("(")+1 : fasta[1].rfind(")")] drugs = parenthetical.split(";") for drug in drugs: drugbankid = drug.strip() __drugDict[__geneNames[geneId]].add(drugbankid) __drugSet.add(drugbankid) removable = set([]) for gene in __geneSet: if gene not in __drugDict or len(__drugDict[gene]) == 0: removable.add(gene) empty_gene_drug_targets += 1 __geneSet -= removable removable_drugs = set([]) for drugbankid in __drugs: if drugbankid not in __drugSet: removable_drugs.add(drugbankid) for drugbankid in __drugSet: if drugbankid not in __drugs: __drugs[drugbankid] = {'name':drugbankid} for drugbankid in removable_drugs: del __drugs[drugbankid] if __DEBUG>0: print "\n------------------------------------------" print "Invalid Drug Target Gene Symbols: ", len(rejectedSet) print "Updated Drug Target Gene Symbols: ", len(updatedSet) print "Remaining Drug Target Gene Symbols: ", len(__geneSet) print "Drugs with targets: ", len(__drugSet), len(__drugs) print "Removed:", empty_gene_drug_targets, "genes without targeting drugs" print "------------------------------------------\n"