def init(filename, __EXCLUDE_PSEUDOGENES=0):
    """Load the tab-separated gene table and fill the module-level indexes.

    filename              -- path passed to pyCSV.load with a tab separator.
    __EXCLUDE_PSEUDOGENES -- when truthy, rows whose column 2 contains the
                             text "pseudogene" are skipped.

    For every remaining row (1..rows) the formatted column-1 symbol is
    recorded in __original_names, __symbols and __sym_rows, and in
    __approved_symbols when __isApproved accepts it.  The reverse maps
    __gene_symbol_parents (past symbol -> approved symbols) and
    __gene_symbol_synonyms (synonym -> approved symbols) are then built
    from the approved symbols.
    """
    global __geneDB, __symbols, __gene_symbol_parents, __gene_symbol_synonyms

    __geneDB = pyCSV()
    __geneDB.load(filename, "\t")

    for row in xrange(1, __geneDB.rows + 1):
        if __EXCLUDE_PSEUDOGENES and "pseudogene" in __geneDB.get(row, 2):
            continue

        raw_name = __geneDB.get(row, 1)
        symbol = geneUtils.formatGeneSymbol(raw_name)
        __original_names[symbol] = raw_name
        __symbols.add(symbol)
        # The row index is stored before the approval check runs.
        __sym_rows[symbol] = row
        if __isApproved(symbol):
            __approved_symbols.add(symbol)

    # Single-argument print: same text under the Python 2 print statement.
    print("%s %s %s" % ("Loaded:", len(__approved_symbols), "approved gene symbols..."))

    # Reverse lookups: previous symbol / synonym -> set of approved symbols.
    for approved in __approved_symbols:
        for previous in getPastSymbols(approved):
            __gene_symbol_parents.setdefault(previous, set()).add(approved)
        for alias in getSynonyms(approved):
            __gene_symbol_synonyms.setdefault(alias, set()).add(approved)

    # Drop synonym targets that are not approved symbols (mutates in place).
    for alias in __gene_symbol_synonyms:
        targets = __gene_symbol_synonyms[alias]
        stale = [gene for gene in targets if gene not in __approved_symbols]
        for gene in stale:
            targets.remove(gene)
def mapTargetNames(targets_file, __ENABLE_GENE_UPDATES=1, __ENABLE_GENE_VERIFICATION=1):
    """Attach gene symbols to target ids, then prune drugs and genes.

    targets_file               -- path loaded through pyCSV; column 0 is read
                                  as an integer target id and column 2 as the
                                  gene symbol.
    __ENABLE_GENE_UPDATES      -- when truthy, a symbol is replaced by the
                                  result of geneDB.findUpdatedSymbol if that
                                  result is not None.
    __ENABLE_GENE_VERIFICATION -- when truthy, symbols rejected by
                                  geneDB.isApproved are counted and skipped.

    Side effects on module state: fills __target_names and __geneSet,
    increments __targets_unnamed once per id in __targets that has no name,
    adds drug ids to __drugDict per gene, deletes drugs without any named
    target from __drugs, and rebinds __geneSet to the genes that have at
    least one drug.  Returns None.
    """
    global __targets_unnamed, __targets, __target_names, __geneSet

    __targetCatalogue = pyCSV()
    __targetCatalogue.load(targets_file)

    rejectednames = 0
    representedids = set()

    for r in xrange(1, __targetCatalogue.rows + 1):
        targetId = int(__targetCatalogue.get(r, 0))
        targetGene = geneUtils.formatGeneSymbol(__targetCatalogue.get(r, 2))
        representedids.add(targetId)

        # Rows without a usable symbol only count as "represented".
        if targetGene is None:
            continue

        if __ENABLE_GENE_UPDATES:
            parentSym = geneDB.findUpdatedSymbol(targetGene)
            if parentSym is not None:
                if __DEBUG > 1:
                    print("Updated: %s to: %s" % (targetGene, parentSym))
                targetGene = parentSym

        if __ENABLE_GENE_VERIFICATION and not geneDB.isApproved(targetGene):
            if __DEBUG > 1:
                print("Rejected: %s" % (targetGene,))
            rejectednames += 1
            continue

        __geneSet.add(targetGene)
        __target_names[targetId] = targetGene

    # Membership is tested on the dict itself; .keys() would build a list
    # for every target under Python 2.
    for targetId in __targets:
        if targetId not in __target_names:
            __targets_unnamed += 1

    if __DEBUG > 0:
        print("Rejected:  %s" % (rejectednames,))
        print("Unrepresented:  %s" % (len(set(__targets) - representedids),))

    # Index drugs by the gene symbol of each named target.
    targeted_drugs = set()
    for drugbankid in __drugs:
        for target in __drugs[drugbankid]['targets']:
            targetId = target['partner']
            if targetId in __target_names:
                targeted_drugs.add(drugbankid)
                __drugDict.setdefault(__target_names[targetId], set()).add(drugbankid)

    # Collect first, delete afterwards: __drugs must not change while iterated.
    for drugbankid in [d for d in __drugs if d not in targeted_drugs]:
        del __drugs[drugbankid]

    lenbefore = len(__geneSet)
    __geneSet = __geneSet & set(__drugDict)
    lenafter = len(__geneSet)

    if __DEBUG > 0:
        # The intersection can only shrink the set, so before - after is the
        # non-negative number of genes dropped.
        print("Removed %s untargeted gene names" % (lenbefore - lenafter,))
        print("Total drugs with targets:  %s %s" % (len(targeted_drugs), len(__drugs)))
        print("Total geneset size:  %s" % (len(__geneSet),))
def mapTargetNames(targets_file, __ENABLE_GENE_UPDATES=1, __ENABLE_GENE_VERIFICATION=1):
    """Attach gene symbols to target ids, then prune drugs and genes.

    targets_file               -- path loaded through pyCSV; column 0 is read
                                  as an integer target id and column 2 as the
                                  gene symbol.
    __ENABLE_GENE_UPDATES      -- when truthy, a symbol is replaced by the
                                  result of geneDB.findUpdatedSymbol if that
                                  result is not None.
    __ENABLE_GENE_VERIFICATION -- when truthy, symbols rejected by
                                  geneDB.isApproved are counted and skipped.

    Side effects on module state: fills __target_names and __geneSet,
    increments __targets_unnamed once per id in __targets that has no name,
    adds drug ids to __drugDict per gene, deletes drugs without any named
    target from __drugs, and rebinds __geneSet to the genes that have at
    least one drug.  Returns None.
    """
    global __targets_unnamed, __targets, __target_names, __geneSet

    __targetCatalogue = pyCSV()
    __targetCatalogue.load(targets_file)

    rejectednames = 0
    representedids = set()

    for r in xrange(1, __targetCatalogue.rows + 1):
        targetId = int(__targetCatalogue.get(r, 0))
        targetGene = geneUtils.formatGeneSymbol(__targetCatalogue.get(r, 2))
        representedids.add(targetId)

        # Rows without a usable symbol only count as "represented".
        if targetGene is None:
            continue

        if __ENABLE_GENE_UPDATES:
            parentSym = geneDB.findUpdatedSymbol(targetGene)
            if parentSym is not None:
                if __DEBUG > 1:
                    print("Updated: %s to: %s" % (targetGene, parentSym))
                targetGene = parentSym

        if __ENABLE_GENE_VERIFICATION and not geneDB.isApproved(targetGene):
            if __DEBUG > 1:
                print("Rejected: %s" % (targetGene,))
            rejectednames += 1
            continue

        __geneSet.add(targetGene)
        __target_names[targetId] = targetGene

    # Membership is tested on the dict itself; .keys() would build a list
    # for every target under Python 2.
    for targetId in __targets:
        if targetId not in __target_names:
            __targets_unnamed += 1

    if __DEBUG > 0:
        print("Rejected:  %s" % (rejectednames,))
        print("Unrepresented:  %s" % (len(set(__targets) - representedids),))

    # Index drugs by the gene symbol of each named target.
    targeted_drugs = set()
    for drugbankid in __drugs:
        for target in __drugs[drugbankid]['targets']:
            targetId = target['partner']
            if targetId in __target_names:
                targeted_drugs.add(drugbankid)
                __drugDict.setdefault(__target_names[targetId], set()).add(drugbankid)

    # Collect first, delete afterwards: __drugs must not change while iterated.
    for drugbankid in [d for d in __drugs if d not in targeted_drugs]:
        del __drugs[drugbankid]

    lenbefore = len(__geneSet)
    __geneSet = __geneSet & set(__drugDict)
    lenafter = len(__geneSet)

    if __DEBUG > 0:
        # The intersection can only shrink the set, so before - after is the
        # non-negative number of genes dropped.
        print("Removed %s untargeted gene names" % (lenbefore - lenafter,))
        print("Total drugs with targets:  %s %s" % (len(targeted_drugs), len(__drugs)))
        print("Total geneset size:  %s" % (len(__geneSet),))
def loadEvolutionaryGenes(filename, __ENABLE_GENE_VERIFICATION=0, __ENABLE_GENE_UPDATES=0, __CROSS_MATCH_LEVEL=1, include_studies=(1, 2, 3, 4, 5)):
    """Build a gene set from the study columns of a tab-separated table.

    filename                   -- path loaded through pyCSV with a tab
                                  separator.
    __ENABLE_GENE_VERIFICATION -- when truthy, conflicting name pairs are
                                  resolved through geneDB.isApproved and the
                                  final set is passed to
                                  geneUtils.removeInvalidGenes.
    __ENABLE_GENE_UPDATES      -- when truthy, names are replaced by
                                  geneDB.findUpdatedSymbol results and the
                                  final set is passed to
                                  geneUtils.updateGeneSet.
    __CROSS_MATCH_LEVEL        -- forwarded to geneUtils.addFilterFrequency
                                  together with the per-gene counts.
    include_studies            -- study numbers (1-5) whose columns are read.
                                  The default is a tuple so that no mutable
                                  object is shared between calls.

    Writes results/log/geneSetDuplicateFrequency.txt (gene and count, highest
    count first) and results/log/loaded_genelist.txt (one gene per line).
    Returns the resulting set of formatted gene symbols.
    """
    genesTSV = pyCSV()
    genesTSV.load(filename, "\t")

    def lowered(names):
        # Every study list is compared in lower case.
        return [name.lower() for name in names]

    bustamante = []
    vamathevan_human = []
    kosiol_human = []
    if 1 in include_studies:
        bustamante = lowered(geneUtils.columnToList(genesTSV, 1, 2))
    if 2 in include_studies:
        vamathevan_human = lowered(geneUtils.columnToList(genesTSV, 3, 2))
    if 3 in include_studies:
        kosiol_human = lowered(geneUtils.columnToList(genesTSV, 8, 2))

    # Studies 4 and 5 merge two columns each; mergeColumns also returns the
    # name pairs it could not reconcile.
    conflicts = []
    bakewell = []
    nielsen = []
    if 4 in include_studies:
        bakewell, unresolved = geneUtils.mergeColumns(genesTSV, 12, 13, 2)
        bakewell = lowered(bakewell)
        conflicts.extend(unresolved)
    if 5 in include_studies:
        nielsen, unresolved = geneUtils.mergeColumns(genesTSV, 17, 18, 2)
        nielsen = lowered(nielsen)
        conflicts.extend(unresolved)

    # Count in how many study lists each gene appears.
    geneCounts = {}
    for study_genes in (bustamante, vamathevan_human, kosiol_human, bakewell, nielsen):
        geneUtils.geneFrequency(geneCounts, study_genes)

    for pair in conflicts:
        g1 = pair[0].lower()
        g2 = pair[1].lower()

        if not __ENABLE_GENE_VERIFICATION:
            # NOTE(review): without verification only the second name of a
            # conflicting pair is counted - confirm this is intended.
            geneCounts[g2] = geneCounts.get(g2, 0) + 1
            continue

        if __ENABLE_GENE_UPDATES:
            g1parent = geneDB.findUpdatedSymbol(g1)
            g2parent = geneDB.findUpdatedSymbol(g2)
            if g1parent is not None:
                g1 = g1parent
            if g2parent is not None:
                g2 = g2parent

        # The first approved name of the pair wins; g2 is only consulted
        # when g1 is not approved.
        if geneDB.isApproved(g1):
            if __DEBUG > 1:
                print("Gene %s not approved, but %s is fine" % (g2, g1))
            geneCounts[g1] = geneCounts.get(g1, 0) + 1
        elif geneDB.isApproved(g2):
            if __DEBUG > 1:
                print("Gene %s not approved, but %s is fine" % (g1, g2))
            geneCounts[g2] = geneCounts.get(g2, 0) + 1
        elif __DEBUG > 1:
            print("Neither %s nor %s are valid" % (g1, g2))

    duplicates, geneSet = geneUtils.addFilterFrequency(geneCounts, __CROSS_MATCH_LEVEL)

    # 'with' closes the log even if a write fails.
    with open(os.sep.join(["results", "log", "geneSetDuplicateFrequency.txt"]), 'w') as ofile:
        for item in sorted(geneCounts.items(), key=lambda item: -item[1]):
            ofile.write("%-20s%d\n" % item)

    geneSet = set([geneUtils.formatGeneSymbol(geneSym) for geneSym in geneSet])

    if __ENABLE_GENE_UPDATES:
        geneUtils.updateGeneSet(geneSet)
    if __ENABLE_GENE_VERIFICATION:
        geneUtils.removeInvalidGenes(geneSet)

    if __DEBUG > 0:
        print("\n-----------------------------")
        print("Total Duplicates:  %s" % (duplicates,))
        print("Total Name Conflicts:  %s" % (len(conflicts),))
        print("Total Genes Remaining:  %s" % (len(geneSet),))
        print("-----------------------------\n")

    with open(os.sep.join(["results", "log", "loaded_genelist.txt"]), 'w') as log_file:
        for gene in geneSet:
            log_file.write(gene + "\n")

    return geneSet
def init(filename, __ENABLE_GENE_VERIFICATION=0, __ENABLE_GENE_UPDATES=0, __INCLUDE_MAPPED_GENES=0, trait_exclude_file=0, pfilter=0.05):
    """Load the tab-separated GWAS catalogue and register its genes.

    filename                   -- catalogue path, loaded into the
                                  module-level __gwasCatalogue.
    __ENABLE_GENE_VERIFICATION -- forwarded to __addGene.
    __ENABLE_GENE_UPDATES      -- forwarded to __addGene.
    __INCLUDE_MAPPED_GENES     -- when truthy, the symbols in column 14 are
                                  registered in addition to those in
                                  column 13.
    trait_exclude_file         -- 0 for no exclusions, otherwise the path of
                                  a file listing one trait per line; rows
                                  with such a trait (column 7) are skipped.
    pfilter                    -- rows whose p-value (column 27) is greater
                                  than this are skipped.

    Kept rows are recorded in __pValues and __studyByTrait, and each gene
    symbol is handed to __addGene.  Writes results/log/invalid_gwas.txt and
    results/log/gwas_background.txt.  Returns None.
    """
    global __DEBUG, __pValues, __gwasCatalogue, __studyByTrait, __geneSet, __traitDict, __studyByGene

    exclude_traits = set()
    if trait_exclude_file != 0:
        with open(trait_exclude_file, 'r') as ifile:
            for line in ifile:
                exclude_traits.add(line.strip())

    __gwasCatalogue.load(filename, "\t")

    invalidGeneSet = set()
    updatedGeneSet = set()

    for i in xrange(1, __gwasCatalogue.rows + 1):
        geneString = __gwasCatalogue.get(i, 13).strip()
        geneTrait = __gwasCatalogue.get(i, 7).strip()
        pvalueText = __gwasCatalogue.get(i, 27)

        try:
            pvalue = float(pvalueText)
        except ValueError:
            # An unparsable p-value is stored as -1, so the row passes the
            # filter below unless pfilter itself is below -1.
            pvalue = -1

        if pvalue > pfilter:
            continue
        if geneTrait in exclude_traits:
            continue
        # geneString is the result of .strip(), so only the empty string
        # needs to be rejected here.
        if geneString == "":
            continue

        __pValues[i] = pvalue
        __studyByTrait.setdefault(geneTrait, set()).add(i)

        # Reported genes: comma-separated items, each optionally "A - B".
        for item in geneString.split(","):
            for geneSym in item.split(" - "):
                geneSym = geneUtils.formatGeneSymbol(geneSym.strip())
                __addGene(i, geneSym, geneTrait, __ENABLE_GENE_VERIFICATION,
                          __ENABLE_GENE_UPDATES, updatedGeneSet, invalidGeneSet)

        if __INCLUDE_MAPPED_GENES:
            # Mapped genes: semicolon-separated items, each optionally
            # "A - B".  Symbols are stripped the same way as the reported
            # genes above.
            for item in __gwasCatalogue.get(i, 14).split(";"):
                for geneSym in item.split(" - "):
                    geneSym = geneUtils.formatGeneSymbol(geneSym.strip())
                    __addGene(i, geneSym, geneTrait, __ENABLE_GENE_VERIFICATION,
                              __ENABLE_GENE_UPDATES, updatedGeneSet, invalidGeneSet)

    # 'with' closes each log even if a write fails.
    with open(os.sep.join(["results", "log", "invalid_gwas.txt"]), 'w') as invalid_file:
        for geneSym in invalidGeneSet:
            invalid_file.write(geneSym + "\n")

    with open(os.sep.join(["results", "log", "gwas_background.txt"]), 'w') as background_file:
        for geneSym in __geneSet:
            # NOTE(review): raises KeyError for a symbol missing from
            # geneDB.__original_names - confirm __addGene only admits
            # symbols known to geneDB.
            background_file.write(geneDB.__original_names[geneSym] + "\n")

    if __DEBUG > 0:
        print("\n---------------------------------")
        print("GWAS Invalid Gene Symbols:  %s" % (len(invalidGeneSet),))
        print("GWAS Updated Gene Symbols:  %s" % (len(updatedGeneSet),))
        print("GWAS Total Genes Remaining:  %s" % (len(__geneSet),))
        print("---------------------------------\n")
def getSynonyms(geneSym):
    """Return the formatted synonyms stored for geneSym.

    The symbol is lower-cased and looked up in __sym_rows to find its row;
    the __synonyms column of that row is split with __strToList and every
    entry is passed through geneUtils.formatGeneSymbol.  An unknown symbol
    raises KeyError from the __sym_rows lookup.
    """
    global __geneDB, __sym_rows, __synonyms

    row = __sym_rows[geneSym.lower()]
    raw_synonyms = __strToList(__geneDB.get(row, __synonyms))

    formatted = []
    for entry in raw_synonyms:
        formatted.append(geneUtils.formatGeneSymbol(entry))
    return formatted
def initTargets(targets_file, protein_file, __ENABLE_GENE_VERIFICATION=0, __ENABLE_GENE_UPDATES=0):
    """Load drug-target gene names and the drugs attached to each target.

    targets_file               -- path loaded into the module-level
                                  __targetCatalogue; column 0 is read as an
                                  integer gene id and column 2 as the gene
                                  symbol.
    protein_file               -- path handed to parseFASTA.  The first
                                  whitespace-separated token of each record's
                                  element [1] is read as the integer gene id,
                                  and the text between its last "(" and last
                                  ")" as a ";"-separated list of drug ids.
    __ENABLE_GENE_VERIFICATION -- when truthy, symbols rejected by
                                  geneDB.isApproved are collected and skipped.
    __ENABLE_GENE_UPDATES      -- when truthy, a symbol is replaced by the
                                  result of geneDB.findUpdatedSymbol if that
                                  result is not None.

    Side effects: fills __geneNames, __geneSet and __drugDict, removes genes
    without drugs from __geneSet, adds placeholder entries to __drugs for
    unknown drug ids, deletes untargeted drugs from __drugs, and writes
    results/log/invalid_drugbank.txt.  Returns None.
    """
    global __DEBUG, __targetCatalogue, __geneSet, __geneNames, __drugDict

    __targetCatalogue.load(targets_file)

    rejectedSet = set()
    updatedSet = set()

    for r in xrange(1, __targetCatalogue.rows + 1):
        geneId = int(__targetCatalogue.get(r, 0))
        geneName = geneUtils.formatGeneSymbol(__targetCatalogue.get(r, 2))

        if geneName is None:
            continue

        if __ENABLE_GENE_UPDATES:
            parentSym = geneDB.findUpdatedSymbol(geneName)
            if parentSym is not None:
                updatedSet.add(geneName)
                geneName = parentSym

        if __ENABLE_GENE_VERIFICATION and not geneDB.isApproved(geneName):
            # The rejection message is tied to the debug level only; the
            # former trailing `or geneName == "papc"` clause printed for that
            # one symbol at every debug level.
            if __DEBUG > 2 and geneName != "":
                print("Rejected: %s" % (geneName,))
            rejectedSet.add(geneName)
            continue

        __geneNames[geneId] = geneName
        __geneSet.add(geneName)
        __drugDict[geneName] = set()

    # 'with' closes the log even if a write fails.
    with open(os.sep.join(["results", "log", "invalid_drugbank.txt"]), 'w') as invalid_file:
        for geneName in rejectedSet:
            invalid_file.write(geneName + "\n")

    proteins = parseFASTA(protein_file)
    targeted_drugs = set()

    for fasta in proteins:
        header = fasta[1]
        geneId = int(header.split()[0])
        if geneId not in __geneNames:
            continue
        # assumes every header carries a "(...)" drug list - TODO confirm;
        # rfind returns -1 when a parenthesis is missing.
        parenthetical = header[header.rfind("(") + 1:header.rfind(")")]
        for drug in parenthetical.split(";"):
            drugbankid = drug.strip()
            __drugDict[__geneNames[geneId]].add(drugbankid)
            targeted_drugs.add(drugbankid)

    # Genes that ended up without any drug are dropped from the gene set.
    removable = set()
    for gene in __geneSet:
        if gene not in __drugDict or len(__drugDict[gene]) == 0:
            removable.add(gene)
    empty_gene_drug_targets = len(removable)
    __geneSet -= removable

    # Bring __drugs in line with the drugs seen in the protein file: unknown
    # ids get a placeholder record, untargeted ones are deleted.
    removable_drugs = set()
    for drugbankid in __drugs:
        if drugbankid not in targeted_drugs:
            removable_drugs.add(drugbankid)
    for drugbankid in targeted_drugs:
        if drugbankid not in __drugs:
            __drugs[drugbankid] = {'name': drugbankid}
    for drugbankid in removable_drugs:
        del __drugs[drugbankid]

    if __DEBUG > 0:
        print("\n------------------------------------------")
        print("Invalid Drug Target Gene Symbols:  %s" % (len(rejectedSet),))
        print("Updated Drug Target Gene Symbols:  %s" % (len(updatedSet),))
        print("Remaining Drug Target Gene Symbols:  %s" % (len(__geneSet),))
        print("Drugs with targets:  %s %s" % (len(targeted_drugs), len(__drugs)))
        print("Removed: %s genes without targeting drugs" % (empty_gene_drug_targets,))
        print("------------------------------------------\n")