def wordCounter(db):
    """Count how often each word occurs across every gene in a database.

    Args:
        db: Mapping whose values expose a ``words`` iterable.

    Returns:
        A list of ``WordFreq`` objects sorted by ``freq``, highest first.
        Words with equal frequency are left in reverse alphabetical order.
    """
    from genewordsearch.Classes import WordFreq

    # Gather every word attached to every gene, then sort so that equal
    # words sit next to each other and can be counted in a single pass.
    words = []
    for gene in db.values():
        words.extend(gene.words)
    words.sort()

    # Append to the tail (O(1)) rather than inserting at the head (O(n)).
    wordList = []
    for item in words:
        if wordList and wordList[-1].word == item:
            wordList[-1].increment()
        else:
            wordList.append(WordFreq(item, 1))

    # Reversing reproduces the reverse-alphabetical order that head insertion
    # used to give, so ties come out of the stable sort in the same order.
    wordList.reverse()
    return sorted(wordList, key=lambda item: item.freq, reverse=True)
def custom_db_analysis():
    """Build a temporary database from uploaded files and run the analysis.

    Reads the uploaded ``geneDBs`` files and, for each file index ``i``, the
    form fields ``header<i>``, ``delimiter<i>``, ``geneCol<i>`` and
    ``desCols<i>``. A database keyed by the client's address is built with
    ``geneWordBuilder``, then ``geneWordSearch`` is run on ``geneList`` with
    ``probCut`` as the probability cutoff.

    Returns:
        A JSON response whose ``result`` is the serialized word list.

    Aborts with HTTP 400 when ``geneWordSearch`` raises ``KeyError``.
    """
    import os
    import shutil
    from genewordsearch.DBBuilder import geneWordBuilder

    # Prep the database files for processing
    ip = str(request.environ['REMOTE_ADDR'])
    folder = os.path.join(app.config['UPLOAD_FOLDER'], ip)
    os.makedirs(folder, exist_ok=True)
    try:
        dbFiles = request.files.getlist('geneDBs')

        # Record the saved paths in upload order. Re-listing the folder and
        # sorting by name puts '10.ext' before '2.ext', which would pair files
        # with the wrong options once ten or more files are uploaded, and
        # would also pick up stale files left by an earlier failed request.
        fileList = []
        for fileNum, db in enumerate(dbFiles):
            filename = secure_filename(db.filename)
            savedPath = os.path.join(folder, str(fileNum) + filename[-4:])
            db.save(savedPath)
            fileList.append(savedPath)

        # Pull and organize the rest of the database info
        headers = []
        delimiters = []
        geneCols = []
        desCols = []
        for fileNum in range(len(dbFiles)):
            headers.append(str(request.form['header' + str(fileNum)]) == 'y')
            delimiters.append(str(request.form['delimiter' + str(fileNum)]))
            geneCols.append(str(request.form['geneCol' + str(fileNum)]))
            desCols.append(str(request.form['desCols' + str(fileNum)]))

        geneWordBuilder(ip, fileList, geneCols, desCols, delimiters, headers)
    finally:
        # The uploads are only needed while building; remove them even when
        # the build fails.
        shutil.rmtree(folder + '/')

    # Run the enrichment analysis
    try:
        genes = str(request.form['geneList'])
        probCutoff = float(request.form['probCut'])
        genes = [gene for gene in re.split('\r| |,|\t|\n', genes) if gene != '']
        try:
            results = geneWordSearch(genes, ip, minChance=probCutoff)
        except KeyError:
            abort(400)
        ans = WordFreq.to_JSON_array(results[0])
    finally:
        # Remove the per-client database on every exit path, including the
        # 400 abort above.
        shutil.rmtree('genewordsearch/databases/' + ip + '/')
    return jsonify(result=ans)
def custom_db_analysis():
    """Build a temporary database from uploaded files and run the analysis.

    Reads the uploaded ``geneDBs`` files and, for each file index ``i``, the
    form fields ``header<i>``, ``delimiter<i>``, ``geneCol<i>`` and
    ``desCols<i>``. A database keyed by the client's address is built with
    ``geneWordBuilder``, then ``geneWordSearch`` is run on ``geneList`` with
    ``probCut`` as the probability cutoff.

    Returns:
        A JSON response whose ``result`` is the serialized word list.

    Aborts with HTTP 400 when ``geneWordSearch`` raises ``KeyError``.
    """
    # Prep the database files for processing
    ip = str(request.environ['REMOTE_ADDR'])
    folder = os.path.join(app.config['UPLOAD_FOLDER'], ip)
    os.makedirs(folder, exist_ok=True)
    try:
        dbFiles = request.files.getlist('geneDBs')

        # Record the saved paths in upload order. Re-listing the folder and
        # sorting by name puts '10.ext' before '2.ext', which would pair files
        # with the wrong options once ten or more files are uploaded, and
        # would also pick up stale files left by an earlier failed request.
        fileList = []
        for fileNum, db in enumerate(dbFiles):
            filename = secure_filename(db.filename)
            savedPath = os.path.join(folder, str(fileNum) + filename[-4:])
            db.save(savedPath)
            fileList.append(savedPath)

        # Pull and organize the rest of the database info
        headers = []
        delimiters = []
        geneCols = []
        desCols = []
        for fileNum in range(len(dbFiles)):
            headers.append(str(request.form['header' + str(fileNum)]) == 'y')
            delimiters.append(str(request.form['delimiter' + str(fileNum)]))
            geneCols.append(str(request.form['geneCol' + str(fileNum)]))
            desCols.append(str(request.form['desCols' + str(fileNum)]))

        geneWordBuilder(ip, fileList, geneCols, desCols, delimiters, headers)
    finally:
        # The uploads are only needed while building; remove them even when
        # the build fails.
        shutil.rmtree(folder + '/')

    # Run the enrichment analysis
    try:
        genes = str(request.form['geneList'])
        probCutoff = float(request.form['probCut'])
        genes = [gene for gene in re.split('\r| |,|\t|\n', genes) if gene != '']
        try:
            results = geneWordSearch(genes, ip, minChance=probCutoff)
        except KeyError:
            abort(400)
        ans = WordFreq.to_JSON_array(results[0])
    finally:
        # Remove the per-client database on every exit path, including the
        # 400 abort above.
        shutil.rmtree('genewordsearch/databases/' + ip + '/')
    return jsonify(result=ans)
def gene_word_search():
    """Run the gene word search for the network named in the posted form.

    Reads ``network``, ``pCutoff`` and ``geneList`` from the form. Aborts
    with 405 when the network's ``parent_refgen`` is not in ``func_data_db``
    and with 400 when the search returns no words; otherwise returns the
    serialized word list as JSON under ``result``.
    """
    cob = networks[str(request.form['network'])]
    pCutoff = safeOpts('pCutoff', float(request.form['pCutoff']))
    rawGenes = str(request.form['geneList'])

    # Split on the accepted separators and drop the empty fragments.
    geneList = [
        token for token in re.split('\r| |,|;|\t|\n', rawGenes) if token != ''
    ]

    # Run the analysis and return the JSONified results
    if cob._global('parent_refgen') not in func_data_db:
        abort(405)
    results = geneWordSearch(
        geneList, cob._global('parent_refgen'), minChance=pCutoff)

    if len(results[0]) == 0:
        abort(400)
    return jsonify(result=WordFreq.to_JSON_array(results[0]))
def gene_word_search():
    """Run the gene word search for the network named in the posted form.

    Reads ``network``, ``pCutoff`` and ``geneList`` from the form. Aborts
    with 405 when ``hasGWS`` is falsy or the network's ``parent_refgen`` is
    not in ``func_data_db``, and with 400 when the search returns no words;
    otherwise returns the serialized word list as JSON under ``result``.
    """
    cob = networks[str(request.form['network'])]
    pCutoff = safeOpts('pCutoff', float(request.form['pCutoff']))
    rawGenes = str(request.form['geneList'])

    # Split on the accepted separators and drop the empty fragments.
    geneList = [
        token for token in re.split('\r| |,|;|\t|\n', rawGenes) if token != ''
    ]

    # Run the analysis and return the JSONified results
    if not hasGWS or (cob._global('parent_refgen') not in func_data_db):
        abort(405)
    results = geneWordSearch(
        geneList, cob._global('parent_refgen'), minChance=pCutoff)

    if len(results[0]) == 0:
        abort(400)
    return jsonify(result=WordFreq.to_JSON_array(results[0]))
def gene_analysis():
    """Run the posted gene list through ``geneWordSearch``.

    Reads ``species``, ``geneList`` and ``probCut`` from the form, aborts
    with 400 when the search raises ``KeyError``, and otherwise returns the
    serialized word list as JSON under ``result``.
    """
    # Sanitize the input
    species = str(request.form['species'])
    rawGenes = str(request.form['geneList'])
    probCutoff = float(request.form['probCut'])

    # Break the text on CR, space, comma, tab or LF; keep non-empty pieces.
    genes = [
        token for token in re.split('\r| |,|\t|\n', rawGenes) if token != ''
    ]

    # Run the analysis and return the JSONified results
    try:
        results = geneWordSearch(genes, species, minChance=probCutoff)
    except KeyError:
        abort(400)
    return jsonify(result=WordFreq.to_JSON_array(results[0]))
def gene_analysis():
    """Run the posted gene list through ``geneWordSearch``.

    Form fields used: ``species``, ``geneList`` and ``probCut``. A
    ``KeyError`` from the search becomes an HTTP 400; otherwise the
    serialized word list is returned as JSON under ``result``.
    """
    # Sanitize the input
    species = str(request.form['species'])
    geneText = str(request.form['geneList'])
    probCutoff = float(request.form['probCut'])

    # re.split only yields strings, so dropping falsy items drops exactly
    # the empty ones.
    genes = list(filter(None, re.split('\r| |,|\t|\n', geneText)))

    # Run the analysis and return the JSONified results
    try:
        results = geneWordSearch(genes, species, minChance=probCutoff)
    except KeyError:
        abort(400)

    ans = WordFreq.to_JSON_array(results[0])
    return jsonify(result=ans)
def bookkeeper(species, geneDB, countList):
    """Write the gene and word-count database files for a species.

    Four files are written under ``databases/<species lower-cased>/`` inside
    this package: ``geneNotes.tsv``, ``geneNotes.p``, ``totalWordCounts.tsv``
    and ``totalWordCounts.p``.

    Args:
        species: Species name; lower-cased to form the folder name.
        geneDB: Mapping of gene objects; pickled as-is, and every value whose
            ``gene`` attribute is non-empty is written to the TSV via ``str``.
        countList: List of objects with ``word`` and ``freq``. NOTE: mutated
            in place, a ``'Total Count'`` entry is inserted at index 0.

    Returns:
        None.
    """
    import os
    import pickle
    import pkg_resources
    from genewordsearch.Classes import WordFreq

    # Find the total word count, add it to the list
    total = sum(word.freq for word in countList)
    countList.insert(0, WordFreq('Total Count', total))

    # Determine outfile locations
    dbFolder = 'databases/' + species.lower() + '/'
    folder = pkg_resources.resource_filename(__name__, dbFolder)
    os.makedirs(folder, exist_ok=True)

    # --------------Save the gene database files-------------------
    # Make a text version for posterity (and error checking)
    with open(os.path.join(folder, 'geneNotes.tsv'), 'w', newline='') as geneFile:
        for gene in geneDB.values():
            if gene.gene != '':
                geneFile.write(str(gene))

    # Pickled copy, read back by geneWordSearch. The context manager makes
    # sure the handle is flushed and closed.
    with open(os.path.join(folder, 'geneNotes.p'), 'wb') as genePickle:
        pickle.dump(geneDB, genePickle)

    # ---------------Save the total word count files----------------
    # Make a text version for posterity (and error checking)
    with open(os.path.join(folder, 'totalWordCounts.tsv'), 'w') as countFile:
        for word in countList:
            countFile.write(str(word.freq) + '\t' + str(word.word) + '\n')

    # Pickled word -> frequency dictionary, read back by geneWordSearch
    countDB = {word.word: word.freq for word in countList}
    with open(os.path.join(folder, 'totalWordCounts.p'), 'wb') as countPickle:
        pickle.dump(countDB, countPickle)
    return
def bookkeeper(species, geneDB, countList):
    """Internal helper: write the gene and word-count files for a species.

    Four files are written into the folder returned by ``getPath(species)``:
    ``geneNotes.tsv``, ``geneNotes.p``, ``totalWordCounts.tsv`` and
    ``totalWordCounts.p``.

    Args:
        species: Species name, passed to ``getPath``.
        geneDB: Mapping of gene objects; pickled as-is, and every value whose
            ``gene`` attribute is non-empty is written to the TSV via ``str``.
        countList: List of objects with ``word`` and ``freq``. NOTE: mutated
            in place, a ``'Total Count'`` entry is inserted at index 0.

    Returns:
        None.
    """
    # Find the total word count, add it to the list
    total = sum(word.freq for word in countList)
    countList.insert(0, WordFreq('Total Count', total))

    # Determine outfile locations
    dbFolder = getPath(species)
    os.makedirs(dbFolder, exist_ok=True)

    # --------------Save the gene database files-------------------
    # Make a text version for posterity (and error checking)
    with open(os.path.join(dbFolder, 'geneNotes.tsv'), 'w', newline='') as geneFile:
        for gene in geneDB.values():
            if gene.gene != '':
                geneFile.write(str(gene))

    # Pickled copy, read back by geneWordSearch. The context manager makes
    # sure the handle is flushed and closed.
    with open(os.path.join(dbFolder, 'geneNotes.p'), 'wb') as genePickle:
        pickle.dump(geneDB, genePickle)

    # ---------------Save the total word count files----------------
    # Make a text version for posterity (and error checking)
    with open(os.path.join(dbFolder, 'totalWordCounts.tsv'), 'w') as countFile:
        for word in countList:
            countFile.write(str(word.freq) + '\t' + str(word.word) + '\n')

    # Pickled word -> frequency dictionary, read back by geneWordSearch
    countDB = {word.word: word.freq for word in countList}
    with open(os.path.join(dbFolder, 'totalWordCounts.p'), 'wb') as countPickle:
        pickle.dump(countDB, countPickle)
    return
def geneWordSearch(genes, species, minChance=0.05, corrected=False):
    """Find the words over-represented in the annotations of a gene set.

    Args:
        genes: Iterable of gene id strings; each is lower-cased before lookup.
        species: Name of the database folder under ``databases/``.
        minChance: Only words whose ``p`` is at most this value are returned.
        corrected: When true the result is ordered by corrected p value
            (``pCor``) instead of ``p``. The cutoff still uses ``p``.

    Returns:
        A tuple ``(words, links)``: the list of ``WordFreq`` objects that pass
        the cutoff, and the list of links collected from the genes.

    Raises:
        ValueError: No ``geneNotes.p`` exists for ``species``.
        KeyError: A gene is not present in the database.
    """
    import pickle
    import pkg_resources
    from genewordsearch.Classes import WordFreq

    # Unpickle the database of words
    dbFolder = 'databases/' + species
    if not pkg_resources.resource_exists(__name__, dbFolder + '/geneNotes.p'):
        raise ValueError(
            'There is no database associated with this species, please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.'
        )
    notesPath = pkg_resources.resource_filename(
        __name__, dbFolder + '/geneNotes.p')
    with open(notesPath, 'rb') as dbfile:
        db = pickle.load(dbfile)

    # Build the word list up for all of the genes provided.
    words = []
    webSites = []
    for item in genes:
        # Make the input all lowercase to match the database
        geneData = db[item.lower()]
        for word in geneData.words:
            words.append([word, geneData.gene])
        webSites.extend(geneData.links)

    # Sort so that equal words are adjacent and can be counted in one pass
    words.sort()

    # Counting the words. Appending is O(1); the reverse afterwards restores
    # the reverse-alphabetical order the later stable sorts start from.
    wordList = []
    for word, geneId in words:
        if wordList and wordList[-1].word == word:
            wordList[-1].increment()
        else:
            wordList.append(WordFreq(word, 1))
        wordList[-1].addGene(geneId)
    wordList.reverse()

    # Getting rid of words that don't happen in enough genes to matter
    wordList = [word for word in wordList if word.freq >= 3]
    length = sum(word.freq for word in wordList)

    # Finding the respective P values
    countsPath = pkg_resources.resource_filename(
        __name__, dbFolder + '/totalWordCounts.p')
    with open(countsPath, 'rb') as pickleDict:
        wordCounts = pickle.load(pickleDict)
    totalWords = wordCounts['Total Count']
    for word in wordList:
        word.computeP(wordCounts, length, totalWords)

    # Sorting now by P Value instead of alphabetical
    wordList = sorted(wordList, key=lambda item: item.p)

    # Corrected P values (Holm-Bonferroni per the original comment); each
    # word is given the total count and its 1-based rank by p.
    count = len(wordList)
    for rank, word in enumerate(wordList, start=1):
        word.pCorrect(count, rank)

    # Sort by corrected P Value instead of original P value if desired
    if corrected:
        wordList = sorted(wordList, key=lambda item: item.pCor)

    # Filtering out results that are higher than the minimum chance threshold
    passing = [word for word in wordList if word.p <= minChance]
    return (passing, list(webSites))
def geneWordSearch(genes, species, minChance=0.05, minWordFreq=3, corrected=False):
    """Find the words over-represented in the annotations of a gene set.

    Args:
        genes: Iterable of gene id strings; each is lower-cased before lookup.
            Ids that are not in the database are skipped silently.
        species: Species name, resolved to a database folder by ``getPath``.
        minChance: Only words whose ``p`` is at most this value are returned.
        minWordFreq: Minimum ``freq`` a word needs within the set to be kept.
        corrected: When true the result is ordered by corrected p value
            (``pCor``) instead of ``p``.
            NOTE(review): the cutoff always uses ``p``, although the original
            comment said ``corrected`` switches the cutoff too. Confirm which
            is intended.

    Returns:
        A tuple ``(words, links)``: the list of ``WordFreq`` objects that pass
        the cutoff, and the list of links collected from the genes.

    Raises:
        ValueError: The ``geneNotes.p`` file for ``species`` cannot be opened.
    """
    # Unpickle the database of words
    dbFolder = getPath(species)
    try:
        dbfile = open(os.path.join(dbFolder, 'geneNotes.p'), 'rb')
    except OSError as err:
        raise ValueError(
            'There is no database associated with ' + species +
            ', please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.'
        ) from err
    with dbfile:
        db = pickle.load(dbfile)

    # Build the word list up for all of the genes provided.
    words = []
    webSites = []
    for item in genes:
        # Make the input all lowercase to match the database, skip the gene
        # if it is not there
        try:
            geneData = db[item.lower()]
        except KeyError:
            continue
        for word in geneData.words:
            words.append([word, geneData.gene])
        webSites.extend(geneData.links)

    # Sort so that equal words are adjacent and can be counted in one pass
    words.sort()

    # Counting the words. Appending is O(1); the reverse afterwards restores
    # the reverse-alphabetical order the later stable sorts start from.
    wordList = []
    for word, geneId in words:
        if wordList and wordList[-1].word == word:
            wordList[-1].increment()
        else:
            wordList.append(WordFreq(word, 1))
        wordList[-1].addGene(geneId)
    wordList.reverse()

    # Getting rid of words that don't happen in enough genes to matter
    wordList = [word for word in wordList if word.freq >= minWordFreq]
    length = sum(word.freq for word in wordList)

    # Finding the respective P values
    with open(os.path.join(dbFolder, 'totalWordCounts.p'), 'rb') as pickleDict:
        wordCounts = pickle.load(pickleDict)
    totalWords = wordCounts['Total Count']
    for word in wordList:
        word.computeP(wordCounts, length, totalWords)

    # Sorting now by P Value instead of alphabetical
    wordList = sorted(wordList, key=lambda item: item.p)

    # Corrected P values (Holm-Bonferroni per the original comment); each
    # word is given the total count and its 1-based rank by p.
    count = len(wordList)
    for rank, word in enumerate(wordList, start=1):
        word.pCorrect(count, rank)

    # Sort by corrected P Value instead of original P value if desired
    if corrected:
        wordList = sorted(wordList, key=lambda item: item.pCor)

    # Filtering out results that are higher than the minimum chance threshold
    passing = [word for word in wordList if word.p <= minChance]
    return (passing, list(webSites))
def geneWordSearch(genes, species, minChance=0.05, minWordFreq=3, corrected=False):
    """Find the words over-represented in the annotations of a gene set.

    Args:
        genes: Iterable of gene id strings; each is lower-cased before lookup.
            Ids that are not in the database are skipped silently.
        species: Name of the database folder under ``databases/``.
        minChance: Only words whose ``p`` is at most this value are returned.
        minWordFreq: Minimum ``freq`` a word needs within the set to be kept.
        corrected: When true the result is ordered by corrected p value
            (``pCor``) instead of ``p``.
            NOTE(review): the cutoff always uses ``p``, although the original
            comment said ``corrected`` switches the cutoff too. Confirm which
            is intended.

    Returns:
        A tuple ``(words, links)``: the list of ``WordFreq`` objects that pass
        the cutoff, and the list of links collected from the genes.

    Raises:
        ValueError: No ``geneNotes.p`` exists for ``species``.
    """
    # Unpickle the database of words
    dbFolder = 'databases/' + species
    if not pkg_resources.resource_exists(__name__, dbFolder + '/geneNotes.p'):
        raise ValueError('There is no database associated with this species, please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.')
    notesPath = pkg_resources.resource_filename(
        __name__, dbFolder + '/geneNotes.p')
    with open(notesPath, 'rb') as dbfile:
        db = pickle.load(dbfile)

    # Build the word list up for all of the genes provided.
    words = []
    webSites = []
    for item in genes:
        # Make the input all lowercase to match the database, skip the gene
        # if it is not there
        try:
            geneData = db[item.lower()]
        except KeyError:
            continue
        for word in geneData.words:
            words.append([word, geneData.gene])
        webSites.extend(geneData.links)

    # Sort so that equal words are adjacent and can be counted in one pass
    words.sort()

    # Counting the words. Appending is O(1); the reverse afterwards restores
    # the reverse-alphabetical order the later stable sorts start from.
    wordList = []
    for word, geneId in words:
        if wordList and wordList[-1].word == word:
            wordList[-1].increment()
        else:
            wordList.append(WordFreq(word, 1))
        wordList[-1].addGene(geneId)
    wordList.reverse()

    # Getting rid of words that don't happen in enough genes to matter
    wordList = [word for word in wordList if word.freq >= minWordFreq]
    length = sum(word.freq for word in wordList)

    # Finding the respective P values
    countsPath = pkg_resources.resource_filename(
        __name__, dbFolder + '/totalWordCounts.p')
    with open(countsPath, 'rb') as pickleDict:
        wordCounts = pickle.load(pickleDict)
    totalWords = wordCounts['Total Count']
    for word in wordList:
        word.computeP(wordCounts, length, totalWords)

    # Sorting now by P Value instead of alphabetical
    wordList = sorted(wordList, key=lambda item: item.p)

    # Corrected P values (Holm-Bonferroni per the original comment); each
    # word is given the total count and its 1-based rank by p.
    count = len(wordList)
    for rank, word in enumerate(wordList, start=1):
        word.pCorrect(count, rank)

    # Sort by corrected P Value instead of original P value if desired
    if corrected:
        wordList = sorted(wordList, key=lambda item: item.pCor)

    # Filtering out results that are higher than the minimum chance threshold
    passing = [word for word in wordList if word.p <= minChance]
    return (passing, list(webSites))
def geneWordSearch(genes, species, minChance=0.05, corrected=False):
    """Find the words over-represented in the annotations of a gene set.

    Args:
        genes: Iterable of gene id strings; each is lower-cased before lookup.
        species: Name of the database folder under ``databases/``.
        minChance: Only words whose ``p`` is at most this value are returned.
        corrected: When true the result is ordered by corrected p value
            (``pCor``) instead of ``p``. The cutoff still uses ``p``.

    Returns:
        A tuple ``(words, links)``: the list of ``WordFreq`` objects that pass
        the cutoff, and the list of links collected from the genes.

    Raises:
        ValueError: No ``geneNotes.p`` exists for ``species``.
        KeyError: A gene is not present in the database.
    """
    import pickle
    import pkg_resources
    from genewordsearch.Classes import WordFreq

    # Unpickle the database of words
    dbFolder = 'databases/' + species
    if not pkg_resources.resource_exists(__name__, dbFolder + '/geneNotes.p'):
        raise ValueError('There is no database associated with this species, please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.')
    notesPath = pkg_resources.resource_filename(
        __name__, dbFolder + '/geneNotes.p')
    with open(notesPath, 'rb') as dbfile:
        db = pickle.load(dbfile)

    # Build the word list up for all of the genes provided.
    words = []
    webSites = []
    for item in genes:
        # Make the input all lowercase to match the database
        geneData = db[item.lower()]
        for word in geneData.words:
            words.append([word, geneData.gene])
        webSites.extend(geneData.links)

    # Sort so that equal words are adjacent and can be counted in one pass
    words.sort()

    # Counting the words. Appending is O(1); the reverse afterwards restores
    # the reverse-alphabetical order the later stable sorts start from.
    wordList = []
    for word, geneId in words:
        if wordList and wordList[-1].word == word:
            wordList[-1].increment()
        else:
            wordList.append(WordFreq(word, 1))
        wordList[-1].addGene(geneId)
    wordList.reverse()

    # Getting rid of words that don't happen in enough genes to matter
    wordList = [word for word in wordList if word.freq >= 3]
    length = sum(word.freq for word in wordList)

    # Finding the respective P values
    countsPath = pkg_resources.resource_filename(
        __name__, dbFolder + '/totalWordCounts.p')
    with open(countsPath, 'rb') as pickleDict:
        wordCounts = pickle.load(pickleDict)
    totalWords = wordCounts['Total Count']
    for word in wordList:
        word.computeP(wordCounts, length, totalWords)

    # Sorting now by P Value instead of alphabetical
    wordList = sorted(wordList, key=lambda item: item.p)

    # Corrected P values (Holm-Bonferroni per the original comment); each
    # word is given the total count and its 1-based rank by p.
    count = len(wordList)
    for rank, word in enumerate(wordList, start=1):
        word.pCorrect(count, rank)

    # Sort by corrected P Value instead of original P value if desired
    if corrected:
        wordList = sorted(wordList, key=lambda item: item.pCor)

    # Filtering out results that are higher than the minimum chance threshold
    passing = [word for word in wordList if word.p <= minChance]
    return (passing, list(webSites))