Python WordFreq示例，genewordsearch.Classes.WordFreq Python示例

示例#1

0

显示文件

文件： DBBuilder.py 项目： schae234/geneWordSearch

def wordCounter(db):
    """Count how often each word occurs across every gene in a database.

    Args:
        db: Dictionary whose values each expose a ``words`` list. The keys
            are not used.

    Returns:
        A list of WordFreq objects, one per distinct word, ordered from the
        highest ``freq`` to the lowest.
    """
    from genewordsearch.Classes import GeneNote
    from genewordsearch.Classes import WordFreq

    # Collect the words of every gene, then sort so that equal words are
    # adjacent and can be counted in a single pass.
    words = []
    for gene in db.values():
        words += gene.words
    words.sort()

    # Counting the words. Appending at the tail is O(1); inserting at the
    # head of the list would cost O(n) per distinct word.
    wordList = []
    for item in words:
        if wordList and wordList[-1].word == item:
            wordList[-1].increment()
        else:
            wordList.append(WordFreq(item, 1))
    del words

    # Put the list into descending alphabetical order before the stable
    # frequency sort, so words with equal frequency come out in that order.
    wordList.reverse()

    # Sorting now by frequency instead of alphabetical and return it
    return sorted(wordList, key=lambda item: item.freq, reverse=True)

示例#2

0

显示文件

文件： views.py 项目： schae234/geneWordSearch

def custom_db_analysis():
    """Run an enrichment analysis against a database uploaded with the request.

    Reads from the current Flask request:
        * files ``geneDBs`` - the uploaded database files
        * form ``header<i>``, ``delimiter<i>``, ``geneCol<i>``, ``desCols<i>``
          - one set per uploaded file, ``<i>`` counting from 0
        * form ``geneList`` - gene ids separated by whitespace or commas
        * form ``probCut`` - probability cutoff, parsed with ``float``

    The uploads are saved under ``UPLOAD_FOLDER/<client address>`` and handed
    to ``geneWordBuilder`` with the client address as the database name.
    Both that upload folder and ``genewordsearch/databases/<client address>/``
    are removed before the function exits, including when it exits through
    an error or ``abort``.

    Returns:
        ``jsonify(result=...)`` holding the JSON array of the words returned
        by ``geneWordSearch``.

    Aborts with 400 when ``geneWordSearch`` raises ``KeyError``.
    """
    import os
    import glob
    import shutil
    from genewordsearch.DBBuilder import geneWordBuilder

    # Prep the database files for processing
    ip = str(request.environ['REMOTE_ADDR'])
    folder = os.path.join(app.config['UPLOAD_FOLDER'], ip)
    os.makedirs(folder, exist_ok=True)
    try:
        dbFiles = request.files.getlist('geneDBs')
        fileCount = len(dbFiles)
        for fileNum, db in enumerate(dbFiles):
            filename = secure_filename(db.filename)
            # Saved as "<index><last four characters of the upload name>"
            db.save(os.path.join(folder, str(fileNum) + filename[-4:]))
        fileList = sorted(glob.glob(folder + '/*'))

        # Pull and organize the rest of the database info
        headtxt = []
        delimiters = []
        geneCols = []
        desCols = []
        for fileNum in range(fileCount):
            headtxt.append(str(request.form['header' + str(fileNum)]))
            delimiters.append(str(request.form['delimiter' + str(fileNum)]))
            geneCols.append(str(request.form['geneCol' + str(fileNum)]))
            desCols.append(str(request.form['desCols' + str(fileNum)]))
        headers = [header == 'y' for header in headtxt]
        geneWordBuilder(ip, fileList, geneCols, desCols, delimiters, headers)
    finally:
        # The raw uploads are not needed once the build step has finished
        # (or failed), so never leave them behind.
        shutil.rmtree(folder + '/')

    # Run the enrichment analysis
    try:
        genes = str(request.form['geneList'])
        probCutoff = float(request.form['probCut'])
        genes = [gene for gene in re.split('\r| |,|\t|\n', genes) if gene != '']
        try:
            results = geneWordSearch(genes, ip, minChance=probCutoff)
        except KeyError:
            abort(400)
        ans = WordFreq.to_JSON_array(results[0])
    finally:
        # abort() raises, so without the finally the per-client database
        # would stay on disk after every failed request.
        shutil.rmtree('genewordsearch/databases/' + ip + '/')
    return jsonify(result=ans)

示例#3

0

显示文件

def custom_db_analysis():
    """Run an enrichment analysis against a database uploaded with the request.

    Reads from the current Flask request:
        * files ``geneDBs`` - the uploaded database files
        * form ``header<i>``, ``delimiter<i>``, ``geneCol<i>``, ``desCols<i>``
          - one set per uploaded file, ``<i>`` counting from 0
        * form ``geneList`` - gene ids separated by whitespace or commas
        * form ``probCut`` - probability cutoff, parsed with ``float``

    The uploads are saved under ``UPLOAD_FOLDER/<client address>`` and handed
    to ``geneWordBuilder`` with the client address as the database name.
    Both that upload folder and ``genewordsearch/databases/<client address>/``
    are removed before the function exits, including when it exits through
    an error or ``abort``.

    Returns:
        ``jsonify(result=...)`` holding the JSON array of the words returned
        by ``geneWordSearch``.

    Aborts with 400 when ``geneWordSearch`` raises ``KeyError``.
    """
    # Prep the database files for processing
    ip = str(request.environ['REMOTE_ADDR'])
    folder = os.path.join(app.config['UPLOAD_FOLDER'], ip)
    os.makedirs(folder, exist_ok=True)
    try:
        dbFiles = request.files.getlist('geneDBs')
        fileCount = len(dbFiles)
        for fileNum, db in enumerate(dbFiles):
            filename = secure_filename(db.filename)
            # Saved as "<index><last four characters of the upload name>"
            db.save(os.path.join(folder, str(fileNum) + filename[-4:]))
        fileList = sorted(glob.glob(folder + '/*'))

        # Pull and organize the rest of the database info
        headtxt = []
        delimiters = []
        geneCols = []
        desCols = []
        for fileNum in range(fileCount):
            headtxt.append(str(request.form['header' + str(fileNum)]))
            delimiters.append(str(request.form['delimiter' + str(fileNum)]))
            geneCols.append(str(request.form['geneCol' + str(fileNum)]))
            desCols.append(str(request.form['desCols' + str(fileNum)]))
        headers = [header == 'y' for header in headtxt]
        geneWordBuilder(ip, fileList, geneCols, desCols, delimiters, headers)
    finally:
        # The raw uploads are not needed once the build step has finished
        # (or failed), so never leave them behind.
        shutil.rmtree(folder + '/')

    # Run the enrichment analysis
    try:
        genes = str(request.form['geneList'])
        probCutoff = float(request.form['probCut'])
        genes = [gene for gene in re.split('\r| |,|\t|\n', genes) if gene != '']
        try:
            results = geneWordSearch(genes, ip, minChance=probCutoff)
        except KeyError:
            abort(400)
        ans = WordFreq.to_JSON_array(results[0])
    finally:
        # abort() raises, so without the finally the per-client database
        # would stay on disk after every failed request.
        shutil.rmtree('genewordsearch/databases/' + ip + '/')
    return jsonify(result=ans)

示例#4

0

显示文件

def gene_word_search():
    """Run geneWordSearch for the gene list posted with the request.

    Reads the form fields ``network``, ``pCutoff`` and ``geneList``. The gene
    list is split on whitespace, commas and semicolons; empty pieces are
    dropped.

    Returns:
        ``jsonify(result=...)`` holding the JSON array of the returned words.

    Aborts with 405 when the network's ``parent_refgen`` is not in
    ``func_data_db`` and with 400 when the search returns no words.
    """
    cob = networks[str(request.form['network'])]
    pCutoff = safeOpts('pCutoff', float(request.form['pCutoff']))
    raw_genes = str(request.form['geneList'])
    gene_ids = [tok for tok in re.split('\r| |,|;|\t|\n', raw_genes) if tok != '']

    # Guard: no functional data registered for this reference genome
    if cob._global('parent_refgen') not in func_data_db:
        abort(405)
    found = geneWordSearch(gene_ids, cob._global('parent_refgen'), minChance=pCutoff)

    # Guard: nothing passed the cutoff
    if len(found[0]) == 0:
        abort(400)
    return jsonify(result=WordFreq.to_JSON_array(found[0]))

示例#5

0

显示文件

文件： server.py 项目： UMN-EGGL/cob

def gene_word_search():
    """Run geneWordSearch for the gene list posted with the request.

    Reads the form fields ``network``, ``pCutoff`` and ``geneList``. The gene
    list is split on whitespace, commas and semicolons; empty pieces are
    dropped.

    Returns:
        ``jsonify(result=...)`` holding the JSON array of the returned words.

    Aborts with 405 when ``hasGWS`` is false or the network's
    ``parent_refgen`` is not in ``func_data_db``, and with 400 when the
    search returns no words.
    """
    cob = networks[str(request.form['network'])]
    pCutoff = safeOpts('pCutoff', float(request.form['pCutoff']))
    raw_genes = str(request.form['geneList'])
    gene_ids = [tok for tok in re.split('\r| |,|;|\t|\n', raw_genes) if tok != '']

    # Guard: search unavailable, or no functional data for this genome
    if not hasGWS or (cob._global('parent_refgen') not in func_data_db):
        abort(405)
    found = geneWordSearch(gene_ids, cob._global('parent_refgen'), minChance=pCutoff)

    # Guard: nothing passed the cutoff
    if len(found[0]) == 0:
        abort(400)
    return jsonify(result=WordFreq.to_JSON_array(found[0]))

示例#6

0

显示文件

def gene_analysis():
    """Run the posted gene list through geneWordSearch.

    Reads the form fields ``species``, ``geneList`` and ``probCut``. The gene
    list is split on whitespace and commas; empty pieces are dropped.

    Returns:
        ``jsonify(result=...)`` holding the JSON array of the returned words.

    Aborts with 400 when ``geneWordSearch`` raises ``KeyError``.
    """
    # Pull the inputs out of the form
    species = str(request.form['species'])
    raw_genes = str(request.form['geneList'])
    probCutoff = float(request.form['probCut'])
    gene_ids = [tok for tok in re.split('\r| |,|\t|\n', raw_genes) if tok != '']

    # Do the search; a KeyError from it is reported as a bad request
    try:
        found = geneWordSearch(gene_ids, species, minChance=probCutoff)
    except KeyError:
        abort(400)
    return jsonify(result=WordFreq.to_JSON_array(found[0]))

示例#7

0

显示文件

文件： views.py 项目： monprin/geneWordSearch

def gene_analysis():
    """Run the posted gene list through geneWordSearch.

    Reads the form fields ``species``, ``geneList`` and ``probCut``. The gene
    list is split on whitespace and commas; empty pieces are dropped.

    Returns:
        ``jsonify(result=...)`` holding the JSON array of the returned words.

    Aborts with 400 when ``geneWordSearch`` raises ``KeyError``.
    """
    # Pull the inputs out of the form
    species = str(request.form['species'])
    raw_genes = str(request.form['geneList'])
    probCutoff = float(request.form['probCut'])
    gene_ids = [tok for tok in re.split('\r| |,|\t|\n', raw_genes) if tok != '']

    # Do the search; a KeyError from it is reported as a bad request
    try:
        found = geneWordSearch(gene_ids, species, minChance=probCutoff)
    except KeyError:
        abort(400)
    return jsonify(result=WordFreq.to_JSON_array(found[0]))

示例#8

0

显示文件

文件： DBBuilder.py 项目： schae234/geneWordSearch

def bookkeeper(species, geneDB, countList):
    """Write the gene and word-count database files for one species.

    Four files are written into ``databases/<species, lower-cased>/`` inside
    this package: ``geneNotes.tsv``, ``geneNotes.p``, ``totalWordCounts.tsv``
    and ``totalWordCounts.p``. The folder is created if it is missing.

    Args:
        species: Name of the species; lower-cased to form the folder name.
        geneDB: Dictionary of gene objects. Each value needs a ``gene``
            attribute and a ``str()`` form, which is what goes into the
            ``.tsv`` file.
        countList: List of WordFreq objects. Modified in place: a
            ``'Total Count'`` entry holding the summed frequencies is
            inserted at index 0.

    Returns:
        None.
    """
    import os
    import pickle
    import pkg_resources
    from genewordsearch.Classes import WordFreq
    from genewordsearch.Classes import GeneNote

    # Find the total word count, add it to the list
    total = sum(word.freq for word in countList)
    countList.insert(0, WordFreq('Total Count', total))

    # Determine outfile locations
    dbFolder = 'databases/' + species.lower() + '/'
    folder = pkg_resources.resource_filename(__name__, dbFolder)
    os.makedirs(folder, exist_ok=True)

    # --------------Save the gene database files-------------------

    # Make a text version for posterity (and error checking); genes with an
    # empty id are left out of it.
    with open(os.path.join(folder, 'geneNotes.tsv'), 'w', newline='') as geneFile:
        for gene in geneDB.values():
            if not (gene.gene == ''):
                geneFile.write(str(gene))

    # Pickle that stuff! (for geneWordSearch function)
    with open(os.path.join(folder, 'geneNotes.p'), 'wb') as pickleFile:
        pickle.dump(geneDB, pickleFile)

    # ---------------Save the total word count files----------------

    # Make a text version for posterity (and error checking)
    with open(os.path.join(folder, 'totalWordCounts.tsv'), 'w') as countFile:
        for word in countList:
            countFile.write(str(word.freq) + '\t' + str(word.word) + '\n')

    # Pickle a dictionary of that stuff! (for geneWordSearch function)
    countDB = {word.word: word.freq for word in countList}
    with open(os.path.join(folder, 'totalWordCounts.p'), 'wb') as pickleFile:
        pickle.dump(countDB, pickleFile)

    return

示例#9

0

显示文件

def bookkeeper(species, geneDB, countList):
    """Write the gene and word-count database files for one species.

    Internal method. Four files are written into the folder returned by
    ``getPath(species)``: ``geneNotes.tsv``, ``geneNotes.p``,
    ``totalWordCounts.tsv`` and ``totalWordCounts.p``. The folder is created
    if it is missing.

    Args:
        species: Passed to ``getPath`` to locate the database folder.
        geneDB: Dictionary of gene objects. Each value needs a ``gene``
            attribute and a ``str()`` form, which is what goes into the
            ``.tsv`` file.
        countList: List of WordFreq objects. Modified in place: a
            ``'Total Count'`` entry holding the summed frequencies is
            inserted at index 0.

    Returns:
        None.
    """
    # Find the total word count, add it to the list
    total = sum(word.freq for word in countList)
    countList.insert(0, WordFreq('Total Count', total))

    # Determine outfile locations
    dbFolder = getPath(species)
    os.makedirs(dbFolder, exist_ok=True)

    # --------------Save the gene database files-------------------

    # Make a text version for posterity (and error checking); genes with an
    # empty id are left out of it.
    with open(os.path.join(dbFolder, 'geneNotes.tsv'), 'w', newline='') as geneFile:
        for gene in geneDB.values():
            if not (gene.gene == ''):
                geneFile.write(str(gene))

    # Pickle that stuff! (for geneWordSearch function)
    with open(os.path.join(dbFolder, 'geneNotes.p'), 'wb') as pickleFile:
        pickle.dump(geneDB, pickleFile)

    # ---------------Save the total word count files----------------

    # Make a text version for posterity (and error checking)
    with open(os.path.join(dbFolder, 'totalWordCounts.tsv'), 'w') as countFile:
        for word in countList:
            countFile.write(str(word.freq) + '\t' + str(word.word) + '\n')

    # Pickle a dictionary of that stuff! (for geneWordSearch function)
    countDB = {word.word: word.freq for word in countList}
    with open(os.path.join(dbFolder, 'totalWordCounts.p'), 'wb') as pickleFile:
        pickle.dump(countDB, pickleFile)

    return

示例#10

0

显示文件

文件： GeneWordSearch.py 项目： schae234/geneWordSearch

def geneWordSearch(genes, species, minChance=0.05, corrected=False):
    """Find the words attached to a set of genes and score them.

    Args:
        genes: List of gene id strings; each is lower-cased before lookup.
        species: Name of the database folder under ``databases/``.
        minChance: Largest ``p`` a word may have and still be returned.
        corrected: If true, the results are ordered by corrected p value
            (``pCor``) instead of by ``p``. The cutoff still uses ``p``.

    Returns:
        Tuple ``(words, links)``: the list of WordFreq objects with
        ``p <= minChance``, and the list of every link attached to the
        requested genes.

    Raises:
        ValueError: If the species has no ``geneNotes.p`` database file.
        KeyError: If a requested gene is not in the database.
    """
    import re
    import pickle
    import pkg_resources
    from genewordsearch.Classes import WordFreq
    from genewordsearch.Classes import GeneNote

    # Unpickle the database of words
    # NOTE(review): pickle.load executes code embedded in the file, so the
    # database files must come from a trusted source.
    dbFolder = 'databases/' + species
    if not pkg_resources.resource_exists(__name__, dbFolder + '/geneNotes.p'):
        raise ValueError(
            'There is no database associated with this species, please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.'
        )
    with open(
            pkg_resources.resource_filename(__name__,
                                            dbFolder + '/geneNotes.p'),
            'rb') as dbfile:
        db = pickle.load(dbfile)

    # Build the word list up for all of the genes provided.
    words = []
    webSites = []
    # NOTE(review): link counts are accumulated here but are not part of
    # the returned value.
    links = WordFreq('Web Links', 0)
    for item in genes:
        # Make the input all lowercase to match the database
        geneData = db[item.lower()]

        # Adding words related to the gene in db to the overall list
        for word in geneData.words:
            words.append([word, geneData.gene])

        # Dealing with the websites
        for link in geneData.links:
            links.addGene(geneData.gene)
            webSites.append(link)

    # Sort to put words in alphabetical order for counting
    words.sort()

    # Counting the words. Appending at the tail is O(1); inserting at the
    # head of the list would cost O(n) per distinct word.
    counted = []
    for word, gene in words:
        if counted and counted[-1].word == word:
            counted[-1].increment()
        else:
            counted.append(WordFreq(word, 1))
        counted[-1].addGene(gene)
    del words

    # Descending alphabetical order, so that words with equal p values keep
    # that relative order through the stable sorts below.
    counted.reverse()

    # Getting rid of words that don't happen in enough genes to matter
    wordList = [word for word in counted if word.freq >= 3]
    length = sum(word.freq for word in wordList)
    del counted

    # Finding the respective P values
    with open(
            pkg_resources.resource_filename(__name__,
                                            dbFolder + '/totalWordCounts.p'),
            'rb') as pickleDict:
        wordCounts = pickle.load(pickleDict)
    totalWords = wordCounts['Total Count']
    for word in wordList:
        word.computeP(wordCounts, length, totalWords)
    del wordCounts

    # Sorting now by P Value instead of alphabetical
    wordList = sorted(wordList, key=lambda item: item.p)

    # Finding corrected P Values; pCorrect receives the number of words and
    # the 1-based rank of this word in the p-sorted list.
    count = len(wordList)
    for rank, word in enumerate(wordList, start=1):
        word.pCorrect(count, rank)

    # Sort by corrected P Value instead of original P value if desired
    if corrected:
        wordList = sorted(wordList, key=lambda item: item.pCor)

    # Filtering out results that are higher than the minimum chance threshold
    wordList = [word for word in wordList if word.p <= minChance]

    return (wordList, list(webSites))

示例#11

0

显示文件

文件： GeneWordSearch.py 项目： monprin/geneWordSearch

def geneWordSearch(genes,
                   species,
                   minChance=0.05,
                   minWordFreq=3,
                   corrected=False):
    """Find the words attached to a set of genes and score them.

    Args:
        genes: List of gene id strings; each is lower-cased before lookup.
            Ids that are not in the database are skipped.
        species: Species name, passed to ``getPath`` to find the database.
        minChance: Largest p value a word may have and still be returned.
        minWordFreq: Smallest ``freq`` a word must reach within the set to
            be scored at all.
        corrected: If true, the results are ordered by, and cut off using,
            the corrected p value (``pCor``) instead of the original ``p``.
            That gives fewer results.

    Returns:
        Tuple ``(words, links)``: the list of WordFreq objects that pass the
        cutoff, and the list of every link attached to the requested genes.

    Raises:
        ValueError: If the ``geneNotes.p`` database file cannot be opened.
    """
    # Unpickle the database of words
    # NOTE(review): pickle.load executes code embedded in the file, so the
    # database files must come from a trusted source.
    dbFolder = getPath(species)
    try:
        dbfile = open(os.path.join(dbFolder, 'geneNotes.p'), 'rb')
    except OSError as err:
        raise ValueError(
            'There is no database associated with ' + species +
            ', please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.'
        ) from err
    with dbfile:
        db = pickle.load(dbfile)

    # Build the word list up for all of the genes provided.
    words = []
    webSites = []
    # NOTE(review): badGenes and links are filled in but are not part of
    # the returned value.
    badGenes = []
    links = WordFreq('Web Links', 0)
    for item in genes:
        # Make the input all lowercase to match the database
        gene = item.lower()

        # Get the object from the DB, skip term if it is not there
        try:
            geneData = db[gene]
        except KeyError:
            badGenes.append(gene)
            continue

        # Adding words related to the gene in db to the overall list
        for word in geneData.words:
            words.append([word, geneData.gene])

        # Dealing with the websites
        for link in geneData.links:
            links.addGene(geneData.gene)
            webSites.append(link)

    # Sort to put words in alphabetical order for counting
    words.sort()

    # Counting the words. Appending at the tail is O(1); inserting at the
    # head of the list would cost O(n) per distinct word.
    counted = []
    for word, geneId in words:
        if counted and counted[-1].word == word:
            counted[-1].increment()
        else:
            counted.append(WordFreq(word, 1))
        counted[-1].addGene(geneId)
    del words

    # Descending alphabetical order, so that words with equal p values keep
    # that relative order through the stable sorts below.
    counted.reverse()

    # Getting rid of words that don't happen in enough genes to matter
    wordList = [word for word in counted if word.freq >= minWordFreq]
    length = sum(word.freq for word in wordList)
    del counted

    # Finding the respective P values
    with open(os.path.join(dbFolder, 'totalWordCounts.p'), 'rb') as pickleDict:
        wordCounts = pickle.load(pickleDict)
    totalWords = wordCounts['Total Count']
    for word in wordList:
        word.computeP(wordCounts, length, totalWords)
    del wordCounts

    # Sorting now by P Value instead of alphabetical
    wordList = sorted(wordList, key=lambda item: item.p)

    # Finding corrected P Values; pCorrect receives the number of words and
    # the 1-based rank of this word in the p-sorted list.
    count = len(wordList)
    for rank, word in enumerate(wordList, start=1):
        word.pCorrect(count, rank)

    # Sort and cut off by the corrected P value when requested, otherwise
    # cut off by the original P value.
    if corrected:
        wordList = sorted(wordList, key=lambda item: item.pCor)
        wordList = [word for word in wordList if word.pCor <= minChance]
    else:
        wordList = [word for word in wordList if word.p <= minChance]

    return (wordList, list(webSites))

示例#12

0

显示文件

文件： GeneWordSearch.py 项目： mrG7/geneWordSearch

def geneWordSearch(genes,species,minChance=0.05,minWordFreq=3,corrected=False):
    """Find the words attached to a set of genes and score them.

    Args:
        genes: List of gene id strings; each is lower-cased before lookup.
            Ids that are not in the database are skipped.
        species: Name of the database folder under ``databases/``.
        minChance: Largest p value a word may have and still be returned.
        minWordFreq: Smallest ``freq`` a word must reach within the set to
            be scored at all.
        corrected: If true, the results are ordered by, and cut off using,
            the corrected p value (``pCor``) instead of the original ``p``.
            That gives fewer results.

    Returns:
        Tuple ``(words, links)``: the list of WordFreq objects that pass the
        cutoff, and the list of every link attached to the requested genes.

    Raises:
        ValueError: If the species has no ``geneNotes.p`` database file.
    """
    # Unpickle the database of words
    # NOTE(review): pickle.load executes code embedded in the file, so the
    # database files must come from a trusted source.
    dbFolder = 'databases/' + species
    if not pkg_resources.resource_exists(__name__, dbFolder + '/geneNotes.p'):
        raise ValueError('There is no database associated with this species, please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.')
    with open(pkg_resources.resource_filename(__name__, dbFolder + '/geneNotes.p'), 'rb') as dbfile:
        db = pickle.load(dbfile)

    # Build the word list up for all of the genes provided.
    words = []
    webSites = []
    # NOTE(review): badGenes and links are filled in but are not part of
    # the returned value.
    badGenes = []
    links = WordFreq('Web Links', 0)
    for item in genes:
        # Make the input all lowercase to match the database
        gene = item.lower()

        # Get the object from the DB, skip term if it is not there
        try:
            geneData = db[gene]
        except KeyError:
            badGenes.append(gene)
            continue

        # Adding words related to the gene in db to the overall list
        for word in geneData.words:
            words.append([word, geneData.gene])

        # Dealing with the websites
        for link in geneData.links:
            links.addGene(geneData.gene)
            webSites.append(link)

    # Sort to put words in alphabetical order for counting
    words.sort()

    # Counting the words. Appending at the tail is O(1); inserting at the
    # head of the list would cost O(n) per distinct word.
    counted = []
    for word, geneId in words:
        if counted and counted[-1].word == word:
            counted[-1].increment()
        else:
            counted.append(WordFreq(word, 1))
        counted[-1].addGene(geneId)
    del words

    # Descending alphabetical order, so that words with equal p values keep
    # that relative order through the stable sorts below.
    counted.reverse()

    # Getting rid of words that don't happen in enough genes to matter
    wordList = [word for word in counted if word.freq >= minWordFreq]
    length = sum(word.freq for word in wordList)
    del counted

    # Finding the respective P values
    with open(pkg_resources.resource_filename(__name__, dbFolder + '/totalWordCounts.p'), 'rb') as pickleDict:
        wordCounts = pickle.load(pickleDict)
    totalWords = wordCounts['Total Count']
    for word in wordList:
        word.computeP(wordCounts, length, totalWords)
    del wordCounts

    # Sorting now by P Value instead of alphabetical
    wordList = sorted(wordList, key=lambda item: item.p)

    # Finding corrected P Values; pCorrect receives the number of words and
    # the 1-based rank of this word in the p-sorted list.
    count = len(wordList)
    for rank, word in enumerate(wordList, start=1):
        word.pCorrect(count, rank)

    # Sort and cut off by the corrected P value when requested, otherwise
    # cut off by the original P value.
    if corrected:
        wordList = sorted(wordList, key=lambda item: item.pCor)
        wordList = [word for word in wordList if word.pCor <= minChance]
    else:
        wordList = [word for word in wordList if word.p <= minChance]

    return (wordList, list(webSites))

示例#13

0

显示文件

文件： GeneWordSearch.py 项目： schae234/geneWordSearch

def geneWordSearch(genes,species,minChance=0.05,corrected=False):
    """Find the words attached to a set of genes and score them.

    Args:
        genes: List of gene id strings; each is lower-cased before lookup.
        species: Name of the database folder under ``databases/``.
        minChance: Largest ``p`` a word may have and still be returned.
        corrected: If true, the results are ordered by corrected p value
            (``pCor``) instead of by ``p``. The cutoff still uses ``p``.

    Returns:
        Tuple ``(words, links)``: the list of WordFreq objects with
        ``p <= minChance``, and the list of every link attached to the
        requested genes.

    Raises:
        ValueError: If the species has no ``geneNotes.p`` database file.
        KeyError: If a requested gene is not in the database.
    """
    import re
    import pickle
    import pkg_resources
    from genewordsearch.Classes import WordFreq
    from genewordsearch.Classes import GeneNote

    # Unpickle the database of words
    # NOTE(review): pickle.load executes code embedded in the file, so the
    # database files must come from a trusted source.
    dbFolder = 'databases/' + species
    if not pkg_resources.resource_exists(__name__, dbFolder + '/geneNotes.p'):
        raise ValueError('There is no database associated with this species, please use either \'maize\' or \'ath\', or make your own using \'--buildDB\'.')
    with open(pkg_resources.resource_filename(__name__, dbFolder + '/geneNotes.p'), 'rb') as dbfile:
        db = pickle.load(dbfile)

    # Build the word list up for all of the genes provided.
    words = []
    webSites = []
    # NOTE(review): link counts are accumulated here but are not part of
    # the returned value.
    links = WordFreq('Web Links', 0)
    for item in genes:
        # Make the input all lowercase to match the database
        geneData = db[item.lower()]

        # Adding words related to the gene in db to the overall list
        for word in geneData.words:
            words.append([word, geneData.gene])

        # Dealing with the websites
        for link in geneData.links:
            links.addGene(geneData.gene)
            webSites.append(link)

    # Sort to put words in alphabetical order for counting
    words.sort()

    # Counting the words. Appending at the tail is O(1); inserting at the
    # head of the list would cost O(n) per distinct word.
    counted = []
    for word, gene in words:
        if counted and counted[-1].word == word:
            counted[-1].increment()
        else:
            counted.append(WordFreq(word, 1))
        counted[-1].addGene(gene)
    del words

    # Descending alphabetical order, so that words with equal p values keep
    # that relative order through the stable sorts below.
    counted.reverse()

    # Getting rid of words that don't happen in enough genes to matter
    wordList = [word for word in counted if word.freq >= 3]
    length = sum(word.freq for word in wordList)
    del counted

    # Finding the respective P values
    with open(pkg_resources.resource_filename(__name__, dbFolder + '/totalWordCounts.p'), 'rb') as pickleDict:
        wordCounts = pickle.load(pickleDict)
    totalWords = wordCounts['Total Count']
    for word in wordList:
        word.computeP(wordCounts, length, totalWords)
    del wordCounts

    # Sorting now by P Value instead of alphabetical
    wordList = sorted(wordList, key=lambda item: item.p)

    # Finding corrected P Values; pCorrect receives the number of words and
    # the 1-based rank of this word in the p-sorted list.
    count = len(wordList)
    for rank, word in enumerate(wordList, start=1):
        word.pCorrect(count, rank)

    # Sort by corrected P Value instead of original P value if desired
    if corrected:
        wordList = sorted(wordList, key=lambda item: item.pCor)

    # Filtering out results that are higher than the minimum chance threshold
    wordList = [word for word in wordList if word.p <= minChance]

    return (wordList, list(webSites))