# This script will use the brainbehavior.py module to parse a text corpus for a list of terms
# For example, wikipedia; this script was not used for the main (pubmed) analysis

from brainbehavior.wikipedia import get_page_content, get_headers, get_section_text
from brainbehavior.nlp import get_term_counts
import pickle

# Step 1: Read in behavioral term list (generated with 1.create_ontology)
output_folder = "/home/vanessa/Documents/Dropbox/Website/traits/data"
# Context manager closes the pickle file deterministically (the original
# `pickle.load(open(...))` leaked the handle)
with open("%s/behavior_list.pkl" % (output_folder), "rb") as terms_file:
    terms = pickle.load(terms_file)

# Step 2: Get data from an "expert source" - let's start with wikipedia!
test_disorder = "major depressive disorder"
text = get_page_content(test_disorder)

# Get the section text to parse!
headers = get_headers(text)
sections = headers[0:2]  # [u'== Symptoms and signs ==', u'=== Comorbidity ===']
section_text = get_section_text(text, sections)

# First let's try just term counts
counts = get_term_counts(terms=terms, text=section_text)
# Boom! Works! Now to do for a larger set...

# 5. Parse some corpus, and break into papers about disorders that we have in RDoC,
#    cognitive atlas, and behavioral data.
# 6. For each term in ontology, take simple counts to calculate the probability of the
#    term given the disorder. (or probability of term1 given term2 in title (meaning the
#    literature is "about" that term))
# 7. We may want to do something more intelligent like parsing individual sentences and
#    breaking into parts of speech, determining some kind of more detailed relationship
#    about terms (other than co-occurrence). Will determine this when I get there.
# 8. Finished ontology should be explorable in web interface, define how behavioral
#    traits are related (definition wise), and how they are related in the literature
#    (based on disorders). We can then extend to actual behavioral data.
# Count behavioral-term occurrences in each previously pickled content dict,
# then save a per-file counts DataFrame alongside the original data.
stems = do_stem(terms)
pickles = glob("%s/*_dict.pkl" % outfolder)
for result_file in pickles:
    # `with` closes the handle even if pickle.load raises
    with open(result_file, "rb") as infile:
        result = pickle.load(infile)
    content = result["content"]
    print("Parsing %s" % result_file)
    # We will save counts and total words
    dfcount = pandas.DataFrame(columns=stems)
    totalwords = []
    # We want to keep a count of those with no terms
    noterms = 0
    for t in range(0, len(content)):
        print("%s of %s" % (t, len(content)))
        # Replace non-ASCII characters with spaces before counting
        text = ''.join([i if ord(i) < 128 else ' ' for i in content[t]])
        counts = get_term_counts(terms, str(text))
        # Only save if we have at least one!
        if counts["count"].sum() > 0:
            totalwords.append(get_total_words(text))
            dfcount.loc[t, counts.index] = counts["count"]
        else:
            # BUG FIX: noterms was initialized but never incremented in the
            # original; the zip-based sibling script increments it here.
            noterms += 1
    result["dfcount"] = dfcount
    result["words"] = totalwords
    # Persist the no-term count for consistency with the sibling script's
    # "noterms_count" key (backward-compatible: adds a new key only)
    result["noterms_count"] = noterms
    # Save to output file
    with open(result_file.replace("dict", "dict_counts"), "wb") as outfile:
        pickle.dump(result, outfile)

### 3. COMBINE COUNTS BY FAMILY ##############################################################
# Prepare behavioral terms
pickles = glob("%s/*_dict_counts.pkl" % outfolder)
families = get_expanded_family_dict(unique=True)
# This is NOT a diagonal matrix, base terms are in rows, family members in columns
# Read in our terms
# Context manager prevents leaking the pickle file handle
with open(term_pickle, "rb") as terms_file:
    terms = pickle.load(terms_file)
stems = do_stem(terms)

# We will save counts and total words
dfcount = pandas.DataFrame(columns=stems)
totalwords = []
# We want to keep a count of those with no terms
noterms = 0

for z in zips:
    zname = "%s/%s" % (subfolder, os.path.basename(z))
    text = get_xml_tree(z)
    counts = get_term_counts(terms, text)
    # Only save if we have at least one!
    if counts["count"].sum() > 0:
        totalwords.append(get_total_words(text))
        dfcount.loc[zname, counts.index] = counts["count"]
    else:
        noterms += 1

# Save to output file
result = {
    "df": dfcount,
    "noterms_count": noterms,
    "files_in_folder": len(zips),
    "total_words": totalwords
}
# `with` guarantees the write is flushed and the handle closed (the original
# `pickle.dump(result, open(..., "wb"))` never closed the file)
with open("%s/%s_counts.pkl" % (outfolder, subfolder), "wb") as outfile:
    pickle.dump(result, outfile)
folder = "%s/%s" % (topfolder, subfolder)

# Get compressed files in folder.
zips = glob("%s/*.tar.gz" % folder)

# Read in our terms
# Context manager prevents leaking the pickle file handle
with open(term_pickle, "rb") as terms_file:
    terms = pickle.load(terms_file)
stems = do_stem(terms)

# We will save counts and total words
dfcount = pandas.DataFrame(columns=stems)
totalwords = []
# We want to keep a count of those with no terms
noterms = 0

for z in zips:
    zname = "%s/%s" % (subfolder, os.path.basename(z))
    text = get_xml_tree(z)
    counts = get_term_counts(terms, text)
    # Only save if we have at least one!
    if counts["count"].sum() > 0:
        totalwords.append(get_total_words(text))
        dfcount.loc[zname, counts.index] = counts["count"]
    else:
        noterms += 1

# Save to output file
result = {
    "df": dfcount,
    "noterms_count": noterms,
    "files_in_folder": len(zips),
    "total_words": totalwords
}
# `with` guarantees the write is flushed and the handle closed (the original
# `pickle.dump(result, open(..., "wb"))` never closed the file)
with open("%s/%s_counts.pkl" % (outfolder, subfolder), "wb") as outfile:
    pickle.dump(result, outfile)