Exemplo n.º 1
0
stems = do_stem(terms)

# Walk every per-article dictionary pickle produced by the previous step,
# count behavioral-term occurrences in its text, and write a *_dict_counts
# companion pickle next to it.
pickles = glob("%s/*_dict.pkl" %outfolder)
for result_file in pickles:
    # Context manager closes the handle (the original leaked it).
    with open(result_file, "rb") as fp:
        result = pickle.load(fp)
    content = result["content"]
    print("Parsing %s" % result_file)
    # We will save counts and total words
    dfcount = pandas.DataFrame(columns=stems)
    totalwords = []
    # We want to keep a count of those with no terms
    noterms = 0
    for t, raw in enumerate(content):
        print("%s of %s" % (t, len(content)))
        # Replace any non-ASCII character with a space before counting.
        text = ''.join([i if ord(i) < 128 else ' ' for i in raw])
        counts = get_term_counts(terms, str(text))
        # Only save if we have at least one!
        if counts["count"].sum() > 0:
            totalwords.append(get_total_words(text))
            dfcount.loc[t, counts.index] = counts["count"]
        else:
            # BUGFIX: noterms was declared but never incremented; the sibling
            # zip-parsing section below does count these, so match it here.
            noterms += 1
    result["dfcount"] = dfcount
    result["words"] = totalwords
    result["noterms"] = noterms  # new key; existing readers are unaffected
    # Save to output file, closing the handle when done.
    with open(result_file.replace("dict", "dict_counts"), "wb") as fp:
        pickle.dump(result, fp)

### 3. COMBINE COUNTS BY FAMILY ##############################################################
# Gather the per-result count pickles written by the step above.
pickles = glob("{0}/*_dict_counts.pkl".format(outfolder))
# Look up the expanded family for each base behavioral term (unique members only).
families = get_expanded_family_dict(unique=True)

# This is NOT a diagonal matrix, base terms are in rows, family members in columns
# This script will use the brainbehavior.py module to parse a text corpus for a list of terms
# For example, wikipedia, this script was not used for main (pubmed) analysis
from brainbehavior.wikipedia import get_page_content, get_headers, get_section_text
from brainbehavior.nlp import get_term_counts
import pickle

# Step 1: Read in behavioral term list (generated with 1.create_ontology)
output_folder = "/home/vanessa/Documents/Dropbox/Website/traits/data"
# Context manager closes the pickle handle (the original left it open).
with open("%s/behavior_list.pkl" %(output_folder), "rb") as fp:
    terms = pickle.load(fp)

# Step 2: Get data from an "expert source" - let's start with wikipedia!
test_disorder = "major depressive disorder"
text = get_page_content(test_disorder)

# Get the section text to parse!
headers = get_headers(text)
sections = headers[0:2] # [u'== Symptoms and signs ==', u'=== Comorbidity ===']
section_text = get_section_text(text,sections)

# First let's try just term counts
counts = get_term_counts(terms=terms,text=section_text)

#Boom! Works! Now to do for a larger set...

# 5. Parse some corpus, and break into papers about disorders that we have in RDoC, cognitive atlas, and behavioral data.


# 6. For each term in the ontology, take simple counts to calculate the probability of the term given the disorder (or the probability of term1 given term2 in the title, meaning the literature is "about" that term).
# 7. We may want to do something more intelligent, like parsing individual sentences and breaking them into parts of speech, determining some kind of more detailed relationship between terms (other than co-occurrence). Will determine this when I get there.
# 8. Finished ontology should be explorable in web interface, define how behavioral traits are related (definition wise), and how they are related in the literature (based on disorders). We can then extend to actual behavioral data.
Exemplo n.º 3
0
# Count behavioral-term occurrences across every compressed XML archive in
# one subfolder of the corpus, then pickle a summary for that subfolder.
folder = "%s/%s" %(topfolder,subfolder)

# Get compressed files in folder.
zips = glob("%s/*.tar.gz" %folder)

# Read in our terms; close the pickle handle when done (the original leaked it).
with open(term_pickle, "rb") as fp:
    terms = pickle.load(fp)
stems = do_stem(terms)

# We will save counts and total words
dfcount = pandas.DataFrame(columns=stems)
totalwords = []

# We want to keep a count of those with no terms
noterms = 0

for z in zips:
    # Row label combines the subfolder with the archive's filename.
    zname = "%s/%s" %(subfolder,os.path.basename(z))
    text = get_xml_tree(z)
    counts = get_term_counts(terms,text)
    # Only save if we have at least one!
    if counts["count"].sum() > 0:
        totalwords.append(get_total_words(text))
        dfcount.loc[zname,counts.index] = counts["count"]
    else:
        noterms += 1

# Save everything to one output pickle for this subfolder, closing the handle.
result = {"df": dfcount,
          "noterms_count": noterms,
          "files_in_folder": len(zips),
          "total_words": totalwords}
with open("%s/%s_counts.pkl" %(outfolder,subfolder), "wb") as fp:
    pickle.dump(result, fp)