# This script will use the brainbehavior.py module to parse a text corpus for a list of terms
# For example, wikipedia; this script was not used for the main (pubmed) analysis

from brainbehavior.wikipedia import get_page_content, get_headers, get_section_text
from brainbehavior.nlp import get_term_counts
import pickle

# Step 1: Read in behavioral term list (generated with 1.create_ontology)
output_folder = "/home/vanessa/Documents/Dropbox/Website/traits/data"
# Context manager closes the pickle file deterministically (the original
# `pickle.load(open(...))` leaked the handle)
with open("%s/behavior_list.pkl" % (output_folder), "rb") as terms_file:
    terms = pickle.load(terms_file)

# Step 2: Get data from an "expert source" - let's start with wikipedia!
test_disorder = "major depressive disorder"
text = get_page_content(test_disorder)

# Get the section text to parse!
headers = get_headers(text)
sections = headers[0:2]  # [u'== Symptoms and signs ==', u'=== Comorbidity ===']
section_text = get_section_text(text, sections)

# First let's try just term counts
counts = get_term_counts(terms=terms, text=section_text)
# Boom! Works! Now to do for a larger set...

# 5. Parse some corpus, and break into papers about disorders that we have in RDoC,
#    cognitive atlas, and behavioral data.
# 6. For each term in ontology, take simple counts to calculate the probability of the
#    term given the disorder. (or probability of term1 given term2 in title (meaning the
#    literature is "about" that term))
# 7. We may want to do something more intelligent like parsing individual sentences and
#    breaking into parts of speech, determining some kind of more detailed relationship
#    about terms (other than co-occurrence). Will determine this when I get there.
# 8. Finished ontology should be explorable in web interface, define how behavioral
#    traits are related (definition wise), and how they are related in the literature
#    (based on disorders). We can then extend to actual behavioral data.
# Count behavioral-term occurrences in each previously pickled content dict,
# then save a per-file counts DataFrame alongside the original data.
stems = do_stem(terms)
pickles = glob("%s/*_dict.pkl" % outfolder)
for result_file in pickles:
    # `with` closes the handle even if pickle.load raises
    with open(result_file, "rb") as infile:
        result = pickle.load(infile)
    content = result["content"]
    print("Parsing %s" % result_file)
    # We will save counts and total words
    dfcount = pandas.DataFrame(columns=stems)
    totalwords = []
    # We want to keep a count of those with no terms
    noterms = 0
    for t in range(0, len(content)):
        print("%s of %s" % (t, len(content)))
        # Replace non-ASCII characters with spaces before counting
        text = ''.join([i if ord(i) < 128 else ' ' for i in content[t]])
        counts = get_term_counts(terms, str(text))
        # Only save if we have at least one!
        if counts["count"].sum() > 0:
            totalwords.append(get_total_words(text))
            dfcount.loc[t, counts.index] = counts["count"]
        else:
            # BUG FIX: noterms was initialized but never incremented in the
            # original; the zip-based sibling script increments it here.
            noterms += 1
    result["dfcount"] = dfcount
    result["words"] = totalwords
    # Persist the no-term count for consistency with the sibling script's
    # "noterms_count" key (backward-compatible: adds a new key only)
    result["noterms_count"] = noterms
    # Save to output file
    with open(result_file.replace("dict", "dict_counts"), "wb") as outfile:
        pickle.dump(result, outfile)

### 3. COMBINE COUNTS BY FAMILY ##############################################################
# Prepare behavioral terms
pickles = glob("%s/*_dict_counts.pkl" % outfolder)
families = get_expanded_family_dict(unique=True)
# This is NOT a diagonal matrix, base terms are in rows, family members in columns
# Read in our terms
# Context manager prevents leaking the pickle file handle
with open(term_pickle, "rb") as terms_file:
    terms = pickle.load(terms_file)
stems = do_stem(terms)

# We will save counts and total words
dfcount = pandas.DataFrame(columns=stems)
totalwords = []
# We want to keep a count of those with no terms
noterms = 0

for z in zips:
    zname = "%s/%s" % (subfolder, os.path.basename(z))
    text = get_xml_tree(z)
    counts = get_term_counts(terms, text)
    # Only save if we have at least one!
    if counts["count"].sum() > 0:
        totalwords.append(get_total_words(text))
        dfcount.loc[zname, counts.index] = counts["count"]
    else:
        noterms += 1

# Save to output file
result = {
    "df": dfcount,
    "noterms_count": noterms,
    "files_in_folder": len(zips),
    "total_words": totalwords
}
# `with` guarantees the write is flushed and the handle closed (the original
# `pickle.dump(result, open(..., "wb"))` never closed the file)
with open("%s/%s_counts.pkl" % (outfolder, subfolder), "wb") as outfile:
    pickle.dump(result, outfile)
folder = "%s/%s" % (topfolder, subfolder)

# Get compressed files in folder.
zips = glob("%s/*.tar.gz" % folder)

# Read in our terms
# Context manager prevents leaking the pickle file handle
with open(term_pickle, "rb") as terms_file:
    terms = pickle.load(terms_file)
stems = do_stem(terms)

# We will save counts and total words
dfcount = pandas.DataFrame(columns=stems)
totalwords = []
# We want to keep a count of those with no terms
noterms = 0

for z in zips:
    zname = "%s/%s" % (subfolder, os.path.basename(z))
    text = get_xml_tree(z)
    counts = get_term_counts(terms, text)
    # Only save if we have at least one!
    if counts["count"].sum() > 0:
        totalwords.append(get_total_words(text))
        dfcount.loc[zname, counts.index] = counts["count"]
    else:
        noterms += 1

# Save to output file
result = {
    "df": dfcount,
    "noterms_count": noterms,
    "files_in_folder": len(zips),
    "total_words": totalwords
}
# `with` guarantees the write is flushed and the handle closed (the original
# `pickle.dump(result, open(..., "wb"))` never closed the file)
with open("%s/%s_counts.pkl" % (outfolder, subfolder), "wb") as outfile:
    pickle.dump(result, outfile)