# if on sherlock, need to load python2.7
# module load python/2.7.5
from brainbehavior.cognitiveatlas import get_expanded_family_dict, get_path_similarity_matrix
from glob import glob
import pandas
import pickle
import os

### 3. COMBINE COUNTS BY FAMILY ##############################################################

# Prepare behavioral terms
pickles = glob("output/*_dict_counts.pkl")
families = get_expanded_family_dict(unique=True)

# This is NOT a diagonal matrix, base terms are in rows, family members in columns
path_similarities = get_path_similarity_matrix()

for result_file in pickles:
    tmp = pickle.load(open(result_file, "rb"))
    print "Combining family counts for %s" % tmp["disorder"]
    result = tmp["dfcount"]

    # This will be a new matrix with only base terms as column names
    familydf = pandas.DataFrame(index=result.index)

    # Step 2: For each term stem (row), find family based on path similarity
    for stem, data in families.iteritems():
        family = path_similarities[stem][path_similarities[stem] != 0]
        # Create a data frame with just the family columns present in the counts
        column_names = [c for c in family.index if c in result.columns]
        family = family[column_names]
        # If there are no family members, keep the stem's raw count
        if family.shape[0] == 0:
            familydf[stem] = result[stem]
        # Otherwise weight each family member's count by its path similarity, and sum
        else:
            subset = result[column_names].copy()
            for col in subset.columns:
                subset[col] *= family[col]
            familydf[stem] = subset.sum(axis=1) + result[stem]

    # Save family data frame to file
    tmp["familydf"] = familydf
    pickle.dump(tmp, open(result_file.replace("dict_counts", "dict_counts_family"), "wb"))

    ### 4. CO-OCCURRENCE ##################################################################

    # Now calculate co-occurrence: result df will be terms by terms
    terms = familydf.columns.tolist()
    df = pandas.DataFrame(columns=terms, index=terms)
    print "Calculating co-occurrence for %s" % tmp["disorder"]
    for t in range(len(terms)):
        term1 = terms[t]
        subset = familydf.loc[familydf[term1] > 0]
        number_with_term1 = subset.shape[0]
        if number_with_term1 != 0:
            for term2 in terms:
                number_with_term2 = subset[term2].loc[subset[term2] > 0].shape[0]
                pt2_given_t1 = float(number_with_term2) / number_with_term1
                # [row](probability), [col](given)
                df.loc[term2, term1] = pt2_given_t1
        else:
            df.loc[:, term1] = 0
    df.to_csv("output/%s_co-occurrence.tsv" % tmp["disorder"], sep="\t")

print "Finished."
os.system("Rscript prep_cooccurr_data.R")
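# Illustrative sketch of the co-occurrence math above, on made-up counts (not
# part of the original pipeline). Each cell df.loc[term2, term1] holds
# P(term2 | term1): of the documents with a nonzero count for term1 (the
# column, the "given"), the fraction that also have a nonzero count for term2
# (the row). Note the resulting matrix is therefore asymmetric.
import pandas

toy = pandas.DataFrame({"memory": [2, 0, 1, 3],
                        "attention": [1, 0, 0, 5]})
with_term1 = toy.loc[toy["memory"] > 0]                        # 3 documents mention memory
n_both = with_term1.loc[with_term1["attention"] > 0].shape[0]  # 2 of those mention attention
print "P(attention | memory) = %s" % (float(n_both) / with_term1.shape[0])  # 2/3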
# if on sherlock, need to load python2.7
# module load python/2.7.5
from brainbehavior.cognitiveatlas import get_expanded_family_dict, get_path_similarity_matrix
#from brainbehavior.utils import save_json
from brainbehavior.nlp import do_stem
#from textblob.wordnet import Synset
from glob import glob
import pandas
import pickle

# We want to first generate a matrix of relationships between all pairwise terms
families = get_expanded_family_dict()

# This is NOT a diagonal matrix, base terms are in rows, family members in columns
path_similarities = get_path_similarity_matrix(families)

# Load the result
result = pandas.read_pickle("/share/PI/russpold/work/PUBMED/pmc_behavior_counts.pkl")

# Step 1: Terms that appear in most papers need to be filtered out
percents = []
for c in result.columns:
    col = result[c]
    percent_occur = col[col != 0].shape[0] / float(col.shape[0])
    percents.append(percent_occur)

percents = pandas.DataFrame(percents)
percents.index = result.columns
percents.to_csv("/share/PI/russpold/work/PUBMED/pmc_percent_occur.tsv", sep="\t")

# Nix terms that appear in more than 10 percent of papers
nix = percents.loc[percents[0] > 0.1]
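# Illustrative sketch of the filtering step above, with toy counts rather than
# the real PMC matrix: percent_occur is the fraction of papers with a nonzero
# count for a term, and any term strictly above the 0.1 threshold gets nixed
# as too common to be informative.
import pandas

toy = pandas.DataFrame({"task": [1, 3, 0, 2, 1, 1, 4, 1, 2, 1],
                        "prosopagnosia": [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]})
percents = []
for c in toy.columns:
    col = toy[c]
    percents.append(col[col != 0].shape[0] / float(col.shape[0]))
percents = pandas.DataFrame(percents, index=toy.columns)
print percents.loc[percents[0] > 0.1]  # only "task" (9/10 = 0.9) crosses 0.1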
            # Only save if we have at least one!
            if counts["count"].sum() > 0:
                totalwords.append(get_total_words(text))
                dfcount.loc[t, counts.index] = counts["count"]
        result["dfcount"] = dfcount
        result["words"] = totalwords
        # Save to output file
        pickle.dump(result, open(result_file.replace("dict", "dict_counts"), "wb"))

### 3. COMBINE COUNTS BY FAMILY ##############################################################

# Prepare behavioral terms
pickles = glob("%s/*_dict_counts.pkl" % outfolder)
families = get_expanded_family_dict(unique=True)

# This is NOT a diagonal matrix, base terms are in rows, family members in columns
path_similarities = get_path_similarity_matrix()

for result_file in pickles:
    tmp = pickle.load(open(result_file, "rb"))
    print "Parsing disorder %s" % tmp["disorder"]
    result = tmp["dfcount"]

    # This will be a new matrix with only base terms as column names
    familydf = pandas.DataFrame(index=result.index)

    # Step 2: For each term stem (row), find family based on path similarity
    for stem, data in families.iteritems():
        family = path_similarities[stem][path_similarities[stem] != 0]
        # Create a data frame with just the family columns present in the counts
        column_names = [c for c in family.index if c in result.columns]
        family = family[column_names]
        # If there are no family members, keep the stem's raw count
        if family.shape[0] == 0:
            familydf[stem] = result[stem]
        # Otherwise weight each family member's count by its path similarity, and sum
        else:
            subset = result[column_names].copy()
            for col in subset.columns:
                subset[col] *= family[col]
            familydf[stem] = subset.sum(axis=1) + result[stem]
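# Illustrative sketch of the family weighting above, with invented similarity
# values: the combined count for a base term is its own raw count plus each
# family member's count scaled by that member's path similarity to the stem,
# i.e. subset.sum(axis=1) + result[stem].
import pandas

result = pandas.DataFrame({"memory": [2, 0],
                           "recall": [1, 4],
                           "recollection": [3, 0]})
family = pandas.Series({"recall": 0.5, "recollection": 0.25})  # similarities to "memory"
subset = result[["recall", "recollection"]].copy()
for col in subset.columns:
    subset[col] *= family[col]
print subset.sum(axis=1) + result["memory"]
# doc 0: 2 + 1*0.5 + 3*0.25 = 3.25 ; doc 1: 0 + 4*0.5 + 0*0.25 = 2.0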