def analyze_num_lang_edition(self, classname, people):
    """Analyze the distribution of language-edition counts by gender.

    Splits ``people`` into men and women, logs descriptive statistics,
    runs a Mann-Whitney U test, and renders percentage, boxplot, ratio
    (with a permutation baseline and a lowess fit) and CCDF plots under
    ``img/<classname>``.

    :param classname: label used for the image folder and the log section
    :param people: DataFrame with (at least) ``gender`` ("male"/"female")
        and ``edition_count`` columns; a ``random_gender`` column is added
        as a side effect of the permutation baseline
    :return: dict with median/mean/sem of ``edition_count`` per gender
    """
    path = "img/" + classname
    men = people[people.gender == "male"]
    women = people[people.gender == "female"]
    # create folder if not exist
    if not os.path.exists(path):
        os.makedirs(path)
    self.logfile.write("\n\n\n\ " + classname + "\n")
    edition_counts = np.append(women.edition_count.values, men.edition_count.values)
    labels = ["female (" + str(len(women.index)) + ")", "male (" + str(len(men.index)) + ")"]
    max_num_editions = np.max(edition_counts)
    men_percentage = 0
    if men.shape[0] > 0:
        # fraction of men having exactly x language editions, x = 1..max-1
        men_percentage = [
            men[men.edition_count == x].shape[0] / float(men.shape[0]) for x in range(1, max_num_editions)
        ]
        # "local" = fewer than 2 editions
        self.logfile.write("\n % of local men ")
        self.logfile.write(str(men[men.edition_count < 2].shape[0] / float(men.shape[0])))
        self.logfile.write(
            "\n percentage men %s" % str(len(men.index) / float((len(women.index) + len(men.index))))
        )
    women_percentage = 0
    if women.shape[0] > 0:
        women_percentage = [
            women[women.edition_count == x].shape[0] / float(women.shape[0]) for x in range(1, max_num_editions)
        ]
        self.logfile.write("\n\n % local women ")
        self.logfile.write(str(women[women.edition_count < 2].shape[0] / float(women.shape[0])))
        self.logfile.write(
            "\n percentage women %s" % str(len(women.index) / float((len(women.index) + len(men.index))))
        )
    if women.shape[0] > 0 and men.shape[0] > 0:
        # Mann-Whitney U: probability that a score randomly drawn from one
        # population is greater than a score drawn from the other.
        U, p = stats.mstats.mannwhitneyu(women.edition_count, men.edition_count)
        ut.write_mannwithneyu(U, p, women.edition_count, men.edition_count, self.logfile)
        ut.plot_percentage(
            women_percentage,
            men_percentage,
            ["pink", "blue"],
            range(1, max_num_editions),
            path + "/numedition_gender_percentage" + self.pre + "-" + self.post + ".png",
        )
        self.genderBoxplots(women, men, labels, path)
        self.logfile.write("\n\n num women %s" % len(women.index))
        self.logfile.write("\n num men %s" % len(men.index))
        data = [women.edition_count.values, men.edition_count.values]
        # Compute the qth percentile of women and men
        q75 = [np.percentile(x, q=75) for x in data]
        self.logfile.write("\n third quartil (75th percentile): " + str(q75))
        q95 = [np.percentile(x, q=95) for x in data]
        self.logfile.write("\n 95th percentile: " + str(q95))
        q99 = [np.percentile(x, q=99) for x in data]
        self.logfile.write("\n 99th percentile: " + str(q99))
        q99_women = q99[0]
        q99_men = q99[1]
        self.logfile.write("\n threshold women 99th percentile: %s" % q99_women)
        self.logfile.write("\n threshold men 99th percentile: %s" % q99_men)
        men_percentage = 0
        # cut both distributions at the smaller of the two 99th percentiles
        th = np.min(q99)
        if men.shape[0] > 0:
            men_percentage = [men[men.edition_count == x].shape[0] / float(men.shape[0]) for x in range(1, int(th))]
        women_percentage = 0
        if women.shape[0] > 0:
            women_percentage = [
                women[women.edition_count == x].shape[0] / float(women.shape[0]) for x in range(1, int(th))
            ]
        ut.plot_percentage(
            women_percentage,
            men_percentage,
            ["pink", "blue"],
            range(1, int(th)),
            path + "/numedition_gender_percentage" + self.pre + "-" + self.post + "_99.png",
        )
        # RANDOM BASELINE FOR RATIO: permute the gender labels 999 times and
        # recompute the female/male frequency ratios for every permutation.
        fake_ratios_norm = list()
        fake_ratios = list()
        for i in range(1, 1000):
            people["random_gender"] = pd.Series(np.random.permutation(people.gender.values), index=people.index)
            fake_men = people[people.random_gender == "male"]
            fake_women = people[people.random_gender == "female"]
            item_frequency_fake_female = itemfreq(np.array(fake_women["edition_count"].values.tolist()))
            item_frequency_fake_male = itemfreq(np.array(fake_men["edition_count"].values.tolist()))
            fake_ratios_norm.append(self.get_ratio(item_frequency_fake_female, item_frequency_fake_male, True))
            # BUGFIX: the raw baseline must use the non-normalized ratio; it
            # previously passed True and just duplicated the normalized one,
            # so mean_fake_ratio was plotted against a mismatched scale.
            fake_ratios.append(self.get_ratio(item_frequency_fake_female, item_frequency_fake_male, False))
        item_frequency_female = itemfreq(np.array(women["edition_count"].values.tolist()))
        item_frequency_male = itemfreq(np.array(men["edition_count"].values.tolist()))
        ratio_norm = self.get_ratio(item_frequency_female, item_frequency_male, True)
        # average each baseline ratio per edition-count key over all permutations
        mean_fake_ratio_norm = {}
        mean_fake_ratio = {}
        for key in ratio_norm.keys():
            vals = []
            vals_norm = []
            for dic1 in fake_ratios_norm:
                if key in dic1:
                    vals_norm.append(dic1.get(key))
            mean_fake_ratio_norm[key] = np.mean(vals_norm)
            for dic2 in fake_ratios:
                if key in dic2:
                    vals.append(dic2.get(key))
            mean_fake_ratio[key] = np.mean(vals)
        # NOTE: if we plot the normalized ratio we should take the log, since
        # the upper bound is 0 but the ratio can become extremely small.
        ratio = self.get_ratio(item_frequency_female, item_frequency_male, False)
        lowess = sm.nonparametric.lowess(ratio.values(), ratio.keys(), frac=0.1)
        ut.plotratio(
            ratio,
            lowess,
            mean_fake_ratio,
            ["b^", "g", "r--"],
            ["empirical gender", "lowess fit", "random gender"],
            self.pre + "-" + self.post,
            path + "/numedition_gender_ratio" + self.pre + "-" + self.post + ".png",
            "Num Editions",
            "Male/Female",
            False,
            False,
        )
        ut.plot_cdf(
            list([item_frequency_female, item_frequency_male]),
            labels,
            ["pink", "blue"],
            path + "/numedition_gender_ccdf" + self.pre + "-" + self.post + ".png",
            "Num Editions",
            True,
            False,
            True,
        )
    self.logfile.write("\n\n men median(men mean), women median (women mean)")
    self.logfile.write(
        "\n "
        + str(np.median(men.edition_count.values))
        + "("
        + str(np.mean(men.edition_count.values))
        + "), "
        + str(np.median(women.edition_count.values))
        + "("
        + str(np.mean(women.edition_count.values))
        + ")"
    )
    return {
        "class": classname,
        "median-men": np.median(men.edition_count.values),
        "mean-men": np.mean(men.edition_count.values),
        "sem-men": stats.sem(men.edition_count.values),
        "sem-women": stats.sem(women.edition_count.values),
        "median-women": np.median(women.edition_count.values),
        "mean-women": np.mean(women.edition_count.values),
    }
def analyze_num_lang_edition(self, classname, people):
    """Analyze the distribution of language-edition counts by gender.

    NOTE(review): this is a second definition of the same method and
    shadows the earlier copy; consider deleting one of the two.

    Splits ``people`` into men and women, logs descriptive statistics,
    runs a Mann-Whitney U test, and renders percentage, boxplot, ratio
    (with a permutation baseline and a lowess fit) and CCDF plots under
    ``img/<classname>``.

    :param classname: label used for the image folder and the log section
    :param people: DataFrame with ``gender`` ("male"/"female") and
        ``edition_count`` columns; gains a ``random_gender`` column
    :return: dict with median/mean/sem of ``edition_count`` per gender
    """
    path = "img/" + classname
    men = people[people.gender == "male"]
    women = people[people.gender == "female"]
    #create folder if not exist
    if not os.path.exists(path):
        os.makedirs(path)
    self.logfile.write("\n\n\n\ " + classname + "\n")
    edition_counts = np.append(women.edition_count.values,
                               men.edition_count.values)
    labels = [
        'female (' + str(len(women.index)) + ')',
        'male (' + str(len(men.index)) + ')'
    ]
    max_num_editions = np.max(edition_counts)
    men_percentage = 0
    if (men.shape[0] > 0):
        # fraction of men having exactly x language editions, x = 1..max-1
        men_percentage = [
            men[men.edition_count == x].shape[0] / float(men.shape[0])
            for x in range(1, max_num_editions)
        ]
        # "local" = fewer than 2 editions
        self.logfile.write("\n % of local men ")
        self.logfile.write(
            str(men[men.edition_count < 2].shape[0] / float(men.shape[0])))
        self.logfile.write("\n percentage men %s" % str(
            len(men.index) / float((len(women.index) + len(men.index)))))
    women_percentage = 0
    if (women.shape[0] > 0):
        women_percentage = [
            women[women.edition_count == x].shape[0] / float(women.shape[0])
            for x in range(1, max_num_editions)
        ]
        self.logfile.write("\n\n % local women ")
        self.logfile.write(
            str(women[women.edition_count < 2].shape[0] / float(women.shape[0])))
        self.logfile.write("\n percentage women %s" % str(
            len(women.index) / float((len(women.index) + len(men.index)))))
    if (women.shape[0] > 0 and men.shape[0] > 0):
        # Mann-Whitney U: probability that a score randomly drawn from one
        # population is greater than a score drawn from the other.
        U, p = stats.mstats.mannwhitneyu(women.edition_count,
                                         men.edition_count)
        ut.write_mannwithneyu(U, p, women.edition_count, men.edition_count,
                              self.logfile)
        ut.plot_percentage(
            women_percentage, men_percentage, ['pink', 'blue'],
            range(1, max_num_editions), path + '/numedition_gender_percentage'
            + self.pre + "-" + self.post + '.png')
        self.genderBoxplots(women, men, labels, path)
        self.logfile.write("\n\n num women %s" % len(women.index))
        self.logfile.write("\n num men %s" % len(men.index))
        data = [women.edition_count.values, men.edition_count.values]
        # Compute the qth percentile of women and men
        q75 = [np.percentile(x, q=75) for x in data]
        self.logfile.write("\n third quartil (75th percentile): " + str(q75))
        q95 = [np.percentile(x, q=95) for x in data]
        self.logfile.write("\n 95th percentile: " + str(q95))
        q99 = [np.percentile(x, q=99) for x in data]
        self.logfile.write("\n 99th percentile: " + str(q99))
        q99_women = q99[0]
        q99_men = q99[1]
        self.logfile.write("\n threshold women 99th percentile: %s" % q99_women)
        self.logfile.write("\n threshold men 99th percentile: %s" % q99_men)
        men_percentage = 0
        # cut both distributions at the smaller of the two 99th percentiles
        th = np.min(q99)
        if (men.shape[0] > 0):
            men_percentage = [
                men[men.edition_count == x].shape[0] / float(men.shape[0])
                for x in range(1, int(th))
            ]
        women_percentage = 0
        if (women.shape[0] > 0):
            women_percentage = [
                women[women.edition_count == x].shape[0] /
                float(women.shape[0]) for x in range(1, int(th))
            ]
        ut.plot_percentage(
            women_percentage, men_percentage, ['pink', 'blue'],
            range(1, int(th)), path + '/numedition_gender_percentage' +
            self.pre + "-" + self.post + '_99.png')
        # RANDOM BASELINE FOR RATIO: permute the gender labels 999 times and
        # recompute the female/male frequency ratios for every permutation.
        fake_ratios_norm = list()
        fake_ratios = list()
        for i in range(1, 1000):
            people["random_gender"] = pd.Series(
                np.random.permutation(people.gender.values),
                index=people.index)
            fake_men = people[people.random_gender == "male"]
            fake_women = people[people.random_gender == "female"]
            item_frequency_fake_female = itemfreq(
                np.array(fake_women['edition_count'].values.tolist()))
            item_frequency_fake_male = itemfreq(
                np.array(fake_men['edition_count'].values.tolist()))
            fake_ratios_norm.append(
                self.get_ratio(item_frequency_fake_female,
                               item_frequency_fake_male, True))
            # BUGFIX: the raw baseline must use the non-normalized ratio; it
            # previously passed True and just duplicated the normalized one,
            # so mean_fake_ratio was plotted against a mismatched scale.
            fake_ratios.append(
                self.get_ratio(item_frequency_fake_female,
                               item_frequency_fake_male, False))
        item_frequency_female = itemfreq(
            np.array(women['edition_count'].values.tolist()))
        item_frequency_male = itemfreq(
            np.array(men['edition_count'].values.tolist()))
        ratio_norm = self.get_ratio(item_frequency_female,
                                    item_frequency_male, True)
        # average each baseline ratio per edition-count key over permutations
        mean_fake_ratio_norm = {}
        mean_fake_ratio = {}
        for key in ratio_norm.keys():
            vals = []
            vals_norm = []
            for dic1 in fake_ratios_norm:
                if key in dic1:
                    vals_norm.append(dic1.get(key))
            mean_fake_ratio_norm[key] = np.mean(vals_norm)
            for dic2 in fake_ratios:
                if key in dic2:
                    vals.append(dic2.get(key))
            mean_fake_ratio[key] = np.mean(vals)
        # NOTE: if we plot the normalized ratio we should take the log, since
        # the upper bound is 0 but the ratio can become extremely small.
        ratio = self.get_ratio(item_frequency_female, item_frequency_male,
                               False)
        lowess = sm.nonparametric.lowess(ratio.values(), ratio.keys(),
                                         frac=0.1)
        ut.plotratio(
            ratio, lowess, mean_fake_ratio, ['b^', 'g', 'r--'],
            ['empirical gender', 'lowess fit', 'random gender'],
            self.pre + "-" + self.post, path + '/numedition_gender_ratio' +
            self.pre + "-" + self.post + '.png', 'Num Editions',
            'Male/Female', False, False)
        ut.plot_cdf(
            list([item_frequency_female, item_frequency_male]), labels,
            ['pink', 'blue'], path + '/numedition_gender_ccdf' + self.pre +
            "-" + self.post + '.png', 'Num Editions', True, False, True)
    self.logfile.write(
        "\n\n men median(men mean), women median (women mean)")
    self.logfile.write("\n " + str(np.median(men.edition_count.values)) +
                       '(' + str(np.mean(men.edition_count.values)) + '), ' +
                       str(np.median(women.edition_count.values)) + '(' +
                       str(np.mean(women.edition_count.values)) + ')')
    return {
        "class": classname,
        "median-men": np.median(men.edition_count.values),
        "mean-men": np.mean(men.edition_count.values),
        "sem-men": stats.sem(men.edition_count.values),
        "sem-women": stats.sem(women.edition_count.values),
        "median-women": np.median(women.edition_count.values),
        "mean-women": np.mean(women.edition_count.values)
    }
def run(self): regionalEntropy = {} regionCount = {} timeEntropy = {} sumInterest = {} posInterest = {} ############################################################################## # PARSE GOOGLE TREND RESULT FILES ############################################################################## quota_error = 0 for file in self.onlyfiles: startFromLine = -1 startFromLineTime = -1 pos = file.find(".csv") filename = file[0:pos] linesCounter = 1 end = False endTime = False with open(self.datapath+"/"+file) as f: content = f.readlines() regions = {} timeseries = {} for line in content: if line.startswith("<div id="): quota_error += 1 print "quota error for %s"%filename break; if line.startswith("Region,"): startFromLine = linesCounter if line.startswith("Month,"): startFromLineTime = linesCounter if ((startFromLine > 0) and (linesCounter > startFromLine) and (not end)): if line == "\n": end = True else: items = line.split(",") regions[items[0]] = items[1] if ((startFromLineTime > 0) and (linesCounter > startFromLineTime) and (not endTime)): print line if line == "\n": endTime = True else: items = line.split(",") if items[1] == ' \n': # sometimes gtrends returns empty field rather than 0 timeseries[items[0]] = "0" else: timeseries[items[0]] = items[1] linesCounter += 1 timeFrequs = map(int, timeseries.values()) regionFrequs = map(int,(regions.values())) if linesCounter > 1: sumInterest[filename] = np.sum(timeFrequs) posInterest[filename] = np.count_nonzero(timeFrequs) if posInterest[filename] > 0: timeEntropy[filename] = stats.entropy(timeFrequs) else: timeEntropy[filename] = np.nan regionCount[filename] = len(regionFrequs) if(np.sum(regionFrequs) > 0): regionalEntropy[filename] = stats.entropy(regionFrequs) else: regionalEntropy[filename] = np.nan # store results into a dataframe regionalEntropyDF = pd.DataFrame.from_dict(regionalEntropy.items()) regionalEntropyDF.columns=["filename", "entropy"] print regionalEntropyDF.head() regionCountDF = 
pd.DataFrame.from_dict(regionCount.items()) regionCountDF.columns=["filename", "numRegions"] interestDF = pd.DataFrame.from_dict(sumInterest.items()) interestDF.columns=["filename", "timeInterest"] timeEntropyDF = pd.DataFrame.from_dict(timeEntropy.items()) timeEntropyDF.columns=["filename", "timeEntropy"] posInterestDF = pd.DataFrame.from_dict(posInterest.items()) posInterestDF.columns=["filename", "timePosInterest"] #print "regionalEntropyDF" #print regionalEntropyDF.head(n=1) #print regionalEntropyDF.shape #print "regionCountDF" #print regionCountDF.head(n=1) #print regionCountDF.shape #print "self.people" #print self.people.head(n=1) #print self.people.shape # add the computed statistics to the people file self.people = self.people.merge(regionalEntropyDF, right_on="filename", left_on="filename", how="inner") self.people = self.people.merge(regionCountDF, right_on="filename", left_on="filename", how="inner") self.people = self.people.merge(timeEntropyDF, right_on="filename", left_on="filename", how="inner") self.people = self.people.merge(interestDF, right_on="filename", left_on="filename", how="inner") self.people = self.people.merge(posInterestDF, right_on="filename", left_on="filename", how="inner") print "AFTER MERGING" print self.people.head(n=1) print self.people.shape ############################################################################## # PLOTS NUM REGIONS ############################################################################## men = self.people[self.people.gender =="male"] women = self.people[self.people.gender =="female"] labels = ['female ('+str(len(women.index))+')', 'male ('+str(len(men.index))+')'] data = [women.numRegions.values, men.numRegions.values] self.logfile.write("\n Mann Withney U Num regions:") U, p = stats.mstats.mannwhitneyu(women.numRegions.values, men.numRegions.values) ut.write_mannwithneyu(U, p, women.numRegions.values, men.numRegions.values, self.logfile) self.make_boxplot(data, labels, 
self.imgpath+"gtrend_num_regions_box.png", "num regions") self.plot_ccdf(np.array(women.numRegions.values.tolist()), np.array(men.numRegions.values.tolist()), labels, self.imgpath+"gtrend_num_regions_ccdf.png", "Num Regions", 1, 0) ut.plot_facet_dist(self.people, 'gender', 'numRegions', self.imgpath+"gtrend_num_regions.png") ut.rank_size_plot(self.people, 'numRegions', 'Num Regions Gtrends', self.imgpath+"gtrend_num_regions_ranksize.png") ############################################################################## # PLOTS TOTAL INTEREST ############################################################################## data = [women.timeInterest.values, men.timeInterest.values] self.logfile.write("\n \n Mann Withney U Temp Sum Interest:") U, p = stats.mstats.mannwhitneyu(women.timeInterest.values, men.timeInterest.values) ut.write_mannwithneyu(U, p, women.timeInterest.values, men.timeInterest.values, self.logfile) self.make_boxplot(data, labels, self.imgpath+"gtrend_time_interest_box.png", "sum interest") self.plot_ccdf(np.array(women.timeInterest.values.tolist()), np.array(men.timeInterest.values.tolist()), labels, self.imgpath+"gtrend_time_interest_ccdf.png", "Sum Interest", 1, 0) ut.plot_facet_dist(self.people, 'gender', 'timeInterest', self.imgpath+"gtrend_time_interest.png") data = [women.timePosInterest.values, men.timePosInterest.values] self.logfile.write("\n\n Mann Withney U Temp Pos Interest:") U, p = stats.mstats.mannwhitneyu(women.timePosInterest.values, men.timePosInterest.values) ut.write_mannwithneyu(U, p, women.timePosInterest.values, men.timePosInterest.values, self.logfile) self.make_boxplot(data, labels, self.imgpath+"gtrend_time_pos_interest_box.png", "num weeks with interest") self.plot_ccdf(np.array(women.timePosInterest.values.tolist()), np.array(men.timePosInterest.values.tolist()), labels, self.imgpath+"gtrend_time_pos_interest_ccdf.png", "Num weeks with interest", 1, 0) ut.plot_facet_dist(self.people, 'gender', 'timePosInterest', 
self.imgpath+"gtrend_time_pos_interest.png") ############################################################################## # PLOT Entropy Temp INTEREST ############################################################################## limPeople = self.people[np.isfinite(self.people['timeEntropy'])] #people[people.index not in inds] men = limPeople[limPeople.gender =="male"] women = limPeople[limPeople.gender =="female"] data = [women.timeEntropy.values, men.timeEntropy.values] self.logfile.write("\n\n Mann Withney U Time Entropy:") U, p = stats.mstats.mannwhitneyu(women.timeEntropy.values, men.timeEntropy.values) ut.write_mannwithneyu(U, p, women.timeEntropy.values, men.timeEntropy.values, self.logfile) self.make_boxplot(data, labels, self.imgpath+"gtrend_time_entropy_box.png", "temporal entropy") self.plot_ccdf(np.array(women.timeEntropy.values.tolist()), np.array(men.timeEntropy.values.tolist()), labels, self.imgpath+"gtrend_time_entropy_ccdf.png", "Temp Entropy", 1, 0) ut.plot_facet_dist(self.people, 'gender', 'timeEntropy', self.imgpath+"gtrend_time_entropy.png") ############################################################################## # PLOT ENTROPY ############################################################################## # for entropy we need to remove the nan value. 
If we dont have data the result is nan limPeople = self.people[np.isfinite(self.people['entropy'])] #people[people.index not in inds] men = limPeople[limPeople.gender =="male"] women = limPeople[limPeople.gender =="female"] labels = ['female ('+str(len(women.index))+')', 'male ('+str(len(men.index))+')'] data = [women.entropy.values, men.entropy.values] self.logfile.write("\n\n Mann Withney U Entropy:") U, p = stats.mstats.mannwhitneyu(women.entropy.values, men.entropy.values) ut.write_mannwithneyu(U, p, women.entropy.values, men.entropy.values, self.logfile) self.make_boxplot(data, labels, "gtrend_region_entropy_box.png", "shannon entropy") self.plot_ccdf(np.array(women.entropy.values.tolist()), np.array(men.entropy.values.tolist()), labels, self.imgpath+"gtrend_entropy_ccdf.png", "Entropy", 0, 0) ut.plot_facet_dist(self.people, 'gender', 'entropy', self.imgpath+"gtrend_region_entropy.png") self.regression()
def run(self):
    """Parse Google Trends JSON result files and plot gender comparisons.

    Counterpart of the CSV-based ``run``: reads every ``*.json`` file in
    ``self.onlyfiles``, extracts the interest-over-time series via
    ``self.parse_gtrend_json``, computes per-file sum of interest, number
    of positive weeks and temporal Shannon entropy, merges them into
    ``self.people`` and compares genders (Mann-Whitney U, boxplots,
    CCDFs, facet plots under ``self.imgpath``); ends with
    ``self.regression()``.

    Side effects: mutates ``self.people`` (inner merges may drop rows),
    writes to ``self.logfile`` and image files, prints debug output.
    """
    regionalEntropy = {}
    regionCount = {}
    timeEntropy = {}
    sumInterest = {}
    posInterest = {}
    ##############################################################################
    # PARSE GOOGLE TREND RESULT FILES
    ##############################################################################
    quota_error = 0
    # NOTE(review): monthlyFiles/weeklyFiles/weekly are never updated or
    # read below — apparently leftovers from an earlier version.
    monthlyFiles = 0
    weeklyFiles = 0
    for file in self.onlyfiles:
        startFromLine = -1
        startFromLineTime = -1
        weekly = False
        pos = file.find(".json")
        # skip anything that is not a .json result file
        if (pos >= 0):
            filename = file[0:pos]
            linesCounter = 1
            end = False
            endTime = False
            with open(self.datapath+"/"+file) as f:
                content = f.read()
                #print content
                data = self.parse_gtrend_json(content)
                timeseries = {}
                print data
                print len(data)
                # NOTE(review): parse_gtrend_json presumably returns a flat
                # [date, value, date, value, ...] list — TODO confirm.
                if len(data) > 3:
                    for ind in xrange(0, len(data), 2):
                        #print data[ind]
                        #print data[ind+1]
                        # guard against an odd-length list
                        if (ind+1) < len(data):
                            timeseries[data[ind]] = data[ind+1]
                timeFrequs = map(int, timeseries.values())
                if len(timeFrequs) > 1:
                    sumInterest[filename] = np.sum(timeFrequs)
                    posInterest[filename] = np.count_nonzero(timeFrequs)
                    if posInterest[filename] > 0:
                        timeEntropy[filename] = stats.entropy(timeFrequs)
                    else:
                        # entropy undefined when there was no interest at all
                        timeEntropy[filename] = np.nan
    # store results into dataframes keyed by filename
    interestDF = pd.DataFrame.from_dict(sumInterest.items())
    interestDF.columns=["filename", "timeInterest"]
    timeEntropyDF = pd.DataFrame.from_dict(timeEntropy.items())
    timeEntropyDF.columns=["filename", "timeEntropy"]
    posInterestDF = pd.DataFrame.from_dict(posInterest.items())
    posInterestDF.columns=["filename", "timePosInterest"]
    #print "regionalEntropyDF"
    #print regionalEntropyDF.head(n=1)
    #print regionalEntropyDF.shape
    #print "regionCountDF"
    #print regionCountDF.head(n=1)
    #print regionCountDF.shape
    #print "self.people"
    #print self.people.head(n=1)
    #print self.people.shape
    # add the computed statistics to the people file (inner joins on filename)
    self.people = self.people.merge(timeEntropyDF, right_on="filename", left_on="filename", how="inner")
    self.people = self.people.merge(interestDF, right_on="filename", left_on="filename", how="inner")
    self.people = self.people.merge(posInterestDF, right_on="filename", left_on="filename", how="inner")
    print "AFTER MERGING"
    print self.people.head(n=1)
    print self.people.shape
    men = self.people[self.people.gender =="male"]
    women = self.people[self.people.gender =="female"]
    labels = ['female ('+str(len(women.index))+')', 'male ('+str(len(men.index))+')']
    ##############################################################################
    # PLOTS TOTAL INTEREST
    ##############################################################################
    data = [women.timeInterest.values, men.timeInterest.values]
    self.logfile.write("\n \n Mann Withney U Temp Sum Interest:")
    U, p = stats.mstats.mannwhitneyu(women.timeInterest.values, men.timeInterest.values)
    ut.write_mannwithneyu(U, p, women.timeInterest.values, men.timeInterest.values, self.logfile)
    self.make_boxplot(data, labels, self.imgpath+"gtrend_time_interest_box.png", "sum interest")
    self.plot_ccdf(np.array(women.timeInterest.values.tolist()), np.array(men.timeInterest.values.tolist()), labels, self.imgpath+"gtrend_time_interest_ccdf.png", "Sum Interest", 1, 0)
    ut.plot_facet_dist(self.people, 'gender', 'timeInterest', self.imgpath+"gtrend_time_interest.png")
    data = [women.timePosInterest.values, men.timePosInterest.values]
    self.logfile.write("\n\n Mann Withney U Temp Pos Interest:")
    U, p = stats.mstats.mannwhitneyu(women.timePosInterest.values, men.timePosInterest.values)
    ut.write_mannwithneyu(U, p, women.timePosInterest.values, men.timePosInterest.values, self.logfile)
    self.make_boxplot(data, labels, self.imgpath+"gtrend_time_pos_interest_box.png", "num weeks with interest")
    self.plot_ccdf(np.array(women.timePosInterest.values.tolist()), np.array(men.timePosInterest.values.tolist()), labels, self.imgpath+"gtrend_time_pos_interest_ccdf.png", "Num weeks with interest", 1, 0)
    ut.plot_facet_dist(self.people, 'gender', 'timePosInterest', self.imgpath+"gtrend_time_pos_interest.png")
    ##############################################################################
    # PLOT Entropy Temp INTEREST
    ##############################################################################
    # drop NaN entropies (files with no interest) before comparing
    limPeople = self.people[np.isfinite(self.people['timeEntropy'])] #people[people.index not in inds]
    men = limPeople[limPeople.gender =="male"]
    women = limPeople[limPeople.gender =="female"]
    data = [women.timeEntropy.values, men.timeEntropy.values]
    self.logfile.write("\n\n Mann Withney U Time Entropy:")
    U, p = stats.mstats.mannwhitneyu(women.timeEntropy.values, men.timeEntropy.values)
    ut.write_mannwithneyu(U, p, women.timeEntropy.values, men.timeEntropy.values, self.logfile)
    self.make_boxplot(data, labels, self.imgpath+"gtrend_time_entropy_box.png", "temporal entropy")
    self.plot_ccdf(np.array(women.timeEntropy.values.tolist()), np.array(men.timeEntropy.values.tolist()), labels, self.imgpath+"gtrend_time_entropy_ccdf.png", "Temp Entropy", 1, 0)
    # NOTE(review): passes the unfiltered self.people (not limPeople) —
    # presumably the facet plot tolerates NaN; confirm before changing.
    ut.plot_facet_dist(self.people, 'gender', 'timeEntropy', self.imgpath+"gtrend_time_entropy.png")
    self.regression()