def _build_user_graph(name):
    if name.endswith("lang"):
        name = name[:-len("lang")]
        dist = stats.get_language_data(name)
        return charts.PieChart(dist.items(), max_options=8)
    elif name == "dropout":
        data = stats.get_dropout_figures()
        approx_data = stats.approximate(data)
        chart = charts.MultiLineChart(approx_data, data_name="histogram",
                                      x_axis=(0.4, 1.0, 0.1),
                                      y_axis=(0, 1.0, 0.1))
        chart.add_data("raw", data)
        return chart
    elif name == "prepostdiff":
        data = [r["pre_post_diff"] for r in stats.get_global_rater_stats()
                if r["n_tests"] > 2 and r["pre_post_diff"]]
        hist_data = stats.histogram(data, n_bins=11, normalize=False,
                                    x_min=-0.7, x_max=0.7)
        chart = charts.LineChart(hist_data, data_name="histogram",
                                 x_axis=(-0.8, 0.8, 0.2))
        chart.add_data("raw", data)
        return chart
    elif name == "abilityjlpt3":
        data = stats.get_user_scores("jlpt 3")
        hist_data = stats.histogram(data, x_min=0.0, x_max=1.0, normalize=False)
        chart = charts.LineChart(hist_data, data_name="histogram",
                                 x_axis=(0.0, 1.0, 0.1))
        chart.add_data("raw", data)
        return chart
    elif name == "abilityjlpt4":
        data = stats.get_user_scores("jlpt 4")
        hist_data = stats.histogram(data, x_min=0.0, x_max=1.0, normalize=False)
        chart = charts.LineChart(hist_data, data_name="histogram",
                                 x_axis=(0.0, 1.0, 0.1))
        chart.add_data("raw", data)
        return chart
    raise KeyError(name)
def statsex(self, objects):
    """
    Do some statistics on a source list
    Return dictionary
    """
    import stats, pstat
    # Return if we have no objects
    if len(objects) == 0:
        return 0
    # Define dictionary to hold statistics
    stat = {}
    # Get number of objects
    stat['N'] = str(len(objects))
    # Define list (float) of FWHM values
    fwhm = [float(obj[7]) for obj in objects]
    # Define list (float) of ELLIPTICITY values
    el = [float(obj[6]) for obj in objects]
    # Define list (float) of THETA_IMAGE values
    pa = [float(obj[5]) for obj in objects]
    # Define list (float) of 'Stella-like' values
    stella = [float(obj[9]) for obj in objects]
    # Create a histogram of FWHM values of binsize 1 pixel
    hfwhm = stats.histogram(fwhm, 40, [0, 40])[0]
    stat['medianFWHM'] = "%.2f" % stats.median(fwhm)
    stat['meanFWHM'] = "%.2f" % stats.mean(fwhm)
    stat['modeFWHM'] = "%.2f" % float(hfwhm.index(max(hfwhm)) + 0.5)
    try:
        stat['stdevFWHM'] = "%.2f" % stats.stdev(fwhm)
    except ZeroDivisionError:
        stat['stdevFWHM'] = '0.00'
    stat['medianEL'] = "%.2f" % stats.median(el)
    stat['meanEL'] = "%.2f" % stats.mean(el)
    try:
        stat['stdevEL'] = "%.2f" % stats.stdev(el)
    except ZeroDivisionError:
        stat['stdevEL'] = '0.00'
    # Histogram of Ellipticity PA (-180 to 180, bins of 45 deg)
    #stat['histoTHETA'] = stats.histogram(pa,8,[-180,180])[0]
    # Histogram of Stellarity (0 to 1, bins of 0.05)
    #stat['histoStella'] = stats.histogram(stella,20,[0,1.01])[0]
    return stat
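# Usage sketch for statsex() above -- a hypothetical example, not part of the
# original source. Each row mimics a SExtractor catalog line; only the column
# indices statsex() actually reads (5: THETA_IMAGE, 6: ELLIPTICITY, 7: FWHM,
# 9: stellarity) need meaningful values, and the enclosing class name
# (SourceCatalog) is a placeholder.
fake_objects = [
    (0, 0, 0, 0, 0, 12.0, 0.10, 3.2, 0, 0.95),
    (0, 0, 0, 0, 0, -45.0, 0.25, 4.1, 0, 0.80),
    (0, 0, 0, 0, 0, 80.0, 0.05, 3.7, 0, 0.99),
]
print(SourceCatalog().statsex(fake_objects))
# -> {'N': '3', 'medianFWHM': '3.70', 'meanFWHM': '3.67', ...}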
def _build_test_graph(name):
    if name == 'mean':
        score_data = stats.get_mean_score_nth_test()
        data = stats.group_by_points(score_data, y_max=1.0, y_min=0.0)
        chart = charts.MultiLineChart(data, y_axis=(0, 1, 0.1),
                                      data_name='grouped')
        chart.add_data('raw', score_data)
        return chart
    elif name == 'volume':
        user_data = stats.get_users_by_n_tests()
        return charts.LineChart(user_data)
    elif name == 'length':
        return charts.PieChart(stats.get_test_length_volume())
    elif name == 'normtime':
        user_data = stats.get_score_over_norm_time()
        return charts.LineChart(user_data)
    elif name == 'time':
        base_data = stats.get_score_over_time()
        data = stats.approximate(base_data)
        chart = charts.MultiLineChart(data, y_axis=(0, 1.05, 0.1),
                                      data_name='approximate')
        chart.add_data('raw', base_data)
        two_colours = charts.color_desc(2).split(',')
        three_colours = ','.join((two_colours[0], two_colours[1],
                                  two_colours[1]))
        chart['chco'] = three_colours
        return chart
    elif name == 'dropout':
        return charts.LineChart(stats.get_mean_score_by_n_tests())
    elif name == 'firstlast':
        data = stats.get_first_last_test()
        hist_data = stats.histogram(data, n_bins=11, normalize=False,
                                    x_min=-0.5, x_max=0.5)
        chart = charts.LineChart(hist_data, data_name='histogram',
                                 x_axis=(-0.5, 0.5, 0.1))
        chart.add_data('raw', data)
        return chart
    raise KeyError(name)
def hist(x, nbins=15):
    """Simple histogram function.

    **x** -- data set as numeric vector

    **nbins** -- number of histogram bins

    matlab equiv: HIST
    """
    import stats
    counts, smallest, binsize, extras = stats.histogram(x, nbins)
    graceplot().histoPlot(counts, x_min=smallest,
                          x_max=smallest + len(counts) * binsize,
                          fillcolor=2, edgecolor=1, labeled=0)
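# Usage sketch -- hypothetical, assuming the Grace backend (graceplot) and
# Strangman's stats.py are importable; the data below is fabricated.
import random

data = [random.gauss(0.0, 1.0) for _ in range(1000)]
hist(data, nbins=25)  # draws the binned counts in a Grace window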
def get_sentiword_score(self):
    f = open("SentiWordNet/data/SentiWord.json", "r")
    self.sentiwords = cjson.decode(f.readline())
    f.close()
    self.read_user_vector("user_vector/user_vector_new.json")
    self.neg_words = self.get_word_list(
        "SentiWordNet/data/negative_words_list.txt")
    self.pos_words = self.get_word_list(
        "SentiWordNet/data/positive_words_list.txt")
    myht = histogram()
    for user in self.user_vector:
        for tweet in self.user_vector[user]["tweets"]:
            words = self.process_sentence(tweet["text"])
            pnword_score = 0.0
            sentiword_score = 0.0
            sentiword_count = 0
            for word in words:
                if self.sentiwords.has_key(word):
                    sentiword_score += self.sentiwords[word]
                    sentiword_count += 1
                if self.pos_words.has_key(word):
                    pnword_score += 1.0
                elif self.neg_words.has_key(word):
                    pnword_score -= 1.0
            myht.add(pnword_score)
            tweet.update({"pnword_score": pnword_score})
            if sentiword_count != 0:
                sentiword_score = sentiword_score / float(sentiword_count)
            tweet.update({"sentiword_score": sentiword_score})
        #print self.user_vector[user]["tweets"]
    mean = myht.avg()
    std = myht.std()
    # normalize the positive_negative_word score
    for user in self.user_vector:
        for tweet in self.user_vector[user]["tweets"]:
            tweet["pnword_score"] = (tweet["pnword_score"] - mean) / std
            #print tweet["pnword_score"]
    f = open("user_vector/user_vector_new_2.json", "w")
    for user in self.user_vector:
        json.dump(self.user_vector[user], f)
        f.write("\n")
    f.close()
print('var:', stats.var(a), stats.var(af))
print('stdev:', stats.stdev(a), stats.stdev(af))
print('sem:', stats.sem(a), stats.sem(af))
print('describe:')
print(stats.describe(l))
print(stats.describe(lf))
print(stats.describe(a))
print(stats.describe(af))
print('\nFREQUENCY')
print('freqtable:')
print('itemfreq:')
print(stats.itemfreq(l))
print(stats.itemfreq(a))
print('scoreatpercentile:', stats.scoreatpercentile(l, 40),
      stats.scoreatpercentile(lf, 40), stats.scoreatpercentile(a, 40),
      stats.scoreatpercentile(af, 40))
print('percentileofscore:', stats.percentileofscore(l, 12),
      stats.percentileofscore(lf, 12), stats.percentileofscore(a, 12),
      stats.percentileofscore(af, 12))
print('histogram:', stats.histogram(l), stats.histogram(a))
print('cumfreq:')
print(stats.cumfreq(l))
print(stats.cumfreq(lf))
print(stats.cumfreq(a))
print(stats.cumfreq(af))
print('relfreq:')
print(stats.relfreq(l))
print(stats.relfreq(lf))
print(stats.relfreq(a))
print(stats.relfreq(af))
print('\nVARIATION')
print('obrientransform:')
l = range(1, 21)
a = N.array(l)
ll = [l] * 5
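# For reference: in Gary Strangman's classic stats.py (the module these demo
# lines exercise), histogram() returns a 4-tuple rather than NumPy's
# (counts, edges) pair. A small hedged illustration -- the variable names
# below are ours, not the demo script's:
scores = [2, 4, 4, 4, 5, 5, 7, 9]
counts, lower, binsize, extras = stats.histogram(scores, 4)
# counts  -- per-bin frequencies
# lower   -- lower real limit of the first bin
# binsize -- width of each bin
# extras  -- points falling outside the binned range
print('example histogram:', counts, lower, binsize, extras)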
def draw_distributions(self):
    """
    draw distributions for all terms in self.terms
    and save figs to specified folders
    """
    f = open("topics/term_senti_scores.json", "r")
    self.terms = cjson.decode(f.readline())
    f.close()
    print len(self.terms)
    count = 0
    for term in self.terms:
        hsw = stats.histogram()
        #hpn = stats.histogram()
        hst = stats.histogram()
        has = stats.histogram()
        for s in self.terms[term]["sentiword_score"]:
            hsw.add(s)
        #for s in self.terms[term]["pnword_score"]:
        #    hpn.add(s)
        for s in self.terms[term]["sentiment_score"]:
            hst.add(s)
        for s in self.terms[term]["avg_sentiscore"]:
            has.add(s)
        distribution_sw = hsw.histogram_2()
        #distribution_pn = hpn.histogram_2()
        distribution_st = hst.histogram_2()
        distribution_as = has.histogram_2()

        # sentiword score
        x_axis = range(len(distribution_sw))
        rcParams['figure.figsize'] = 24, 5
        x_ = [-1.025 + i * 0.05 for i in range(42)]
        plt.bar(x_axis, distribution_sw, width=0.8, facecolor='blue', alpha=0.5)
        X_ticks = ["%.1f" % (x_[i]) for i in range(len(x_))]
        for i in range(len(X_ticks)):
            if (i - 1) % 2 != 0:
                X_ticks[i] = ""
        plt.xticks(x_axis, X_ticks)
        plt.xlim(0, len(x_axis))
        plt.grid(True)
        plt.xlabel("Sentiword Score")
        plt.ylabel("Percentage (%)")
        plt.title("Sentiword Score Distribution - %s" % term)
        plt.savefig(sentiword_folder + "%s.png" % term, dpi=50)
        plt.clf()
        """
        # positive negative score
        x_axis = range(len(distribution_pn))
        rcParams['figure.figsize'] = 24, 5
        x_ = [-1.025 + i*0.05 for i in range(42)]
        plt.bar(x_axis, distribution_pn, width=0.8, facecolor='blue', alpha = 0.5)
        X_ticks = ["%.1f" %(x_[i]) for i in range(len(x_))]
        for i in range(len(X_ticks)):
            if (i-1)%2 != 0:
                X_ticks[i] = ""
        plt.xticks(x_axis, X_ticks)
        plt.xlim(0, len(x_axis))
        plt.grid(True)
        plt.xlabel("Positive-Negative Word Score")
        plt.ylabel("Percentage (%)")
        plt.title("PNWord Score Distribution - %s" %term)
        plt.savefig(pnword_folder+"%s.png" %term,dpi=50)
        plt.clf()
        """
        # sentiment score
        x_axis = range(len(distribution_st))
        rcParams['figure.figsize'] = 24, 5
        x_ = [-1.025 + i * 0.05 for i in range(42)]
        plt.bar(x_axis, distribution_st, width=0.8, facecolor='blue', alpha=0.5)
        X_ticks = ["%.1f" % (x_[i]) for i in range(len(x_))]
        for i in range(len(X_ticks)):
            if (i - 1) % 2 != 0:
                X_ticks[i] = ""
        plt.xticks(x_axis, X_ticks)
        plt.xlim(0, len(x_axis))
        plt.grid(True)
        plt.xlabel("Sentiment Score")
        plt.ylabel("Percentage (%)")
        plt.title("Sentiment Score Distribution - %s" % term)
        plt.savefig(sentiment_folder + "%s.png" % term, dpi=50)
        plt.clf()

        # sentiscore score
        x_axis = range(len(distribution_as))
        rcParams['figure.figsize'] = 24, 5
        x_ = [-1.025 + i * 0.05 for i in range(42)]
        plt.bar(x_axis, distribution_as, width=0.8, facecolor='blue', alpha=0.5)
        X_ticks = ["%.1f" % (x_[i]) for i in range(len(x_))]
        for i in range(len(X_ticks)):
            if (i - 1) % 2 != 0:
                X_ticks[i] = ""
        plt.xticks(x_axis, X_ticks)
        plt.xlim(0, len(x_axis))
        plt.grid(True)
        plt.xlabel("Avg Senti Score")
        plt.ylabel("Percentage (%)")
        plt.title("Senti Score Distribution - %s" % term)
        plt.savefig(sentiscore_folder + "%s.png" % term, dpi=50)
        plt.clf()

        count += 1
        print "done %s: neutral_st %d neutral_sw %d neutral_as %d count %d" % (
            term, hst.count_zero(), hsw.count_zero(), has.count_zero(), count)
print 'tstdev:', stats.tstdev(a, (5, 17)), stats.tstdev(af, (5, 17))
print 'tsem:', stats.tsem(a, (5, 17)), stats.tsem(af, (5, 17))
print 'describe:'
print stats.describe(l)
print stats.describe(lf)
print stats.describe(a)
print stats.describe(af)
print '\nFREQUENCY'
print 'freqtable:'
print 'itemfreq:'
print stats.itemfreq(l)
print stats.itemfreq(a)
print 'scoreatpercentile:', stats.scoreatpercentile(l, 40), \
    stats.scoreatpercentile(lf, 40), stats.scoreatpercentile(a, 40), \
    stats.scoreatpercentile(af, 40)
print 'percentileofscore:', stats.percentileofscore(l, 12), \
    stats.percentileofscore(lf, 12), stats.percentileofscore(a, 12), \
    stats.percentileofscore(af, 12)
print 'histogram:', stats.histogram(l), stats.histogram(a)
print 'cumfreq:'
print stats.cumfreq(l)
print stats.cumfreq(lf)
print stats.cumfreq(a)
print stats.cumfreq(af)
print 'relfreq:'
print stats.relfreq(l)
print stats.relfreq(lf)
print stats.relfreq(a)
print stats.relfreq(af)
print '\nVARIATION'
print 'obrientransform:'
l = range(1, 21)
def read_data_in_range(filename="./", topschoolfile="./", start_year=2000,
                       end_year=2014, self_edge=True):
    """
    @description: read the recent data back until specified <cutting_year>
    @type filename: string
    @param filename: input file path and name
    @type start_year: integer
    @param start_year: the earliest year to be considered
    @type end_year: integer
    @param end_year: the latest year to be considered
    @type self_edge: Boolean
    @param self_edge: whether self edges are included or not; True-yes, False-not
    @return: list of nodes
    @return: list of edges
    """
    top_50 = []
    f = open(topschoolfile, "r")
    for line in f:
        line = line.strip().lower()
        top_50.append(line)
    f.close()

    s = {}
    edge_list_all = []
    f = open(filename, "r")
    f.readline()  # skip the first row
    for line in f:
        line = line.lower()
        line = line.strip()  # remove those "\r\n"
        lines = line.split(",")  ## subject to change
        if len(lines) == 2 or len(lines) == 3:
            # if lines[0].strip() in top_50 and lines[1].strip() in top_50:
            edge = []
            for i in range(2):
                edge.append(lines[i].strip())
                if s.has_key(lines[i].strip()):
                    s[lines[i].strip()] += 1
                else:
                    s.update({lines[i].strip(): 1})
            if len(lines) == 2:  # without year data
                edge.append("-")  ## never enter this loop
            else:
                #print lines
                if len(lines[2]) > 0:  # with year data
                    edge.append(lines[2].strip())
                else:  # without year data
                    pass
            edge_list_all.append(edge)
    f.close()

    ## statistical analysis
    hist = stats.histogram()
    stat = {}
    cnt = 0
    ## re-organize the edge with weights
    edge_dict = {}
    for edge in edge_list_all:
        if len(edge) == 3 and int(edge[2]) >= start_year and int(edge[2]) <= end_year:
            ## filtering the recent faculty data
            cnt += 1
            key = edge[0] + "#" + edge[1]
            hist.add(edge[2].strip())
            if not stat.has_key(edge[0]):
                stat.update({edge[0]: {'total': 1, 'wyear': 1}})
            else:
                stat[edge[0]]['total'] += 1
                stat[edge[0]]['wyear'] += 1
            if edge_dict.has_key(key):
                edge_dict[key] += 1.0
            else:
                edge_dict.update({key: 1.0})
        else:
            if not stat.has_key(edge[0]):
                stat.update({edge[0]: {'total': 1, 'wyear': 0}})
            else:
                stat[edge[0]]['total'] += 1

    # # statistics
    # index, dist, cdf = hist.cdf()
    # print hist._max, hist._min
    # print len(index), index
    # print len(dist), dist
    # print len(cdf), cdf
    #
    # f = open("../result/result_top50_cs_newdata_apr09/year_statistical_from%d_to%d_extended.csv" %(start_year, end_year),"w")
    # f.write("univ,total,wyear\n")
    # for key in stat:
    #     f.write("%s,%d,%d\n" %(key, stat[key]['total'], stat[key]['wyear']))
    # f.close()
    #
    # # the CDF of year distribution
    # f = open("../result/result_top50_cs_newdata_apr09/year_cdf_from%d_to%d_extended.csv" %(start_year, end_year),"w")
    # f.write("year,freq,percentile\n")
    # for i in range(len(index)):
    #     f.write("%s,%d,%.3f\n" %(index[i], int(dist[i]), cdf[i]))
    # f.close()

    edge_list = []
    for item in edge_dict.iteritems():
        edge = []
        univs = item[0].split("#")
        if not self_edge == True:
            if not univs[0].strip() == univs[1].strip():
                edge.append(univs[0].strip())
                edge.append(univs[1].strip())
                edge.append(item[1])
                edge_list.append(edge)
            else:
                pass
        else:
            edge.append(univs[0].strip())
            edge.append(univs[1].strip())
            edge.append(item[1])
            edge_list.append(edge)
    #print len(edge_list), edge_list
    node_list = sorted(s.keys(), reverse=False)
    return node_list, edge_list
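# Hypothetical invocation of read_data_in_range() -- the CSV layout (two
# school columns plus an optional year) is inferred from the parsing logic
# above, and both file paths are placeholders.
node_list, edge_list = read_data_in_range(
    filename="../data/faculty_edges.csv",
    topschoolfile="../data/top50_schools.txt",
    start_year=2005, end_year=2014, self_edge=False)
print len(node_list), len(edge_list)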
def keyword_distribution(self, rm_en=True):
    f = open("results/statistics_hashtag_sentiscores_%d.csv" % len(self.keywords), "w")
    f.write("keyword,user_count,tag_count,count,sw_avg_neg,sw_avg_pos,sw_count_neg,sw_count_zero,sw_count_pos,sw_min,sw_max,"
            + "st_avg_neg,st_avg_pos,st_count_neg,st_count_zero,st_count_pos,st_min,st_max,"
            + "sc_avg_neg,sc_avg_pos,sc_count_neg,sc_count_zero,sc_count_pos,sc_min,sc_max\n")
    for word in self.keywords:
        hsw = stats.histogram()
        hst = stats.histogram()
        has = stats.histogram()
        for score in self.keywords[word]["scores"]:
            hsw.add(score["sentiword_score"])
            hst.add(score["sentiment_score"])
            has.add(score["sentiscore"])
        f.write("%s,%d,%d,%d,%.3f,%.3f,%d,%d,%d,%.3f,%.3f," %
                (word, self.keywords[word]["user_count"],
                 self.keywords[word]["tag_count"], hsw._count,
                 hsw._mean_neg, hsw._mean_pos, hsw._count_neg, hsw._zero,
                 hsw._count_pos, hsw._min, hsw._max))
        f.write("%.3f,%.3f,%d,%d,%d,%.3f,%.3f," %
                (hst._mean_neg, hst._mean_pos, hst._count_neg, hst._zero,
                 hst._count_pos, hst._min, hst._max))
        f.write("%.3f,%.3f,%d,%d,%d,%.3f,%.3f\n" %
                (has._mean_neg, has._mean_pos, has._count_neg, has._zero,
                 has._count_pos, has._min, has._max))
        distribution_sw = hsw.histogram_2()
        distribution_st = hst.histogram_2()
        distribution_as = has.histogram_2()

        # sentiword score
        x_axis = range(len(distribution_sw))
        rcParams['figure.figsize'] = 24, 5
        x_ = [-1.025 + i * 0.05 for i in range(42)]
        plt.bar(x_axis, distribution_sw, width=0.8, facecolor='blue', alpha=0.5)
        X_ticks = ["%.1f" % (x_[i]) for i in range(len(x_))]
        for i in range(len(X_ticks)):
            if (i - 1) % 2 != 0:
                X_ticks[i] = ""
        plt.xticks(x_axis, X_ticks)
        plt.xlim(0, len(x_axis))
        plt.grid(True)
        plt.xlabel("Sentiword Score")
        plt.ylabel("Percentage (%)")
        plt.title("Sentiword Score Distribution - %s" % word)
        plt.savefig(OUT_DIR + SUB_DIR_1 + "%s.png" % word, dpi=50)
        plt.clf()

        # sentiment score
        x_axis = range(len(distribution_st))
        rcParams['figure.figsize'] = 24, 5
        x_ = [-1.025 + i * 0.05 for i in range(42)]
        plt.bar(x_axis, distribution_st, width=0.8, facecolor='blue', alpha=0.5)
        X_ticks = ["%.1f" % (x_[i]) for i in range(len(x_))]
        for i in range(len(X_ticks)):
            if (i - 1) % 2 != 0:
                X_ticks[i] = ""
        plt.xticks(x_axis, X_ticks)
        plt.xlim(0, len(x_axis))
        plt.grid(True)
        plt.xlabel("Sentiment Score")
        plt.ylabel("Percentage (%)")
        plt.title("Sentiment Score Distribution - %s" % word)
        plt.savefig(OUT_DIR + SUB_DIR_2 + "%s.png" % word, dpi=50)
        plt.clf()

        # sentiscore score
        x_axis = range(len(distribution_as))
        rcParams['figure.figsize'] = 24, 5
        x_ = [-1.025 + i * 0.05 for i in range(42)]
        plt.bar(x_axis, distribution_as, width=0.8, facecolor='blue', alpha=0.5)
        X_ticks = ["%.1f" % (x_[i]) for i in range(len(x_))]
        for i in range(len(X_ticks)):
            if (i - 1) % 2 != 0:
                X_ticks[i] = ""
        plt.xticks(x_axis, X_ticks)
        plt.xlim(0, len(x_axis))
        plt.grid(True)
        plt.xlabel("Avg Senti Score")
        plt.ylabel("Percentage (%)")
        plt.title("Senti Score Distribution - %s" % word)
        plt.savefig(OUT_DIR + SUB_DIR_3 + "%s.png" % word, dpi=50)
        plt.clf()

        print "done %s: neutral_st %d neutral_sw %d neutral_as %d" % (
            word, hst.count_zero(), hsw.count_zero(), has.count_zero())
    f.close()
    'lensr_v1': (1.0, 3.0)
}

# Start the nestle fit with the hsiao-stretch or the custom_model
# Save the parameters in 'nestfitparam.dat' and plot the model
MI_mH = MI_model(mH, 4, f=1.7, useprior=True, samerv=True)
model, res = nest_lc(phot_d, MI_mH, fit_param, fit_bounds)
#MI_geu_source = MI_model(geu_source,4)
#model,res = nest_lc(phot_d,MI_geu_source,fit_param_16geu,fit_bounds)

# Draw the correlations between the parameters that are degenerated
samp, params = res.samples, res.vparam_names
samples(samp, params, fit_param[9:])

# Draw the probability density function
df, data = res.ndof, model.tot_amp
fmin, minamp = min(model.chi), data[np.argmin(model.chi)]
histogram(data)

# Plot the lightcurves for the model and write the parameters into a file
model.plot(phot_d)
myfile = open('nestfitparam.dat', 'w')
for name in fit_param:
    myfile.write(name + ' = ' + str(model.get(name)) + ' (' +
                 str(res.errors[name]) + ')\n')
myfile.write("chiqsquare = " + str(fmin) + ' Dof = ' + str(df) + '\n')
myfile.write('total amplification = ' + str(minamp) + '\n')
myfile.write('Bayesian evidence z = ' + str(np.exp(res.logz)))
myfile.close()
import json
import operator
import pylab
import sys

from stats import histogram
from pylabutils import setupPlot

prev = None
A = []
for line in sys.stdin:
    data = json.loads(line)
    acc = data.get('accuracy')
    if acc:
        A.append(min(999, acc))

w = 10.0
XY = histogram(A, w)
X = map(operator.itemgetter(0), XY)
Y = map(operator.itemgetter(1), XY)
pylab.bar(X, Y, width=w)
setupPlot('Accuracy (m)', 'Frequency', 'Accuracy Radius')
pylab.savefig('accuracy-histogram.png')
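# The histogram imported above is a project-local helper, not scipy's: it
# takes a bin width and evidently yields (bin_start, count) pairs, judging by
# the itemgetter(0)/itemgetter(1) unpacking. A minimal compatible sketch,
# inferred from the call sites rather than taken from the project's source
# (the one-argument call in the KML timing script below suggests a default
# width):
from collections import Counter

def histogram(values, width=1.0):
    """Bucket values into bins of the given width.

    Returns a sorted list of (bin_start, count) pairs.
    """
    counts = Counter(int(v // width) * width for v in values)
    return sorted(counts.items())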
def train(notations, output):
    hist = stats.histogram(read_notations(notations))
    yaml.dump(hist, output)
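# Hypothetical call -- train() just needs a readable file of notations and a
# writable output; both paths are placeholders.
with open("notations.txt") as notations, open("histogram.yaml", "w") as output:
    train(notations, output)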
root = etree.parse(sys.stdin).getroot()
document = root.getchildren()[-1]
placemark = document.getchildren()[-1]
track = placemark.getchildren()[-1]

dts = []
prev_when = None
for child in track.iterchildren():
    if child.tag.endswith('when'):
        ts = dateutil.parser.parse(child.text)
        when = int(time.mktime(ts.timetuple()))
        if prev_when is not None:
            dts.append(when - prev_when)
        prev_when = when

rmv = RollingMeanVar(0.001)
Y = []
for i, dt in enumerate(dts):
    rmv.update(dt, i)
    Y.append(rmv.mean())
pylab.plot(range(len(Y)), Y, color=RED)
setupPlot('Sample Index', 'Interval (s)', 'Polling Interval - Rolling Average')
pylab.savefig('timings-frequency.png')
pylab.close()

Y = map(operator.itemgetter(1), histogram(dt % 60 for dt in dts))
pylab.bar(range(len(Y)), Y, width=1.0)
setupPlot('Interval (s, mod 60)', 'Frequency',
          'Polling Interval - Seconds mod 60')
pylab.savefig('timings-second-histogram.png')
pylab.close()
print print("ACCOUNTS") print("--------------------------------------------------") print("Number of accounts: %s" % (len(accounts))) print print("HOURS OF PLAY") print("--------------------------------------------------") print("Total play time: %s" % getHourString(stats.sum(table[TABLE_TIME]))) print("Largest play time for a single account: %s" % getHourString(max(table[TABLE_TIME]))) print("Median total play time for all accounts: %s" % getHourString(stats.median(table[TABLE_TIME]))) print( "Macro Histogram play time: \n%s" % getHistogramString(stats.histogram(table[TABLE_TIME], 10, [0, 100 * 3600]), getHourString)) print( "Micro Histogram play time: \n%s" % getHistogramString( stats.histogram(table[TABLE_TIME], 10, [0, 10 * 3600]), getHourString)) print( "Pico Histogram play time: \n%s" % getHistogramString( stats.histogram(table[TABLE_TIME], 12, [0, 1 * 3600]), getHourString)) print print("LOGINS") print("--------------------------------------------------") print("Total logins: %s" % stats.sum(table[TABLE_LOGINS])) print("Largest logins for a single account: %s" % max(table[TABLE_LOGINS])) print("Median number of logins for all accounts: %s" % stats.median(table[TABLE_LOGINS]))
def read_data(filename, topschoolfile, self_edge=True, extended=True):
    """
    @type filename: string
    @param filename: input file path and name
    @type self_edge: Boolean
    @param self_edge: whether self edges are included or not; True-yes, False-not
    @type extended: Boolean
    @param extended: whether the graph is extended or restricted in top schools or not; default is True
    @return: list of nodes
    @return: list of edges
    """
    top_50 = []
    f = open(topschoolfile, "r")
    for line in f:
        line = line.strip().lower()
        top_50.append(line)
    f.close()

    ## statistical analysis
    hist = stats.histogram()
    stat = {}

    s = {}
    edge_list_all = []
    f = open(filename, "r")
    # print f.readline()  # skip the first row
    for line in f:
        line = line.lower()
        line = line.strip()  # remove those "\r\n"
        lines = line.split(",")  ## subject to change
        if len(lines) == 2 or len(lines) == 3:
            if extended == True:
                edge = []
                for i in range(2):
                    edge.append(lines[i].strip())
                    if s.has_key(lines[i].strip()):
                        s[lines[i].strip()] += 1
                    else:
                        s.update({lines[i].strip(): 1})
                if len(lines) == 2:  # without year data
                    edge.append("-")
                    if not stat.has_key(lines[0]):
                        stat.update({lines[0]: {'total': 1, 'wyear': 0}})
                    else:
                        stat[lines[0]]['total'] += 1
                else:
                    #print lines
                    if len(lines[2]) > 0:  # with year data
                        edge.append(lines[2].strip())
                        hist.add(lines[2].strip())
                        if not stat.has_key(lines[0]):
                            stat.update({lines[0]: {'total': 1, 'wyear': 1}})
                        else:
                            stat[lines[0]]['total'] += 1
                            stat[lines[0]]['wyear'] += 1
                    else:  # without year data
                        if not stat.has_key(lines[0]):
                            stat.update({lines[0]: {'total': 1, 'wyear': 0}})
                        else:
                            stat[lines[0]]['total'] += 1
                edge_list_all.append(edge)
            else:
                if lines[0].strip() in top_50 and lines[1].strip() in top_50:
                    edge = []
                    for i in range(2):
                        edge.append(lines[i].strip())
                        if s.has_key(lines[i].strip()):
                            s[lines[i].strip()] += 1
                        else:
                            s.update({lines[i].strip(): 1})
                    if len(lines) == 2:  # without year data
                        edge.append("-")
                        if not stat.has_key(lines[0]):
                            stat.update({lines[0]: {'total': 1, 'wyear': 0}})
                        else:
                            stat[lines[0]]['total'] += 1
                    else:
                        #print lines
                        if len(lines[2]) > 0:  # with year data
                            edge.append(lines[2].strip())
                            hist.add(lines[2].strip())
                            if not stat.has_key(lines[0]):
                                stat.update({lines[0]: {'total': 1, 'wyear': 1}})
                            else:
                                stat[lines[0]]['total'] += 1
                                stat[lines[0]]['wyear'] += 1
                        else:  # without year data
                            if not stat.has_key(lines[0]):
                                stat.update({lines[0]: {'total': 1, 'wyear': 0}})
                            else:
                                stat[lines[0]]['total'] += 1
                    edge_list_all.append(edge)
        else:
            print "invalid line!", lines
    f.close()

    # # statistical
    # f = open("../result/result_may28/me/statistics/year_statistical.csv","w")
    # f.write("univ,total,wyear\n")
    # for key in stat:
    #     f.write("%s,%d,%d\n" %(key, stat[key]['total'], stat[key]['wyear']))
    # f.close()

    # index, dist, cdf = hist.cdf()
    # print hist._max, hist._min
    # print len(index), index
    # print len(dist), dist
    # print len(cdf), cdf
    # print sum(dist)
    #
    # # the CDF of year distribution
    # f = open("../result/result_may28/ee/statistics/year_cdf.csv","w")
    # f.write("year,freq,percentile\n")
    # for i in range(len(index)):
    #     f.write("%s,%d,%.3f\n" %(index[i], int(dist[i]), cdf[i]))
    # f.close()
    #
    # exit(0)

    # univlist = sorted(s.iteritems(), key = lambda asd:asd[0], reverse = False)
    # fo = open("../data/out_me.csv","w")
    # for i in univlist:
    #     fo.write("%s,%d\n" %(i[0],i[1]))
    # fo.close()
    # exit(0)

    ## re-organize the edge with weights
    edge_dict = {}
    for edge in edge_list_all:
        key = edge[0] + "#" + edge[1]
        if edge_dict.has_key(key):
            edge_dict[key] += 1.0
        else:
            edge_dict.update({key: 1.0})

    edge_list = []
    for item in edge_dict.iteritems():
        if self_edge == True:
            edge = []
            edge.extend(item[0].split("#"))
            edge.append(item[1])
            edge_list.append(edge)
        else:
            edge = []
            nodes = item[0].split("#")
            if not nodes[0] == nodes[1]:
                edge.extend(nodes)
                edge.append(item[1])
                edge_list.append(edge)

    node_list = sorted(s.keys(), reverse=False)
    return node_list, edge_list