def pr_msg_received(self, type='r'):
    """Correlate users' rank with their event counts one week later.

    For every date key of ``self.log.get_user_date_seq()`` except
    ``self.log.end_date``, each user that appears in that date's rank list
    (from ``Stat().get_users_rank()``) contributes one sample: the 1-based
    position in the rank list, paired with the number of ``type`` entries
    in that user's sequence seven days later.  Pearson and Spearman
    correlations are computed per date; only the Spearman statistic is
    returned.

    :param type: sequence entry to count for each user (default ``'r'``).
    :return: ``(x, y)`` where ``x`` holds the days elapsed since
        ``self.log.start_date`` (ascending date order) and ``y`` the
        Spearman statistic for each of those dates.
    """
    pearson_by_date = {}
    spearman_by_date = {}
    ranking = Stat().get_users_rank()
    sequences = self.log.get_user_date_seq()
    week = timedelta(7)

    for day in sequences.iterkeys():
        positions = []
        counts = []
        for member in sequences[day].iterkeys():
            # The final date has no following week to look ahead to, and
            # users missing from the rank list cannot be positioned.
            if day == self.log.end_date or member not in ranking[day]:
                continue
            positions.append(ranking[day].index(member) + 1)
            counts.append(Counter(sequences[day + week][member])[type])

        if counts and positions:
            # The Pearson result is computed alongside Spearman but is not
            # part of the returned value.
            pearson_by_date[day] = stats.pearsonr(positions, counts)
            spearman_by_date[day] = stats.spearmanr(positions, counts)

    ordered_days = sorted(spearman_by_date)
    x = [(day - self.log.start_date).days for day in ordered_days]
    y = [spearman_by_date[day][0] for day in ordered_days]
    return x, y
def freq_pr_corr_weekly(self):
    """Plot weekly correlations between k-gram frequency and rank change.

    For every k-gram returned by ``Sequence().create_weekly_sequences()``
    and every week from ``self.log.start_date + 7 days`` up to and
    including ``self.log.end_date``, one sample is collected per ranked
    user who has a count for that k-gram in the previous week:

    * the previous week's k-gram count, and
    * the change in rank-list index (this week's index minus the previous
      week's index); users absent from the previous week's rank list get
      ``index + 1`` instead.

    Spearman and Pearson correlations of those samples are stored per
    k-gram and date, printed, and the coefficients with ``p < 0.001`` are
    plotted against days since ``self.log.start_date``.

    Returns nothing; output is via ``print`` and matplotlib figures.
    """
    st = Stat()
    rank = st.get_users_rank()
    seq = Sequence()
    kgram_list, kgram_count = seq.create_weekly_sequences()
    week = timedelta(7)

    def significant_points(by_date):
        """Return ``(days, coefficients)`` for usable results, p < 0.001."""
        days = []
        coefficients = []
        for date in by_date:
            coefficient = by_date[date][0]
            p_value = by_date[date][1]
            # NaN must be detected with np.isnan: an identity test against
            # np.nan never matches a computed np.float64 value.
            if (isinstance(coefficient, np.float64)
                    and not np.isnan(coefficient)
                    and p_value < 0.001):
                days.append((date - self.log.start_date).days)
                coefficients.append(coefficient)
        return days, coefficients

    scorr = {}
    pcorr = {}
    for kgram in kgram_list:
        scorr[kgram] = {}
        pcorr[kgram] = {}
        date = self.log.start_date + week
        while date <= self.log.end_date:
            previous = date - week
            lfreq = []
            lrank = []
            for i, user in enumerate(rank[date]):
                if user not in kgram_count[previous]:
                    continue
                user_counts = kgram_count[previous][user]
                if kgram not in user_counts:
                    continue
                lfreq.append(user_counts[kgram])
                if user in rank[previous]:
                    lrank.append(i - rank[previous].index(user))
                else:
                    lrank.append(i + 1)
            # Truthiness, not `is not []`: the identity test is always
            # true and let empty samples through to the correlation calls.
            if lfreq and lrank:
                scorr[kgram][date] = stats.spearmanr(lfreq, lrank)
                pcorr[kgram][date] = stats.pearsonr(lfreq, lrank)
            date += week
    print(scorr)
    print(pcorr)

    # NOTE(review): one figure per k-gram is assumed here -- confirm that
    # the plots were not meant to be overlaid on a single figure.
    for kgram in kgram_list:
        dates, score = significant_points(scorr[kgram])
        if score:
            print(kgram)
            print(score)
            plt.plot(dates, score, '.')
            plt.show()

    for kgram in kgram_list:
        dates, score = significant_points(pcorr[kgram])
        if score:
            print(kgram)
            plt.plot(dates, score, '.')
            plt.show()
def top_ten_change_in_pr(self):
    """Plot week-over-week rank changes and relate them to event volume.

    Steps, in order:

    1. Read the PageRank values and graph stored under
       ``self.log.end_date`` in ``Stat()`` and order the user ids by
       ascending PageRank.  All users are sampled.
    2. For every week from ``self.log.start_date`` to
       ``self.log.end_date - 7 days``, compute each sampled user's change
       in rank-list index (next week's index minus this week's).  A user
       that cannot be located in either week contributes ``0``.
    3. Plot every user's change series, then print the Spearman and
       Pearson correlations between the weekly event counts and the
       per-week average of (a) all changes, (b) positive changes only and
       (c) non-positive changes only.
    4. Divide (b) and (c) by ``self.log.event_frequency('msg', 'week')``
       with its first entry dropped, and plot the averages.

    Returns nothing; output is via ``print`` and matplotlib figures.
    """
    join_day = self.log.join_date_since_inception()
    st = Stat()
    pr = st.pagerank()[self.log.end_date]
    graph = st.graphs[self.log.end_date]
    vertex_id = graph.vertex_properties['id']

    id_invert = {vertex_id[v]: int(v) for v in graph.vertices()}
    print(id_invert)
    pr_dict = {int(vertex_id[v]): pr[v] for v in graph.vertices()}

    # Ascending PageRank.  Slice these two lists identically to restrict
    # the analysis to a subset (e.g. [-10:] for the ten highest values).
    samples_pr = sorted(pr_dict.values())
    samples_id = sorted(pr_dict, key=pr_dict.get)
    K = len(samples_id)

    samples_join = [join_day[str(idx)] for idx in samples_id]
    print(samples_pr)
    print(samples_join)
    print(samples_id)

    rank = st.get_users_rank()
    week = timedelta(7)
    date_idx = []
    pr_ev = [[] for _ in range(K)]   # signed change per user per week
    pr_inc = [[] for _ in range(K)]  # positive part of the change, else 0
    pr_dec = [[] for _ in range(K)]  # non-positive part of the change, else 0

    date = self.log.start_date
    while date <= self.log.end_date - week:
        date_idx.append((date - self.log.start_date).days)
        for i, sample in enumerate(samples_id):
            user = str(sample)
            try:
                change = (rank[date + week].index(user)
                          - rank[date].index(user))
            except (KeyError, ValueError):
                # Week missing from `rank`, or user not ranked in one of
                # the two weeks: record "no change" so that every series
                # keeps one entry per week.
                change = 0
            pr_ev[i].append(change)
            pr_inc[i].append(max(change, 0))
            pr_dec[i].append(min(change, 0))
        date += week

    for changes in pr_ev:
        plt.plot(date_idx, changes)
    plt.show()

    event_snapshot = self.log.get_event_snapshot('week', 'event')
    data = []
    date = self.log.start_date
    while date <= self.log.end_date - week:
        data.append(len(event_snapshot[date]))
        date += week
    freq = self.log.event_frequency('msg', 'week')

    prch = np.average(pr_ev, axis=0)
    prchp = np.average(pr_inc, axis=0)
    prchn = np.average(pr_dec, axis=0)
    for series in (prch, prchp, prchn):
        print(stats.spearmanr(series, data))
        print(stats.pearsonr(series, data))

    # presumably freq[1:] lines up with the week each change leads into --
    # TODO confirm against event_frequency().
    weekly_msgs = np.array(freq[1:])
    prchp = prchp / weekly_msgs
    prchn = prchn / weekly_msgs

    plt.plot(list(range(len(prch))), prch)
    plt.show()
    plt.plot(list(range(len(prchp))), prchp)
    plt.show()
    plt.plot(list(range(len(prchp))), np.abs(prchn))
    plt.show()
def pr_new_msgs_received(self):
    """Plot how rank position relates to new messages received.

    For every date in ``Stat().get_users_rank()`` that is also present in
    ``self.log.get_new_messages_received()``, the Spearman correlation
    between each user's number of new messages (0 when the user has none
    recorded) and the user's 1-based rank position is computed.
    Coefficients below 0.5 are plotted against days since
    ``self.log.start_date``.

    A second figure shows the total number of new messages accumulated
    per rank position over all dates; the largest total is printed first.

    Returns nothing; output is via ``print`` and matplotlib figures.
    """
    new_weekly_msgs = self.log.get_new_messages_received()
    rank = Stat().get_users_rank()

    spcorr = {}
    user_new_msg = {}  # rank position -> new messages summed over all dates
    for date in rank:
        has_messages = date in new_weekly_msgs
        wrank = []
        wnr = []
        for position, user in enumerate(rank[date], 1):
            user_new_msg.setdefault(position, 0)
            wrank.append(position)
            if not has_messages:
                continue
            if user in new_weekly_msgs[date]:
                received = len(new_weekly_msgs[date][user])
                wnr.append(received)
                user_new_msg[position] += received
            else:
                wnr.append(0)
        # Lengths differ when the date has no message data at all; empty
        # samples are skipped as there is nothing to correlate.
        if wrank and len(wnr) == len(wrank):
            spcorr[date] = stats.spearmanr(wnr, wrank)

    x = []
    y = []
    for date in sorted(spcorr):
        if spcorr[date][0] < 0.5:
            x.append((date - self.log.start_date).days)
            y.append(spcorr[date][0])

    plt.plot(x, y, 'g')
    plt.xlabel('Day from inception')
    plt.ylabel('Spearman correlation')
    plt.title('Spearman correlation between page rank and number ' +
              'of\n new messages received through out time')
    plt.margins(0.05)
    plt.show()

    x = sorted(user_new_msg)
    y = [user_new_msg[position] for position in x]
    print(np.max(y))
    # `normed` was removed from Matplotlib; False was its default, so the
    # keyword is simply omitted.
    # NOTE(review): this bins the per-rank totals `y` using the rank
    # positions `x` as bin edges; the title and axis labels suggest a
    # per-rank bar chart may have been intended -- confirm.
    plt.hist(y, x)
    plt.title(
        'Total number of new messages received by each rank through out time'
    )
    plt.xlabel('Pagerank')
    plt.ylabel('Number of new messages received')
    plt.margins(0.05)
    plt.show()
def ten_top_rank(self):
    """Plot the rank position over time for a fixed slice of users.

    User ids are ordered by ascending PageRank (taken from ``Stat()`` at
    ``self.log.end_date``) and the slice ``[980:1000]`` of that ordering
    is sampled; the slice is shorter, possibly empty, when fewer users
    exist.  For every day from ``self.log.start_date`` up to but
    excluding ``self.log.end_date``, each sampled user's 1-based position
    in that day's rank list is recorded when it can be found, and one
    line per user is drawn on a single figure.

    Returns nothing; output is via ``print`` and a matplotlib figure.
    """
    join_day = self.log.join_date_since_inception()
    st = Stat()
    pr = st.pagerank()[self.log.end_date]
    graph = st.graphs[self.log.end_date]
    vertex_id = graph.vertex_properties['id']

    id_invert = {vertex_id[v]: int(v) for v in graph.vertices()}
    print(id_invert)
    pr_dict = {int(vertex_id[v]): pr[v] for v in graph.vertices()}

    sorted_pr = sorted(pr_dict.values())
    sorted_id = sorted(pr_dict, key=pr_dict.get)

    K = 20
    samples_pr = sorted_pr[1000 - K:1000]
    samples_id = sorted_id[1000 - K:1000]

    samples_join = [join_day[str(idx)] for idx in samples_id]
    print(samples_pr)
    print(samples_join)
    print(samples_id)

    rank = st.get_users_rank()
    # One (days, positions) pair of lists per sampled user.
    tracks = [([], []) for _ in samples_id]

    date = self.log.start_date
    while date < self.log.end_date:
        day_offset = (date - self.log.start_date).days
        for i, sample in enumerate(samples_id):
            try:
                position = rank[date].index(str(sample)) + 1
            except (KeyError, ValueError):
                # No rank list for this day, or the user is not in it:
                # leave a gap in the user's line.
                # NOTE(review): `rank` looks weekly-keyed elsewhere, so
                # most days are probably skipped here -- confirm.
                continue
            tracks[i][0].append(day_offset)
            tracks[i][1].append(position)
        date += timedelta(1)

    for days, positions in tracks:
        plt.plot(days, positions)
    plt.show()