示例#1
0
    def pr_msg_received(self, type='r'):
        """Correlate users' rank positions with next week's event counts.

        For every date key of ``self.log.get_user_date_seq()`` other than
        ``self.log.end_date``, each user of that date who also appears in
        ``rank[date]`` contributes one observation: the user's 1-based
        position in ``rank[date]`` paired with the number of ``type``
        entries in that user's sequence seven days later.  The observations
        of one date are fed to ``stats.spearmanr``.

        Args:
            type: Sequence symbol to count (default ``'r'``).  The name
                shadows the builtin but is kept so keyword callers still work.

        Returns:
            A pair of lists ``(x, y)`` ordered by date: ``x`` holds the
            number of days between the date and ``self.log.start_date``,
            ``y`` the first element of the ``stats.spearmanr`` result.

        Raises:
            KeyError: If a date (or the date one week later) or a user is
                missing from the rank or sequence mappings.
        """
        rank = Stat().get_users_rank()
        seq = self.log.get_user_date_seq()
        end_date = self.log.end_date
        week = timedelta(7)

        spcorr = {}
        for date in seq:
            # The last date has no following week to count events in.
            if date == end_date:
                continue

            weekly_rank = []
            weekly_count = []
            for user in seq[date]:
                if user in rank[date]:
                    weekly_rank.append(rank[date].index(user) + 1)
                    weekly_count.append(
                        Counter(seq[date + week][user])[type])

            # Only correlate dates that produced at least one observation.
            if weekly_rank and weekly_count:
                spcorr[date] = stats.spearmanr(weekly_rank, weekly_count)

        x = []
        y = []
        for date in sorted(spcorr):
            x.append((date - self.log.start_date).days)
            y.append(spcorr[date][0])

        return x, y
示例#2
0
    def freq_pr_corr_weekly(self):
        """Correlate weekly k-gram frequencies with weekly rank movement.

        For every k-gram returned by ``Sequence().create_weekly_sequences()``
        and every week from ``start_date + 7 days`` up to ``end_date``, each
        user in ``rank[date]`` who used the k-gram during the previous week
        contributes one observation:

        * the k-gram count of the previous week, and
        * the difference between the user's current and previous 0-based
          rank index, or ``index + 1`` when the user was not ranked the
          previous week.

        Spearman and Pearson results are stored per k-gram and date, both
        dictionaries are printed, and for each k-gram the coefficients whose
        p-value is below 0.001 are plotted against days since
        ``self.log.start_date``.

        Returns:
            None.  Output is printed and shown through ``plt``.
        """
        rank = Stat().get_users_rank()
        kgram_list, kgram_count = Sequence().create_weekly_sequences()
        week = timedelta(7)

        scorr = {}
        pcorr = {}
        for kgram in kgram_list:
            scorr[kgram] = {}
            pcorr[kgram] = {}
            date = self.log.start_date + week
            while date <= self.log.end_date:
                prev_date = date - week
                lrank = []
                lfreq = []
                for i, user in enumerate(rank[date]):
                    if user not in kgram_count[prev_date]:
                        continue
                    user_counts = kgram_count[prev_date][user]
                    if kgram not in user_counts:
                        continue
                    lfreq.append(user_counts[kgram])
                    if user in rank[prev_date]:
                        lrank.append(i - rank[prev_date].index(user))
                    else:
                        lrank.append(i + 1)

                # A correlation over empty samples is undefined, so weeks
                # without observations are skipped (truthiness, not ``is``).
                if lrank and lfreq:
                    scorr[kgram][date] = stats.spearmanr(lfreq, lrank)
                    pcorr[kgram][date] = stats.pearsonr(lfreq, lrank)

                date += week

        print(scorr)
        print(pcorr)

        def plot_significant(corr, show_scores):
            """Plot, per k-gram, the coefficients with p-value < 0.001."""
            for kgram in kgram_list:
                score = []
                dates = []
                for date in corr[kgram]:
                    coeff = corr[kgram][date][0]
                    p_value = corr[kgram][date][1]
                    # NaN must be tested by value: a computed NaN is never
                    # the ``np.nan`` object itself.
                    if isinstance(coeff, np.float64) \
                            and not np.isnan(coeff) \
                            and p_value < 0.001:
                        dates.append((date - self.log.start_date).days)
                        score.append(coeff)
                if score:
                    print(kgram)
                    if show_scores:
                        print(score)
                    plt.plot(dates, score, '.')
                    plt.show()

        # Only the Spearman pass echoes the selected scores.
        plot_significant(scorr, True)
        plot_significant(pcorr, False)
示例#3
0
    def top_ten_change_in_pr(self):
        """Plot weekly rank-index changes and relate them to weekly events.

        All vertices of the graph at ``self.log.end_date`` are taken as the
        sample, ordered by ascending pagerank value.  For each week from
        ``start_date`` to ``end_date - 7 days`` the change of every sampled
        user's index in the rank list (next week minus this week) is
        recorded, together with its positive-only and non-positive-only
        parts.  A user missing from either week's ranking counts as a
        change of 0.

        The per-user change series are plotted, then the per-week averages
        are correlated (Spearman and Pearson, printed) with the number of
        entries in each week's event snapshot, and finally plotted.  The
        positive and negative averages are divided by ``freq[1:]`` before
        plotting.

        NOTE(review): the division assumes ``event_frequency('msg', 'week')``
        returns one more entry than there are week steps -- confirm.

        Returns:
            None.  Output is printed and shown through ``plt``.
        """
        join_dates = self.log.join_date_since_inception()
        st = Stat()
        pr = st.pagerank()[self.log.end_date]
        graph = st.graphs[self.log.end_date]
        vertex_id = graph.vertex_properties['id']

        id_invert = {}
        for v in graph.vertices():
            id_invert[vertex_id[v]] = int(v)
        print(id_invert)

        pr_dict = {}
        for v in graph.vertices():
            pr_dict[int(vertex_id[v])] = pr[v]

        # Every vertex is sampled, ordered by ascending pagerank.
        samples_pr = sorted(pr_dict.values())
        samples_id = sorted(pr_dict, key=pr_dict.get)
        K = len(samples_id)

        samples_join = [join_dates[str(idx)] for idx in samples_id]

        print(samples_pr)
        print(samples_join)
        print(samples_id)

        rank = st.get_users_rank()
        week = timedelta(7)
        last_week_start = self.log.end_date - week

        date_idx = []
        pr_ev = [[] for _ in range(K)]
        pr_inc = [[] for _ in range(K)]
        pr_dec = [[] for _ in range(K)]

        date = self.log.start_date
        while date <= last_week_start:
            date_idx.append((date - self.log.start_date).days)
            for i, sample in enumerate(samples_id):
                user = str(sample)
                try:
                    change = -rank[date].index(user) + \
                        rank[date + week].index(user)
                except (KeyError, ValueError):
                    # Week missing from the ranking, or user not ranked in
                    # one of the two weeks: treat as "no change".
                    change = 0
                pr_ev[i].append(change)
                pr_inc[i].append(max(change, 0))
                pr_dec[i].append(min(change, 0))
            date += week

        for series in pr_ev:
            plt.plot(date_idx, series)
        plt.show()

        event_snapshot = self.log.get_event_snapshot('week', 'event')
        data = []
        date = self.log.start_date
        while date <= last_week_start:
            data.append(len(event_snapshot[date]))
            date += week

        freq = self.log.event_frequency('msg', 'week')

        prch = np.average(pr_ev, axis=0)
        prchp = np.average(pr_inc, axis=0)
        prchn = np.average(pr_dec, axis=0)
        print(stats.spearmanr(prch, data))
        print(stats.pearsonr(prch, data))
        print(stats.spearmanr(prchp, data))
        print(stats.pearsonr(prchp, data))
        print(stats.spearmanr(prchn, data))
        print(stats.pearsonr(prchn, data))

        # Scale the directional averages by the weekly message frequency.
        weekly_freq = np.array(freq[1:])
        prchp = prchp / weekly_freq
        prchn = prchn / weekly_freq

        plt.plot(list(range(len(prch))), prch)
        plt.show()
        plt.plot(list(range(len(prchp))), prchp)
        plt.show()
        plt.plot(list(range(len(prchn))), np.abs(prchn))
        plt.show()
示例#4
0
    def pr_new_msgs_received(self):
        """Relate rank position to the number of new messages received.

        For every date in the users' rank mapping that also appears in
        ``self.log.get_new_messages_received()``, the 1-based rank positions
        are correlated (``stats.spearmanr``) with the number of new messages
        each ranked user received on that date (0 when the user has none).
        Coefficients below 0.5 are plotted against days since
        ``self.log.start_date``.

        In parallel the message counts are accumulated per rank position
        over all dates; the maximum total is printed and the totals are
        drawn with ``plt.hist``.

        Returns:
            None.  Output is printed and shown through ``plt``.
        """
        new_weekly_msgs = self.log.get_new_messages_received()
        rank = Stat().get_users_rank()

        spcorr = {}
        user_new_msg = {}
        for date in rank:
            has_msgs = date in new_weekly_msgs
            wrank = []
            wnr = []
            for position, user in enumerate(rank[date], 1):
                # Every rank position seen gets a bucket, even on dates
                # without message data.
                user_new_msg.setdefault(position, 0)
                wrank.append(position)
                if not has_msgs:
                    continue
                if user in new_weekly_msgs[date]:
                    received = len(new_weekly_msgs[date][user])
                    user_new_msg[position] += received
                else:
                    received = 0
                wnr.append(received)

            # Lengths match only when the date had message data; an empty
            # ranking has nothing to correlate.
            if wrank and len(wnr) == len(wrank):
                spcorr[date] = stats.spearmanr(wnr, wrank)

        x = []
        y = []
        for date in sorted(spcorr):
            rho = spcorr[date][0]
            if rho < 0.5:
                x.append((date - self.log.start_date).days)
                y.append(rho)

        plt.plot(x, y, 'g')
        plt.xlabel('Day from inception')
        plt.ylabel('Spearman correlation')
        plt.title('Spearman correlation between page rank and number '
                  'of\n new messages received through out time')
        plt.margins(0.05)
        plt.show()

        positions = sorted(user_new_msg)
        totals = [user_new_msg[position] for position in positions]
        print(np.max(totals))

        # NOTE(review): this bins the per-rank totals using the rank
        # positions as bin edges, which does not match the axis labels
        # below -- a bar chart of totals per rank may have been intended.
        # The ``normed`` keyword is omitted: False was its default and the
        # keyword no longer exists in current matplotlib.
        plt.hist(totals, positions)
        plt.title(
            'Total number of new messages received by each rank through out time'
        )
        plt.xlabel('Pagerank')
        plt.ylabel('Number of new messages received')
        plt.margins(0.05)
        plt.show()
示例#5
0
    def ten_top_rank(self, sample_size=20, sample_end=1000):
        """Plot the daily rank position of a window of users.

        Users are ordered by ascending pagerank value at
        ``self.log.end_date`` and the slice
        ``[sample_end - sample_size:sample_end]`` of that ordering is
        sampled.  For each day from ``start_date`` up to (not including)
        ``end_date`` the sampled user's 1-based position in ``rank[date]``
        is recorded; days on which the date or the user is missing from the
        ranking are left out of the plot.

        Args:
            sample_size: Number of users to sample (default 20).
            sample_end: Exclusive end index of the sample window within the
                ascending-pagerank ordering (default 1000).

        Returns:
            None.  Output is printed and shown through ``plt``.
        """
        join_dates = self.log.join_date_since_inception()
        st = Stat()
        pr = st.pagerank()[self.log.end_date]
        graph = st.graphs[self.log.end_date]
        vertex_id = graph.vertex_properties['id']

        id_invert = {}
        for v in graph.vertices():
            id_invert[vertex_id[v]] = int(v)
        print(id_invert)

        pr_dict = {}
        for v in graph.vertices():
            pr_dict[int(vertex_id[v])] = pr[v]

        sorted_pr = sorted(pr_dict.values())
        sorted_id = sorted(pr_dict, key=pr_dict.get)

        # Clamp so an oversized window cannot wrap around to a negative
        # start index.
        sample_start = max(sample_end - sample_size, 0)
        samples_pr = sorted_pr[sample_start:sample_end]
        samples_id = sorted_id[sample_start:sample_end]

        samples_join = [join_dates[str(idx)] for idx in samples_id]

        print(samples_pr)
        print(samples_join)
        print(samples_id)

        rank = st.get_users_rank()
        date_idx = []
        pr_ev = [[] for _ in samples_id]
        date = self.log.start_date
        while date < self.log.end_date:
            date_idx.append((date - self.log.start_date).days)
            for i, sample in enumerate(samples_id):
                try:
                    position = rank[date].index(str(sample)) + 1
                except (KeyError, ValueError):
                    # No ranking for this day, or user not ranked: 0 marks
                    # "no data" and is filtered out before plotting.
                    position = 0
                pr_ev[i].append(position)
            date += timedelta(1)

        for series in pr_ev:
            days = []
            positions = []
            for day, position in zip(date_idx, series):
                if position != 0:
                    days.append(day)
                    positions.append(position)
            plt.plot(days, positions)
        plt.show()