Пример #1
0
    def matthew_effect(self):
        """Fit and plot per-bin weekly PageRank movement against bin position.

        Bin boundaries (``per``) and bin representatives (``rep``) are derived
        via ``self.get_bins(70, ...)`` / ``self.get_rep`` from the
        ``self.cal_log_pr``-transformed PageRank values of the snapshot at
        ``self.log.end_date``.

        Snapshots are then walked in 7-day steps starting at
        ``self.log.start_date`` while the date is before ``self.log.end_date``.
        For every user in a snapshot three per-bin sums are accumulated:

        * ``pch``    -- the change of users whose value did not decrease;
        * ``nch``    -- the absolute change of users whose value decreased; a
          user absent from the next snapshot contributes its whole current
          value here;
        * ``change`` -- the user's current PageRank value.

        Each sum is divided by its own per-bin counter, passed through
        ``self.cal_log_pr``, fitted with a degree-1 polynomial against ``rep``
        and plotted together with the fitted line.  The coefficients are
        printed in the order ``nch``, ``pch``, ``change``; the plots are shown
        in the order ``pch``, ``nch``, ``change``.

        Bins that received no sample have an undefined average (NaN); such
        points are left out of the fit and the scatter plot instead of
        poisoning ``np.polyfit``.
        """
        week = timedelta(7)
        # One query serves both the final snapshot and the weekly walk.
        pr = Stat().user_pagerank()
        final_pr = pr[self.log.end_date]
        log_pr = self.cal_log_pr(final_pr.values())
        per = self.get_bins(70, log_pr)
        rep = self.get_rep(log_pr, per)
        print(rep)

        n_bins = len(rep)
        pch = np.zeros(n_bins)
        pcount = np.zeros(n_bins)
        nch = np.zeros(n_bins)
        ncount = np.zeros(n_bins)
        # NOTE(review): despite its name, `change` accumulates the current
        # PageRank value itself, not a difference -- confirm this is intended.
        change = np.zeros(n_bins)
        count = np.zeros(n_bins)

        date = self.log.start_date
        while date < self.log.end_date:
            next_date = date + week
            current = pr[date]
            following = pr[next_date]
            for user in current:
                score = current[user]
                g = self.get_bin_group(self.cal_log_pr(score), per)
                change[g] += score
                count[g] += 1
                if user not in following:
                    # A user that disappeared loses its whole current value.
                    nch[g] += score
                    ncount[g] += 1
                    continue
                ch = following[user] - score
                if ch >= 0:
                    pch[g] += ch
                    pcount[g] += 1
                elif ch < 0:
                    nch[g] += abs(ch)
                    ncount[g] += 1
            date = next_date

        def mean_per_bin(total, samples):
            # Bins without samples stay NaN rather than dividing 0 by 0.
            mean = np.full(len(total), np.nan)
            filled = samples > 0
            mean[filled] = total[filled] / samples[filled]
            return mean

        nch = self.cal_log_pr(mean_per_bin(nch, ncount))
        pch = self.cal_log_pr(mean_per_bin(pch, pcount))
        change = self.cal_log_pr(mean_per_bin(change, count))

        rep_arr = np.asarray(rep, dtype=float)
        xp = np.linspace(np.min(rep_arr), np.max(rep_arr), 100)

        def fit_line(values):
            # Keep only the points where both coordinates are finite, then
            # fit a straight line through them (computed once, reused for
            # printing and plotting).
            values = np.asarray(values, dtype=float)
            usable = np.isfinite(rep_arr) & np.isfinite(values)
            xs = rep_arr[usable]
            ys = values[usable]
            return xs, ys, np.polyfit(xs, ys, 1)

        nch_fit = fit_line(nch)
        pch_fit = fit_line(pch)
        change_fit = fit_line(change)

        for _, _, coeffs in (nch_fit, pch_fit, change_fit):
            print(coeffs)

        for xs, ys, coeffs in (pch_fit, nch_fit, change_fit):
            plt.plot(xs, ys, '.', xp, np.poly1d(coeffs)(xp), '-')
            plt.show()
Пример #2
0
    def pr_bin_dist(self):
        """Plot how users are distributed over PageRank bins at the end date.

        Bin boundaries (``per``) and representatives (``rep``) come from
        ``self.get_bins(70, ...)`` / ``self.get_rep`` applied to the
        ``self.cal_log_pr``-transformed PageRank values of the snapshot at
        ``self.log.end_date``.  Every user of that snapshot is counted in the
        bin returned by ``self.get_bin_group``, and a running total of the
        counts is built alongside.

        Two figures are produced, each a scatter plot plus a degree-1
        polynomial fit whose coefficients are printed:

        1. ``self.cal_log_pr`` of the per-bin counts against ``rep``;
        2. ``1 - self.cal_log_pr(running total)`` against ``rep``.

        Both series go through ``self.clean_lists`` before fitting.
        """
        st = Stat()
        final_pr = st.user_pagerank()[self.log.end_date]
        log_pr = self.cal_log_pr(final_pr.values())
        per = self.get_bins(70, log_pr)
        rep = self.get_rep(log_pr, per)

        count = np.zeros(len(rep))
        for pr in final_pr.values():
            count[self.get_bin_group(self.cal_log_pr(pr), per)] += 1

        # Running total of the bin counts.  The previous hand-written loop
        # lacked an `else`, so with a single bin cdf[-1] aliased cdf[0] and
        # the first count was added twice.
        cdf = np.cumsum(count)

        print(cdf)
        count = self.cal_log_pr(count)
        cdf = self.cal_log_pr(cdf)

        count, rep_cleaned = self.clean_lists(count, rep)
        coeffs = np.polyfit(rep_cleaned, count, 1)
        print(coeffs)
        xp = np.linspace(np.min(rep_cleaned), np.max(rep_cleaned), 100)
        plt.plot(rep_cleaned, count, '.', xp, np.poly1d(coeffs)(xp), '-')
        plt.show()

        cdf, rep_cleaned = self.clean_lists(cdf, rep)
        # NOTE(review): `cdf` is the cal_log_pr-transformed cumulative *count*
        # (never normalised) -- confirm that `1 - cdf` is the intended series.
        tail = 1 - cdf
        coeffs = np.polyfit(rep_cleaned, tail, 1)
        print(coeffs)
        # Recompute the x-range: clean_lists may keep different bins for the
        # cumulative series than it kept for the counts above.
        xp = np.linspace(np.min(rep_cleaned), np.max(rep_cleaned), 100)
        plt.plot(rep_cleaned, tail, '.', xp, np.poly1d(coeffs)(xp), '-')
        plt.show()
Пример #3
0
	def __create_cumulative_design_matrix__(self, k=5):
		"""Build a k-gram count matrix and the matching PageRank target vector.

		``self.create_k_grams(k)`` supplies the list of k-grams and a mapping
		``user -> {gram: count}``.  Users are ordered by the integer value of
		their id; row ``i`` of ``X`` holds that user's count for every k-gram
		(columns follow the order of the k-gram list) and ``y[i]`` holds the
		user's PageRank in the snapshot at ``self.log.end_date``.

		A user that is absent from that snapshot is printed once and its row
		in ``X`` and entry in ``y`` are left at zero.

		Returns:
			``(X, y)`` -- ``X`` has shape ``(n_users, n_kgrams)`` and ``y``
			has length ``n_users``.
		"""
		kgram_list, kgram_count = self.create_k_grams(k)

		n = len(kgram_count)
		X = np.zeros((n, len(kgram_list)))
		y = np.zeros(n)
		pr = Stat().user_pagerank()[self.log.end_date]

		sorted_ids = sorted(kgram_count, key=int)
		for i, user in enumerate(sorted_ids):
			# Decide once per user, not once per k-gram: a missing user is
			# reported a single time and skipped as a whole.
			if user not in pr:
				print(user)
				continue
			user_counts = kgram_count[user]
			for j, gram in enumerate(kgram_list):
				X[i, j] = user_counts[gram]
			y[i] = pr[user]

		return X, y