Python getDistri示例，utility.getDistri Python示例

示例#1

0

显示文件

文件： feature.py 项目： imsorry1121/sn_crawler

def _most_visited_places(place_num):
	"""Return the set of places whose value in ``place_num`` is the maximum value."""
	if not place_num:
		return set()
	top = max(place_num.values())
	return {place for place, num in place_num.items() if num == top}


def _avg_pairwise_distance(places):
	"""Sum ``calDistance`` over every ordered pair in ``places`` and divide by n*(n-1)/2.

	The caller must pass at least two places, otherwise the divisor is zero.
	"""
	total = float()
	for p1 in places:
		for p2 in places:
			total += calDistance(p1, p2)
	return total/((len(places)*(len(places)-1))/2)


def calSpatialScore(place_seq1, place_seq2):
	"""Compare two sequences of visited places and return four spatial scores.

	Args:
		place_seq1: sequence of hashable place identifiers for the first user.
		place_seq2: sequence of hashable place identifiers for the second user.

	Returns:
		``[mvp, cp, pd, avgdRatio]``, or ``[0, 0, 0, 0]`` if either sequence
		is empty:
		  mvp       -- 1 if the two users share at least one most-visited
		               place, else 0.
		  cp        -- number of distinct places in common divided by the
		               number of distinct places visited by either user.
		  pd        -- ``calKLDivergence`` of the two ``ut.getDistri`` results.
		  avgdRatio -- larger pairwise-distance average divided by the smaller
		               one; 1 when either user has a single distinct place.

	Raises:
		ZeroDivisionError: if the smaller pairwise-distance average is 0.
	"""
	if len(place_seq1)==0 or len(place_seq2)==0:
		return [0]*4
	places1 = set(place_seq1)
	places2 = set(place_seq2)
	# assumes ut.getDistri returns a dict mapping place -> count/frequency -- TODO confirm
	place_num1 = ut.getDistri(place_seq1)
	place_num2 = ut.getDistri(place_seq2)

	# Original author's note: places that agree to the second decimal place
	# count as the same place (no rounding is performed in this function).

	# most visited place
	# The previous code compared each count with the *key* returned by
	# max(d, key=d.get) and combined the sets with `and`, which is not an
	# intersection; use the top count and a real set intersection instead.
	mvp = 1 if _most_visited_places(place_num1) & _most_visited_places(place_num2) else 0
	# common place
	# `places1 and places2` / `places1 or places2` just picked one operand;
	# the ratio needs the set intersection over the set union.
	cp = len(places1 & places2)/len(places1 | places2)
	# place divergence
	pd = calKLDivergence(place_num1, place_num2)
	# average distance ratio
	if len(places1)==1 or len(places2)==1:
		avgdRatio = 1
	else:
		avg_distance1 = _avg_pairwise_distance(places1)
		avg_distance2 = _avg_pairwise_distance(places2)
		avgdRatio = max(avg_distance1, avg_distance2)/min(avg_distance1, avg_distance2)
	return [mvp, cp, pd, avgdRatio]

示例#2

0

显示文件

文件： preprocess.py 项目： imsorry1121/sn_crawler

def structUserData(sn, uid):
	"""Normalize one user's profile and wall, store both, and summarize the wall.

	Args:
		sn: social-network name; used as a path component under ``inputPath``
		    and ``interPath``.
		uid: user id; used as the file identifier for reads and writes.

	Returns:
		Tuple ``(userTf, langDistri, sentiment_score, userTopicDistri)``:
		merged per-post "tf" dicts, distribution of post "lang" values, mean
		of the post sentiment polarities (0 when there are no posts), and the
		merged then normalized per-post "topic_distri" dicts.
	"""
	print(uid)
	# Load the raw profile and wall for this user.
	raw_profile = ut.readJson2Dict(inputPath+sn+"/profile/", uid)
	raw_posts = ut.readJson2Dict(inputPath+sn+"/wall/", uid)

	# Normalize both, announcing the output location before each step.
	print("profile:"+interPath+sn+"/profile/"+uid)
	norm_profile = normProfile(sn, raw_profile)
	print("wall:"+interPath+sn+"/wall/"+uid)
	norm_posts = normWall(sn, raw_posts)

	# Persist the normalized data under the intermediate path.
	ut.writeDict2Json(interPath+sn+"/profile/", uid, norm_profile)
	ut.writeDict2Json(interPath+sn+"/wall/", uid, norm_posts)

	# Wall statistics: language distribution.
	lang_distri = ut.getDistri([p["lang"] for p in norm_posts])

	# Mean sentiment polarity over all posts.
	polarities = [p["sentiment"]["polarity"] for p in norm_posts]
	if polarities:
		mean_polarity = sum(polarities)/len(polarities)
	else:
		mean_polarity = 0

	# Topic distribution: merge the per-post dicts, then normalize.
	topic_distri = ut.normVector(ut.mergeDict([p["topic_distri"] for p in norm_posts]))

	# Term frequencies merged across posts.
	user_tf = ut.mergeDict([p["tf"] for p in norm_posts])
	return user_tf, lang_distri, mean_polarity, topic_distri

示例#3

0

显示文件

文件： preprocess.py 项目： sychen1121/sn_crawler

def structUserData(sn, uid):
    """Normalize and store a user's profile and wall, then aggregate wall stats.

    Args:
        sn: social-network name; used as a path component under
            ``inputPath`` and ``interPath``.
        uid: user id; used as the file identifier for reads and writes.

    Returns:
        Tuple ``(userTf, langDistri, sentiment_score, userTopicDistri)``:
        merged per-post "tf" dicts, distribution of post "lang" values, mean
        of the post sentiment polarities (0 when there are no posts), and the
        merged then normalized per-post "topic_distri" dicts.
    """
    print(uid)
    # Raw input for this user.
    raw_profile = ut.readJson2Dict(inputPath + sn + "/profile/", uid)
    raw_wall = ut.readJson2Dict(inputPath + sn + "/wall/", uid)

    # Normalize, logging the destination before each step.
    print("profile:" + interPath + sn + "/profile/" + uid)
    clean_profile = normProfile(sn, raw_profile)
    print("wall:" + interPath + sn + "/wall/" + uid)
    clean_wall = normWall(sn, raw_wall)

    # Write the normalized results to the intermediate directory.
    ut.writeDict2Json(interPath + sn + "/profile/", uid, clean_profile)
    ut.writeDict2Json(interPath + sn + "/wall/", uid, clean_wall)

    # Language distribution over the wall.
    languages = [entry["lang"] for entry in clean_wall]
    lang_stats = ut.getDistri(languages)

    # Average sentiment polarity; defaults to 0 for an empty wall.
    scores = [entry["sentiment"]["polarity"] for entry in clean_wall]
    avg_sentiment = 0
    if scores:
        avg_sentiment = sum(scores) / len(scores)

    # Merge the per-post topic distributions and normalize the result.
    merged_topics = ut.mergeDict([entry["topic_distri"] for entry in clean_wall])
    topic_stats = ut.normVector(merged_topics)

    # Merge the per-post term frequencies.
    term_freqs = ut.mergeDict([entry["tf"] for entry in clean_wall])
    return term_freqs, lang_stats, avg_sentiment, topic_stats

示例#4

0

显示文件

def _most_visited_places(place_num):
    """Return the set of places whose value in ``place_num`` is the maximum value."""
    if not place_num:
        return set()
    top = max(place_num.values())
    return {place for place, num in place_num.items() if num == top}


def _avg_pairwise_distance(places):
    """Sum ``calDistance`` over every ordered pair in ``places`` and divide by n*(n-1)/2.

    The caller must pass at least two places, otherwise the divisor is zero.
    """
    total = float()
    for p1 in places:
        for p2 in places:
            total += calDistance(p1, p2)
    return total / ((len(places) * (len(places) - 1)) / 2)


def calSpatialScore(place_seq1, place_seq2):
    """Compare two sequences of visited places and return four spatial scores.

    Args:
        place_seq1: sequence of hashable place identifiers for the first user.
        place_seq2: sequence of hashable place identifiers for the second user.

    Returns:
        ``[mvp, cp, pd, avgdRatio]``, or ``[0, 0, 0, 0]`` if either sequence
        is empty:
          mvp       -- 1 if the two users share at least one most-visited
                       place, else 0.
          cp        -- number of distinct places in common divided by the
                       number of distinct places visited by either user.
          pd        -- ``calKLDivergence`` of the two ``ut.getDistri`` results.
          avgdRatio -- larger pairwise-distance average divided by the smaller
                       one; 1 when either user has a single distinct place.

    Raises:
        ZeroDivisionError: if the smaller pairwise-distance average is 0.
    """
    if len(place_seq1) == 0 or len(place_seq2) == 0:
        return [0] * 4
    places1 = set(place_seq1)
    places2 = set(place_seq2)
    # assumes ut.getDistri returns a dict mapping place -> count/frequency -- TODO confirm
    place_num1 = ut.getDistri(place_seq1)
    place_num2 = ut.getDistri(place_seq2)

    # Original author's note: places that agree to the second decimal place
    # count as the same place (no rounding is performed in this function).

    # most visited place
    # The previous code compared each count with the *key* returned by
    # max(d, key=d.get) and combined the sets with `and`, which is not an
    # intersection; use the top count and a real set intersection instead.
    shared_top = _most_visited_places(place_num1) & _most_visited_places(
        place_num2)
    mvp = 1 if shared_top else 0
    # common place
    # `places1 and places2` / `places1 or places2` just picked one operand;
    # the ratio needs the set intersection over the set union.
    cp = len(places1 & places2) / len(places1 | places2)
    # place divergence
    pd = calKLDivergence(place_num1, place_num2)
    # average distance ratio
    if len(places1) == 1 or len(places2) == 1:
        avgdRatio = 1
    else:
        avg_distance1 = _avg_pairwise_distance(places1)
        avg_distance2 = _avg_pairwise_distance(places2)
        avgdRatio = max(avg_distance1, avg_distance2) / min(
            avg_distance1, avg_distance2)
    return [mvp, cp, pd, avgdRatio]

示例#5

0

显示文件

文件： feature.py 项目： imsorry1121/sn_crawler

def temp_spatial_divergence(temp_spatial_list1, temp_spatial_list2, total):
	"""Sum the per-key KL divergences between two place mappings, divided by ``total``.

	For every key of ``temp_spatial_list1`` the distribution of its value is
	compared (via ``calKLDivergence``) with the distribution of the value
	stored under the same key in ``temp_spatial_list2``; a key missing from
	the second mapping is treated as an empty list.

	Args:
		temp_spatial_list1: dict whose values are passed to ``ut.getDistri``.
		temp_spatial_list2: dict looked up with the keys of the first one.
		total: divisor applied to the accumulated divergence.

	Returns:
		The accumulated divergence divided by ``total``.
	"""
	divergence_sum = 0.0
	for slot, places1 in temp_spatial_list1.items():
		distri1 = ut.getDistri(places1)
		distri2 = ut.getDistri(temp_spatial_list2.get(slot, []))
		divergence_sum = divergence_sum + calKLDivergence(distri1, distri2)
	return divergence_sum/total

示例#6

0

显示文件

文件： feature.py 项目： imsorry1121/sn_crawler

def _time_bucket_keys(times):
	"""Build the bucket keys of every granularity for a sequence of time structs.

	Each element of ``times`` must expose ``tm_year``, ``tm_yday``, ``tm_hour``,
	``tm_min`` and ``tm_mon`` (as ``time.struct_time`` does).

	Returns:
		Seven parallel lists of string keys, in this order: 10-minute slot,
		hour, day, week and month (each prefixed with the year), then hour of
		day and month of year.
	"""
	mins, hrs, days, weeks, mons, by_hr, by_mon = [], [], [], [], [], [], []
	for t in times:
		year = str(t.tm_year)
		yday = str(t.tm_yday)
		hour = str(t.tm_hour)
		# Floor division is required here: with true division the key would
		# contain a fraction (e.g. "3.4") and no two minutes/days would ever
		# share a 10-minute or week bucket.
		mins.append(year+yday+hour+str(int(t.tm_min)//10))
		hrs.append(year+yday+hour)
		days.append(year+yday)
		weeks.append(year+str(int(t.tm_yday)//7))
		mons.append(year+str(t.tm_mon))
		by_hr.append(hour)
		by_mon.append(str(t.tm_mon))
	return mins, hrs, days, weeks, mons, by_hr, by_mon


def calTemporalScore(times1, times2, post_with_place_index1, post_with_place_index2, posts1, posts2):
	"""Compute temporal and spatial-temporal similarity scores for two users.

	Args:
		times1, times2: sequences of time structs (``tm_year``, ``tm_yday``,
			``tm_hour``, ``tm_min``, ``tm_mon`` attributes) for each user.
		post_with_place_index1, post_with_place_index2: passed through to
			``temp_spatial_distri`` together with the posts.
		posts1, posts2: the users' posts, passed to the ``temp_*_distri`` helpers.

	Returns:
		A list of 15 scores: 7 KL divergences of posting-time distributions
		(10-minute, hour, day, week, month, hour of day, month of year)
		followed by 8 spatial-temporal scores (5 common-place scores, 2 place
		divergences, 1 distance score). ``[0]*15`` when ``times1`` is
		non-empty and ``times2`` is empty.
	"""
	# temporal features
	# NOTE(review): this only short-circuits when times1 is non-empty and
	# times2 is empty; "either one is empty" may have been intended -- confirm.
	if len(times1)>0 and len(times2)<=0:
		return [0]*15
	# times1 and times2
	mins1, hrs1, days1, weeks1, mons1, by_hr1, by_mon1 = _time_bucket_keys(times1)
	mins2, hrs2, days2, weeks2, mons2, by_hr2, by_mon2 = _time_bucket_keys(times2)

	td_10min = calKLDivergence(ut.getDistri(mins1), ut.getDistri(mins2))
	td_hr = calKLDivergence(ut.getDistri(hrs1), ut.getDistri(hrs2))
	td_day = calKLDivergence(ut.getDistri(days1), ut.getDistri(days2))
	td_week = calKLDivergence(ut.getDistri(weeks1), ut.getDistri(weeks2))
	td_mon = calKLDivergence(ut.getDistri(mons1), ut.getDistri(mons2))
	td_by_hr = calKLDivergence(ut.getDistri(by_hr1), ut.getDistri(by_hr2))
	td_by_mon = calKLDivergence(ut.getDistri(by_mon1), ut.getDistri(by_mon2))

	# spatial temporal features
	temp_spatial1_hr, temp_spatial1_day, temp_spatial1_week, temp_spatial1_by_hr, temp_spatial1_by_mon = temp_spatial_distri(posts1, post_with_place_index1, weeks1, days1, hrs1, by_hr1, by_mon1)
	temp_spatial2_hr, temp_spatial2_day, temp_spatial2_week, temp_spatial2_by_hr, temp_spatial2_by_mon = temp_spatial_distri(posts2, post_with_place_index2, weeks2, days2, hrs2, by_hr2, by_mon2)

	cp_hr = temp_spatial_common_place(temp_spatial1_hr, temp_spatial2_hr)
	cp_day = temp_spatial_common_place(temp_spatial1_day, temp_spatial2_day)
	# This score is built from the per-week data (it was previously named cp_mon).
	cp_week = temp_spatial_common_place(temp_spatial1_week, temp_spatial2_week)
	cp_by_hr = temp_spatial_common_place(temp_spatial1_by_hr, temp_spatial2_by_hr)
	cp_by_mon = temp_spatial_common_place(temp_spatial1_by_mon, temp_spatial2_by_mon)
	pd_by_hr = temp_spatial_divergence(temp_spatial1_by_hr, temp_spatial2_by_hr, 24)
	# NOTE(review): the divisor 24 matches the hour-of-day case above; for
	# month-of-year buckets 12 may have been intended -- confirm before changing.
	pd_by_mon = temp_spatial_divergence(temp_spatial1_by_mon, temp_spatial2_by_mon, 24)
	avgd_hr = temp_spatial_distance(temp_spatial1_hr, temp_spatial2_hr)

	# content temporal features: sentiment and topic comparison
	# These are still computed but are not part of the returned score vector.
	temp_sentiment1_hr, temp_sentiment1_day, temp_sentiment1_week, temp_sentiment1_mon = temp_sentiment_distri(posts1, mons1, weeks1, days1, hrs1)
	temp_sentiment2_hr, temp_sentiment2_day, temp_sentiment2_week, temp_sentiment2_mon = temp_sentiment_distri(posts2, mons2, weeks2, days2, hrs2)
	temp_topic_distri1_hr, temp_topic_distri1_day, temp_topic_distri1_week, temp_topic_distri1_mon = temp_topic_distri(posts1, mons1, weeks1, days1)
	temp_topic_distri2_hr, temp_topic_distri2_day, temp_topic_distri2_week, temp_topic_distri2_mon = temp_topic_distri(posts2, mons2, weeks2, days2)

	senti_sim_hr = calTempSentimentSim(temp_sentiment1_hr, temp_sentiment2_hr)
	senti_sim_day = calTempSentimentSim(temp_sentiment1_day, temp_sentiment2_day)
	senti_sim_week = calTempSentimentSim(temp_sentiment1_week, temp_sentiment2_week)
	senti_sim_mon = calTempSentimentSim(temp_sentiment1_mon, temp_sentiment2_mon)

	topic_d_day = calTempTopicDivergence(temp_topic_distri1_day, temp_topic_distri2_day)
	topic_d_week = calTempTopicDivergence(temp_topic_distri1_week, temp_topic_distri2_week)
	topic_d_mon = calTempTopicDivergence(temp_topic_distri1_mon, temp_topic_distri2_mon)

	# all scores
	scores_temp = [td_10min, td_hr, td_day, td_week, td_mon, td_by_hr, td_by_mon]
	scores_spatial_temp = [cp_hr, cp_day, cp_week, cp_by_hr, cp_by_mon, pd_by_hr, pd_by_mon, avgd_hr]
	scores_content_temp = [senti_sim_hr, senti_sim_day, senti_sim_week, senti_sim_mon, topic_d_day, topic_d_week, topic_d_mon]

	return scores_temp + scores_spatial_temp

示例#7

0

显示文件

def _time_bucket_keys(times):
    """Build the bucket keys of every granularity for a sequence of time structs.

    Each element of ``times`` must expose ``tm_year``, ``tm_yday``,
    ``tm_hour``, ``tm_min`` and ``tm_mon`` (as ``time.struct_time`` does).

    Returns:
        Seven parallel lists of string keys, in this order: 10-minute slot,
        hour, day, week and month (each prefixed with the year), then hour of
        day and month of year.
    """
    mins, hrs, days, weeks, mons, by_hr, by_mon = [], [], [], [], [], [], []
    for t in times:
        year = str(t.tm_year)
        yday = str(t.tm_yday)
        hour = str(t.tm_hour)
        # Floor division is required here: with true division the key would
        # contain a fraction (e.g. "3.4") and no two minutes/days would ever
        # share a 10-minute or week bucket.
        mins.append(year + yday + hour + str(int(t.tm_min) // 10))
        hrs.append(year + yday + hour)
        days.append(year + yday)
        weeks.append(year + str(int(t.tm_yday) // 7))
        mons.append(year + str(t.tm_mon))
        by_hr.append(hour)
        by_mon.append(str(t.tm_mon))
    return mins, hrs, days, weeks, mons, by_hr, by_mon


def calTemporalScore(times1, times2, post_with_place_index1,
                     post_with_place_index2, posts1, posts2):
    """Compute temporal and spatial-temporal similarity scores for two users.

    Args:
        times1, times2: sequences of time structs (``tm_year``, ``tm_yday``,
            ``tm_hour``, ``tm_min``, ``tm_mon`` attributes) for each user.
        post_with_place_index1, post_with_place_index2: passed through to
            ``temp_spatial_distri`` together with the posts.
        posts1, posts2: the users' posts, passed to the ``temp_*_distri``
            helpers.

    Returns:
        A list of 15 scores: 7 KL divergences of posting-time distributions
        (10-minute, hour, day, week, month, hour of day, month of year)
        followed by 8 spatial-temporal scores (5 common-place scores, 2 place
        divergences, 1 distance score). ``[0] * 15`` when ``times1`` is
        non-empty and ``times2`` is empty.
    """
    # temporal features
    # NOTE(review): this only short-circuits when times1 is non-empty and
    # times2 is empty; "either one is empty" may have been intended -- confirm.
    if len(times1) > 0 and len(times2) <= 0:
        return [0] * 15
    # times1 and times2
    (mins1, hrs1, days1, weeks1, mons1, by_hr1,
     by_mon1) = _time_bucket_keys(times1)
    (mins2, hrs2, days2, weeks2, mons2, by_hr2,
     by_mon2) = _time_bucket_keys(times2)

    td_10min = calKLDivergence(ut.getDistri(mins1), ut.getDistri(mins2))
    td_hr = calKLDivergence(ut.getDistri(hrs1), ut.getDistri(hrs2))
    td_day = calKLDivergence(ut.getDistri(days1), ut.getDistri(days2))
    td_week = calKLDivergence(ut.getDistri(weeks1), ut.getDistri(weeks2))
    td_mon = calKLDivergence(ut.getDistri(mons1), ut.getDistri(mons2))
    td_by_hr = calKLDivergence(ut.getDistri(by_hr1), ut.getDistri(by_hr2))
    td_by_mon = calKLDivergence(ut.getDistri(by_mon1), ut.getDistri(by_mon2))

    # spatial temporal features
    temp_spatial1_hr, temp_spatial1_day, temp_spatial1_week, temp_spatial1_by_hr, temp_spatial1_by_mon = temp_spatial_distri(
        posts1, post_with_place_index1, weeks1, days1, hrs1, by_hr1, by_mon1)
    temp_spatial2_hr, temp_spatial2_day, temp_spatial2_week, temp_spatial2_by_hr, temp_spatial2_by_mon = temp_spatial_distri(
        posts2, post_with_place_index2, weeks2, days2, hrs2, by_hr2, by_mon2)

    cp_hr = temp_spatial_common_place(temp_spatial1_hr, temp_spatial2_hr)
    cp_day = temp_spatial_common_place(temp_spatial1_day, temp_spatial2_day)
    # This score is built from the per-week data (it was previously named
    # cp_mon).
    cp_week = temp_spatial_common_place(temp_spatial1_week,
                                        temp_spatial2_week)
    cp_by_hr = temp_spatial_common_place(temp_spatial1_by_hr,
                                         temp_spatial2_by_hr)
    cp_by_mon = temp_spatial_common_place(temp_spatial1_by_mon,
                                          temp_spatial2_by_mon)
    pd_by_hr = temp_spatial_divergence(temp_spatial1_by_hr,
                                       temp_spatial2_by_hr, 24)
    # NOTE(review): the divisor 24 matches the hour-of-day case above; for
    # month-of-year buckets 12 may have been intended -- confirm before
    # changing.
    pd_by_mon = temp_spatial_divergence(temp_spatial1_by_mon,
                                        temp_spatial2_by_mon, 24)
    avgd_hr = temp_spatial_distance(temp_spatial1_hr, temp_spatial2_hr)

    # content temporal features: sentiment and topic comparison
    # These are still computed but are not part of the returned score vector.
    temp_sentiment1_hr, temp_sentiment1_day, temp_sentiment1_week, temp_sentiment1_mon = temp_sentiment_distri(
        posts1, mons1, weeks1, days1, hrs1)
    temp_sentiment2_hr, temp_sentiment2_day, temp_sentiment2_week, temp_sentiment2_mon = temp_sentiment_distri(
        posts2, mons2, weeks2, days2, hrs2)
    temp_topic_distri1_hr, temp_topic_distri1_day, temp_topic_distri1_week, temp_topic_distri1_mon = temp_topic_distri(
        posts1, mons1, weeks1, days1)
    temp_topic_distri2_hr, temp_topic_distri2_day, temp_topic_distri2_week, temp_topic_distri2_mon = temp_topic_distri(
        posts2, mons2, weeks2, days2)

    senti_sim_hr = calTempSentimentSim(temp_sentiment1_hr, temp_sentiment2_hr)
    senti_sim_day = calTempSentimentSim(temp_sentiment1_day,
                                        temp_sentiment2_day)
    senti_sim_week = calTempSentimentSim(temp_sentiment1_week,
                                         temp_sentiment2_week)
    senti_sim_mon = calTempSentimentSim(temp_sentiment1_mon,
                                        temp_sentiment2_mon)

    topic_d_day = calTempTopicDivergence(temp_topic_distri1_day,
                                         temp_topic_distri2_day)
    topic_d_week = calTempTopicDivergence(temp_topic_distri1_week,
                                          temp_topic_distri2_week)
    topic_d_mon = calTempTopicDivergence(temp_topic_distri1_mon,
                                         temp_topic_distri2_mon)

    # all scores
    scores_temp = [
        td_10min, td_hr, td_day, td_week, td_mon, td_by_hr, td_by_mon
    ]
    scores_spatial_temp = [
        cp_hr, cp_day, cp_week, cp_by_hr, cp_by_mon, pd_by_hr, pd_by_mon,
        avgd_hr
    ]
    scores_content_temp = [
        senti_sim_hr, senti_sim_day, senti_sim_week, senti_sim_mon,
        topic_d_day, topic_d_week, topic_d_mon
    ]

    return scores_temp + scores_spatial_temp

示例#8

0

显示文件

def temp_spatial_divergence(temp_spatial_list1, temp_spatial_list2, total):
    """Accumulate per-key KL divergences of two place mappings and scale by ``total``.

    For every key of ``temp_spatial_list1`` the distribution of its value is
    compared (via ``calKLDivergence``) with the distribution of the value
    stored under the same key in ``temp_spatial_list2``; a key missing from
    the second mapping is treated as an empty list.

    Args:
        temp_spatial_list1: dict whose values are passed to ``ut.getDistri``.
        temp_spatial_list2: dict looked up with the keys of the first one.
        total: divisor applied to the accumulated divergence.

    Returns:
        The accumulated divergence divided by ``total``.
    """
    accumulated = 0.0
    for key, first_places in temp_spatial_list1.items():
        first_distri = ut.getDistri(first_places)
        second_distri = ut.getDistri(temp_spatial_list2.get(key, []))
        accumulated = accumulated + calKLDivergence(first_distri,
                                                    second_distri)
    return accumulated / total