def _orderedPairDistanceSum(places):
    """Sum calDistance over every ordered pair drawn from *places* (self-pairs included)."""
    total = float()
    for p1 in places:
        for p2 in places:
            total += calDistance(p1, p2)
    return total


def calSpatialScore(place_seq1, place_seq2):
    """Compare two sequences of visited places and return four spatial scores.

    Args:
        place_seq1: Sequence of hashable places for the first user.
        place_seq2: Sequence of hashable places for the second user.

    Returns:
        A list ``[mvp, cp, pd, avgdRatio]``:
          * mvp: 1 if the two users share at least one most-visited place,
            otherwise 0.
          * cp: number of places visited by both users divided by the number
            of distinct places visited by either (Jaccard ratio).
          * pd: ``calKLDivergence`` of the two ``ut.getDistri`` results.
          * avgdRatio: the larger average pairwise distance divided by the
            smaller one, or 1 when either user has a single distinct place.
        ``[0, 0, 0, 0]`` is returned when either sequence is empty.

    Raises:
        ZeroDivisionError: if the smaller of the two average distances is 0.
    """
    if len(place_seq1) == 0 or len(place_seq2) == 0:
        return [0] * 4

    places1 = set(place_seq1)
    places2 = set(place_seq2)
    place_num1 = ut.getDistri(place_seq1)
    place_num2 = ut.getDistri(place_seq2)

    # Original author's note: places that agree to the second decimal place
    # count as the same place.  NOTE(review): no rounding happens in this
    # function, so presumably the caller normalises places beforehand.

    # Most visited place.  The highest *value* of the distribution is needed
    # here; max(d, key=d.get) would return the place itself, not its count.
    top_count1 = max(place_num1.values())
    top_count2 = max(place_num2.values())
    most_visited_places1 = set(
        place for place, num in place_num1.items() if num == top_count1)
    most_visited_places2 = set(
        place for place, num in place_num2.items() if num == top_count2)
    # `&` is set intersection; the boolean `and` would merely return the
    # second set whenever the first one is non-empty.
    mvp = 1 if most_visited_places1 & most_visited_places2 else 0

    # Common place: shared places over all distinct places.  float() keeps
    # the ratio fractional even where `/` is integer division.
    cp = float(len(places1 & places2)) / len(places1 | places2)

    # Place divergence.
    pd = calKLDivergence(place_num1, place_num2)

    # Average distance ratio.
    if len(places1) == 1 or len(places2) == 1:
        avgdRatio = 1
    else:
        avg_distance1 = _orderedPairDistanceSum(places1)
        avg_distance2 = _orderedPairDistanceSum(places2)
        # Normalised by the number of unordered pairs, n * (n - 1) / 2.
        avg_distance1 /= (len(places1) * (len(places1) - 1)) / 2
        avg_distance2 /= (len(places2) * (len(places2) - 1)) / 2
        avgdRatio = (max(avg_distance1, avg_distance2) /
                     min(avg_distance1, avg_distance2))

    return [mvp, cp, pd, avgdRatio]
def structUserData(sn, uid):
    """Normalise one user's profile and wall, store them, and summarise the wall.

    The raw profile and wall are read from under ``inputPath``, passed through
    ``normProfile`` / ``normWall``, and written back under ``interPath``.

    Args:
        sn: Social-network name used as a path component.
        uid: User identifier used as the file name and in the log lines.

    Returns:
        A tuple ``(userTf, langDistri, sentiment_score, userTopicDistri)``
        built from the normalised posts: merged per-post ``"tf"`` dicts, the
        ``ut.getDistri`` of the post languages, the mean sentiment polarity
        (0 when there are no posts), and the merged then normalised topic
        distribution.
    """
    print(uid)

    # Load the raw data first, then normalise profile and wall in turn.
    rawProfile = ut.readJson2Dict(inputPath + sn + "/profile/", uid)
    rawPosts = ut.readJson2Dict(inputPath + sn + "/wall/", uid)

    print("profile:" + interPath + sn + "/profile/" + uid)
    cleanProfile = normProfile(sn, rawProfile)
    print("wall:" + interPath + sn + "/wall/" + uid)
    cleanPosts = normWall(sn, rawPosts)

    # Persist the normalised versions.
    ut.writeDict2Json(interPath + sn + "/profile/", uid, cleanProfile)
    ut.writeDict2Json(interPath + sn + "/wall/", uid, cleanPosts)

    # Wall statistics: language distribution.
    languages = [entry["lang"] for entry in cleanPosts]
    langDistri = ut.getDistri(languages)

    # Mean sentiment polarity over all posts.
    polarities = [entry["sentiment"]["polarity"] for entry in cleanPosts]
    if len(polarities) > 0:
        sentiment_score = sum(polarities) / len(polarities)
    else:
        sentiment_score = 0

    # Topic distribution: merge the per-post distributions, then normalise.
    perPostTopics = [entry["topic_distri"] for entry in cleanPosts]
    userTopicDistri = ut.normVector(ut.mergeDict(perPostTopics))

    # Term frequencies merged over all posts.
    perPostTf = [entry["tf"] for entry in cleanPosts]
    userTf = ut.mergeDict(perPostTf)

    return (userTf, langDistri, sentiment_score, userTopicDistri)
def structUserData(sn, uid):
    """Normalise one user's profile and wall, store them, and summarise the wall.

    The raw profile and wall are read from under ``inputPath``, passed through
    ``normProfile`` / ``normWall``, and written back under ``interPath``.

    Args:
        sn: Social-network name used as a path component.
        uid: User identifier used as the file name and in the log lines.

    Returns:
        A tuple ``(userTf, langDistri, sentiment_score, userTopicDistri)``
        built from the normalised posts: merged per-post ``"tf"`` dicts, the
        ``ut.getDistri`` of the post languages, the mean sentiment polarity
        (0 when there are no posts), and the merged then normalised topic
        distribution.
    """
    print(uid)

    # Load the raw data first, then normalise profile and wall in turn.
    rawProfile = ut.readJson2Dict(inputPath + sn + "/profile/", uid)
    rawPosts = ut.readJson2Dict(inputPath + sn + "/wall/", uid)

    print("profile:" + interPath + sn + "/profile/" + uid)
    cleanProfile = normProfile(sn, rawProfile)
    print("wall:" + interPath + sn + "/wall/" + uid)
    cleanPosts = normWall(sn, rawPosts)

    # Persist the normalised versions.
    ut.writeDict2Json(interPath + sn + "/profile/", uid, cleanProfile)
    ut.writeDict2Json(interPath + sn + "/wall/", uid, cleanPosts)

    # Wall statistics: language distribution.
    languages = [entry["lang"] for entry in cleanPosts]
    langDistri = ut.getDistri(languages)

    # Mean sentiment polarity over all posts.
    polarities = [entry["sentiment"]["polarity"] for entry in cleanPosts]
    if len(polarities) > 0:
        sentiment_score = sum(polarities) / len(polarities)
    else:
        sentiment_score = 0

    # Topic distribution: merge the per-post distributions, then normalise.
    perPostTopics = [entry["topic_distri"] for entry in cleanPosts]
    userTopicDistri = ut.normVector(ut.mergeDict(perPostTopics))

    # Term frequencies merged over all posts.
    perPostTf = [entry["tf"] for entry in cleanPosts]
    userTf = ut.mergeDict(perPostTf)

    return (userTf, langDistri, sentiment_score, userTopicDistri)
def _orderedPairDistanceSum(places):
    """Sum calDistance over every ordered pair drawn from *places* (self-pairs included)."""
    total = float()
    for p1 in places:
        for p2 in places:
            total += calDistance(p1, p2)
    return total


def calSpatialScore(place_seq1, place_seq2):
    """Compare two sequences of visited places and return four spatial scores.

    Args:
        place_seq1: Sequence of hashable places for the first user.
        place_seq2: Sequence of hashable places for the second user.

    Returns:
        A list ``[mvp, cp, pd, avgdRatio]``:
          * mvp: 1 if the two users share at least one most-visited place,
            otherwise 0.
          * cp: number of places visited by both users divided by the number
            of distinct places visited by either (Jaccard ratio).
          * pd: ``calKLDivergence`` of the two ``ut.getDistri`` results.
          * avgdRatio: the larger average pairwise distance divided by the
            smaller one, or 1 when either user has a single distinct place.
        ``[0, 0, 0, 0]`` is returned when either sequence is empty.

    Raises:
        ZeroDivisionError: if the smaller of the two average distances is 0.
    """
    if len(place_seq1) == 0 or len(place_seq2) == 0:
        return [0] * 4

    places1 = set(place_seq1)
    places2 = set(place_seq2)
    place_num1 = ut.getDistri(place_seq1)
    place_num2 = ut.getDistri(place_seq2)

    # Original author's note: places that agree to the second decimal place
    # count as the same place.  NOTE(review): no rounding happens in this
    # function, so presumably the caller normalises places beforehand.

    # Most visited place.  The highest *value* of the distribution is needed
    # here; max(d, key=d.get) would return the place itself, not its count.
    top_count1 = max(place_num1.values())
    top_count2 = max(place_num2.values())
    most_visited_places1 = set(
        place for place, num in place_num1.items() if num == top_count1)
    most_visited_places2 = set(
        place for place, num in place_num2.items() if num == top_count2)
    # `&` is set intersection; the boolean `and` would merely return the
    # second set whenever the first one is non-empty.
    mvp = 1 if most_visited_places1 & most_visited_places2 else 0

    # Common place: shared places over all distinct places.  float() keeps
    # the ratio fractional even where `/` is integer division.
    cp = float(len(places1 & places2)) / len(places1 | places2)

    # Place divergence.
    pd = calKLDivergence(place_num1, place_num2)

    # Average distance ratio.
    if len(places1) == 1 or len(places2) == 1:
        avgdRatio = 1
    else:
        avg_distance1 = _orderedPairDistanceSum(places1)
        avg_distance2 = _orderedPairDistanceSum(places2)
        # Normalised by the number of unordered pairs, n * (n - 1) / 2.
        avg_distance1 /= (len(places1) * (len(places1) - 1)) / 2
        avg_distance2 /= (len(places2) * (len(places2) - 1)) / 2
        avgdRatio = (max(avg_distance1, avg_distance2) /
                     min(avg_distance1, avg_distance2))

    return [mvp, cp, pd, avgdRatio]
def temp_spatial_divergence(temp_spatial_list1, temp_spatial_list2, total):
    """Return the summed per-bucket place divergence divided by *total*.

    For every key of ``temp_spatial_list1`` the value stored there and the
    value stored under the same key in ``temp_spatial_list2`` (an empty list
    when the key is absent) are each passed through ``ut.getDistri`` and
    compared with ``calKLDivergence``.

    Args:
        temp_spatial_list1: Dict-like mapping whose keys drive the iteration.
        temp_spatial_list2: Dict-like mapping looked up with ``.get``.
        total: Divisor applied to the accumulated divergence.

    Returns:
        The accumulated divergence divided by ``total``.
    """
    divergenceSum = 0.0
    for bucket in temp_spatial_list1:
        ownPlaces = temp_spatial_list1[bucket]
        # Buckets missing on the other side are compared against an empty list.
        otherPlaces = temp_spatial_list2.get(bucket, [])
        divergenceSum = divergenceSum + calKLDivergence(
            ut.getDistri(ownPlaces), ut.getDistri(otherPlaces))
    return divergenceSum / total
def _bucketTimes(times):
    """Build the seven time-bucket key lists calTemporalScore needs for one user."""
    mins, hrs, days, weeks, mons, by_hr, by_mon = [], [], [], [], [], [], []
    for t in times:
        year = str(t.tm_year)
        yday = str(t.tm_yday)
        hour = str(t.tm_hour)
        month = str(t.tm_mon)
        # Floor division keeps the 10-minute and week keys integral; with true
        # division the key would embed a float and stop grouping anything.
        mins.append(year + yday + hour + str(int(t.tm_min) // 10))
        hrs.append(year + yday + hour)
        days.append(year + yday)
        weeks.append(year + str(int(t.tm_yday) // 7))
        mons.append(year + month)
        by_hr.append(hour)
        by_mon.append(month)
    return mins, hrs, days, weeks, mons, by_hr, by_mon


def calTemporalScore(times1, times2, post_with_place_index1,
                     post_with_place_index2, posts1, posts2):
    """Compare two users' posting times and return 15 temporal scores.

    Args:
        times1: Sequence of time objects exposing ``tm_year``, ``tm_yday``,
            ``tm_hour``, ``tm_min`` and ``tm_mon`` for the first user.
        times2: Same, for the second user.
        post_with_place_index1: Forwarded to ``temp_spatial_distri`` for the
            first user.
        post_with_place_index2: Forwarded to ``temp_spatial_distri`` for the
            second user.
        posts1: The first user's posts, forwarded to the ``temp_*_distri``
            helpers.
        posts2: The second user's posts.

    Returns:
        A list of 15 scores in this order: seven ``calKLDivergence`` values
        over time buckets (10 minutes, hour, day, week, month, hour of day,
        month of year), five ``temp_spatial_common_place`` values (hour, day,
        week, hour of day, month of year), two ``temp_spatial_divergence``
        values (hour of day, month of year) and one ``temp_spatial_distance``
        value (hour).  ``[0] * 15`` is returned when either time sequence is
        empty.
    """
    # Nothing to compare unless both users have timestamps; this matches the
    # "either side empty" guard used by calSpatialScore.
    if len(times1) == 0 or len(times2) == 0:
        return [0] * 15

    mins1, hrs1, days1, weeks1, mons1, by_hr1, by_mon1 = _bucketTimes(times1)
    mins2, hrs2, days2, weeks2, mons2, by_hr2, by_mon2 = _bucketTimes(times2)

    # Temporal features.
    td_10min = calKLDivergence(ut.getDistri(mins1), ut.getDistri(mins2))
    td_hr = calKLDivergence(ut.getDistri(hrs1), ut.getDistri(hrs2))
    td_day = calKLDivergence(ut.getDistri(days1), ut.getDistri(days2))
    td_week = calKLDivergence(ut.getDistri(weeks1), ut.getDistri(weeks2))
    td_mon = calKLDivergence(ut.getDistri(mons1), ut.getDistri(mons2))
    td_by_hr = calKLDivergence(ut.getDistri(by_hr1), ut.getDistri(by_hr2))
    td_by_mon = calKLDivergence(ut.getDistri(by_mon1), ut.getDistri(by_mon2))

    # Spatial temporal features.
    (temp_spatial1_hr, temp_spatial1_day, temp_spatial1_week,
     temp_spatial1_by_hr, temp_spatial1_by_mon) = temp_spatial_distri(
         posts1, post_with_place_index1, weeks1, days1, hrs1, by_hr1, by_mon1)
    (temp_spatial2_hr, temp_spatial2_day, temp_spatial2_week,
     temp_spatial2_by_hr, temp_spatial2_by_mon) = temp_spatial_distri(
         posts2, post_with_place_index2, weeks2, days2, hrs2, by_hr2, by_mon2)

    cp_hr = temp_spatial_common_place(temp_spatial1_hr, temp_spatial2_hr)
    cp_day = temp_spatial_common_place(temp_spatial1_day, temp_spatial2_day)
    # Computed from the per-week buckets, hence the name.
    cp_week = temp_spatial_common_place(temp_spatial1_week, temp_spatial2_week)
    cp_by_hr = temp_spatial_common_place(temp_spatial1_by_hr,
                                         temp_spatial2_by_hr)
    cp_by_mon = temp_spatial_common_place(temp_spatial1_by_mon,
                                          temp_spatial2_by_mon)
    # Normalise by the number of possible buckets: 24 hours, 12 months.
    pd_by_hr = temp_spatial_divergence(temp_spatial1_by_hr,
                                       temp_spatial2_by_hr, 24)
    pd_by_mon = temp_spatial_divergence(temp_spatial1_by_mon,
                                        temp_spatial2_by_mon, 12)
    avgd_hr = temp_spatial_distance(temp_spatial1_hr, temp_spatial2_hr)

    # Content temporal features: sentiment and topic comparison.
    # NOTE(review): these values are computed but not part of the returned
    # list; the calls are kept because their side effects are not visible
    # from this function -- confirm whether they can be dropped.
    (temp_sentiment1_hr, temp_sentiment1_day, temp_sentiment1_week,
     temp_sentiment1_mon) = temp_sentiment_distri(posts1, mons1, weeks1,
                                                  days1, hrs1)
    (temp_sentiment2_hr, temp_sentiment2_day, temp_sentiment2_week,
     temp_sentiment2_mon) = temp_sentiment_distri(posts2, mons2, weeks2,
                                                  days2, hrs2)
    (temp_topic_distri1_hr, temp_topic_distri1_day, temp_topic_distri1_week,
     temp_topic_distri1_mon) = temp_topic_distri(posts1, mons1, weeks1, days1)
    (temp_topic_distri2_hr, temp_topic_distri2_day, temp_topic_distri2_week,
     temp_topic_distri2_mon) = temp_topic_distri(posts2, mons2, weeks2, days2)

    senti_sim_hr = calTempSentimentSim(temp_sentiment1_hr, temp_sentiment2_hr)
    senti_sim_day = calTempSentimentSim(temp_sentiment1_day,
                                        temp_sentiment2_day)
    senti_sim_week = calTempSentimentSim(temp_sentiment1_week,
                                         temp_sentiment2_week)
    senti_sim_mon = calTempSentimentSim(temp_sentiment1_mon,
                                        temp_sentiment2_mon)
    topic_d_day = calTempTopicDivergence(temp_topic_distri1_day,
                                         temp_topic_distri2_day)
    topic_d_week = calTempTopicDivergence(temp_topic_distri1_week,
                                          temp_topic_distri2_week)
    topic_d_mon = calTempTopicDivergence(temp_topic_distri1_mon,
                                         temp_topic_distri2_mon)

    # All scores.
    scores_temp = [
        td_10min, td_hr, td_day, td_week, td_mon, td_by_hr, td_by_mon
    ]
    scores_spatial_temp = [
        cp_hr, cp_day, cp_week, cp_by_hr, cp_by_mon, pd_by_hr, pd_by_mon,
        avgd_hr
    ]
    scores_content_temp = [
        senti_sim_hr, senti_sim_day, senti_sim_week, senti_sim_mon,
        topic_d_day, topic_d_week, topic_d_mon
    ]
    return scores_temp + scores_spatial_temp
def _bucketTimes(times):
    """Build the seven time-bucket key lists calTemporalScore needs for one user."""
    mins, hrs, days, weeks, mons, by_hr, by_mon = [], [], [], [], [], [], []
    for t in times:
        year = str(t.tm_year)
        yday = str(t.tm_yday)
        hour = str(t.tm_hour)
        month = str(t.tm_mon)
        # Floor division keeps the 10-minute and week keys integral; with true
        # division the key would embed a float and stop grouping anything.
        mins.append(year + yday + hour + str(int(t.tm_min) // 10))
        hrs.append(year + yday + hour)
        days.append(year + yday)
        weeks.append(year + str(int(t.tm_yday) // 7))
        mons.append(year + month)
        by_hr.append(hour)
        by_mon.append(month)
    return mins, hrs, days, weeks, mons, by_hr, by_mon


def calTemporalScore(times1, times2, post_with_place_index1,
                     post_with_place_index2, posts1, posts2):
    """Compare two users' posting times and return 15 temporal scores.

    Args:
        times1: Sequence of time objects exposing ``tm_year``, ``tm_yday``,
            ``tm_hour``, ``tm_min`` and ``tm_mon`` for the first user.
        times2: Same, for the second user.
        post_with_place_index1: Forwarded to ``temp_spatial_distri`` for the
            first user.
        post_with_place_index2: Forwarded to ``temp_spatial_distri`` for the
            second user.
        posts1: The first user's posts, forwarded to the ``temp_*_distri``
            helpers.
        posts2: The second user's posts.

    Returns:
        A list of 15 scores in this order: seven ``calKLDivergence`` values
        over time buckets (10 minutes, hour, day, week, month, hour of day,
        month of year), five ``temp_spatial_common_place`` values (hour, day,
        week, hour of day, month of year), two ``temp_spatial_divergence``
        values (hour of day, month of year) and one ``temp_spatial_distance``
        value (hour).  ``[0] * 15`` is returned when either time sequence is
        empty.
    """
    # Nothing to compare unless both users have timestamps; this matches the
    # "either side empty" guard used by calSpatialScore.
    if len(times1) == 0 or len(times2) == 0:
        return [0] * 15

    mins1, hrs1, days1, weeks1, mons1, by_hr1, by_mon1 = _bucketTimes(times1)
    mins2, hrs2, days2, weeks2, mons2, by_hr2, by_mon2 = _bucketTimes(times2)

    # Temporal features.
    td_10min = calKLDivergence(ut.getDistri(mins1), ut.getDistri(mins2))
    td_hr = calKLDivergence(ut.getDistri(hrs1), ut.getDistri(hrs2))
    td_day = calKLDivergence(ut.getDistri(days1), ut.getDistri(days2))
    td_week = calKLDivergence(ut.getDistri(weeks1), ut.getDistri(weeks2))
    td_mon = calKLDivergence(ut.getDistri(mons1), ut.getDistri(mons2))
    td_by_hr = calKLDivergence(ut.getDistri(by_hr1), ut.getDistri(by_hr2))
    td_by_mon = calKLDivergence(ut.getDistri(by_mon1), ut.getDistri(by_mon2))

    # Spatial temporal features.
    (temp_spatial1_hr, temp_spatial1_day, temp_spatial1_week,
     temp_spatial1_by_hr, temp_spatial1_by_mon) = temp_spatial_distri(
         posts1, post_with_place_index1, weeks1, days1, hrs1, by_hr1, by_mon1)
    (temp_spatial2_hr, temp_spatial2_day, temp_spatial2_week,
     temp_spatial2_by_hr, temp_spatial2_by_mon) = temp_spatial_distri(
         posts2, post_with_place_index2, weeks2, days2, hrs2, by_hr2, by_mon2)

    cp_hr = temp_spatial_common_place(temp_spatial1_hr, temp_spatial2_hr)
    cp_day = temp_spatial_common_place(temp_spatial1_day, temp_spatial2_day)
    # Computed from the per-week buckets, hence the name.
    cp_week = temp_spatial_common_place(temp_spatial1_week, temp_spatial2_week)
    cp_by_hr = temp_spatial_common_place(temp_spatial1_by_hr,
                                         temp_spatial2_by_hr)
    cp_by_mon = temp_spatial_common_place(temp_spatial1_by_mon,
                                          temp_spatial2_by_mon)
    # Normalise by the number of possible buckets: 24 hours, 12 months.
    pd_by_hr = temp_spatial_divergence(temp_spatial1_by_hr,
                                       temp_spatial2_by_hr, 24)
    pd_by_mon = temp_spatial_divergence(temp_spatial1_by_mon,
                                        temp_spatial2_by_mon, 12)
    avgd_hr = temp_spatial_distance(temp_spatial1_hr, temp_spatial2_hr)

    # Content temporal features: sentiment and topic comparison.
    # NOTE(review): these values are computed but not part of the returned
    # list; the calls are kept because their side effects are not visible
    # from this function -- confirm whether they can be dropped.
    (temp_sentiment1_hr, temp_sentiment1_day, temp_sentiment1_week,
     temp_sentiment1_mon) = temp_sentiment_distri(posts1, mons1, weeks1,
                                                  days1, hrs1)
    (temp_sentiment2_hr, temp_sentiment2_day, temp_sentiment2_week,
     temp_sentiment2_mon) = temp_sentiment_distri(posts2, mons2, weeks2,
                                                  days2, hrs2)
    (temp_topic_distri1_hr, temp_topic_distri1_day, temp_topic_distri1_week,
     temp_topic_distri1_mon) = temp_topic_distri(posts1, mons1, weeks1, days1)
    (temp_topic_distri2_hr, temp_topic_distri2_day, temp_topic_distri2_week,
     temp_topic_distri2_mon) = temp_topic_distri(posts2, mons2, weeks2, days2)

    senti_sim_hr = calTempSentimentSim(temp_sentiment1_hr, temp_sentiment2_hr)
    senti_sim_day = calTempSentimentSim(temp_sentiment1_day,
                                        temp_sentiment2_day)
    senti_sim_week = calTempSentimentSim(temp_sentiment1_week,
                                         temp_sentiment2_week)
    senti_sim_mon = calTempSentimentSim(temp_sentiment1_mon,
                                        temp_sentiment2_mon)
    topic_d_day = calTempTopicDivergence(temp_topic_distri1_day,
                                         temp_topic_distri2_day)
    topic_d_week = calTempTopicDivergence(temp_topic_distri1_week,
                                          temp_topic_distri2_week)
    topic_d_mon = calTempTopicDivergence(temp_topic_distri1_mon,
                                         temp_topic_distri2_mon)

    # All scores.
    scores_temp = [
        td_10min, td_hr, td_day, td_week, td_mon, td_by_hr, td_by_mon
    ]
    scores_spatial_temp = [
        cp_hr, cp_day, cp_week, cp_by_hr, cp_by_mon, pd_by_hr, pd_by_mon,
        avgd_hr
    ]
    scores_content_temp = [
        senti_sim_hr, senti_sim_day, senti_sim_week, senti_sim_mon,
        topic_d_day, topic_d_week, topic_d_mon
    ]
    return scores_temp + scores_spatial_temp
def temp_spatial_divergence(temp_spatial_list1, temp_spatial_list2, total):
    """Return the summed per-bucket place divergence divided by *total*.

    For every key of ``temp_spatial_list1`` the value stored there and the
    value stored under the same key in ``temp_spatial_list2`` (an empty list
    when the key is absent) are each passed through ``ut.getDistri`` and
    compared with ``calKLDivergence``.

    Args:
        temp_spatial_list1: Dict-like mapping whose keys drive the iteration.
        temp_spatial_list2: Dict-like mapping looked up with ``.get``.
        total: Divisor applied to the accumulated divergence.

    Returns:
        The accumulated divergence divided by ``total``.
    """
    divergenceSum = 0.0
    for bucket in temp_spatial_list1:
        ownPlaces = temp_spatial_list1[bucket]
        # Buckets missing on the other side are compared against an empty list.
        otherPlaces = temp_spatial_list2.get(bucket, [])
        divergenceSum = divergenceSum + calKLDivergence(
            ut.getDistri(ownPlaces), ut.getDistri(otherPlaces))
    return divergenceSum / total