def factor_knn(findex, topic_popularity, dataset, num_neigh):
    """Build the kNN neighbour graph of every topic for one dynamic factor.

    Every ``(topic_id, instance, level)`` entry of ``dataset`` is compared
    against all other entries with ``get_instance_distance`` under factor
    ``findex``. The candidates are sorted by ascending distance and handed to
    ``get_knn_level_list_old``, which selects the neighbours.

    Args:
        findex: factor index forwarded to ``get_instance_distance``.
        topic_popularity: mapping from topic id to a record whose fields 0..3
            are read here as level, target comment count, predicted comment
            count and ratio.
        dataset: sequence of ``(topic_id, instance, level)`` triples; it is
            iterated once per topic, so it must be re-iterable.
        num_neigh: neighbour count forwarded to ``get_knn_level_list_old``.

    Returns:
        dict mapping each topic id to the set of its neighbours' topic ids.
    """
    level_count = 2
    # topic_id ==> set of the topic ids of its knn neighbours
    knn_graph = dict()

    for topic_id, ins, _unused_level in dataset:
        candidates = []
        for other_id, other_ins, _unused_other_level in dataset:
            if other_id == topic_id:
                continue

            # Bottleneck of the program: distance between two time series.
            dist = get_instance_distance(ins, other_ins, findex)
            if dist == 0:
                dist = 1e-6

            record = topic_popularity[other_id]
            other_level = record[0]
            target_count = record[1]
            predicted_count = record[2]
            ratio = record[3]
            candidates.append(
                [other_id, dist, predicted_count, target_count,
                 other_level, ratio])

        # Ascending by distance (field 1 of each candidate).
        candidates.sort(key=operator.itemgetter(1))

        # Per the original note, the selection helper is expected to make
        # sure samples of both levels are represented; only the neighbour
        # list itself is used here.
        _level_list, knn_list, _level_counts = get_knn_level_list_old(
            candidates, num_neigh, level_count)

        knn_graph[topic_id] = set(entry[0] for entry in knn_list)

    return knn_graph

def factor_score_knn(findex, mutual_knn_graph, target_topic_id,
                     topic_popularity, num_level, prior_score=-1, gamma=1):
    """Compute the per-level confidence and prior scores of one topic.

    Each mutual kNN neighbour of ``target_topic_id`` that is flagged as usable
    (last field of its ``topic_popularity`` record is truthy; the original
    comment says this marks membership in the training set) votes for its own
    level with weight ``exp(-gamma * distance)``.

    Args:
        findex: factor index; forwarded to ``get_instance_distance`` and used
            to pick the per-factor entry of each neighbour's prior.
        mutual_knn_graph: mapping topic id -> iterable of neighbour topic ids.
        target_topic_id: topic whose scores are computed.
        topic_popularity: mapping topic id -> record. Field 0 is read as the
            level (used as an index in ``range(num_level)``), field 4 as the
            instance passed to the distance function, and the last field as
            the usable/training flag.
        num_level: number of levels; length of both returned arrays.
        prior_score: when a dict, maps topic id -> sequence indexed by factor
            and enables the prior computation; any non-dict value (default
            ``-1``) disables it.
        gamma: decay rate of the distance weighting.

    Returns:
        ``(level_confidence_score, level_prior_score)`` as float arrays of
        length ``num_level``. Both are all zeros when the topic has no mutual
        neighbours. The confidence is uniform when no neighbour contributed a
        positive weight. The prior is all zeros when ``prior_score`` is not a
        dict, otherwise it is normalised to sum to 1 (uniform if it would sum
        to zero).
    """
    # Prior information is taken into account only when a dict is supplied.
    with_prior_flag = isinstance(prior_score, dict)

    neighbour_topic_ids = list(mutual_knn_graph[target_topic_id])
    level_confidence_score = np.zeros((num_level,), float)
    level_prior_score = np.zeros((num_level,), float)

    if not neighbour_topic_ids:
        print('Topic %s in factor %d does not have any mutual knn neighbours.'
              % (target_topic_id, findex))
        return level_confidence_score, level_prior_score

    target_ins = topic_popularity[target_topic_id][4]
    # Sum of the weights that voted for each level (sized by num_level rather
    # than hard-coded to two levels).
    level_weight_sum = np.zeros((num_level,), float)

    for topic_id in neighbour_topic_ids:
        record = topic_popularity[topic_id]
        # Neighbours that are not flagged as usable are ignored; skipping
        # them before the distance call avoids wasted distance computations.
        if not record[-1]:
            continue

        level = record[0]
        dis = get_instance_distance(target_ins, record[4], findex)

        # TODO: the magnitude of this weight may well swamp the prior.
        # The caller-supplied gamma is honoured here (it used to be
        # overwritten with 1 inside the function).
        try:
            weight = math.exp(-gamma * dis)
        except OverflowError:
            print('Error in math.exp:  %s' % (-gamma * dis,))
            continue

        level_weight_sum[level] += weight
        level_confidence_score[level] += weight
        if with_prior_flag:
            # Weighted prior of this neighbour under the current factor.
            level_prior_score[level] += weight * prior_score[topic_id][findex]

    total_weight = level_weight_sum.sum()
    if total_weight > 0:
        level_confidence_score /= total_weight
    else:
        print('Warning: Topic(%s) in factor(%d) dose not have any mutual knn neighbours.'
              % (target_topic_id, findex))
        # True division: a uniform distribution, not 0 from integer division.
        level_confidence_score[:] = 1.0 / num_level

    if with_prior_flag:
        # Average the prior inside each level that received any weight and
        # zero out the others, then normalise across levels so the result
        # plays the same role as level_confidence_score but carries the
        # prior information.
        has_weight = level_weight_sum > 0
        level_prior_score[has_weight] /= level_weight_sum[has_weight]
        level_prior_score[~has_weight] = 0

        prior_total = np.sum(level_prior_score)
        if prior_total > 0:
            level_prior_score /= prior_total
        else:
            level_prior_score[:] = 1.0 / num_level

    return level_confidence_score, level_prior_score