def find_effective_factor(train_set, k):
    """Find, for each training sample, the set of dynamic factors under
    which a leave-one-out k-NN vote classifies it correctly.

    Args:
        train_set: list of (topic_id, instance, true_level) triples.
        k: number of nearest neighbours used in the vote.

    Returns:
        dict mapping topic_id -> set of effective factor indices.
    """
    # Number of dynamic factors, taken from the first instance.
    # NOTE(review): assumes train_set is non-empty and all instances
    # share the same factor layout — confirm against callers.
    num_factor = len(train_set[0][1][1])
    effective_factors = dict()  # topic_id ==> effective factor set
    for topic_id, ins, true_level in train_set:
        effective_factor_set = set()
        for findex in range(num_factor):
            # Leave-one-out: distance to every OTHER training sample
            # under this single dynamic factor.
            distance_list = []
            for train_topic_id, train_ins, train_true_level in train_set:
                if train_topic_id == topic_id:
                    # Do not compare the sample with itself.
                    continue
                dis = get_instance_distance(ins, train_ins, findex)
                distance_list.append([train_topic_id, dis, train_true_level])
            # Ascending by distance so the head holds the nearest samples.
            distance_list.sort(key=operator.itemgetter(1))
            level_list, nn_list = get_knn_level_list(distance_list, k)
            pred_level = trusted_vote(level_list, num_level=2, majority_threshold=0.66)
            if pred_level == true_level:
                effective_factor_set.add(findex)
        # Fixed typo in the original message ("Effecitive"); parenthesized
        # form prints identically under Python 2.
        print("Effective factors for %s: %r" % (topic_id, effective_factor_set))
        effective_factors[topic_id] = effective_factor_set
    return effective_factors
def find_nearest_neighbor_level(test_ins, train_set, k):
    """Simple k-NN: return the levels of the k nearest neighbours.

    Only handles a single feature (findex=1 is hard-coded).

    Args:
        test_ins: test instance.
        train_set: list of (topic_id, instance, level) triples.
        k: number of neighbours requested.

    Returns:
        List of the neighbours' levels (at most k entries).
    """
    nearest_neighbors = []  # each entry: [distance, level]
    for train_topic_id, train_ins, level in train_set:
        # NOTE(review): get_instance_distance returns a distance even
        # though the original named it `sim` — call kept unchanged.
        sim = get_instance_distance(test_ins, train_ins, findex=1)
        insert_neighbor(nearest_neighbors, k, sim, level)
    # Fix: the original indexed positions 0..k-1 unconditionally and
    # raised IndexError when fewer than k neighbours were collected
    # (train_set smaller than k); bound the extraction instead.
    return [neighbor[1] for neighbor in nearest_neighbors[:k]]
def effective_factor_knn(train_set, test_ins, k, effective_factors):
    """Classify using the per-sample effective dynamic factors.

    For each dynamic factor, only the training samples whose effective
    factor set contains that factor are allowed to vote; the nearest
    neighbours under that factor are found and a trusted majority vote
    produces one predicted level per factor.

    Args:
        train_set: list of (topic_id, instance, level) triples.
        test_ins: test instance.
        k: number of nearest neighbours per factor.
        effective_factors: dict topic_id -> set of effective factor
            indices (as produced by find_effective_factor). A training
            topic missing from the dict raises KeyError.

    Returns:
        (pred_level_list, "", 0) where pred_level_list holds one voted
        level per factor that reached a trusted majority (may be empty).
    """
    num_factor = len(test_ins[1])  # number of dynamic factors
    pred_level_list = []
    for findex in range(num_factor):
        distance_list = []
        for train_topic_id, train_ins, level in train_set:
            # Only samples for which this factor is known to be
            # effective may vote under it.
            if findex not in effective_factors[train_topic_id]:
                continue
            dis = get_instance_distance(test_ins, train_ins, findex)
            distance_list.append([train_topic_id, dis, level])
        # Ascending by distance: nearest samples first.
        distance_list.sort(key=operator.itemgetter(1))
        level_list, nn_list = get_knn_level_list(distance_list, k)
        pred_level = trusted_vote(level_list, num_level=2, majority_threshold=0.66)
        if pred_level != -1:  # -1 signals no trusted majority
            pred_level_list.append(pred_level)
    # NOTE(review): the original also built a topic_popularity dict (never
    # read, and it could divide by a zero prediction_comment_count) and,
    # after its unconditional return, carried a large unreachable block
    # referencing the undefined name `knn_list`. Both were dead code and
    # have been removed; the live return values are unchanged.
    if len(pred_level_list) == 0:
        return [], "", 0
    return pred_level_list, "", 0