# Helper routines (load_id_list, prepare_dataset, down_sampling_dataset, classify,
# the *_evaluation functions, etc.) are defined elsewhere in this repo.
from datetime import timedelta

def main(group_id):
    #topiclist_path = 'data-dynamic/TopicList-' + group_id + 'shuffled-.txt' # for Douban dataset
    topiclist_path = 'data-dynamic/' + group_id + '-post-list.txt' # for Tianya dataset
    topic_list = load_id_list(topiclist_path)
    print 'Number of total topics loaded: ', len(topic_list)

    pop_level = [25, 50, float('inf')]

    # prediction_date: start predicting `prediction_date` after a topic is posted
    # target_date: predict the comment count at target_date
    # Both parameters above are tunable
    # Set the sampling interval
    gaptime = timedelta(hours=3)
    prediction_date = timedelta(hours=10*3)
    response_time = timedelta(hours=24)
    target_date = prediction_date + response_time
    # Number of intervals each topic has before prediction_date
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print 'Number of features: ', num_feature
    alpha = 1.5
    percentage_threshold = 0.7

    print 'Generating training and test dataset...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(group_id, \
        topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold)
    # Save the topic ids that passed filtering
    #save_filtered_topics(group_id, dataset)

    print 'Plotting factor propagation'
    #factor_propagation_plot(dataset, num_feature)
    #topic_propagation_plot(dataset, num_feature)
    #return

    # Shuffle the order of all topics.
    # During debugging, leave the dataset unshuffled so results are reproducible.
    #shuffle(dataset) # Note: with shuffling, each run uses a different split

    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0], category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]

    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(dataset, \
        comment_count_dataset, Bao_dataset, category_count_list)

    print 'After down sampling...'
    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0], category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]
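# The repo's down_sampling_dataset is defined elsewhere. As an illustration only,
# here is a minimal sketch of majority-class down-sampling, assuming that each
# dataset item's label can be read via a get_category(item) helper (hypothetical
# name) and that category 0 is the majority class. Not the repo's implementation.
import random

def down_sample_majority_sketch(dataset, get_category, seed=42):
    # Randomly drop category-0 instances until both categories are balanced
    random.seed(seed)
    cat0 = [item for item in dataset if get_category(item) == 0]
    cat1 = [item for item in dataset if get_category(item) != 0]
    cat0 = random.sample(cat0, min(len(cat0), len(cat1)))
    balanced = cat0 + cat1
    random.shuffle(balanced)
    return balanced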
from sklearn.cross_validation import KFold # legacy sklearn API matching KFold(n, n_folds)

def select_k(group_id, topic_list, percentage_threshold, prediction_date, response_time, cvk):
    # sampling interval
    gaptime = timedelta(hours=5)
    target_date = prediction_date + response_time
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print "Number of features: ", num_feature

    # percentage_threshold = 0.7
    alpha = 1 / percentage_threshold
    pop_level = [25, 50, float("inf")] # group: zhuangb

    print "Generating training and test dataset..."
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(
        group_id, topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold
    )
    print "Down-sampling the datasets..."
    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(
        dataset, comment_count_dataset, Bao_dataset, category_count_list
    )

    total = len(dataset)
    n_folds = 3
    kf = KFold(total, n_folds)
    IPW_acc_list = []
    for cv_train_index, cv_test_index in kf:
        train_set = make_cv_dataset(dataset, cv_train_index)
        test_set = make_cv_dataset(dataset, cv_test_index)
        train_cnt = len(train_set)
        print "Training: %d, Test: %d" % (train_cnt, total - train_cnt)
        print "Category 0: %d, Category 1: %d " % (category_count_list[0], category_count_list[1])
        print "Imbalance ratio: ", category_count_list[0] * 1.0 / category_count_list[1]

        num_level = 2
        num_factor = len(train_set[0][1][1])

        print "The proposed model:"
        print "Calculating instance prior score..."
        prior_score = -1
        mutual_knn_graph_list = None
        # prior_score = caculate_instance_prior_confidence_score(train_set, k, num_level = 2) # for instance_prior_weighting3.py
        topic_popularity, prior_score, mutual_knn_graph_list = caculate_instance_prior_confidence_score(
            train_set, test_set, cvk, num_factor, num_level=2
        ) # for IPW_mutual_knn.py

        print "Classify test instances..."
        y_true, y_pred, comment_true, comment_pred, give_up_list, prediction_list, factor_prediction = classify(
            train_set, test_set, cvk, num_factor, num_level, prior_score, topic_popularity, mutual_knn_graph_list
        )

        # evaluate results
        print "Number of give-ups: ", len(give_up_list)
        IPW_acc = classification_evaluation(y_true, y_pred)
        IPW_acc_list.append(IPW_acc)

    return IPW_acc_list
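# select_k returns one accuracy per CV fold, so a caller can sweep candidate k
# values and keep the one with the best mean accuracy. A minimal sketch; the
# candidate range and the mean-accuracy selection rule are assumptions, not
# taken from the source.
def choose_best_k(group_id, topic_list, percentage_threshold, prediction_date,
                  response_time, candidates=(3, 5, 7, 9)):
    best_k, best_mean_acc = None, -1.0
    for cvk in candidates:
        acc_list = select_k(group_id, topic_list, percentage_threshold,
                            prediction_date, response_time, cvk)
        mean_acc = sum(acc_list) / float(len(acc_list))
        print 'k = %d, mean CV accuracy = %.4f' % (cvk, mean_acc)
        if mean_acc > best_mean_acc:
            best_k, best_mean_acc = cvk, mean_acc
    return best_k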
def main(group_id):
    topiclist_path = 'data-dynamic/TopicList-' + group_id + '-filtered.txt'
    topic_list = load_id_list(topiclist_path)
    print 'Number of total topics loaded: ', len(topic_list)

    # set the pre-computed popularity levels
    # The future maximum comment count may exceed the largest value in pop_level.
    # Note: the smallest popularity value, 0, is ignored here.
    #pop_level = [8, 13, 23, 43, float('inf')] # group: zhuangb
    pop_level = [25, 50, float('inf')] # group: zhuangb
    #pop_level = [25, 50, float('inf')] # group: buybook
    #pop_level = [30, float('inf')] # group: buybook

    # prediction_date: start predicting `prediction_date` after a topic is posted
    # target_date: predict the comment count at target_date
    # Both parameters above are tunable
    # Set the sampling interval
    gaptime = timedelta(hours=5)
    prediction_date = timedelta(hours=10*5)
    response_time = timedelta(hours=50)
    target_date = prediction_date + response_time
    # Number of intervals each topic has before prediction_date
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print 'Number of features: ', num_feature
    alpha = 1.5
    percentage_threshold = 0.7

    print 'Generating training and test dataset...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(group_id, \
        topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold)
    # Save the topic ids that passed filtering
    #save_filtered_topics(group_id, dataset)

    #print 'Plotting factor propagation'
    #factor_propagation_plot(dataset, num_feature)
    #topic_propagation_plot(dataset, num_feature)
    #return

    # Shuffle the order of all topics.
    # During debugging, leave the dataset unshuffled so results are reproducible.
    #shuffle(dataset)

    print 'Down-sampling the datasets...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(dataset, \
        comment_count_dataset, Bao_dataset, category_count_list)

    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0], category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]

    #num_level = len(pop_level)
    #raw_input()
    #import ipdb
    #ipdb.set_trace()

    print 'The proposed model:'
    k = 3
    num_level = 2
    num_factor = len(train_set[0][1][1])
    print 'Classify test instances...'
    y_true, y_pred, comment_true, comment_pred, give_up_list, prediction_list = classify(train_set, test_set, k, num_level)

    # evaluate results
    print 'Number of give-ups: ', len(give_up_list)
    classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)
    #save_predictions(prediction_list, y_pred, factor_name = 'num_authors')
    #save_predictions(prediction_list, y_true, factor_name = 'all')
    comment_RSE_evaluation(comment_true, comment_pred)
    #print 'The class prior:', prior_score

    from svm_model import svm_model
    print 'Building a svm model...'
    y_true, y_pred = svm_model(train_set, test_set)
    classification_evaluation(y_true, y_pred)

    # Inspect prediction results of each factor at different ratios
    #from utils import ratio_accuracy_distribution_plot
    #ratio_accuracy_distribution_plot(y_true, y_pred, test_set, group_id, factor_name='tree_link_density')

    # S-H model
    print '\nThe S-H model:'
    baseline_train_set = comment_count_dataset[:train_cnt]
    baseline_test_set = comment_count_dataset[train_cnt:]
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = SH_model(baseline_train_set, baseline_test_set, alpha)
    # drop some instances with cat = 0
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    # level-wise classification
    classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)

    print '\nML model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = ML_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)

    print '\nMLR model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = MLR_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)

    print '\nkNN method:'
    k = 1
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = knn_method(train_set, test_set, k, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    # level-wise classification
    classification_evaluation(y_true, y_pred)

    print "\nBao's method:"
    Bao_train_set = Bao_dataset[:train_cnt]
    Bao_test_set = Bao_dataset[train_cnt:]
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = Bao_method(Bao_train_set, Bao_test_set, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
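# SH_model is defined elsewhere in the repo. For reference, the classic
# Szabo-Huberman baseline fits a single log-linear scaling between early and
# final comment counts on the training set and applies it to the test set.
# A minimal sketch, assuming each instance is an (early_count, final_count)
# pair; the repo's actual comment_count_dataset layout is not shown here.
import math

def sh_sketch(train_pairs, test_pairs):
    # beta = mean of ln(final+1) - ln(early+1) over training instances,
    # i.e. the constant offset minimizing squared error on the log scale
    logs = [math.log(final + 1.0) - math.log(early + 1.0)
            for early, final in train_pairs]
    beta = sum(logs) / len(logs)
    # predicted final count scales the observed early count by exp(beta)
    return [math.exp(math.log(early + 1.0) + beta) - 1.0
            for early, _ in test_pairs]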
def main(group_id, topic_list, threshold_p, prediction_date_tr, response_time_delta, gaptime_n, best_k):
    # The two free parameters are passed in by the caller
    percentage_threshold = threshold_p
    prediction_date = timedelta(hours=prediction_date_tr)
    response_time = timedelta(hours=response_time_delta)

    # set the pre-computed popularity levels
    # The future maximum comment count may exceed the largest value in pop_level.
    # Note: the smallest popularity value, 0, is ignored here.
    #pop_level = [8, 13, 23, 43, float('inf')] # group: zhuangb
    pop_level = [25, 50, float('inf')] # group: zhuangb
    #pop_level = [25, 50, float('inf')] # group: buybook
    #pop_level = [30, float('inf')] # group: buybook

    # prediction_date: start predicting `prediction_date` after a topic is posted
    # target_date: predict the comment count at target_date
    # Both parameters above are tunable
    # Set the sampling interval
    gaptime = timedelta(hours=gaptime_n)
    #prediction_date = timedelta(hours=10*3)
    #response_time = timedelta(hours=24) # now passed in as parameters
    target_date = prediction_date + response_time
    # Number of intervals each topic has before prediction_date
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print 'Number of features: ', num_feature
    #percentage_threshold = 0.7
    alpha = 1/percentage_threshold

    #"""
    print 'Generating training and test dataset...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(group_id, \
        topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold)
    # Save the topic ids that passed filtering
    #save_filtered_topics(group_id, dataset)

    #print 'Plotting factor propagation'
    #factor_propagation_plot(dataset, num_feature)
    #topic_propagation_plot(dataset, num_feature)
    #return

    print 'Down-sampling the datasets...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(dataset, \
        comment_count_dataset, Bao_dataset, category_count_list)

    # Shuffle the order of all topics.
    # During debugging, leave the dataset unshuffled so results are reproducible.
    #shuffle(dataset) # Note: with shuffling, each run uses a different split

    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0], category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]

    #num_level = len(pop_level)
    #save_filtered_topics(group_id, dataset)
    #raw_input()

    from MDT_method import prepare_MDT_dataset
    #prepare_MDT_dataset(train_set, 'MDT_train.pickle')
    #prepare_MDT_dataset(test_set, 'MDT_test.pickle')
    #return

    k = best_k
    num_level = 2
    num_factor = len(train_set[0][1][1])

    print 'The proposed model:'
    #print 'Calculating class prior score...'
    #prior_score = np.ones((num_factor, num_level)) # initialization
    #prior_score = caculate_class_prior_confidence_score(train_set, k, num_level = 2)
    #print prior_score; raw_input()
    print 'Calculating instance prior score...'
    prior_score = -1
    mutual_knn_graph_list = None
    #prior_score = caculate_instance_prior_confidence_score(train_set, k, num_level = 2) # for instance_prior_weighting3.py
    topic_popularity, prior_score, mutual_knn_graph_list = caculate_instance_prior_confidence_score(train_set, test_set, k, num_factor, num_level = 2) # for IPW_mutual_knn.py

    # Save the prior score, train set, and test set
    #save_intermediate_results(train_set, test_set, comment_count_dataset, Bao_dataset, category_count_list, topic_popularity, prior_score, mutual_knn_graph_list)
    #"""
    #print 'Loading train_set, test_set, comment_count_dataset, ... and prior_score...'
    #train_set, test_set, comment_count_dataset, Bao_dataset, category_count_list, topic_popularity, prior_score, mutual_knn_graph_list = load_intermediate_results()
    #train_cnt = len(train_set)
    #k = best_k; num_level = 2; num_factor = len(train_set[0][1][1])

    #factor_name_list = ['current_comment_count', 'num_authors', 'tree_density', 'reply_density'] # factor variables to examine
    #factor_propagation_plot(group_id, train_set+test_set, num_feature, category_count_list, range(4), factor_name_list)
    #return

    print 'Parameter set:'
    print 'Gap time: ', gaptime
    print 'Prediction date (in hours):', prediction_date.total_seconds() / 3600
    print 'Response time (in hours):', response_time.total_seconds() / 3600
    print 'percentage_threshold: ', percentage_threshold
    print 'k = ', k

    # TODO: check for overfitting
    #test_set = train_set # use the training set as the test set to check for overfitting

    print 'Classify test instances...'
    y_true, y_pred, comment_true, comment_pred, give_up_list, prediction_list, factor_prediction = \
        classify(train_set, test_set, k, num_factor, num_level, prior_score, topic_popularity, mutual_knn_graph_list)

    # evaluate results
    print 'Number of give-ups: ', len(give_up_list)
    IPW_acc = classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)
    #save_predictions(prediction_list, y_pred, factor_name = 'fourfactor')
    #save_predictions(prediction_list, y_true, factor_name = 'all')
    comment_RSE_evaluation(comment_true, comment_pred)
    #print 'The class prior:', prior_score

    print 'Single factor and simple vote prediction result:'
    single_factor_acc = single_factor_prediction(y_true, factor_prediction)

    from svm_model import svm_model
    print 'Building a svm model...'
    y_true, y_pred = svm_model(train_set, test_set)
    classification_evaluation(y_true, y_pred)

    # Inspect prediction results of each factor at different ratios
    from utils import ratio_accuracy_distribution_plot
    #ratio_accuracy_distribution_plot(y_true, y_pred, test_set, group_id, factor_name='tree_link_density')

    # S-H model
    print '\nThe S-H model:'
    baseline_train_set = comment_count_dataset[:train_cnt]
    baseline_test_set = comment_count_dataset[train_cnt:]
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = SH_model(baseline_train_set, baseline_test_set, alpha)
    # drop some instances with cat = 0
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    # level-wise classification
    classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)

    print '\nML model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = ML_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)

    print '\nMLR model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = MLR_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)

    print '\nkNN method:'
    knn_k = 1
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = knn_method(train_set, test_set, knn_k, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    # level-wise classification
    classification_evaluation(y_true, y_pred)

    print "\nBao's method:"
    Bao_train_set = Bao_dataset[:train_cnt]
    Bao_test_set = Bao_dataset[train_cnt:]
    print 'With link density:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = Bao_method(Bao_train_set, Bao_test_set, alpha, version = 1)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)

    print 'With diffusion depth:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = \
        Bao_method(Bao_train_set, Bao_test_set, alpha, version = 2)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)

    return IPW_acc, single_factor_acc # return the accuracies
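# A hypothetical driver for the parameterized main above. The group id and the
# topic-list path are taken from the earlier variants in this file; the concrete
# parameter values mirror the first main (3h gap, 30h prediction window, 24h
# response time, k = 3) and are illustrative, not prescribed by the source.
if __name__ == '__main__':
    group_id = 'zhuangb'
    topic_list = load_id_list('data-dynamic/TopicList-' + group_id + '-filtered.txt')
    IPW_acc, single_factor_acc = main(group_id, topic_list, threshold_p=0.7,
                                      prediction_date_tr=30, response_time_delta=24,
                                      gaptime_n=3, best_k=3)
    print 'IPW accuracy: ', IPW_acc
    print 'Single-factor accuracies: ', single_factor_acc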