Example #1
def main(group_id):

    #topiclist_path = 'data-dynamic/TopicList-' + group_id + 'shuffled-.txt' # for douban dataset
    topiclist_path = 'data-dynamic/' + group_id + '-post-list.txt' # for Tianya dataset
    
    topic_list = load_id_list(topiclist_path)
    print 'Number of total topics loaded: ', len(topic_list)
    pop_level = [25, 50, float('inf')]  
    
    # prediction_date: start predicting this long after the post is published
    # target_date: predict the comment count at target_date
    # both parameters are tunable
    # set the sampling interval
    gaptime = timedelta(hours=3)
    prediction_date = timedelta(hours=10*3)
    response_time = timedelta(hours=24)
    target_date = prediction_date + response_time
    
    # compute how many intervals each topic has before prediction_date
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print 'Number of features: ', num_feature
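    # sanity check on the arithmetic above: prediction_date = 30h and gaptime = 3h
    # give num_feature = int(30*3600 / (3*3600)) = 10 intervals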
    
    alpha = 1.5
    percentage_threshold = 0.7
    print 'Generating training and test dataset...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(group_id, \
        topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold)
    # save the filtered topic ids
    #save_filtered_topics(group_id, dataset)
    print 'Plotting factor propagation'
    #factor_propagation_plot(dataset, num_feature)
    #topic_propagation_plot(dataset, num_feature)
    #return 
    
    # reorder all posts
    # during debugging, skip shuffling the dataset so results stay reproducible
    #shuffle(dataset)
    
    # note: the dataset differs from run to run
    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0] , category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]
    
    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(dataset, \
        comment_count_dataset, Bao_dataset, category_count_list)    
    print 'After down sampling...'
    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0] , category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]
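down_sampling_dataset is not defined in these examples. As a rough illustration of the majority-class down-sampling the surrounding prints imply, here is a minimal sketch, assuming two categories with category 0 as the majority and a parallel list of 0/1 labels (a hypothetical layout, not the author's actual data structure):

from random import sample

def down_sample_sketch(instances, labels):
    # instances: list of training instances; labels: parallel list of 0/1 labels
    majority = [x for x, y in zip(instances, labels) if y == 0]
    minority = [x for x, y in zip(instances, labels) if y == 1]
    # randomly keep only as many majority instances as there are minority ones
    balanced = sample(majority, len(minority)) + minority
    return balanced, [len(minority), len(minority)]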
Example #2
def select_k(group_id, topic_list, percentage_threshold, prediction_date, response_time, cvk):
    # sampling interval
    gaptime = timedelta(hours=5)
    target_date = prediction_date + response_time
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print "Number of features: ", num_feature

    # percentage_threshold = 0.7
    alpha = 1 / percentage_threshold
    pop_level = [25, 50, float("inf")]  # group: zhuangb

    print "Generating training and test dataset..."
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(
        group_id, topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold
    )

    print "Down-sampling the datasets..."
    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(
        dataset, comment_count_dataset, Bao_dataset, category_count_list
    )

    total = len(dataset)
    n_folds = 3
    kf = KFold(total, n_folds)
    IPW_acc_list = []
    for cv_train_index, cv_test_index in kf:
        train_set = make_cv_dataset(dataset, cv_train_index)
        test_set = make_cv_dataset(dataset, cv_test_index)
        train_cnt = len(train_set)
        print "Training: %d, Test: %d" % (train_cnt, total - train_cnt)
        print "Category 0: %d, Category 1: %d " % (category_count_list[0], category_count_list[1])
        print "Imbalance ratio: ", category_count_list[0] * 1.0 / category_count_list[1]

        num_level = 2
        num_factor = len(train_set[0][1][1])

        print "The proposed model:"
        print "Caculating instance prior score..."
        prior_score = -1
        mutual_knn_graph_list = None
        # prior_score = caculate_instance_prior_confidence_score(train_set, k, num_level = 2) # for instance_prior_weighting3.py
        topic_popularity, prior_score, mutual_knn_graph_list = caculate_instance_prior_confidence_score(
            train_set, test_set, cvk, num_factor, num_level=2
        )  # for IPW_mutual_knn.py

        print "Classify test instances..."
        y_true, y_pred, comment_true, comment_pred, give_up_list, prediction_list, factor_prediction = classify(
            train_set, test_set, cvk, num_factor, num_level, prior_score, topic_popularity, mutual_knn_graph_list
        )
        # evaluate results
        print "Number of give-ups: ", len(give_up_list)
        IPW_acc = classification_evaluation(y_true, y_pred)

        IPW_acc_list.append(IPW_acc)

    return IPW_acc_list
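KFold(total, n_folds) above follows the legacy sklearn.cross_validation interface (scikit-learn < 0.18), where the fold object itself is iterable. A sketch of the equivalent loop under the modern sklearn.model_selection API, reusing dataset and make_cv_dataset from the example (assuming make_cv_dataset accepts index arrays as before):

from sklearn.model_selection import KFold

kf = KFold(n_splits=3)
for cv_train_index, cv_test_index in kf.split(dataset):
    train_set = make_cv_dataset(dataset, cv_train_index)
    test_set = make_cv_dataset(dataset, cv_test_index)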
Example #3
def main(group_id):

    topiclist_path = 'data-dynamic/TopicList-' + group_id + '-filtered.txt'
    topic_list = load_id_list(topiclist_path)
    print 'Number of total topics loaded: ', len(topic_list)

    # set the pre-computed popularity level
    # the future maximum comment count may exceed the largest value in pop_level
    # note: the smallest popularity value, 0, is ignored here
    #pop_level = [8, 13, 23, 43, float('inf')]  # group: zhuangb
    pop_level = [25, 50, float('inf')]  # group: zhuangb
    #pop_level = [25, 50, float('inf')]      # group: buybook
    #pop_level = [30, float('inf')]      # group: buybook
    
    # prediction_date: start predicting this long after the post is published
    # target_date: predict the comment count at target_date
    # both parameters are tunable
    # set the sampling interval
    gaptime = timedelta(hours=5)
    prediction_date = timedelta(hours=10*5)
    response_time = timedelta(hours=50)
    target_date = prediction_date + response_time
    
    # compute how many intervals each topic has before prediction_date
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print 'Number of features: ', num_feature
    
    alpha = 1.5
    percentage_threshold = 0.7
    print 'Generating training and test dataset...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(group_id, \
        topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold)
    # save the filtered topic ids
    #save_filtered_topics(group_id, dataset)
    #print 'Plotting factor propagation'
    #factor_propagation_plot(dataset, num_feature)
    #topic_propagation_plot(dataset, num_feature)
    #return 
    
    # reorder all posts
    # during debugging, skip shuffling the dataset so results stay reproducible
    #shuffle(dataset)
    
    print 'Down-sampling the datasets...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(dataset, \
        comment_count_dataset, Bao_dataset, category_count_list)
    
    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0] , category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]
    #num_level = len(pop_level)
    #raw_input()
    
    #import ipdb
    #ipdb.set_trace()
        
    print 'The proposed model:'
    k = 3
    num_level = 2
    num_factor = len(train_set[0][1][1])
    
    print 'Classify test instances...'
    y_true, y_pred, comment_true, comment_pred, give_up_list, prediction_list = classify(train_set, test_set, k, num_level)
    # evaluate results
    print 'Number of give-ups: ', len(give_up_list)
    classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)
    #save_predictions(prediction_list, y_pred, factor_name = 'num_authors')
    #save_predictions(prediction_list, y_true, factor_name = 'all')
    
    comment_RSE_evaluation(comment_true, comment_pred)
    
    #print 'The class prior:', prior_score
    
    from svm_model import svm_model
    print 'Building an SVM model...'
    y_true, y_pred = svm_model(train_set, test_set)
    classification_evaluation(y_true, y_pred)

    # inspect each factor's prediction results at different ratios
    #from utils import ratio_accuracy_distribution_plot
    #ratio_accuracy_distribution_plot(y_true, y_pred, test_set, group_id, factor_name='tree_link_density')
    
    # S-H model
    print '\nThe S-H model:'
    baseline_train_set = comment_count_dataset[:train_cnt]
    baseline_test_set = comment_count_dataset[train_cnt:]
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = SH_model(baseline_train_set, baseline_test_set, alpha)
    # drop some instances with cat = 0
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)    
    # level wise classification
    classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)
    
    print '\nML model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = ML_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
    
    print '\nMLR model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = MLR_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
    
    print '\nkNN method:'
    k = 1
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = knn_method(train_set, test_set, k, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)    
    # level wise classification
    classification_evaluation(y_true, y_pred)
    
    print "\nBao's method:"
    Bao_train_set = Bao_dataset[:train_cnt]
    Bao_test_set = Bao_dataset[train_cnt:]
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = Bao_method(Bao_train_set, Bao_test_set, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
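SH_model is used as a baseline above but not defined here. The name suggests the Szabo-Huberman predictor, which scales early popularity by a constant factor fitted on log counts. A minimal sketch of that textbook form, assuming each instance is an (early_count, final_count) pair (the actual comment_count_dataset layout is not shown):

import math

def sh_predict(train_pairs, test_pairs):
    # fit ln(final) = ln(early) + beta0 on the training pairs
    # (the Szabo-Huberman constant-scaling model in log space)
    residuals = [math.log(f) - math.log(e) for e, f in train_pairs if e > 0 and f > 0]
    beta0 = sum(residuals) / len(residuals)
    # predicted final count = early count scaled by exp(beta0)
    return [e * math.exp(beta0) for e, _ in test_pairs]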
Example #4
def main(group_id, topic_list, threshold_p, prediction_date_tr, response_time_delta, gaptime_n, best_k):
    
    # two free parameter values, passed in by the caller
    percentage_threshold = threshold_p
    prediction_date = timedelta(hours=prediction_date_tr)
    response_time = timedelta(hours=response_time_delta)

    # set the pre-computed popularity level
    # the future maximum comment count may exceed the largest value in pop_level
    # note: the smallest popularity value, 0, is ignored here
    #pop_level = [8, 13, 23, 43, float('inf')]  # group: zhuangb
    pop_level = [25, 50, float('inf')]  # group: zhuangb
    #pop_level = [25, 50, float('inf')]      # group: buybook
    #pop_level = [30, float('inf')]      # group: buybook
    
    # prediction_date: start predicting this long after the post is published
    # target_date: predict the comment count at target_date
    # both parameters are tunable
    # set the sampling interval
    gaptime = timedelta(hours=gaptime_n)
    #prediction_date = timedelta(hours=10*3)
    #response_time = timedelta(hours=24) # now passed in as a parameter
    target_date = prediction_date + response_time
    
    # compute how many intervals each topic has before prediction_date
    num_feature = int(prediction_date.total_seconds() / gaptime.total_seconds())
    print 'Number of features: ', num_feature
    
    #percentage_threshold = 0.7
    alpha = 1/percentage_threshold
    
    #"""
    print 'Generating training and test dataset...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = prepare_dataset(group_id, \
        topic_list, gaptime, pop_level, prediction_date, target_date, alpha, percentage_threshold)
    # save the filtered topic ids
    #save_filtered_topics(group_id, dataset)
    #print 'Plotting factor propagation'
    #factor_propagation_plot(dataset, num_feature)
    #topic_propagation_plot(dataset, num_feature)
    #return 
    
    print 'Down-sampling the datasets...'
    dataset, comment_count_dataset, Bao_dataset, category_count_list = down_sampling_dataset(dataset, \
        comment_count_dataset, Bao_dataset, category_count_list)
    
    # reorder all posts
    # during debugging, skip shuffling the dataset so results stay reproducible
    #shuffle(dataset)
    
    # note: the dataset differs from run to run
    total = len(dataset)
    train_cnt = total * 4 / 5
    train_set = dataset[:train_cnt]
    test_set = dataset[train_cnt:]
    
    print 'Training: %d, Test: %d' % (train_cnt, total-train_cnt)
    print 'Category 0: %d, Category 1: %d ' % (category_count_list[0] , category_count_list[1])
    print 'Imbalance ratio: ', category_count_list[0] * 1.0 / category_count_list[1]
    #num_level = len(pop_level)
    #save_filtered_topics(group_id, dataset)
    #raw_input()
    
    from MDT_method import prepare_MDT_dataset
    #prepare_MDT_dataset(train_set, 'MDT_train.pickle')
    #prepare_MDT_dataset(test_set, 'MDT_test.pickle')
    #return    
    
    k = best_k
    num_level = 2
    num_factor = len(train_set[0][1][1])
    
    print 'The proposed model:'
    #print 'Calculating class prior score...'
    #prior_score = np.ones((num_factor, num_level)) # initialization
    #prior_score = caculate_class_prior_confidence_score(train_set, k, num_level = 2)
    #print prior_score; raw_input()
    
    print 'Calculating instance prior score...'
    prior_score = -1
    mutual_knn_graph_list = None
    #prior_score = caculate_instance_prior_confidence_score(train_set, k, num_level = 2) # for instance_prior_weighting3.py
    topic_popularity, prior_score, mutual_knn_graph_list = caculate_instance_prior_confidence_score(train_set, test_set, k, num_factor, num_level = 2) # for IPW_mutual_knn.py
    
    # save prior_score, the train set, and the test set
    #save_intermediate_results(train_set, test_set, comment_count_dataset, Bao_dataset, category_count_list, topic_popularity, prior_score, mutual_knn_graph_list)
    #"""
    
    #print 'Loading train_set, test_set, comment_count_dataset, ... and prior_score...'
    #train_set, test_set, comment_count_dataset, Bao_dataset, category_count_list, topic_popularity, prior_score, mutual_knn_graph_list = load_intermediate_results()
    #train_cnt = len(train_set)
    #k = best_k; num_level=2; num_factor = len(train_set[0][1][1])
    #factor_name_list = ['current_comment_count', 'num_authors', 'tree_density', 'reply_density'] # factor variables to examine
    #factor_propagation_plot(group_id, train_set+test_set, num_feature, category_count_list, range(4), factor_name_list)
    #return 
    
    print 'Parameter set:'
    print 'Gap time: ', gaptime
    print 'Prediction date (in hours):', prediction_date.total_seconds() / 3600
    print 'Response time (in hours):', response_time.total_seconds() / 3600
    print 'percentage_threshold: ', percentage_threshold
    print 'k = ', k

    # TODO: test for overfitting
    #test_set = train_set  # use the training set as the test set to check for overfitting
    print 'Classify test instances...'
    y_true, y_pred, comment_true, comment_pred, give_up_list, prediction_list, factor_prediction = \
        classify(train_set, test_set, k, num_factor, num_level, prior_score, topic_popularity, mutual_knn_graph_list)
    # evaluate results
    print 'Number of give-ups: ', len(give_up_list)
    IPW_acc = classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)
    #save_predictions(prediction_list, y_pred, factor_name = 'fourfactor')
    #save_predictions(prediction_list, y_true, factor_name = 'all')
    
    comment_RSE_evaluation(comment_true, comment_pred)
    
    #print 'The class prior:', prior_score
    
    print 'Single factor and simple vote prediction result:'
    single_factor_acc = single_factor_prediction(y_true, factor_prediction)
    
    from svm_model import svm_model
    print 'Building an SVM model...'
    y_true, y_pred = svm_model(train_set, test_set)
    classification_evaluation(y_true, y_pred)

    # inspect each factor's prediction results at different ratios
    from utils import ratio_accuracy_distribution_plot
    #ratio_accuracy_distribution_plot(y_true, y_pred, test_set, group_id, factor_name='tree_link_density')
    
    # S-H model
    print '\nThe S-H model:'
    baseline_train_set = comment_count_dataset[:train_cnt]
    baseline_test_set = comment_count_dataset[train_cnt:]
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = SH_model(baseline_train_set, baseline_test_set, alpha)
    # drop some instances with cat = 0
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)    
    # level wise classification
    classification_evaluation(y_true, y_pred)
    level_MSE_evaluation(y_true, y_pred)
    
    print '\nML model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = ML_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
    
    print '\nMLR model:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = MLR_model(baseline_train_set, baseline_test_set, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
    
    print '\nkNN method:'
    knn_k = 1
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = knn_method(train_set, test_set, knn_k, num_feature, alpha)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)    
    # level wise classification
    classification_evaluation(y_true, y_pred)
    
    print "\nBao's method:"
    Bao_train_set = Bao_dataset[:train_cnt]
    Bao_test_set = Bao_dataset[train_cnt:]
    print 'With link density:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = Bao_method(Bao_train_set, Bao_test_set, alpha, version = 1)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
    print 'With diffusion depth:'
    y_true, y_pred, comment_true_cnt, comment_pred_cnt = Bao_method(Bao_train_set, Bao_test_set, alpha, version = 2)
    comment_RSE_evaluation(comment_true_cnt, comment_pred_cnt)
    classification_evaluation(y_true, y_pred)
    
    return IPW_acc, single_factor_acc # return the accuracies
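A hypothetical driver for the entry point above, reusing the topic-list path from Example #1 and parameter values the other examples hard-code (group id 'zhuangb' is taken from the pop_level comments; all values here are illustrative, and load_id_list is assumed to be importable as in the other examples):

group_id = 'zhuangb'
topic_list = load_id_list('data-dynamic/' + group_id + '-post-list.txt')
# threshold_p=0.7, prediction window 30h, response time 24h, 3h sampling gap, k=3
IPW_acc, single_factor_acc = main(group_id, topic_list, 0.7, 30, 24, 3, 3)
print 'IPW accuracy: ', IPW_acc
print 'Single-factor accuracies: ', single_factor_acc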