import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# read_comments, read_toy_comments, read_slashdot_comments, read_news24_comments,
# extract_values, extractSaveValues, extract_global_bag_of_words_processed,
# load_numpy_matrix and the comment_data_path / feature_set_path constants come
# from this project's own helper modules (not shown here).


def extract_words(vectorizer, train_list, test_list):
    """Fit the vectorizer on the training texts only, then transform both splits."""
    count_vect = vectorizer.fit(train_list)
    train = count_vect.transform(train_list)
    test = count_vect.transform(test_list)
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
    return train, test, count_vect.get_feature_names_out()
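
# Illustrative usage sketch (not part of the original project; the toy texts
# are made up): extract_words relies only on the fit/transform interface, so
# any scikit-learn vectorizer can be passed in.
from sklearn.feature_extraction.text import CountVectorizer

def _demo_extract_words():
    train_texts = ["the cat sat on the mat", "the dog chased the cat"]
    test_texts = ["a dog sat on a mat"]
    X_train, X_test, vocab = extract_words(CountVectorizer(), train_texts, test_texts)
    # Both matrices share the vocabulary fitted on the training texts only,
    # so no test-set terms leak into the feature space.
    assert X_train.shape[1] == X_test.shape[1] == len(vocab)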

dataset = 1  # 1 = News24 train/test set, 2 = toy comments, 3 = Slashdot
if __name__ == '__main__':
    if dataset == 1:
        articleList, commentList, parentList, commentCount = read_comments(comment_data_path + 'trainTestDataSet.txt', skip_mtn=False)
    elif dataset == 2:
        articleList, commentList, parentList, commentCount = read_toy_comments(comment_data_path + 'trainTestDataSet.txt', comment_data_path + 'toyComments.csv')
    elif dataset == 3:
        articleList, commentList, commentCount = read_slashdot_comments(comment_data_path + 'slashdotDataSet.txt', limit=100000)
    
    # Target values for stratification
    y = extract_values(articleList, commentList, commentCount, dataset)
    # Modern scikit-learn splitter API: the constructor takes only the split
    # parameters and split() yields (train, test) index arrays. X is unused by
    # StratifiedShuffleSplit beyond its length, so a dummy array suffices.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.95, random_state=42)
    for train, test in sss.split(np.zeros(len(y)), y):
        # Persist the index arrays so the exact split can be rebuilt later.
        np.save('train_vect', train)
        np.save('test_vect', test)
        y_train = y[train]
        y_test = y[test]
    
    processed_comment_list = extract_global_bag_of_words_processed(commentList)  
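
# Sketch (illustrative, not from the original project): because the split
# indices were persisted with np.save above, any feature matrix whose rows are
# aligned with commentList can later be partitioned into the exact same
# train/test rows without re-running the splitter.
def _reload_split(features):
    train_idx = np.load('train_vect.npy')  # np.save appends the .npy suffix
    test_idx = np.load('test_vect.npy')
    return features[train_idx], features[test_idx]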
Example #2

dataset = 3  # 1 = News24 train/test set, 2 = toy comments, 3 = Slashdot
if __name__ == '__main__':
    print('START')

    # Load all the comments for the selected data set into a DataFrame.
    if dataset == 1:
        df_comments = read_news24_comments(comment_data_path + 'trainTestDataSet.txt')
        set_tag = "_news24"
        tag = '_main'
    elif dataset == 2:
        df_comments = read_toy_comments(comment_data_path + 'trainTestDataSet.txt', comment_data_path + 'toyComments.csv')
        set_tag = "_news24"
        tag = '_toy'
    elif dataset == 3:
        df_comments = read_slashdot_comments(comment_data_path + 'slashdotDataSet_latest.txt')
        set_tag = "_slashdot"
        tag = '_slashdot'

    # DataFrame.sort() has been removed from pandas; sort_values() is the
    # current API. Sorting by date first puts every thread group below in
    # chronological order.
    df_comments = df_comments.sort_values('date').reset_index(drop=True)
    df_thread_groupby = df_comments.groupby('thread_root_id')

    print(df_comments.shape)
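
# Sketch (illustrative; 'date' and 'thread_root_id' are the only column names
# taken from the snippet above): since the frame was sorted by date before
# grouping, each group is one discussion thread in chronological order, which
# makes per-thread statistics a one-liner.
def _thread_sizes(df_thread_groupby):
    # Number of comments per thread, largest threads first.
    return df_thread_groupby.size().sort_values(ascending=False)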
Example #3

dataset = 3  # 1 = News24 train/test set, 2 = toy comments, 3 = Slashdot
if __name__ == '__main__':
    print('START')
    # setup()

    # To process all the comments
    if dataset == 1:
        articleList, commentList, parentList, commentCount = read_comments(
            comment_data_path + 'trainTestDataSet.txt')
        set_tag = "_news24"
        tag = '_main'
    elif dataset == 2:
        articleList, commentList, parentList, commentCount = read_toy_comments(
            comment_data_path + 'trainTestDataSet.txt',
            comment_data_path + 'toyComments.csv')
        set_tag = "_news24"
        tag = '_toy'
    elif dataset == 3:
        articleList, commentList, commentCount = read_slashdot_comments(
            comment_data_path + 'slashdotDataSet.txt', limit=100000)
        set_tag = "_slashdot"
        tag = '_slashdot'

    print "Processed", commentCount, "Comments"

    extractSaveValues(articleList, commentList, commentCount,
                      feature_set_path + "valueVector" + tag, dataset)
    y = load_numpy_matrix(feature_set_path + r'valueVector' + tag + '.npy')
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.40, random_state=42)
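
# Stand-in for readers without the project's helper module (assumption: the
# original load_numpy_matrix is a thin wrapper around np.load, since it reads
# the '.npy' file written by extractSaveValues above).
def load_numpy_matrix(path):
    return np.load(path)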