from config import feature_set_path, comment_data_path
import numpy as np


def extract_words(vectorizor, train_list, test_list):
    """Fit *vectorizor* on the training corpus and vectorize both corpora.

    Parameters: a vectorizer exposing fit/transform/get_feature_names
    (e.g. sklearn's CountVectorizer), plus the train and test document lists.
    Returns (train_matrix, test_matrix, feature_names).
    """
    fitted = vectorizor.fit(train_list)
    # Transform both corpora with the vocabulary learned from train only,
    # so no test-set terms leak into the feature space.
    train_matrix, test_matrix = (fitted.transform(docs)
                                 for docs in (train_list, test_list))
    return train_matrix, test_matrix, fitted.get_feature_names()

# NOTE(review): this fragment calls read_comments, read_toy_comments,
# read_slashdot_comments, extract_values and StratifiedShuffleSplit, but none
# of them are imported above -- as written it raises NameError at runtime.
# They appear to live in FeatureExtraction.mainExtractor (see the imports
# later in this file) and sklearn's old cross_validation module; confirm and
# add the imports.

# Which data set to load: 1 = News24 train/test, 2 = toy comments,
# 3 = Slashdot.  Renamed from `set`, which shadowed the builtin.
DATA_SET = 1

if __name__ == '__main__':
    if DATA_SET == 1:
        articleList, commentList, parentList, commentCount = read_comments(
            comment_data_path + 'trainTestDataSet.txt', skip_mtn=False)
    elif DATA_SET == 2:
        articleList, commentList, parentList, commentCount = read_toy_comments(
            comment_data_path + 'trainTestDataSet.txt',
            comment_data_path + 'toyComments.csv')
    elif DATA_SET == 3:
        articleList, commentList, commentCount = read_slashdot_comments(
            comment_data_path + 'slashdotDataSet.txt', limit=100000)

    # Target value per comment for the chosen data set.
    y = extract_values(articleList, commentList, commentCount, DATA_SET)
    # Old sklearn.cross_validation API: the splitter is constructed with y and
    # iterated directly, yielding (train_indices, test_indices) pairs.
    # One split, 5% train / 95% test, fixed seed for reproducibility.
    sss = StratifiedShuffleSplit(y, 1, test_size=0.95, random_state=42)
    y_train = []
    y_test = []
    for train, test in sss:
        # Persist the index arrays so the exact same split can be reused
        # by later feature-extraction runs.
        np.save('train_vect', train)
        np.save('test_vect', test)
        y_train = y[train]
        y_test = y[test]
'''
Created on 29 Jul 2015

@author: Dirk
'''
from FeatureExtraction.mainExtractor import read_slashdot_comments,\
    read_comments

from config import comment_data_path

if __name__ == '__main__':
    # Load the News24 article/comment data (comments keyed per article).
    articleList, commentList, parList, commentCount = read_comments(
        comment_data_path + 'trainTestDataSet.txt', skip=False)
    #articleList, commentList, commentCount = read_slashdot_comments(comment_data_path + 'slashdotDataSet.txt', skip=False)

    out_path = comment_data_path + 'News24CommData.txt'
    #out_path = comment_data_path + 'slashdotCommData.txt'

    # `with` guarantees the output file is closed even if a write fails
    # (the original open/close pair leaked the handle on error).
    with open(out_path, 'w') as out:
        # Only the comment lists are used, so iterate values() rather than
        # items() and indexing the pair.
        for comments in commentList.values():
            for comm in comments:
                # One "id|body" record per comment.
                out.write(comm.id + "|" + comm.body + "\n")

    # Parenthesized print works identically in Python 2 and 3
    # (the bare `print "..."` statement is Python-2-only).
    print("Done with file")