Пример #1
0
def pre_process():
    """Build the filtered n-gram candidate list for the selected data set.

    The data set is chosen by the contents of a file named ``mode`` in the
    current working directory: ``'0'`` selects ``clean_train_set`` and
    ``'1'`` selects ``clean_test_set``. Surrounding whitespace (for example
    a trailing newline) is ignored.

    The mono-, bi- and tri-gram collections returned by ``T`` are
    concatenated and then passed, in order, through the empty-token,
    numeric-token, non-capitalised-token, negative-word and mixed-word
    filters.

    Side effect: the process working directory is moved one level up
    (``os.chdir('..')``) before returning.
    NOTE(review): presumably this undoes a chdir done by one of the helpers
    called here -- confirm against their implementations.

    Returns:
        A ``(n_gram, one_gram)`` tuple: the filtered combined n-grams and
        the unfiltered mono-grams.

    Raises:
        ValueError: if the ``mode`` file contains anything other than
            ``'0'`` or ``'1'``.
    """
    # Context manager guarantees the handle is closed even if read() fails.
    with open("mode", "r") as mode_file:
        mode = mode_file.read().strip()

    corpus_dirs = {'0': "clean_train_set", '1': "clean_test_set"}
    if mode not in corpus_dirs:
        # Fail with a clear message instead of an unbound-variable error
        # further down.
        raise ValueError(
            "unsupported mode {!r}; expected '0' or '1'".format(mode)
        )
    corpus_dir = corpus_dirs[mode]

    one_gram = T.create_mono_gram(corpus_dir)
    two_gram = T.create_bi_gram(corpus_dir, one_gram)
    three_gram = T.create_tri_gram(corpus_dir, one_gram)
    n_gram = one_gram + two_gram + three_gram

    # The negative-word list is read from the "Data" directory, not from the
    # corpus directory.
    negative_words = get_negative_words("Data")

    # Filter order is kept exactly as in the original pipeline.
    n_gram = remove_empty_tokens(n_gram)
    n_gram = remove_numeric_tokens(n_gram)
    n_gram = remove_non_cap_tokens(n_gram)
    n_gram = remove_negative_tokens(n_gram, negative_words)
    n_gram = remove_mixed_words(n_gram)

    os.chdir('..')
    return n_gram, one_gram