def merge_ham_spam():
    """Build one labelled DataFrame from the ham and spam text files.

    Each file is read with ``load.load_text`` and passed through ``cut``
    (both defined outside this block). The results become a ``sent``
    column; ham rows get ``label`` 0 and spam rows get ``label`` 1.

    Returns:
        The ham frame followed by the spam frame, concatenated with
        ``pd.concat`` (each part keeps its own index).
    """
    # load from txt
    import load

    ham_sentences = cut(load.load_text("./data/ham.txt"))
    spam_sentences = cut(load.load_text("./data/spam.txt"))

    # ham -> label 0, spam -> label 1
    ham_frame = pd.Series(ham_sentences).to_frame("sent").assign(label=0)
    spam_frame = pd.Series(spam_sentences).to_frame("sent").assign(label=1)

    # merge
    merged = pd.concat([ham_frame, spam_frame])
    # merged.to_pickle("./data/sent_with_label.pkl")
    return merged
def main():
    """Convert the content text set to ids and pickle the result.

    Steps, in order: read ``./data/content.txt`` via ``load.load_text``,
    pass it through ``cut``, wrap it in a one-column (``sent``) DataFrame,
    apply ``sent_list_to_id`` with the dictionary from
    ``load.load_word_dict``, and write the frame to ``./data/content.pkl``.
    """
    import load

    # load content (text set), then cut
    sentences = cut(load.load_text("./data/content.txt"))
    frame = pd.Series(sentences).to_frame("sent")

    # load word2id
    vocabulary = load.load_word_dict()

    # to_id
    frame = sent_list_to_id(frame, vocabulary)
    frame.to_pickle("./data/content.pkl")
# -*- coding: utf-8 -*-
import re

# import 20
import load

title = "イギリス"
fn = "jawiki-country.json.gz"

# Raw string literal: the pattern relies on \[ \| \] reaching the regex
# engine verbatim. In a normal string those are invalid escape sequences
# (SyntaxWarning on Python 3.12+). Compiled once at import time.
_CATEGORY_LINE = re.compile(r"^\[\[Category:(.*?)(\|.*\]\]|\]\])")


def getAllCategory(text):
    """Return the category names declared in *text*.

    A line counts when it starts with ``[[Category:``; the captured name
    is the text up to the first ``|`` or the closing ``]]``.

    Args:
        text: The article text as a single string; it is split into lines
            with ``str.splitlines``.

    Returns:
        A list of category names in the order their lines appear.
    """
    # Match each line once and reuse the match object for the capture.
    matches = (_CATEGORY_LINE.match(line) for line in text.splitlines())
    return [m.group(1) for m in matches if m]


text = load.load_text(fn, title)
print(getAllCategory(text))
from pair_extract import pair
from apply_to_hotels import load_hotel


def y_nn():
    """Read ``y_nn.txt`` and return its whitespace-separated integers as a list."""
    with open('y_nn.txt') as handle:
        return [int(token) for token in handle.read().split()]


if __name__ == '__main__':
    # NOTE(review): load_text and svm are not imported in the visible part
    # of this file -- confirm where they are expected to come from.

    # y_result = y_nn()
    # print(y_result)

    # Training data: the fourth value returned by load_text is discarded.
    (
        X_pure_train,
        X_sentences_train,
        aspects_list_train,
        _,
    ) = load_text('SentiRuEval_rest_markup_train.xml')

    # The test set comes from load_hotel() instead of the restaurant markup:
    # X_pure_test, X_sentences_test, aspects_list_test, X_p = load_text('SentiRuEval_rest_markup_test.xml')
    X_pure_test, X_sentences_test, X_p = load_hotel()

    y_result1, y_result2, y_result3 = svm(
        X_pure_train,
        X_sentences_train,
        aspects_list_train,
        X_pure_test,
        X_sentences_test,
    )

    # Only the third svm result is passed on to pair().
    pair(X_p, y_result3)

    # y_result = lingvistic(X_pure_test)
    # y_result = freq(X_pure_train, X_pure_test)
    # print (len(X_pure_test), len(y_result1))
    # save_result(X_pure_test, y_result, 'SentiRuEval_result_rest_test_on_rest_2LSTM.xml')