def loaddata(data_dir, mode='train', max_len=None):
    """Read one split of the data set and build its Lang objects.

    The file ``data_dir + mode + '.json'`` is read with ``readfile``,
    optionally truncated, and handed to ``readLang``. A short summary of
    what was read is printed along the way.

    Args:
        data_dir: A string prefix for the data location. It is concatenated
            directly with ``mode``, so it must already end with a path
            separator when it names a directory.
        mode: A string naming the split to load (train, valid, or test).
        max_len: If not None, only the first ``max_len`` entries of the
            loaded data are kept.

    Returns:
        A tuple ``(data_set, langs)``: the (possibly truncated) data and a
        dictionary mapping 'rt', 're', 'rm' and 'summary' to the four
        objects returned by ``readLang``.
    """
    records = readfile(data_dir + mode + '.json')
    if max_len is not None:
        records = records[:max_len]

    lang_rt, lang_re, lang_rm, lang_summary = readLang(records)
    langs = {
        'rt': lang_rt,
        're': lang_re,
        'rm': lang_rm,
        'summary': lang_summary,
    }

    print("Read %s data" % mode)
    print("Read %s box score summary" % len(records))
    print("Embedding size of (r.t, r.e, r.m) and summary:")
    vocab_sizes = [langs[key].n_words for key in ('rt', 're', 'rm', 'summary')]
    print("({}, {}, {}), {}".format(*vocab_sizes))

    return records, langs
def runTest(file1, version, model, mode='words'):
    """Fit and/or evaluate ``model`` on one stance detection file
    (SemEval 2016 competition) and print the prediction results.

    The input is read with ``readfile``, preprocessed, converted to a
    TF-IDF representation, and its labels (index 7 of every preprocessed
    record) are integer-encoded and then one-hot encoded with 3 classes.

    Parameters
    ----------
    file1 :
        input handed to ``readfile`` together with ``version``
    version : int
        0: Training dataset, 1: Test dataset, 2: Other domain dataset.
        With 0 the data is split 80/20, the model is fitted on the larger
        part and scored on the held-out part. With 1 or 2 the model is
        only evaluated on the whole file. Any other value prints the
        label classes and does nothing else.
    model :
        object providing ``summary``, ``fit``, ``evaluate`` and ``predict``
        (presumably a compiled Keras model -- confirm with the caller)
    mode : str
        choose either (words) or (hashtags); forwarded to
        ``getTfidfRepresentation``

    Returns
    -------
    None
        all results are printed
    """
    raw_rows = readfile(file1, version)
    data = preprocesstweets(raw_rows, ignoreNONE=False, version=version,
                            lowerCase=True)
    features = getTfidfRepresentation(data, version, mode)

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform([row[7] for row in data])
    print(label_encoder.classes_)

    if version == 0:
        x_train, x_test, y_train, y_test = train_test_split(
            features, y, test_size=0.2)
        y_test = np_utils.to_categorical(y_test, num_classes=3)
        y_train = np_utils.to_categorical(y_train, num_classes=3)

        print(x_train.shape[1])
        print(model.summary())
        model.fit(x_train, y_train, epochs=10, verbose=2,
                  validation_data=(x_test, y_test))

        # The figures labelled "Training" below are computed on the
        # held-out 20% split (x_test / y_test), not on x_train.
        _, acc = model.evaluate(x_test, y_test, verbose=0)
        ypred = model.predict(x_test)
        print('Training Accuracy: %f' % (acc * 100))
        print('Training F-Score: ', f1(y_test, ypred) * 100)

    if version not in (1, 2):
        return

    # Evaluation-only path: the model is scored on the entire file.
    y = np_utils.to_categorical(y, num_classes=3)
    _, acc = model.evaluate(features, y)
    ypred = model.predict(features)

    otherdomain = '(other domain)' if version == 2 else ''
    print('TEST Accuracy ' + otherdomain + ': %f' % (acc * 100))
    print('TEST F-Score ' + otherdomain + ': ', f1(y, ypred) * 100)
def loaddata(data_dir, mode='train', max_len=None, copy_player=COPY_PLAYER):
    """Load one data split and collect its Lang objects in a dictionary.

    Args:
        data_dir: String prefix of the data location; ``mode`` and '.json'
            are appended to it without inserting a separator.
        mode: Name of the split, used to build the file name.
        max_len: Optional cap; when not None only the first ``max_len``
            entries returned by ``readfile`` are kept.
        copy_player: Forwarded unchanged to ``readfile`` as the keyword
            ``copy_player``; defaults to the module constant COPY_PLAYER.

    Returns:
        A tuple ``(data_set, langs)`` where ``langs`` maps 'rt', 're', 'rm'
        and 'summary' to the four objects produced by ``readLang``.
    """
    json_path = data_dir + mode + '.json'
    loaded = readfile(json_path, copy_player=copy_player)
    data_set = loaded if max_len is None else loaded[:max_len]

    rt_lang, re_lang, rm_lang, summary_lang = readLang(data_set)

    # Progress report for whoever is watching the console.
    print("Read %s data" % mode)
    print("Read %s box score summary" % len(data_set))
    print("Embedding size of (r.t, r.e, r.m) and summary:")
    sizes = (rt_lang.n_words, re_lang.n_words, rm_lang.n_words,
             summary_lang.n_words)
    print("({}, {}, {}), {}".format(*sizes))

    return data_set, {
        'rt': rt_lang,
        're': re_lang,
        'rm': rm_lang,
        'summary': summary_lang,
    }
# External libraries
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Project files
from preprocessing import readfile, calc_length, flatten
from preprocessing import padding_data, one_hot_encoding
from preprocessing import CHANNELS

folder = "data"

# Load both splits. The second argument is 0 for the train file and 1 for
# the test file (its exact meaning is defined in preprocessing -- confirm).
train_data = readfile(f"{folder}/ae.train", 0)
test_data = readfile(f"{folder}/ae.test", 1)

# A single length computed over both splits is used to pad each of them.
maxlength = calc_length(test_data, train_data)
train_input, train_output = padding_data(train_data, maxlength, 0, True)
test_input, test_output = padding_data(test_data, maxlength, 1, True)

# Fit the PCA on the training data only, then reuse the fitted components
# for the test data. Calling fit_transform on the test set would learn a
# second, unrelated projection, so train and test features would no longer
# live in the same space.
pca = PCA(n_components=CHANNELS)
train_transformed = pca.fit_transform(train_input)
test_transformed = pca.transform(test_input)

# NOTE(review): re-verify the accuracy recorded below; it is only meaningful
# when the test data is projected with the PCA fitted on the training data.
"""
Logistic Regression with PCA
folds: 5, 10, 15
accuracy: 0.3162162162162162
"""
# cv = KFold(n_splits=10, random_state=42, shuffle=True)
# clf = LogisticRegression()
# scores = []
print(x_train.shape[1]) print(model.summary()) model.fit(x_train, y_train, epochs=10, verbose=2, validation_data=(x_test, y_test)) loss, acc = model.evaluate(x_test, y_test, verbose=0) ypred = model.predict(x_test) print('Training Accuracy: %f' % (acc * 100)) print('Training F-Score: ', f1(y_test, ypred) * 100) if version == 1 or version == 2: y = np_utils.to_categorical(y, num_classes=3) loss, acc = model.evaluate(tfidfAdded, y) ypred = model.predict(tfidfAdded) otherdomain = '' if version == 2: otherdomain = '(other domain)' print('TEST Accuracy ' + otherdomain + ': %f' % ((acc * 100))) print('TEST F-Score ' + otherdomain + ': ', (f1(y, ypred) * 100)) indata = readfile('SemEval2016-Task6-subtaskA-traindata-gold.csv', 0) data = preprocesstweets(indata, ignoreNONE=False, version=0, lowerCase=True) tfidfAdded = getTfidfRepresentation(data, 0, 'words') model = createModel(tfidfAdded.shape[1]) runTest('SemEval2016-Task6-subtaskA-traindata-gold.csv', 0, model, 'words') runTest('SemEval2016-Task6-subtaskA-testdata-gold.txt', 1, model, 'words') runTest('stance.csv', 2, model, 'words')
validation_data=(Xtest, y_test), verbose=2) print('History', history.history) # evaluate print('Predicting (training)..') ypred = model.predict(Xtest) print('Accuracy (TRAIN): %f' % (model.evaluate(Xtest,y_test)[0]*100)) print('FScore (TRAIN): %f' % (f1(y_test, ypred)*100)) print('Predicting (testing)..') #ypred = model.predict(XtestGroup) #print('Accuracy (TEST): %f' % (model.evaluate(XtestGroup,y_testGroup)[0]*100)) #print('FScore (TEST): %f' % (f1(y_testGroup,ypred)*100)) indata = readfile('SemEval2016-Task6-subtaskA-traindata-gold.csv', False) data = preprocesstweets(indata,ignoreNONE=False, version =0) tweets = [' '.join(d[0]) for d in data] stances = [d[7] for d in data] encoder = LabelEncoder() stances = encoder.fit_transform(stances) indata = readfile('SemEval2016-Task6-subtaskA-testdata-gold.txt', True) data = preprocesstweets(indata,ignoreNONE=False, version =1) tweets2 = [' '.join(d[0]) for d in data] stances2 = [d[7] for d in data] stances2 = encoder.fit_transform(stances2) convModel(tweets, stances, tweets2, stances2) #The model is not working currently, last edits caused a problem. Reported results were from previous stage