def kaggle():
    # Relies on module-level globals: datapath, and the TF-IDF settings v, k (n-gram range) and m (max_features).
    tr, te, trlab, telab = load_kaggle_data(datapath)
    kaggle_vectorizer = TfidfVectorizer(ngram_range=(v, k), max_features=m)
    X_train_kaggle = kaggle_vectorizer.fit_transform(tr)
    X_test_kaggle = kaggle_vectorizer.transform(te)
    kaggle_feats = [
        '_'.join(s.split()) for s in kaggle_vectorizer.get_feature_names()
    ]
    clf_kaggle = LogisticRegression(random_state=16,
                                    solver='saga',
                                    C=np.inf,  # effectively unregularized
                                    max_iter=10000).fit(X_train_kaggle, trlab)
    kaggle_coefs = clf_kaggle.coef_
    allcoefs_kaggle = pd.DataFrame.from_records(
        kaggle_coefs, columns=kaggle_feats)  # add ngrams as colnames
    allcoefs_kaggle.to_csv('kaggle_coefs.csv', sep='\t', index=False)
    preds_kaggle_test = clf_kaggle.predict(X_test_kaggle)
    preds_kaggle_train = clf_kaggle.predict(X_train_kaggle)
    print_scores(telab, preds_kaggle_test, "Kaggle Test Scores")
    print_scores(trlab, preds_kaggle_train, "Kaggle Train Scores")
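# Hypothetical wrapper (assumption, not in the original code): kaggle() reads the
# n-gram bounds v/k, the feature cap m, and datapath from module-level globals,
# so this helper binds them explicitly before delegating. The default values are
# illustrative placeholders, not the thesis settings.
def run_kaggle_tfidf(ngram_min=1, ngram_max=2, max_feats=10000,
                     data_dir="/home/ktj250/thesis/data/"):
    global v, k, m, datapath
    v, k, m = ngram_min, ngram_max, max_feats
    datapath = data_dir
    kaggle()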
def test_f(model, tokenizer, test_string):
    global SEQ_LEN
    global trainingdata
    #global tokenizer
    print("Testing on " + test_string)
    if test_string == "kaggle":
        _, test, _, test_lab = load_kaggle_data(datapath)
    elif test_string == "BS":
        _, test, _, test_lab = load_BS_data(datapath)
    elif test_string == "liar":
        _, _, test, _, _, test_lab = load_liar_data(datapath)
    test_lab = to_categorical(test_lab, 2)
    test_indices = []
    for i in test:
        ids, segments = tokenizer.encode(i, max_len=SEQ_LEN)
        test_indices.append(ids)
    preds = model.predict(
        [np.array(test_indices), np.zeros_like(test_indices)], verbose=0)
    #print("len "+test_string+" preds:", len(preds))
    #print("len "+test_string+" y_test", len(test_lab))
    #np.savetxt("BERT"+trainingdata+"_"+test_string+"_labels.txt", test_lab)
    #np.savetxt("BERT"+trainingdata+"_"+test_string+"_preds.txt", preds)
    #print(preds)
    print(
        test_string + " accuracy: ",
        accuracy_score(np.argmax(test_lab, axis=1), np.argmax(preds, axis=1)))
    print(
        test_string + " F1 score: ",
        f1_score(np.argmax(test_lab, axis=1),
                 np.argmax(preds, axis=1),
                 average="weighted"))
    tn, fp, fn, tp = confusion_matrix(np.argmax(test_lab, axis=1),
                                      np.argmax(preds, axis=1)).ravel()
    print("tn, fp, fn, tp")
    print(tn, fp, fn, tp)
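# Hypothetical helper (assumption, not in the original code): test_f() already
# handles the "liar", "kaggle", and "BS" test sets, so a caller can evaluate the
# same fine-tuned model on all of them in one pass. Assumes SEQ_LEN and datapath
# are set at module level and that the tokenizer is the keras-bert style object
# whose encode() returns (ids, segments), as test_f() expects.
def evaluate_all_test_sets(model, tokenizer):
    for name in ("liar", "kaggle", "BS"):
        test_f(model, tokenizer, name)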
trainingdata = "liar" #sys.argv[1] # "liar" or "kaggle" print("TRAINING WITH", trainingdata) NUM_LAYERS = 2 print("NUMBER OF LAYERS:", NUM_LAYERS) # kør med forskellige random seeds og tag gennemsnit. eller cross-validate. random.seed(16) np.random.seed(16) set_random_seed(16) if trainingdata == "liar": train, dev, test, train_lab, dev_lab, test_lab = load_liar_data(datapath) elif trainingdata == "kaggle": train, test, train_lab, test_lab = load_kaggle_data(datapath) elif trainingdata == "FNC": train, test, train_lab, test_lab = load_FNC_data(datapath) train = [nltk.word_tokenize(i.lower()) for i in train] test = [nltk.word_tokenize(i.lower()) for i in test] if trainingdata == "liar": dev = [nltk.word_tokenize(i.lower()) for i in dev] all_train_tokens = [] for i in train: for word in i: all_train_tokens.append(word)
def test_on_kaggle():
    print("Testing on kaggle")
    train, test, train_lab, test_lab = load_kaggle_data(datapath)
    model_loaded = load_model(model_path + '.h5')
    commons_testing(model_loaded, test, test_lab, trainingdata + "_kaggle")
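# Hypothetical wrapper (assumption, not in the original code): test_on_kaggle()
# reads model_path (without the '.h5' suffix), datapath, and trainingdata from
# module-level globals, so this helper binds the checkpoint path explicitly
# before delegating; the argument value is whatever path the caller saved to.
def test_saved_model_on_kaggle(saved_model_path):
    global model_path
    model_path = saved_model_path  # test_on_kaggle() appends '.h5' itself
    test_on_kaggle()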
def train_and_test(datapath="/home/ktj250/thesis/data/",
                   emb_model_path="/home/ktj250/thesis/",
                   TIMEDISTRIBUTED=False,
                   trainingdata="liar",
                   num_cells=32,
                   num_epochs=10,
                   dropout=0.4,
                   r_dropout=0.4,
                   num_batch=64,
                   learning_rate=0.0001):
    K.clear_session()
    #colab_directory_path = "/gdrive/My Drive/Thesis/"
    #TIMEDISTRIBUTED = False
    use_pretrained_embeddings = True
    FAKE = 1
    #trainingdata = sys.argv[1] #"liar" # kaggle, FNC, BS
    print("trainingdata=", trainingdata)
    if trainingdata == "liar":
        train, dev, test, train_lab, dev_lab, test_lab = load_liar_data(datapath)
    elif trainingdata == "kaggle":
        train, test, train_lab, test_lab = load_kaggle_data(datapath)
    elif trainingdata == "FNC":
        train, test, train_lab, test_lab = load_FNC_data(datapath)
    elif trainingdata == "BS":
        train, test, train_lab, test_lab = load_BS_data(datapath)

    train = [nltk.word_tokenize(i.lower()) for i in train]
    test = [nltk.word_tokenize(i.lower()) for i in test]
    if trainingdata == "liar":
        dev = [nltk.word_tokenize(i.lower()) for i in dev]
    else:
        # no dev split shipped with these corpora: use the last third of the training data as dev
        dev = train[int(abs((len(train_lab)/3)*2)):]
        dev_lab = train_lab[int(abs((len(train_lab)/3)*2)):]
        train = train[:int(abs((len(train_lab)/3)*2))]
        train_lab = train_lab[:int(abs((len(train_lab)/3)*2))]
    print(len(train), len(dev))

    all_train_tokens = []
    for i in train:
        for word in i:
            all_train_tokens.append(word)

    vocab = set(all_train_tokens)
    word2id = {word: i+1 for i, word in enumerate(vocab)}  # start ids at 1 so that 0 can be used for padding
    word2id["UNK"] = len(word2id)+1
    id2word = {v: k for k, v in word2id.items()}

    # trainTextsSeq: list of input sequences, one per document (num_samples * max_doc_length)
    trainTextsSeq = np.array([[word2id[w] for w in sent] for sent in train])
    testTextsSeq = np.array([[word2id.get(w, word2id["UNK"]) for w in sent] for sent in test])
    #if trainingdata == "liar":
    devTextsSeq = np.array([[word2id.get(w, word2id["UNK"]) for w in sent] for sent in dev])

    # PARAMETERS
    # vocab_size: number of tokens in the vocabulary
    vocab_size = len(word2id)+1
    # max_doc_length: length of documents after padding (in Keras, documents are usually padded to the same size)
    max_doc_length = 100  # LIAR 100 (like Wang), Kaggle 3391, FakeNewsCorpus 2669
    # num_samples: number of training/testing data samples
    num_samples = len(train_lab)
    # num_time_steps: number of time steps in the LSTM, usually the size of the input, i.e. max_doc_length
    num_time_steps = max_doc_length
    embedding_size = 300  # matches the 300-dimensional GoogleNews word2vec vectors
    # padding with max doc length
    seq = sequence.pad_sequences(trainTextsSeq, maxlen=max_doc_length, dtype='int32',
                                 padding='post', truncating='post', value=0.0)
    print("train seq shape", seq.shape)
    test_seq = sequence.pad_sequences(testTextsSeq, maxlen=max_doc_length, dtype='int32',
                                      padding='post', truncating='post', value=0.0)
    #if trainingdata == "liar":
    dev_seq = sequence.pad_sequences(devTextsSeq, maxlen=max_doc_length, dtype='int32',
                                     padding='post', truncating='post', value=0.0)

    if TIMEDISTRIBUTED:
        train_lab = tile_reshape(train_lab, num_time_steps)
        test_lab = tile_reshape(test_lab, num_time_steps)
        print(train_lab.shape)
        #if trainingdata == "liar":
        dev_lab = tile_reshape(dev_lab, num_time_steps)
    else:
        train_lab = to_categorical(train_lab, 2)
        test_lab = to_categorical(test_lab, 2)
        print(train_lab.shape)
        #if trainingdata == "liar":
        dev_lab = to_categorical(dev_lab, 2)

    print("Parameters:: num_cells: " + str(num_cells) + " num_samples: " + str(num_samples) +
          " embedding_size: " + str(embedding_size) + " epochs: " + str(num_epochs) +
          " batch_size: " + str(num_batch))

    if use_pretrained_embeddings:
        # https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
        # Load Google's pre-trained Word2Vec model.
        model = gensim.models.KeyedVectors.load_word2vec_format(
            emb_model_path + 'GoogleNews-vectors-negative300.bin', binary=True)
        embedding_matrix = np.zeros((len(word2id) + 1, 300))
        for word, i in word2id.items():
            try:
                embedding_vector = model.wv[word]
            except:
                embedding_vector = model.wv["UNK"]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

    myInput = Input(shape=(max_doc_length,), name='input')
    print(myInput.shape)
    if use_pretrained_embeddings:
        x = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                      weights=[embedding_matrix], input_length=max_doc_length,
                      trainable=True)(myInput)
    else:
        x = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                      input_length=max_doc_length)(myInput)
    print(x.shape)

    if TIMEDISTRIBUTED:
        lstm_out = LSTM(num_cells, dropout=dropout, recurrent_dropout=r_dropout,
                        return_sequences=True, kernel_constraint=NonNeg())(x)
        predictions = TimeDistributed(Dense(1, activation='sigmoid',
                                            kernel_constraint=NonNeg()))(lstm_out)
    else:
        lstm_out = Bidirectional(LSTM(num_cells, dropout=dropout,
                                      recurrent_dropout=r_dropout))(x)
        predictions = Dense(2, activation='softmax')(lstm_out)

    model = Model(inputs=myInput, outputs=predictions)
    opt = Adam(lr=learning_rate)
    if TIMEDISTRIBUTED:
        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
    else:
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

    print("fitting model..")
    #if trainingdata == "liar":
    history = model.fit({'input': seq}, train_lab, epochs=num_epochs, verbose=2,
                        batch_size=num_batch, validation_data=(dev_seq, dev_lab))
    #else:
    #    history = model.fit({'input': seq}, train_lab, epochs=num_epochs, verbose=2, batch_size=num_batch)

    print("Testing...")
    test_score = model.evaluate(test_seq, test_lab, batch_size=num_batch, verbose=0)
    #if trainingdata == "liar":
    dev_score = model.evaluate(dev_seq, dev_lab, batch_size=num_batch, verbose=0)
    print("Test loss:", test_score[0])
    print("Test accuracy:", test_score[1])
    #if trainingdata == "liar":
    print("Valid loss:", dev_score[0])
    print("Valid accuracy:", dev_score[1])

    f1 = None  # only computed in the non-time-distributed setting; avoids a NameError at the return below
    if not TIMEDISTRIBUTED:
        preds = model.predict(test_seq)
        f1 = f1_score(np.argmax(test_lab, axis=1), np.argmax(preds, axis=1))
        tn, fp, fn, tp = confusion_matrix(np.argmax(test_lab, axis=1),
                                          np.argmax(preds, axis=1)).ravel()
        print("tn, fp, fn, tp")
        print(tn, fp, fn, tp)
    model.summary()
    #if trainingdata=="liar":
    #    return dev_score[1], history
    #else:
    return test_score[1], dev_score[1], history, f1
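# Usage sketch (assumption about the calling code): the keyword values below are
# the function's own defaults, not necessarily the tuned thesis settings. With
# TIMEDISTRIBUTED=False the function returns test accuracy, dev accuracy, the
# Keras History object, and the test F1 score.
if __name__ == "__main__":
    test_acc, dev_acc, history, test_f1 = train_and_test(
        trainingdata="liar", num_cells=32, num_epochs=10,
        dropout=0.4, r_dropout=0.4, num_batch=64, learning_rate=0.0001)
    print("test acc:", test_acc, "dev acc:", dev_acc, "test F1:", test_f1)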
def pre_modelling_stuff(TIMEDISTRIBUTED=False,
                        trainingdata="liar",
                        datapath="/home/ktj250/thesis/data/",
                        emb_model_path="/home/ktj250/thesis/"):
    #directory_path = "/gdrive/My Drive/Thesis/"
    #TIMEDISTRIBUTED = False
    use_pretrained_embeddings = True
    FAKE = 1
    #trainingdata = sys.argv[1] #"liar" # kaggle, FNC, BS
    print("trainingdata=", trainingdata)
    if trainingdata == "liar":
        train, dev, test, train_lab, dev_lab, test_lab = load_liar_data(datapath)
    elif trainingdata == "kaggle":
        train, test, train_lab, test_lab = load_kaggle_data(datapath)
    elif trainingdata == "FNC":
        train, test, train_lab, test_lab = load_FNC_data(datapath)
    elif trainingdata == "BS":
        train, test, train_lab, test_lab = load_BS_data(datapath)

    train = [nltk.word_tokenize(i.lower()) for i in train]
    test = [nltk.word_tokenize(i.lower()) for i in test]
    if trainingdata == "liar":
        dev = [nltk.word_tokenize(i.lower()) for i in dev]
    else:
        # no dev split shipped with these corpora: use the last third of the training data as dev
        dev = train[int(abs((len(train_lab)/3)*2)):]
        dev_lab = train_lab[int(abs((len(train_lab)/3)*2)):]
        train = train[:int(abs((len(train_lab)/3)*2))]
        train_lab = train_lab[:int(abs((len(train_lab)/3)*2))]
    print(len(train), len(dev))

    all_train_tokens = []
    for i in train:
        for word in i:
            all_train_tokens.append(word)

    vocab = set(all_train_tokens)
    word2id = {word: i+1 for i, word in enumerate(vocab)}  # start ids at 1 so that 0 can be used for padding
    word2id["UNK"] = len(word2id)+1
    id2word = {v: k for k, v in word2id.items()}

    # trainTextsSeq: list of input sequences, one per document (num_samples * max_doc_length)
    trainTextsSeq = np.array([[word2id[w] for w in sent] for sent in train])
    testTextsSeq = np.array([[word2id.get(w, word2id["UNK"]) for w in sent] for sent in test])
    #if trainingdata == "liar":
    devTextsSeq = np.array([[word2id.get(w, word2id["UNK"]) for w in sent] for sent in dev])

    # PARAMETERS
    # vocab_size: number of tokens in the vocabulary
    vocab_size = len(word2id)+1
    # max_doc_length: length of documents after padding (in Keras, documents are usually padded to the same size)
    max_doc_length = 100  # LIAR 100 (like Wang), Kaggle 3391, FakeNewsCorpus 2669
    # num_samples: number of training/testing data samples
    num_samples = len(train_lab)
    # num_time_steps: number of time steps in the LSTM, usually the size of the input, i.e. max_doc_length
    num_time_steps = max_doc_length
    embedding_size = 300  # matches the 300-dimensional GoogleNews word2vec vectors

    # padding with max doc length
    seq = sequence.pad_sequences(trainTextsSeq, maxlen=max_doc_length, dtype='int32',
                                 padding='post', truncating='post', value=0.0)
    print("train seq shape", seq.shape)
    test_seq = sequence.pad_sequences(testTextsSeq, maxlen=max_doc_length, dtype='int32',
                                      padding='post', truncating='post', value=0.0)
    #if trainingdata == "liar":
    dev_seq = sequence.pad_sequences(devTextsSeq, maxlen=max_doc_length, dtype='int32',
                                     padding='post', truncating='post', value=0.0)

    if TIMEDISTRIBUTED:
        train_lab = tile_reshape(train_lab, num_time_steps)
        test_lab = tile_reshape(test_lab, num_time_steps)
        print(train_lab.shape)
        #if trainingdata == "liar":
        dev_lab = tile_reshape(dev_lab, num_time_steps)
    else:
        train_lab = to_categorical(train_lab, 2)
        test_lab = to_categorical(test_lab, 2)
        print(train_lab.shape)
        #if trainingdata == "liar":
        dev_lab = to_categorical(dev_lab, 2)

    if use_pretrained_embeddings:
        # https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
        # Load Google's pre-trained Word2Vec model.
        model = gensim.models.KeyedVectors.load_word2vec_format(
            emb_model_path + 'GoogleNews-vectors-negative300.bin', binary=True)
        embedding_matrix = np.zeros((len(word2id) + 1, 300))
        for word, i in word2id.items():
            try:
                embedding_vector = model.wv[word]
            except:
                embedding_vector = model.wv["UNK"]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

    #if trainingdata=="liar":
    return embedding_matrix, seq, test_seq, dev_seq, train_lab, test_lab, dev_lab, vocab_size
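# Usage sketch (hypothetical helper, not in the original code): run the shared
# preprocessing once and report the resulting shapes, so repeated model-building
# experiments can reuse the same arrays instead of re-tokenizing and re-padding.
def prepare_and_describe(trainingdata="liar", TIMEDISTRIBUTED=False):
    (embedding_matrix, seq, test_seq, dev_seq,
     train_lab, test_lab, dev_lab, vocab_size) = pre_modelling_stuff(
        TIMEDISTRIBUTED=TIMEDISTRIBUTED, trainingdata=trainingdata)
    print("embedding matrix:", embedding_matrix.shape, "vocab size:", vocab_size)
    print("train/dev/test sequences:", seq.shape, dev_seq.shape, test_seq.shape)
    return embedding_matrix, seq, test_seq, dev_seq, train_lab, test_lab, dev_lab, vocab_size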