def FNC():
    FNC_Xtrain, FNC_Xtest, FNC_ytrain, FNC_ytest = load_FNC_data(datapath)

    # TF-IDF features; the n-gram range and vocabulary size come from the globals v, k and m.
    FNC_vectorizer = TfidfVectorizer(ngram_range=(v, k), max_features=m)
    FNC_Xtrain_vect = FNC_vectorizer.fit_transform(FNC_Xtrain)
    FNC_Xtest_vect = FNC_vectorizer.transform(FNC_Xtest)
    FNC_feats = ['_'.join(s.split()) for s in FNC_vectorizer.get_feature_names()]

    clf_FNC = LogisticRegression(random_state=16, solver='saga', C=np.inf,
                                 max_iter=10000).fit(FNC_Xtrain_vect, FNC_ytrain)

    # Save the learned coefficient per feature for later inspection.
    FNC_coefs = clf_FNC.coef_
    allcoefs_FNC = pd.DataFrame.from_records(FNC_coefs, columns=FNC_feats)
    allcoefs_FNC.to_csv("FakeNewsCorpus_coefs.csv", sep="\t", index=False)

    preds_FNC_test = clf_FNC.predict(FNC_Xtest_vect)
    preds_FNC_train = clf_FNC.predict(FNC_Xtrain_vect)
    print_scores(FNC_ytest, preds_FNC_test, "FakeNewsCorpus Test Scores")
    print_scores(FNC_ytrain, preds_FNC_train, "FakeNewsCorpus Train Scores")
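# A small inspection helper (a sketch, not part of the original pipeline): it reads the
# tab-separated coefficient file written by FNC() above and prints the n-grams with the
# largest negative and positive logistic-regression weights. The file name matches the
# one used in FNC(); the `top_n` parameter is illustrative.
def show_top_FNC_features(path="FakeNewsCorpus_coefs.csv", top_n=20):
    coefs = pd.read_csv(path, sep="\t")
    weights = coefs.iloc[0].sort_values()  # binary classifier -> a single row of coefficients
    print("Most negative features:")
    print(weights.head(top_n))
    print("Most positive features:")
    print(weights.tail(top_n))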
print("TRAINING WITH", trainingdata) NUM_LAYERS = 1 print("NUMBER OF LAYERS:", NUM_LAYERS) # kør med forskellige random seeds og tag gennemsnit. eller cross-validate. random.seed(42) np.random.seed(42) set_random_seed(42) if trainingdata == "liar": train, dev, test, train_lab, dev_lab, test_lab = load_liar_data(datapath) elif trainingdata == "kaggle": train, test, train_lab, test_lab = load_kaggle_data(datapath) elif trainingdata == "FNC": train, test, train_lab, test_lab = load_FNC_data(datapath) elif trainingdata == "BS": train, test, train_lab, test_lab = load_BS_data(datapath) train = [nltk.word_tokenize(i.lower()) for i in train] test = [nltk.word_tokenize(i.lower()) for i in test] if trainingdata == "liar": dev = [nltk.word_tokenize(i.lower()) for i in dev] else: dev = train[int(abs((len(train_lab) / 3) * 2)):] dev_lab = train_lab[int(abs((len(train_lab) / 3) * 2)):] train = train[:int(abs((len(train_lab) / 3) * 2))] train_lab = train_lab[:int(abs((len(train_lab) / 3) * 2))] print(len(train), len(dev))
def train_and_test(datapath="/home/ktj250/thesis/data/",
                   emb_model_path="/home/ktj250/thesis/",
                   TIMEDISTRIBUTED=False, trainingdata="liar", num_cells=32,
                   num_epochs=10, dropout=0.4, r_dropout=0.4, num_batch=64,
                   learning_rate=0.0001):
    K.clear_session()
    #colab_directory_path = "/gdrive/My Drive/Thesis/"
    #TIMEDISTRIBUTED = False
    use_pretrained_embeddings = True
    FAKE = 1
    #trainingdata = sys.argv[1]  # "liar", "kaggle", "FNC", "BS"
    print("trainingdata=", trainingdata)

    if trainingdata == "liar":
        train, dev, test, train_lab, dev_lab, test_lab = load_liar_data(datapath)
    elif trainingdata == "kaggle":
        train, test, train_lab, test_lab = load_kaggle_data(datapath)
    elif trainingdata == "FNC":
        train, test, train_lab, test_lab = load_FNC_data(datapath)
    elif trainingdata == "BS":
        train, test, train_lab, test_lab = load_BS_data(datapath)

    train = [nltk.word_tokenize(i.lower()) for i in train]
    test = [nltk.word_tokenize(i.lower()) for i in test]
    if trainingdata == "liar":
        dev = [nltk.word_tokenize(i.lower()) for i in dev]
    else:
        # no separate dev set: use the last third of the training data as dev
        dev = train[int(abs((len(train_lab) / 3) * 2)):]
        dev_lab = train_lab[int(abs((len(train_lab) / 3) * 2)):]
        train = train[:int(abs((len(train_lab) / 3) * 2))]
        train_lab = train_lab[:int(abs((len(train_lab) / 3) * 2))]
    print(len(train), len(dev))

    all_train_tokens = []
    for i in train:
        for word in i:
            all_train_tokens.append(word)

    vocab = set(all_train_tokens)
    word2id = {word: i + 1 for i, word in enumerate(vocab)}  # ids start at 1 so that 0 can be used for padding
    word2id["UNK"] = len(word2id) + 1
    id2word = {v: k for k, v in word2id.items()}

    # trainTextsSeq: list of input sequences, one per document (num_samples x max_doc_length)
    trainTextsSeq = np.array([[word2id[w] for w in sent] for sent in train])
    testTextsSeq = np.array([[word2id.get(w, word2id["UNK"]) for w in sent] for sent in test])
    #if trainingdata == "liar":
    devTextsSeq = np.array([[word2id.get(w, word2id["UNK"]) for w in sent] for sent in dev])

    # PARAMETERS
    # vocab_size: number of tokens in the vocabulary
    vocab_size = len(word2id) + 1
    # max_doc_length: length of documents after padding (Keras pads all documents to the same length)
    max_doc_length = 100  # LIAR 100 (like Wang), Kaggle 3391, FakeNewsCorpus 2669
    # num_samples: number of training/testing data samples
    num_samples = len(train_lab)
    # num_time_steps: number of time steps in the LSTM, equal to the input length, i.e. max_doc_length
    num_time_steps = max_doc_length
    embedding_size = 300  # dimensionality of the GoogleNews word2vec vectors
    # pad/truncate all sequences to max_doc_length
    seq = sequence.pad_sequences(trainTextsSeq, maxlen=max_doc_length, dtype='int32',
                                 padding='post', truncating='post', value=0.0)
    print("train seq shape", seq.shape)
    test_seq = sequence.pad_sequences(testTextsSeq, maxlen=max_doc_length, dtype='int32',
                                      padding='post', truncating='post', value=0.0)
    #if trainingdata == "liar":
    dev_seq = sequence.pad_sequences(devTextsSeq, maxlen=max_doc_length, dtype='int32',
                                     padding='post', truncating='post', value=0.0)

    if TIMEDISTRIBUTED:
        train_lab = tile_reshape(train_lab, num_time_steps)
        test_lab = tile_reshape(test_lab, num_time_steps)
        print(train_lab.shape)
        #if trainingdata == "liar":
        dev_lab = tile_reshape(dev_lab, num_time_steps)
    else:
        train_lab = to_categorical(train_lab, 2)
        test_lab = to_categorical(test_lab, 2)
        print(train_lab.shape)
        #if trainingdata == "liar":
        dev_lab = to_categorical(dev_lab, 2)

    print("Parameters:: num_cells: " + str(num_cells) + " num_samples: " + str(num_samples)
          + " embedding_size: " + str(embedding_size) + " epochs: " + str(num_epochs)
          + " batch_size: " + str(num_batch))

    if use_pretrained_embeddings:
        # https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
        # Load Google's pre-trained Word2Vec model and build the embedding matrix.
        model = gensim.models.KeyedVectors.load_word2vec_format(
            emb_model_path + 'GoogleNews-vectors-negative300.bin', binary=True)
        embedding_matrix = np.zeros((len(word2id) + 1, 300))
        for word, i in word2id.items():
            try:
                embedding_vector = model[word]
            except KeyError:
                embedding_vector = model["UNK"]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

    myInput = Input(shape=(max_doc_length,), name='input')
    print(myInput.shape)
    if use_pretrained_embeddings:
        x = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                      weights=[embedding_matrix], input_length=max_doc_length,
                      trainable=True)(myInput)
    else:
        x = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                      input_length=max_doc_length)(myInput)
    print(x.shape)

    if TIMEDISTRIBUTED:
        lstm_out = LSTM(num_cells, dropout=dropout, recurrent_dropout=r_dropout,
                        return_sequences=True, kernel_constraint=NonNeg())(x)
        predictions = TimeDistributed(Dense(1, activation='sigmoid',
                                            kernel_constraint=NonNeg()))(lstm_out)
    else:
        lstm_out = Bidirectional(LSTM(num_cells, dropout=dropout,
                                      recurrent_dropout=r_dropout))(x)
        predictions = Dense(2, activation='softmax')(lstm_out)

    model = Model(inputs=myInput, outputs=predictions)
    opt = Adam(lr=learning_rate)
    if TIMEDISTRIBUTED:
        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
    else:
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

    print("fitting model..")
    #if trainingdata == "liar":
    history = model.fit({'input': seq}, train_lab, epochs=num_epochs, verbose=2,
                        batch_size=num_batch, validation_data=(dev_seq, dev_lab))
    #else:
    #    history = model.fit({'input': seq}, train_lab, epochs=num_epochs, verbose=2, batch_size=num_batch)

    print("Testing...")
    test_score = model.evaluate(test_seq, test_lab, batch_size=num_batch, verbose=0)
    #if trainingdata == "liar":
    dev_score = model.evaluate(dev_seq, dev_lab, batch_size=num_batch, verbose=0)
    print("Test loss:", test_score[0])
    print("Test accuracy:", test_score[1])
    #if trainingdata == "liar":
    print("Valid loss:", dev_score[0])
    print("Valid accuracy:", dev_score[1])

    f1 = None  # only computed in the non-time-distributed (sequence classification) setup
    if not TIMEDISTRIBUTED:
        preds = model.predict(test_seq)
        f1 = f1_score(np.argmax(test_lab, axis=1), np.argmax(preds, axis=1))
        tn, fp, fn, tp = confusion_matrix(np.argmax(test_lab, axis=1),
                                          np.argmax(preds, axis=1)).ravel()
        print("tn, fp, fn, tp")
        print(tn, fp, fn, tp)
    model.summary()
    #if trainingdata == "liar":
    #    return dev_score[1], history
    #else:
    return test_score[1], dev_score[1], history, f1
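# Example call (a sketch; the values shown are simply the function's defaults, not
# the tuned settings from the experiments):
# test_acc, dev_acc, history, f1 = train_and_test(trainingdata="liar", num_cells=32,
#                                                 num_epochs=10, num_batch=64,
#                                                 learning_rate=0.0001)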
def test_on_FNC():
    print("Testing on FNC")
    train, test, train_lab, test_lab = load_FNC_data(datapath)
    model_loaded2 = load_model(model_path + '.h5')
    commons_testing(model_loaded2, test, test_lab, trainingdata + "_FNC")
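# Usage sketch: test_on_FNC() relies on the module-level globals `datapath`,
# `model_path` and `trainingdata`, and on the helper `commons_testing`, all assumed
# to be defined elsewhere in this script. The path below is purely illustrative:
# model_path = "/home/ktj250/thesis/models/liar_lstm"
# test_on_FNC()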
def pre_modelling_stuff(TIMEDISTRIBUTED=False, trainingdata="liar",
                        datapath="/home/ktj250/thesis/data/",
                        emb_model_path="/home/ktj250/thesis/"):
    #directory_path = "/gdrive/My Drive/Thesis/"
    #TIMEDISTRIBUTED = False
    use_pretrained_embeddings = True
    FAKE = 1
    #trainingdata = sys.argv[1]  # "liar", "kaggle", "FNC", "BS"
    print("trainingdata=", trainingdata)

    if trainingdata == "liar":
        train, dev, test, train_lab, dev_lab, test_lab = load_liar_data(datapath)
    elif trainingdata == "kaggle":
        train, test, train_lab, test_lab = load_kaggle_data(datapath)
    elif trainingdata == "FNC":
        train, test, train_lab, test_lab = load_FNC_data(datapath)
    elif trainingdata == "BS":
        train, test, train_lab, test_lab = load_BS_data(datapath)

    train = [nltk.word_tokenize(i.lower()) for i in train]
    test = [nltk.word_tokenize(i.lower()) for i in test]
    if trainingdata == "liar":
        dev = [nltk.word_tokenize(i.lower()) for i in dev]
    else:
        # no separate dev set: use the last third of the training data as dev
        dev = train[int(abs((len(train_lab) / 3) * 2)):]
        dev_lab = train_lab[int(abs((len(train_lab) / 3) * 2)):]
        train = train[:int(abs((len(train_lab) / 3) * 2))]
        train_lab = train_lab[:int(abs((len(train_lab) / 3) * 2))]
    print(len(train), len(dev))

    all_train_tokens = []
    for i in train:
        for word in i:
            all_train_tokens.append(word)

    vocab = set(all_train_tokens)
    word2id = {word: i + 1 for i, word in enumerate(vocab)}  # ids start at 1 so that 0 can be used for padding
    word2id["UNK"] = len(word2id) + 1
    id2word = {v: k for k, v in word2id.items()}

    # trainTextsSeq: list of input sequences, one per document (num_samples x max_doc_length)
    trainTextsSeq = np.array([[word2id[w] for w in sent] for sent in train])
    testTextsSeq = np.array([[word2id.get(w, word2id["UNK"]) for w in sent] for sent in test])
    #if trainingdata == "liar":
    devTextsSeq = np.array([[word2id.get(w, word2id["UNK"]) for w in sent] for sent in dev])

    # PARAMETERS
    # vocab_size: number of tokens in the vocabulary
    vocab_size = len(word2id) + 1
    # max_doc_length: length of documents after padding (Keras pads all documents to the same length)
    max_doc_length = 100  # LIAR 100 (like Wang), Kaggle 3391, FakeNewsCorpus 2669
    # num_samples: number of training/testing data samples
    num_samples = len(train_lab)
    # num_time_steps: number of time steps in the LSTM, equal to the input length, i.e. max_doc_length
    num_time_steps = max_doc_length
    embedding_size = 300  # dimensionality of the GoogleNews word2vec vectors

    # pad/truncate all sequences to max_doc_length
    seq = sequence.pad_sequences(trainTextsSeq, maxlen=max_doc_length, dtype='int32',
                                 padding='post', truncating='post', value=0.0)
    print("train seq shape", seq.shape)
    test_seq = sequence.pad_sequences(testTextsSeq, maxlen=max_doc_length, dtype='int32',
                                      padding='post', truncating='post', value=0.0)
    #if trainingdata == "liar":
    dev_seq = sequence.pad_sequences(devTextsSeq, maxlen=max_doc_length, dtype='int32',
                                     padding='post', truncating='post', value=0.0)

    if TIMEDISTRIBUTED:
        train_lab = tile_reshape(train_lab, num_time_steps)
        test_lab = tile_reshape(test_lab, num_time_steps)
        print(train_lab.shape)
        #if trainingdata == "liar":
        dev_lab = tile_reshape(dev_lab, num_time_steps)
    else:
        train_lab = to_categorical(train_lab, 2)
        test_lab = to_categorical(test_lab, 2)
        print(train_lab.shape)
        #if trainingdata == "liar":
        dev_lab = to_categorical(dev_lab, 2)

    if use_pretrained_embeddings:
        # https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
        # Load Google's pre-trained Word2Vec model and build the embedding matrix.
        model = gensim.models.KeyedVectors.load_word2vec_format(
            emb_model_path + 'GoogleNews-vectors-negative300.bin', binary=True)
        embedding_matrix = np.zeros((len(word2id) + 1, 300))
        for word, i in word2id.items():
            try:
                embedding_vector = model[word]
            except KeyError:
                embedding_vector = model["UNK"]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

    #if trainingdata == "liar":
    return embedding_matrix, seq, test_seq, dev_seq, train_lab, test_lab, dev_lab, vocab_size
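# Usage sketch: pre_modelling_stuff() returns the preprocessed inputs needed to build
# and train a model outside train_and_test(). The variable names below are illustrative.
# (embedding_matrix, seq, test_seq, dev_seq,
#  train_lab, test_lab, dev_lab, vocab_size) = pre_modelling_stuff(
#     TIMEDISTRIBUTED=False, trainingdata="liar")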