def FNC():
    """Fit a TF-IDF + logistic-regression baseline on the FakeNewsCorpus data.

    Loads the train/test split with ``load_FNC_data(datapath)``, vectorizes
    the texts with a ``TfidfVectorizer`` (n-gram range ``(v, k)``, at most
    ``m`` features), fits a ``LogisticRegression`` on the training vectors,
    writes the learned coefficients to ``FakeNewsCorpus_coefs.csv``
    (tab-separated, one column per feature) and reports test and train
    scores through ``print_scores``.

    ``datapath``, ``v``, ``k``, ``m``, ``load_FNC_data`` and ``print_scores``
    are taken from the enclosing scope. Returns nothing.
    """
    FNC_Xtrain, FNC_Xtest, FNC_ytrain, FNC_ytest = load_FNC_data(datapath)

    FNC_vectorizer = TfidfVectorizer(ngram_range=(v, k), max_features=m)
    FNC_Xtrain_vect = FNC_vectorizer.fit_transform(FNC_Xtrain)
    FNC_Xtest_vect = FNC_vectorizer.transform(FNC_Xtest)

    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); use whichever the installed version provides.
    feature_names = getattr(FNC_vectorizer, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = FNC_vectorizer.get_feature_names
    # n-gram features contain whitespace; join their tokens with "_" so each
    # feature becomes a single-token column name.
    FNC_feats = ['_'.join(s.split()) for s in feature_names()]

    # C is the inverse regularization strength, so np.inf effectively
    # switches the penalty off.
    clf_FNC = LogisticRegression(random_state=16,
                                 solver='saga',
                                 C=np.inf,
                                 max_iter=10000)
    clf_FNC.fit(FNC_Xtrain_vect, FNC_ytrain)

    allcoefs_FNC = pd.DataFrame.from_records(clf_FNC.coef_, columns=FNC_feats)
    allcoefs_FNC.to_csv("FakeNewsCorpus_coefs.csv", sep="\t", index=False)

    preds_FNC_test = clf_FNC.predict(FNC_Xtest_vect)
    preds_FNC_train = clf_FNC.predict(FNC_Xtrain_vect)

    print_scores(FNC_ytest, preds_FNC_test, "FakeNewsCorpus Test Scores")
    print_scores(FNC_ytrain, preds_FNC_train, "FakeNewsCorpus Train Scores")
예제 #2
0
# Script section: fix the random seeds, load the dataset selected by
# `trainingdata`, tokenize it, and make sure a dev split exists.
print("TRAINING WITH", trainingdata)

NUM_LAYERS = 1
print("NUMBER OF LAYERS:", NUM_LAYERS)

# TODO: run with several different random seeds and average the results,
# or cross-validate.
random.seed(42)
np.random.seed(42)
set_random_seed(42)

if trainingdata == "liar":
    # LIAR is the only loader that returns its own dev split.
    train, dev, test, train_lab, dev_lab, test_lab = load_liar_data(datapath)
elif trainingdata == "kaggle":
    train, test, train_lab, test_lab = load_kaggle_data(datapath)
elif trainingdata == "FNC":
    train, test, train_lab, test_lab = load_FNC_data(datapath)
elif trainingdata == "BS":
    train, test, train_lab, test_lab = load_BS_data(datapath)
else:
    # Fail here with a clear message instead of a NameError on `train` below.
    raise ValueError(
        "Unknown trainingdata {!r}; expected 'liar', 'kaggle', 'FNC' or 'BS'"
        .format(trainingdata))

# Lower-case and tokenize every document.
train = [nltk.word_tokenize(doc.lower()) for doc in train]
test = [nltk.word_tokenize(doc.lower()) for doc in test]

if trainingdata == "liar":
    dev = [nltk.word_tokenize(doc.lower()) for doc in dev]
else:
    # No dev split was loaded: keep the first two thirds for training and use
    # the remainder (already tokenized above) as the dev set.
    split_at = int(len(train_lab) / 3 * 2)
    dev, train = train[split_at:], train[:split_at]
    dev_lab, train_lab = train_lab[split_at:], train_lab[:split_at]
    print(len(train), len(dev))
예제 #3
0
def train_and_test(datapath="/home/ktj250/thesis/data/",
                   emb_model_path="/home/ktj250/thesis/",
                   TIMEDISTRIBUTED=False,
                   trainingdata="liar",
                   num_cells=32,
                   num_epochs=10,
                   dropout=0.4,
                   r_dropout=0.4,
                   num_batch=64,
                   learning_rate=0.0001):
    """Train an LSTM classifier on one dataset and evaluate it.

    Args:
        datapath: Passed to the dataset loader.
        emb_model_path: Prefix prepended to
            ``'GoogleNews-vectors-negative300.bin'`` to locate the word2vec
            file.
        TIMEDISTRIBUTED: If true, build a unidirectional LSTM that returns
            sequences, followed by a time-distributed sigmoid unit (labels
            are reshaped with ``tile_reshape``). Otherwise build a
            bidirectional LSTM with a 2-way softmax (labels are one-hot
            encoded).
        trainingdata: One of ``"liar"``, ``"kaggle"``, ``"FNC"``, ``"BS"``.
        num_cells: Number of LSTM units.
        num_epochs: Number of training epochs.
        dropout: Input dropout of the LSTM.
        r_dropout: Recurrent dropout of the LSTM.
        num_batch: Batch size for fitting and evaluation.
        learning_rate: Learning rate handed to Adam.

    Returns:
        Tuple ``(test_accuracy, dev_accuracy, history, f1)``. ``f1`` is the
        F1 score on the test set, or ``None`` when ``TIMEDISTRIBUTED`` is
        true (it is not computed for that model).

    Raises:
        ValueError: If ``trainingdata`` is not a recognised dataset name.
    """
    K.clear_session()

    use_pretrained_embeddings = True

    print("trainingdata=", trainingdata)

    if trainingdata == "liar":
        # LIAR is the only loader that returns its own dev split.
        train, dev, test, train_lab, dev_lab, test_lab = load_liar_data(datapath)
    elif trainingdata == "kaggle":
        train, test, train_lab, test_lab = load_kaggle_data(datapath)
    elif trainingdata == "FNC":
        train, test, train_lab, test_lab = load_FNC_data(datapath)
    elif trainingdata == "BS":
        train, test, train_lab, test_lab = load_BS_data(datapath)
    else:
        # Previously this fell through and crashed later with an
        # UnboundLocalError on `train`.
        raise ValueError(
            "Unknown trainingdata {!r}; expected 'liar', 'kaggle', 'FNC' or 'BS'"
            .format(trainingdata))

    train = [nltk.word_tokenize(doc.lower()) for doc in train]
    test = [nltk.word_tokenize(doc.lower()) for doc in test]

    if trainingdata == "liar":
        dev = [nltk.word_tokenize(doc.lower()) for doc in dev]
    else:
        # No dev split was loaded: keep the first two thirds for training and
        # use the remainder (already tokenized above) as the dev set.
        split_at = int(len(train_lab) / 3 * 2)
        dev, train = train[split_at:], train[:split_at]
        dev_lab, train_lab = train_lab[split_at:], train_lab[:split_at]
        print(len(train), len(dev))

    # Vocabulary is built from the (possibly reduced) training set only.
    # NOTE: set iteration order for strings can differ between interpreter
    # runs unless PYTHONHASHSEED is fixed, so the ids are not stable across
    # processes.
    vocab = {word for sent in train for word in sent}
    # Ids start at 1 so that 0 stays free for padding.
    word2id = {word: i + 1 for i, word in enumerate(vocab)}
    word2id["UNK"] = len(word2id) + 1
    unk_id = word2id["UNK"]

    # One list of token ids per document. These stay plain (ragged) lists:
    # pad_sequences accepts them directly, whereas np.array() on ragged
    # nested lists raises on recent NumPy versions.
    trainTextsSeq = [[word2id[w] for w in sent] for sent in train]
    testTextsSeq = [[word2id.get(w, unk_id) for w in sent] for sent in test]
    devTextsSeq = [[word2id.get(w, unk_id) for w in sent] for sent in dev]

    # PARAMETERS
    # vocab_size: number of ids including the padding id 0
    vocab_size = len(word2id) + 1
    # max_doc_length: document length after padding/truncation
    max_doc_length = 100  # LIAR 100 (like Wang), Kaggle 3391, FakeNewsCorpus 2669
    # num_samples: number of training samples
    num_samples = len(train_lab)
    # num_time_steps: number of LSTM time steps, equal to the padded length
    num_time_steps = max_doc_length
    embedding_size = 300  # matches the 300-dimensional word2vec vectors

    # Pad/truncate every document at the end to max_doc_length.
    pad_kwargs = dict(maxlen=max_doc_length, dtype='int32', padding='post',
                      truncating='post', value=0.0)
    seq = sequence.pad_sequences(trainTextsSeq, **pad_kwargs)
    print("train seq shape", seq.shape)
    test_seq = sequence.pad_sequences(testTextsSeq, **pad_kwargs)
    dev_seq = sequence.pad_sequences(devTextsSeq, **pad_kwargs)

    if TIMEDISTRIBUTED:
        train_lab = tile_reshape(train_lab, num_time_steps)
        test_lab = tile_reshape(test_lab, num_time_steps)
        print(train_lab.shape)
        dev_lab = tile_reshape(dev_lab, num_time_steps)
    else:
        train_lab = to_categorical(train_lab, 2)
        test_lab = to_categorical(test_lab, 2)
        print(train_lab.shape)
        dev_lab = to_categorical(dev_lab, 2)

    print("Parameters:: num_cells: "+str(num_cells)+" num_samples: "+str(num_samples)+" embedding_size: "+str(embedding_size)+" epochs: "+str(num_epochs)+" batch_size: "+str(num_batch))

    if use_pretrained_embeddings:
        # https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
        # Load Google's pre-trained Word2Vec model.
        w2v = gensim.models.KeyedVectors.load_word2vec_format(
            emb_model_path + 'GoogleNews-vectors-negative300.bin', binary=True)

        # "UNK" is itself a key of word2id, so this lookup always happened;
        # doing it once up front just avoids repeating it per missing word.
        unk_vector = w2v["UNK"]

        # Row 0 (padding) stays all-zero.
        embedding_matrix = np.zeros((vocab_size, embedding_size))
        for word, i in word2id.items():
            try:
                embedding_matrix[i] = w2v[word]
            except KeyError:
                # Word has no pre-trained vector: fall back to the UNK vector.
                embedding_matrix[i] = unk_vector

    myInput = Input(shape=(max_doc_length,), name='input')
    print(myInput.shape)
    if use_pretrained_embeddings:
        x = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                      weights=[embedding_matrix],
                      input_length=max_doc_length, trainable=True)(myInput)
    else:
        x = Embedding(input_dim=vocab_size, output_dim=embedding_size,
                      input_length=max_doc_length)(myInput)
        print(x.shape)

    if TIMEDISTRIBUTED:
        lstm_out = LSTM(num_cells, dropout=dropout, recurrent_dropout=r_dropout,
                        return_sequences=True, kernel_constraint=NonNeg())(x)
        predictions = TimeDistributed(
            Dense(1, activation='sigmoid', kernel_constraint=NonNeg()))(lstm_out)
        loss = 'binary_crossentropy'
    else:
        lstm_out = Bidirectional(
            LSTM(num_cells, dropout=dropout, recurrent_dropout=r_dropout))(x)
        predictions = Dense(2, activation='softmax')(lstm_out)
        loss = 'categorical_crossentropy'

    # Separate name from the word2vec vectors above (the original reused
    # `model` for both).
    model = Model(inputs=myInput, outputs=predictions)

    opt = Adam(lr=learning_rate)
    model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])

    print("fitting model..")
    history = model.fit({'input': seq}, train_lab, epochs=num_epochs, verbose=2,
                        batch_size=num_batch,
                        validation_data=(dev_seq, dev_lab))

    print("Testing...")
    test_score = model.evaluate(test_seq, test_lab, batch_size=num_batch, verbose=0)
    dev_score = model.evaluate(dev_seq, dev_lab, batch_size=num_batch, verbose=0)

    print("Test loss:", test_score[0])
    print("Test accuracy:", test_score[1])
    print("Valid loss:", dev_score[0])
    print("Valid accuracy:", dev_score[1])

    # F1 is only computed for the softmax model; default to None so the
    # return statement below also works when TIMEDISTRIBUTED is true.
    f1 = None
    if not TIMEDISTRIBUTED:
        preds = model.predict(test_seq)
        y_true = np.argmax(test_lab, axis=1)
        y_pred = np.argmax(preds, axis=1)
        f1 = f1_score(y_true, y_pred)
        # labels=[0, 1] forces a 2x2 matrix even if one class never occurs,
        # so the four-way unpacking cannot fail.
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        print("tn, fp, fn, tp")
        print(tn, fp, fn, tp)

    model.summary()

    return test_score[1], dev_score[1], history, f1
예제 #4
0
def test_on_FNC():
    """Run the saved model against the FakeNewsCorpus test split.

    Loads the FNC data from ``datapath``, restores the Keras model stored at
    ``model_path + '.h5'`` and hands the test texts and labels to
    ``commons_testing`` under the name ``trainingdata + "_FNC"``.
    ``datapath``, ``model_path`` and ``trainingdata`` come from the enclosing
    scope.
    """
    print("Testing on FNC")
    # Only the test texts and labels are used; the training part is ignored.
    _, fnc_test, _, fnc_test_lab = load_FNC_data(datapath)
    saved_model = load_model(model_path + '.h5')
    run_name = trainingdata + "_FNC"
    commons_testing(saved_model, fnc_test, fnc_test_lab, run_name)
예제 #5
0
def pre_modelling_stuff(TIMEDISTRIBUTED=False,
                        trainingdata="liar",
                        datapath="/home/ktj250/thesis/data/",
                        emb_model_path="/home/ktj250/thesis/"):
    """Load a dataset and prepare everything needed to build the model.

    Tokenizes the texts, builds a vocabulary from the training set, converts
    the documents to padded id sequences, encodes the labels and builds an
    embedding matrix from the pre-trained word2vec vectors.

    Args:
        TIMEDISTRIBUTED: If true, labels are reshaped with ``tile_reshape``
            (one label per time step); otherwise they are one-hot encoded
            into 2 classes.
        trainingdata: One of ``"liar"``, ``"kaggle"``, ``"FNC"``, ``"BS"``.
        datapath: Passed to the dataset loader.
        emb_model_path: Prefix prepended to
            ``'GoogleNews-vectors-negative300.bin'`` to locate the word2vec
            file.

    Returns:
        Tuple ``(embedding_matrix, seq, test_seq, dev_seq, train_lab,
        test_lab, dev_lab, vocab_size)``.

    Raises:
        ValueError: If ``trainingdata`` is not a recognised dataset name.
    """
    use_pretrained_embeddings = True
    print("trainingdata=", trainingdata)

    if trainingdata == "liar":
        # LIAR is the only loader that returns its own dev split.
        train, dev, test, train_lab, dev_lab, test_lab = load_liar_data(datapath)
    elif trainingdata == "kaggle":
        train, test, train_lab, test_lab = load_kaggle_data(datapath)
    elif trainingdata == "FNC":
        train, test, train_lab, test_lab = load_FNC_data(datapath)
    elif trainingdata == "BS":
        train, test, train_lab, test_lab = load_BS_data(datapath)
    else:
        # Previously this fell through and crashed later with an
        # UnboundLocalError on `train`.
        raise ValueError(
            "Unknown trainingdata {!r}; expected 'liar', 'kaggle', 'FNC' or 'BS'"
            .format(trainingdata))

    train = [nltk.word_tokenize(doc.lower()) for doc in train]
    test = [nltk.word_tokenize(doc.lower()) for doc in test]

    if trainingdata == "liar":
        dev = [nltk.word_tokenize(doc.lower()) for doc in dev]
    else:
        # No dev split was loaded: keep the first two thirds for training and
        # use the remainder (already tokenized above) as the dev set.
        split_at = int(len(train_lab) / 3 * 2)
        dev, train = train[split_at:], train[:split_at]
        dev_lab, train_lab = train_lab[split_at:], train_lab[:split_at]
        print(len(train), len(dev))

    # Vocabulary is built from the (possibly reduced) training set only.
    # NOTE: set iteration order for strings can differ between interpreter
    # runs unless PYTHONHASHSEED is fixed, so the ids are not stable across
    # processes.
    vocab = {word for sent in train for word in sent}
    # Ids start at 1 so that 0 stays free for padding.
    word2id = {word: i + 1 for i, word in enumerate(vocab)}
    word2id["UNK"] = len(word2id) + 1
    unk_id = word2id["UNK"]

    # One list of token ids per document. These stay plain (ragged) lists:
    # pad_sequences accepts them directly, whereas np.array() on ragged
    # nested lists raises on recent NumPy versions.
    trainTextsSeq = [[word2id[w] for w in sent] for sent in train]
    testTextsSeq = [[word2id.get(w, unk_id) for w in sent] for sent in test]
    devTextsSeq = [[word2id.get(w, unk_id) for w in sent] for sent in dev]

    # PARAMETERS
    # vocab_size: number of ids including the padding id 0
    vocab_size = len(word2id) + 1
    # max_doc_length: document length after padding/truncation
    max_doc_length = 100  # LIAR 100 (like Wang), Kaggle 3391, FakeNewsCorpus 2669
    # num_time_steps: number of LSTM time steps, equal to the padded length
    num_time_steps = max_doc_length
    embedding_size = 300  # matches the 300-dimensional word2vec vectors

    # Pad/truncate every document at the end to max_doc_length.
    pad_kwargs = dict(maxlen=max_doc_length, dtype='int32', padding='post',
                      truncating='post', value=0.0)
    seq = sequence.pad_sequences(trainTextsSeq, **pad_kwargs)
    print("train seq shape", seq.shape)
    test_seq = sequence.pad_sequences(testTextsSeq, **pad_kwargs)
    dev_seq = sequence.pad_sequences(devTextsSeq, **pad_kwargs)

    if TIMEDISTRIBUTED:
        train_lab = tile_reshape(train_lab, num_time_steps)
        test_lab = tile_reshape(test_lab, num_time_steps)
        print(train_lab.shape)
        dev_lab = tile_reshape(dev_lab, num_time_steps)
    else:
        train_lab = to_categorical(train_lab, 2)
        test_lab = to_categorical(test_lab, 2)
        print(train_lab.shape)
        dev_lab = to_categorical(dev_lab, 2)

    if use_pretrained_embeddings:
        # https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
        # Load Google's pre-trained Word2Vec model.
        w2v = gensim.models.KeyedVectors.load_word2vec_format(
            emb_model_path + 'GoogleNews-vectors-negative300.bin', binary=True)

        # "UNK" is itself a key of word2id, so this lookup always happened;
        # doing it once up front just avoids repeating it per missing word.
        unk_vector = w2v["UNK"]

        # Row 0 (padding) stays all-zero.
        embedding_matrix = np.zeros((vocab_size, embedding_size))
        for word, i in word2id.items():
            try:
                embedding_matrix[i] = w2v[word]
            except KeyError:
                # Word has no pre-trained vector: fall back to the UNK vector.
                embedding_matrix[i] = unk_vector

    return embedding_matrix, seq, test_seq, dev_seq, train_lab, test_lab, dev_lab, vocab_size