Example #1
def evaluateModel(model, trainPath, labelsPath, preprocesser=sparseBagOfWords):
	"""
	Evaluate the f1-score of a model.

	Parameters:
	-----------
		model: Class with fit and predict methods.
		trainPath (str): The path of the pickle of the training examples.
		labelsPath (str): The path of the pickle of the training labels.
		preprocesser (func): Function used to transform the list of sequences into a matrix.
	"""
	sequences = np.array(openPickle(trainPath))
	labels = toBoolList(openPickle(labelsPath))
	return evaluateModel_(model, sequences, labels, preprocesser=preprocesser)
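A minimal usage sketch, mirroring the __main__ block in Example #5 (the pickle paths are the ones used elsewhere in this project; the model choice is illustrative):

from sklearn.naive_bayes import MultinomialNB

# Evaluate a Naive Bayes baseline on the pickled training data.
model = MultinomialNB()
score = evaluateModel(model, "./Data/Learn/sequences.pkl", "./Data/Learn/labels.pkl")
print("f1-score: %.4f" % score)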
Example #2
def getPredictions(model, trainPath, labelsPath, testPath, preprocesser=sparseBagOfWords):
	"""
	Train a model and predict a testSet.

	Parameters:
	-----------
		model: Class with fit and predict methods.
		trainPath (str): The path of the pickle of the training examples.
		labelsPath (str): The path of the pickle of the training labels.
		testPath (str): The path of the pickle of the testing examples.
		preprocesser (func): Function used to transform the list of sequences into a matrix.
	"""
	sequences = np.array(openPickle(trainPath))
	labels = toBoolList(openPickle(labelsPath))

	trainSeq = preprocesser(sequences)
	n_features = trainSeq.shape[1]

	model.fit(trainSeq, labels)

	testSeq = openPickle(testPath)
	return model.predict(preprocesser(testSeq, shape=(len(testSeq), n_features)))
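Usage follows the same pattern as evaluateModel (a sketch; the test path is borrowed from Example #12 and the model choice is illustrative):

from sklearn.naive_bayes import MultinomialNB

# Fit on the training pickles, then predict the held-out test pickle.
predictions = getPredictions(MultinomialNB(), "./Data/Learn/sequences.pkl",
                             "./Data/Learn/labels.pkl", "./Data/Test/sequences.pkl")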
Example #3
def getTrainTest(labels):
    """
    Split the dataset into a training and a testing set, preserving the class balance of the labels.

    Parameters:
    -----------
        labels (string or list): Path to the pickle of the labels, or directly the labels.

    Returns:
    --------
        ((list<int>, list<int>)): A pair of lists of indices. The first list corresponds to
            the training set, the second one to the testing set.
    """
    if isinstance(labels, str):
        y = toBoolList(openPickle(labels))
    else:
        y = toBoolList(labels)

    X = np.arange(len(y))
    return train_test_split(X, stratify=y, random_state=42)
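Both call styles work, since labels may be a path or the labels themselves (a sketch of each):

# Pass the labels directly, as in Example #10...
trainInd, testInd = getTrainTest(labels)
# ...or let the function load them from the pickle.
trainInd, testInd = getTrainTest("./Data/Learn/labels.pkl")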
Example #4
# coding: utf-8

import numpy as np
import word2vec
import os

from xgboost.sklearn import XGBClassifier

from processing import evaluateModel
from preprocessing import getMeanVectors
from utils import openPickle, savePickle


if not os.path.isfile("./Data/Learn/embeddedMeanSequences.pkl"):
	encoder = openPickle("./Data/dict.pkl")
	decoder = {encoder[key]: key for key in encoder}

	w2v = word2vec.load("./Resources/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin")

	preprocesser = lambda x: getMeanVectors(x, w2v, decoder)

	sequences = np.array(openPickle("./Data/Learn/correctedSequences.pkl"))

	# Embed the sequences in chunks of 5000 to bound memory usage.
	for i in range(len(sequences) // 5000):
		if i == 0:
			embeddedSeq = preprocesser(sequences[0:5000])
		else:
			embeddedSeq = np.vstack((embeddedSeq, preprocesser(sequences[5000 * i: 5000 * (i + 1)])))
		print("Processed chunk i = %s" % i)

	# Embed the remaining tail (fewer than 5000 sequences); slicing from
	# 5000 * (i + 1) avoids re-embedding the last full chunk.
	embeddedSeq = np.vstack((embeddedSeq, preprocesser(sequences[5000 * (i + 1):])))
	savePickle("./Data/Learn/embeddedMeanSequences.pkl", embeddedSeq)
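The chunking can be written more compactly (a sketch, assuming getMeanVectors accepts any slice of the sequence array):

# np.array_split handles uneven tails and inputs shorter than 5000.
chunks = np.array_split(sequences, max(1, len(sequences) // 5000))
embeddedSeq = np.vstack([preprocesser(chunk) for chunk in chunks])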
Example #5
def getPredictions(model, trainPath, labelsPath, testPath, preprocesser=sparseBagOfWords):
	sequences = np.array(openPickle(trainPath))
	labels = toBoolList(openPickle(labelsPath))

	trainSeq = preprocesser(sequences)
	n_features = trainSeq.shape[1]

	model.fit(trainSeq, labels)

	testSeq = openPickle(testPath)
	return model.predict(preprocesser(testSeq, shape=(len(testSeq), n_features)))


if __name__ == "__main__":
	
	from sklearn.naive_bayes import MultinomialNB
	from utils import openPickle, top20Coefs
	import numpy as np

	model = MultinomialNB()
	evaluateModel(model, "./Data/Learn/sequences.pkl", "./Data/Learn/labels.pkl")

	encoder = openPickle("./Data/dict.pkl")
	decoder = {encoder[i]: i for i in encoder}
	coefs = top20Coefs(model)

	print("Most positives:")
	for coef in coefs[0]:
		print("%s: %s"%(decoder[coef], np.exp(-model.feature_log_prob_[0][coef])))
	
	print("\nMost negatives:")
	for coef in coefs[1]:
		print("%s: %s"%(decoder[coef], np.exp(-model.feature_log_prob_[1][coef])))
Example #6
def preprocessDeepModel(sequencesPath, outputPath, maxLen=None):
    """
    Preprocess the sequences to make them trainable by a deep model.

    Parameters:
    -----------
        sequencesPath (str): where the sequences are stored
        outputPath (str): where the preprocessed sequences will be stored
        maxLen (int): size of the padded sequences

    Returns:
    --------
        (np.array): the padded sequences.
    """

    modelPath = "./Resources/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin"

    # Download the model if needed
    if not os.path.isfile(modelPath):
        link = " http://embeddings.org/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin"
        os.system("wget -O " + modelPath + link)

    # Load the model
    w2v = word2vec.load(modelPath)
    vocab = set(w2v.vocab)

    # Load the encoder
    encoder = openPickle("./Data/dict.pkl")
    decoder = {encoder[key]: key for key in encoder}

    if not os.path.isfile(outputPath):

        fromOldToNew = reIndexToken(w2v, decoder)

        if not os.path.isfile("./Data/newDict.pkl"):

            newCoder = {"pad": 0, "unk": len(decoder) - 1}
            for key in decoder:
                if fromOldToNew[key] != len(decoder) - 1:
                    newCoder[decoder[key]] = fromOldToNew[key]

            savePickle("./Data/newDict.pkl", newCoder)
        else:
            newCoder = openPickle("./Data/newDict.pkl")

        if not os.path.isfile(sequencesPath):
            raise FileNotFoundError("Please run studyWord2Vec.py")

        sequences = openPickle(sequencesPath)
        sequences = reIndexSequences(sequences, fromOldToNew)
        savePickle(outputPath, sequences)
    else:
        sequences = openPickle(outputPath)

    if maxLen is None:
        maxLength = max([len(seq) for seq in sequences])
    else:
        maxLength = maxLen

    return pad_sequences(sequences, maxlen=maxLength)
Example #7
# coding: utf-8

import pickle
import numpy as np
from sentenceFunctions import wordCount

from utils import openPickle

# Let's open the files
d = openPickle("./Data/dict.pkl")

print("We have %s different words." % len(d.keys()))
for i, key in enumerate(d.keys()):
    print("%s: %s" % (key, d[key]))
    if i > 2:
        break

d_reverse = {d[i]: i for i in d}

sentences = openPickle("./Data/Learn/sentences.pkl")

print("")
print("We have %s sentences" % len(sentences))
for i, sentence in enumerate(sentences):
    print(sentence)
    if i > 2:
        break

sequences = openPickle("./Data/Learn/sequences.pkl")

print("")
Example #8
# coding: utf-8

from utils import openPickle

sequences = openPickle("./Resources/trysequences.pkl")

for i, sequence in enumerate(sequences):
    print(sequence)
    if i > 50:
        break
    """
	Remove from the sequences the words that appears only one time.

	Parameters:
	-----------
		sequences (list<list<int>>): list of sequences. A sequence is a list of int.
		wordOcc (dict): mapping from words to number of occurences

	Returns:
	--------
		(list<list<int>>): list of sequences without words that appears only one time.
	"""

    answer = []
    for sequence in sequences:
        answer.append([word for word in sequence if wordOcc[word] > 1])
    return answer


if __name__ == "__main__":

    from sentenceFunctions import wordCount
    from utils import openPickle
    import numpy as np

    sequences = openPickle("./Data/Learn/sequences.pkl")

    wordOcc = wordCount(sequences)
    notUniqueSequences = removeUniqueWords(sequences, wordOcc)

    # Count how many sequences become empty once unique words are removed.
    print(np.sum([len(sequence) == 0 for sequence in notUniqueSequences]))
Example #10
# coding: utf-8

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from utils import openPickle
from preprocessing import toBoolList, getTrainTest

cv = CountVectorizer(ngram_range=(1, 2))

X = cv.fit_transform(openPickle("./Data/Learn/sentences.pkl"))

labels = toBoolList(openPickle("./Data/Learn/labels.pkl"))

trainInd, testInd = getTrainTest(labels)

X_train, X_test = X[trainInd], X[testInd]
y_train, y_test = labels[trainInd], labels[testInd]

model = MultinomialNB(alpha=0.01)

model.fit(X_train, y_train)

trainScore = f1_score(y_train, model.predict(X_train))
testScore = f1_score(y_test, model.predict(X_test))

print("Training f1 score: %.4f" % trainScore)
print("Testing f1 score: %.4f" % testScore)
Example #11
import os
import numpy as np
import word2vec

from keras.models import Model, load_model
from keras.layers import Dense, LSTM, Input
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint

from preprocessing import embeddingMatrix, preprocessDeepModel, toBoolList, getTrainTest
from utils import openPickle

paddedSeq = preprocessDeepModel("./Data/Learn/correctedSequences.pkl",
                                "./Data/Learn/kerasSequences.pkl")

labels = np.array(toBoolList(
    openPickle("./Data/Learn/labels.pkl"))).astype(int)

print('Shape of data tensor:', paddedSeq.shape)
print('Shape of label tensor:', labels.shape)

trainInd, testInd = getTrainTest(labels)

X_train, X_val = paddedSeq[trainInd], paddedSeq[testInd]
y_train, y_val = labels[trainInd], labels[testInd]

w2v = word2vec.load(
    "./Resources/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin")

encoder = openPickle("./Data/newDict.pkl")
decoder = {encoder[key]: key for key in encoder}
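The snippet ends before the network is defined, but the imports point to an embedding + LSTM classifier. A minimal sketch under that assumption (the embeddingMatrix signature, layer sizes, and checkpoint path are assumptions):

# Hypothetical continuation; sizes and paths are illustrative.
weights = embeddingMatrix(w2v, decoder)  # assumed to return a (vocabulary, 200) matrix

inputs = Input(shape=(paddedSeq.shape[1],), dtype="int32")
embedded = Embedding(weights.shape[0], weights.shape[1],
                     weights=[weights], trainable=False)(inputs)
hidden = LSTM(128)(embedded)
outputs = Dense(1, activation="sigmoid")(hidden)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Keep the best model seen on the validation split (path assumed).
checkpoint = ModelCheckpoint("./Resources/lstm.h5", save_best_only=True)
model.fit(X_train, y_train, validation_data=(X_val, y_val),
          epochs=5, callbacks=[checkpoint])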
Example #12
# coding: utf-8

from keras.models import load_model
from sklearn.metrics import f1_score

from utils import openPickle, savePickle
from preprocessing import getTrainTest, toBoolList
from preprocessing import sequencesCorrecter, preprocessDeepModel
from postprocessing import convertLabels

# Train Data

paddedTrainSeq = preprocessDeepModel("./Data/Learn/correctedSequences.pkl",
                                     "./Data/Learn/kerasSequences.pkl", 409)
labels = toBoolList(openPickle("./Data/Learn/labels.pkl"))

trainInd, testInd = getTrainTest(labels)

X_train, X_val = paddedTrainSeq[trainInd], paddedTrainSeq[testInd]
y_train, y_val = labels[trainInd], labels[testInd]

# Test Data
sequences = openPickle("./Data/Test/sequences.pkl")
correcter = openPickle("./Resources/tokenCorrecter.pkl")
correctedSequences = sequencesCorrecter(sequences, correcter)

savePickle("./Data/Test/correctedSequences.pkl", correctedSequences)

paddedSeq = preprocessDeepModel("./Data/Test/correctedSequences.pkl",
                                "./Data/Test/kerasSequences.pkl", 409)