def evaluateModel(model, trainPath, labelsPath, preprocesser=sparseBagOfWords):
    """
    Evaluate the f1-score of a model.

    Parameters:
    -----------
    model: Class with fit and predict methods.
    trainPath (str): The path of the pickle of the training examples.
    labelsPath (str): The path of the pickle of the training labels.
    preprocesser (func): Function used to transform the list of sequences
        into a matrix.
    """
    sequences = np.array(openPickle(trainPath))
    labels = toBoolList(openPickle(labelsPath))
    return evaluateModel_(model, sequences, labels, preprocesser=preprocesser)
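# Minimal usage sketch (the estimator is illustrative; any object with fit and
# predict methods works, and the paths are the ones used in the main block of
# this module):
#
#     from sklearn.naive_bayes import MultinomialNB
#     score = evaluateModel(MultinomialNB(),
#                           "./Data/Learn/sequences.pkl",
#                           "./Data/Learn/labels.pkl")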
def getPredictions(model, trainPath, labelsPath, testPath,
                   preprocesser=sparseBagOfWords):
    """
    Train a model and predict on a test set.

    Parameters:
    -----------
    model: Class with fit and predict methods.
    trainPath (str): The path of the pickle of the training examples.
    labelsPath (str): The path of the pickle of the training labels.
    testPath (str): The path of the pickle of the testing examples.
    preprocesser (func): Function used to transform the list of sequences
        into a matrix.
    """
    sequences = np.array(openPickle(trainPath))
    labels = toBoolList(openPickle(labelsPath))
    trainSeq = preprocesser(sequences)
    # Keep the training dimensionality so the test set is projected into the
    # same feature space.
    n_features = trainSeq.shape[1]
    model.fit(trainSeq, labels)
    testSeq = openPickle(testPath)
    return model.predict(preprocesser(testSeq, shape=(len(testSeq), n_features)))
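# Minimal usage sketch (illustrative estimator; the test pickle must hold
# sequences in the same format as the training one):
#
#     from sklearn.naive_bayes import MultinomialNB
#     preds = getPredictions(MultinomialNB(),
#                            "./Data/Learn/sequences.pkl",
#                            "./Data/Learn/labels.pkl",
#                            "./Data/Test/sequences.pkl")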
def getTrainTest(labels):
    """
    Split the dataset into a training and a testing set, preserving the
    label proportions (stratified split).

    Parameters:
    -----------
    labels (string or list): Path to the pickle of the labels, or directly
        the labels.

    Returns:
    --------
    ((list<int>, list<int>)): A pair of lists of indices. The first list
        corresponds to the training set, the second one to the testing set.
    """
    if isinstance(labels, str):
        y = toBoolList(openPickle(labels))
    else:
        y = toBoolList(labels)
    X = np.arange(len(y))
    return train_test_split(X, stratify=y, random_state=42)
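# Minimal usage sketch (labels path as used elsewhere in this repo); the two
# returned index arrays are disjoint and together cover the whole dataset:
#
#     trainInd, testInd = getTrainTest("./Data/Learn/labels.pkl")
#     X_train, X_test = X[trainInd], X[testInd]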
# coding: utf-8
import os

import numpy as np
import word2vec
from xgboost.sklearn import XGBClassifier

from processing import evaluateModel
from preprocessing import getMeanVectors
from utils import openPickle, savePickle

if not os.path.isfile("./Data/Learn/embeddedMeanSequences.pkl"):
    encoder = openPickle("./Data/dict.pkl")
    decoder = {encoder[key]: key for key in encoder}
    w2v = word2vec.load(
        "./Resources/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin")

    preprocesser = lambda x: getMeanVectors(x, w2v, decoder)

    sequences = np.array(openPickle("./Data/Learn/correctedSequences.pkl"))

    # Embed the sequences in chunks of 5000 to keep memory usage bounded.
    for i in range(len(sequences) // 5000):
        if i == 0:
            embeddedSeq = preprocesser(sequences[0:5000])
        else:
            embeddedSeq = np.vstack(
                (embeddedSeq, preprocesser(sequences[5000 * i: 5000 * (i + 1)])))
        print("Processed chunks up to i = %s" % i)

    # Stack the remaining tail; the loop stops after the last full chunk,
    # so the tail starts at 5000 * (i + 1), not 5000 * i.
    embeddedSeq = np.vstack(
        (embeddedSeq, preprocesser(sequences[5000 * (i + 1):])))
if __name__ == "__main__":
    from sklearn.naive_bayes import MultinomialNB
    from utils import top20Coefs
    import numpy as np

    model = MultinomialNB()
    evaluateModel(model, "./Data/Learn/sequences.pkl", "./Data/Learn/labels.pkl")

    encoder = openPickle("./Data/dict.pkl")
    decoder = {encoder[i]: i for i in encoder}

    coefs = top20Coefs(model)
    # feature_log_prob_ stores log P(word | class); exponentiate to recover
    # the conditional probability of each word.
    print("Most positives:")
    for coef in coefs[0]:
        print("%s: %s" % (decoder[coef], np.exp(model.feature_log_prob_[0][coef])))
    print("\nMost negatives:")
    for coef in coefs[1]:
        print("%s: %s" % (decoder[coef], np.exp(model.feature_log_prob_[1][coef])))
def preprocessDeepModel(sequencesPath, outputPath, maxLen=None):
    """
    Preprocess the sequences to make them trainable by a deep model.

    Parameters:
    -----------
    sequencesPath (str): where the sequences are stored.
    outputPath (str): where the preprocessed sequences will be stored.
    maxLen (int): size of the padded sequences.

    Returns:
    --------
    (np.array): the padded sequences.
    """
    modelPath = "./Resources/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin"

    # Download the word2vec model if needed
    if not os.path.isfile(modelPath):
        link = "http://embeddings.org/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin"
        os.system("wget -O " + modelPath + " " + link)

    # Load the model
    w2v = word2vec.load(modelPath)
    vocab = set(w2v.vocab)

    # Load the encoder
    encoder = openPickle("./Data/dict.pkl")
    decoder = {encoder[key]: key for key in encoder}

    if not os.path.isfile(outputPath):
        # Map the old token indices onto the word2vec vocabulary.
        fromOldToNew = reIndexToken(w2v, decoder)
        if not os.path.isfile("./Data/newDict.pkl"):
            newCoder = {"pad": 0, "unk": len(decoder) - 1}
            for key in decoder:
                if fromOldToNew[key] != len(decoder) - 1:
                    newCoder[decoder[key]] = fromOldToNew[key]
            savePickle("./Data/newDict.pkl", newCoder)
        else:
            newCoder = openPickle("./Data/newDict.pkl")

        if not os.path.isfile(sequencesPath):
            raise FileNotFoundError("Please run studyWord2Vec.py")
        sequences = openPickle(sequencesPath)
        sequences = reIndexSequences(sequences, fromOldToNew)
        savePickle(outputPath, sequences)
    else:
        sequences = openPickle(outputPath)

    if maxLen is None:
        maxLength = max([len(seq) for seq in sequences])
    else:
        maxLength = maxLen
    return pad_sequences(sequences, maxlen=maxLength)
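# Minimal usage sketch (paths and maxLen=409 are the values used by the
# training scripts in this repo):
#
#     paddedSeq = preprocessDeepModel("./Data/Learn/correctedSequences.pkl",
#                                     "./Data/Learn/kerasSequences.pkl",
#                                     maxLen=409)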
# coding: utf-8
import pickle

import numpy as np

from sentenceFunctions import wordCount
from utils import openPickle

# Let's open the files
d = openPickle("./Data/dict.pkl")
print("We have %s different words." % len(d.keys()))
for i, key in enumerate(d.keys()):
    print("%s: %s" % (key, d[key]))
    if i > 2:
        break

d_reverse = {d[i]: i for i in d}

sentences = openPickle("./Data/Learn/sentences.pkl")
print("")
print("We have %s sentences" % len(sentences))
for i, sentence in enumerate(sentences):
    print(sentence)
    if i > 2:
        break

sequences = openPickle("./Data/Learn/sequences.pkl")
print("")
# coding: utf-8
from utils import openPickle

sequences = openPickle("./Resources/trysequences.pkl")
for i, sequence in enumerate(sequences):
    print(sequence)
    if i > 50:
        break
""" Remove from the sequences the words that appears only one time. Parameters: ----------- sequences (list<list<int>>): list of sequences. A sequence is a list of int. wordOcc (dict): mapping from words to number of occurences Returns: -------- (list<list<int>>): list of sequences without words that appears only one time. """ answer = [] for sequence in sequences: answer.append([word for word in sequence if wordOcc[word] > 1]) return answer if __name__ == "__main__": from utils import openPickle import numpy as np sequences = openPickle("./Data/Learn/sequences.pkl") wordOcc = wordCount(sequences) notUniqueSequences = removeUniqueWords(sequences, wordOcc) print(np.sum([len(sequence) == 0 for sequence in notUniqueSequences]))
# coding: utf-8
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

from utils import openPickle
from preprocessing import toBoolList, getTrainTest

# Bag of unigrams and bigrams over the raw sentences.
cv = CountVectorizer(ngram_range=(1, 2))
X = cv.fit_transform(openPickle("./Data/Learn/sentences.pkl"))
labels = toBoolList(openPickle("./Data/Learn/labels.pkl"))

trainInd, testInd = getTrainTest(labels)
X_train, X_test = X[trainInd], X[testInd]
y_train, y_test = labels[trainInd], labels[testInd]

model = MultinomialNB(alpha=0.01)
model.fit(X_train, y_train)

trainScore = f1_score(y_train, model.predict(X_train))
testScore = f1_score(y_test, model.predict(X_test))
print("Training f1 score: %.4f" % trainScore)
print("Testing f1 score: %.4f" % testScore)
import os

import numpy as np
import word2vec
from keras.models import Model, load_model
from keras.layers import Dense, LSTM, Input
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint

from preprocessing import embeddingMatrix, preprocessDeepModel, toBoolList, getTrainTest
from utils import openPickle

paddedSeq = preprocessDeepModel("./Data/Learn/correctedSequences.pkl",
                                "./Data/Learn/kerasSequences.pkl")
labels = np.array(toBoolList(
    openPickle("./Data/Learn/labels.pkl"))).astype(int)

print('Shape of data tensor:', paddedSeq.shape)
print('Shape of label tensor:', labels.shape)

trainInd, testInd = getTrainTest(labels)
X_train, X_val = paddedSeq[trainInd], paddedSeq[testInd]
y_train, y_val = labels[trainInd], labels[testInd]

w2v = word2vec.load(
    "./Resources/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin")
encoder = openPickle("./Data/newDict.pkl")
decoder = {encoder[key]: key for key in encoder}
# coding: utf-8
from keras.models import load_model
from sklearn.metrics import f1_score

from utils import openPickle, savePickle
from preprocessing import getTrainTest, toBoolList
from preprocessing import sequencesCorrecter, preprocessDeepModel
from postprocessing import convertLabels

# Train Data
paddedTrainSeq = preprocessDeepModel("./Data/Learn/correctedSequences.pkl",
                                     "./Data/Learn/kerasSequences.pkl", 409)
labels = toBoolList(openPickle("./Data/Learn/labels.pkl"))

trainInd, testInd = getTrainTest(labels)
X_train, X_val = paddedTrainSeq[trainInd], paddedTrainSeq[testInd]
y_train, y_val = labels[trainInd], labels[testInd]

# Test Data
sequences = openPickle("./Data/Test/sequences.pkl")
correcter = openPickle("./Resources/tokenCorrecter.pkl")
correctedSequences = sequencesCorrecter(sequences, correcter)
savePickle("./Data/Test/correctedSequences.pkl", correctedSequences)
paddedSeq = preprocessDeepModel("./Data/Test/correctedSequences.pkl",
                                "./Data/Test/kerasSequences.pkl", 409)