def get_sets(self):
    """Returns as many balanced lists of documents as there are folds."""
    documents_by_class = Dataset.get_documents_by_class(self.dataset.documents)
    sets = []
    for i in range(self.folds):
        fold = []
        for c in documents_by_class:
            # Every fold receives an equal share of each class's documents,
            # so the class distribution stays balanced across folds.
            n = len(documents_by_class[c]) // self.folds
            for j in range(n):
                fold.append(documents_by_class[c][i * n + j])
        sets.append(fold)
    return sets
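# --- Illustration (not part of the project) ---------------------------------
# A minimal, self-contained sketch of the balancing logic used by get_sets()
# above, rewritten with plain strings and dicts because the project's
# Dataset/ClassDocument classes are not available here. It shows how each
# fold receives an equal, class-balanced slice of the documents.
documents_by_class = {
    "+": ["pos1", "pos2", "pos3", "pos4"],
    "-": ["neg1", "neg2", "neg3", "neg4"],
}
folds = 2
sets = []
for i in range(folds):
    fold = []
    for c in documents_by_class:
        n = len(documents_by_class[c]) // folds
        fold.extend(documents_by_class[c][i * n:(i + 1) * n])
    sets.append(fold)
print(sets)  # [['pos1', 'pos2', 'neg1', 'neg2'], ['pos3', 'pos4', 'neg3', 'neg4']]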
def test_get_documents_by_class(self):
    documents = [
        ClassDocument("A positive document", "+"),
        ClassDocument("Another positive document", "+"),
        ClassDocument("Yet one more positive document", "+"),
        ClassDocument("This time a negative one", "-"),
        ClassDocument("Still negative", "-")
    ]
    documents_by_class = Dataset.get_documents_by_class(documents)
    expected_documents_by_class = {
        "+": [documents[0], documents[1], documents[2]],
        "-": [documents[3], documents[4]]
    }
    self.assertEqual(documents_by_class, expected_documents_by_class)
def test_get_split_documents(self):
    documents = [
        ClassDocument("pos1", "+"),
        ClassDocument("pos2", "+"),
        ClassDocument("pos3", "+"),
        ClassDocument("pos4", "+"),
        ClassDocument("pos5", "+"),
        ClassDocument("neg1", "-"),
        ClassDocument("neg2", "-"),
        ClassDocument("neg3", "-"),
        ClassDocument("neg4", "-"),
        ClassDocument("neg5", "-"),
    ]
    split_documents = Dataset.get_split_documents(documents, 0.8)
    # An 80/20 split keeps the first 80% of each class for training
    # and the remaining 20% for testing.
    expected_split_documents = (
        [
            documents[0], documents[1], documents[2], documents[3],
            documents[5], documents[6], documents[7], documents[8]
        ],
        [documents[4], documents[9]]
    )
    self.assertEqual(split_documents, expected_split_documents)
def train_nb(filename, label_col='label', text_col='text', validation_split=0.3):
    dataset = Dataset(filename, label_col=label_col, text_col=text_col)
    dataset.load()
    dataset.preprocess_texts(stemming=True)

    data = dataset.cleaned_data.copy()
    train = pd.DataFrame(columns=['label', 'text'])
    validation = pd.DataFrame(columns=['label', 'text'])

    # Split each label separately so the train/validation sets keep the
    # original class distribution.
    for label in data.label.unique():
        label_data = data[data.label == label]
        train_data, validation_data = train_test_split(
            label_data, test_size=validation_split)
        train = pd.concat([train, train_data])
        validation = pd.concat([validation, validation_data])

    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
    tuned_parameters = {
        'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        'clf__alpha': [1, 1e-1, 1e-2]
    }

    x_train = train.text
    y_train = train.label
    x_validation = validation.text
    y_validation = validation.label

    print('Running Multinomial Naive Bayes...')
    start = time()
    # Exhaustive grid search over the pipeline with 10-fold cross-validation.
    model = GridSearchCV(text_clf, tuned_parameters, n_jobs=4, cv=10)
    model.fit(x_train, y_train)
    print('Finished in: {} mins'.format(round((time() - start) / 60, 2)))

    print('Testing Model...')
    results = model.predict(x_validation)
    print(classification_report(y_validation, results, digits=4))

    filepath = Path('models/emotion_recognition/nb_model.pickle').resolve()
    with filepath.open('wb') as file:
        pickle.dump(model, file)
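# --- Illustration (not part of the project) ---------------------------------
# A sketch of how the model pickled by train_nb() above could be loaded and
# used for prediction. The example sentences are made up; the path is the one
# train_nb() writes to. GridSearchCV.predict() delegates to the best estimator
# found during the search.
import pickle
from pathlib import Path

model_path = Path('models/emotion_recognition/nb_model.pickle').resolve()
with model_path.open('rb') as file:
    nb_model = pickle.load(file)

print(nb_model.predict(['what a wonderful day', 'this is awful']))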
def train(model_type, dataset_path, tokenizer_path, save_dir, glove_embeddings,
          word2vec_embeddings, token_index_path, ngram_range, max_length,
          label_col='label', text_col='text', validation_split=0.3,
          embedding_dim=100, input_length=100, learning_rate=1e-3, epochs=10,
          batch_size=32):
    dataset = Dataset(dataset_path, label_col=label_col, text_col=text_col)
    dataset.load()
    dataset.preprocess_texts(stemming=True)

    tokenizer_file = Path(tokenizer_path).resolve()
    with tokenizer_file.open('rb') as file:
        tokenizer = pickle.load(file)

    data = dataset.cleaned_data.copy()
    train = pd.DataFrame(columns=['label', 'text'])
    validation = pd.DataFrame(columns=['label', 'text'])

    # Split each label separately so the train/validation sets keep the
    # original class distribution.
    for label in data.label.unique():
        label_data = data[data.label == label]
        train_data, validation_data = train_test_split(
            label_data, test_size=validation_split)
        train = pd.concat([train, train_data])
        validation = pd.concat([validation, validation_data])

    embedding_layer = None

    if glove_embeddings:
        # Build the embedding matrix from pre-trained GloVe vectors.
        embeddings_index = {}
        with Path(os.getenv("GLOVE_EMBEDDINGS")).open() as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

        embedding_dim = int(os.getenv("EMBEDDING_DIM"))
        embedding_matrix = np.zeros(
            (len(tokenizer.word_index) + 1, embedding_dim))
        for word, i in tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # Words not found in the embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector

        embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                                    embedding_dim,
                                    weights=[embedding_matrix],
                                    input_length=input_length,
                                    trainable=True)

    if word2vec_embeddings:
        # Build the embedding matrix from pre-trained word2vec vectors.
        embedding = KeyedVectors.load_word2vec_format(
            os.getenv("WORD2VEC_EMBEDDINGS"))
        embedding_matrix = np.zeros(
            (len(tokenizer.word_index) + 1, embedding.vector_size))
        for word, i in tokenizer.word_index.items():
            try:
                embedding_matrix[i] = embedding.get_vector(word)
            except KeyError:
                # Out-of-vocabulary words keep their all-zero row.
                pass

        embedding_layer = Embedding(input_dim=len(tokenizer.word_index) + 1,
                                    output_dim=embedding.vector_size,
                                    weights=[embedding_matrix],
                                    input_length=input_length,
                                    trainable=False,
                                    input_shape=(input_length, ))

    input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)

    train_sequences = [text.split() for text in train.text]
    validation_sequences = [text.split() for text in validation.text]
    list_tokenized_train = tokenizer.texts_to_sequences(train_sequences)
    list_tokenized_validation = tokenizer.texts_to_sequences(
        validation_sequences)

    if model_type == 'fasttext':
        # FastText-style models get extra n-gram features appended to each
        # token sequence.
        with Path(token_index_path).resolve().open('rb') as file:
            token_index = pickle.load(file)
        list_tokenized_train = add_ngram(train_sequences, token_index,
                                         ngram_range)
        list_tokenized_validation = add_ngram(validation_sequences,
                                              token_index, ngram_range)
        input_dim = max_length

    model = NLP_MODEL[model_type](input_length,
                                  input_dim,
                                  embedding_layer,
                                  embedding_dim=embedding_dim)
    optimizer = Adam(learning_rate)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    print(model.summary())

    x_train = pad_sequences(list_tokenized_train, maxlen=input_length)
    x_validation = pad_sequences(list_tokenized_validation,
                                 maxlen=input_length)
    # Map label 4 to 1 so the targets are binary (0/1) for binary cross-entropy.
    y_train = train.label.replace(4, 1)
    y_validation = validation.label.replace(4, 1)

    model_name = model_type + '_' + str(embedding_dim) + '_' + str(input_length)
    checkpoint_path = os.path.join(
        save_dir, 'checkpoints', model_name + '_{epoch:02d}-{val_acc:.4f}.h5')
    log_dir = os.path.join(save_dir, 'logs', model_name)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    model.fit(x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_validation, y_validation),
              callbacks=[
                  checkpoints(checkpoint_path),
                  tensorboard(log_dir, batch_size),
                  early_stopping(10),
                  reduce_lr(5)
              ])

    model_file = Path(save_dir, model_name + '.h5').resolve()
    model.save_weights(model_file.as_posix())
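# --- Illustration (not part of the project) ---------------------------------
# A hypothetical call to train() above for the 'fasttext' model type (the only
# NLP_MODEL key visible in this code). Every path and hyperparameter value
# below is an assumption, not taken from the project.
train(model_type='fasttext',
      dataset_path='data/dataset.csv',
      tokenizer_path='models/tokenizer.pickle',
      save_dir='models/sentiment',
      glove_embeddings=False,
      word2vec_embeddings=False,
      token_index_path='models/token_index.pickle',
      ngram_range=2,
      max_length=20000,
      input_length=100,
      epochs=10,
      batch_size=32)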
# cross_validator.py <folds> <dataset> <classifier> [<options>]
if __name__ == '__main__':
    print(" ")
    usage = "cross_validator.py <folds> <dataset> <classifier> [<options>]\n"
    arguments = sys.argv
    if len(arguments) >= 4:
        folds = 0
        try:
            folds = int(arguments[1])
        except ValueError:
            print("Invalid format for number of folds (should be an integer).")
            print("\n > " + usage)
            exit()
        dataset = Dataset.get_dataset(arguments[2])
        if dataset is not None:
            cv = None
            if arguments[3] == "naive_bayes":
                cv = NaiveBayesCrossValidator(dataset, folds)
            elif arguments[3] == "knn":
                if len(arguments) == 5:
                    k = 0
                    try:
                        k = int(arguments[4])
                    except ValueError:
                        print("Invalid number format for number of neighbours.")
                        print(
                            "\n > cross_validator.py <folds> <dataset> <classifier> <neighbours>\n"
                        )
                        exit()