def Bagging(bot: list, human: list, roc=False) -> tuple:
    training, test = classifier(bot, human)
    m = BaggingClassifier()
    m.fit(*training)
    predict = m.predict(test[0])
    matrix = confusion_matrix(test[1], predict)
    if roc:
        x, y, _ = roc_curve(test[1], roc_helper(predict), pos_label=1)
        roc_stats['Bagging'] = (x, y)  # fixed: was mislabeled 'KNeighbors'
    return m.score(*test), matrix, classification_report(test[1], predict)
def Logistic(bot: list, human: list, roc=False) -> tuple:
    training, test = classifier(bot, human)
    m = LogisticRegression()
    m.fit(*training)
    predict = m.predict(test[0])
    matrix = confusion_matrix(test[1], predict)
    if roc:
        x, y, _ = roc_curve(test[1], roc_helper(predict), pos_label=1)
        roc_stats['Logistic'] = (x, y)
    return m.score(*test), matrix, classification_report(test[1], predict)
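# A minimal sketch of plotting the curves collected in roc_stats, assuming it
# is a module-level dict of name -> (fpr, tpr) pairs as filled in above, and
# that matplotlib is available. This helper is an illustrative addition, not
# part of the original module.
import matplotlib.pyplot as plt

def plot_roc(stats: dict):
    for name, (fpr, tpr) in stats.items():
        plt.plot(fpr, tpr, label=name)
    plt.plot([0, 1], [0, 1], linestyle='--', label='chance')  # diagonal baseline
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend()
    plt.show()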
def ExtractFace(pathToData):
    # Function header reconstructed from the call below; walks each subject
    # folder (named like "s1", "s2", ...) and collects grayscale face images.
    faces, labels = [], []
    for folder in os.listdir(pathToData):
        label = int(folder.replace("s", ""))
        images = os.listdir(pathToData + "/" + folder)
        for img in images:
            path = pathToData + "/" + folder + "/" + img
            image = cv2.imread(path, 0)  # flag 0 loads the image as grayscale
            labels.append(label)
            faces.append(image)
    return faces, labels

if __name__ == '__main__':
    faces, labels = ExtractFace('KL/Faces')
    lbp_images = []
    for face in faces:
        lbp = LBP(face)
        lbp_image, _ = lbp.createLBPImage()
        lbp_images.append(lbp.Histogram(lbp_image))
    clf = classifier(lbp_images, labels)
    clf.svm()
    clf.random_forest()
    clf.KNN()
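# The custom LBP class used above is defined elsewhere in the repo. As a point
# of reference only, an equivalent local-binary-pattern feature pipeline can be
# sketched with scikit-image; the parameters P=8, R=1 and the 'uniform' method
# are illustrative choices, not taken from the original.
import numpy as np
from skimage.feature import local_binary_pattern

def lbp_histogram(gray_image, P=8, R=1):
    lbp_image = local_binary_pattern(gray_image, P, R, method='uniform')
    # the 'uniform' method with P neighbors yields P + 2 distinct codes
    hist, _ = np.histogram(lbp_image.ravel(), bins=P + 2, range=(0, P + 2))
    return hist / hist.sum()  # normalize so images of different sizes compare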
def main(dataset, word_embedding, use_encoder, train_encoder):
    all_dataset = ["imdb", "20newsgroups"]
    dataset = dataset.lower()
    if dataset not in all_dataset:
        raise InvalidArgument
    all_word_embeddings = ["glove", "fasttext", "lexvec"]
    word_embedding = word_embedding.lower()
    if word_embedding not in all_word_embeddings:
        raise InvalidArgument

    # initialize the tokenizer
    tokenizer = tk.Tokenizer()
    if use_encoder:
        # get trained encoder weights
        kernel, bias = get_encoder(tokenizer, word_embedding)

    # load the data set
    if dataset == "imdb":
        HP = ImdbHP()
        train_data, train_labels, test_data, test_labels = imdb.get_imdb()
    else:
        HP = NewsGroupsHP()
        train_data, train_labels, labels_index = ng.get_data(
            'data/20newsgroups/train')
        test_data, test_labels, _ = ng.get_data('data/20newsgroups/test')
    HP.MODEL_DIR = HP.MODEL_DIR + word_embedding + "/"

    # convert words to indices
    train_idxs, train_sent_lens, train_docs_lens = data2idx(
        train_data, HP, tokenizer)
    test_idxs, test_sent_lens, test_docs_lens = data2idx(
        test_data, HP, tokenizer)

    # Important: we need to add 1 for the zero index (pad value)
    HP.VOCAB_LEN = len(tokenizer.word2index) + 1
    HP.START_TOKEN = tokenizer.start_index
    HP.END_TOKEN = tokenizer.end_index

    # get the word embedding path
    if word_embedding == all_word_embeddings[0]:
        word_embedding_path = HP.GLOVE_PATH
    elif word_embedding == all_word_embeddings[1]:
        word_embedding_path = HP.FASTTEXT_PATH
    else:
        word_embedding_path = HP.LEXVEC_PATH

    with tf.Graph().as_default() as g:
        with tf.Session() as sess:
            epoch_loss = 0
            # Create a new model or reload an existing checkpoint
            model = classifier(HP)
            sess.run(tf.global_variables_initializer())
            if use_encoder:
                model.assign_encoder(sess, kernel, bias, train_encoder)
            # load the pretrained embedding and feed it to the embedding layer
            weights = load_embedding(tokenizer.word2index, HP.EMB_DIM,
                                     word_embedding_path)
            model.assign_embedding(sess, weights)
            # Create a log writer object
            log_writer = tf.summary.FileWriter(HP.MODEL_DIR, graph=sess.graph)
            accuracies = []
            valid_loss = 0
            best_accuracy = 0
            saver = tf.train.Saver(max_to_keep=3)
            for e in range(HP.N_EPOCHS):
                # initialize the batchizers
                train_batchizer = batchizer(train_idxs, train_sent_lens,
                                            train_docs_lens, train_labels,
                                            HP.BATCH_SIZE, HP.MAX_SENT)
                test_batchizer = batchizer(test_idxs, test_sent_lens,
                                           test_docs_lens, test_labels,
                                           HP.BATCH_SIZE, HP.MAX_SENT)
                if model.global_epoch_step.eval() >= HP.N_EPOCHS:
                    print('Training is already complete.')
                    break
                for batch_x, sent_len, docs_len, batch_y in train_batchizer:
                    if batch_x is None:
                        continue
                    batch_loss, summary = model.train(sess, batch_x, sent_len,
                                                      docs_len, batch_y,
                                                      HP.KEEP_PROB)
                    epoch_loss += batch_loss
                    log_writer.add_summary(summary, model.global_step.eval())
                for batch_x, sent_len, docs_len, batch_y in test_batchizer:
                    batch_loss, _, accuracy = model.eval(
                        sess, batch_x, sent_len, docs_len, batch_y, 1.0)
                    valid_loss += batch_loss
                    accuracies.append(accuracy)
                acc_mean = np.mean(accuracies)
                # if acc_mean > best_accuracy:
                #     best_accuracy = acc_mean
                #     with tf.device('/cpu:0'):
                #         checkpoint_path = os.path.join(HP.MODEL_DIR, HP.MODEL_NAME)
                #         save_path = saver.save(sess, checkpoint_path,
                #                                global_step=model.global_step)
                #         print('model saved at %s' % save_path)
                print(
                    "epoch:{0:2}: train loss:{1:8.2f},".format(e, epoch_loss),
                    "validation loss:{0:8.2f} accuracy:{1:6.2f}%".format(
                        valid_loss, acc_mean * 100))
                accuracies = []
                epoch_loss = 0
                valid_loss = 0
                # Increase the epoch index of the model
                model.global_epoch_step_op.eval()
            # checkpoint_path = os.path.join(HP.MODEL_DIR, HP.MODEL_NAME)
            # save_path = saver.save(sess, checkpoint_path,
            #                        global_step=model.global_step)
            # print('model saved at %s' % save_path)
    print('Training Terminated')
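# A possible command-line entry point for the training routine above. The
# argument names mirror main()'s signature and the accepted values listed in
# it; this CLI wrapper is an illustrative addition, not part of the original.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', default='imdb',
                        choices=['imdb', '20newsgroups'])
    parser.add_argument('--word_embedding', default='glove',
                        choices=['glove', 'fasttext', 'lexvec'])
    parser.add_argument('--use_encoder', action='store_true')
    parser.add_argument('--train_encoder', action='store_true')
    args = parser.parse_args()
    main(args.dataset, args.word_embedding, args.use_encoder,
         args.train_encoder)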
def main():
    X_train, X_test, y_train, y_test = get_data()
    # comment out this line to remove the PCA step
    X_train, X_test = feature_extractor(X_train, X_test)
    print(classifier(X_train, y_train, X_test, y_test))
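# feature_extractor is defined elsewhere in the repo; judging by the comment
# above, it performs PCA. A minimal sketch of such a function with
# scikit-learn follows (n_components=0.95, i.e. keep 95% of the variance,
# is an illustrative choice, not taken from the original).
from sklearn.decomposition import PCA

def feature_extractor(X_train, X_test, n_components=0.95):
    pca = PCA(n_components=n_components)
    X_train = pca.fit_transform(X_train)  # fit PCA on the training data only
    X_test = pca.transform(X_test)        # reuse the same projection for test
    return X_train, X_test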