def evaluate_LFW(model, embedding_size, use_flipped_images=False, N_folds=5,
                 distance_metric=1, verbose=1):
    pairs = read_pairs(os.path.expanduser(LFW_PAIRS_PATH))
    paths, actual_issame = get_paths(os.path.expanduser(LFW_DIR), pairs)

    # compute embeddings for all pair images
    ds = tf_dataset_from_paths(paths, flip=False)
    embeddings = np.zeros([len(paths), embedding_size])
    j = 0
    if verbose >= 2:
        print("Feed forward all pairs")
    for batch in ds:
        batch_embeddings = model(batch).numpy()
        embeddings[j:j + len(batch)] = batch_embeddings
        j += len(batch)

    if use_flipped_images:
        # also embed horizontally flipped versions of the same images
        if verbose >= 2:
            print("Feed forward all pairs - flipped")
        flip_ds = tf_dataset_from_paths(paths, flip=True)
        flip_embeddings = np.zeros([len(paths), embedding_size])
        j = 0
        for batch in flip_ds:
            batch_embeddings = model(batch).numpy()
            flip_embeddings[j:j + len(batch)] = batch_embeddings
            j += len(batch)

        # concatenated embeddings (not used below; the averaged embeddings are evaluated)
        full_embeddings = np.zeros((len(paths), embedding_size * 2))
        full_embeddings[:, :embedding_size] = embeddings
        full_embeddings[:, embedding_size:] = flip_embeddings

    if verbose >= 2:
        print("Calculating metrics")

    if use_flipped_images:
        # evaluate on the element-wise mean of original and flipped embeddings
        tpr, fpr, accuracy, val, val_std, far, best_thresholds = evaluate(
            (embeddings + flip_embeddings) / 2, actual_issame,
            nrof_folds=N_folds, distance_metric=distance_metric)
    else:
        tpr, fpr, accuracy, val, val_std, far, best_thresholds = evaluate(
            embeddings, actual_issame,
            nrof_folds=N_folds, distance_metric=distance_metric)

    if verbose:
        print('Accuracy: %2.5f+-%2.5f' % (np.mean(accuracy), np.std(accuracy)))
        print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far))
        print('Threshold: %2.5f+-%2.5f' % (np.mean(best_thresholds), np.std(best_thresholds)))
        auc = metrics.auc(fpr, tpr)
        print('Area Under Curve (AUC): %1.3f' % auc)
        eer = brentq(lambda x: 1. - x - interpolate.interp1d(fpr, tpr)(x), 0., 1.)
        print('Equal Error Rate (EER): %1.3f' % eer)

    return accuracy
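# Hedged usage sketch (not part of the original source): evaluate_LFW relies on the
# module-level constants LFW_PAIRS_PATH and LFW_DIR and on the helpers read_pairs,
# get_paths, tf_dataset_from_paths and evaluate defined elsewhere in this project.
# The checkpoint path and embedding size below are illustrative placeholders.
import numpy as np
import tensorflow as tf

facenet_model = tf.keras.models.load_model('checkpoints/facenet')  # assumed checkpoint path
fold_accuracies = evaluate_LFW(facenet_model,
                               embedding_size=512,
                               use_flipped_images=True,
                               N_folds=10,
                               distance_metric=1,
                               verbose=2)
print('Mean LFW accuracy over folds: %.5f' % np.mean(fold_accuracies))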
def gensim_classifier():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # split all sentences to list of words
    sentences = []
    for tweet in tweet_list:
        temp_doc = tweet.split()
        sentences.append(temp_doc)

    # parameters for model
    num_features = 100
    min_word_count = 1
    num_workers = 4
    context = 2
    downsampling = 1e-3

    # Initialize and train the model
    w2v_model = Word2Vec(sentences,
                         workers=num_workers,
                         size=num_features,
                         min_count=min_word_count,
                         window=context,
                         sample=downsampling,
                         seed=1)

    index_value, train_set, test_set = train_test_split(0.80, sentences)
    train_vector = getAvgFeatureVecs(train_set, w2v_model, num_features)
    test_vector = getAvgFeatureVecs(test_set, w2v_model, num_features)
    train_vector = Imputer().fit_transform(train_vector)
    test_vector = Imputer().fit_transform(test_vector)

    # train model and predict
    model = LinearSVC()
    classifier_fitted = OneVsRestClassifier(model).fit(train_vector,
                                                       label_list[:index_value])
    result = classifier_fitted.predict(test_vector)

    # output result to csv
    create_directory('data')
    result.tofile("data/w2v_linsvc.csv", sep=',')

    # store the model to mmap-able files
    create_directory('model')
    joblib.dump(model, 'model/%s.pkl' % 'w2v_linsvc')

    # evaluation
    label_score = classifier_fitted.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(label_list, classes=class_list)
    evaluate(binarise_result, binarise_labels[index_value:], label_score, 'w2v_linsvc')
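# The helpers get_labels, get_labelled_tweets, train_test_split (the project-local 0.80
# split), getAvgFeatureVecs, create_directory and evaluate are defined elsewhere in the
# project. A minimal sketch of what getAvgFeatureVecs is assumed to do - average the
# Word2Vec vectors of the words in each tweet - is shown below; the real implementation
# may differ, and the function name carries a _sketch suffix to mark it as illustrative.
import numpy as np

def getAvgFeatureVecs_sketch(docs, w2v_model, num_features):
    """Average the word vectors of each document; out-of-vocabulary words are skipped."""
    doc_vectors = np.zeros((len(docs), num_features), dtype=np.float32)
    for i, words in enumerate(docs):
        in_vocab = [w for w in words if w in w2v_model.wv]
        if in_vocab:
            doc_vectors[i] = np.mean([w2v_model.wv[w] for w in in_vocab], axis=0)
        else:
            doc_vectors[i] = np.nan  # left for the Imputer step above to fill
    return doc_vectors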
def find_alphas(preprocess=None):
    index, categories, categories_words, N_words = analyze_training_data('./data/')
    results = {}
    for alpha in [i / 10 for i in range(1, 51)]:
        predictions, labels = naive_bayes_on_validation_set(
            './data/', index, categories, categories_words, N_words,
            alpha=alpha, preprocess=preprocess)
        results[alpha] = evaluate(predictions, labels)

    if preprocess is not None:
        f = open('naive_bayes_with_different_alphas_' + str(preprocess) + '_.txt', 'w+')
    else:
        f = open('naive_bayes_with_different_alphas.txt', 'w+')
    for alpha in results.keys():
        print("##############################\nRESULTS FOR ALPHA={}\n".format(alpha),
              results[alpha], file=f)
    f.close()
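# The evaluate(predictions, labels) helper used by the search functions in this file is
# project code that is not shown. A plausible sketch, assuming it returns a printable
# summary of standard classification metrics, is given below; it is an illustration,
# not the project's actual implementation.
from sklearn.metrics import accuracy_score, classification_report

def evaluate_sketch(predictions, labels):
    """Return a human-readable summary of accuracy plus per-class precision/recall/F1."""
    return 'accuracy={:.4f}\n{}'.format(
        accuracy_score(labels, predictions),
        classification_report(labels, predictions))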
def perform_on_specific_setting(self, method, k):
    predictions, labels = self.knn_on_validation_set(k=k, method=method)
    result = evaluate(predictions, labels)
    f = open('knn_with_' + str(self.preprocess) + '.txt', 'w+')
    print("##############################\nRESULTS FOR M={} K={}\n".format(method, k),
          result, file=f)
    f.close()
def search_best_n_est_and_max_depth(data_path):
    results = {}
    for num_estimators in range(20, 200, 20):
        for max_depth in range(100, 500, 50):
            predictions, labels = validation(data_path, num_estimators, max_depth)
            results[(num_estimators, max_depth)] = evaluate(predictions, labels)

    f = open('random_forest_with_different_parameters.txt', 'w+')
    for k in results.keys():
        print("##############################\nRESULTS FOR (N_EST, MAX_DEPTH)={}\n".format(k),
              results[k], file=f)
    f.close()
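# The validation(data_path, ...) helper called above is project code that is not shown.
# A minimal sketch of what it presumably does - fit a random forest on the training
# split and predict on the validation split - is given below. The data-loading helper
# load_train_and_validation is an assumed name used only for illustration.
from sklearn.ensemble import RandomForestClassifier

def validation_sketch(data_path, num_estimators, max_depth):
    X_train, y_train, X_val, y_val = load_train_and_validation(data_path)  # assumed helper
    clf = RandomForestClassifier(n_estimators=num_estimators, max_depth=max_depth)
    clf.fit(X_train, y_train)
    return clf.predict(X_val), y_val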
def search_best_c(data_path):
    results = {}
    for C in [i / 10 for i in range(1, 21)]:
        predictions, labels = validation(data_path, C)
        results[C] = evaluate(predictions, labels)

    f = open('SVM_with_different_Cs.txt', 'w+')
    for C in results.keys():
        print("##############################\nRESULTS FOR C={}\n".format(C),
              results[C], file=f)
    f.close()
def lin_svc():
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # vectorise using tf-idf
    vectoriser = TfidfVectorizer(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        use_idf=1,
        smooth_idf=1,
        sublinear_tf=1,
    )

    # do transformation into vector
    fitted_vectoriser = vectoriser.fit(tweet_list)
    vectorised_tweet_list = fitted_vectoriser.transform(tweet_list)
    train_vector, test_vector, train_labels, test_labels = train_test_split(
        vectorised_tweet_list, label_list, test_size=0.8, random_state=42)

    # train model and predict
    model = LinearSVC()
    ovr_classifier = OneVsRestClassifier(model).fit(train_vector, train_labels)
    result = ovr_classifier.predict(test_vector)

    # output result to csv
    create_directory('data')
    save_to_csv("data/testset_labels.csv", test_labels)
    result.tofile("data/tfidf_linsvc.csv", sep=',')

    save_model(ovr_classifier, 'tfidf_linsvc')
    save_vectoriser(fitted_vectoriser, 'tfidf_vectoriser')

    # evaluation
    label_score = ovr_classifier.decision_function(test_vector)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)
    evaluate(binarise_result, binarise_labels, label_score, 'tfidf_linsvc')
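# Hedged follow-up sketch: assuming save_model and save_vectoriser wrap joblib.dump
# and write the named artefacts under a model/ directory (as the gensim_classifier
# variant above does), the persisted classifier and vectoriser could be reloaded for
# inference roughly like this. The file paths below are assumptions.
import joblib

def predict_tweets_sketch(tweets):
    vectoriser = joblib.load('model/tfidf_vectoriser.pkl')  # assumed path
    classifier = joblib.load('model/tfidf_linsvc.pkl')      # assumed path
    return classifier.predict(vectoriser.transform(tweets))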
def find_results_of_ks(self):
    results = {m: {} for m in self.methods}
    for m in self.methods:
        for k in [1, 3, 5]:
            t1 = time.time()
            predictions, labels = self.knn_on_validation_set(k=k, method=m)
            results[m][k] = evaluate(predictions, labels)
            t2 = time.time()
            print(t2 - t1, 'secs')

    f = open('knn_with_different_ks.txt', 'w+')
    for m in self.methods:
        for k in results[m].keys():
            print("##############################\nRESULTS FOR M={} K={}\n".format(m, k),
                  results[m][k], file=f)
    f.close()
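# Hedged usage sketch: find_results_of_ks and perform_on_specific_setting are methods of
# a KNN experiment class that is not shown here. The class name, constructor arguments
# and distance method below are assumptions for illustration only.
knn_experiment = KNNExperiment(data_path='./data/', preprocess='stemming')  # hypothetical class
knn_experiment.find_results_of_ks()                         # writes knn_with_different_ks.txt
knn_experiment.perform_on_specific_setting('cosine', k=3)   # writes knn_with_<preprocess>.txt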