def main(datapath, modelpath, idxs):
    dataset_names = get_filenames(datapath)
    print(f'using sets {dataset_names} from {datapath}')

    print('looking for idx files')
    idxs = load_pkl(idxs)
    idxs_train = idxs['train']
    idxs_test = idxs['test']

    for dataset_name in dataset_names:
        dataset: Dataset = load_pkl(datapath + dataset_name)
        X = dataset.data.detach().numpy()
        Y = dataset.get_labels_numerical()
        x_train, x_test = X[idxs_train], X[idxs_test]
        y_train, y_test = Y[idxs_train], Y[idxs_test]

        # The embedding dimension is taken from the third
        # underscore-separated token of the file name.
        hiddim = dataset_name.split('/')[-1].split('_')[2]
        print(f'training gmlvq on {hiddim} dim embedding')
        gmlvq = GmlvqModel()
        gmlvq.fit(x_train, y_train)

        train_error = get_error(gmlvq, x_train, y_train)
        test_error = get_error(gmlvq, x_test, y_test)
        var = gmlvq_covered_variance(gmlvq, thresh=1, verbose=True)
        misc = {'train_error': train_error,
                'test_error': test_error,
                'matrix_var': var}
        print(f'adding misc data to gmlvq model {misc}')
        gmlvq.misc = misc

        modelname = f'gmlvq{hiddim}.pkl'
        print(f'saving model to {modelname}')
        # Use a context manager so the file handle is closed after writing.
        with open(modelpath + modelname, 'wb') as f:
            pkl.dump(gmlvq, f)
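# `get_filenames` and `load_pkl` are not defined in this excerpt; the names
# come from the call sites above, but the bodies below are assumptions — a
# minimal sketch of what they presumably do, not the original implementation.
import os
import pickle as pkl

def load_pkl(path):
    # Assumed helper: unpickle a single object from disk.
    with open(path, 'rb') as f:
        return pkl.load(f)

def get_filenames(datapath):
    # Assumed helper: list the dataset pickle files in a directory.
    return [f for f in os.listdir(datapath) if f.endswith('.pkl')]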
def do_experiment(title, cols, labels):
    '''Perform the classification experiment with the given data.'''
    print(f'{title} experiment')
    data = read_cols(cols)

    avg_cm = np.zeros((num_classes, num_classes))
    relevances = np.empty((NUM_CV, data.shape[1]))
    scores = np.empty(NUM_CV)

    for i, (train, test) in enumerate(KFold(n_splits=NUM_CV).split(data)):
        gmlvq = GmlvqModel(prototypes_per_class=[1, 1, 1, 1])
        gmlvq.fit(data[train], labels[train])
        scores[i] = gmlvq.score(data[test], labels[test])

        # The diagonal of the relevance matrix omega^T omega gives the
        # per-feature relevances learned in this fold.
        rel_matrix = np.dot(np.transpose(gmlvq.omega_), gmlvq.omega_)
        relevances[i] = np.diag(rel_matrix)

        label_pred = gmlvq.predict(data[test])
        avg_cm += confusion_matrix(labels[test], label_pred)

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Normalize confusion matrix row-wise (per true class)
    avg_cm = avg_cm.astype('float') / avg_cm.sum(axis=1)[:, np.newaxis]
    avg_acc = np.mean(scores)
    print(f'mean score: {avg_acc} - variance score: {np.var(scores)}')

    # Save confusion matrix figure
    plot_confusion_matrix(title, avg_cm)
    plt.savefig(f'{OUTPUT_DIR}/CM_{title}.pdf')
    plt.clf()  # clf is a method; without the parentheses it was a no-op

    plot_relevances(title, cols, relevances)
    plt.savefig(f'{OUTPUT_DIR}/REL_{title}.pdf')
    plt.clf()
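# do_experiment relies on several module-level names not shown in this
# excerpt (NUM_CV, OUTPUT_DIR, num_classes, plus the read_cols,
# plot_confusion_matrix and plot_relevances helpers). A hypothetical setup
# with illustrative values only:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn_lvq import GmlvqModel

NUM_CV = 10          # number of cross-validation folds (illustrative)
OUTPUT_DIR = 'out'   # directory for the saved figures (illustrative)
num_classes = 4      # must match prototypes_per_class=[1, 1, 1, 1]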
def test_gmlvq():
    # Load data (the positional boolean form load_iris(True) is deprecated)
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=4242)

    # Create and fit model
    model = GmlvqModel(prototypes_per_class=3, max_iter=200,
                       random_state=4242, dim=2)
    model.fit(X_train, y_train)

    # Select data point for explaining its prediction
    x_orig = X_test[1, :]
    assert model.predict([x_orig]) == 2

    # Compute counterfactuals with different regularizers and optimizers
    features_whitelist = None

    x_cf, y_cf, delta = generate_counterfactual(
        model, x_orig, 0, features_whitelist=features_whitelist,
        regularization="l1", C=0.01, optimizer="bfgs", return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    x_cf, y_cf, delta = generate_counterfactual(
        model, x_orig, 0, features_whitelist=features_whitelist,
        regularization="l1", C=1.0, optimizer="nelder-mead",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    x_cf, y_cf, delta = generate_counterfactual(
        model, x_orig, 0, features_whitelist=features_whitelist,
        regularization=None, optimizer="bfgs", return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    x_cf, y_cf, delta = generate_counterfactual(
        model, x_orig, 0, features_whitelist=features_whitelist,
        regularization=None, optimizer="nelder-mead", return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0

    # Restrict the counterfactual to a whitelist of features: any feature
    # outside the whitelist must remain unchanged (delta == 0).
    features_whitelist = [0, 1, 2, 3]

    x_cf, y_cf, delta = generate_counterfactual(
        model, x_orig, 0, features_whitelist=features_whitelist,
        regularization="l1", C=0.01, optimizer="bfgs", return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0
    assert all(i in features_whitelist or delta[i] == 0.
               for i in range(x_orig.shape[0]))

    x_cf, y_cf, delta = generate_counterfactual(
        model, x_orig, 0, features_whitelist=features_whitelist,
        regularization="l1", C=1.0, optimizer="nelder-mead",
        return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0
    assert all(i in features_whitelist or delta[i] == 0.
               for i in range(x_orig.shape[0]))

    features_whitelist = [0, 2]
    x_cf, y_cf, delta = generate_counterfactual(
        model, x_orig, 0, features_whitelist=features_whitelist,
        regularization=None, optimizer="nelder-mead", return_as_dict=False)
    assert y_cf == 0
    assert model.predict(np.array([x_cf])) == 0
    assert all(i in features_whitelist or delta[i] == 0.
               for i in range(x_orig.shape[0]))
glvq.fit(x, y)
p2 = plt.subplot(232)
p2.set_title('GLVQ')
plot(PCA().fit_transform(x), y, glvq.predict(x), glvq.w_, glvq.c_w_, p2)

# GRLVQ
grlvq = GrlvqModel()
grlvq.fit(x, y)
p3 = plt.subplot(233)
p3.set_title('GRLVQ')
plot(grlvq.project(x, 2), y, grlvq.predict(x),
     grlvq.project(grlvq.w_, 2), grlvq.c_w_, p3)

# GMLVQ
gmlvq = GmlvqModel()
gmlvq.fit(x, y)
p4 = plt.subplot(234)
p4.set_title('GMLVQ')
plot(gmlvq.project(x, 2), y, gmlvq.predict(x),
     gmlvq.project(gmlvq.w_, 2), gmlvq.c_w_, p4)

# LGMLVQ
lgmlvq = LgmlvqModel()
lgmlvq.fit(x, y)
p5 = plt.subplot(235)
elem_set = list(set(lgmlvq.c_w_))
p5.set_title('LGMLVQ 1')
plot(lgmlvq.project(x, 1, 2, True), y, lgmlvq.predict(x),
     lgmlvq.project(np.array([lgmlvq.w_[1]]), 1, 2),
show_plot = False

# Evaluation scores
accuracy_noadjust = []
accuracy_adjust = []

# Create initial data set
class_means = np.array([[0.0, 0.0], [5.0, 8.0]])
#class_means = np.array([[0.0, 0.0], [5.0, 8.0], [8.0, 0.0]])
cov = np.array([[1.0, 0.0], [0.0, 1.0]])
#cov = np.array([[0.1, 0.0], [0.0, 5.0]])
X, y = sample_from_classdist(class_means, cov)
#plot_classification_dataset(X, y)

# Fit model to initial data set
model = GmlvqModel(prototypes_per_class=1, random_state=4242)
#model = MrslvqModel(prototypes_per_class=1, random_state=4242)
model.fit(X, y)
print(np.dot(model.omega_.T, model.omega_))

mymodel = MyModel(model.w_, np.dot(model.omega_.T, model.omega_))
y_pred = mymodel.predict(X)
print("MyModel: {0}".format(accuracy_score(y, y_pred)))

#model2 = LgmlvqModel(prototypes_per_class=1, random_state=4242)
model2 = LmrslvqModel(prototypes_per_class=1, random_state=4242)
model2.fit(X, y)
print([np.dot(o.T, o) for o in model2.omegas_])

mymodel2 = MyModel2(model2.w_, [np.dot(o.T, o) for o in model2.omegas_])
y_pred2 = mymodel2.predict(X)
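# `MyModel` (and its localized counterpart `MyModel2`) are not defined in
# this excerpt. From the way they are constructed above, they appear to
# re-implement the nearest-prototype rule from the learned prototypes w_
# and the relevance matrix Lambda = omega^T omega. A minimal sketch of
# `MyModel` under that assumption (prototype index used as class label,
# which holds here because prototypes_per_class=1 and classes are 0, 1, ...):
import numpy as np

class MyModel:
    def __init__(self, prototypes, Lambda):
        self.prototypes = prototypes  # learned prototypes w_
        self.Lambda = Lambda          # relevance matrix omega^T omega

    def predict(self, X):
        # GMLVQ distance: d(x, w) = (x - w)^T Lambda (x - w);
        # assign each sample to the class of its nearest prototype.
        d = np.array([[np.dot(x - w, np.dot(self.Lambda, x - w))
                       for w in self.prototypes] for x in X])
        return np.argmin(d, axis=1)

# MyModel2 would be analogous, with one Lambda per (localized) prototype.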
metric = ['euclidean', 'minkowski']
param_grid = dict(n_neighbors=k_list, metric=metric)
knn = GridSearchCV(classifier, param_grid)
knn.fit(val_x, val_y)
print('Best parameters were:', knn.best_params_)
best_n = knn.best_params_['n_neighbors']

"""#### **LVQ Classifier**

The *sklearn-lvq* package was used to implement LVQ, and a grid search was
run to find the best regularization value.
"""

#!pip install sklearn-lvq
warnings.filterwarnings(action='ignore')

glvq = GmlvqModel(gtol=1e-1, max_iter=150)
param_grid = {'regularization': [0.0, 0.1, 0.5]}  #, 'beta': [1, 2]}
clf = GridSearchCV(glvq, param_grid)
clf.fit(val_x, val_y)
print('Best:', clf.best_params_)
best_reg = clf.best_params_['regularization']
#best_beta = clf.best_params_['beta']

warnings.filterwarnings(action='default')  # re-enable warnings

"""#### **SVM Classifier**

The *sklearn* package was used to implement the SVM, and a grid search was
run over the regularization parameter (C) and the kernel.
"""

svm = SVC()
param_grid = {'C': [0.5, 1.0, 10], 'kernel': ['rbf', 'sigmoid']}
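# A hedged follow-up (not in the original notebook): refit with the best
# regularization found above and evaluate on a held-out split. The names
# train_x/train_y/test_x/test_y are assumptions alongside val_x/val_y.
best_lvq = GmlvqModel(gtol=1e-1, max_iter=150, regularization=best_reg)
best_lvq.fit(train_x, train_y)
print('GMLVQ test accuracy:', best_lvq.score(test_x, test_y))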
scores_cf_perturbation_dist = []
results = {'notFound': 0, 'found': 0}

kf = KFold(n_splits=n_kf_splits)
for train_index, test_index in kf.split(X):
    # Split data into training and test set
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit and evaluate classifier
    model = None
    if modeldesc == "glvq":
        model = GlvqModel(prototypes_per_class=n_prototypes)
    elif modeldesc == "gmlvq":
        model = GmlvqModel(prototypes_per_class=n_prototypes)
    elif modeldesc == "logreg":
        model = LogisticRegression(multi_class='multinomial')
    elif modeldesc == "dectree":
        model = DecisionTreeClassifier(max_depth=7)
    model.fit(X_train, y_train)

    # Compute accuracy on test set
    y_pred = model.predict(X_test)
    print(f"F1-score: {f1_score(y_test, y_pred, average='weighted')}")

    labels = np.unique(y)

    # Compute counterfactual of each test sample
    for i in range(X_test.shape[0]):
        x_orig_orig = X_test[i, :]
        y_orig = y_test[i]
                   np.array([0 for _ in range(n_samples)]).reshape(-1, 1)))
y = [0 for _ in range(n_samples)]

X = np.vstack(
    (X, np.hstack((np.random.uniform(7, 12, n_samples).reshape(-1, 1),
                   np.array([5 for _ in range(n_samples)]).reshape(-1, 1)))))
y += [1 for _ in range(n_samples)]
y = np.array(y)

from plotting import plot_classification_dataset, export_as_png
plot_classification_dataset(X, y, show=False)
export_as_png("toydata.png")

# Fit model
model = GmlvqModel(prototypes_per_class=1, random_state=4242)
model.fit(X, y)

# Evaluate
y_pred = model.predict(X)
y_, y_pred_ = encode_labels(y.reshape(-1, 1), y_pred.reshape(-1, 1))
print("ROC-AUC: {0}".format(roc_auc_score(y_, y_pred_, average="weighted")))
print("Omega\n{0}".format(np.dot(model.omega_.T, model.omega_)))
print()

# Compute counterfactual metric
x_orig = np.array([10.0, 0])
y_target = 1
Omega_cf = compute_change_in_distmat_gmlvq(model, x_orig, y_target)[0]
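# `encode_labels` is not shown in this excerpt; since roc_auc_score with
# average="weighted" expects per-class scores, it presumably one-hot encodes
# both label vectors with a shared encoder. A minimal sketch under that
# assumption (not the original implementation):
import numpy as np
from sklearn.preprocessing import OneHotEncoder

def encode_labels(y, y_pred):
    # Assumed helper: one-hot encode true and predicted labels with a
    # shared encoder fitted on both, so all classes are represented.
    enc = OneHotEncoder().fit(np.vstack((y, y_pred)))
    return enc.transform(y).toarray(), enc.transform(y_pred).toarray()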
# Project data onto the first pca_dim principal components
pca = PCA(n_components=pca_dim)
pca.fit(X_train)
projection_matrix = pca.components_  # projection matrix
projection_mean_sub = pca.mean_
#print(projection_matrix)

X_train = np.dot(X_train - projection_mean_sub, projection_matrix.T)
X_test = np.dot(X_test - projection_mean_sub, projection_matrix.T)

# Fit classifier
model = None
if modeldesc == "glvq":
    model = GlvqModel(prototypes_per_class=n_prototypes, random_state=4242)
elif modeldesc == "gmlvq":
    model = GmlvqModel(prototypes_per_class=n_prototypes, random_state=4242)
elif modeldesc == "logreg":
    model = LogisticRegression(multi_class='multinomial')
elif modeldesc == "dectree":
    model = DecisionTreeClassifier(max_depth=7, random_state=42)
model.fit(X_train, y_train)

# Compute accuracy on test set
y_pred = model.predict(X_test)
print(f"F1-score: {f1_score(y_test, y_pred, average='weighted')}")

# Fit model for finding closest samples
closest_samples = ClosestSample(X_train_orig, y_train)

# For each class, fit density estimators
density_estimators = {}
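# `ClosestSample` is not defined in these excerpts; judging from its
# construction on the original (pre-PCA) training data, it presumably
# retrieves, for a query point and a target class, the closest training
# sample of that class. A minimal sketch under that assumption:
import numpy as np

class ClosestSample:
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __call__(self, x, y_target):
        # Return the training sample of class y_target that is closest
        # to x under Euclidean distance.
        candidates = self.X[self.y == y_target]
        dists = np.linalg.norm(candidates - x, axis=1)
        return candidates[np.argmin(dists)]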
def get_error(lvq_model: GmlvqModel, x, y) -> float:
    '''Return the misclassification rate of the model on (x, y).'''
    y_ = lvq_model.predict(x)
    return float(1 - np.mean(y_ == y))
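# main() above also calls `gmlvq_covered_variance`, which is not defined in
# these excerpts. The sketch below is purely an assumption: it eigendecomposes
# the relevance matrix Lambda = omega^T omega and reports the fraction of
# total variance covered by the smallest set of leading eigenvalues reaching
# `thresh` (with thresh=1 this is simply all of it).
def gmlvq_covered_variance(lvq_model: GmlvqModel, thresh: float = 0.95,
                           verbose: bool = False) -> float:
    Lambda = np.dot(lvq_model.omega_.T, lvq_model.omega_)
    eigvals = np.linalg.eigvalsh(Lambda)[::-1]  # sorted descending
    covered = np.cumsum(eigvals) / np.sum(eigvals)
    n = int(np.searchsorted(covered, min(thresh, covered[-1]))) + 1
    if verbose:
        print(f'{n} eigenvalues cover {covered[n - 1]:.3f} of the variance')
    return float(covered[n - 1])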
(big circle) and which class was predicted (smaller circle). It also shows
the prototypes (black diamond) and their labels (small point inside the
diamond). The projected data is shown in the right plot.

"""
import matplotlib.pyplot as plt
import numpy as np

from sklearn_lvq import GmlvqModel
from sklearn_lvq.utils import plot2d

print(__doc__)

nb_ppc = 100
toy_label = np.append(np.zeros(nb_ppc), np.ones(nb_ppc), axis=0)

print('GMLVQ:')
# Two overlapping Gaussian clusters with the same covariance structure
toy_data = np.append(
    np.random.multivariate_normal([0, 0], np.array([[5, 4], [4, 6]]),
                                  size=nb_ppc),
    np.random.multivariate_normal([9, 0], np.array([[5, 4], [4, 6]]),
                                  size=nb_ppc), axis=0)
gmlvq = GmlvqModel()
gmlvq.fit(toy_data, toy_label)
plot2d(gmlvq, toy_data, toy_label, 1, 'gmlvq')

print('classification accuracy:', gmlvq.score(toy_data, toy_label))
plt.show()