def GaussianClassifier(dataset, show=False):
    """Fit an RBF-kernel Gaussian-process classifier on the full dataset and
    visualise its confusion matrix.

    Parameters
    ----------
    dataset : tuple
        ``(X, labels)`` — feature rows and their class labels.
    show : bool
        When True, additionally scatter-plot the first feature pairs via
        ``paint`` for both the true labels and the predictions.
    """
    X, labels = dataset
    kernel = 1.0 * RBF(1.0)
    classifier = GaussianProcessClassifier(kernel=kernel, random_state=0).fit(X, labels)
    # (removed a dead `classifier.score(X, labels)` call whose result was discarded)
    new_labels = classifier.predict(X)
    conf = confusion_matrix(labels, new_labels)
    print(conf)
    # BUG FIX: `conf / conf.sum(axis=1)` broadcasts the row-sum vector over the
    # wrong axis, dividing column j by row j's total. keepdims makes the
    # division row-wise, i.e. a properly row-normalised confusion matrix.
    conf = conf / conf.sum(axis=1, keepdims=True)
    plt.matshow(conf)
    plt.title("Gaussian")
    plt.show()
    if show:
        # Only the first two feature columns were ever plotted (the original
        # loop broke at i >= 2), so iterate just those.
        for i in range(min(2, len(X[0]))):
            for j in range(i):
                xs = [row[i] for row in X]  # renamed: original shadowed `x` inside its own generator
                ys = [row[j] for row in X]
                paint(xs, ys, labels, "labels")
                paint(xs, ys, new_labels, "prediction")
        plt.show()
def GP_Classifier(i):
    """Train a Gaussian-process classifier on dataset *i* and report accuracy
    under three validation schemes: a 75/25 hold-out split, 10-fold CV, and
    leave-one-out CV."""
    x_data, y_data = data_select(i)
    gpc = GaussianProcessClassifier(random_state=53)
    # --- hold-out (split) validation ---
    X_train, X_test, Y_train, Y_test = train_test_split(
        x_data, y_data, test_size=0.25, random_state=53)
    gpc.fit(X_train, np.ravel(Y_train, order='C'))
    scores = (gpc.score(X_train, Y_train), gpc.score(X_test, Y_test))
    print('Train Acc: %.3f, Test Acc: %.3f' % scores)
    # --- 10-fold cross-validation ---
    kfold = model_selection.KFold(n_splits=10)
    results_kfold = model_selection.cross_val_score(
        gpc, x_data, np.ravel(y_data, order='C'), cv=kfold)
    print("Accuracy: %.2f%%" % (results_kfold.mean() * 100.0))
    # --- leave-one-out cross-validation ---
    loocv = LeaveOneOut()
    results_loocv = model_selection.cross_val_score(
        gpc, x_data, np.ravel(y_data, order='C'), cv=loocv)
    print("Accuracy: %.2f%%" % (results_loocv.mean() * 100.0))
def train_l2_gaussian(x_train, x_test, y_train, y_test):
    """Fit a Gaussian-process classifier, print its accuracy (test accuracy
    when labels are available, train accuracy otherwise), and return the
    training-set predictions as a column vector."""
    model = GaussianProcessClassifier()
    model.fit(x_train, y_train)
    if y_test is None:
        accuracy = model.score(x_train, y_train)
    else:
        accuracy = model.score(x_test, y_test)
    print('GaussianProcessClassifier:', accuracy)
    return np.reshape(model.predict(x_train), (-1, 1))
def one_vs_rest_gauss_process_with_log():
    """Fit a one-vs-rest GP classifier on the thal data (minus a few columns)
    and append train/test accuracies to the module-level score lists."""
    global train_score, test_score
    frame = thal_data()
    # Columns excluded from the feature matrix; 'thal' is the target.
    excluded = ['thal', 'pressure', 'cholestoral', 'age', 'heart_rate']
    features = frame.drop(excluded, axis=1)
    target = frame['thal']
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=5)
    model = GaussianProcessClassifier(
        kernel=1.0 * RBF(1.0), random_state=0,
        multi_class='one_vs_rest').fit(x_train, y_train)
    train_score.append(model.score(x_train, y_train))
    test_score.append(model.score(x_test, y_test))
def train_l1_gaussian(x_train, x_test, y_train, y_test):
    """Fit a Gaussian-process classifier and return its predictions.

    Returns
    -------
    list
        ``[train_predictions, test_predictions]`` — each a column vector.
        The order and contents match the original implementation; only the
        local names were fixed: the original bound predictions on *x_train*
        to a variable called ``test_res`` and vice versa.
    """
    clf = GaussianProcessClassifier(n_jobs=-1)
    clf.fit(x_train, y_train)
    if y_test is not None:
        print('GaussianProcessClassifier:', clf.score(x_test, y_test))
    else:
        print('GaussianProcessClassifier:', clf.score(x_train, y_train))
    train_res = np.reshape(clf.predict(x_train), (-1, 1))  # predictions on the training set
    test_res = np.reshape(clf.predict(x_test), (-1, 1))    # predictions on the test set
    return [train_res, test_res]
def task3(feature_sets, label_sets):
    """Plot GPC test error against training-set size M for each dataset.

    For every dataset the first 40% of rows is held out as the test split and
    ten classifiers are trained on growing prefixes of the remaining rows.
    """
    sets = ["A", "B", "crashes", "diabetes", "ionosphere"]
    kernel = 1.0 * RBF(1.0)
    for i in range(len(sets)):  # generalized from a hard-coded range(5)
        n = len(label_sets[i])
        # Ten training-set sizes from 10 up to 60% of the data.
        m = np.linspace(10, .6 * n, num=10, dtype=int)
        div = int(n * .4)
        x_train = feature_sets[i][div:]
        x_test = feature_sets[i][:div]
        y_train = label_sets[i][div:]
        y_test = label_sets[i][:div]
        gpc_errors = []
        for j in range(len(m)):
            gpc = GPC(kernel=kernel, random_state=0)
            # NOTE(review): trains on m[j] - 1 samples, not m[j] — looks like
            # an off-by-one but is preserved; confirm against the assignment.
            gpc.fit(x_train[:m[j] - 1], np.ravel(y_train[:m[j] - 1]))
            gpc_errors.append(1 - gpc.score(x_test, np.ravel(y_test)))
        # BUG FIX: plot the labeled line BEFORE calling legend(); the original
        # called plt.legend() first, so the legend had nothing to display.
        plt.plot(m, gpc_errors, label="GPC")
        plt.legend()
        plt.ylabel("Error")
        plt.xlabel("M value")
        plt.title(sets[i])
        plt.show()
    return
def compute_per_gaussian(self, max_iter=100):
    """Fit a Gaussian-process classifier on one feature column at a time and
    append each test accuracy to ``self.features_accuracy``.

    The column index advances via ``self.counter`` offset by
    ``self.epoch * self.neuron_num``.
    """
    # BUG FIX: the loop bound referenced a bare `X`, which is undefined inside
    # this method (NameError unless a global X happens to exist); it almost
    # certainly meant the training matrix. TODO confirm the /45 divisor intent.
    for _ in range(int(len(self.X_train[0]) / 45)):
        col = self.epoch * self.neuron_num + self.counter  # current feature column
        X_train_mod = [[row[col]] for row in self.X_train]  # one-column training set
        X_test_mod = [[row[col]] for row in self.X_test]    # one-column test set
        # (removed unused locals gamma, c, kernel left over from an SVM version)
        clf = GPC(max_iter_predict=max_iter)
        clf.fit(X_train_mod, self.y_train)
        self.features_accuracy.append(clf.score(X_test_mod, self.y_test))
        self.counter += 1
def compute_per_gaussian(self, max_iter=100):
    """Train a Gaussian-process classifier on each single feature column
    (indexed by ``self.counter``) and record its test accuracy in
    ``self.features_accuracy``."""
    print(len(self.X_train))
    print(len(self.X_train[0]))
    for _ in range(int(len(self.X[0]))):
        # Project train/test matrices down to the current single column.
        train_column = [[sample[self.counter]] for sample in self.X_train]
        test_column = [[sample[self.counter]] for sample in self.X_test]
        model = GPC(max_iter_predict=max_iter)
        model.fit(train_column, self.y_train)
        self.features_accuracy.append(model.score(test_column, self.y_test))
        self.counter += 1
def main():
    """Train a Gaussian-process classifier on the CSV train/test datasets,
    print test accuracy, and emit confusion-matrix and classification-report
    figures via ``stdfunc``."""
    train_frame = pd.read_csv('../train_dataset.csv')
    test_frame = pd.read_csv('../test_dataset.csv')
    # Column 0 holds the label; columns 2+ hold the features.
    X_train = train_frame.iloc[:, 2:]
    y_train = train_frame.iloc[:, 0]
    X_test = test_frame.iloc[:, 2:]
    y_test = test_frame.iloc[:, 0]
    unique_labels = sorted(y_train.unique().tolist())
    clf = GaussianProcessClassifier(max_iter_predict=500, warm_start=True, n_jobs=-1)
    clf.fit(X_train, y_train)
    print("\n\n{}\n".format(clf.score(X_test, y_test)))
    y_predicted = clf.predict(X_test)
    print("Generating confusion matrix figure... \n")
    stdfunc.plot_confusion_matrix(
        y_test, y_predicted, ml_name='GP', classes=unique_labels,
        title='Confusion matrix for Gaussian Process evaluation')
    print("Generating classification report figure... \n")
    stdfunc.plot_classification_report(
        y_test, y_predicted, ml_name='GP', classes=unique_labels,
        title='Classification report for Gaussian Process evaluation')
class GaussianProcess_(ProbabilisticModel):
    """Thin adapter exposing sklearn's GaussianProcessClassifier through the
    project's ProbabilisticModel interface (datasets provide
    ``format_sklearn() -> (X, y)``)."""

    def __init__(self, *args, **kwargs):
        # All constructor arguments are forwarded verbatim to sklearn.
        self.model = GaussianProcessClassifier(*args, **kwargs)
        self.name = "gpc"

    def train(self, dataset, *args, **kwargs):
        """Fit on a dataset exposing format_sklearn(); returns the fitted model."""
        return self.model.fit(*(dataset.format_sklearn() + args), **kwargs)

    def predict(self, feature, *args, **kwargs):
        """Return hard class predictions for `feature`."""
        return self.model.predict(feature, *args, **kwargs)

    def score(self, testing_dataset, *args, **kwargs):
        """Return mean accuracy on a dataset exposing format_sklearn()."""
        return self.model.score(*(testing_dataset.format_sklearn() + args), **kwargs)

    def predict_proba(self, feature, *args, **kwargs):
        """Return per-class probabilities for `feature`."""
        return self.model.predict_proba(feature, *args, **kwargs)

    def feature_importances_(self):
        """GPC has no feature importances; log a warning and return None."""
        # FIX: Logger.warn is the long-deprecated alias; warning() is the
        # supported spelling in the stdlib logging API.
        LOGGER.warning("GPC model does not support feature_importance")
        return None

    def get_params(self):
        """Expose the underlying estimator's parameters."""
        return self.model.get_params()
def gaussian_process_models(x_train, y_train):
    """Fit a default Gaussian-process classifier, print its training accuracy,
    and return the fitted estimator."""
    from sklearn.gaussian_process import GaussianProcessClassifier
    classifier1 = GaussianProcessClassifier()
    classifier1.fit(x_train, y_train)
    train_accuracy = classifier1.score(x_train, y_train)
    print('GaussianProcessClassifier training accuracy: ', train_accuracy)
    return classifier1
def GPAL(X, Y, train_ind, candidate_ind, test_ind, sample='En', kernel='rbf', Niter=500, eta=10):
    """Gaussian-process active learning loop.

    Repeatedly fits a GPC on the current training indices, scores it on the
    test indices, then moves one candidate point into the training set
    according to the acquisition strategy `sample`:
      'rand'  - uniform random candidate
      'En'    - maximum predictive entropy
      'var'   - maximum mean per-class latent variance
      'varEN' - weighted combination of variance and entropy (weight `eta`)

    Returns [ourRes, varRes, enRes]: per-iteration test accuracies, and (for
    'varEN') the mean variance / mean entropy traces.
    """
    ourRes = []
    # Work on copies so the caller's index lists are not mutated.
    train_index = train_ind.copy()
    test_index = test_ind.copy()
    candidate_index = candidate_ind.copy()
    varRes = []
    enRes = []
    for i in range(Niter):
        print(i)
        if (kernel == 'linear'):
            dotkernel = DotProduct(sigma_0=1)
            model = GPC(kernel=dotkernel)
        else:
            model = GPC()  # default RBF kernel
        model.fit(X[train_index], Y[train_index])
        ourRes.append(model.score(X[test_index, :], Y[test_index]))
        print(ourRes[-1])
        if (sample == 'rand'):
            sampleIndex = np.random.randint(len(candidate_index))
        elif (sample == 'En'):
            proba = model.predict_proba(X[candidate_index, :])
            # Entropy of each candidate's predictive distribution (column-wise).
            en = sp.stats.entropy(proba.T)
            sampleScore = en
            sampleIndex = np.argmax(sampleScore)
        elif (sample == 'var'):
            # predict_proba is called for its side effect: it populates the
            # per-estimator latent variance (`tem.var`) read below.
            # NOTE(review): `.var` is not a public sklearn attribute — this
            # presumably relies on a patched estimator; confirm.
            model.predict_proba(X[candidate_index, :])
            meanVar = np.zeros(len(candidate_index))
            for tem in model.base_estimator_.estimators_:
                meanVar = meanVar + tem.var
            sampleIndex = np.argmax(meanVar)
        elif (sample == 'varEN'):
            proba = model.predict_proba(X[candidate_index, :])
            en = sp.stats.entropy(proba.T)
            meanVar = np.zeros(len(candidate_index))
            enRes.append(np.mean(en))
            for tem in model.base_estimator_.estimators_:
                meanVar = meanVar + tem.var
            # Variance normalised by the class count, scaled by eta, plus entropy.
            sampleIndex = np.argmax(meanVar / len(np.unique(Y)) * eta + en)
            varRes.append(np.mean(meanVar))
            print('max var %f----selected var %f-----selected en %f ' % (np.max(meanVar), meanVar[sampleIndex], en[sampleIndex]))
        # Translate position within the candidate list into a dataset index,
        # add it to the training pool, and remove it from the candidates.
        sampleIndex = candidate_index[sampleIndex]
        train_index = train_index + [sampleIndex]
        candidate_index = [ x for x in candidate_index if x not in [sampleIndex] ]
    return [ourRes, varRes, enRes]
def compute(dataSet, dataRes, dataTest, dataTestID):
    """Fit a Gaussian-process classifier on the training data, write one
    prediction per test passenger to 'gaussian.csv', and print the training
    accuracy.

    Parameters
    ----------
    dataSet, dataRes : training features and labels.
    dataTest : iterable of test feature rows.
    dataTestID : passenger ids aligned with ``dataTest``.
    """
    gauss = GaussianProcessClassifier()
    gauss.fit(dataSet, dataRes)
    # BUG FIX: the csv module requires newline='' on the file handle;
    # without it every row is followed by a blank line on Windows.
    with open('gaussian.csv', 'w', newline='') as subFile:
        fileWriter = csv.writer(subFile, delimiter=',')
        fileWriter.writerow(['PassengerId', 'Survived'])
        for passenger_id, row in zip(dataTestID, dataTest):
            predict = gauss.predict([row])[0]
            fileWriter.writerow([passenger_id, predict])
    print('Gaussian', gauss.score(dataSet, dataRes))
def job(i):
    """Grid-search GaussianProcessClassifier hyperparameters on fold *i*.

    Reads the preprocessed train/test CSVs for the fold, fits one model per
    hyperparameter combination, writes each confusion matrix to a CSV under
    ``target_path``, and returns a DataFrame with one scored row per
    combination.
    """
    df_train = pd.read_csv("preprocessed_training_" + str(i) + ".csv")
    df_train = df_train.drop(["ID"], axis=1)
    y = df_train["Class"]
    X = df_train.drop(['Class'], axis=1)
    df_test = pd.read_csv("preprocessed_test_" + str(i) + ".csv")
    df_test = df_test.drop(["ID"], axis=1)
    X_p = df_test.drop(["Class"], axis=1)
    # Hyperparameter grids are module-level globals set by the caller.
    global n_estimators
    global max_iter_predict
    global warm_start
    global multi_class
    random_state = 1428  # NOTE(review): defined but never passed to the model — confirm intent
    rows = []
    for n in n_estimators:
        for d in max_iter_predict:
            for s in warm_start:
                for m in multi_class:
                    result_row = {
                        "n_estimators": n,
                        "fold": i,
                        "max_iter_predict": d,
                        "warm_start": s,
                        "multi_class": m,
                    }
                    print(result_row)
                    rf = GaussianProcessClassifier(n_restarts_optimizer=n,
                                                   max_iter_predict=d,
                                                   warm_start=s,
                                                   n_jobs=1,
                                                   multi_class=m)
                    rf.fit(X, y)
                    predicted = rf.predict(X_p)
                    result_row["score"] = round(
                        rf.score(X_p, df_test["Class"]), 4)
                    confusion = confusion_matrix(df_test["Class"], predicted)
                    conf = pd.DataFrame(confusion)
                    conf.to_csv(target_path + "confusion_" + str(i) + "_" +
                                str(n) + "_" + str(d) + "_" + str(s) + "_" +
                                str(m) + "_" + ".csv",
                                index=False)
                    rows.append(result_row)
    # BUG FIX: DataFrame.append was deprecated and removed in pandas 2.0;
    # collect the rows and build the frame once instead.
    return pd.DataFrame(rows)
def gaussian_kernel(X, y, X_train, X_test, y_train, y_test):
    """Fit a Gaussian-process classifier with an RBF kernel, plot its decision
    surface, and print confusion matrix / classification report / train score."""
    from sklearn.svm import SVC
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.gaussian_process.kernels import RBF
    from sklearn.metrics import classification_report, confusion_matrix

    rbf = 1.0 * RBF(1.0)
    model = GaussianProcessClassifier(kernel=rbf, random_state=0)
    plot_model(model, 'Gaussian kernel', X, y)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Evaluation on the held-out split.
    print("Confusion Matrix for gaussian")
    print(confusion_matrix(y_test, y_pred))
    print("Classification report for gaussian")
    print(classification_report(y_test, y_pred))
    print("Score for Gaussian RBF:", model.score(X_train, y_train))
def OVA_OVO(param):
    """Evaluate a GaussianProcessClassifier with the given multi-class
    strategy (`param`) on every ARFF dataset listed in ``lista_datasets``."""
    print('Aplicando metodo multiclase ONE VS ALL GAUSSIAN PROCESS CLASSIFIER')
    for dataset_name in lista_datasets:
        print('Base de datos: ' + str(dataset_name))
        raw = arff.loadarff('./datasets/' + str(dataset_name))
        df = pd.DataFrame(raw[0])
        # Features are every column except 'class'; labels are factorized ints.
        features = df.iloc[:, df.columns != 'class']  # renamed: original shadowed builtin `input`
        labels = pd.factorize(df['class'])[0]
        X_train, X_test, Y_train, Y_test = train_test_split(
            features, labels, test_size=0.25)
        gpc = GaussianProcessClassifier(kernel=(1.0 * RBF(1.0)),
                                        random_state=0,
                                        multi_class=param)
        gpc.fit(X_train, Y_train)
        print('Porcentaje de bien clasificados GAUSSIAN PROCESS CLASSIFIER ONE VS ALL')
        print(gpc.score(X_test, Y_test))
        print('--------------------------')
class scikit_GaussianProcessClassifier(MLAlgo):
    """MLAlgo wrapper around sklearn's GaussianProcessClassifier.

    Data arrays are expected with features in all columns but the last and
    the label in the final column.
    """

    def __init__(self):
        self.clf = GaussianProcessClassifier()
        self.className = self.__class__.__name__

    def train(self, train_data):
        """Fit the classifier on features/labels split from `train_data`."""
        features, labels = train_data[:, :-1], train_data[:, -1]
        self.clf.fit(features, labels)
        print("GaussianProcessClassifier model built.")
        return self.className + " Training finished...\n"

    def test(self, test_data):
        """Print accuracy on `test_data` (same column layout as training)."""
        features, labels = test_data[:, :-1], test_data[:, -1]
        print("Accuracy: ", self.clf.score(features, labels))
        return self.className + " Testing finished...\n"

    def predict(self, predict_data):
        """Print predictions for already-split feature rows."""
        print("Predictions: ", self.clf.predict(predict_data))
        return self.className + " Prediction finished...\n"

    def cross_validate(self, train_data):
        """10-fold CV; promote this model to MLAlgo's best if it wins."""
        X_, Y_ = train_data[:, :-1], train_data[:, -1]
        predicted = cross_val_predict(self.clf, X_, Y_, cv=10)
        print("Cross-validation accuracy: ",
              metrics.accuracy_score(Y_, predicted))
        if metrics.accuracy_score(Y_, predicted) > MLAlgo.cross_validate_accuracy:
            MLAlgo.cross_validate_accuracy = metrics.accuracy_score(Y_, predicted)
            MLAlgo.classifier = self.clf
            MLAlgo.trained_instance = self
        return self.className + " Cross validation finished...\n"
# Script fragment: stack AD/control labels, split, and evaluate an RBF GPC.
print(np.shape(labels_con))
labels=np.vstack((labels_ad,labels_con))  # stack the two class label blocks row-wise
print(np.shape(labels))
# `lst` is the feature matrix built earlier in the file (not visible here).
data_train, data_test, labels_train, labels_test = train_test_split(lst, labels, test_size=0.20, random_state=42)
# RBF length scale of 214 — presumably tuned for this feature dimensionality; confirm.
kernel = 1.0 * RBF(214)
'''
S=data_train
T=data_test
S /= S.std(axis=0)
T /= T.std(axis=0)
ica = FastICA(n_components=14)
S_ = ica.fit_transform(S)
T_=ica.fit_transform(T)
'''
gpc = GaussianProcessClassifier(kernel=kernel,n_restarts_optimizer=5,random_state=None,multi_class="one_vs_rest",max_iter_predict=100,n_jobs=-1)
gpc=gpc.fit(data_train, labels_train)
print('')
print('accuracy on trainingset:',gpc.score(data_train,labels_train))
print('accuracy on testset:',gpc.score(data_test, labels_test))
# Confusion matrix and per-class report for the training split (classes 0/1).
print("confusion matrix for the training")
cm_train = confusion_matrix(labels_train, gpc.predict(data_train))
print(cm_train)
print(classification_report(labels_train, gpc.predict(data_train), labels=[0, 1]))
# Same diagnostics for the test split.
print("confusion matrix for the testing")
cm_test = confusion_matrix(labels_test, gpc.predict(data_test))
print(cm_test)
print('')
print(classification_report(labels_test, gpc.predict(data_test), labels=[0, 1]))
# NOTE(review): this fragment begins mid-call — the opening of the
# datasets.MNIST(...) constructor for mnist_trainset lies outside this chunk.
                                transform=torchvision.transforms.ToTensor())
mnist_testset = datasets.MNIST(root='./data', train=False, download=False,
                               transform=torchvision.transforms.ToTensor())
mnist_testset = mnist_testset  # no-op self-assignment (kept byte-identical)
print(mnist_trainset.data.shape)
print(mnist_trainset.data[0:2000].view(-1, 28 * 28).shape)
# One RBF length scale per flattened 28x28 image dimension count.
kernel = 1.0 * RBF(28 * 28)
gpc = GaussianProcessClassifier(kernel=kernel, n_restarts_optimizer=3, random_state=None, multi_class="one_vs_rest", max_iter_predict=200, n_jobs=-1)
# Fit on only the first 1000 training digits (GPC is O(n^3) in sample count).
gpc = gpc.fit(mnist_trainset.data[0:1000].view(-1, 28 * 28), mnist_trainset.targets[0:1000])
# Train-subset accuracy, then accuracy on the first 500 test digits.
print(
    gpc.score(mnist_trainset.data[0:1000].view(-1, 28 * 28),
              mnist_trainset.targets[0:1000]))
print(
    gpc.score(mnist_testset.data[0:500].view(-1, 28 * 28),
              mnist_testset.targets[0:500]))
# Spot-check: predictions vs. true targets for the first 20 samples of each split.
print(gpc.predict(mnist_trainset.data[0:20].view(-1, 28 * 28)))
print(mnist_trainset.targets[0:20])
print(gpc.predict(mnist_testset.data[0:20].view(-1, 28 * 28)))
print(mnist_testset.targets[0:20])
# Label the 3D eigenvector plot axes, hiding tick labels.
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])
plt.show()
from sklearn.gaussian_process.kernels import RBF
# Fit a one-vs-one GPC on the (projected) features X_, Y_ defined earlier.
kernel = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier(kernel=kernel, multi_class='one_vs_one', random_state=0).fit(X_, Y_)
# lets see how good our fit on the train set is
print(gpc.score(X_, Y_))
# create the TF neural net
# some hyperparams
training_epochs = 200
n_neurons_in_h1 = 10
n_neurons_in_h2 = 10
learning_rate = 0.01
dkl_loss_rate = 0.1
n_features = len(X[0])  # input width taken from the raw feature matrix X
labels_dim = 1
#############################################
# these placeholders serve as our input tensors
# Quadratic Discriminant qda = QuadraticDiscriminantAnalysis() qda.fit(X_train, y_train) print('QDA accuracy: ', qda.score(X_test, y_test)) # MPL classifier mpl = MLPClassifier(hidden_layer_sizes=(100, ), activation='logistic', max_iter=5000) mpl.fit(X_train, y_train) print('MPL accuracy: ', mpl.score(X_test, y_test)) # Gaussian Process gpc = GaussianProcessClassifier() gpc.fit(X_train, y_train) print('GPC accuracy: ', gpc.score(X_test, y_test)) # Random Forest Classifier rfc = RandomForestClassifier() rfc.fit(X_train, y_train) print('RFC accuracy: ', rfc.score(X_test, y_test)) # Computes the Silhoutte coefficient print( 'Silhouette coefficient: ', metrics.silhouette_score(novos_dados_pca.real.T, target, metric='euclidean')) print() #%%%%%%%%%%%%%%%%%%%% Supervised classification for PCA-KL features
sv.score(result_test, y_test) #rf = RandomForestClassifier() #rf.fit(result,y) #rf.score(result,y) #rf.score(result_test,y_test) # lr = LogisticRegressionCV() # lr.fit(result,y) # lr.score(result,y) # lr.score(result_test,y_test) # # Specify Gaussian Processes with fixed and optimized hyperparameters gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0)) gp_opt.fit(result, y) gp_opt.score(result, y) gp_opt.score(result_test, y_test) # ============================================================================= ################################################################################ clf = GradientBoostingClassifier(verbose=1) # The list of hyper-parameters we want to optimize. For each one we define the bounds, # the corresponding scikit-learn parameter name, as well as how to sample values # from that dimension (`'log-uniform'` for the learning rate) n_features = result.shape[1] dim_max_depth = Integer(1, 35, name='max_depth') dim_learning_rate = Real(10**-5, 10**0, "log-uniform", name='learning_rate') dim_max_features = Integer(1, n_features, name='max_features')
for name, gpc in gpList: start_time = time.time() # Training / Fitting print('\nName: ', name) gpc.fit(usedTrainX, usedTrainY) # Cross Validation #tenFoldCV = cross_val_score(gpc, usedTrainX, usedTrainY, cv=10, scoring='neg_log_loss', n_jobs=-1) tenFoldCV = cross_val_score(gpc, usedTrainX, usedTrainY, cv=5, n_jobs=-1) #avgCV = np.mean(tenFoldCV) print('Average of 10VC: ', round(tenFoldCV.mean(), 4) * 100, '%') print('Std of 10VC: ', tenFoldCV.std()) # Testing score = gpc.score(usedTestX, usedTestY) print('Testing Score:\t', round(score, 4) * 100, '%') # Running Time print("Time(s):\t", round((time.time() - start_time) * 100) / 100) # Add to usedModelList usedModelList.append([name, tenFoldCV.mean(), score, gpc, tenFoldCV.std()]) #%% Measuring Training+CV+Testing Time for models in usedModelList for name, m, s, model, std in usedModelList: start_time = time.time() # Training / Fitting print('\nName: ', name) gpc.fit(usedTrainX, usedTrainY)
def clf_GAU(imputed_data_x, y, train_idx, test_idx):
    """Train an RBF-kernel Gaussian-process classifier on the training rows of
    the imputed data and return the test accuracy as a percentage rounded to
    four decimals."""
    model = GaussianProcessClassifier(kernel=1.0 * RBF(1.0), random_state=0)
    model.fit(imputed_data_x[train_idx], y[train_idx])
    accuracy = model.score(imputed_data_x[test_idx], y[test_idx])
    return np.round(accuracy * 100, 4)
def _classifier_suite():
    """Fresh instances of the eight benchmark classifiers with their print labels."""
    return [
        ('KNN', KNeighborsClassifier(n_neighbors=7)),
        ('SVM', SVC(gamma='auto')),
        ('NB', GaussianNB()),
        ('DT', DecisionTreeClassifier(random_state=0)),
        ('QDA', QuadraticDiscriminantAnalysis()),
        ('MPL', MLPClassifier(hidden_layer_sizes=(100,), activation='logistic', max_iter=5000)),
        ('GPC', GaussianProcessClassifier()),
        ('RFC', RandomForestClassifier()),
    ]


def _evaluate_embedding(features, target):
    """Split `features`/`target` (60/40, fixed seed), fit every benchmark
    classifier, print each accuracy in the original format, and return the
    list of accuracy scores."""
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=.4, random_state=42)
    acc = []
    for label, clf in _classifier_suite():
        clf.fit(X_train, y_train)
        s = clf.score(X_test, y_test)
        acc.append(s)
        print(label + ' accuracy: ', s)
    return acc


def batch_Parametric_PCA():
    """Sweep the Parametric PCA neighborhood size K on a dataset, reporting
    the mean accuracy of eight classifiers per K, then compare against PCA,
    KPCA, ISOMAP, LLE and Laplacian-Eigenmaps baselines.

    REFACTOR: the original repeated the identical 8-classifier evaluation
    block six times; it is now factored into _evaluate_embedding(). The dead
    `kap` cohen-kappa computations (assigned, never used) and the unused
    `kappas_medios` list were removed. All printed output is preserved.
    """
    # Dataset (alternative openml datasets used during experiments, with the
    # suggested K for each): Engine1 (235), prnn_crabs (10),
    # analcatdata_happiness (53), mux6 (105), threeOf9 (385), parity5 (25),
    # sa-heart (74), vertebra-column (305), breast-tissue (5),
    # transplant (65), hayes-roth (5), plasma_retinol (145), aids (42),
    # lupus (37), pwLinear (135), fruitfly (120), pm10 (485),
    # visualizing_livestock (125), strikes (130).
    X = skdata.load_iris()  # K = 95
    dados = X['data']
    target = X['target']
    # Normalize data
    dados = preprocessing.scale(dados)
    n = dados.shape[0]
    m = dados.shape[1]
    print('N = ', n)
    print('M = ', m)
    inicio = 5
    incremento = 5
    lista_k = list(range(inicio, n, incremento))
    acuracias = []
    for k in lista_k:
        print('K = ', k)
        novos_dados = ParametricPCA(dados, k, 2, 'KL')
        #%%%%%%%%%%%%%%%%% Parametric PCA
        print('Parametric PCA for supervised classification')
        acc = _evaluate_embedding(novos_dados.real.T, target)
        acuracia = sum(acc) / len(acc)
        # Silhouette coefficient of the embedding against the true labels.
        print('Silhouette coefficient: ',
              metrics.silhouette_score(novos_dados.real.T, target, metric='euclidean'))
        print('Average accuracy: ', acuracia)
        print()
        acuracias.append(acuracia)
    print('List of values for K: ', lista_k)
    print('Supervised classification accuracies: ', acuracias)
    acuracias = np.array(acuracias)
    print('Max Acc: ', acuracias.max())
    print('K* = ', lista_k[acuracias.argmax()])
    print()
    plt.figure(1)
    plt.plot(lista_k, acuracias)
    plt.title('Mean accuracies for different values of K (neighborhood)')
    plt.show()
    #%%%%%%%%%%%%% Dimensionality reduction baselines
    novos_dados_pca = PCA(dados, 2)
    novos_dados_kpca = KernelPCA(n_components=2, kernel='rbf').fit_transform(dados).T
    novos_dados_isomap = Isomap(n_neighbors=20, n_components=2).fit_transform(dados).T
    novos_dados_LLE = LocallyLinearEmbedding(n_neighbors=20, n_components=2).fit_transform(dados).T
    novos_dados_Lap = SpectralEmbedding(n_neighbors=20, n_components=2).fit_transform(dados).T
    baselines = [
        ('PCA', novos_dados_pca.real.T),
        ('KPCA', novos_dados_kpca.real.T),
        ('ISOMAP', novos_dados_isomap.real.T),
        ('LLE', novos_dados_LLE.real.T),
        ('Laplacian Eigenmaps', novos_dados_Lap.real.T),
    ]
    for nome, feats in baselines:
        print('Results for ' + nome)
        acc = _evaluate_embedding(feats, target)
        print('Silhouette coefficient: ',
              metrics.silhouette_score(feats, target, metric='euclidean'))
        print('Average accuracy: ', sum(acc) / len(acc))
        print()
    batch_Parametric_PCA_cluster(X)
# Fit three classifiers on the same (X_train, y_train) split — defined above
# this chunk — and print train/test accuracy for each (notebook-cell export,
# see the `# In[..]:` markers).
Qda = QuadraticDiscriminantAnalysis()
Qda.fit(X_train, y_train)
print('Accuracy of QDA classifier on training set: {:.2f}'
      .format(Qda.score(X_train, y_train)))
print('Accuracy of QDA classifier on test set: {:.2f}'
      .format(Qda.score(X_test, y_test)))

# In[29]:

# Gaussian process classifier with the default kernel (imported here,
# mid-script, as the original notebook cell did).
from sklearn.gaussian_process import GaussianProcessClassifier
GPC = GaussianProcessClassifier()
GPC.fit(X_train, y_train)
print('Accuracy of GPC classifier on training set: {:.2f}'
      .format(GPC.score(X_train, y_train)))
print('Accuracy of GPC classifier on test set: {:.2f}'
      .format(GPC.score(X_test, y_test)))

# In[30]:

# RBF-kernel SVM with a fixed gamma/C pair.
svm2 = SVC(gamma=2, C=1)
svm2.fit(X_train, y_train)
print('Accuracy of svm2 classifier on training set: {:.2f}'
      .format(svm2.score(X_train, y_train)))
print('Accuracy of svm2 classifier on test set: {:.2f}'
      .format(svm2.score(X_test, y_test)))
# NOTE(review): this chunk is cut mid-`if`: the loop below belongs to an
# earlier `if train == ...:` branch whose header (and true indentation
# depth) lies above this chunk — the reconstruction here is best-effort.
# Print the rounded predictions for the first 50 training samples.
for i in range(0, 50):
    print(colored(
        round(model.predict(np.expand_dims(X_train[i], axis=0))[0][0]),
        "green"), end=" ")
print("")
elif train == "sk":
    # select whichever one you would like to use
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import LinearSVC
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.gaussian_process import GaussianProcessClassifier
    from sklearn.ensemble import AdaBoostClassifier

    print(colored("[TRAIN] Training with sklearn", "green"))
    # Train a Gaussian process classifier on the prepared split.
    model = GaussianProcessClassifier()
    model.fit(X_train, y_train)
    print(colored("[TRAIN] sklearn complete", "green"))
    score = model.score(X_test, y_test)
    print(colored("Accuracy: {}".format(score), "cyan", attrs=['bold']))
    # Persist the fitted model (joblib `dump`, imported above this chunk).
    dump(model, 'model.joblib')
    # Echo raw predictions for the first 50 training samples.
    for i in range(0, 50):
        print(colored(model.predict(np.expand_dims(X_train[i], axis=0)), "green"), end=" ")
    print("")
# NOTE(review): everything above the closing ''' below appears to sit inside
# a triple-quoted block whose opening quote is above this chunk — i.e. the
# SVM/ensemble AUC computations are commented out, which matches the
# disabled prints at the bottom. Verify against the preceding lines.
probs = probs[:, 1]
auc_SVM = roc_auc_score(labels_test, probs)
#calculating AUC
probs = ensemble.predict_proba(selected_features_data_test)
probs = probs[:, 1]
auc_ensemble = roc_auc_score(labels_test, probs)
'''

#calculating AUC
# Positive-class probabilities from the Gaussian process classifier,
# scored against the test labels.
probs = gpc.predict_proba(selected_features_data_test)
probs = probs[:, 1]
auc_GP = roc_auc_score(labels_test, probs)

# Summary report: train/test accuracy per model, then AUC for the GP.
print('')
print('training accuracy GP classifer:', gpc.score(selected_features_train_data, labels_train))
print("training accuracy SVM classifer:", metrics.accuracy_score(training_pred, labels_train))
print('test accuracy GP classifer:', gpc.score(selected_features_data_test, labels_test))
print('test accuracy SVM classifer:', metrics.accuracy_score(test_pred, labels_test))
print('test accuracy tree classifer:', metrics.accuracy_score(test_tree_pred, labels_test))
print('test accuracy ensamble classifer:', ensemble.score(selected_features_data_test, labels_test))
print('AUC GP classifer: %.2f' % auc_GP)
#print('AUC SVM classifer: %.2f' % auc_SVM)
#print('AUC ensemble classifer: %.2f' % auc_ensemble)
# Train a Gaussian-process classifier on two scaled features of the labeled
# particles, then sweep the probability threshold and record the per-class
# F1 score at each threshold.
# split into training and test set
train_set, test_set = train_test_split(parts_labeled, random_state=42)

# get X and Y values
X_train, X_test = [s[['corr_scaled', 'mass_scaled']].values for s in (train_set, test_set)]
y_train, y_test = [s['manual_label'].values for s in (train_set, test_set)]

#clf_scaler_path = '../output/pipeline/GPClassification/GPCclfRBF.p'
#with open(clf_scaler_path, 'rb') as f:
#    clf = pickle.load(f)
#    scaler = pickle.load(f)

# train a gaussian process classifier with RBF kernel (Default)
clf = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)
plot2dDecisionFunc(clf, X_train, y_train, save=save_dir+'prob_surfaceGPC.pdf')
clf.score(X_test, y_test)  # NOTE: result discarded; kept for interactive inspection
# positive-class probability for every test sample
labels_pred = clf.predict_proba(X_test)[:, 1]

# compute f1 score: harmonic mean between precision and recall
# see https://en.wikipedia.org/wiki/F1_score
prob_f1 = pd.DataFrame()
# BUG FIX: `prob_thresh` was referenced in the comprehension below without
# ever being bound (only the DataFrame column 'prob_thresh' existed), which
# raised NameError. Bind the threshold grid to a local first, then store it.
prob_thresh = np.linspace(0.1, 1, 90, endpoint=False)
prob_f1['prob_thresh'] = prob_thresh
# element [2] of precision_recall_fscore_support is the per-class F-score
f1score = np.array([metrics.precision_recall_fscore_support(y_test, labels_pred > thresh)[2]
                    for thresh in prob_thresh])
prob_f1['f1score_False'] = f1score[:, 0]
prob_f1['f1score_True'] = f1score[:, 1]
prob_f1.to_csv(save_dir+'prob_f1score.csv', index=False)

# plot F1 vs. threshold for both classes
fig, ax = plt.subplots()
ax.plot(prob_f1.prob_thresh, prob_f1.f1score_False, color='r')
ax.plot(prob_f1.prob_thresh, prob_f1.f1score_True, color='b')
#initialize the taret classifier and train it # clf = neighbors.KNeighborsClassifier(n_neighbors=3) #clf=SVC() clf = GaussianProcessClassifier(1.0 * RBF(1.0)) #clf = DecisionTreeClassifier(max_depth=5) #clf=MLPClassifier(alpha=1) clf.fit(X_train, y_train) #Store the predicted values y_pred = clf.predict(X_test) #Calculate global accuracy accuracy = accuracy_score(y_test, y_pred) #accuracy = clf.score(X_test, y_test) accuracy = clf.score(X_test, y_test) minority_y_test_index = [] minority_y_test_index1 = np.where(y_test == 1) total_indexes = np.where(y_test >= 0) minority_y_test_index1_list1 = minority_y_test_index1[0].tolist() minority_y_test_index = minority_y_test_index1_list1 y_pred_minority = [] y_test_minority = [] majority_test_index = total_indexes for item in minority_y_test_index: y_test_minority.append(y_test[item])