# Hyperparameter search with cross-validation on the training split.
n_splits = 3
# NOTE: the ShuffleSplit splitter is rebound to KFold on the very next
# statement, so KFold is the splitter GridSearchCV actually receives.
cv = ShuffleSplit(n_splits=n_splits, test_size=0.3)
cv = KFold(n_splits=n_splits, shuffle=True)
gscv = GridSearchCV(model, param_grid, cv=cv)
gscv.fit(X_train, y_train)
print_gscv_score_rgr(gscv, X_train, X_test, y_train, y_test, cv)

#%%
# Prediction on the held-out test samples
y_pred = gscv.predict(X_test)

# Applicability Domain (inside: +1, outside: -1)
# iappd selects the detector: 1 -> ad_knn, any other value -> ad_ocsvm.
iappd = 1
y_appd = (
    ad_knn(X_train, X_test) if iappd == 1 else ad_ocsvm(X_train, X_test)
)

# One row per test sample: formula, first feature of X_test (column 'P'),
# prediction (column 'Tc') and the AD flag.  int() drops the fractional part.
# Access stays index-based so that row i of every input is paired exactly
# as before.
data = [
    (f_test[i], int(X_test[i][0]), int(y_pred[i]), y_appd[i])
    for i in range(len(X_test))
]
properties = ['formula', 'P', 'Tc', 'AD']
df = pd.DataFrame(data, columns=properties).sort_values(
    'Tc', ascending=False
)

# Only the samples flagged as inside the applicability domain are saved.
# (Saving every row instead would be: df.to_csv(output, index=False))
df_in_ = df.loc[df['AD'] == 1]
df_in_.to_csv(output, index=False)
# Not re-optimize: fitting gscv.best_estimator_ directly reuses the tuned
# hyperparameters, whichever data it is fitted on.
# NOTE: every fit() call below refits best_estimator_ in place.
print(gscv.best_estimator_.fit(X_train, y_train).get_params())
print(gscv.best_estimator_.fit(X_test, y_test).get_params())
print(gscv.best_estimator_.fit(X, y).get_params())
# Printed output (same hyperparameters each time):
# {... 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
# {... 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
# {... 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}

# Re-learning with all data & best parameters -> Prediction
y_pred = gscv.best_estimator_.fit(X, y).predict(X_pred)

#%%
# Applicability Domain (inside: +1, outside: -1)
# The predicting model was refitted on all of X, so the domain is measured
# against X (not only X_train); otherwise the AD flag would describe a
# different training set than the one behind y_pred.
# iappd selects the detector: 1 -> ad_knn, any other value -> ad_ocsvm.
iappd = 1
if iappd == 1:
    y_appd = ad_knn(X, X_pred)
else:
    y_appd = ad_ocsvm(X, X_pred)

# One row per candidate: formula, P, prediction (column 'Tc') and AD flag.
# int() drops the fractional part of P and of the prediction.
data = [
    (f_pred[i], int(P_pred[i]), int(y_pred[i]), y_appd[i])
    for i in range(len(X_pred))
]

properties = ['formula', 'P', 'Tc', 'AD']
df = pd.DataFrame(data, columns=properties)
df.sort_values('Tc', ascending=False, inplace=True)

# Only candidates inside the applicability domain are written out.
# (Saving every row instead would be: df.to_csv(output, index=False))
df_in_ = df[df.AD == 1]
df_in_.to_csv(output, index=False)
# Alternative ways of obtaining y_pred (disabled):
#   y_pred = gscv.predict(X_pred)
#   best = gscv.best_estimator_.fit(X, y)
#   y_pred = best.predict(X_pred)
y_pred = best.predict(X_pred)

#%%
# Applicability Domain (inside: +1, outside: -1)
from my_library import ad_knn_list

# iappd selects the detector:
#   1 -> ad_knn, 2 -> ad_ocsvm, anything else -> ad_knn_list (third arg 10).
iappd = 3
if iappd == 1:
    y_appd = ad_knn(X, X_pred)
elif iappd == 2:
    y_appd = ad_ocsvm(X, X_pred)
else:
    y_appd = ad_knn_list(X, X_pred, 10)

# One row per candidate: formula, P, model prediction and database value,
# the three numeric fields passed through int().
# y_appd is computed above but intentionally left out of the table; to
# include it, append y_appd[i] to each row and 'AD' to the column names.
data = [
    (f_pred[i], int(P_pred[i]), int(y_pred[i]), int(y_pred_db[i]))
    for i in range(len(X_pred))
]

properties = ['formula', 'P', 'Tc(pred)', 'Tc(DB)']
df = pd.DataFrame(data, columns=properties)
# Sorting is disabled (note this table has no 'Tc' column):
# df.sort_values('Tc', ascending=False, inplace=True)
#%%
# Novelty detection by One Class SVM with optimized hyperparameter
from my_library import optimize_gamma

# gamma for the one-class SVM comes from optimize_gamma over the candidates
# 2**-20 ... 2**0.  (The grid-search gamma was previously read into optgamma
# first, but it was overwritten before any use, so that read is dropped.)
range_g = 2**np.arange(-20, 1, dtype=float)
optgamma = optimize_gamma(X_train, range_g)

# The kernel is taken from the grid-search result; nu is fixed at 0.003.
clf = OneClassSVM(nu=0.003, kernel=gscv.best_params_['kernel'],
                  gamma=optgamma)
clf.fit(X_train)

y_pred = gscv.predict(X_test)    # prediction

from my_library import ad_knn

# Applicability Domain (inside: +1, outside: -1)
ad_svm = clf.predict(X_test)     # outliers = -1
# Keep the result under its own name.  Rebinding `ad_knn` to the returned
# array would replace the imported function, and any other cell that calls
# ad_knn(...) afterwards would fail with "object is not callable".
ad_knn_labels = ad_knn(X_train, X_test)

# Columns: A = y_pred, B = y_test, C = kNN AD flag, D = one-class SVM AD
# flag, E and F = the columns of X_test (six labels in total, so X_test is
# expected to have two features -- assumes the first four inputs are 1-D).
results = np.c_[y_pred, y_test, ad_knn_labels, ad_svm, X_test]
df = pd.DataFrame(results, columns=list('ABCDEF'))
df_knn = df[df.C == -1]    # samples outside the kNN domain
df_svm = df[df.D == -1]    # samples outside the one-class SVM domain

# Samples on which the two AD methods disagree.
print('AD svm =/= AD knn')
print(df[df.C != df.D])

# Regular mesh over the X_test range, padded by 0.2 on each side.
h = .02  # step size in the mesh
x_min, x_max = X_test[:, 0].min() - .2, X_test[:, 0].max() + .2
y_min, y_max = X_test[:, 1].min() - .2, X_test[:, 1].max() + .2
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Evaluate the grid-searched model's decision function on every mesh point
# and restore the mesh shape.
# NOTE(review): this uses gscv, not the one-class SVM clf -- confirm that
# the tuned estimator exposes decision_function and that this is intended.
Z = gscv.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)