n_splits = 3
# cv = ShuffleSplit(n_splits=n_splits, test_size=0.3)  # alternative splitter (unused)
cv = KFold(n_splits=n_splits, shuffle=True)
gscv = GridSearchCV(model, param_grid, cv=cv)
gscv.fit(X_train, y_train)
print_gscv_score_rgr(gscv, X_train, X_test, y_train, y_test, cv)
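
# The helper print_gscv_score_rgr is imported from my_library elsewhere in this script
# and its implementation is not shown. The function below is only a minimal sketch of
# what such a regression reporter might do (best parameters, CV score, train/test R^2
# and RMSE); the real helper may print different quantities.
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def print_gscv_score_rgr_sketch(gscv, X_train, X_test, y_train, y_test, cv):
    print('Best parameters :', gscv.best_params_)
    print('CV score (best) :', gscv.best_score_)
    for label, X_, y_ in (('train', X_train, y_train), ('test', X_test, y_test)):
        y_hat = gscv.predict(X_)
        rmse = np.sqrt(mean_squared_error(y_, y_hat))
        print('{}: R^2 = {:.3f}, RMSE = {:.3f}'.format(label, r2_score(y_, y_hat), rmse))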

#%%
# Prediction
y_pred = gscv.predict(X_test)

# Applicability Domain (inside: +1, outside: -1)
iappd = 1
if (iappd == 1):
    y_appd = ad_knn(X_train, X_test)
else:
    y_appd = ad_ocsvm(X_train, X_test)
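
# ad_knn and ad_ocsvm are imported from my_library and their implementations are not
# shown in this script. The functions below are only minimal sketches of the usual
# k-NN and one-class-SVM applicability-domain checks (+1 inside, -1 outside); the
# defaults used here (k=5, 95th-percentile cutoff, nu=0.05) are assumptions, not the
# library's actual settings.
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import OneClassSVM

def ad_knn_sketch(X_train, X_test, k=5, quantile=0.95):
    # Inside the AD if the mean distance to the k nearest training samples is below
    # the `quantile` of the training set's own k-NN distances.
    nn = NearestNeighbors(n_neighbors=k + 1).fit(X_train)
    d_train, _ = nn.kneighbors(X_train)                  # column 0 is the point itself
    threshold = np.quantile(d_train[:, 1:].mean(axis=1), quantile)
    d_test, _ = nn.kneighbors(X_test, n_neighbors=k)
    return np.where(d_test.mean(axis=1) <= threshold, 1, -1)

def ad_ocsvm_sketch(X_train, X_test, nu=0.05, gamma='scale'):
    # One-class SVM trained on the training descriptors; predict() returns +1 / -1.
    return OneClassSVM(nu=nu, gamma=gamma).fit(X_train).predict(X_test)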

# Collect (formula, P, predicted Tc, AD flag) rows for the test set
data = []
for i in range(len(X_test)):
    temp = (f_test[i], int(X_test[i][0]), int(y_pred[i]), y_appd[i])
    data.append(temp)

properties = ['formula', 'P', 'Tc', 'AD']
df = pd.DataFrame(data, columns=properties)
df.sort_values('Tc', ascending=False, inplace=True)

# df.to_csv(output, index=False)
df_in_ = df[df.AD == 1]
df_in_.to_csv(output, index=False)
# Not re-optimized: refitting best_estimator_ on different data keeps the
# hyperparameters selected by the grid search (only the model itself is refit).
print(gscv.best_estimator_.fit(X_train, y_train).get_params())
print(gscv.best_estimator_.fit(X_test, y_test).get_params())
print(gscv.best_estimator_.fit(X, y).get_params())
#    {... 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
#    {... 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
#    {... 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}

# Re-learning with all data & best parameters -> Prediction
y_pred = gscv.best_estimator_.fit(X, y).predict(X_pred)

#%%
# Applicability Domain (inside: +1, outside: -1)
iappd = 1
if (iappd == 1):
    y_appd = ad_knn(X_train, X_pred)
else:
    y_appd = ad_ocsvm(X_train, X_pred)

data = []
for i in range(len(X_pred)):
    temp = (f_pred[i], int(P_pred[i]), int(y_pred[i]), y_appd[i])
    data.append(temp)

properties = ['formula', 'P', 'Tc', 'AD']
df = pd.DataFrame(data, columns=properties)
df.sort_values('Tc', ascending=False, inplace=True)

# df.to_csv(output, index=False)
df_in_ = df[df.AD == 1]
df_in_.to_csv(output, index=False)
#%%
# Example 3
#    y_pred = gscv.predict(X_pred)

#    best = gscv.best_estimator_.fit(X, y)
#    y_pred = best.predict(X_pred)

best = gscv.best_estimator_.fit(X, y)  # refit the best estimator on all data
y_pred = best.predict(X_pred)

#%%
# Applicability Domain (inside: +1, outside: -1)

from my_library import ad_knn_list

iappd = 3
if(iappd == 1):
    y_appd = ad_knn(X, X_pred)
elif(iappd == 2):
    y_appd = ad_ocsvm(X, X_pred)
else:
    y_appd = ad_knn_list(X, X_pred, 10)

data = []
for i in range(len(X_pred)):
#    temp = (f_pred[i], int(P_pred[i]), int(y_pred[i]), int(y_pred_db[i]), y_appd[i])
    temp = (f_pred[i], int(P_pred[i]), int(y_pred[i]), int(y_pred_db[i]))
    data.append(temp)

# properties = ['formula', 'P', 'Tc(pred)', 'Tc(DB)', 'AD']
properties = ['formula', 'P', 'Tc(pred)', 'Tc(DB)']
df = pd.DataFrame(data, columns=properties)
# df.sort_values('Tc', ascending=False, inplace=True)
#%%

# Novelty detection by One-Class SVM with an optimized gamma hyperparameter
from sklearn.svm import OneClassSVM
from my_library import optimize_gamma

# optgamma = gscv.best_params_['gamma']  # alternative: reuse the CV-selected gamma
range_g = 2**np.arange(-20, 1, dtype=float)
optgamma = optimize_gamma(X_train, range_g)
clf = OneClassSVM(nu=0.003, kernel=gscv.best_params_['kernel'], gamma=optgamma)
clf.fit(X_train)
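
# optimize_gamma comes from my_library and its implementation is not shown here. A
# common heuristic for choosing the RBF gamma of a one-class SVM is to pick the
# candidate whose Gram matrix has the largest variance of its entries; the sketch
# below assumes that is what the helper does, which may differ from the actual code.
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

def optimize_gamma_sketch(X, gammas):
    # Return the gamma maximizing the variance of the RBF Gram-matrix entries.
    variances = [rbf_kernel(X, gamma=g).var() for g in gammas]
    return gammas[int(np.argmax(variances))]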

y_pred = gscv.predict(X_test)  # prediction

from my_library import ad_knn
# Applicability Domain (inside: +1, outside: -1)
ad_svm_flags = clf.predict(X_test)        # One-Class SVM flags (outliers = -1)
ad_knn_flags = ad_knn(X_train, X_test)    # avoid rebinding the ad_knn function name

# 6 columns: y_pred, y_test, kNN-AD, SVM-AD, and the two test descriptors
results = np.c_[y_pred, y_test, ad_knn_flags, ad_svm_flags, X_test]

df = pd.DataFrame(results, columns=list('ABCDEF'))
df_knn = df[df.C == -1]
df_svm = df[df.D == -1]
print('Samples where the SVM and k-NN AD flags disagree:')
print(df[df.C != df.D])

h = .02  # step size in the mesh
# 2-D mesh over the two test descriptors (assumes X_test has exactly two columns)
x_min, x_max = X_test[:, 0].min() - .2, X_test[:, 0].max() + .2
y_min, y_max = X_test[:, 1].min() - .2, X_test[:, 1].max() + .2
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = gscv.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
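
# The original snippet stops after reshaping Z. A typical next step, sketched here as
# an assumption rather than part of the original code, is to draw the decision surface
# with matplotlib and overlay the test points, colored by their k-NN AD flag.
import matplotlib.pyplot as plt

plt.contourf(xx, yy, Z, levels=20, cmap=plt.cm.coolwarm, alpha=0.6)
plt.contour(xx, yy, Z, levels=[0], colors='k', linewidths=1)   # decision boundary
plt.scatter(X_test[:, 0], X_test[:, 1], c=ad_knn_flags, cmap=plt.cm.bwr, edgecolors='k')
plt.xlabel('descriptor 1')
plt.ylabel('descriptor 2')
plt.title('Decision function over the two test descriptors')
plt.show()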