# Load the training table: the first CSV column becomes the index and the
# first row supplies the column names.
data = pd.read_csv(filepath_or_buffer='tc_train.csv', index_col=0, header=0, sep=',')
# Remove the 'formula' column in place before summarising.
data.drop(['formula'], axis=1, inplace=True)
print(data.describe())

#%%
# Set the parameters by cross-validation
n_splits = 3
# Shuffled K-fold is the splitter handed to the grid search.
# NOTE: ShuffleSplit(n_splits=n_splits, test_size=0.3) is a drop-in
# alternative; assigning it to `cv` first and then overwriting it had no effect.
cv = KFold(n_splits=n_splits, shuffle=True)

gscv = GridSearchCV(model, param_grid, cv=cv)
gscv.fit(X_train, y_train)
print_gscv_score_rgr(gscv, X_train, X_test, y_train, y_test, cv)

#%%
# Prediction
y_pred = gscv.predict(X_test)

# Applicability Domain (inside: +1, outside: -1)
# iappd selects the method: 1 -> ad_knn, any other value -> ad_ocsvm.
iappd = 1
y_appd = ad_knn(X_train, X_test) if iappd == 1 else ad_ocsvm(X_train, X_test)

# NOTE: `data` is rebound here from the DataFrame loaded above to a plain list.
data = []
for i in range(len(X_test)):
    # One record per test sample: (f_test entry, first feature truncated to
    # int, prediction truncated to int, applicability-domain flag).
    # NOTE(review): only `temp` is built in the visible code; presumably it is
    # appended to `data` by statements outside this view - confirm.
    temp = (f_test[i], int(X_test[i][0]), int(y_pred[i]), y_appd[i])
# Pairwise scatter plots of the columns in `data`.
pd.plotting.scatter_matrix(data)
plt.show()

#%%
# Set the parameters by cross-validation
n_splits = 3
# icv selects the splitter: 1 -> shuffled K-fold, any other value -> ShuffleSplit.
icv = 1
cv = (
    KFold(n_splits=n_splits, shuffle=True)
    if icv == 1
    else ShuffleSplit(n_splits=n_splits)
)

gscv = GridSearchCV(model, param_grid, cv=cv)
gscv.fit(X_train, y_train)
# Report scores on the train/test split, then on the full set vs. X_pred.
print_gscv_score_rgr(gscv, X_train, X_test, y_train, y_test, cv)
print_gscv_score_rgr(gscv, X, X_pred, y, y_pred_db, cv)

# `best` is the same object as gscv.best_estimator_ (no copy is made), so the
# fit below refits that estimator in place on X, y.
best = gscv.best_estimator_
best.fit(X, y)
# Feature importances are read and printed only for the 'RF' and 'GB' keys.
if key in ('RF', 'GB'):
    f_impo = best.feature_importances_
    print(f_impo)

#%%
# Prediction test
# Disabled by default; set ltest to True to predict on X_pred.
ltest = False
if ltest:
    y_pred = gscv.predict(X_pred)