def bench_classifiers(name):
    classifiers = [
        ada_boost(name + '.ada_boost'),  # boo
        gaussian_nb(name + '.gaussian_nb'),  # eey
        knn(name + '.knn', sparse_data=True),  # eey
        linear_discriminant_analysis(name + '.linear_discriminant_analysis',
                                     n_components=1),  # eey
        random_forest(name + '.random_forest'),  # boo
        sgd(name + '.sgd')  # eey
    ]
    if xgboost:
        classifiers.append(xgboost_classification(name + '.xgboost'))  # boo
    return hp.choice('%s' % name, classifiers)
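# Usage sketch (an addition, not from the original): the hp.choice space that
# bench_classifiers() returns can be handed straight to HyperoptEstimator,
# which then searches over all of the listed classifiers. X_train / y_train
# are placeholders here.
estim = HyperoptEstimator(classifier=bench_classifiers('bench'),
                          algo=tpe.suggest,
                          max_evals=50,
                          trial_timeout=60)
# estim.fit(X_train, y_train)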
def main():
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')
    X_train, y_train = df_train.iloc[:, 2:].values, df_train.iloc[:, 0].values
    X_test, y_test = df_test.iloc[:, 2:].values, df_test.iloc[:, 0].values

    estim = HyperoptEstimator(classifier=xgboost_classification('myXG'),
                              algo=tpe.suggest,
                              max_evals=100,
                              trial_timeout=120,
                              verbose=True)
    estim.fit(X_train, y_train)
    print("\n\n{}\n\n".format(estim.score(X_test, y_test)))
    print("\n\n{}\n\n".format(estim.best_model()))
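# Follow-up sketch (an addition, not from the original): best_model() returns
# a dict, and the tuned classifier itself sits under its 'learner' key -- the
# same key the makeXGB() snippet below indexes.
def inspect_best(estim):
    best = estim.best_model()      # dict describing the winning pipeline
    learner = best['learner']      # the fitted sklearn-style classifier
    print(type(learner).__name__)
    print(learner.get_params())    # the hyperparameters the search chose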
def makeXGB(myaeid):
    try:
        # redirect print output into a per-aeid results file
        os.makedirs("/home/rlougee/Desktop/xgb_results/" + str(myaeid))
        sys.stdout = open(
            '/home/rlougee/Desktop/xgb_results/{}/{}_output.txt'.format(
                myaeid, myaeid), 'w')

        # prep data
        df = mc5_table[mc5_table['aeid'] == myaeid]
        df = df[['dsstox_compound_id', 'hitc']]
        df = handle_duplicates(df, 3)
        print("duplicates passed")
        df = fillfp(df, 1445)
        print("fillfp passed")

        # get file name for outputs
        name = myaeid
        print(name)
        # print(df)

        # declare variables
        y = np.array(df['hitc'])
        print(y)
        df = df.drop(['hitc', 'dsstox_compound_id'], axis=1)
        X = np.array(df.values)
        print(X.shape)
        # print(X)
        # X = X[:, ~np.all(np.isnan(X), axis=0)]

        # make test and train data
        X_, X_test, y_, y_test = train_test_split(X, y, test_size=0.2,
                                                  stratify=y)
        print('##########################')

        # calculate base parameters
        spw = len(y[y == 0]) / len(y[y == 1])
        max_delta_step = 0  # hp.quniform('max_delta_step', 0, 1, .1)
        min_child_weight = hp.quniform('min_child', 50, 60, .1)
        # subsample = hp.quniform('subsample', 0, 1, 0.1)
        colsample_bytree = 1  # hp.quniform('colsample_bytree', 0.1, 1, .1)
        # max_delta_step = hp.quniform('max_delta_step', 0, 10, 1)
        gamma = hp.quniform('gamma', 0.5, 1, .01)
        learning_rate = hp.quniform('learning_rate', .01, .2, .001)
        n_estimators = sample(
            scope.int(hp.quniform('n_estimators', 3000, 4000, 100)))
        max_depth = 100
        n_jobs = 30

        model = HyperoptEstimator(classifier=xgboost_classification(
            'my_clf',
            min_child_weight=min_child_weight,
            colsample_bytree=colsample_bytree,
            max_delta_step=max_delta_step,
            gamma=gamma,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_depth=max_depth,
            scale_pos_weight=spw,
            reg_alpha=0,
            reg_lambda=1,
            colsample_bylevel=1,
            subsample=1))
        model.fit(X_, y_)

        # turn the repr() of the winning XGBClassifier into a JSON dict of
        # its hyperparameters
        a = str(model.best_model()['learner']) \
            .replace('=', '":').replace('XGBClassifier(', '{"') \
            .replace(', ', ', "').replace(',\n ', ', "').replace(')', '}') \
            .replace("'", '"').replace(' "missing":nan,', '') \
            .replace(' "nthread":None,', '').replace(' "silent":True,', '') \
            .replace('"base_score":0.5, ', '')
        print(a)
        params_final = json.loads(a)

        # refit a plain XGBClassifier with the tuned parameters
        final_model = xgb.XGBClassifier(**params_final)
        final_model.fit(X_, y_, verbose=False)

        # Performance train
        train_y_pred = final_model.predict(X_)
        auc = roc_auc_score(y_, train_y_pred)
        print("Performance train : ", auc)

        # Performance test
        test_y_pred = final_model.predict(X_test)
        auc = roc_auc_score(y_test, test_y_pred)
        print("Performance test : ", auc)

        # confusion-matrix statistics on the train set
        tn, fp, fn, tp = confusion_matrix(y_, final_model.predict(X_)).ravel()
        err_rate = (fp + fn) / (tp + tn + fn + fp)
        print("Error rate on train set : ", err_rate)
        acc_ = (tp + tn) / (tp + tn + fn + fp)
        print("Accuracy on train set : ", acc_)
        sens_ = tp / (tp + fn)
        print("Sensitivity : ", sens_)
        sp_ = tn / (tn + fp)
        print("Specificity : ", sp_)
        FPR = fp / (tn + fp)
        print("False positive rate : ", FPR)

        # confusion-matrix statistics on the test set
        tn, fp, fn, tp = confusion_matrix(y_test,
                                          final_model.predict(X_test)).ravel()
        err_rate = (fp + fn) / (tp + tn + fn + fp)
        print("Error rate on test set : ", err_rate)
        acc_ = (tp + tn) / (tp + tn + fn + fp)
        print("Accuracy on test set : ", acc_)

        # print(list(zip(X.columns[2:], final_model.feature_importances_)))
        # xgb.plot_tree(final_model, rankdir='LR')
        # plt.savefig("/home/rlougee/Desktop/xgb_results/{}/{}_treeplot".format(name, name), dpi=1200)
        # xgb.plot_importance(final_model, importance_type="weight")  # importance type (weight, gain, cover)
        # plt.tight_layout()
        # plt.savefig("/home/rlougee/Desktop/xgb_results/{}/{}_featureimportance".format(name, name), dpi=1200)

        # save the fitted model
        pickle.dump(
            final_model,
            open("/home/rlougee/Desktop/xgb_results/{}/{}_model".format(
                name, name), 'wb'))
        # loaded_model = pickle.load(open(file_name, 'rb'))
    except Exception:
        print('FAILURE: {}'.format(myaeid))
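# Alternative sketch (an addition, not from the original): the learner that
# best_model() returns is a fitted XGBClassifier, so the string-replace/JSON
# round-trip above can be avoided with the standard sklearn get_params() API.
def tuned_params(model):
    learner = model.best_model()['learner']  # fitted XGBClassifier
    params = learner.get_params()
    # drop the same entries the original strips out of the JSON string
    for key in ('missing', 'nthread', 'silent', 'base_score'):
        params.pop(key, None)
    return params

# usage: final_model = xgb.XGBClassifier(**tuned_params(model))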
n_job = 6
select_classes = [0, 1, 2, 3, 4, 5]
val_dist = X_val_mini.shape[0] / X_train_mini.shape[0]
name = 'my_est_oVa'
tic_mod_all = time.time()
select_alg = [
    ada_boost(name + '.ada_boost'),
    gaussian_nb(name + '.gaussian_nb'),
    knn(name + '.knn', sparse_data=True),
    linear_discriminant_analysis(name + '.linear_discriminant_analysis',
                                 n_components=1),
    random_forest(name + '.random_forest'),
    sgd(name + '.sgd'),
    xgboost_classification(name + '.xgboost')
]

# fitting models
estim_one_vs_rest = dict()
# scoring models
algo_scoring = dict()
save_score_path = r'C:/Users/anden/PycharmProjects/NovelEEG/results'

for alg in [select_alg[args.index]]:
    tic_mod = time.time()
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
          "running on %s" % (alg.name + '.one_V_all'),
          "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    clf_method = one_vs_rest(str(alg.name + '.one_V_all'),
                             estimator=alg, n_jobs=1)
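    # Hypothetical continuation (an addition, not in the original snippet):
    # wrap the one-vs-rest search space in a HyperoptEstimator and record the
    # score. y_train_mini / y_val_mini are assumed counterparts of the
    # X_*_mini arrays referenced above.
    estim = HyperoptEstimator(classifier=clf_method,
                              algo=tpe.suggest,
                              max_evals=50,
                              trial_timeout=120)
    estim.fit(X_train_mini, y_train_mini)
    estim_one_vs_rest[alg.name] = estim
    algo_scoring[alg.name] = estim.score(X_val_mini, y_val_mini)
    print("fit+score time for %s: %.1fs" % (alg.name, time.time() - tic_mod))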
y_test = Y[test]
y_train = Y[train]

if num_feats != 0:
    sk_obj = SelectKBest(f_classif, k=num_feats)
    x_train = sk_obj.fit_transform(x_train, y_train)
    x_test = sk_obj.transform(x_test)

if model_type == 'XGB':
    if num_classes == 2:
        objective = 'binary:logistic'
    else:
        objective = 'multi:softmax'
    if hyper_param:
        model = HyperoptEstimator(classifier=xgboost_classification('xbc'),
                                  preprocessing=[],
                                  algo=tpe.suggest,
                                  trial_timeout=200)
    else:
        model = XGBClassifier(learning_rate=1,
                              n_estimators=10,
                              objective=objective,
                              silent=True,
                              nthread=num_threads)
    model.fit(x_train, y_train)
elif model_type == 'SVM':
    from sklearn import svm
    if hyper_param:
        model = HyperoptEstimator(classifier=svc("mySVC"),
                                  preprocessing=[],
print('##########################')

# calculate base parameters
spw = len(y[y == 0]) / len(y[y == 1])
max_delta_step = 0  # hp.quniform('max_delta_step', 0, 1, .1)
min_child_weight = hp.quniform('min_child', 50, 60, .1)
# subsample = hp.quniform('subsample', 0, 1, 0.1)
colsample_bytree = 1  # hp.quniform('colsample_bytree', 0.1, 1, .1)
# max_delta_step = hp.quniform('max_delta_step', 0, 10, 1)
gamma = hp.quniform('gamma', 0.5, 1, .01)
learning_rate = hp.quniform('learning_rate', .01, .2, .001)
n_estimators = sample(scope.int(hp.quniform('n_estimators', 3000, 4000, 100)))
max_depth = 100
n_jobs = 30

model = HyperoptEstimator(classifier=xgboost_classification(
    'my_clf',
    min_child_weight=min_child_weight,
    colsample_bytree=colsample_bytree,
    max_delta_step=max_delta_step,
    gamma=gamma,
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    max_depth=max_depth,
    scale_pos_weight=spw,
    reg_alpha=0,
    reg_lambda=1,
    colsample_bylevel=1,
    subsample=1))
model.fit(X_, y_)

# turn the repr() of the winning XGBClassifier into a JSON dict of its
# hyperparameters
a = str(model.best_model()['learner']) \
    .replace('=', '":').replace('XGBClassifier(', '{"') \
    .replace(', ', ', "').replace(',\n ', ', "').replace(')', '}') \
    .replace("'", '"').replace(' "missing":nan,', '') \
    .replace(' "nthread":None,', '').replace(' "silent":True,', '') \
    .replace('"base_score":0.5, ', '')
print(a)

import json
params_final = json.loads(a)

# print('##########################')
#
# cv1_params = {'min_child_weight': [3, 4, 5],
#               'subsample': [0, 0.5, 0.6, 1],
#               'colsample_bytree': [.6, .7, .8, .9],
#               'max_delta_step': [0],
#               'gamma': [x / 10.0 for x in range(0, 5)]}  # params to be tried in the grid search
# fix_params = {'max_depth': 14, 'n_estimators': 100,
#               'objective': 'binary:logistic', 'learning_rate': 0.4,
#               'scale_pos_weight': spw}
# csv1 = GridSearchCV(xgb.XGBClassifier(**fix_params), cv1_params,
#                     scoring='roc_auc', cv=5, n_jobs=30)
# csv1.fit(X_, y_)
# csv1.grid_scores_
# print(csv1.cv_results_)
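# Runnable version of the grid-search refinement commented out above (an
# addition; grid_scores_ is replaced by cv_results_ in current sklearn, and
# subsample=0 is dropped since XGBoost requires subsample in (0, 1]).
from sklearn.model_selection import GridSearchCV

cv1_params = {'min_child_weight': [3, 4, 5],
              'subsample': [0.5, 0.6, 1],
              'colsample_bytree': [.6, .7, .8, .9],
              'max_delta_step': [0],
              'gamma': [x / 10.0 for x in range(0, 5)]}
fix_params = {'max_depth': 14, 'n_estimators': 100,
              'objective': 'binary:logistic',
              'learning_rate': 0.4, 'scale_pos_weight': spw}
csv1 = GridSearchCV(xgb.XGBClassifier(**fix_params), cv1_params,
                    scoring='roc_auc', cv=5, n_jobs=30)
csv1.fit(X_, y_)
print(csv1.cv_results_)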