def objective_xgb(space):
    """Hyperopt objective: mean 5-fold stratified CV log-loss for an XGBoost classifier."""
    numfolds = 5
    total = 0
    kf = StratifiedKFold(n_splits=numfolds, shuffle=True, random_state=666)
    clf = xgb.XGBClassifier(n_estimators=100,
                            max_depth=space['max_depth'],
                            learning_rate=space['learning_rate'],
                            min_child_weight=space['min_child_weight'],
                            subsample=space['subsample'],
                            colsample_bytree=space['colsample_bytree'])
    for train_index, test_index in kf.split(X_train_pred, Y_train_new.is_duplicate):
        xtrain, xtest = X_train_pred.iloc[train_index], X_train_pred.iloc[test_index]
        ytrain, ytest = Y_train_new.iloc[train_index], Y_train_new.iloc[test_index]
        eval_set = [(xtrain, ytrain), (xtest, ytest)]
        clf.fit(xtrain, ytrain.values.ravel(), eval_metric="logloss",
                eval_set=eval_set, early_stopping_rounds=50)
        pred = clf.predict_proba(xtest)[:, 1]
        logloss = log_loss(ytest, pred)
        print("SCORE:", logloss)
        total += logloss
    total = total / numfolds
    print(total)
    return {'loss': total, 'status': STATUS_OK}
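# A minimal sketch of how this objective could be driven with hyperopt's fmin.
# The search-space bounds below are illustrative assumptions, not values from
# the original tuning run.
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

space_xgb = {
    'max_depth': hp.choice('max_depth', range(3, 11)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'min_child_weight': hp.choice('min_child_weight', range(1, 10)),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
}

trials = Trials()
best = fmin(fn=objective_xgb, space=space_xgb, algo=tpe.suggest,
            max_evals=50, trials=trials)
print(best)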
def cross_validation(data, label, para_c, para_o):
    """Stratified k-fold CV of a Gaussian NB classifier, optionally rebalancing
    each training fold with SMOTE, a VAE, or random-walk oversampling."""
    kfold = para_c['kfold']
    neg = 0
    pos = 1
    gF1 = []
    ggmean = []
    gauc = []
    path = 'collection.xls'
    from vae4 import mnist_vae
    from sklearn.model_selection import StratifiedKFold
    from sklearn.naive_bayes import GaussianNB
    skf = StratifiedKFold(n_splits=kfold)
    for train_index, test_index in skf.split(data, label):
        train = data[train_index]
        test = data[test_index]
        train, test = standard_scale(train, test)
        train_label = label[train_index]
        test_label = label[test_index]
        negative = train[train_label == neg]
        positive = train[train_label == pos]
        gnb = GaussianNB()
        if para_c['over_sampling'] == 'SMOTE':
            s = Smote(positive, N=100)
            gene = s.over_sampling()
        elif para_c['over_sampling'] == 'vae':
            gene_size = positive.shape[0]
            gene = mnist_vae(positive, gene_size, para_o)
        elif para_c['over_sampling'] == 'random_walk':
            gene_size = positive.shape[0]
            gene = random_walk(positive, gene_size)
        else:
            gene = []
        train, train_label = app(positive, negative, gene)
        y_predne = gnb.fit(train, train_label).predict(test)
        temf, temg, tema = compute(test_label, y_predne)
        print('F1', temf, 'AUC', tema, 'gmean', temg)
        gF1.append(temf)
        ggmean.append(temg)
        gauc.append(tema)
        print('#' * 70)  # fold separator
    # Results could optionally be persisted via write(path, ...) keyed on the
    # sampling configuration.
    print('mean F1:', np.mean(gF1), 'mean AUC:', np.mean(gauc),
          'mean gmean:', np.mean(ggmean))
    return
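# The app() helper used above is not shown in this snippet. A plausible
# minimal implementation (an assumption, not the original) would simply stack
# the real and synthetic samples back into one labelled training set:
import numpy as np

def app(positive, negative, gene):
    """Recombine minority, majority, and synthetic samples into (X, y)."""
    if len(gene) > 0:
        X = np.vstack([positive, gene, negative])
        y = np.hstack([np.ones(len(positive) + len(gene)),
                       np.zeros(len(negative))])
    else:
        X = np.vstack([positive, negative])
        y = np.hstack([np.ones(len(positive)), np.zeros(len(negative))])
    return X, y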
def classifer_stacking(data_file, alertgroup_name, classifier_list):
    """Pick the best stacking ensemble across stratified folds for one alert group,
    then report its performance on the held-out test split."""
    classifiers = {
        'KNN': KNeighborsClassifier(),
        'RF': RandomForestClassifier(),
        'DT': tree.DecisionTreeClassifier(),
        'GBDT': GradientBoostingClassifier(),
    }
    all_data = pd.read_csv(data_file, sep=',', dtype=str)
    for alertgroup, group in all_data.groupby('alertgroup'):
        if alertgroup == alertgroup_name:
            train_x, test_x, train_y, test_y = get_data(group, split=True)
            arr_x = train_x.values
            arr_y = train_y.values
            max_fs = 0
            best_model = None
            # StratifiedKFold takes n_splits (not n_folds), and its split()
            # needs the labels as well as the features.
            stratified_folder = StratifiedKFold(n_splits=3, shuffle=False)
            for train_index, test_index in stratified_folder.split(arr_x, arr_y):
                # Use fold-local names so the final evaluation below still sees
                # the untouched held-out test split.
                fold_train_x, fold_train_y = arr_x[train_index], arr_y[train_index]
                fold_val_x, fold_val_y = arr_x[test_index], arr_y[test_index]
                classifiers_list = [classifiers[cl] for cl in classifier_list]
                stack_model = StackingClassifier(classifiers=classifiers_list,
                                                 use_probas=True,
                                                 average_probas=True,
                                                 meta_classifier=classifiers['RF'])
                stack_model.fit(fold_train_x, fold_train_y)
                predict = stack_model.predict(fold_val_x)
                fbetascore = fbeta_score(fold_val_y, predict, beta=1)
                print(' f1score: ' + str(fbetascore))
                if fbetascore > max_fs:
                    max_fs = fbetascore
                    best_model = stack_model
            predict = best_model.predict(test_x)
            precision = metrics.precision_score(test_y, predict)
            recall = metrics.recall_score(test_y, predict)
            fbetascore = fbeta_score(test_y, predict, beta=0.5)
            accuracy = metrics.accuracy_score(test_y, predict)
            print('final performance:')
            print(alertgroup_name)
            print('precision: %.6f' % (100 * precision))
            print('recall: %.6f' % (100 * recall))
            print('f0.5score: %.6f' % (100 * fbetascore))
            print('accuracy: %.6f%%' % (100 * accuracy))
            return best_model
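# Imports the function above relies on (StackingClassifier comes from mlxtend),
# plus a hypothetical invocation; the file name, group name, and classifier
# keys below are placeholders, not values from the original project.
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import tree, metrics
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from mlxtend.classifier import StackingClassifier

# best = classifer_stacking('alerts.csv', 'db_alertgroup', ['KNN', 'DT', 'GBDT'])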
def objective(space):
    """Hyperopt objective: mean 5-fold stratified CV log-loss for a random forest."""
    numfolds = 5
    total = 0
    kf = StratifiedKFold(n_splits=numfolds, shuffle=True, random_state=13)
    rf = RandomForestClassifier(n_estimators=200,
                                max_depth=space['max_depth'],
                                max_features=space['max_features'],
                                criterion=space['criterion'],
                                min_impurity_split=0.0005,
                                random_state=666,
                                n_jobs=-1)
    for train_index, test_index in kf.split(X_train_new, Y_train_new.is_duplicate):
        xtrain, xtest = X_train_new.iloc[train_index], X_train_new.iloc[test_index]
        ytrain, ytest = Y_train_new.iloc[train_index], Y_train_new.iloc[test_index]
        rf.fit(xtrain, ytrain.values.ravel())
        pred = rf.predict_proba(xtest)[:, 1]
        logloss = log_loss(ytest, pred)
        print("SCORE:", logloss)
        total += logloss
    total = total / numfolds
    print(total)
    return {'loss': total, 'status': STATUS_OK}
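# Companion search space for the random-forest objective; the candidate values
# are illustrative assumptions, mirroring the hyperopt pattern used above.
from hyperopt import fmin, tpe, hp

space_rf = {
    'max_depth': hp.choice('max_depth', range(4, 21)),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
}
# best_rf = fmin(fn=objective, space=space_rf, algo=tpe.suggest, max_evals=50)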
def create_cross_validation(data, positive, N):
    """Convert the full dataset into N stratified cross-validation folds.

    data: a list [X, y], where X is (n_samples, n_features) and y is (n_samples, 1);
    positive: the label value treated as the positive class for this run;
    N: the number of cross-validation folds.
    """
    label = data[1]
    data = data[0]
    # Binarise the labels: the chosen class becomes 1, everything else 0.
    # (Take the mask first so this also works when positive == 0.)
    mask = (label == positive)
    label[~mask] = 0
    label[mask] = 1
    result = {}
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=N)
    i = 0
    for train_index, test_index in skf.split(data, label):
        train = data[train_index]
        test = data[test_index]
        train_label = label[train_index]
        test_label = label[test_index]
        result[str(i)] = [train, train_label, test, test_label]
        i = i + 1
    return result
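# Hypothetical usage (X and y are placeholders for a feature matrix and raw labels):
# folds = create_cross_validation([X, y], positive=1, N=5)
# train, train_label, test, test_label = folds['0']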
# Create a Classifier Service.
# The classifier process starts using a default configuration.
classifier = Classifier.run(Config())

# Prepare arrays to keep true/predicted labels to display a report later.
true_labels = []
predicted_labels = []

# Run stratified K-fold validation.
labels = list(dataset.get_labels())
if sklearn_version < 18:
    # Old scikit-learn (< 0.18) used the (labels, n_folds) constructor.
    train_test_indices = StratifiedKFold(labels, n_folds=10)
else:
    skf = StratifiedKFold(n_splits=10)
    # Only the labels matter for stratification, so they stand in for X too.
    train_test_indices = skf.split(labels, labels)

for train_idx, test_idx in train_test_indices:
    # Clear the classifier (call the `clear` RPC).
    classifier.clear()

    # Split the dataset into train/test datasets.
    (train_ds, test_ds) = (dataset[train_idx], dataset[test_idx])

    # Train the classifier using the train dataset.
    for (idx, label) in classifier.train(train_ds):
        # You can peek at records as they are trained:
        # print('train[{0}]: (label: {1}) => {2}'.format(idx, label, train_ds[idx]))
        pass

    # Test the classifier using the test dataset.
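    # A hedged completion of the test step, following the pattern of jubakit's
    # bundled k-fold example: classify() yields (idx, label, result) tuples,
    # where result is a list of (predicted_label, score) pairs sorted by score.
    for (idx, label, result) in classifier.classify(test_ds):
        true_labels.append(label)
        predicted_labels.append(result[0][0])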
def grid_search(data, label, para_c, para_o):
    """Exhaustive grid search over the VAE oversampler's hyper-parameters,
    scoring each setting with stratified k-fold CV of a Gaussian NB classifier."""
    kfold = para_c['kfold']
    neg = 0
    pos = 1
    gF1, ggmean, gauc = [], [], []
    path = 'collection.xls'
    mF1, maxF1 = 0, {}
    mgmean, maxgmean = 0, {}
    mauc, maxauc = 0, {}
    from vae4 import mnist_vae
    from sklearn.model_selection import StratifiedKFold
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.naive_bayes import GaussianNB
    for hidden_encoder_dim in np.arange(1, data.shape[1], 5):
        para_o['hidden_encoder_dim'] = hidden_encoder_dim
        for hidden_decoder_dim in np.arange(1, data.shape[1], 5):
            para_o['hidden_decoder_dim'] = hidden_decoder_dim
            for epochs in np.arange(20, 50, 10):
                para_o['epochs'] = epochs
                for batch_size in np.arange(1, 20, 3):
                    para_o['batch_size'] = batch_size
                    for learning_rate in np.linspace(0.001, 0.1, 10):
                        para_o['learning_rate'] = learning_rate
                        for lam in np.linspace(0, 0.25 * learning_rate, 4):
                            para_o['lam'] = lam
                            skf = StratifiedKFold(n_splits=kfold)
                            for train_index, test_index in skf.split(data, label):
                                train = data[train_index]
                                test = data[test_index]
                                # The scaler's output must be captured; the
                                # original discarded the transformed arrays.
                                min_max_scaler = MinMaxScaler()
                                train = min_max_scaler.fit_transform(train)
                                test = min_max_scaler.transform(test)
                                train_label = label[train_index]
                                test_label = label[test_index]
                                negative = train[train_label == neg]
                                positive = train[train_label == pos]
                                gnb = GaussianNB()
                                # Generate enough synthetic positives to balance the classes.
                                gene_size = negative.shape[0] - positive.shape[0]
                                gene = mnist_vae(positive, gene_size, para_o)
                                train, train_label = app(positive, negative, gene)
                                y_predne = gnb.fit(train, train_label).predict(test)
                                temf, temg, tema = compute(test_label, y_predne)
                                gF1.append(temf)
                                ggmean.append(temg)
                                gauc.append(tema)
                            # Track the best mean score per metric (the original
                            # stored the raw score lists here, which broke the
                            # later comparisons).
                            if mF1 < np.mean(gF1):
                                mF1 = np.mean(gF1)
                                maxF1 = para_o.copy()
                            if mgmean < np.mean(ggmean):
                                mgmean = np.mean(ggmean)
                                maxgmean = para_o.copy()
                            if mauc < np.mean(gauc):
                                mauc = np.mean(gauc)
                                maxauc = para_o.copy()
                            gF1, ggmean, gauc = [], [], []
                            print('#' * 70)  # setting separator
    print('#' * 70)
    write(path, dict(para_c, **maxF1), {'max F1': mF1})
    write(path, dict(para_c, **maxgmean), {'max gmean': mgmean})
    write(path, dict(para_c, **maxauc), {'max auc': mauc})
    return
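# The six nested loops above can be flattened with itertools.product. A sketch
# of the equivalent iteration (a rewrite of the control flow only; data and
# para_o are assumed to be defined as in grid_search):
from itertools import product

grids = dict(
    hidden_encoder_dim=np.arange(1, data.shape[1], 5),
    hidden_decoder_dim=np.arange(1, data.shape[1], 5),
    epochs=np.arange(20, 50, 10),
    batch_size=np.arange(1, 20, 3),
    learning_rate=np.linspace(0.001, 0.1, 10),
)
for values in product(*grids.values()):
    para_o.update(zip(grids.keys(), values))
    # lam depends on the current learning rate, so it stays an inner loop.
    for lam in np.linspace(0, 0.25 * para_o['learning_rate'], 4):
        para_o['lam'] = lam
        # ... run the k-fold evaluation as above ...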
def BuildModel(this, clip_csv=None):
    """Builds the DNN model used to classify partial clips.

    @TODO: document me
    """
    cfg = this.config
    if not clip_csv:
        clip_csv = cfg.full_clips_csv

    # First: load the clip lists.
    clipFiles = pd.read_csv(clip_csv)

    # Second: load the actual clip data.
    this.log.debug('loading audio data')
    X_train = this._prepare_data(clipFiles)

    # Third: index and binarize the labels.
    this.log.debug('binarizing labels')
    y_train = pd.get_dummies(clipFiles['label'])

    # Now we can actually build the model.
    if cfg.useDummyModel:
        model = this._buildDummyModel()
    else:
        model = this._buildModel()

    # ... and run it.
    clipFiles['label_idx'] = clipFiles['label'].astype('category').cat.codes
    # Support both the old (n_folds) and new (n_splits) StratifiedKFold APIs.
    try:
        skf = StratifiedKFold(clipFiles.label_idx, n_folds=cfg.num_folds)
    except TypeError:
        n_samples = len(clipFiles.label_idx)
        skf = StratifiedKFold(n_splits=cfg.num_folds)
        skf = skf.split(np.zeros(n_samples), clipFiles.label_idx)

    for i, (train_split, val_split) in enumerate(skf):
        X, y = X_train[train_split], y_train.values[train_split]
        X_val, y_val = X_train[val_split], y_train.values[val_split]
        checkpoint = ModelCheckpoint(cfg.models_dir + '/best_%d.h5' % i,
                                     monitor='val_loss', verbose=1,
                                     save_best_only=True)
        early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
        tb = TensorBoard(log_dir=cfg.logs_dir + '/fold_%i' % i, write_graph=True)
        callbacks_list = [checkpoint, early, tb]
        print("#" * 50)
        print("Fold: ", i)
        history = model.fit(X, y, validation_data=(X_val, y_val),
                            callbacks=callbacks_list, batch_size=64,
                            epochs=cfg.max_epochs)

        # Reload this fold's best weights and predict on the full training set.
        model.load_weights(cfg.models_dir + '/best_%d.h5' % i)
        predictions = model.predict(X_train, batch_size=64, verbose=1)

        # Save the column names for the model.
        columns = pd.Series(['name'] + list(y_train))
        columns.to_csv(cfg.column_names_csv, header=False)

        # Save the train predictions.
        np.save(cfg.self_prediction_dir + "/train_predictions_%d.npy" % i, predictions)
        y_predict = pd.DataFrame(predictions, columns=list(y_train))
        y_predict.to_csv(cfg.self_prediction_dir + "/train_predictions_%d.csv" % i,
                         index=True, index_label='idx')
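# Imports the method above relies on, listed for completeness; this assumes a
# standalone Keras install (under tf.keras the callbacks live in
# tensorflow.keras.callbacks instead).
import numpy as np
import pandas as pd
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard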
# Final XGBoost model with the tuned hyper-parameters (earlier values in comments).
xgb_model = xgb.XGBClassifier(n_estimators=300,
                              max_depth=5,          # was 7
                              learning_rate=0.05,   # was 0.168
                              min_child_weight=7,
                              subsample=0.97,
                              colsample_bytree=0.82794)

del X_train_pred['pred_lgbm']
del X_test_pred['pred_lgbm']

pred_test_full = np.zeros(X_test_pred.shape[0])
for train_index, test_index in kf.split(X_train_preds_new, Y_train_new.is_duplicate):
    xtrain, xtest = X_train_preds_new.iloc[train_index], X_train_preds_new.iloc[test_index]
    ytrain, ytest = Y_train_new.iloc[train_index], Y_train_new.iloc[test_index]
    eval_set = [(xtrain, ytrain), (xtest, ytest)]
    xgb_model.fit(xtrain, ytrain.values.ravel(), eval_metric="logloss",
                  eval_set=eval_set, early_stopping_rounds=50)
    # Accumulate this fold's test-set predictions; average over folds afterwards.
    pred_test = xgb_model.predict_proba(X_test_pred)[:, 1]
    pred_test_full += pred_test
    pred = xgb_model.predict_proba(xtest)[:, 1]
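# The accumulated test predictions presumably need averaging over the folds
# before use; a hedged follow-up, assuming kf is the StratifiedKFold above:
pred_test_full /= kf.get_n_splits()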
                            min_impurity_split=0.005356707662170046,
                            random_state=13,
                            warm_start=True,
                            n_jobs=-1)

for train_index, test_index in kf.split(X_train_rf, Y_train_new.is_duplicate):
    xtrain, xtest = X_train_rf.iloc[train_index], X_train_rf.iloc[test_index]
    ytrain, ytest = Y_train_new.iloc[train_index], Y_train_new.iloc[test_index]
    # No eval_set here: random forests have no early stopping to feed it to.
    rf.fit(xtrain, ytrain)
    pred = rf.predict_proba(xtest)[:, 1]
    logloss = log_loss(ytest, pred)
    total += logloss
total = total / numfolds
print(total)
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
tprs_test = []
aucs_test = []
mean_fpr_test = np.linspace(0, 1, 100)
tpot_plot = plt.figure()

epochs = 100
for epoch in range(epochs):
    X = x_train
    Y = y_train.values
    X_test = x_test
    Y_test = y_test.values
    i = 0
    for train, test in cv.split(X, Y):
        probas_ = gbc.fit(X[train], Y[train]).predict_proba(X[test])
        # Compute the ROC curve and the area under it for this fold,
        # interpolated onto a common FPR grid so the folds can be averaged.
        fpr, tpr, thresholds = roc_curve(Y[test], probas_[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        # plt.plot(fpr, tpr, lw=1, alpha=0.3,
        #          label='Epoch %d, ROC fold %d (AUC = %0.2f)' % (epoch, i, roc_auc))
        # i += 1

    test_probas_ = gbc.predict_proba(X_test)
    # Compute the ROC curve and area for the held-out test set.
    fpr_test, tpr_test, thresholds_test = roc_curve(Y_test, test_probas_[:, 1])
    tprs_test.append(interp(mean_fpr_test, fpr_test, tpr_test))
    tprs_test[-1][0] = 0.0
    roc_auc_test = auc(fpr_test, tpr_test)
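# A hedged continuation: average the interpolated per-fold TPRs to plot a mean
# ROC curve, following the standard scikit-learn cross-validated-ROC recipe.
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)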