def get_most_important_features(train):
    train = train.drop('ID', axis=1)
    train_y = train['TARGET']
    train_X = train.drop('TARGET', axis=1)

    random_forest = RandomForestClassifier(n_estimators=100)
    random_forest.fit(train_X, train_y)
    feature_importance = pd.Series(random_forest.feature_importances_, index=train_X.columns)
    feature_importance.sort_values(inplace=True)
    feature_importance.tail(20).plot(kind='barh', figsize=(15, 7),
                                     title='Feature importance by random forest')
    # plt.savefig("feature_importance.png")

    grad_boosting = GradientBoostingClassifier()
    grad_boosting.fit(train_X, train_y)
    feature_importance = pd.Series(grad_boosting.feature_importances_, index=train_X.columns)
    feature_importance.sort_values(inplace=True)
    feature_importance.tail(20).plot(kind='barh', figsize=(10, 7),
                                     title='Feature importance by gradient boosting')
    # plt.savefig("feature_importance2.png")

    extra_trees = ExtraTreesClassifier()
    extra_trees.fit(train_X, train_y)
    feature_importance = pd.Series(extra_trees.feature_importances_, index=train_X.columns)
    feature_importance.sort_values(inplace=True)
    feature_importance.tail(20).plot(kind='barh', figsize=(20, 7),
                                     title='Feature importance by extra trees classifier')
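# A minimal usage sketch for the function above (an assumption, not from the
# original snippet: it presumes a pandas DataFrame with 'ID' and 'TARGET'
# columns and these imports; 'train.csv' is a hypothetical input file).
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import (RandomForestClassifier,
                              GradientBoostingClassifier,
                              ExtraTreesClassifier)

# train_df = pd.read_csv('train.csv')
# get_most_important_features(train_df)
# plt.show()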
def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=100,
                                     n_iter_no_change=10,
                                     validation_fraction=0.1,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(n_estimators=100, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    validation_fraction=0.1,
                                    random_state=42)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check if n_estimators_ increases monotonically with n_iter_no_change
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_
def test_plot_partial_dependence_input():
    # Test partial dependence plot function input checks.
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)

    # not fitted yet
    assert_raises(ValueError, plot_partial_dependence, clf, X, [0])
    clf.fit(X, y)

    assert_raises(ValueError, plot_partial_dependence,
                  clf, np.array(X)[:, :0], [0])

    # first argument must be an instance of BaseGradientBoosting
    assert_raises(ValueError, plot_partial_dependence, {}, X, [0])

    # must be larger than -1
    assert_raises(ValueError, plot_partial_dependence, clf, X, [-1])

    # too large feature value
    assert_raises(ValueError, plot_partial_dependence, clf, X, [100])

    # str feature but no feature_names
    assert_raises(ValueError, plot_partial_dependence, clf, X, ['foobar'])

    # not valid features value
    assert_raises(ValueError, plot_partial_dependence, clf, X, [{'foo': 'bar'}])
def main(args):
    global verbose
    verbose = args.verbose

    # Load files
    if verbose:
        logger.info('Loading {}'.format(args.train_file))
    train_X, train_y = load_file(args.train_file)
    if verbose:
        logger.info('Loading {}'.format(args.test_file))
    test_X, test_y = load_file(args.test_file)

    # # Codes for Grid Search
    # params = [
    #     {'n_estimators': [50000],
    #      'learning_rate': [2**i for i in np.arange(-10, -9, .25)],
    #      'max_features': ['log2'], 'max_depth': [7]},
    # ]
    # method = GradientBoostingClassifier(random_state=1, verbose=1)
    # gscv = GridSearchCV(method, params, scoring='roc_auc', verbose=verbose, n_jobs=5)
    # gscv.fit(train_X.toarray(), train_y)
    # if verbose:
    #     for params, mean_score, all_scores in gscv.grid_scores_:
    #         logger.info('{:.6f} (+/- {:.6f}) for {}'.format(mean_score, all_scores.std() / 2, params))
    #     logger.info('params:{params}'.format(params=gscv.best_params_))
    #     logger.info('score:{params}'.format(params=gscv.best_score_))
    # pred = gscv.best_estimator_.predict_proba(test_X.toarray())

    # Best parameters for the competition data
    # (the original had `2**(-9,5)` and a missing comma before random_state)
    method = GradientBoostingClassifier(n_estimators=50000,
                                        learning_rate=2 ** -9.5,
                                        max_features='log2',
                                        max_depth=7,
                                        random_state=1,
                                        verbose=1)
    method.fit(train_X.toarray(), train_y)
    pred = method.predict_proba(test_X.toarray())

    np.savetxt(args.output, pred[:, 1], fmt='%.6f')
    if verbose:
        logger.info('Wrote preds to {file}'.format(file=args.output))

    return 0
def test_gradient_boosting_early_stopping():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=1000,
                                     n_iter_no_change=10,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbr = GradientBoostingRegressor(n_estimators=1000, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Check if early stopping works as expected
    for est, tol, early_stop_n_estimators in ((gbc, 1e-1, 24), (gbr, 1e-1, 13),
                                              (gbc, 1e-3, 36), (gbr, 1e-3, 28)):
        est.set_params(tol=tol)
        est.fit(X_train, y_train)
        assert_equal(est.n_estimators_, early_stop_n_estimators)
        assert est.score(X_test, y_test) > 0.7

    # Without early stopping
    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                     max_depth=3, random_state=42)
    gbc.fit(X, y)
    gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
                                    max_depth=3, random_state=42)
    gbr.fit(X, y)

    assert gbc.n_estimators_ == 100
    assert gbr.n_estimators_ == 200
def model_train_ensemble(X1, Y1, Save=False, modelname=None):
    X1, Y1 = DowmSample(X1, Y1, 9)
    # model = RandomForestClassifier(n_estimators=100, random_state=1)
    model = GradientBoostingClassifier(n_estimators=100, max_leaf_nodes=5,
                                       subsample=0.7, learning_rate=0.1,
                                       random_state=1)
    # model = LogisticRegression('l2')
    model.fit(X1, Y1.ravel())

    # Save the model (pickle requires a binary file handle)
    if Save:
        with open(modelname, 'wb') as f:
            pickle.dump(model, f)

    print('\n -------------- Training is over ----------------------')
    return model
def test_partial_dependence_input():
    # Test input validation of partial dependence.
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    assert_raises(ValueError, partial_dependence, clf, [0], grid=None, X=None)
    assert_raises(ValueError, partial_dependence, clf, [0], grid=[0, 1], X=X)

    # first argument must be an instance of BaseGradientBoosting
    assert_raises(ValueError, partial_dependence, {}, [0], X=X)

    # Gradient boosting estimator must be fit
    assert_raises(ValueError, partial_dependence,
                  GradientBoostingClassifier(), [0], X=X)

    assert_raises(ValueError, partial_dependence, clf, [-1], X=X)
    assert_raises(ValueError, partial_dependence, clf, [100], X=X)

    # wrong ndim for grid
    grid = np.random.rand(10, 2, 1)
    assert_raises(ValueError, partial_dependence, clf, [0], grid=grid)
def transform_with_gbm_to_categorical(header, tr_x, tr_y, ts_x, n_est=100,
                                      learning_rate=0.1, max_depth=5):
    clf = GradientBoostingClassifier(n_estimators=n_est,
                                     learning_rate=learning_rate,
                                     max_depth=max_depth)
    clf = clf.fit(tr_x, tr_y)

    """ # Node count
    estimators = clf.estimators_
    for row in estimators:
        for e in row:
            print(e.tree_.node_count)"""

    leaf_indices = clf.apply(tr_x)
    leaf_indices = leaf_indices.reshape(leaf_indices.shape[0], -1)

    ts_leaf_indices = clf.apply(ts_x)
    ts_leaf_indices = ts_leaf_indices.reshape(ts_leaf_indices.shape[0], -1)

    enc = OneHotEncoder()
    enc.fit(np.append(leaf_indices, ts_leaf_indices, axis=0))

    tr_cat_features = enc.transform(leaf_indices).toarray()
    ts_cat_features = enc.transform(ts_leaf_indices).toarray()

    header = ["cat_" + str(i) for i in range(ts_cat_features.shape[1])]
    print("[gbm_cat] Features size: ", len(header))
    return header, tr_cat_features, ts_cat_features
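# A minimal sketch (not from the original source) of the classic GBDT-leaf +
# linear-model pattern the function above enables; the toy data and the
# LogisticRegression stage are assumptions for illustration only.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=200, random_state=0)
hdr, tr_cat, ts_cat = transform_with_gbm_to_categorical([], X[:150], y[:150], X[150:])
lr = LogisticRegression(max_iter=1000).fit(tr_cat, y[:150])
print(lr.score(ts_cat, y[150:]))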
def train_GBDT(self):
    samples = self.trainset.values
    target = self.trainlabel.values
    classifier_GB = GradientBoostingClassifier(n_estimators=1000)
    classifier_GB.fit(samples, target)
    return classifier_GB
def classify2(dis_data, numeric_data, t_label):
    fold = 5
    skf = StratifiedKFold(n_splits=fold)
    roc_auc = 0
    f1_score_value = 0
    clf1 = LogisticRegression()
    clf2 = GradientBoostingClassifier()
    # clf3 = tree.DecisionTreeClassifier(max_depth=500, max_leaf_nodes=500, class_weight={1: 12})
    clf3 = GradientBoostingClassifier()

    for train, test in skf.split(dis_data, t_label):
        clf3 = clf3.fit(dis_data.iloc[train], t_label.iloc[train])
        # compute auc (use the probability of the positive class;
        # the original passed column 0, which inverts the ROC curve)
        probas_ = clf3.predict_proba(dis_data.iloc[test])
        fpr, tpr, thresholds = roc_curve(t_label.iloc[test], probas_[:, 1])
        roc_auc += auc(fpr, tpr)
        # compute f1_score
        label_pred = clf3.predict(dis_data.iloc[test])
        f1_score_value += f1_score(t_label.iloc[test], label_pred, pos_label=1)

    return roc_auc / fold, f1_score_value / fold
def PlotFeaturesImportance(X, y, featureNames, dataName):
    '''
    Plot the relative contribution/importance of the features.
    Best to reduce to the top X features first - for interpretability.
    Code example from:
    http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/
    '''
    gbc = GradientBoostingClassifier(n_estimators=40)
    gbc.fit(X, y)

    # Get feature importance from the classifier
    feature_importance = gbc.feature_importances_
    # Normalize the features
    feature_importance = 100 * (feature_importance / feature_importance.max())
    sorted_idx = numpy.argsort(feature_importance)
    pos = numpy.arange(sorted_idx.shape[0]) + 4.5
    # pos = numpy.arange(sorted_idx.shape[0])

    # plt.figure(figsize=(16, 12))
    plt.figure(figsize=(14, 9), dpi=250)
    plt.barh(pos, feature_importance[sorted_idx], align='center', color='#7A68A6')
    # plt.yticks(pos, numpy.asanyarray(df.columns.tolist())[sorted_idx])  # ORIG
    plt.yticks(pos, numpy.asanyarray(featureNames)[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('%s: Top Features' % (dataName))
    plt.grid(False)  # grid('off') is silently truthy in newer matplotlib
    plt.ion()
    plt.show()
    plt.savefig(str(dataName) + 'TopFeatures.png', dpi=200)
def final_run(X, Y, Xtest, n_est):
    clf = GradientBoostingClassifier(n_estimators=n_est, random_state=n_est)
    clf = clf.fit(X, Y)
    # np.savetxt('gb_oob_improve_{}'.format(n_est), clf.oob_improvement_)
    # np.savetxt('gb_train_score_{}'.format(n_est), clf.train_score_)
    Ytest = clf.predict(Xtest)
    output(Ytest, 'gradient_boost_{}.csv'.format(n_est))
def main():
    makeSub = True
    featureImportance = False
    cvfold = True

    df = pd.read_csv('../data/cprobTrain15NA.csv')
    X = np.array(pd.read_csv('../data/train.csv', usecols=range(1, 9)))
    y = np.array(pd.read_csv('../data/train.csv').ACTION)
    X = np.hstack((X, np.array(df)))

    params = {'max_depth': 4, 'subsample': 0.5, 'verbose': 0, 'random_state': 1337,
              'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 10,
              'n_estimators': 350, 'learning_rate': 0.05}
    clf = GradientBoostingClassifier(**params)
    prefix = 'lib/gbm350d4m10c15'

    if cvfold:
        c = classifier.Classifier(X, y)
        c.validate(clf, nFolds=10, out=prefix + 'Train.csv')

    if makeSub:
        Xt = np.array(pd.read_csv('../data/test.csv', usecols=range(1, 9)))
        Xt = np.hstack((Xt, np.array(pd.read_csv('../data/cprobTest15NA.csv'))))
        clf.fit(X, y)
        y_ = clf.predict_proba(Xt)[:, 1]
        out = pd.read_csv('subs/nbBaseTest.csv')
        out.ACTION = y_
        out.to_csv(prefix + 'Test.csv', index=False)

    if featureImportance:
        print("Feature ranking:")
        importances = clf.feature_importances_
        indices = np.argsort(importances)[::-1]
        np.savetxt('indices.txt', indices, delimiter=',')
        for f in range(df.shape[1]):
            print("%d. feature (%s,%f)" % (f + 1, df.columns[indices[f]],
                                           importances[indices[f]]))
def gradientboost_prediction(features_train, labels_train, features_test, ids):
    class RandomForestClassifier_compability(RandomForestClassifier):
        def predict(self, X):
            return self.predict_proba(X)[:, 1][:, np.newaxis]

    base_estimator = RandomForestClassifier_compability()
    # The deprecated `learn_rate=None` alias has been dropped; `learning_rate`
    # below is the supported parameter.
    clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.1,
                                     n_estimators=5, subsample=0.3,
                                     min_samples_split=2,
                                     min_samples_leaf=1,
                                     max_depth=3,
                                     init=base_estimator,
                                     random_state=None,
                                     max_features=None,
                                     verbose=2)
    clf = clf.fit(features_train, labels_train)
    pred = clf.predict_proba(features_test)[:, 1]

    # feature_importance = clf.feature_importances_
    # print(feature_importance)

    with open("data/rf_prediction.csv", "w", newline="") as predictions_file:
        predictions_file_object = csv.writer(predictions_file)
        predictions_file_object.writerow(["ID", "TARGET"])
        predictions_file_object.writerows(zip(ids, pred))
def train():
    posi_result = {}
    train_feature, test_feature, train_id_list, test_id_list, train_tar_list = merge_feature(feature_str)

    # The original mixed camelCase and snake_case names (trainTarList,
    # trainFeature, allProb, ...); they are unified below. The target filter
    # is also applied to the features so X and y stay the same length.
    tmp1 = np.array([m < 32 for m in train_tar_list])
    train_feature = train_feature[tmp1]
    target_list = np.array(train_tar_list)
    target_list = target_list[tmp1]
    # train_id_list = np.array(train_id_list)
    # train_id_list = train_id_list[tmp1]

    c_feature = train_feature.columns[:]

    clf1 = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    clf1.fit(train_feature[c_feature], target_list)
    # rf_preds = clf1.predict(test_feature)
    rf_prob = clf1.predict_proba(test_feature)

    gbdt1 = GradientBoostingClassifier(n_estimators=150, min_samples_split=17)
    gbdt1.fit(train_feature[c_feature], target_list)
    # gbdt_preds = gbdt1.predict(test_feature)
    gbdt_prob = gbdt1.predict_proba(test_feature)

    all_prob = rf_prob + gbdt_prob
    all_preds = []
    print(all_prob.shape)
    for k in range(all_prob.shape[0]):
        prob1 = list(all_prob[k, :])
        ind1 = prob1.index(max(prob1))
        all_preds.append(ind1)

    for j in range(len(all_preds)):
        all_pre_name = dl.get_num_position(all_preds[j])
        posi_result[test_id_list[j]] = all_pre_name
    return posi_result
def gbc_gp_predict(train_x, train_y, test_x):
    feature_indexs = getTopFeatures(train_x, train_y)
    sub_x_Train = get_data(
        train_x,
        feature_indexs[:16],
        features.feature_pair_sub_list,
        features.feature_pair_plus_list,
        features.feature_pair_mul_list,
        features.feature_pair_divide_list[:20],
    )
    sub_x_Test = get_data(
        test_x,
        feature_indexs[:16],
        features.feature_pair_sub_list,
        features.feature_pair_plus_list,
        features.feature_pair_mul_list,
        features.feature_pair_divide_list[:20],
    )

    labels = toLabels(train_y)
    gbc = GradientBoostingClassifier(n_estimators=3000, max_depth=9)
    gbc.fit(sub_x_Train, labels)

    pred_probs = gbc.predict_proba(sub_x_Test)[:, 1]
    ind_test = np.where(pred_probs > 0.55)[0]
    gp_preds_part = gbc_gp_predict_part(sub_x_Train, train_y, sub_x_Test[ind_test])
    gp_preds = np.zeros(len(test_x))
    gp_preds[ind_test] = gp_preds_part
    return gp_preds
def ada_boost():
    with open('traindata.pkl', 'rb') as savefile:
        (x_train, y_train, t1) = cPickle.load(savefile)
    with open('testdata.pkl', 'rb') as savefile:
        (x_test, t1, name1) = cPickle.load(savefile)

    # X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(
    #     X, y, test_size=0.1, random_state=42)

    x_train = np.asarray(x_train, dtype=np.float32)
    y_train = np.asarray(y_train, dtype='int32') - 1

    nest = 190
    lr = .1
    md = 6

    # clf1 = DecisionTreeClassifier(max_depth=2)
    # clf = AdaBoostClassifier(clf1, n_estimators=200, learning_rate=.25)
    clf = GradientBoostingClassifier(n_estimators=nest, learning_rate=lr,
                                     max_depth=md, random_state=0)
    # clf = RandomForestClassifier(n_estimators=200)  # .81
    # clf = ExtraTreesClassifier(n_estimators=1000, max_depth=None,
    #                            min_samples_split=10, random_state=0, n_jobs=8)  # .81
    # clf = KNeighborsClassifier(15)

    if 1:
        clf.fit(x_train, y_train)
        ypred = clf.predict_proba(x_test)
        y_str = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5',
                 'Class_6', 'Class_7', 'Class_8', 'Class_9']
        kcsv.print_csv(ypred, name1, y_str, indexname='id')
        print(nest, lr, md)

    if 0:
        multiclass_log_loss = make_scorer(score_func=logloss_mc,
                                          greater_is_better=True,
                                          needs_proba=True)
        scores = cross_val_score(clf, x_train, y_train, n_jobs=8, cv=5,
                                 scoring=multiclass_log_loss)
        print(scores)
        print(nest, lr, md, scores.mean())
def GradBoost(X_DS, Y_DS, X_train, X_test, y_train, y_test,
              Cl_Names='None', mask='None', Max_Depth=3):
    # ******************************************************************************
    from sklearn.ensemble import GradientBoostingClassifier as GBC  # import library for machine learning analysis
    from sklearn.metrics import classification_report

    print('Gradient Boosting: Training...')               # notify the user about the status of the process
    Gradient_Boosting_obj = GBC(max_depth=Max_Depth)      # call the built-in Gradient Boosting routine
    Gradient_Boosting_obj.fit(X_train, y_train)           # fit the gradient boosting model to the training set
    Pred_Train = Gradient_Boosting_obj.predict(X_train)   # apply the model to the training set
    Pred_Test = Gradient_Boosting_obj.predict(X_test)     # apply the model to the test set
    print('Gradient Boosting: Completed!')                # notify the user about the status of the process

    labels = len(np.unique(Y_DS))                         # number of distinct classification classes
    Conf_M = np.zeros((labels, labels), dtype='int')      # initialize the confusion matrix

    if Cl_Names != 'None':
        target_names = Cl_Names
    else:
        target_names = np.arange(len(np.unique(Y_DS))).astype(str).tolist()
    # end

    Conf_M = CM(y_test, Pred_Test, np.unique(Y_DS))       # build the confusion matrix from the test and prediction sets
    print(classification_report(y_test, Pred_Test, target_names=target_names))  # print the performance indicators

    return Gradient_Boosting_obj, Conf_M
def get_n_fold_validation_score(self, fold=10):
    features = data.get_features()
    labels = data.get_lables()
    length = len(features)
    jump = length // fold  # integer fold size
    index = 0
    k = 0
    scores = list()

    while k < fold:
        feature_test = features.iloc[index:(index + jump), :]
        label_test = labels.iloc[index:(index + jump), :]

        # Training data is everything before and after the test fold
        # (the original off-by-one slices silently dropped a row on each side).
        feature_train_1 = features.iloc[0:index, :] if index != 0 else pd.DataFrame()
        feature_train_2 = features.iloc[index + jump:length, :]
        feature_train = pd.concat([feature_train_1, feature_train_2])

        label_train_1 = labels.iloc[0:index, :] if index != 0 else pd.DataFrame()
        label_train_2 = labels.iloc[index + jump:length, :]
        label_train = pd.concat([label_train_1, label_train_2])

        index += jump
        k += 1

        classifier = GradientBoostingClassifier()
        classifier.fit(feature_train, label_train["lable"].values)
        scores.append(accuracy_score(label_test, classifier.predict(feature_test)))

    return sum(scores) / float(len(scores))
def gbdt_train(self, data, task_id, window=DEFAULT_WINDOW):
    """
    Train a gbdt model.

    :param data: Training dataset.
    :param task_id: The id of the training task.
    :param window: The length of the window.
    """
    X_train = []
    y_train = []
    features = self.__calculate_features(data, window)
    if not features:
        # the original returned TSD_LACK_SAMPLE when features *were* present
        return TSD_LACK_SAMPLE
    for index in features:
        X_train.append(index[0])
        y_train.append(index[1])
    X_train = np.array(X_train)
    y_train = np.array(y_train)

    try:
        grd = GradientBoostingClassifier(n_estimators=self.n_estimators,
                                         max_depth=self.max_depth,
                                         learning_rate=self.learning_rate)
        grd.fit(X_train, y_train)
        model_name = MODEL_PATH + task_id + "_model"
        joblib.dump(grd, model_name)
    except Exception as ex:
        return TSD_TRAIN_ERR, str(ex)
    return TSD_OP_SUCCESS, ""
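# A minimal companion sketch (an assumption, not part of the original module)
# showing how a model persisted by gbdt_train above could be reloaded for
# scoring; MODEL_PATH and the task-id naming convention come from the method.
import joblib

def gbdt_load_and_predict(task_id, X):
    grd = joblib.load(MODEL_PATH + task_id + "_model")
    return grd.predict_proba(X)[:, 1]  # probability of the positive class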
def mse_sklearn(x_train, x_test, y_train, y_test, n_estimators):
    # NOTE: despite the name, this helper returns the F1 score of the fitted model.
    clf = GradientBoostingClassifier(n_estimators=n_estimators,
                                     min_samples_leaf=MIN_SAMPLES_LEAF,
                                     max_depth=MAX_DEPTH)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    return f1_score(y_test, pred)
class Blender(BaseEstimator, ClassifierMixin):
    def __init__(self, trained_clfs):
        self.clfs = trained_clfs
        # self.classifier = make_pipeline(OneHotEncoder(), DenseTransformer(),
        #                                 GradientBoostingClassifier())
        self.classifier = GradientBoostingClassifier()
        # self.classifier = make_pipeline(
        #     OneHotEncoder(), LogisticRegression(class_weight='auto'))

    def fit(self, data, target):
        # self.enc = LabelEncoder().fit(target)
        probs = self.transform_input(data)
        self.classifier.fit(probs, target)
        return self  # scikit-learn convention: fit returns the estimator

    def predict(self, data):
        predictions = self.transform_input(data)
        return self.classifier.predict(predictions)

    def transform_input(self, data):
        probabilities = [clf.predict_proba(data) for clf in self.clfs]
        probabilities = np.array(probabilities)
        # Stack per-classifier probabilities side by side. Transpose first so
        # each row keeps all probabilities belonging to a single sample
        # (a plain reshape from (n_clfs, samples, features) scrambles rows).
        n_clfs, samples, features = probabilities.shape
        probabilities = probabilities.transpose(1, 0, 2).reshape(
            samples, n_clfs * features)
        probabilities[np.isnan(probabilities)] = 0
        return probabilities
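# A minimal usage sketch for Blender (an assumption: any set of pre-fitted
# probabilistic classifiers works as input; the toy data is illustrative).
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

X, y = make_classification(n_samples=300, random_state=0)
base = [RandomForestClassifier(random_state=0).fit(X[:200], y[:200]),
        GradientBoostingClassifier(random_state=0).fit(X[:200], y[:200])]
blender = Blender(base).fit(X[:200], y[:200])
print(blender.predict(X[200:])[:10])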
def partial_dependence(df, y):
    '''
    INPUT: df = feature DataFrame, y = binary target with imbalanced classes
    OUTPUT: partial dependence plots for the six most important features of a
            gradient boosting classifier trained on the oversampled data.

    Oversamples the minority class to balance the target, runs the feature
    engineering pipeline, fits the model, and plots partial dependence.
    (The original docstring described the oversampling helper instead.)
    '''
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)
    # X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)

    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])

    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    X_test = feature_engineering.fit_transform(X_test.copy(), y_test)

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)
    most_imp = np.argsort(gbc.feature_importances_)[-6:]

    names = list(X_test.columns)
    feats = list(most_imp)
    fig, axs = plot_partial_dependence(gbc, X_test, feats, feature_names=names,
                                       n_jobs=3, grid_resolution=50)
def run_gradient_boosting_classifier(data, _max_depth):
    (feature_train, feature_test, label_train, label_test) = train_test_split(
        data[:, 0:-1], data[:, -1].astype(int), test_size=0.25)

    # TODO: Vary Number of Estimators and Learning Rate
    gbc = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50,
                                     max_depth=_max_depth, verbose=True)
    gbc.fit(feature_train, label_train)

    training_error = gbc.score(feature_train, label_train)
    # cross_validation_score = cross_val_score(gbc, feature_train, label_train, cv=10)
    testing_error = gbc.score(feature_test, label_test)

    print("Gradient Boosting Results for Max Depth:", _max_depth)
    print("Training Accuracy:", training_error)
    # print("10-fold Cross Validation Accuracy: %0.2f (+/- %0.2f)"
    #       % (cross_validation_score.mean(), cross_validation_score.std() * 2))
    print("Testing Accuracy:", testing_error)

    feature_importance = gbc.feature_importances_
    stddev = np.std([tree[0].feature_importances_ for tree in gbc.estimators_],
                    axis=0)
    indices = np.argsort(feature_importance)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(len(feature_importance)):
        print("%d. feature %d (%f)" % (f + 1, indices[f],
                                       feature_importance[indices[f]]))

    plot_feature_importance(
        feature_importance, indices, stddev,
        "gradient-boosted-classifier-feature-importance-depth-" + str(_max_depth))
def main():
    print('[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S')))
    with open('train.p', 'rb') as training_file:
        train = pickle.load(training_file)
    with open('test.p', 'rb') as testing_file:
        test = pickle.load(testing_file)

    trainX = train[:, :-1]
    trainy = train[:, -1]
    testX = test[:, :-1]
    testy = test[:, -1]

    print('[INFO, time: %s] Fitting %s ...'
          % (time.strftime('%H:%M:%S'),
             'GradientBoostingClassifier(n_estimators=1000)'))
    clf = GradientBoostingClassifier(n_estimators=1000)
    clf.fit(trainX, trainy)

    print('[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S')))
    prediction = clf.predict(testX)
    print('[RESULT, time: %s] accuracy = %f'
          % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction)))

    with open('gradient_1000.p', 'wb') as model_save_file:
        pickle.dump(clf, model_save_file)
    print('All done')
def ctr_gbdt(model='sklearn-clicklog', from_cache=False,
             train_dataset_length=100000, test_dataset_length=100000):
    TRAIN_FILE, TEST_FILE = create_dataset(model, from_cache,
                                           train_dataset_length,
                                           test_dataset_length)

    prediction_model = GradientBoostingClassifier(
        loss='deviance',
        learning_rate=0.1,
        n_estimators=30,
        subsample=1.0,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_depth=5,
    )

    x_train, y_train = clean_data(TRAIN_FILE)
    x_test, y_test = clean_data(TEST_FILE)

    with Timer('fit model'):
        prediction_model.fit(x_train, y_train)

    with Timer('evaluate model'):
        y_prediction_train = prediction_model.predict_proba(x_train)
        y_prediction_test = prediction_model.predict_proba(x_test)

        loss_train = log_loss(y_train, y_prediction_train)
        loss_test = log_loss(y_test, y_prediction_test)

    print('loss_train: %s' % loss_train)
    print('loss_test: %s' % loss_test)
def predict(fea, df, t, t9):
    Un = df.columns == 'Blank'
    for f in fea:  # the original iterated over an undefined `Fea`
        '''
        try:
            df[(f + '_y')] = df[(f + '_x')] - df[(f + '_y')]
            print(1)
        except:
            pass
        '''
        Un = Un | (df.columns == f)
        Un = Un | (df.columns == (f + '_x'))
        Un = Un | (df.columns == (f + '_y'))
    Un = Un & (df.columns != 'New_y')

    clf = GradientBoostingClassifier()
    y = df[t].label
    X = df[t].loc[:, Un]  # .ix is deprecated; .loc accepts a boolean column mask
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9,
                                                        random_state=1)
    clf.fit(X_train, y_train)

    re = 'Testing AUC: \t' + str(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
    print(re)
    re = 'September AUC: \t' + str(roc_auc_score(df[t9].label,
                                                 clf.predict_proba(df[t9].loc[:, Un])[:, 1]))
    print(re)
    print(X.columns)
    print(clf.feature_importances_)
    return Un, clf
def trainModelComb4(self):
    ntrain = self.data_train.shape[0]
    self.xtra = 5
    # for original data, essay and others, which would be fed to a second gb
    est_prob = np.zeros([ntrain, self.xtra + 1])

    self.mlmodel2 = [LogisticRegression() for i in range(self.xtra)]
    for i in range(self.xtra - 1):
        self.mlmodel2[i].fit(self.data_train, self.labels_train[:, i + 1])
        set_result = self.mlmodel2[i].predict_proba(self.data_train)
        est_prob[:, i] = set_result[:, 1]

    self.mlmodel2[self.xtra - 1].fit(self.data_train_ess, self.labels_train[:, 0])
    set_result2 = self.mlmodel2[self.xtra - 1].predict_proba(self.data_train_ess)
    est_prob[:, self.xtra - 1] = set_result2[:, 1]

    # self.data_train = np.hstack((self.data_train, est_prob))
    # self.mlmodel = AdaBoostClassifier()
    self.mlmodel = GradientBoostingClassifier(learning_rate=0.2, subsample=0.4)
    # self.mlmodel = RandomForestClassifier(n_estimators=200, n_jobs=3, verbose=1)
    self.mlmodel.fit(self.data_train, self.labels_train[:, 0])
    set_result3 = self.mlmodel.predict_proba(self.data_train)
    est_prob[:, self.xtra] = set_result3[:, 1]

    # 2nd layer GB
    self.mlmodel3 = GradientBoostingClassifier(learning_rate=0.1)
    self.mlmodel3.fit(est_prob, self.labels_train[:, 0])
def train_classifiers(X_data, y_data):
    ############ Linear SVM: 0.908 #############
    clf_LSVM = svm.SVC(kernel='linear')
    clf_LSVM.fit(X_data, y_data)

    ############ MultinomialNB: 0.875 #############
    clf_MNB = MultinomialNB()
    clf_MNB.fit(X_data, y_data)

    ############ Random Forest: 0.910 #############
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf_RF.fit(X_data, y_data)

    ############ Extra Trees: 0.915 ##################
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None,
                                   min_samples_split=1, random_state=0)
    clf_ETC.fit(X_data, y_data)

    ############ AdaBoost: 0.88 ##################
    clf_Ada = AdaBoostClassifier()
    clf_Ada.fit(X_data, y_data)

    ############ rbf SVM: 0.895 #############
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    clf_rbf.fit(X_data, y_data)

    ############ GradientBoosting: 0.88 #############
    clf_GBC = GradientBoostingClassifier()
    clf_GBC.fit(X_data, y_data)

    return clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC
def test_oob_improvement():
    """Test if oob improvement has correct shape and regression test."""
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1,
                                     subsample=0.5)
    clf.fit(X, y)
    assert clf.oob_improvement_.shape[0] == 100
    # hard-coded regression test - change if modification in OOB computation
    assert_array_almost_equal(clf.oob_improvement_[:5],
                              np.array([0.19, 0.15, 0.12, -0.12, -0.11]),
                              decimal=2)
                          learning_rate=0.1, subsample=0.81,
                          colsample_bytree=0.61, max_depth=3,
                          random_state=0)

if os.path.exists("model/slot_gbdt.model"):
    with open("model/slot_gbdt.model", "rb") as f:
        gbdt_model = pickle.load(f)
else:
    # Fall back to a freshly configured model when no pickle is found.
    # (The original printed this warning and then called exit(), so the
    # fallback below could never run.)
    print("Warning: GBDT model not found, default model used")
    gbdt_model = GradientBoostingClassifier(learning_rate=0.1,
                                            random_state=0,
                                            n_estimators=30,
                                            min_samples_split=2,
                                            min_samples_leaf=8,
                                            max_features=0.79,
                                            subsample=0.78,
                                            max_depth=5)

base_models = [rf_model, gbdt_model, xgb_model]
# stacker = LogisticRegression(random_state=43)
stacker = XGBClassifier(random_state=42)
ensemble = Ensemble(n_folds=5, stacker=stacker, base_models=base_models)
y_pre = ensemble.fit_predict(X=x_train, y=y_train, T=x_test)
print(y_test)
print(y_pre)
recall = recall_score(y_test, y_pre)
def do_gbdt4(train_x, train_y, test_x=None, test_y=None, learning_rate=0.03,
             max_depth=8, max_features=25, n_estimators=600, load=False,
             save=True, outfile=None, search=False, log=False):
    if not search:
        if log:
            mdl_name = ('gbdt_log_train_lr' + str(learning_rate) + '_n' +
                        str(n_estimators) + '_maxdep' + str(max_depth) + '.pkl')
        else:
            mdl_name = ('gbdt_train_lr' + str(learning_rate) + '_n' +
                        str(n_estimators) + '_maxdep' + str(max_depth) + '.pkl')
        if os.path.exists(mdl_name):
            clf_gbdt = joblib.load(mdl_name)
        else:
            # create gradient boosting
            clf_gbdt = GradientBoostingClassifier(learning_rate=learning_rate,
                                                  max_depth=max_depth,
                                                  max_features=max_features,
                                                  n_estimators=n_estimators)
            clf_gbdt.fit(train_x, train_y)
            if save:
                try:
                    _ = joblib.dump(clf_gbdt, mdl_name, compress=1)
                except:
                    print("*** Save GBM model to pickle failed!!!")
                    if outfile is not None:
                        outfile.write("*** Save GBM model to pickle failed!!!")
        if test_x is not None and test_y is not None:
            probas_gbdt = clf_gbdt.predict_proba(test_x)[:, 1]
            score_gbdt = roc_auc_score(test_y, probas_gbdt)
            print("GBDT ROC score", score_gbdt)
        return clf_gbdt
    else:
        max_depth_list = [6, 7, 8, 9, 10]
        n_list = [2000]
        lr_list = [0.005, 0.003]
        max_feat_list = [15, 16, 17, 18, 20]
        info = {}
        for md in max_depth_list:
            for n in n_list:
                for lr in lr_list:
                    for mf in max_feat_list:
                        print('max_depth = ', md)
                        print('n = ', n)
                        print('learning rate = ', lr)
                        print('max feature = ', mf)
                        mdl_name = ('gbdt_n' + str(n) + '_lr' + str(lr) +
                                    '_md' + str(md) + 'mf' + str(mf) + '.pkl')
                        if os.path.exists(mdl_name):
                            clf_gbdt = joblib.load(mdl_name)
                        else:
                            # use the loop variables here, not the function
                            # defaults as the original accidentally did
                            clf_gbdt = GradientBoostingClassifier(
                                learning_rate=lr,
                                max_depth=md,
                                max_features=mf,
                                n_estimators=n)
                            clf_gbdt.fit(train_x, train_y)
                            _ = joblib.dump(clf_gbdt, mdl_name, compress=1)
                        probas_gbdt = clf_gbdt.predict_proba(test_x)[:, 1]
                        score_gbdt = roc_auc_score(test_y, probas_gbdt)
                        info[md, n, lr, mf] = score_gbdt
        # each info value is a single float, so report it directly
        for key, score in info.items():
            print('GBDT max_depth = %d, n = %d, lr = %.5f, max_feature = %d, '
                  'ROC score = %.5f' % (key[0], key[1], key[2], key[3], score))
df3 = df3.drop("Address", 1) df3 = df3.drop("Dates", 1) df3 = df3.drop('id', 1) """ scaler = MinMaxScaler() numerical = ['X','Y','Time','Date'] df1[numerical] = scaler.fit_transform(df1[numerical]) df3[numerical] = scaler.transform(df3[numerical]) """ X_train_sub, X_validation_sub, y_train_sub, y_validation_sub = model_selection.train_test_split( df1, train_target, random_state=0) clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, min_samples_leaf=20, max_depth=5, random_state=0) clf.fit(X_train_sub, y_train_sub) print("Accuracy score (training): {0:.3f}".format( clf.score(X_train_sub, y_train_sub))) print("Accuracy score (validation): {0:.3f}".format( clf.score(X_validation_sub, y_validation_sub))) print() """ clf = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.5, min_samples_leaf=10, max_depth = 5, random_state = 0) X_train_sub, X_validation_sub, y_train_sub, y_validation_sub = model_selection.train_test_split(df1, train_target, random_state=0) learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Parameter evaluation with GridSearchCV
gbe = GradientBoostingClassifier(random_state=42)
parameters = {
    'learning_rate': [0.05, 0.1, 0.5],
    'max_features': [0.5, 1],
    'max_depth': [3, 4, 5]
}
gridsearch = GridSearchCV(gbe, parameters, cv=100, scoring='roc_auc')
gridsearch.fit(X, y)
print(gridsearch.best_params_)
print(gridsearch.best_score_)

# Adjusting the decision threshold
gbi = GradientBoostingClassifier(learning_rate=0.05, max_depth=3,
                                 max_features=0.5, random_state=42)
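# A minimal sketch (an assumption: binary 0/1 target and the scaled split
# above) of adjusting the decision threshold via predict_proba instead of
# the default 0.5 cut-off used by predict().
gbi.fit(X_train, y_train)
probs = gbi.predict_proba(X_test)[:, 1]   # P(class == 1)
threshold = 0.3                           # illustrative value, not tuned here
y_pred_custom = (probs >= threshold).astype(int)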
def trainGBM(train, target, test, test_stripped):
    gbm = GradientBoostingClassifier()
    gbm.fit(train, target)
    prediction = [[test[index][0], x]
                  for index, x in enumerate(gbm.predict(test_stripped))]
    return prediction
y_predXgb = xgbClassifier.predict(train)
y_predXgbt = xgbClassifier.predict(test)
# compare training predictions against the training labels
# (the original used y_test against predictions made on the training set)
cm = confusion_matrix(y, y_predXgb)

from sklearn.neighbors import KNeighborsClassifier
KNclassifier = KNeighborsClassifier(n_neighbors=20, metric='minkowski', p=2)
KNclassifier.fit(train, y)
y_predKN = KNclassifier.predict(train)
y_predKNt = KNclassifier.predict(test)
cm = confusion_matrix(y, y_predKN)

from sklearn.ensemble import GradientBoostingClassifier
gdr = GradientBoostingClassifier(n_estimators=1000, random_state=42,
                                 learning_rate=0.02, max_depth=2)
gdr.fit(train, y)
y_predgrd = gdr.predict(train)
y_predgrdt = gdr.predict(test)
cm = confusion_matrix(y, y_predgrd)

# first-level predictions become the features of the meta-classifier
stacked_prediction1 = np.column_stack((y_pred, y_predXgb, y_predKN, y_predgrd))
stacked_predictionTest1 = np.column_stack(
    (y_predt, y_predXgbt, y_predKNt, y_predgrdt))
xgb_metal_Classifier = XGBClassifier(n_estimators=10, learning_rate=0.1,
                                     random_state=42)
xgb_metal_Classifier.fit(stacked_prediction1, y)
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.5640644334722438
exported_pipeline = GradientBoostingClassifier(learning_rate=0.01, max_depth=3,
                                               max_features=0.6000000000000001,
                                               min_samples_leaf=20,
                                               min_samples_split=18,
                                               n_estimators=100, subsample=0.4)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
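# A small follow-up sketch (not part of the TPOT export) showing how the
# held-out predictions could be scored; accuracy_score is an assumption about
# the metric of interest.
from sklearn.metrics import accuracy_score
print(accuracy_score(testing_target, results))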
y = titanic["Survived"].values # Split data in a train and a validation set X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.2, random_state=42) # Initialization of the classificators I whish to try clfs = [] clfs.append(LogisticRegression()) clfs.append(SVC()) clfs.append(KNeighborsClassifier(n_neighbors=3)) clfs.append(DecisionTreeClassifier()) clfs.append(RandomForestClassifier()) clfs.append(GradientBoostingClassifier()) mean_clfs = [] std_clfs = [] validation_score = [] # Cicle on the classifiers. For each classifier we look for cross validation accuracy score. # We save the accuracy on the validation set as well for name, classifier in zip(clfs_name, clfs): scores = cross_val_score(classifier, X_train, y_train, cv=7, scoring="accuracy") print('---------------------------------') print(name, ':')
def get_accuracies(data):
    X_train, X_test, y_train, y_test = get_balanced_data(data)
    seed = 1

    rfc = RandomForestClassifier(bootstrap=True, max_depth=10,
                                 max_features='auto', min_samples_leaf=2,
                                 min_samples_split=10, n_estimators=500)
    rfc2 = RandomForestClassifier(bootstrap=False, max_depth=2,
                                  max_features='auto', min_samples_leaf=5,
                                  min_samples_split=20, n_estimators=100)
    gbm = GradientBoostingClassifier(min_samples_split=25, min_samples_leaf=25,
                                     loss='deviance', learning_rate=0.1,
                                     max_depth=5, max_features='auto',
                                     criterion='friedman_mse', n_estimators=100)

    def baseline_model(optimizer='adam', learn_rate=0.01):
        model = Sequential()
        model.add(Dense(100, input_dim=X_train.shape[1], activation='relu'))
        # 50 hidden units in the second layer (units are the kernel)
        model.add(Dense(50, activation='relu'))
        model.add(Dense(2, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer=optimizer,
                      metrics=['accuracy'])
        return model

    keras = KerasClassifier(build_fn=baseline_model, batch_size=32, epochs=100,
                            verbose=0, optimizer='Adam')
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    svm = SVC(gamma="scale", probability=True, kernel='rbf', C=0.5)

    models = [('GBM', gbm), ('RFC', rfc), ('RFC2', rfc2), ('Keras', keras),
              ('SVM', svm)]
    results = []
    names = []
    scoring = 'accuracy'
    accuracy = []
    for name, model in models:
        cv_results = cross_val_score(model, X_train, y_train, cv=outer_cv,
                                     scoring=scoring)
        results.append(cv_results)
        names.append(name)
        # msg = "Cross-validation Accuracy %s: %f (+/- %f )" % (
        #     name, cv_results.mean() * 100, cv_results.std() * 100)
        # print(msg)
        model.fit(X_train, y_train)
        # print('Test set accuracy: {:.2f}'.format(model.score(X_test, y_test) * 100), '%')
        # accuracy.append(name)
        accuracy.append(model.score(X_test, y_test))
    return accuracy
# Make predictions using Gradient Boosting + the 0.5 subset as it gave the
# best estimated performance
max_depth = 11

# Obtain the list of indexes for the required model
indexes = []
for trans, name, X, X_val, v, cols_list, rem_list, rank_list, i_cols_list, i_rem_list in X_all_add:
    if v == 0.5:
        if trans == 'Orig':
            indexes = i_cols_list
            break

from sklearn.ensemble import GradientBoostingClassifier

# Best model definition
best_model = GradientBoostingClassifier(max_depth=max_depth, random_state=seed)
best_model.fit(X_orig[:, indexes], Y)

# Read test dataset
dataset_test = pandas.read_csv("../input/test.csv")

# Drop unnecessary columns
ID = dataset_test['Id']
dataset_test.drop('Id', axis=1, inplace=True)
dataset_test.drop(rem, axis=1, inplace=True)
X_test = dataset_test.values

# Make predictions using the best model
predictions = best_model.predict(X_test[:, indexes])

# Write submissions to output file in the correct format
with open("submission.csv", "w") as subfile:
    subfile.write("Id,Cover_Type\n")
#             ('cntr_brand', Pipeline([
#                 ('group_col_selector', ColumnSelector(['phone_brand'])),
#                 ('print_data2', PrintTransfrmer()),
#                 ('cnt', GroupCntTransfrmer()),
#             ])),
#         ],
#         transformer_weights={  # weight components in FeatureUnion
#             'cntr_device': 1.0,
#             'cntr_brand': 1.0,
#         },
#     )
# ),
('print_data', PrintTransfrmer()),
# NOTE: FeatureUnion expects transformers; plain classifiers only work here
# if they are wrapped to expose a transform() (e.g. returning predict_proba).
('estimators', FeatureUnion([
    ('gbc', GradientBoostingClassifier()),
    ('rf', RandomForestClassifier()),
])),
('ensambler', LogisticRegression()),
])

pipe_params = {  # 'feature_union__transformer_weights': [[1, 1], [4, 1], [1, 4]],
    'estimators__gbc__n_estimators': [100, 500, 1500],
    'estimators__rf__n_estimators': [100, 500, 1500],
    'ensambler__C': [10, 1, 0.1],
}

if __name__ == '__main__':
    all_data = dataset()
    all_data.data_wrangling(code_testing=False)
                      min_weight_fraction_leaf=0.0, presort=False,
                      random_state=None, splitter='best')
modle.fit(titanic[predictors], titanic["Survived"])
kf = cross_validation.KFold(titanic.shape[0], 3, random_state=1)
scores = cross_validation.cross_val_score(modle, titanic[predictors],
                                          titanic["Survived"], cv=kf)
print("Estimated accuracy of DT: %f" % scores.mean())

# GBDT
gb_clf = GradientBoostingClassifier(n_estimators=50, max_depth=3,
                                    random_state=1)
gb_clf.fit(titanic[predictors], titanic["Survived"])
kf = cross_validation.KFold(titanic.shape[0], 3, random_state=1)
scores = cross_validation.cross_val_score(gb_clf, titanic[predictors],
                                          titanic["Survived"], cv=kf)
print("Estimated accuracy of GBDT: %f" % scores.mean())

# KNN
knn = neighbors.KNeighborsClassifier(algorithm='kd_tree', n_neighbors=10)
knn.fit(titanic[predictors], titanic["Survived"])
kf = cross_validation.KFold(titanic.shape[0], 5, random_state=1)
scores = cross_validation.cross_val_score(knn, titanic[predictors],
def modell(X_org, y_org, test_x):
    n_folds = 5
    verbose = True
    shuffle = False

    X = X_org
    y = y_org
    X_submission = test_x
    # X_submission = X_org

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]

    skf = StratifiedKFold(n_splits=5, random_state=20, shuffle=True)
    a = list(skf.split(X, y))
    # skf = StratifiedKFold(y=y, n_folds=n_folds)

    clfs = [
        # RandomForestClassifier().set_params(**INITIAL_PARAMS.get("RFC:one", {})),
        ExtraTreesClassifier().set_params(**INITIAL_PARAMS.get("ETC:one", {})),
        GradientBoostingClassifier().set_params(**INITIAL_PARAMS.get("GBC:one", {})),
        # LogisticRegression().set_params(**INITIAL_PARAMS.get("LR:one", {})),
        xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:two", {})),
        # xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:one", {})),
        lgb.LGBMClassifier().set_params(**INITIAL_PARAMS.get("LGB:one", {}))
    ]

    print("Creating train and test sets for blending.")
    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print(j, clf)
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(a)))
        for i, (train, test) in enumerate(a):
            print("Fold", i)
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print("Blending.")
    clf = LogisticRegression(C=2, penalty='l2', class_weight='balanced',
                             n_jobs=-1)
    # clf = linear_model.RidgeCV(alphas=np.linspace(0, 200), cv=LM_CV_NUM)
    # clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5,
    #                                  max_depth=6, n_estimators=100)
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict(dataset_blend_test)
    x_submission = clf.predict(dataset_blend_train)
    # final_model = LinearRegression()
    # final_model.fit(stacked_train, y_train)
    # test_prediction = final_model.predict(stacked_test)

    print("Linear stretch of predictions to [0,1]")
    print("blend result")
    # save_submission.to_csv(r'C:\Users\Administrator\Desktop\da\su.csv', index=False)
    return y_submission, dataset_blend_train, dataset_blend_test, x_submission
# https://www.kaggle.com/rblcoder/learning-bayes-search-optimization
# https://scikit-optimize.github.io/#skopt.BayesSearchCV
# uses Bayesian optimization to find model parameters
from skopt import BayesSearchCV
import pandas as pd
from skopt.space import Real, Categorical, Integer

# estimator = GradientBoostingClassifier(n_estimators=100,
#                                        max_depth=6,
#                                        min_samples_split=2,
#                                        min_samples_leaf=0.001,
#                                        subsample=0.5,
#                                        learning_rate=0.001)
estimator = GradientBoostingClassifier()
search_spaces = {
    'n_estimators': Integer(100, 2000),
    'max_depth': Integer(6, 15),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 20),
    'subsample': Real(0.5, 1),
    'learning_rate': Real(0.001, 0.2)
}
opt = BayesSearchCV(estimator,
                    search_spaces,
                    n_iter=20,
                    scoring='roc_auc',
                    n_jobs=-1,
clf = ExtraTreesClassifier()
clf.fit(x_train, y_train)
print("ExtraTrees classifier")
print(clf.score(x_test, y_test))
print("\n")

# GradientBoosting Classifier
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1,
                                                    random_state=42)
clf = GradientBoostingClassifier()
clf.fit(x_train, y_train)
print("GradientBoostingClassifier")
print(clf.score(x_test, y_test))
print("\n")

# Trying other classifiers
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1,
                                                    random_state=42)
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was: 0.9822134387351777
exported_pipeline = make_pipeline(
    RBFSampler(gamma=0.30000000000000004),
    GradientBoostingClassifier(learning_rate=0.01, max_depth=10,
                               max_features=0.7000000000000001,
                               min_samples_leaf=11, min_samples_split=11,
                               n_estimators=100, subsample=0.3))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Same for test
x_test, y_test = test_df[['Message', 'Status']].values.T
y_test = y_test.astype('int')
x_test = x_test.astype('str')
print(x_test.shape)

vect = CountVectorizer(min_df=2, ngram_range=(2, 2))
X_train = vect.fit(x_train).transform(x_train)
print(X_train[1].toarray())
X_test = vect.transform(x_test)
print('Len of vocabulary is {0}'.format(len(vect.vocabulary_)))
print(len(vect.get_feature_names()))

param_grid = {'n_estimators': [200, 100, 50],
              'max_depth': [5, 6, 7, 8],
              'min_samples_leaf': [10, 50, 100],
              'max_features': ['sqrt', 'log2']}
grid = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5)
grid.fit(X_train, y_train)
clf = grid.best_estimator_
print(grid.best_params_)
clf.fit(X_train, y_train)

# For test set
y_test_pred = clf.predict(X_test)
recall = recall_score(y_true=y_test, y_pred=y_test_pred)
precision = precision_score(y_true=y_test, y_pred=y_test_pred)
print('Recall of test set: {0}'.format(recall))
print('Precision of test set: {0}'.format(precision))

# For training set
y_train_pred = clf.predict(X_train)
recall = recall_score(y_true=y_train, y_pred=y_train_pred)
def create_and_save_model(X, y):
    clf1 = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5,
                         min_child_weight=1, gamma=0, subsample=0.8,
                         colsample_bytree=0.8, objective='binary:logistic',
                         nthread=4, scale_pos_weight=0.2, seed=27,
                         reg_alpha=0.4, reg_lambda=1,
                         early_stopping_rounds=50, show_progress=True)

    clf2 = AdaBoostClassifier(n_estimators=150)

    # initialize the base classifier
    base_cls = DecisionTreeClassifier()
    # no. of base classifiers
    num_trees = 200
    # bagging classifier
    clf3 = BaggingClassifier(base_estimator=base_cls, n_estimators=num_trees,
                             random_state=8, n_jobs=-1)

    clf4 = RandomForestClassifier(bootstrap=True, class_weight={0: 2.5, 1: 1},
                                  criterion='entropy', max_depth=60,
                                  max_features="auto", max_leaf_nodes=50,
                                  min_impurity_decrease=0.0,
                                  min_impurity_split=None,
                                  min_samples_leaf=5, min_samples_split=6,
                                  min_weight_fraction_leaf=0.0,
                                  n_estimators=200, n_jobs=-1, oob_score=True,
                                  random_state=10, verbose=1,
                                  warm_start=False)

    params = {'n_estimators': 200, 'max_depth': 20, 'subsample': 0.6,
              'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 3,
              'loss': 'exponential', 'max_features': 'auto', 'verbose': 1}
    # 'ccp_alpha': 0.04
    clf5 = GradientBoostingClassifier(**params)

    estimators = [('xgb', clf1), ('abc', clf2), ('bc', clf3), ('rf', clf4),
                  ('gbc', clf5)]
    stack_estimator = XGBClassifier(learning_rate=0.1, n_estimators=300,
                                    max_depth=5, min_child_weight=1, gamma=0,
                                    subsample=0.8, colsample_bytree=0.8,
                                    objective='binary:logistic', nthread=40,
                                    scale_pos_weight=1, seed=27, reg_alpha=0,
                                    reg_lambda=1, early_stopping_rounds=50,
                                    show_progress=True)

    model = StackingClassifier(estimators=estimators,
                               final_estimator=stack_estimator, n_jobs=-1,
                               cv=5, verbose=1)
    model.fit(X, y)

    file_name = 'model_final.pkl'
    joblib.dump(model, file_name)
    return file_name
def get_classifier(method, mode, max_features=None, n_estimators=None,
                   learning_rate=None, random_state=None,
                   min_cases_for_training=30, max_depth=None, subsample=None,
                   colsample_bytree=None):
    if method == "xgb" and mode == "regr":
        return ClassifierWrapper(
            cls=XGBRegressor(n_estimators=n_estimators,
                             learning_rate=learning_rate,
                             subsample=subsample,
                             max_depth=max_depth,
                             colsample_bytree=colsample_bytree,
                             n_jobs=-1,
                             random_state=random_state),
            min_cases_for_training=min_cases_for_training,
            mode=mode)
    elif method == "xgb" and mode == "class":
        return ClassifierWrapper(
            cls=XGBClassifier(n_estimators=n_estimators,
                              learning_rate=learning_rate,
                              subsample=subsample,
                              max_depth=max_depth,
                              colsample_bytree=colsample_bytree,
                              n_jobs=-1,
                              random_state=random_state),
            min_cases_for_training=min_cases_for_training,
            mode=mode)
    elif method == "rf" and mode == "regr":
        return ClassifierWrapper(
            cls=RandomForestRegressor(n_estimators=n_estimators,
                                      max_features=max_features,
                                      n_jobs=-1,
                                      random_state=random_state),
            min_cases_for_training=min_cases_for_training,
            mode=mode)
    elif method == "rf" and mode == "class":
        return ClassifierWrapper(
            cls=RandomForestClassifier(n_estimators=n_estimators,
                                       max_features=max_features,
                                       n_jobs=-1,
                                       random_state=random_state),
            min_cases_for_training=min_cases_for_training,
            mode=mode)
    elif method == "gbm" and mode == "regr":
        return ClassifierWrapper(
            cls=GradientBoostingRegressor(n_estimators=n_estimators,
                                          max_features=max_features,
                                          learning_rate=learning_rate,
                                          random_state=random_state),
            min_cases_for_training=min_cases_for_training,
            mode=mode)
    elif method == "gbm" and mode == "class":
        return ClassifierWrapper(
            cls=GradientBoostingClassifier(n_estimators=n_estimators,
                                           max_features=max_features,
                                           learning_rate=learning_rate,
                                           random_state=random_state),
            min_cases_for_training=min_cases_for_training,
            mode=mode)
    elif method == "dt" and mode == "regr":
        return ClassifierWrapper(
            cls=DecisionTreeRegressor(max_depth=max_depth,
                                      max_features=max_features,
                                      random_state=random_state),
            min_cases_for_training=min_cases_for_training,
            mode=mode)
    elif method == "dt" and mode == "class":
        return ClassifierWrapper(
            cls=DecisionTreeClassifier(max_depth=max_depth,
                                       max_features=max_features,
                                       random_state=random_state),
            min_cases_for_training=min_cases_for_training,
            mode=mode)
    else:
        print("Invalid classifier type")
        return None
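# A minimal usage sketch for the dispatcher above (hedged: ClassifierWrapper
# is defined elsewhere in the original project; parameter values here are
# illustrative only).
wrapper = get_classifier("gbm", "class", n_estimators=100, learning_rate=0.1,
                         random_state=22, min_cases_for_training=30)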
} """ paramDist = {'n_estimators': sp_randint(50,100), # 'criterion': ['gini'], 'max_features':['auto'], 'max_depth': scipy.stats.expon(scale=5), # 'min_samples_split':scipy.stats.expon(scale=2), 'min_samples_leaf':scipy.stats.expon(scale=1)} """ paramReg = {'penalty': ['l2'], 'C': [0.1, 0.01, 0.001, 1]} paramSVC = {'kernel': ['rbf'], 'C': [0.1, 0.01, 0.001, 1]} Rforest = RandomForestClassifier(class_weight='subsample') Gradboost = GradientBoostingClassifier() LogReg = LogisticRegression(class_weight='auto') SVMCl = SVC(class_weight='auto') metric = roc_auc_score grid_search = GridSearchCV(SVMCl, cv=3, param_grid=paramSVC, n_jobs=4, pre_dispatch='1*n_jobs', scoring='precision') grid_search = GridSearchCV(LogReg, cv=3, param_grid=paramReg, n_jobs=4, pre_dispatch='1*n_jobs',
# Data
X = pd.read_csv("X_feat_sel.csv")
y = pd.read_csv("y.csv", header=None, names=['y'])

#########################################################
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
#########################################################

# tune tree
gbX = GradientBoostingClassifier()
tree_grid = {'max_depth': np.arange(1, 12, 1),
             'min_samples_split': [2, 10, 25, 50, 100, 150, 250],
             'min_samples_leaf': [1, 5, 10, 25, 50, 100, 150, 250],
             'max_leaf_nodes': [None, 5, 10, 20, 40, 80],
             'max_features': np.arange(2, 10),
             'learning_rate': 10 ** np.arange(-2, 1, dtype=float),
             'n_estimators': [10, 25, 50, 100, 200]}

gsX = RandomizedSearchCV(gbX, tree_grid, cv=2, n_iter=200)
gsX.fit(X, y['y'].values.ravel())  # the search must be fitted before results exist
gsX.cv_results_, gsX.best_params_, gsX.best_score_  # grid_scores_ was removed in sklearn 0.20
#########################################################
                     height).astype(int).transpose(1, 0)
RF_predict_prob = RF.predict_proba(data_all)

# Post-processing using Graph-Cut
Seg_Label, seg_accuracy = Post_Processing(RF_predict_prob, height, width,
                                          num_classes, y_test, test_indexes)
print('(Random Forest) Train_Acc=%.3f, Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'
      % (RF.score(X_train, y_train), RF.score(X_test, y_test),
         seg_accuracy, (time.time() - start_time)))

# draw classification map
draw(GT_Label, RF_Label, Seg_Label, train_map, test_map)
print('--------------------------------------------------------------------')

# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
start_time = time.time()
GBC = GradientBoostingClassifier(n_estimators=300,
                                 learning_rate=0.1).fit(X_train, y_train)
GBC_Label = GBC.predict(data_all).reshape(width,
                                          height).astype(int).transpose(1, 0)
GBC_predict_prob = GBC.predict_proba(data_all)

# Post-processing using Graph-Cut
Seg_Label, seg_accuracy = Post_Processing(GBC_predict_prob, height, width,
                                          num_classes, y_test, test_indexes)
print('(Gradient Boosting) Train_Acc=%.3f, Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'
      % (GBC.score(X_train, y_train), GBC.score(X_test, y_test),
         seg_accuracy, (time.time() - start_time)))

# draw classification map
draw(GT_Label, GBC_Label, Seg_Label, train_map, test_map)
print('--------------------------------------------------------------------')

# Neural Network - MLP
from sklearn.neural_network import MLPClassifier
## TODO: Add any additional arguments that you will need to pass into your model

# args holds all passed-in arguments
args = parser.parse_args()

# Read in csv training file
training_dir = args.data_dir
train_data = pd.read_csv(os.path.join(training_dir, "train.csv"),
                         header=None, names=None)

# Labels are in the first column
train_y = train_data.iloc[:, 0]
train_X = train_data.iloc[:, 1:]

## --- Your code here --- ##

## Define a model
model = GradientBoostingClassifier()

## Train the model
model.fit(train_X, train_y)

## --- End of your code --- ##

# Save the trained model
joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
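# A companion sketch (an assumption based on the standard SageMaker
# scikit-learn convention, not shown in the original file): the serving
# container looks for a model_fn that reloads the persisted estimator.
import os
import joblib

def model_fn(model_dir):
    """Load the fitted model back from model_dir for inference."""
    return joblib.load(os.path.join(model_dir, "model.joblib"))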
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,
                                                    random_state=state)

# Passing different learning rates to find best learning_rate.
from sklearn.ensemble import GradientBoostingClassifier
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=10,
                                        learning_rate=learning_rate,
                                        max_features=2, max_depth=2,
                                        random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
        gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        gb_clf.score(X_test, y_test)))

from sklearn.metrics import classification_report, confusion_matrix

gb_clf2 = GradientBoostingClassifier(n_estimators=10, learning_rate=0.1,
                                     max_features=2,
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.ensemble import GradientBoostingClassifier

malData = pd.read_csv("C:/Users/parsh/Desktop/MalwareData.csv", sep="|")
begn = malData[0:41323].drop(["legitimate"], axis=1)
mal = malData[41323::].drop(["legitimate"], axis=1)

data_in = malData.drop(['Name', 'md5', 'legitimate'], axis=1).values
labels = malData['legitimate'].values

extratrees = ExtraTreesClassifier().fit(data_in, labels)
select = SelectFromModel(extratrees, prefit=True)
data_new = select.transform(data_in)

# train_test_split returns (X_train, X_test, y_train, y_test); the original
# names (begn_train, mal_train, ...) mislabelled the feature/label splits.
X_train, X_test, y_train, y_test = train_test_split(data_new, labels,
                                                    test_size=0.2)

grad_boost = GradientBoostingClassifier(n_estimators=50)
grad_boost.fit(X_train, y_train)

print(grad_boost.score(X_test, y_test) * 100)
# ensemble models
models = {}
print("Training on all features")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=1010)

models['RFC'] = RandomForestClassifier(n_estimators=300)
models['XGB'] = xgb.XGBClassifier(max_depth=3, n_estimators=300,
                                  learning_rate=0.05)
models['GBC'] = GradientBoostingClassifier()
models['ABC'] = AdaBoostClassifier()
models['ETC'] = ExtraTreesClassifier()

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name)
    print(classification_report(y_test, model.predict(X_test)))
    print("Accuracy: ", accuracy_score(y_test, model.predict(X_test)))
    print('\n')

feature_importances = pd.DataFrame()
for name, model in models.items():
    df = pd.DataFrame(data=model.feature_importances_, index=X_test.columns,
def gradient_boosting_classifier(train_x, train_y):
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=200)
    model.fit(train_x, train_y)
    return model
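# A minimal usage sketch for the helper above (toy data assumed for
# illustration only):
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=100, random_state=0)
model = gradient_boosting_classifier(X[:80], y[:80])
print(model.predict(X[80:]))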
def predefined_estimators(estimator, random_state, n_jobs, p):
    """
    Provides the classifiers and parameters used by the module

    Parameters
    -----------
    estimator : str
        Name of scikit-learn estimator.

    random_state : Any number
        Seed to use in randomized components.

    n_jobs : int
        Number of processing cores to use.

    p : dict
        Classifier settings (keys) and values.

    Returns
    -------
    clf : object
        Scikit-learn classifier object

    mode : str
        Flag to indicate whether classifier performs classification or
        regression.
    """
    try:
        from sklearn.experimental import enable_hist_gradient_boosting
    except ImportError:
        pass

    from sklearn.linear_model import (
        LogisticRegression,
        LinearRegression,
        SGDRegressor,
        SGDClassifier,
    )
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn.ensemble import (
        RandomForestClassifier,
        RandomForestRegressor,
        ExtraTreesClassifier,
        ExtraTreesRegressor,
    )
    from sklearn.ensemble import (GradientBoostingClassifier,
                                  GradientBoostingRegressor)
    from sklearn.svm import SVC, SVR
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.neural_network import MLPClassifier, MLPRegressor

    estimators = {
        "SVC": SVC(C=p["C"], probability=True, random_state=random_state),
        "SVR": SVR(C=p["C"], epsilon=p["epsilon"]),
        "LogisticRegression": LogisticRegression(
            C=p["C"],
            solver="liblinear",
            random_state=random_state,
            multi_class="auto",
            n_jobs=1,
            fit_intercept=True,
        ),
        "LinearRegression": LinearRegression(n_jobs=n_jobs,
                                             fit_intercept=True),
        "SGDClassifier": SGDClassifier(
            penalty=p["penalty"],
            alpha=p["alpha"],
            l1_ratio=p["l1_ratio"],
            n_jobs=n_jobs,
            random_state=random_state,
        ),
        "SGDRegressor": SGDRegressor(
            penalty=p["penalty"],
            alpha=p["alpha"],
            l1_ratio=p["l1_ratio"],
            random_state=random_state,
        ),
        "DecisionTreeClassifier": DecisionTreeClassifier(
            max_depth=p["max_depth"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
        ),
        "DecisionTreeRegressor": DecisionTreeRegressor(
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
        ),
        "RandomForestClassifier": RandomForestClassifier(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "RandomForestRegressor": RandomForestRegressor(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "ExtraTreesClassifier": ExtraTreesClassifier(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            bootstrap=True,
            oob_score=True,
        ),
        "ExtraTreesRegressor": ExtraTreesRegressor(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            bootstrap=True,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "GradientBoostingClassifier": GradientBoostingClassifier(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "GradientBoostingRegressor": GradientBoostingRegressor(
            learning_rate=p["learning_rate"],
n_estimators=p["n_estimators"], max_depth=p["max_depth"], min_samples_leaf=p["min_samples_leaf"], subsample=p["subsample"], max_features=p["max_features"], random_state=random_state, ), "HistGradientBoostingClassifier": GradientBoostingClassifier( learning_rate=p["learning_rate"], n_estimators=p["n_estimators"], max_depth=p["max_depth"], min_samples_leaf=p["min_samples_leaf"], subsample=p["subsample"], max_features=p["max_features"], random_state=random_state, ), "HistGradientBoostingRegressor": GradientBoostingRegressor( learning_rate=p["learning_rate"], n_estimators=p["n_estimators"], max_depth=p["max_depth"], min_samples_leaf=p["min_samples_leaf"], subsample=p["subsample"], max_features=p["max_features"], random_state=random_state, ), "MLPClassifier": MLPClassifier( hidden_layer_sizes=p["hidden_layer_sizes"], alpha=p["alpha"], random_state=random_state, ), "MLPRegressor": MLPRegressor( hidden_layer_sizes=p["hidden_layer_sizes"], alpha=p["alpha"], random_state=random_state, ), "GaussianNB": GaussianNB(), "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(), "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(), "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=p["n_neighbors"], weights=p["weights"], n_jobs=n_jobs), "KNeighborsRegressor": KNeighborsRegressor(n_neighbors=p["n_neighbors"], weights=p["weights"], n_jobs=n_jobs), } # define classifier model = estimators[estimator] # classification or regression if (estimator == "LogisticRegression" or estimator == "SGDClassifier" or estimator == "MLPClassifier" or estimator == "DecisionTreeClassifier" or estimator == "RandomForestClassifier" or estimator == "ExtraTreesClassifier" or estimator == "GradientBoostingClassifier" or estimator == "HistGradientBoostingClassifier" or estimator == "GaussianNB" or estimator == "LinearDiscriminantAnalysis" or estimator == "QuadraticDiscriminantAnalysis" or estimator == "SVC" or estimator == "KNeighborsClassifier"): mode = "classification" else: mode = "regression" return (model, mode)
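A hedged usage sketch for predefined_estimators. Because the estimators dictionary is built eagerly, p must supply every key the dictionary references; the values below are illustrative assumptions, not the module's defaults.

# Usage sketch (parameter values are illustrative assumptions).
hyperparams = {
    "C": 1.0,
    "epsilon": 0.1,
    "penalty": "l2",
    "alpha": 1e-4,
    "l1_ratio": 0.15,
    "max_depth": 3,
    "max_features": "sqrt",
    "min_samples_leaf": 1,
    "n_estimators": 100,
    "learning_rate": 0.1,
    "subsample": 1.0,
    "hidden_layer_sizes": (100,),
    "n_neighbors": 5,
    "weights": "uniform",
}

model, mode = predefined_estimators(
    "GradientBoostingClassifier", random_state=42, n_jobs=1, p=hyperparams
)
print(mode)  # -> "classification"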
import numpy as np
import pandas as pd
from sklearn.ensemble import (ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from tpot.builtins import StackingEstimator

# Imports and data loading restored from the standard TPOT export template;
# replace the placeholder path and separator with real values.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.9940458797222709
exported_pipeline = make_pipeline(
    RFE(estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.55,
                                       n_estimators=100),
        step=0.15000000000000002),
    StackingEstimator(
        estimator=GradientBoostingClassifier(learning_rate=1.0, max_depth=1,
                                             max_features=0.8500000000000001,
                                             min_samples_leaf=6,
                                             min_samples_split=5,
                                             n_estimators=100,
                                             subsample=0.25)),
    StackingEstimator(
        # solver="liblinear" added: newer scikit-learn defaults to lbfgs,
        # which does not support penalty="l1"
        estimator=LogisticRegression(C=1.0, dual=False, penalty="l1",
                                     solver="liblinear")),
    StackingEstimator(
        estimator=RandomForestClassifier(bootstrap=False, criterion="gini",
                                         max_features=0.7500000000000001,
                                         min_samples_leaf=14,
                                         min_samples_split=14,
                                         n_estimators=100)),
    KNeighborsClassifier(n_neighbors=2, p=2, weights="distance"))

exported_pipeline.fit(training_features, training_target)
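TPOT's exported scripts normally end by predicting on the held-out split; a minimal continuation of the snippet above (the accuracy printout is an added illustration):

results = exported_pipeline.predict(testing_features)
print("held-out accuracy: {:.3f}".format(
    exported_pipeline.score(testing_features, testing_target)))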
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.classifier import StackingClassifier, EnsembleVoteClassifier
# sklearn.cross_validation was removed in scikit-learn 0.20;
# KFold now lives in model_selection
from sklearn.model_selection import KFold

clf1 = KNeighborsClassifier(4)
clf2 = DecisionTreeClassifier(criterion="gini")
clf3 = LogisticRegression()
lr = LogisticRegression()
gb = GradientBoostingClassifier()

classifiers = [
    # StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr),
    # EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], voting='soft', verbose=0),
    # SVC(kernel="linear", C=0.025),
    ExtraTreesClassifier(
        n_estimators=150,
        criterion="entropy",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features="sqrt",  # "auto" was removed in scikit-learn 1.3
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,  # replaces the removed min_impurity_split=1e-7
        bootstrap=False,
        oob_score=False,
        n_jobs=1,
        random_state=410,
        verbose=0,
        warm_start=False,
        class_weight=None,
    ),
    RandomForestClassifier(
        bootstrap=True,
        class_weight=None,
        criterion="gini",
        max_depth=None,
        max_features="sqrt",
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        min_samples_leaf=1,
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        n_estimators=70,
        n_jobs=1,
        oob_score=True,
        random_state=410,
        # the source snippet breaks off after random_state=410; the call
        # and list are closed here so the block parses
    ),
]
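The snippet imports KFold but its evaluation loop is lost; below is a minimal sketch of how the classifiers list might be scored. The synthetic data from make_classification and the loop structure are assumptions.

# Evaluation sketch (assumption: synthetic data stands in for the real
# features and labels the original loop used).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=1000, n_features=20, random_state=410)
kfold = KFold(n_splits=5, shuffle=True, random_state=410)

for clf in classifiers:
    scores = cross_val_score(clf, X, y, cv=kfold)
    print("{}: {:.3f} +/- {:.3f}".format(
        type(clf).__name__, scores.mean(), np.std(scores)))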