def test_zero_estimator_clf():
    # Test if ZeroEstimator works for classification.
    X = iris.data
    y = np.array(iris.target)

    est = GradientBoostingClassifier(n_estimators=20, max_depth=1,
                                     random_state=1, init=ZeroEstimator())
    est.fit(X, y)
    assert_greater(est.score(X, y), 0.96)

    est = GradientBoostingClassifier(n_estimators=20, max_depth=1,
                                     random_state=1, init='zero')
    est.fit(X, y)
    assert_greater(est.score(X, y), 0.96)

    # binary clf
    mask = y != 0
    y[mask] = 1
    y[~mask] = 0

    est = GradientBoostingClassifier(n_estimators=20, max_depth=1,
                                     random_state=1, init='zero')
    est.fit(X, y)
    assert_greater(est.score(X, y), 0.96)

    est = GradientBoostingClassifier(n_estimators=20, max_depth=1,
                                     random_state=1, init='foobar')
    assert_raises(ValueError, est.fit, X, y)
def run_gradient_boosting_classifier(data, _max_depth):
    (feature_train, feature_test,
     label_train, label_test) = train_test_split(data[:, 0:-1],
                                                 data[:, -1].astype(int),
                                                 test_size=0.25)
    # TODO: Vary Number of Estimators and Learning Rate
    gbc = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50,
                                     max_depth=_max_depth, verbose=True)
    gbc.fit(feature_train, label_train)
    training_error = gbc.score(feature_train, label_train)
    # cross_validation_score = cross_val_score(gbc, feature_train, label_train, cv=10)
    testing_error = gbc.score(feature_test, label_test)

    print "Gradient Boosting Results for Max Depth:", _max_depth
    print "Training Accuracy:", training_error
    # print "10-fold Cross Validation Accuracy: %0.2f (+/- %0.2f)" % (cross_validation_score.mean(), cross_validation_score.std() * 2)
    print "Testing Accuracy:", testing_error

    feature_importance = gbc.feature_importances_
    stddev = np.std([tree[0].feature_importances_ for tree in gbc.estimators_], axis=0)
    indices = np.argsort(feature_importance)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(len(feature_importance)):
        print("%d. feature %d (%f)" % (f + 1, indices[f], feature_importance[indices[f]]))

    plot_feature_importance(feature_importance, indices, stddev,
                            "gradient-boosted-classifier-feature-importance-depth-" + str(_max_depth))
def test_classification_synthetic():
    # Test GradientBoostingClassifier on synthetic dataset used by
    # Hastie et al. in ESLII Example 12.7.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    for loss in ('deviance', 'exponential'):
        gbrt = GradientBoostingClassifier(n_estimators=100,
                                          min_samples_split=1,
                                          max_depth=1, loss=loss,
                                          learning_rate=1.0, random_state=0)
        gbrt.fit(X_train, y_train)
        error_rate = (1.0 - gbrt.score(X_test, y_test))
        assert error_rate < 0.09, \
            "GB(loss={}) failed with error {}".format(loss, error_rate)

        gbrt = GradientBoostingClassifier(n_estimators=200,
                                          min_samples_split=1,
                                          max_depth=1, loss=loss,
                                          learning_rate=1.0, subsample=0.5,
                                          random_state=0)
        gbrt.fit(X_train, y_train)
        error_rate = (1.0 - gbrt.score(X_test, y_test))
        assert error_rate < 0.08, \
            ("Stochastic GradientBoostingClassifier(loss={}) "
             "failed with error {}".format(loss, error_rate))
def TestGradBoost(dat, lab):
    '''
    This function finds the optimal parameters for the classifier

    Parameters:
    -----------
    dat: numpy array with all records
    lab: numpy array with class labels of all records

    Returns:
    --------
    par: optimal parameters for the classifier
    '''

    # Gradient Boost parameters. Will choose one based on which does best
    # on the validation set: learning_rate, subsample
    lr = np.linspace(0.01, 0.2, num=5)
    sub = np.linspace(0.1, 1.0, num=5)
    par = [(e, f) for e in lr for f in sub]

    # want to try different ensembles to get error bar on score
    num = 10
    seed = np.random.randint(1000000, size=num)

    valScore = np.zeros((num, len(par)))
    testScore = np.zeros((num, len(par)))
    for nv in xrange(0, num):
        print 'Ensemble:', nv + 1
        # split training data into train, validation, test (60, 20, 20)
        xTrain, xTmp, yTrain, yTmp = cross_validation.train_test_split(
            dat, lab, test_size=0.4, random_state=seed[nv])
        xVal, xTest, yVal, yTest = cross_validation.train_test_split(
            xTmp, yTmp, test_size=0.5, random_state=seed[nv])

        # now train a gradient-boosting model for each parameter combination
        for i in xrange(0, len(par)):
            clf = GradientBoostingClassifier(learning_rate=par[i][0],
                                             subsample=par[i][1])
            clf = clf.fit(xTrain, yTrain)
            valScore[nv, i] = clf.score(xVal, yVal)
            testScore[nv, i] = clf.score(xTest, yTest)

    # Find optimal parameters
    tmp = np.argmax(np.mean(valScore, axis=0))
    print
    print 'Optimal parameters (learning rate, subsampling):', par[tmp]
    print ('Mean | Std Score (Validation set):',
           np.mean(valScore, axis=0)[tmp], '|', np.std(valScore, axis=0)[tmp])
    print ('Mean | Std Score (Test set):',
           np.mean(testScore, axis=0)[tmp], '|', np.std(testScore, axis=0)[tmp])

    # Return optimal parameters
    return par[tmp]
def plotLearningCurve(dat, lab, optim):
    '''
    This function plots the learning curve for the classifier

    Parameters:
    -----------
    dat: numpy array with all records
    lab: numpy array with class labels of all records
    optim: optimal parameters for classifier
    '''
    clf = GradientBoostingClassifier(learning_rate=optim[0], subsample=optim[1])

    # split training data into train and test (already chose optimal parameters)
    xTrain, xTest, yTrain, yTest = cross_validation.train_test_split(
        dat, lab, test_size=0.3)

    # choose various sizes of training set to model on to generate learning curve
    szV = range(10, np.shape(xTrain)[0], int(np.shape(xTrain)[0] / 10))
    szV.append(np.shape(xTrain)[0])

    LCvals = np.zeros((len(szV), 3), dtype=np.float64)  # store data points of learning curve
    for i in xrange(0, len(szV)):
        clf = clf.fit(xTrain[:szV[i], :], yTrain[:szV[i]])
        LCvals[i, 0] = szV[i]
        LCvals[i, 1] = clf.score(xTest, yTest)
        LCvals[i, 2] = clf.score(xTrain[:szV[i], :], yTrain[:szV[i]])
    # print LCvals

    # generate figure
    fig = plt.figure(1, figsize=(10, 10))
    prop = matplotlib.font_manager.FontProperties(size=15.5)
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(LCvals[:, 0] / np.float64(np.shape(xTrain)[0]),
            1.0 - LCvals[:, 1], label='Test Set')
    ax.plot(LCvals[:, 0] / np.float64(np.shape(xTrain)[0]),
            1.0 - LCvals[:, 2], label='Training Set')
    ax.set_ylabel(r"Error", fontsize=20)
    ax.set_xlabel(r"% of Training Set Used", fontsize=20)
    ax.axis([0.0, 1.0, -0.1, 0.5])
    plt.legend(loc='upper right', prop=prop)
    plt.savefig('LC_GB.pdf', bbox_inches='tight')
    fig.clear()

    # where is model failing?
    predProb = clf.predict_proba(xTest)
    tmp = np.zeros((np.shape(predProb)[0], np.shape(predProb)[1] + 2))
    tmp[:, :-2] = predProb
    tmp[:, -2] = clf.predict(xTest)
    tmp[:, -1] = yTest
    mask = tmp[:, -2] != tmp[:, -1]
    print tmp[mask]
    print mask.sum(), len(xTest)
    print tmp[:50, :]
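# A more compact alternative to the hand-rolled loop above is scikit-learn's
# learning_curve utility, which handles the size sweep and cross-validation
# itself. This is a minimal sketch, not part of the original script; it assumes
# the same dat/lab arrays and the tuned `optim` tuple returned by TestGradBoost.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import learning_curve

def plotLearningCurveBuiltin(dat, lab, optim):
    clf = GradientBoostingClassifier(learning_rate=optim[0], subsample=optim[1])
    # Evaluate at 10 training-set sizes with 5-fold cross-validation.
    sizes, train_scores, val_scores = learning_curve(
        clf, dat, lab, train_sizes=np.linspace(0.1, 1.0, 10), cv=5)
    plt.plot(sizes, 1.0 - val_scores.mean(axis=1), label='Validation Set')
    plt.plot(sizes, 1.0 - train_scores.mean(axis=1), label='Training Set')
    plt.xlabel('Training examples')
    plt.ylabel('Error')
    plt.legend(loc='upper right')
    plt.savefig('LC_GB_builtin.pdf', bbox_inches='tight')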
def trainAndPredict(num_trees, train_num):
    train_X = X[:train_num]
    train_y = y[:train_num]
    test_X = X[train_num:]
    test_y = y[train_num:]
    # clf = svm.SVC()
    clf = GradientBoostingClassifier(n_estimators=num_trees, learning_rate=0.5,
                                     max_depth=2, random_state=0)
    clf.fit(train_X, train_y)
    return (clf.score(train_X, train_y), clf.score(test_X, test_y))
def main():
    # generate synthetic binary classification data
    # (name refers to example 10.2 in ESL textbook...see refs below)
    X, y = make_hastie_10_2()

    # perform train/test split (no need to shuffle)
    split_pt = int(TRAIN_PCT * len(X))
    X_train, X_test = X[:split_pt], X[split_pt:]
    y_train, y_test = y[:split_pt], y[split_pt:]

    # single dec stump
    stump_clf = DecisionTreeClassifier(max_depth=1)
    stump_clf.fit(X_train, y_train)
    stump_score = round(stump_clf.score(X_test, y_test), 3)
    print 'decision stump acc = {}\t(max_depth = 1)'.format(stump_score)

    # single dec tree (max_depth=3)
    tree_clf = DecisionTreeClassifier(max_depth=3)
    tree_clf.fit(X_train, y_train)
    tree_score = round(tree_clf.score(X_test, y_test), 3)
    print 'decision tree acc = {}\t(max_depth = 3)\n'.format(tree_score)

    # gbt: a powerful ensemble technique
    gbt_scores = list()
    for k in (10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500):
        print 'fitting gbt for n_estimators = {}...'.format(k)
        gbt_clf = GradientBoostingClassifier(
            n_estimators=k,     # number of weak learners for this iteration
            max_depth=1,        # weak learners are dec stumps
            learning_rate=1.0)  # regularization (shrinkage) hyperparam
        gbt_clf.fit(X_train, y_train)
        gbt_scores.append(round(gbt_clf.score(X_test, y_test), 3))
    print '\ngbt accuracy =\n{}\n'.format(gbt_scores)

    # stochastic gbt (using subsampling)
    sgbt_scores = list()
    for k in (10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500):
        print 'fitting sgbt for n_estimators = {}...'.format(k)
        sgbt_clf = GradientBoostingClassifier(
            n_estimators=k,     # number of weak learners for this iteration
            max_depth=1,        # weak learners are dec stumps
            subsample=0.5,      # % of training set used by each bc
            learning_rate=1.0)  # regularization (shrinkage) hyperparam
        sgbt_clf.fit(X_train, y_train)
        sgbt_scores.append(round(sgbt_clf.score(X_test, y_test), 3))
    print '\nsgbt accuracy =\n{}'.format(sgbt_scores)
def l1_penalty_solver(train_data, test_data, n_est, m_d):
    best = 0.0
    best_Output = []
    for j in [10**(x) for x in xrange(-3, -2, 1)]:
        X, y = train_data[:, 1::], train_data[:, 0]
        x1, y1 = test_data[:, 1::], test_data[:, 0]
        # Set regularization parameter
        for C in range(10, 11, 1):
            # turn down tolerance for short training time
            # cls = svm.SVC(kernel='poly', degree=3).fit(X, y)
            cls = GradientBoostingClassifier(n_estimators=n_est, max_depth=m_d).fit(X, y)
            # cls = DecisionTreeClassifier().fit(X, y)
            # cls = LogisticRegression(C=C, penalty='l1', tol=j).fit(X, y)
            # cls = LogisticRegression(C=C, penalty='l2', tol=j).fit(X, y)
            val1 = cls.predict(x1)
            val2 = val1  # cls.predict(x1)

            count = 0.
            for i in range(len(val1)):
                if val1[i] == y1[i]:
                    count += 1.
                else:
                    continue
            result1 = count / len(val1)

            count = 0.
            for i in range(len(val2)):
                if val2[i] == y1[i]:
                    count += 1.
                else:
                    continue
            result2 = count / len(val2)

            if result1 > best:
                best = result1
                best_Output = val1
            if result2 > best:
                best = result2
                best_Output = val2

    pr.print_results(best_Output)
    # return best
    return [cls.score(X, y), cls.score(x1, y1)]
def test_iris():
    """Check consistency on dataset iris."""
    for subsample in (1.0, 0.5):
        clf = GradientBoostingClassifier(n_estimators=100, loss="deviance",
                                         random_state=1, subsample=subsample)
        clf.fit(iris.data, iris.target)
        score = clf.score(iris.data, iris.target)
        assert score > 0.9, ("Failed with subsample %.1f "
                             "and score = %f" % (subsample, score))
def gbPredict(LOSS, N_EST, L_RATE, M_DEPT, SUB_S, W_START, N_FOLD, EX_F,
              TRAIN_DATA_X, TRAIN_DATA_Y, TEST__DATA_X, isProb):
    # feature extraction
    clf = GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE,
                                     max_depth=M_DEPT, subsample=SUB_S,
                                     warm_start=W_START).fit(TRAIN_DATA_X, TRAIN_DATA_Y)
    extA = delFeatMin(clf.feature_importances_, EX_F)
    TRAIN_DATA_X = TRAIN_DATA_X[:, extA]

    # k-fold validation
    kf = KFold(TRAIN_DATA_Y.shape[0], n_folds=N_FOLD)
    tesV = 0.0
    for train_index, test_index in kf:
        X_train, X_test = TRAIN_DATA_X[train_index], TRAIN_DATA_X[test_index]
        y_train, y_test = TRAIN_DATA_Y[train_index], TRAIN_DATA_Y[test_index]
        clf = GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE,
                                         max_depth=M_DEPT, subsample=SUB_S,
                                         warm_start=W_START).fit(X_train, y_train)
        tesK = 1 - clf.score(X_test, y_test)
        tesV += tesK
    eVal = tesV / N_FOLD

    # train on all data
    clf = GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE,
                                     max_depth=M_DEPT, subsample=SUB_S,
                                     warm_start=W_START).fit(TRAIN_DATA_X, TRAIN_DATA_Y)
    TEST__DATA_X = TEST__DATA_X[:, extA]
    if isProb:
        data = clf.predict_proba(TEST__DATA_X)
    else:
        data = clf.predict(TEST__DATA_X)
    print "Eval =", eVal, "with n_esti =", N_EST, "l_rate =", L_RATE, \
        "m_dep =", M_DEPT, "sub_s =", SUB_S, "ex_num =", EX_F, "and loss is", LOSS
    return (data, eVal)
def train_gbt(filename, color, name):
    '''Train a Gradient Boosted Trees classifier.'''
    # Read data
    data2 = pd.read_csv(filename, encoding="utf")
    X = data2.iloc[:, 1:-1]
    y = data2.iloc[:, -1]

    # Split into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,
                                                      random_state=42)

    # Define model
    clf1 = GradientBoostingClassifier(learning_rate=0.05, max_depth=5,
                                      random_state=42)

    # Fit model
    t0 = time()
    clf1.fit(X_train, y_train)
    pred_probas = clf1.predict_proba(X_val)
    predictions = clf1.predict(X_val)

    print "Score", clf1.score(X_val, y_val)

    importances = clf1.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Metrics & Plotting
    metrics[1, 0] = precision_score(y_val, predictions)
    metrics[1, 1] = recall_score(y_val, predictions)
    metrics[1, 2] = f1_score(y_val, predictions)
    metrics[1, 3] = time() - t0

    fpr_rf, tpr_rf, _ = roc_curve(y_val, predictions)
    plt.plot(fpr_rf, tpr_rf, color=color, label=name)

    return importances, indices
def classify(train, train_sample_ids, test_sample_ids, whichClassifier):
    feature_names = list(train.columns)
    feature_names.remove("click_bool")
    feature_names.remove("booking_bool")
    feature_names.remove("gross_bookings_usd")
    # feature_names.remove("date_time")
    feature_names.remove("position")

    # Create Train and Test
    trainX = train[feature_names][train_sample_ids]
    testX = train[feature_names][test_sample_ids]

    Y_columns = ["click_bool", "booking_bool", "position"]
    trainY = train[Y_columns][train_sample_ids].apply(
        lambda x: objective(x, whichClassifier), axis=1)
    testY = train[Y_columns][test_sample_ids].apply(
        lambda x: objective(x, whichClassifier), axis=1)
    print "Train: ", len(trainY)
    print "Test: ", len(testY)

    print("Training the Classifier")
    classifier = GradientBoostingClassifier(n_estimators=1024, verbose=3,
                                            subsample=0.8, min_samples_split=10,
                                            max_depth=6, random_state=1)
    classifier.fit(trainX, trainY)
    print "Score = ", classifier.score(testX, testY)
    return classifier
def GB_Classifier(X_train, X_cv, X_test, Y_train, Y_cv, Y_test, Actual_DS):
    print("***************Starting Gradient Boosting***************")
    t0 = time()
    clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.01)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv, Y_cv)
    print("Gradient Boosting - {0:.2f}%".format(100 * score))

    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv),
                          label_enc.inverse_transform(preds),
                          rownames=['actual'], colnames=['preds'])
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1) * 100
    print(Summary)

    # Check with log loss function
    epsilon = 1e-15
    # ll_output = log_loss_func(Y_cv, preds, epsilon)
    preds2 = clf.predict_proba(X_cv)
    ll_output2 = log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    # preds4 = clf.predict_proba((Actual_DS.ix[:, 'feat_1':]))
    preds4 = clf.predict_proba(Actual_DS)
    print("***************Ending Gradient Boosting***************")
    return pd.DataFrame(preds2), pd.DataFrame(preds3), pd.DataFrame(preds4)
def rand_forest_train(self):
    # Read local user feature data
    users = pd.read_csv('names.csv')

    # Use similarity, platform, reputation, and entropy as features for
    # telling humans and machines apart
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    y = users['human_or_machine']

    # Split the raw data, holding out 25% for testing
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=33)

    # Turn categorical features into feature vectors
    from sklearn.feature_extraction import DictVectorizer
    vec = DictVectorizer(sparse=False)
    X_train = vec.fit_transform(X_train.to_dict(orient='records'))
    X_test = vec.transform(X_test.to_dict(orient='records'))

    # Train and predict with a single decision tree
    from sklearn.tree import DecisionTreeClassifier
    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    dtc_y_pred = dtc.predict(X_test)

    # Train and predict with a random forest classifier
    from sklearn.ensemble import RandomForestClassifier
    rfc = RandomForestClassifier()
    rfc.fit(X_train, y_train)
    rfc_y_pred = rfc.predict(X_test)

    # Train and predict with gradient boosted decision trees
    from sklearn.ensemble import GradientBoostingClassifier
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    gbc_y_pred = gbc.predict(X_test)

    from sklearn.metrics import classification_report

    # Report the single decision tree's test-set accuracy, plus detailed
    # precision, recall, and F1 metrics
    print("Decision tree accuracy:", dtc.score(X_test, y_test))
    print(classification_report(dtc_y_pred, y_test))

    # Report the random forest's test-set accuracy, plus detailed
    # precision, recall, and F1 metrics
    print("Random forest accuracy:", rfc.score(X_test, y_test))
    print(classification_report(rfc_y_pred, y_test))

    # Report the gradient boosted trees' test-set accuracy, plus detailed
    # precision, recall, and F1 metrics
    print("Gradient boosted trees accuracy:", gbc.score(X_test, y_test))
    print(classification_report(gbc_y_pred, y_test))

    users = pd.read_csv('values.csv')

    # Check whether each record is a machine or a human
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    X = vec.transform(X.to_dict(orient='records'))
    print(rfc.predict(X))

    self.dtc = dtc
    self.rfc = rfc
    self.gbc = gbc
def main():
    train_f = pd.read_csv(train_path, header=0, parse_dates=['Dates'])
    print train_f.dtypes

    X, Y = get_feature(train_f, "training_set")

    ### TRAINING
    clf = GradientBoostingClassifier(n_estimators=50)
    # clf = RandomForestClassifier(n_estimators=2)
    # clf = LogisticRegression(n_jobs=4)

    X, Y = shuffle_XY(X, Y)
    data_len = len(X)
    train_len = data_len * 95 / 100
    val_len = data_len - train_len
    X_train = X[:train_len]
    X_val = X[train_len:]
    Y_train = Y[:train_len]
    Y_val = Y[train_len:]

    clf = clf.fit(X_train, Y_train)
    print "Training done"

    val_acc = clf.score(X_val, Y_val)
    print "Val acc:", val_acc

    val_pred = clf.predict_proba(X_val)
    # print max(Y_val), min(Y_val)
    # print Y_val, Y_val + 1

    val_log = 0.0
    cnt = 0
    for y in Y_val:
        val_log += math.log(val_pred[cnt, y] + 0.0000001)
        cnt += 1
    val_log = -val_log / len(Y_val)
    print "Val log loss:", val_log
    # print "Val loss:", log_loss(Y_val+1, val_pred)  # Note the +1 here!

    """
    # scores = cross_val_score(clf, X, Y)
    # print "Cross val acc:", scores.mean()
    """

    ### Testing
    test_f = pd.read_csv(test_path, header=0, parse_dates=['Dates'])
    # print test_f.dtypes
    X_test, _ = get_feature(test_f, "test_set")
    Y_test = clf.predict_proba(X_test)

    ### Write results
    # write_results(Y_test)
    write_results_prob(Y_test)
def test_oob_multiclass_iris():
    # Check OOB improvement on multi-class dataset.
    clf = GradientBoostingClassifier(n_estimators=100, loss='deviance',
                                     random_state=1, subsample=0.5)
    clf.fit(iris.data, iris.target)
    score = clf.score(iris.data, iris.target)
    assert_greater(score, 0.9)
    assert_equal(clf.oob_improvement_.shape[0], clf.n_estimators)
def gbdt_clf(x_train, x_test, y_train, y_test):
    clf = GradientBoostingClassifier(n_estimators=100)
    clf.fit(x_train, y_train)
    y_pred = clf.predict_proba(x_test)[:, 1]
    print "gbdt accuracy", clf.score(x_test, y_test)
    scores = roc_auc_score(y_test, y_pred)
    print "gbdt_clf AUC scores: ", scores
    joblib.dump(clf, './output/gbdt_clf.model')
def test_oob_multiclass_iris():
    """Check OOB improvement on multi-class dataset."""
    clf = GradientBoostingClassifier(n_estimators=100, loss="deviance",
                                     random_state=1, subsample=0.5)
    clf.fit(iris.data, iris.target)
    score = clf.score(iris.data, iris.target)
    assert score > 0.9, ("Failed with subsample %.1f "
                         "and score = %f" % (0.5, score))
    assert clf.oob_improvement_.shape[0] == clf.n_estimators
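# The oob_improvement_ array asserted on above can also guide model selection:
# each entry is the improvement in held-out (out-of-bag) loss contributed by
# one boosting stage, so the argmax of its cumulative sum suggests where extra
# trees stop helping. A minimal illustrative sketch, not part of the test
# suite; it loads iris directly rather than using the test fixture.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier

iris = load_iris()
clf = GradientBoostingClassifier(n_estimators=100, subsample=0.5, random_state=1)
clf.fit(iris.data, iris.target)  # subsample < 1.0 enables OOB estimates

cum_oob = np.cumsum(clf.oob_improvement_)
best_n = int(np.argmax(cum_oob)) + 1
print("Suggested n_estimators from OOB improvement:", best_n)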
def gradientBoostingClassify():
    maximumValue = 0
    returnParameters = ['0']
    for value in xrange(50, 350, 50):
        clfDeviance = GradientBoostingClassifier(n_estimators=value, loss='deviance')
        clfDeviance.fit(trainData, trainLabel)
        scoreEnt = clfDeviance.score(validationData, validationLabel)
        if scoreEnt > maximumValue:
            maximumValue = scoreEnt
            returnParameters[0] = str(value)
    neighTest = GradientBoostingClassifier(n_estimators=int(returnParameters[0]),
                                           loss='deviance')
    neighTest.fit(trainData, trainLabel)
    scoreTest = neighTest.score(testData, testLabel)
    guideToGraph['Gradient Boosting'] = scoreTest
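# The manual validation loop above can be replaced with GridSearchCV, which
# cross-validates every candidate n_estimators and refits the best model.
# A minimal sketch; the synthetic data is only a stand-in to keep it runnable,
# and the real trainData/testData splits from the function above would be
# substituted in practice.
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# Stand-in data; replace with the real trainData/testData splits.
X, y = make_classification(n_samples=500, random_state=0)
trainData, testData, trainLabel, testLabel = train_test_split(X, y, random_state=0)

param_grid = {'n_estimators': list(range(50, 350, 50))}
search = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5)
search.fit(trainData, trainLabel)
print("Best n_estimators:", search.best_params_['n_estimators'])
print("Test accuracy:", search.best_estimator_.score(testData, testLabel))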
def sentiment_analysis_random_forest(train, test, word2vec_model, num_features,
                                     num_estimators):
    trainDataVecs = getAvgFeatureVecs(train["review"], word2vec_model, num_features)
    testDataVecs = getAvgFeatureVecs(test["review"], word2vec_model, num_features)

    # forest = RandomForestClassifier(n_estimators=num_estimators)
    forest = GradientBoostingClassifier(n_estimators=num_estimators)
    forest.fit(trainDataVecs, train["sentiment"])
    result = forest.score(testDataVecs, test["sentiment"])
    print result
    return result
def performGTBClass(X_train, y_train, X_test, y_test):
    """
    Gradient Tree Boosting binary classification
    """
    clf = GradientBoostingClassifier(n_estimators=100)
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    # auc = roc_auc_score(y_test, clf.predict(X_test))
    return accuracy
def gradientBoost(X, y, train, valid):
    from sklearn.ensemble import GradientBoostingClassifier
    clf1 = GradientBoostingClassifier(n_estimators=200, learning_rate=1.0,
                                      max_depth=1, random_state=0).fit(X[train], y[train])
    print("gradientboosting " + str(clf1.score(X[valid].toarray(), y[valid])))
    yhat = clf1.predict(X[valid].toarray())
    yhat_prob = clf1.predict_proba(X[valid].toarray())[:, 1]
    print(classification_report(y[valid], yhat))
    print("gradient boosting roc_accuracy " + str(roc_auc_score(y[valid], yhat_prob)))
    np.savetxt("y_gb.csv", yhat_prob)
    return yhat_prob
def gb_classify(self):
    print "Gradient Boosting"
    clf = GradientBoostingClassifier()
    clf.fit(self.descr, self.target)
    mean = clf.score(self.test_descr, self.test_target)
    pred = clf.predict(self.test_descr)
    print "Pred ", pred
    print "Mean : %3f" % mean
    print "Feature Importances ", clf.feature_importances_
def test_iris():
    # Check consistency on dataset iris.
    for subsample in (1.0, 0.5):
        for sample_weight in (None, np.ones(len(iris.target))):
            clf = GradientBoostingClassifier(n_estimators=100, loss='deviance',
                                             random_state=1, subsample=subsample)
            clf.fit(iris.data, iris.target, sample_weight=sample_weight)
            score = clf.score(iris.data, iris.target)
            assert score > 0.9, ("Failed with subsample %.1f "
                                 "and score = %f" % (subsample, score))
def predictGBC(X, y):
    # Replace NaNs with the corresponding column means before fitting.
    col_mean = np.nanmean(X, axis=0)
    inds = np.where(np.isnan(X))
    X[inds] = np.take(col_mean, inds[1])
    gbc = GBC(n_estimators=100)
    X_train, X_test, y_train, y_test = chooseRandom(X, y)
    gbc.fit(X_train, y_train)
    return gbc.score(X_test, y_test)
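# Mean imputation as written above mutates X in place; an alternative is to
# keep the imputation inside the model with sklearn's SimpleImputer in a
# pipeline, so it is re-fit on each training split. A minimal sketch, assuming
# GBC is the same GradientBoostingClassifier alias and that a plain
# train_test_split is an acceptable stand-in for chooseRandom.
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

def predictGBC_pipeline(X, y):
    # The imputer learns column means on the training split only.
    model = make_pipeline(SimpleImputer(strategy='mean'), GBC(n_estimators=100))
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)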
def train(f, file_path):
    file_pt = open(file_path, "r")
    title = file_pt.readline()
    ret = None
    for l in file_pt.readlines():
        res = l.split(",")
        fet = f.create_features_from_res(res)
        if ret is None:
            ret = fet
        elif fet is not None:
            ret = numpy.vstack((ret, fet))
    print ret.shape

    # classifier = RandomForestClassifier(n_estimators=100,
    #                                     verbose=2,
    #                                     n_jobs=1,
    #                                     min_samples_split=10,
    #                                     random_state=1)
    classifier = GradientBoostingClassifier(n_estimators=512,
                                            verbose=3,
                                            max_depth=6,
                                            min_samples_split=10,
                                            subsample=0.8,
                                            random_state=1)
    valid_ret = validate(f, data_io.get_paths()["valid_sol_path"], classifier)
    ret = numpy.vstack((ret, valid_ret))
    print "Final size: ", ret.shape

    trainX, testX, trainY, testY = train_test_split(ret[:, 3:], ret[:, 0],
                                                    random_state=1)
    classifier.fit(trainX, trainY)
    numpy.savetxt(data_io.get_paths()["feature_path"], ret.astype(float),
                  fmt='%f', delimiter=",")
    print classifier.score(testX, testY)
    # validate(f, data_io.get_paths()["valid_sol_path"], classifier)
    print classifier.score(valid_ret[:, 3:], valid_ret[:, 0])
    return classifier
def do_gradient_boost(lr=1.0, md=1):
    # The best values of lr and md have to be determined through grid search;
    # for this dataset, lr = 0.05 and md = 3 gave 0.769 on the test set.
    from sklearn.ensemble import GradientBoostingClassifier
    train_X, train_Y, test_X, test_Y = analysis_glass()
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=lr,
                                     max_depth=md,
                                     random_state=0).fit(train_X, train_Y)
    return clf.score(test_X, test_Y)
def test_classification_synthetic():
    """Test GradientBoostingClassifier on synthetic dataset used by
    Hastie et al. in ESLII Example 12.7.
    """
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(
        n_estimators=100, min_samples_split=1, max_depth=1,
        learning_rate=1.0, random_state=0
    )
    gbrt.fit(X_train, y_train)
    error_rate = 1.0 - gbrt.score(X_test, y_test)
    assert error_rate < 0.085, "GB failed with error %.4f" % error_rate

    gbrt = GradientBoostingClassifier(
        n_estimators=200, min_samples_split=1, max_depth=1,
        learning_rate=1.0, subsample=0.5, random_state=0
    )
    gbrt.fit(X_train, y_train)
    error_rate = 1.0 - gbrt.score(X_test, y_test)
    assert error_rate < 0.08, "Stochastic GB failed with error %.4f" % error_rate
def check_iris(presort, subsample, sample_weight):
    # Check consistency on dataset iris.
    clf = GradientBoostingClassifier(n_estimators=100, loss='deviance',
                                     random_state=1, subsample=subsample,
                                     presort=presort)
    clf.fit(iris.data, iris.target, sample_weight=sample_weight)
    score = clf.score(iris.data, iris.target)
    assert_greater(score, 0.9)

    leaves = clf.apply(iris.data)
    assert_equal(leaves.shape, (150, 100, 3))
def check_classification_synthetic(presort, loss):
    # Test GradientBoostingClassifier on synthetic dataset used by
    # Hastie et al. in ESLII Example 12.7.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=2,
                                      max_depth=1, loss=loss,
                                      learning_rate=1.0, random_state=0)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert_less(error_rate, 0.09)

    gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=2,
                                      max_depth=1, loss=loss,
                                      learning_rate=1.0, subsample=0.5,
                                      random_state=0, presort=presort)
    gbrt.fit(X_train, y_train)
    error_rate = (1.0 - gbrt.score(X_test, y_test))
    assert_less(error_rate, 0.08)
def execute_classifiers(X_train, y_train, X_test, y_test, X_train_scaled, X_test_scaled):
    ########################################### Random Forest ##########################################
    print(datetime.datetime.now())
    print('\n')
    print('Random Forests')
    print('\n')
    clf = RandomForestClassifier(verbose=1, n_estimators=2000)
    clf.fit(X_train, y_train)
    print("Accuracy on training set is : {}".format(clf.score(X_train, y_train)))
    print("Accuracy on test set is : {}".format(clf.score(X_test, y_test)))
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    ############################################ XGBoost ###############################################
    print(datetime.datetime.now())
    print('\n')
    print('XGB Classifier')
    print('\n')
    xgb_cls = XGBClassifier(objective="multi:softprob", num_class=20,
                            random_state=61, colsample_bytree=0.6,
                            learning_rate=0.1, n_estimators=200, max_depth=8,
                            alpha=0.01, gamma=0.001, subsample=0.6)
    xgb_cls.fit(X_train, y_train)
    print("Accuracy on training set is : {}".format(xgb_cls.score(X_train, y_train)))
    print("Accuracy on test set is : {}".format(xgb_cls.score(X_test, y_test)))
    y_pred = xgb_cls.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    ############################################# GB ###################################################
    print(datetime.datetime.now())
    print('\n')
    print('GB Classifier')
    print('\n')
    gb_cls = GradientBoostingClassifier(min_samples_split=500, min_samples_leaf=50,
                                        max_depth=8, max_features='sqrt',
                                        subsample=0.8, n_estimators=200,
                                        learning_rate=0.2)
    gb_cls.fit(X_train, y_train)
    print("Accuracy on training set is : {}".format(gb_cls.score(X_train, y_train)))
    print("Accuracy on test set is : {}".format(gb_cls.score(X_test, y_test)))
    y_pred = gb_cls.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    ############################################ Knn ###################################################
    print(datetime.datetime.now())
    print('\n')
    print('Knn Classifier')
    print('\n')
    k = 11
    knn_cls = KNeighborsClassifier(n_neighbors=k)
    knn_cls.fit(X_train_scaled, y_train)
    print("Accuracy on training set is : {}".format(knn_cls.score(X_train_scaled, y_train)))
    print("Accuracy on test set is : {}".format(knn_cls.score(X_test_scaled, y_test)))
    y_pred = knn_cls.predict(X_test_scaled)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    ########################################### SVM Classifier #########################################
    print(datetime.datetime.now())
    print('\n')
    print('LinearSVC Classifier')
    print('\n')
    svm_cls = LinearSVC(C=1)
    svm_cls.fit(X_train_scaled, y_train)
    print("Accuracy on training set is : {}".format(svm_cls.score(X_train_scaled, y_train)))
    print("Accuracy on test set is : {}".format(svm_cls.score(X_test_scaled, y_test)))
    y_pred = svm_cls.predict(X_test_scaled)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    print(datetime.datetime.now())
    return True
from sklearn.ensemble import GradientBoostingClassifier

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=0)

clf = GradientBoostingClassifier(random_state=0, n_estimators=1500,
                                 max_depth=3, learning_rate=0.01, subsample=0.5)
clf.fit(X_train, y_train)

print("Training set accuracy: {:.3f}".format(clf.score(X_train, y_train)))
print("Test set accuracy: {:.3f}".format(clf.score(X_test, y_test)))

import os
import errno
import pickle

try:
    if not os.path.isdir("../../save"):
        os.makedirs(os.path.join("../../save"))
except OSError as e:
    if e.errno != errno.EEXIST:
        print("Failed to create directory!!!!!")
        raise

with open("../../save/save_model_using_pickle.bin", "wb") as f:
    pickle.dump(clf, f)
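# Counterpart to the pickle dump above: reload the saved classifier and check
# that it reproduces the held-out score. A minimal sketch, assuming the file
# written above exists and X_test/y_test are still in scope.
import pickle

with open("../../save/save_model_using_pickle.bin", "rb") as f:
    restored = pickle.load(f)

print("Restored model test accuracy: {:.3f}".format(restored.score(X_test, y_test)))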
Y_pred_SVM = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)

# decision tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred_tree = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

# GradientBoostingClassifier
gbs = GradientBoostingClassifier(random_state=1990)
gbs.fit(X_train, Y_train)
Y_pre_gbs = gbs.predict(X_test)
acc_gbs = round(gbs.score(X_train, Y_train) * 100, 2)

from sklearn.model_selection import cross_val_score
rf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(rf, X_train, Y_train, cv=10, scoring="accuracy")

# confusion matrix
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
predictions = cross_val_predict(random_forest, X_train, Y_train, cv=3)
confusion_matrix(Y_train, predictions)

# precision and recall
from sklearn.metrics import precision_score, recall_score
print("Precision:", precision_score(Y_train, predictions))
print("Recall:", recall_score(Y_train, predictions))
# os.system('afplay /System/Library/Sounds/Sosumi.aiff')

# Predict test data.
y_true = np.array(Y_test)
y_pred = clf.predict(X_test)

# https://de.wikipedia.org/wiki/Kontingenztafel
# assess
table = pd.crosstab(pd.Series(y_true), pd.Series(y_pred),
                    rownames=['True'], colnames=['Predicted'], margins=True)
print(table)
print(clf.score(X_test, Y_test))
# print(hits[hits>0])
# print(importance[importance>0])

if i % 100 == 0:
    print(i)

# print(big_test_featureImportance[big_test_featureImportance>0])
# np.savetxt('/home/florian/Dropbox/Masterarbeit/data/machineLearning/2017-06-23T09:42:48.759694/hits.csv', hits, fmt='%.1d', delimiter=',')
# np.savetxt('/home/florian/Dropbox/Masterarbeit/data/machineLearning/2017-06-23T09:42:48.759694/importance.csv', importance, delimiter=',')
# big_test_featureImportance = importance / hits
# np.savetxt('/home/florian/Dropbox/Masterarbeit/data/machineLearning/2017-06-23T09:42:48.759694/featureImportance_bigTest.csv', big_test_featureImportance, delimiter=',')
# exit()

'''
Predicted  Avian  Human  Swine  All
plt.ylabel("Признак") plt.show() plot_feature_importances_cancer(forest) "In[72]:" from sklearn.ensemble import GradientBoostingClassifier X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0) gbrt = GradientBoostingClassifier(random_state=0) gbrt.fit(X_train, y_train) print("Правильность на обучающем наборе: {:.3f}".format( gbrt.score(X_train, y_train))) print("Правильность на тестовом наборе: {:.3f}".format( gbrt.score(X_test, y_test))) "In[73]:" gbrt = GradientBoostingClassifier(random_state=0, max_depth=1) gbrt.fit(X_train, y_train) print("Правильность на обучающем наборе: {:.3f}".format( gbrt.score(X_train, y_train))) print("Правильность на тестовом наборе: {:.3f}".format( gbrt.score(X_test, y_test))) "In[74]:" gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01) gbrt.fit(X_train, y_train)
print('voting', vcr.score(X_train, Y_train))

logreg.fit(X_train, Y_train)
rf.fit(X_train, Y_train)
xg.fit(X_train, Y_train)
svc.fit(X_train, Y_train)
extree.fit(X_train, Y_train)
knn.fit(X_train, Y_train)
gb.fit(X_train, Y_train)

print('logreg', logreg.score(X_train, Y_train))
print('randforest', rf.score(X_train, Y_train))
print('extree', extree.score(X_train, Y_train))
print('svc', svc.score(X_train, Y_train))
print('xg', xg.score(X_train, Y_train))
print('knn', knn.score(X_train, Y_train))
print('gb', gb.score(X_train, Y_train))

# report(random_search.cv_results_)
# report(random_search_xgb.cv_results_)


# In[145]:

# get Correlation Coefficient for each feature using Logistic Regression
coeff_df = DataFrame(titanic_df.columns.delete(0))
coeff_df.columns = ['Features']
coeff_df["Coefficient Estimate"] = pd.Series(logreg.coef_[0])

# preview
coeff_df


# In[146]:
def RandomLearning(LPATH, LFILE, LCNT, b, color, idn):
    def print2f(MSG):
        lf = open(LOGPATH + LFILE + '_out' + idn + '_random.txt', 'a')
        print >> lf, MSG
        lf.close()

    ff = open(LPATH + LFILE, 'r')
    idx = 0
    fRNA = np.zeros((LCNT, 23 * 4))
    label = np.zeros((LCNT, ))
    for line in ff:
        f = line.split('\t')
        if (int(f[1]) == 1):
            label[idx] = 1
        else:
            label[idx] = -1
        fRNA[idx] = Ronehot(f[0])
        idx += 1

    X_train, X_test, y_train, y_test = train_test_split(fRNA, label,
                                                        test_size=0.2,
                                                        random_state=0)
    print2f((np.shape(X_train), np.shape(y_train), np.shape(X_test), np.shape(y_test)))

    TRAINLEN = np.shape(X_train)[0]
    INTLEN = int(TRAINLEN * 0.01)
    aa = np.split(X_train, [TRAINLEN - INTLEN, TRAINLEN - INTLEN])
    X_train = aa[0]
    inttrain_x = aa[2]
    aa = np.split(y_train, [TRAINLEN - INTLEN, TRAINLEN - INTLEN])
    y_train = aa[0]
    inttrain_y = aa[2]

    TRAINLEN = np.shape(X_train)[0]
    INTLEN = np.shape(inttrain_x)[0]
    AUGLEN = TRAINLEN * 16

    Uset = set()
    Lset = set()
    for i in range(0, INTLEN):
        Lset.add(i)
    for i in range(INTLEN, TRAINLEN):
        Uset.add(i)

    train_x = np.zeros((AUGLEN, 23 * 4))
    train_y = np.zeros((AUGLEN, ))
    train_l = np.zeros((AUGLEN, ))
    patch = [set() for x in range(0, TRAINLEN)]
    for i in range(0, TRAINLEN):
        sample = X_train[i]
        R1 = np.zeros((4))
        R2 = np.zeros((4))
        for j in range(0, 4):
            R1[j] = 1
            for k in range(0, 4):
                R2[k] = 1
                RR = np.concatenate((R1, R2))
                for x in range(0, 8):
                    sample[x] = RR[x]
                train_x[i * 16 + j * 4 + k] = sample
                train_y[i * 16 + j * 4 + k] = y_train[i]
                train_l[i * 16 + j * 4 + k] = i
                patch[i].add(i * 16 + j * 4 + k)
                R2[k] = 0
            R1[j] = 0

    print2f((TRAINLEN, AUGLEN, INTLEN))
    print2f((np.shape(X_train)[0], np.shape(train_x)[0], np.shape(inttrain_x)[0]))
    print2f((patch[0]))

    clf = GradientBoostingClassifier().fit(inttrain_x, inttrain_y)
    print2f(("init: ", clf.score(X_test, y_test)))
    clf2 = GradientBoostingClassifier().fit(X_train, y_train)
    print2f(("complete: ", clf2.score(X_test, y_test)))
    # for i in range(10, 20):
    #     print (clf.predict(X_test[i]), clf.predict_proba(X_test[i])[0][1],
    #            clf.predict_log_proba(X_test[i])[0][1],
    #            math.log(clf.predict_proba(X_test[i])[0][1]), y_test[i])

    eps = np.spacing(1)
    ITER = int(TRAINLEN / b)
    patchsize = 16
    predpatch = [0.0 for x in range(0, patchsize)]
    ACC = []
    ITR = []
    LAB = []
    for IT in range(0, ITER):
        if (INTLEN + b > TRAINLEN):
            print2f(("OUT OF RANGE "))
            break
        Rm = random.sample(Uset, b)
        for elm in Rm:
            Lset.add(elm)
            Uset.remove(elm)
            inttrain_x = np.concatenate((inttrain_x, [X_train[elm]]), axis=0)
            inttrain_y = np.concatenate((inttrain_y, [y_train[elm]]), axis=0)
            INTLEN += 1
        print2f((np.shape(inttrain_x)[0], len(Lset), len(Uset)))
        clf = GradientBoostingClassifier().fit(inttrain_x, inttrain_y)
        res = clf.score(X_test, y_test)
        print2f(("iter: ", IT, res))
        ACC.append(res)
        ITR.append(IT)
        LAB.append(len(Lset))

    plt.plot(LAB, ACC, color)
    # plt.plot(LAB, ACC, 'b*')
    plt.xlabel('Num of labels')
    plt.ylabel('Accuracy')
    plt.ylim(0.5, 1.0)
    plt.title(LFILE)
    plt.show()
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

X = pd.read_csv('Features1.csv')
y = pd.read_csv('Res.csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

Gb = GradientBoostingClassifier(learning_rate=0.09, max_depth=2)
Gb.fit(X_train, y_train.values.ravel())

print("Accuracy on training set: {:.3f}".format(Gb.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(Gb.score(X_test, y_test)))
class_scores['rfc'] = rfc.predict(tweets[list(emoji_codes)])

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(X_train, y_train)
lr.score(X_train, y_train)
class_scores['lr'] = lr.predict(imdb[list(emoji_codes)])

class_scores.sample(10)

from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(verbose=1)
gbc.fit(X_train, y_train)
gbc.score(X_train, y_train)
gbc.score(X_test, y_test)

from sklearn.model_selection import GridSearchCV
param_grid = {"learning_rate": [1, .5, 0.1, .01],
              "n_estimators": [100, 300, 1000],
              "max_leaf_nodes": [5, 10, None]}
clf = GridSearchCV(gbc, param_grid, verbose=2)
clf.fit(X_train, y_train)
clf.best_estimator_.score(X_train, y_train)
clf.best_estimator_.score(X_test, y_test)

# Compute the class probabilities first, then standardize and take the sign.
gbc = clf.best_estimator_.predict_proba(imdb[list(emoji_codes)])[:, 0]
class_scores['gbc_norm'] = np.sign((gbc - np.mean(gbc)) / np.std(gbc))
class_scores
#!/usr/bin/env python

# ~/spy611/script/simple/scikit-learn_demo.py

# Demo:
# python ~/spy611/script/simple/scikit-learn_demo.py

# Ref:
# http://scikit-learn.org/stable/modules/ensemble.html#classification

from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X_train, y_train)

myscore = clf.score(X_test, y_test)
print myscore
X = df[feature_cols]  # features
y = df.Decision       # target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
                                                    random_state=2)

lr = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
for i in lr:
    gb = GradientBoostingClassifier(n_estimators=100, max_depth=2, learning_rate=i)
    gb.fit(X_train, y_train)
    score = gb.score(X_test, y_test)
    # if st.checkbox('Show the learning rate with corresponding score', ):
    #     st.write('learning rate ', i, ': ', score)

gb = GradientBoostingClassifier(n_estimators=100, max_depth=2, learning_rate=0.75)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
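# The loop above computes a score per learning rate but discards it. A small
# variant that records each result and picks the best rate automatically; a
# sketch assuming the same X_train/X_test split and `lr` candidate list above.
scores = {}
for i in lr:
    gb = GradientBoostingClassifier(n_estimators=100, max_depth=2, learning_rate=i)
    gb.fit(X_train, y_train)
    scores[i] = gb.score(X_test, y_test)

best_lr = max(scores, key=scores.get)
print('best learning rate:', best_lr, 'test accuracy:', scores[best_lr])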
                                          y_test, title, subaxes)
plt.show()

X_train, X_test, y_train, y_test = train_test_split(X_fruits.to_numpy(),
                                                    y_fruits.to_numpy(),
                                                    random_state=0)
fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))

pair_list = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]

for pair, axis in zip(pair_list, subaxes):
    X = X_train[:, pair]
    y = y_train
    clf = GradientBoostingClassifier().fit(X, y)
    plot_class_regions_for_classifier_subplot(clf, X, y, None, None, title,
                                              axis, target_names_fruits)
    axis.set_xlabel(feature_names_fruits[pair[0]])
    axis.set_ylabel(feature_names_fruits[pair[1]])

plt.tight_layout()
plt.show()

clf = GradientBoostingClassifier().fit(X_train, y_train)
print('GBDT, Fruit dataset, default settings')
print('Accuracy of GBDT classifier on training set: {:.2f}'.format(
    clf.score(X_train, y_train)))
print('Accuracy of GBDT classifier on test set: {:.2f}'.format(
    clf.score(X_test, y_test)))
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split

# gradient ------> derivative
X, y = datasets.load_iris(return_X_y=True)
# cond = y != 2
# X = X[cond]
# y = y[cond]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

gbdt = GradientBoostingClassifier(n_estimators=10)
gbdt.fit(X_train, y_train)

print(gbdt.score(X_test, y_test))
print(gbdt.estimators_)
                 verbose=False)
gnb = GaussianNB()
LR = LogisticRegression()

x_train_tmp = x_train.iloc[:, item]
x_test_tmp = x_test.iloc[:, item]

knc.fit(x_train_tmp, y_train)
dtc.fit(x_train_tmp, y_train)
rfc.fit(x_train_tmp, y_train)
gbc.fit(x_train_tmp, y_train)
abc.fit(x_train_tmp, y_train)
svc.fit(x_train_tmp, y_train)
gnb.fit(x_train_tmp, y_train)
LR.fit(x_train_tmp, y_train)

list_accuracy_knc.append(knc.score(x_test_tmp, y_test))
list_accuracy_dtc.append(dtc.score(x_test_tmp, y_test))
list_accuracy_rfc.append(rfc.score(x_test_tmp, y_test))
list_accuracy_gbc.append(gbc.score(x_test_tmp, y_test))
list_accuracy_abc.append(abc.score(x_test_tmp, y_test))
list_accuracy_svc.append(svc.score(x_test_tmp, y_test))
list_accuracy_gnb.append(gnb.score(x_test_tmp, y_test))
list_accuracy_lr.append(LR.score(x_test_tmp, y_test))

print("knc,dtc,rfc,gbc,abc,svc,gnb,lr")
for i in range(100):
    print(list_accuracy_knc[i], list_accuracy_dtc[i], list_accuracy_rfc[i],
          list_accuracy_gbc[i], list_accuracy_abc[i], list_accuracy_svc[i],
          list_accuracy_gnb[i], list_accuracy_lr[i])
    # y_predict_rfc = rfc.predict(x_test_tmp)
    # list_matrix.append(confusion_matrix(y_test, y_predict_rfc))
                                                  random_state=state)

# Varying the learning rate (various models)
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=learning_rate,
                                        max_features=2, max_depth=2,
                                        random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
        gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        gb_clf.score(X_val, y_val)))

# Final Model
gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5,
                                     max_features=2, max_depth=2,
                                     random_state=0)
gb_clf2.fit(X_train, y_train)
predictions = gb_clf2.predict(X_val)

print("Confusion Matrix:")
print(confusion_matrix(y_val, predictions))
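# Besides the learning rate, the number of boosting stages can be tuned without
# refitting: staged_predict yields predictions after each stage of an already
# fitted model. A minimal sketch, assuming the same X_train/X_val split as
# above; the 200-stage budget is illustrative.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

gb_staged = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5,
                                       max_features=2, max_depth=2,
                                       random_state=0)
gb_staged.fit(X_train, y_train)

# Validation accuracy after each boosting stage.
stage_scores = [accuracy_score(y_val, pred)
                for pred in gb_staged.staged_predict(X_val)]
best_stage = stage_scores.index(max(stage_scores)) + 1
print("Best number of stages:", best_stage)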
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(cancer.data, cancer.target,
                                                    train_size=0.8, shuffle=True)

# model = DecisionTreeClassifier(max_depth=10)
# model = RandomForestClassifier(n_estimators=100)
model = GradientBoostingClassifier()

model.fit(x_train, y_train)

acc = model.score(x_test, y_test)
print(f"acc : {acc}")
print(model.feature_importances_)

# max_features: use the default value
# n_estimators: bigger is better, but also uses more memory
# n_jobs: parallelism (don't use when also running on a GPU)

import matplotlib.pyplot as plt
import numpy as np

def plot_feature_importances_cancer(model):
    n_features = cancer.data.shape[1]
    plt.barh(np.arange(n_features), model.feature_importances_, align='center')
print("################Gradient Boosting Classifier############") from sklearn.ensemble import GradientBoostingClassifier clf = GradientBoostingClassifier(random_state=20, learning_rate=0.1, n_estimators=1000, max_depth=3, min_samples_split=5, min_samples_leaf=1, subsample=1, max_features='sqrt') clf.fit(feature_set, y_train) print("\nAccuracy on Training Set :") print(clf.score(feature_set, y_train)) print("\nAccuracy on Testing Set :") print(clf.score(feature_set_test, y_test)) y_pred = clf.predict(feature_set_test) print("\nPrecision Score") print(precision_score(y_test, y_pred)) print("\nRecall Score") print(recall_score(y_test, y_pred)) print("\nF1 Score") print(f1_score(y_test, y_pred)) print("################AdaBoost Classifier############")
def predict(team1, team2, city, toss_winner, toss_decision):
    # def predict(input):
    with open('ipl/teamCodes.json', encoding='utf-8') as data_file:
        teams = json.loads(data_file.read())
        data_file.close()
    # print(teams)
    with open('ipl/venueCodes.json', encoding='utf-8') as data_file:
        venue = json.loads(data_file.read())
        # print(venue)
        data_file.close()
    with open('ipl/tossCodes.json', encoding='utf-8') as data_file:
        toss = json.loads(data_file.read())
        # print(toss)
        data_file.close()
    with open('ipl/reverseteamCodes.json', encoding='utf-8') as data_file:
        reverseteams = json.loads(data_file.read())
        # print(reverseteams)
        data_file.close()

    print("Input : ")
    print("Team1 : ", team1)
    print("Team2 : ", team2)
    print("City : ", city)
    print("Toss Winner : ", toss_winner)
    print("Toss Decision : ", toss_decision)
    # print(homeTeam, awayTeam, City, tossW, tossD)

    input = []
    input.append(teams[team1])
    input.append(teams[team2])
    input.append(venue[city])
    input.append(teams[toss_winner])
    input.append(toss[toss_decision])
    # print("Numerical Input :", input)

    matches_data = pd.read_csv('ipl/matches_new.csv')
    matches_data = matches_data[['season', 'team1', 'team2', 'city',
                                 'toss_winner', 'toss_decision', 'winner']]
    training = matches_data.loc[matches_data.season != 2018]
    testing = matches_data.loc[matches_data.season == 2018]
    training = training[['team1', 'team2', 'city', 'toss_winner',
                         'toss_decision', 'winner']]
    testing = testing[['team1', 'team2', 'city', 'toss_winner',
                       'toss_decision', 'winner']]
    testing.to_csv('ipl/team_prediction.csv', index=False)

    trainvector = training.values
    x_train = trainvector[:, 0:5]
    y_train = trainvector[:, 5]
    testvector = testing.values
    x_test = testvector[:, 0:5]
    y_test = testvector[:, 5]

    predictions = []

    model1 = DecisionTreeClassifier(random_state=1)
    model1.fit(x_train, y_train)
    model11 = DecisionTreeClassifier(criterion="entropy", random_state=1)
    model11.fit(x_train, y_train)
    model2 = RandomForestClassifier(n_estimators=10)
    model2.fit(x_train, y_train)
    model3 = MLPClassifier(hidden_layer_sizes=(3,), activation='logistic',
                           solver='lbfgs', alpha=0.0001,
                           learning_rate='constant', learning_rate_init=0.001,
                           max_iter=10000)
    model3.fit(x_train, y_train)
    model4 = SVC(gamma='auto', probability=True)
    model4.fit(x_train, y_train)
    model6 = KNeighborsClassifier()
    model6.fit(x_train, y_train)

    pred1 = model1.predict([input])
    accu1 = model1.predict(x_test)
    pred11 = model11.predict([input])
    accu11 = model11.predict(x_test)
    pred2 = model2.predict([input])
    accu2 = model2.predict(x_test)
    pred3 = model3.predict([input])
    accu3 = model3.predict(x_test)
    pred4 = model4.predict([input])
    accu4 = model4.predict(x_test)
    model5 = LogisticRegression(multi_class='auto', solver='lbfgs',
                                max_iter=10000).fit(x_train, y_train)
    pred5 = model5.predict([input])
    accu5 = model5.predict(x_test)
    pred6 = model6.predict([input])
    accu6 = model6.predict(x_test)

    model17 = GaussianNB()
    model17.fit(x_train, y_train)
    model18 = LinearSVC(max_iter=100000)
    model18.fit(x_train, y_train)
    pred17 = model17.predict([input])
    accu17 = model17.predict(x_test)
    pred18 = model18.predict([input])
    accu18 = model18.predict(x_test)

    predictions.append(reverseteams[str(pred1[0])])
    predictions.append(reverseteams[str(pred2[0])])
    predictions.append(reverseteams[str(pred3[0])])
    predictions.append(reverseteams[str(pred4[0])])
    # predictions.append(reverseteams[str(pred5[0])])
    predictions.append(reverseteams[str(pred6[0])])
    # predictions.append(reverseteams[str(pred17[0])])
    # predictions.append(reverseteams[str(pred18[0])])

    print("<30% accuracy : ")
    print("Gaussian Naive Bayes :", reverseteams[str(pred17[0])])
:", reverseteams[str(pred17[0])]) print("Gaussian Naive Bayes Accuracy : ", round(accuracy_score(y_test, accu17)*100,2)) print("Linear SVC :", reverseteams[str(pred18[0])]) print("Linear SVC Accuracy : ", round(accuracy_score(y_test, accu18)*100,2)) #print("Logistic Regression :", reverseteams[str(pred5[0])]) #print("Logistic Regression Accuracy : ", round(accuracy_score(y_test, accu5) * 100, 2)) print("\n>30% accuracy : ") print("DecisionTreeClassifier :", reverseteams[str(pred1[0])]) print("DecisionTreeClassifier Accuracy : ", round(accuracy_score(y_test, accu1)*100,2)) print("DecisionTreeClassifier with entropy:", reverseteams[str(pred11[0])]) print("DecisionTreeClassifier Accuracy : ", round(accuracy_score(y_test, accu11) * 100, 2)) print("SVC :", reverseteams[str(pred4[0])]) print("SVC Accuracy : ", round(accuracy_score(y_test, accu4)*100,2)) print("KNeighbors Classifier :", reverseteams[str(pred6[0])]) print("KNeighbors Classifier Accuracy : ", round(accuracy_score(y_test, accu6)*100,2)) print("RandomForestClassifier :", reverseteams[str(pred2[0])]) print("RandomForestClassifier Accuracy : ", round(accuracy_score(y_test, accu2)*100,2)) print("MLPClassifier :", reverseteams[str(pred3[0])]) print("MLPClassifier Accuracy : ", round(accuracy_score(y_test, accu3)*100,2)) #Bagging #Building multiple models(same type) from different subsamples of the training dataset seed = 7 kfold = model_selection.KFold(n_splits=10, random_state= seed) #cart = DecisionTreeClassifier() num_trees = 100 model8 = BaggingClassifier(base_estimator=model1, n_estimators=num_trees, random_state=seed) # results = model_selection.cross_val_predict(model,x,y, cv=kfold) # print(results.mean()) model8.fit(x_train,y_train) pred8 = model8.predict([input]) predictions.append(reverseteams[str(pred8[0])]) print("Bagging Prediction : ",reverseteams[str(pred8[0])]) print("Bagging Accuracy : ", round((model8.score(x_test,y_test))*100,2)) #print(results) #Boosting num_trees = 30 kfold = model_selection.KFold(n_splits=10, random_state=seed) #model9 = AdaBoostClassifier(n_estimators=num_trees, random_state=seed) model9 = GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10) model9.fit(x_train,y_train) pred9 = model9.predict([input]) predictions.append(reverseteams[str(pred9[0])]) print("GradientBoostingClassifier(Adaboost) Prediction : ",reverseteams[str(pred9[0])]) print("GradientBoostingClassifier(Adaboost) Accuracy : ",round((model9.score(x_test,y_test))*100,2)) model7 = VotingClassifier(estimators=[('bg',model8),('bo',model9), ('dt', model1), ('rf', model2),('ls', model3), ('sv', model4),('kn', model6)], voting='soft') model7.fit(x_train, y_train) pred7 = model7.predict([input]) print("Voting Prediction ", reverseteams[str(pred7[0])]) print("Voting Prediction Accuracy : ",round((model7.score(x_test, y_test))*100,2)) predictions.append(reverseteams[str(pred7[0])]) frequent = most_frequent(predictions) #print(type(frequent)) #print(type(team1)) #print(type(team2)) #print(frequent) for strwinner in frequent: if strwinner == team1 or strwinner == team2: #print("winner",strwinner) return strwinner return winningProbabolity(team1,team2,city,toss_winner)
            features += get_features(coeff)
        list_features.append(features)
    return list_features, list_labels

X_train_ecg, Y_train_ecg = get_ecg_features(train_data_ecg, train_labels_ecg, 'db4')
X_test_ecg, Y_test_ecg = get_ecg_features(test_data_ecg, test_labels_ecg, 'db4')
X, Y = get_ecg_features(data_ecg, labels_ecg, 'db4')

gb = GradientBoostingClassifier(n_estimators=10000)
gb.fit(X_train_ecg, Y_train_ecg)
train_score = gb.score(X_train_ecg, Y_train_ecg)
test_score = gb.score(X_test_ecg, Y_test_ecg)
print("Train Score for the ECG dataset is about: {}".format(train_score))
print("Test Score for the ECG dataset is about: {}".format(test_score))

predictions = gb.predict(X_test_ecg)
print("Confusion Matrix:")
print(confusion_matrix(Y_test_ecg, predictions))

pred_all = gb.predict(X)
scores = cross_val_score(gb, X, Y, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

sns.heatmap(confusion_matrix(Y, pred_all), annot=True, fmt='3.0f', cmap="summer")
plt.title('Confusion_matrix', y=1.05, size=15)
    one_array = np.ones((len(tokenized_texts), len(word2id)))
    result = np.log(result + one_array)
    result = result.multiply(1 / word2freq)

    if scale:
        # result = result.tocsc()
        # result = result.std(0, ddof=1)
        result = result.tocsc()
        result -= result.min()
        result /= (result.max() + 1e-6)

    return result.tocsr()

VECTORIZATION_MODE = 'tfidf'
train_vectors = vectorize_texts(text_tokenized, vocabulary, word_doc_freq,
                                mode=VECTORIZATION_MODE)
print('Feature matrix dimensions for the training set', train_vectors.shape)

clf = GradientBoostingClassifier(random_state=value)
clf.fit(train_vectors, y)

print(train_vectors[:1])

"""1-onion 3-milk 7-fries"""

clf.predict(train_vectors[:1])

"""The sample contains milk, and the classifier identified it correctly"""

clf.score(train_vectors, y)
joblib.dump(tfidf, 'statement_tfidf.model', compress=3)
vec = tfidf.transform(train_data)
numpy.savez_compressed("statement_vec.npz", vec.todense())

print('train gbt...', print_mem())
gbt = GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=10,
    min_samples_split=20,
    # max_features=9,
    verbose=1,
).fit(vec, target)
joblib.dump(gbt, 'statement_som_gbt.model')

yp = gbt.predict(vec)
print("training score : %.3f " % gbt.score(vec, target))

# correct = 0
# wrong = 0
# for i in range(len(yp)):
#     if target[i] == 0 and yp[i] == 0:
#         correct += 1
#     elif target[i] == 1 and yp[i] == 1:
#         correct += 1
#     else:
#         wrong += 1
# print('precision:', correct * 1.0 / (correct + wrong))
# print("Mean squared error: %.2f" % mean_squared_error(vec, target))
# print('Variance score: %.2f' % r2_score(yp, target))
# Load the breast cancer dataset
cancer = load_breast_cancer()

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target,
                                                    random_state=0)

# Build the GBDT model
gbdt = GradientBoostingClassifier(random_state=0)
# Fit the training set
gbdt.fit(X_train, y_train)

# Print the results
train_score = gbdt.score(X_train, y_train)
test_score = gbdt.score(X_test, y_test)
print("-" * 5, "Untuned GBDT", "-" * 5)
print("Training set accuracy: {:.3f}".format(train_score))
print("Test set accuracy: {:.3f}".format(test_score))

# Training accuracy reaches 100%, which suggests overfitting; strengthen
# pre-pruning by limiting the depth of each tree
gbdt2 = GradientBoostingClassifier(random_state=0, max_depth=1)  # depth-1 trees
# Fit the training set
gbdt2.fit(X_train, y_train)

# Print the results
train_score2 = gbdt2.score(X_train, y_train)
test_score2 = gbdt2.score(X_test, y_test)
print("-" * 5, "Depth-limited GBDT", "-" * 5)
def main(argv):
    global PATH_IN, PATH_SCRIPT, PATH_OUT
    PATH_IN, PATH_SCRIPT, PATH_OUT = def_context.get_path()
    PATH_OUT = get_temp_path()
    if not os.path.exists(PATH_OUT + 'model_PTV/'):
        os.makedirs(PATH_OUT + 'model_PTV/')

    if (len(argv) == 0):
        argv = ['all']

    if (argv[0] == 'test'):
        Y_test = pd.read_csv('results.csv').values
        y_pred = pd.read_csv('y_pred.csv')
        y_pred2 = pd.read_csv('y_pred2.csv')
        y_pred3 = pd.read_csv('y_pred3.csv')
        y_pred4 = pd.read_csv('y_pred4.csv')
        y_pred5 = pd.read_csv('y_pred5.csv')
        logreg = use_logisticreg(y_pred, y_pred2, y_pred3, y_pred4, y_pred5, Y_test)
        res = pd.concat([y_pred, y_pred2, y_pred3, y_pred4, y_pred5], axis=1).values
        res = logreg.predict_proba(res)
        for p1 in [0]:
            for p2 in [0]:
                def_context.Report('################### ' + str(p1) + ' ### ' + str(p2) + '###################')
                def_context.Report('############XGB##############')
                mesure(y_pred.values, Y_test, p1, p2)
                mismatch(y_pred.values, Y_test, p1, p2)
                acc(y_pred.values, Y_test, p1, p2)
                def_context.Report('############CatBoost##############')
                mesure(y_pred2.values, Y_test, p1, p2)
                mismatch(y_pred2.values, Y_test, p1, p2)
                acc(y_pred2.values, Y_test, p1, p2)
                def_context.Report('############GradientBoostingClassifier##############')
                mesure(y_pred4.values, Y_test, p1, p2)
                mismatch(y_pred4.values, Y_test, p1, p2)
                acc(y_pred4.values, Y_test, p1, p2)
                def_context.Report('############RandomForestClassifier##############')
                mesure(y_pred5.values, Y_test, p1, p2)
                mismatch(y_pred5.values, Y_test, p1, p2)
                acc(y_pred5.values, Y_test, p1, p2)
                def_context.Report('############Stack##############')
                mesure(res, Y_test, p1, p2)
                mismatch(res, Y_test, p1, p2)
                acc(res, Y_test, p1, p2)

    elif (len(argv) == 1):
        X, Y = load_all(argv[0])
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
        X_train = X_train.replace([np.inf, -np.inf], np.nan)
        X_train = X_train.fillna(0)
        X_test = X_test.replace([np.inf, -np.inf], np.nan)
        X_test = X_test.fillna(0)
        Y_test = [Y[0] for Y in Y_test.values]

        ##########################################
        np.random.seed(42)
        clf = Classifier()
        clf.fit(X_train, Y_train)
        y_pred = clf.predict_proba(X_test)

        clf2 = Classifier2()
        clf2.fit(X_train, Y_train)
        y_pred2 = clf2.predict_proba(X_test)

        dtree_model = DecisionTreeClassifier(max_depth=10).fit(X_train, Y_train)
        y_pred3 = dtree_model.predict_proba(X_test)

        tpot = GradientBoostingClassifier(learning_rate=0.05, max_depth=10,
                                          max_features=0.75, min_samples_leaf=7,
                                          min_samples_split=16, n_estimators=500,
                                          subsample=0.9)
        tpot.fit(X_train, Y_train)
        def_context.Report(tpot.score(X_test, Y_test))
        y_pred4 = tpot.predict_proba(X_test)

        RF_model = RandomForestClassifier(max_depth=10).fit(X_train, Y_train)
        y_pred5 = RF_model.predict_proba(X_test)

        y_p = clf.predict_proba(X_train)
        y_p2 = clf2.predict_proba(X_train)
        y_p3 = dtree_model.predict_proba(X_train)
        y_p4 = tpot.predict_proba(X_train)
        y_p5 = RF_model.predict_proba(X_train)
        logreg = use_logisticreg(y_p, y_p2, y_p3, y_p4, y_p5, Y_train)

        ##########################################
        save_model_xgb(clf)
        save_model_cat(clf2)
        save_model(dtree_model, "DT")
        save_model(RF_model, "RF")
        pickle.dump(tpot, open(PATH_OUT + "model_PTV/GradientBoostingClassifier.pickle.dat", "wb"))
        pickle.dump(RF_model, open(PATH_OUT + "model_PTV/RandomForestClassifier.pickle.dat", "wb"))

        X = pd.concat([pd.DataFrame(y_pred), pd.DataFrame(y_pred2),
                       pd.DataFrame(y_pred3), pd.DataFrame(y_pred4),
                       pd.DataFrame(y_pred5)], axis=1).values
        res = logreg.predict_proba(X)

        for p1, p2 in zip([0], [0]):
            def_context.Report('############XGB##############')
            mesure(y_pred, Y_test, p1, p2)
            mismatch(y_pred, Y_test, p1, p2)
            acc(y_pred, Y_test, p1, p2)
def_context.Report('############CatBoost##############') mesure(y_pred2,Y_test,p1,p2) mismatch(y_pred2,Y_test,p1,p2) acc(y_pred2,Y_test,p1,p2) def_context.Report('############DecisionTreeClassifier##############') mesure(y_pred3,Y_test,p1,p2) mismatch(y_pred3,Y_test,p1,p2) acc(y_pred3,Y_test,p1,p2) def_context.Report('############GradientBoostingClassifier##############') mesure(y_pred4,Y_test,p1,p2) mismatch(y_pred4,Y_test,p1,p2) acc(y_pred4,Y_test,p1,p2) def_context.Report('############RandomForestClassifier##############') mesure(y_pred5,Y_test,p1,p2) mismatch(y_pred5,Y_test,p1,p2) acc(y_pred5,Y_test,p1,p2) def_context.Report('############Stack##############') mesure(res,Y_test,p1,p2) mismatch(res,Y_test,p1,p2) acc(res,Y_test,p1,p2) #ROC_curve(y_pred,Y_test) #ROC_curve(y_pred2,Y_test) pd.DataFrame(Y_test).to_csv('results.csv',index=False) pd.DataFrame(y_pred).to_csv('y_pred.csv',index=False) pd.DataFrame(y_pred2).to_csv('y_pred2.csv',index=False) pd.DataFrame(y_pred3).to_csv('y_pred3.csv',index=False) pd.DataFrame(y_pred4).to_csv('y_pred4.csv',index=False) pd.DataFrame(y_pred5).to_csv('y_pred5.csv',index=False) return ("process achevé sans erreures")
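# The helper use_logisticreg is not defined in this file; judging from its call
# sites, it fits a logistic-regression meta-learner on the concatenated
# base-model probabilities (a standard stacking setup). A hypothetical
# reconstruction, assuming only the scikit-learn LogisticRegression API:
from sklearn.linear_model import LogisticRegression

def use_logisticreg_sketch(p1, p2, p3, p4, p5, y):
    # Stack the five probability matrices column-wise, then fit the meta-learner
    stacked = pd.concat([pd.DataFrame(p) for p in (p1, p2, p3, p4, p5)], axis=1).values
    meta = LogisticRegression(max_iter=1000)
    meta.fit(stacked, y)
    return meta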
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
score_rf = cross_val_score(random_forest, X_train, Y_train, cv=k_fold, n_jobs=1, scoring='accuracy')
print('Random Forest Cross: {}\nRandom Forest: {}'.format(round(np.mean(score_rf) * 100, 2), acc_random_forest))

# In[112]:

from sklearn.ensemble import GradientBoostingClassifier

gbk = GradientBoostingClassifier()
gbk.fit(X_train, Y_train)
Y_pred = gbk.predict(X_test)
acc_gbk = round(gbk.score(X_train, Y_train) * 100, 2)
score_gbk = cross_val_score(gbk, X_train, Y_train, cv=k_fold, n_jobs=1, scoring='accuracy')
print('Gradient Boosting Classifier Cross: {}\nGradient Boosting Classifier: {}'.format(round(np.mean(score_gbk) * 100, 2), acc_gbk))

# In[113]:

from sklearn.svm import SVC

svc = SVC(gamma='scale')
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
score_svc = cross_val_score(svc, X_train, Y_train, cv=k_fold, n_jobs=1, scoring='accuracy')
print('SVC Cross: {}\nSVC: {}'.format(round(np.mean(score_svc) * 100, 2), acc_svc))
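# Note that all three acc_* figures above are computed on the training split,
# which overstates performance. A hedged sketch of the fairer held-out
# comparison, assuming the notebook also kept matching Y_test labels (not
# shown in this excerpt):
for name, model in [('Random Forest', random_forest), ('Gradient Boosting', gbk), ('SVC', svc)]:
    print('{} test accuracy: {}'.format(name, round(model.score(X_test, Y_test) * 100, 2)))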
X_test = vec.transform(X_test.to_dict(orient='records'))  # 'records', not 'record'

# Single decision tree
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
dtc_y_pred = dtc.predict(X_test)

# Random forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)

# Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_pred = gbc.predict(X_test)

from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred) in that order
print('The accuracy of decision tree is:', dtc.score(X_test, y_test))
print(classification_report(y_test, dtc_y_pred))
print('The accuracy of randomforestclassifier is:', rfc.score(X_test, y_test))
print(classification_report(y_test, rfc_y_pred))
print('The accuracy of gradientboostingclassifier is:', gbc.score(X_test, y_test))
print(classification_report(y_test, gbc_y_pred))
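# For completeness: the vectorizer-fitting step this snippet assumes happened
# earlier. A minimal sketch of the usual DictVectorizer pattern (fit on the
# training dicts once, then only transform the test dicts, as done above):
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='records'))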
# Split the data into training and test parts:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y)

# Build the model:
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)  # predict on the test data

# Assess accuracy by comparing the predictions with the actual values:
print('Accuracy score for test data:', metrics.accuracy_score(y_test, y_pred))
# The same in one line:
print('\nMean accuracy for test data:', clf.score(X_test, y_test))

# Compute the probability of each object belonging to each class:
y_pred_prob = clf.predict_proba(X_test)
y_test_classes = pd.get_dummies(y_test)
# Report ROC AUC:
print('\nROC AUC:', metrics.roc_auc_score(y_test_classes, y_pred_prob))

# Build a confusion matrix to analyse the results and see which class
# the model misclassifies most often:
col_names = ['pred_' + i for i in target_names]
ind = ['fact_' + i for i in target_names]
conf_matrix = pd.DataFrame(metrics.confusion_matrix(y_test, y_pred),
                           columns=col_names, index=ind)
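# In newer scikit-learn releases (0.22+, an assumption about the environment),
# the same multi-class ROC AUC can be computed directly from the label vector,
# without building the one-hot dummy columns:
print('ROC AUC (ovr):', metrics.roc_auc_score(y_test, y_pred_prob, multi_class='ovr'))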
print( "\n ==================================================================== 03: AdaBoost with Decision Tree" ) from sklearn.ensemble import AdaBoostClassifier ada_model = AdaBoostClassifier(n_estimators=100) ada_model.fit(seq_x, seq_y) acc_ada = ada_model.score(seq_x_test, seq_y_test) print('Ada Boost Score: ', acc_ada) print( "\n ==================================================================== 04: Gradient boost" ) from sklearn.ensemble import GradientBoostingClassifier gra_model = GradientBoostingClassifier(n_estimators=100) gra_model.fit(seq_x, seq_y) acc_gra = gra_model.score(seq_x_test, seq_y_test) print('Grad Boost: ', acc_gra) print( "\n ==================================================================== 05: Voting" ) from sklearn.ensemble import VotingClassifier voting_model = VotingClassifier(estimators=[('m01', gra_model), ('m02', ada_model), ('m03', rf_model), ('m04', bag_model)], voting='soft') voting_model.fit(seq_x, seq_y) acc_voting = voting_model.score(seq_x_test, seq_y_test)
model6.fit(train_x, train_y)

# In[ ]:

# Model performance
plot_model_var_imp(model1, train_x, train_y)

# In[ ]:

# Model performance
print('Model 2', model2.score(train_x, train_y), model2.score(valid_x, valid_y))
print('Model 3', model3.score(train_x, train_y), model3.score(valid_x, valid_y))
print('Model 4', model4.score(train_x, train_y), model4.score(valid_x, valid_y))
print('Model 5', model5.score(train_x, train_y), model5.score(valid_x, valid_y))
print('Model 6', model6.score(train_x, train_y), model6.score(valid_x, valid_y))

# In[ ]:

# StratifiedKFold(train_y, 2) is the pre-0.18 scikit-learn API; current
# versions take n_splits and receive the labels via fit
rfecv = RFECV(estimator=model1, step=1, cv=StratifiedKFold(n_splits=2), scoring='accuracy')
rfecv.fit(train_x, train_y)

# In[ ]:
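# A short follow-up showing the standard RFECV attributes that report the
# outcome of the recursive feature elimination (names per scikit-learn's API):
print('Optimal number of features:', rfecv.n_features_)
print('Selected feature mask:', rfecv.support_)
print('Feature ranking (1 = selected):', rfecv.ranking_)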
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, f1_score

train_df, test_df, age_gender_df, countries_df, session_df = utils.load_data("../data/")
train_df = utils.training_feature(train_df)
print(train_df.shape, test_df.shape)

# One-hot encode the categorical features
categorical_features = list(train_df.select_dtypes('object').columns)
categorical_features.append('signup_flow')
categorical_features.remove('id')
categorical_features.remove('country_destination')
print(categorical_features, train_df.columns)
train_df = pd.get_dummies(train_df, columns=categorical_features, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(['country_destination', 'id'], axis=1),
    train_df['country_destination'], test_size=0.3, random_state=42)

gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
print(gbc.score(X_train, y_train), 'train accuracy')  # was a bare call whose result was discarded
y_pred = gbc.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f1, 'f1 score')
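# confusion_matrix is imported above but never used; a short follow-up that
# puts it to work on the same predictions, with the class order taken from
# the fitted model's classes_ attribute:
cm = confusion_matrix(y_test, y_pred, labels=gbc.classes_)
print(pd.DataFrame(cm, index=gbc.classes_, columns=gbc.classes_))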
Xtrain = pd.read_csv("MNIST_X_train.csv").values
ytrain = pd.read_csv("MNIST_y_train.csv").values
Xtest = pd.read_csv("MNIST_X_test.csv").values
ytest = pd.read_csv("MNIST_y_test.csv").values
ytrain, ytest = ytrain.flatten(), ytest.flatten()

# One-hot encode the labels for the multiclass gradient-boosting implementation
lb = LabelBinarizer(neg_label=0)
lb.fit(ytrain)
ytrain_ohe = lb.transform(ytrain)
ytest_ohe = lb.transform(ytest)

start = time.time()
GBTC = GBT_classifier(n_estimators=100, max_depth=3, lr=0.5)
GBTC.fit(Xtrain, ytrain_ohe)
ypred = GBTC.predict(Xtest)
end = time.time()
score = accuracy(ytest, ypred)
print("The accuracy of multiclass classification is {:.2f}%".format(score * 100))  # 87.8%
print("Takes {:.2f} seconds.".format(end - start))

# Reference run with scikit-learn's implementation
gbc = GradientBoostingClassifier(learning_rate=0.5, n_estimators=100,
                                 max_depth=3, max_features=2)
gbc.fit(Xtrain, ytrain)
score = gbc.score(Xtest, ytest)
print(score)  # 87.2%
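# One caveat with the baseline: max_features=2 lets each split inspect only two
# candidate features, which makes the scikit-learn run much cheaper but not
# directly comparable to the custom learner. An illustrative sketch that times
# the baseline the same way, without that restriction (expect a far longer run):
start = time.time()
gbc_full = GradientBoostingClassifier(learning_rate=0.5, n_estimators=100, max_depth=3)
gbc_full.fit(Xtrain, ytrain)
print("sklearn accuracy: {:.2f}%".format(gbc_full.score(Xtest, ytest) * 100))
print("Takes {:.2f} seconds.".format(time.time() - start))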