def GradBoost(X_DS, Y_DS, X_train, X_test, y_train, y_test, Cl_Names=None, mask=None, Max_Depth=3):
    #******************************************************************************
    import numpy as np
    from sklearn.ensemble import GradientBoostingClassifier as GBC  #import library for machine learning analysis
    from sklearn.metrics import classification_report, confusion_matrix as CM
    print('Gradient Boosting: Training...')  #notify the user about the status of the process
    Gradient_Boosting_obj = GBC(max_depth=Max_Depth)  #build the gradient boosting classifier
    Gradient_Boosting_obj.fit(X_train, y_train)  #fit the boosting model to the training set
    Pred_Train = Gradient_Boosting_obj.predict(X_train)  #apply the boosting model to the training set
    Pred_Test = Gradient_Boosting_obj.predict(X_test)  #apply the boosting model to the test set
    print('Gradient Boosting: Completed!')  #notify the user about the status of the process
    labels = len(np.unique(Y_DS))  #number of distinct classification classes
    Conf_M = np.zeros((labels, labels), dtype='int')  #initialize the confusion matrix for the classification problem
    if Cl_Names is not None:
        target_names = Cl_Names
    else:
        target_names = np.arange(len(np.unique(Y_DS))).astype(str).tolist()
    #end
    Conf_M = CM(y_test, Pred_Test, labels=np.unique(Y_DS))  #confusion matrix from the test set and prediction set
    print(classification_report(y_test, Pred_Test, target_names=target_names))  #print the performance indicators on the console
    return Gradient_Boosting_obj, Conf_M
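# A hypothetical driver for GradBoost above, using a synthetic binary problem;
# the dataset, split, and class names are illustrative assumptions only.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_DS, Y_DS = make_classification(n_samples=500, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X_DS, Y_DS, test_size=0.3, random_state=0)
model, conf_m = GradBoost(X_DS, Y_DS, X_train, X_test, y_train, y_test,
                          Cl_Names=['negative', 'positive'], Max_Depth=3)
print(conf_m)  # confusion matrix on the held-out test set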
def test_mem_layout(): # Test with different memory layouts of X and y X_ = np.asfortranarray(X) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X_, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) X_ = np.ascontiguousarray(X) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X_, y) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) y_ = np.asarray(y, dtype=np.int32) y_ = np.ascontiguousarray(y_) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y_) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_)) y_ = np.asarray(y, dtype=np.int32) y_ = np.asfortranarray(y_) clf = GradientBoostingClassifier(n_estimators=100, random_state=1) clf.fit(X, y_) assert_array_equal(clf.predict(T), true_result) assert_equal(100, len(clf.estimators_))
import logging

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score


class GBClassifier:
    def __init__(self):
        """
        Initializes the gradient boosting classifier
        """
        self.header = "#gbc"
        self.clf = None
        self.learningRate = 0.1
        self.n_estimators = 100
        self.loss = "deviance"
        self.acceptedLossValues = ["deviance", "exponential"]

    def setNumberOfEstimators(self, n_estimators):
        """
        Sets the number of estimators of Gradient Boosting Classifier
        """
        self.n_estimators = n_estimators

    def setLoss(self, loss):
        """
        Sets the loss parameter for the Gradient Boosting Classifier
        """
        if loss in self.acceptedLossValues:
            self.loss = loss
        else:
            logging.warning("Error: no such loss value: %s", loss)

    def buildModel(self):
        """
        This builds the model of the Gradient Boosting Classifier
        """
        logging.info("Building Model")
        self.clf = GradientBoostingClassifier(loss=self.loss,
                                              n_estimators=self.n_estimators,
                                              learning_rate=self.learningRate)
        logging.info("Finished Building Model")

    def trainGBC(self, X, Y):
        """
        Training the Gradient Boosting Classifier
        """
        self.clf.fit(X, Y)

    def validateGBC(self, X, Y):
        """
        Validate the Gradient Boosting Classifier
        """
        YPred = self.clf.predict(X)
        print(accuracy_score(Y, YPred))

    def testGBC(self, X, Y):
        """
        Test the Gradient Boosting Classifier
        """
        YPred = self.clf.predict(X)
        print(accuracy_score(Y, YPred))
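# A minimal usage sketch for the GBClassifier wrapper above; the iris data and
# 80/20 split are assumptions, and loss="deviance" assumes an sklearn version
# that still accepts that spelling (newer releases call it "log_loss").
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=0)
gbc = GBClassifier()
gbc.setNumberOfEstimators(200)
gbc.setLoss("deviance")
gbc.buildModel()                 # instantiate the underlying sklearn estimator
gbc.trainGBC(X_train, y_train)
gbc.validateGBC(X_test, y_test)  # prints accuracy on the held-out split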
def gbc(train, test, train_target, test_target, lr=.1, n_est=100):
    clf = GradientBoostingClassifier(loss='deviance', learning_rate=lr, n_estimators=n_est)
    clf.fit(train, train_target)
    res = clf.predict(train)
    print('*************************** GBC ****************')
    print(classification_report(train_target, res))
    res1 = clf.predict(test)
    print(classification_report(test_target, res1))
    return clf
def test_degenerate_targets():
    """Check if we can fit even though all targets are equal. """
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    # classifier should raise exception
    assert_raises(ValueError, clf.fit, X, np.ones(len(X)))

    clf = GradientBoostingRegressor(n_estimators=100, random_state=1)
    clf.fit(X, np.ones(len(X)))
    clf.predict([rng.rand(2)])  # a single 2-feature sample, wrapped to stay 2-D
    assert_array_equal(np.ones((1,), dtype=np.float64),
                       clf.predict([rng.rand(2)]))
def model_color_gboost(X_train, X_test, y_train, y_test):
    # Train the model (use the arguments passed in, not module-level globals)
    clf = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80, subsample=0.80, max_depth=4)
    clf.fit(X_train, y_train)

    # Check the validity
    pred = clf.predict(X_train.toarray())
    print("Accuracy on train set: ", 100 * accuracy_score(pred, y_train))

    pred = clf.predict(X_test.toarray())
    print("Accuracy on validation: ", 100 * accuracy_score(pred, y_test))
    print(confusion_matrix(y_test, pred,
                           labels=['press-6', 'press-5', 'press-4', 'press-3', 'press-2', 'press-1']))
def predict_author(arr, yazar_features, yazar_classes):
    results = []

    print("\n[DEBUG] K-NN result (neighbors: 10)")
    knn = KNeighborsClassifier(n_neighbors=10)
    knn.fit(yazar_features, yazar_classes)
    print(knn.predict(arr))
    results.append(knn.predict(arr)[0])

    print("\n[DEBUG] SVC result (linear) (degree=3)")
    svc = svm.SVC(kernel='linear', degree=3)
    svc.fit(yazar_features, yazar_classes)
    print(svc.predict(arr))
    results.append(svc.predict(arr)[0])

    print("\n[DEBUG] Logistic Regression result ()")
    regr = linear_model.LogisticRegression()
    regr.fit(yazar_features, yazar_classes)
    print(regr.predict(arr))
    results.append(regr.predict(arr)[0])

    print("\n[DEBUG] Gaussian Naive Bayes")
    gnb = GaussianNB()
    gnb.fit(yazar_features, yazar_classes)
    print(gnb.predict(arr))
    results.append(gnb.predict(arr)[0])

    print("\n[DEBUG] Decision Tree Classifier")
    dtc = tree.DecisionTreeClassifier()
    dtc.fit(yazar_features, yazar_classes)
    print(dtc.predict(arr))
    results.append(dtc.predict(arr)[0])

    print("\n[DEBUG] Gradient Boosting Classification")
    gbc = GradientBoostingClassifier()
    gbc.fit(yazar_features, yazar_classes)
    print(gbc.predict(arr))
    results.append(gbc.predict(arr)[0])

    # output = open('features.pkl', 'wb')
    # pickle.dump(yazar_features, output)
    # output.close()
    # output = open('classes.pkl', 'wb')
    # pickle.dump(yazar_classes, output)
    # output.close()

    # test_yazar_features = []  # for test data
    # test_yazar_classes = []   # for test classes
    # # yazar_features = []     # for train data
    # # yazar_classes = []      # for train classes

    return results
def gradient_boost(x_train, x_test, y_train, y_test, rands=None):
    """
    Predict the lemons using a GradientBoostingClassifier with a random seed
    for the fraction of features used to train the data

    ARGS:

        - x_train: :class:`pandas.DataFrame` of the x_training data
        - y_train: :class:`pandas.Series` of the y_training data
        - x_test: :class:`pandas.DataFrame` of the x_testing data
        - y_test: :class:`pandas.Series` of the y_testing data
        - rands: a :class:`tuple` of the (rs, rf) to seed the sample and
          features of the classifier. If `None`, then rands are generated
          and provided in the return `Series`

    RETURNS:

        :class:`pandas.Series` of the f1-scores and random seeds
    """
    #create a dictionary for the return values
    ret_d = {'train-f1': [], 'test-f1': [], 'rs': [], 'rf': []}

    #use the randoms provided if there are any, otherwise generate them
    if not rands:
        rs = numpy.random.rand()
        rf = numpy.random.rand()
        while rf < 0.1:
            rf = numpy.random.rand()
    else:
        rs, rf = rands[0], rands[1]

    #place them into the dictionary
    ret_d['rs'], ret_d['rf'] = rs, rf

    #create and run the gradient boosting classifier
    bc = GradientBoostingClassifier(n_estimators=300, max_features=rf)
    bc.fit(x_train, y_train)
    y_hat_train = bc.predict(x_train)
    ret_d['train-f1'] = f1_score(y_train, y_hat_train)
    y_hat_test = bc.predict(x_test)
    ret_d['test-f1'] = f1_score(y_test, y_hat_test)
    return pandas.Series(ret_d)
def GB_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS): print("***************Starting Gradient Boosting***************") t0 = time() clf = GradientBoostingClassifier(n_estimators=500,learning_rate=0.01) clf.fit(X_train, Y_train) preds = clf.predict(X_cv) score = clf.score(X_cv,Y_cv) print("Gradient Boosting - {0:.2f}%".format(100 * score)) Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds), rownames=['actual'], colnames=['preds']) Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1)*100 print(Summary) #Check with log loss function epsilon = 1e-15 #ll_output = log_loss_func(Y_cv, preds, epsilon) preds2 = clf.predict_proba(X_cv) ll_output2= log_loss(Y_cv, preds2, eps=1e-15, normalize=True) print(ll_output2) print("done in %0.3fs" % (time() - t0)) preds3 = clf.predict_proba(X_test) #preds4 = clf.predict_proba((Actual_DS.ix[:,'feat_1':])) preds4 = clf.predict_proba(Actual_DS) print("***************Ending Gradient Boosting***************") return pd.DataFrame(preds2),pd.DataFrame(preds3),pd.DataFrame(preds4)
def train_gbt(filename, color, name):
    '''Train on Gradient Boosted Trees Classifier'''
    # Read data
    data2 = pd.read_csv(filename, encoding="utf-8")
    X = data2.iloc[:, 1:-1]
    y = data2.iloc[:, -1]

    # Split into train, validation and test
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define model
    clf1 = GradientBoostingClassifier(learning_rate=0.05, max_depth=5, random_state=42)

    # Fit model
    t0 = time()
    clf1.fit(X_train, y_train)
    pred_probas = clf1.predict_proba(X_val)
    predictions = clf1.predict(X_val)

    print("Score", clf1.score(X_val, y_val))

    importances = clf1.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Metrics & Plotting
    metrics[1, 0] = precision_score(y_val, predictions)
    metrics[1, 1] = recall_score(y_val, predictions)
    metrics[1, 2] = f1_score(y_val, predictions)
    metrics[1, 3] = time() - t0

    fpr_rf, tpr_rf, _ = roc_curve(y_val, predictions)
    plt.plot(fpr_rf, tpr_rf, color=color, label=name)

    return importances, indices
def gbPredict(LOSS, N_EST, L_RATE, M_DEPT, SUB_S, W_START, N_FOLD, EX_F, TRAIN_DATA_X, TRAIN_DATA_Y, TEST__DATA_X, isProb):
    # feature extraction
    ###
    clf = GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE, max_depth=M_DEPT, subsample=SUB_S, warm_start=W_START).fit(TRAIN_DATA_X, TRAIN_DATA_Y)
    ###
    extA = delFeatMin(clf.feature_importances_, EX_F)
    ###
    TRAIN_DATA_X = TRAIN_DATA_X[:, extA]
    # k-fold validation (modern KFold API: n_splits plus an explicit split() call)
    kf = KFold(n_splits=N_FOLD)
    tesV = 0.0
    for train_index, test_index in kf.split(TRAIN_DATA_X):
        X_train, X_test = TRAIN_DATA_X[train_index], TRAIN_DATA_X[test_index]
        y_train, y_test = TRAIN_DATA_Y[train_index], TRAIN_DATA_Y[test_index]
        clf = GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE, max_depth=M_DEPT, subsample=SUB_S, warm_start=W_START).fit(X_train, y_train)
        tesK = 1 - clf.score(X_test, y_test)
        tesV += tesK
    eVal = tesV / N_FOLD
    # train on all data
    clf = GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE, max_depth=M_DEPT, subsample=SUB_S, warm_start=W_START).fit(TRAIN_DATA_X, TRAIN_DATA_Y)
    TEST__DATA_X = TEST__DATA_X[:, extA]
    if isProb:
        data = clf.predict_proba(TEST__DATA_X)
    else:
        data = clf.predict(TEST__DATA_X)
    print("Eval =", eVal, "with n_esti =", N_EST, "l_rate =", L_RATE, "m_dep =", M_DEPT, "sub_s =", SUB_S, "ex_num =", EX_F, "and loss is", LOSS)
    return (data, eVal)
def test_staged_predict_proba(): # Test whether staged predict proba eventually gives # the same prediction. X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1) X_train, y_train = X[:200], y[:200] X_test, y_test = X[200:], y[200:] clf = GradientBoostingClassifier(n_estimators=20) # test raise NotFittedError if not fitted assert_raises(NotFittedError, lambda X: np.fromiter( clf.staged_predict_proba(X), dtype=np.float64), X_test) clf.fit(X_train, y_train) # test if prediction for last stage equals ``predict`` for y_pred in clf.staged_predict(X_test): assert_equal(y_test.shape, y_pred.shape) assert_array_equal(clf.predict(X_test), y_pred) # test if prediction for last stage equals ``predict_proba`` for staged_proba in clf.staged_predict_proba(X_test): assert_equal(y_test.shape[0], staged_proba.shape[0]) assert_equal(2, staged_proba.shape[1]) assert_array_almost_equal(clf.predict_proba(X_test), staged_proba)
def main():
    print("gradient boosting classifier!")
    X, Y, Xtest = importdata()
    print(Y.shape)
    param_grid = {
        "n_estimators": [10, 100, 200, 2000, 20000],
        "min_samples_split": [5, 10, 20, 50]
    }
    gb = GradientBoostingClassifier()
    Gridsearch_impl(X, Y, gb, param_grid, 5)

    # for i in range(10,11,5):
    #     clf = DecisionTreeClassifier(min_samples_split=i)
    #     rf = RandomForestClassifier(n_estimators=100, random_state=0, min_samples_split=i)
    #     ab = AdaBoostClassifier(rf, n_estimators=10)
    #     #ab = GradientBoostingClassifier(n_estimators=100)
    #     score = cross_validation.cross_val_score(ab, X, Y, cv=3)
    #     print(score)
    #     print("average score %f" % np.mean(score))
    #     print("std %f" % np.std(score))
    #     ab.fit(X, Y)

    gb.fit(X, Y)  # fit on the full training set before predicting; the grid search above tunes a clone
    Ytest = gb.predict(Xtest)
    output(Ytest, 'submit3.csv')
def main():
    """
    Use gradient boosting to classify, based on cv results
    """
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.preprocessing import StandardScaler
    import sys

    #call import_data on STDIN, returning formatted query results
    M, N, q, tset, pset = import_data(sys.stdin)

    #create features list so we can easily grab the feature fields
    features = ["F" + str(j) for j in range(1, M + 1)]

    #read in to a pandas dataframe and perform some preprocessing
    training_set = pd.DataFrame(tset).set_index('ID')
    pred_set = pd.DataFrame(pset).set_index('ID')
    scale = StandardScaler().fit(training_set[features])
    training_set[features] = scale.transform(training_set[features])
    pred_set[features] = scale.transform(pred_set[features])

    #adjust the labeling convention
    training_set['Label'] = training_set['Label'] == "+1"
    grad = GradientBoostingClassifier(n_estimators=1000, learning_rate=1.0, max_depth=1)
    grad.fit(training_set[features], training_set['Label'])

    def print_results(x):
        if x['Pred_Label'] == 1:
            print(x.name + " +1")
        else:
            print(x.name + " -1")

    pred_set['Pred_Label'] = grad.predict(pred_set[features])
    pred_set.apply(print_results, axis=1)
def fit_model():
    DATA_FILE = './data/train-set-ru-b64-utf-8.txt'
    data = []
    target = []
    with open(DATA_FILE) as df:
        for i, line in enumerate(df):
            print(i)
            line = line.strip()
            parts = line.split()
            stats_collector = StatsCollector()
            stats_collector.collect(int(parts[1]), parts[3], parts[2])
            data.append(stats_collector.get_features())
            target.append(stats_collector.get_target())
            #print(len(data[-1]))

    data = np.asarray(data, dtype=float)
    target = np.asarray(target, dtype=float)
    print(data.shape, target.shape)

    clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.07, n_estimators=300, min_samples_split=30,
                                     min_samples_leaf=15, max_depth=4)
    clf.fit(data, target)
    y_pred = clf.predict(data)
    print(f1_score(target, y_pred))
    joblib.dump(clf, 'model/model.pkl')
def classify_survivors(Y=labels, orig_test=test_data):
    X, test = featurizer()
    best_model = {'n_estimators': 20, 'learning_rate': 1.0, 'max_depth': 3}
    gbt = GradientBoostingClassifier(subsample=0.8, min_samples_leaf=50, min_samples_split=20,
                                     n_estimators=20, learning_rate=1.0, max_depth=3)
    ID_col = orig_test.loc[:, ['PassengerId']]
    print(ID_col.iloc[0:10])
    gbt.fit(X, Y)
    #print(test.iloc[0:10])
    predicted_results = gbt.predict(test)
    predicted_results = pd.DataFrame(predicted_results)
    predicted = pd.concat([ID_col, predicted_results], axis=1)
    predicted = predicted.rename(columns={0: 'Survived'})

    #Print some of the dataframe with predictions to test results
    print(predicted.iloc[0:15], '\n')
    #print(X.iloc[0:15])

    #Output result dataframe as csv
    predicted.to_csv('predicted_results.csv')
def main():
    print('[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S')))
    testing_file = open('test.p', 'rb')
    training_file = open('train.p', 'rb')

    train = pickle.load(training_file)
    test = pickle.load(testing_file)

    testing_file.close()
    training_file.close()

    trainX = train[:, :-1]
    trainy = train[:, -1]
    testX = test[:, :-1]
    testy = test[:, -1]

    print('[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'GradientBoostingClassifier(n_estimators=1000)'))
    clf = GradientBoostingClassifier(n_estimators=1000)
    clf.fit(trainX, trainy)

    print('[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S')))
    prediction = clf.predict(testX)
    print('[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction)))

    model_save_file = open('gradient_1000.p', 'wb')
    pickle.dump(clf, model_save_file)
    model_save_file.close()
    print('All done')
def cv_model():
    DATA_FILE = './data/train-set-ru-b64-utf-8.txt'
    all_data = []
    target = []
    with open(DATA_FILE) as df:
        for i, line in enumerate(df):
            print(i)
            line = line.strip()
            parts = line.split()
            stats_collector = StatsCollector()
            #print(parts[2])
            #print(base64.b64decode(parts[3]))  # .decode('utf-8')
            #print(parts[2].decode('utf-8'), parts[3].decode('utf-8'), "\n")
            stats_collector.collect(int(parts[1]), parts[3], parts[2])  # mark page url
            all_data.append(stats_collector.get_features())
            target.append(stats_collector.get_target())
            #print(all_data[-1])

    data = np.asarray(all_data, dtype=float)
    target = np.asarray(target, dtype=float)

    clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.05, n_estimators=400,
                                     min_samples_split=30, min_samples_leaf=15, max_depth=5)

    kf = KFold(n_splits=3, shuffle=True)
    for train_index, test_index in kf.split(data):
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(f1_score(y_test, y_pred))
def mse_sklearn(x_train, x_test, y_train, y_test, n_estimators):
    # NOTE: despite the name, this returns the F1 score of the fitted classifier
    clf = GradientBoostingClassifier(n_estimators=n_estimators,
                                     min_samples_leaf=MIN_SAMPLES_LEAF,
                                     max_depth=MAX_DEPTH)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    return f1_score(y_test, pred)
class Blender(BaseEstimator, ClassifierMixin): def __init__(self, trained_clfs): self.clfs = trained_clfs # self.classifier = make_pipeline(OneHotEncoder(), DenseTransformer(), # GradientBoostingClassifier()) self.classifier = GradientBoostingClassifier() # self.classifier = make_pipeline( # OneHotEncoder(), LogisticRegression(class_weight='auto')) def fit(self, data, target): # self.enc = LabelEncoder().fit(target) probs = self.transform_input(data) # self.classifier.fit(predictions, target) self.classifier.fit(probs, target) def predict(self, data): predictions = self.transform_input(data) return self.classifier.predict(predictions) def transform_input(self, data): probabilities = [clf.predict_proba(data) for clf in self.clfs] probabilities = np.array(probabilities) # features, samples = probabilities.shape n_clfs, samples, features = probabilities.shape probabilities = np.reshape(probabilities, (samples, n_clfs * features)) probabilities[np.isnan(probabilities)] = 0 return probabilities
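# Sketch of how the Blender above might be driven, assuming two already-trained
# base classifiers; the dataset and base models are illustrative assumptions.
# The base models stay frozen and only the gradient-boosting meta-model is fit.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, random_state=0)
X_base, X_blend, y_base, y_blend = train_test_split(X, y, test_size=0.5, random_state=0)
base_clfs = [RandomForestClassifier(random_state=0).fit(X_base, y_base),
             LogisticRegression(max_iter=1000).fit(X_base, y_base)]
blender = Blender(base_clfs)
blender.fit(X_blend, y_blend)        # stacks the base-model probabilities as features
print(blender.predict(X_blend[:5]))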
def get_n_fold_validation_score(self, fold=10):
    features = data.get_features()
    lables = data.get_lables()
    length = len(features)
    jump = length // fold  # integer fold size
    index = 0
    k = 0
    scores = list()
    while k < fold:
        feature_test = features.iloc[index:(index + jump), :]
        lable_test = lables.iloc[index:(index + jump), :]
        # training rows are everything before and after the test block;
        # half-open slices, so no rows are silently dropped at the boundaries
        feature_train_1, feature_train_2 = (
            features.iloc[0:index, :] if index != 0 else pd.DataFrame(),
            features.iloc[index + jump:length],
        )
        feature_train = pd.concat([feature_train_1, feature_train_2])
        lable_train_1, lable_train_2 = (
            lables.iloc[0:index, :] if index != 0 else pd.DataFrame(),
            lables.iloc[index + jump:length],
        )
        lable_train = pd.concat([lable_train_1, lable_train_2])
        index += jump
        k += 1
        classifier = GradientBoostingClassifier()
        classifier.fit(feature_train, lable_train["lable"].values)
        scores.append(accuracy_score(lable_test, classifier.predict(feature_test)))
    return sum(scores) / float(len(scores))
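# For comparison, a sketch of the same n-fold estimate using scikit-learn's
# built-in cross-validation instead of manual index slicing; `data` is assumed
# to be the same object the method above reads from.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

def get_n_fold_validation_score_sklearn(data, fold=10):
    features = data.get_features()
    lables = data.get_lables()
    scores = cross_val_score(GradientBoostingClassifier(), features,
                             lables["lable"].values, cv=fold, scoring="accuracy")
    return scores.mean()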
def final_run(X,Y,Xtest,n_est): clf = GradientBoostingClassifier(n_estimators=n_est,random_state=n_est) clf = clf.fit(X,Y) #np.savetxt('gb_oob_improve_{}'.format(n_est),clf.oob_score_) #np.savetxt('gb_train_score_{}'.format(n_est),clf.train_score_) Ytest=clf.predict(Xtest) output(Ytest,'gradient_boost_{}.csv'.format(n_est))
def classify2(dis_data, numeric_data, t_label):
    fold = 5
    skf = StratifiedKFold(n_splits=fold)
    roc_auc = 0
    f1_score_value = 0
    clf1 = LogisticRegression()
    clf2 = GradientBoostingClassifier()
    # clf3 = tree.DecisionTreeClassifier(max_depth=500, max_leaf_nodes=500, class_weight={1: 12})
    clf3 = GradientBoostingClassifier()

    for train, test in skf.split(dis_data, t_label):
        clf3 = clf3.fit(dis_data.iloc[train], t_label.iloc[train])
        #compute auc using the probability of the positive class (column 1)
        probas_ = clf3.predict_proba(dis_data.iloc[test])
        fpr, tpr, thresholds = roc_curve(t_label.iloc[test], probas_[:, 1])
        roc_auc += auc(fpr, tpr)
        #compute f1_score
        label_pred = clf3.predict(dis_data.iloc[test])
        f1_score_value += f1_score(t_label.iloc[test], label_pred, pos_label=1)

    return roc_auc / fold, f1_score_value / fold
class MyGradientBoosting(MyClassifier):
    def __init__(self):
        self.gradient_boosting = None

    def train(self, data_path='data/train.pkl', n_estimators=10, learning_rate=0.1):
        labels, instances = load_pickled_dataset(data_path)
        start_time = time.perf_counter()
        self.gradient_boosting = GradientBoostingClassifier(loss='deviance',
                                                            learning_rate=learning_rate,
                                                            n_estimators=n_estimators,
                                                            subsample=0.3,
                                                            min_samples_split=2,
                                                            min_samples_leaf=1,
                                                            max_depth=3,
                                                            init=None,
                                                            random_state=None,
                                                            max_features=None,
                                                            verbose=2)
        self.gradient_boosting.fit(instances, labels)
        end_time = time.perf_counter()
        print("STATUS: model training done. elapsed time - %d seconds" % (end_time - start_time))
        print("INFO: " + str(self.gradient_boosting))

    def predict(self, data_path='data/test.pkl'):
        labels, instances = load_pickled_dataset(data_path)
        return self.gradient_boosting.predict(instances)

    def save(self, file_path='model/gbc_model'):
        joblib.dump(self.gradient_boosting, file_path)

    def load(self, file_path='model/gbc_model'):
        self.gradient_boosting = joblib.load(file_path)

    def write_results(self, predictions):
        super(MyGradientBoosting, self).write(predictions, 'gbc_prediction.csv')
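# Hypothetical end-to-end run of the MyGradientBoosting wrapper above; the
# pickle paths are the class defaults and assume the same on-disk layout.
model = MyGradientBoosting()
model.train(data_path='data/train.pkl', n_estimators=50, learning_rate=0.05)
predictions = model.predict(data_path='data/test.pkl')
model.save('model/gbc_model')      # persist the fitted estimator with joblib
model.write_results(predictions)   # writes gbc_prediction.csv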
def plotLearningCurve(dat, lab, optim):
    '''
    This function plots the learning curve for the classifier

    Parameters:
    -----------
    dat: numpy array with all records
    lab: numpy array with class labels of all records
    optim: optimal parameters for classifier
    '''
    clf = GradientBoostingClassifier(learning_rate=optim[0], subsample=optim[1])

    # split training data into train and test (already chose optimal parameters)
    xTrain, xTest, yTrain, yTest = model_selection.train_test_split(dat, lab, test_size=0.3)

    # choose various sizes of training set to model on to generate learning curve
    szV = list(range(10, np.shape(xTrain)[0], np.shape(xTrain)[0] // 10))
    szV.append(np.shape(xTrain)[0])
    LCvals = np.zeros((len(szV), 3), dtype=np.float64)  # store data points of learning curve
    for i in range(0, len(szV)):
        clf = clf.fit(xTrain[:szV[i], :], yTrain[:szV[i]])
        LCvals[i, 0] = szV[i]
        LCvals[i, 1] = clf.score(xTest, yTest)
        LCvals[i, 2] = clf.score(xTrain[:szV[i], :], yTrain[:szV[i]])
    #print(LCvals)

    # generate figure
    fig = plt.figure(1, figsize=(10, 10))
    prop = matplotlib.font_manager.FontProperties(size=15.5)
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(LCvals[:, 0] / np.float64(np.shape(xTrain)[0]), 1.0 - LCvals[:, 1], label='Test Set')
    ax.plot(LCvals[:, 0] / np.float64(np.shape(xTrain)[0]), 1.0 - LCvals[:, 2], label='Training Set')
    ax.set_ylabel(r"Error", fontsize=20)
    ax.set_xlabel(r"% of Training Set Used", fontsize=20)
    ax.axis([0.0, 1.0, -0.1, 0.5])
    plt.legend(loc='upper right', prop=prop)
    plt.savefig('LC_GB.pdf', bbox_inches='tight')
    fig.clear()

    # where is the model failing?
    predProb = clf.predict_proba(xTest)
    tmp = np.zeros((np.shape(predProb)[0], np.shape(predProb)[1] + 2))
    tmp[:, :-2] = predProb
    tmp[:, -2] = clf.predict(xTest)
    tmp[:, -1] = yTest
    mask = tmp[:, -2] != tmp[:, -1]
    print(tmp[mask])
    print(mask.sum(), len(xTest))
    print(tmp[:50, :])
def rand_forest_train(self):
    # Read the local user feature information
    users = pd.read_csv('names.csv')

    # Use similarity, platform, reputation and entropy as features for telling humans from machines
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    y = users['human_or_machine']

    # Split the original data, holding out 25% for testing
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

    # Turn the categorical features into feature vectors
    from sklearn.feature_extraction import DictVectorizer
    vec = DictVectorizer(sparse=False)
    X_train = vec.fit_transform(X_train.to_dict(orient='records'))
    X_test = vec.transform(X_test.to_dict(orient='records'))

    # Train and predict with a single decision tree
    from sklearn.tree import DecisionTreeClassifier
    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    dtc_y_pred = dtc.predict(X_test)

    # Train and predict with a random forest classifier
    from sklearn.ensemble import RandomForestClassifier
    rfc = RandomForestClassifier()
    rfc.fit(X_train, y_train)
    rfc_y_pred = rfc.predict(X_test)

    # Train and predict with gradient boosting decision trees
    from sklearn.ensemble import GradientBoostingClassifier
    gbc = GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    gbc_y_pred = gbc.predict(X_test)

    from sklearn.metrics import classification_report

    # Report the accuracy of the single decision tree on the test set,
    # plus detailed precision / recall / F1 metrics
    print("Decision tree accuracy:", dtc.score(X_test, y_test))
    print(classification_report(dtc_y_pred, y_test))

    # Report the accuracy of the random forest classifier on the test set,
    # plus detailed precision / recall / F1 metrics
    print("Random forest accuracy:", rfc.score(X_test, y_test))
    print(classification_report(rfc_y_pred, y_test))

    # Report the accuracy of the gradient boosting trees on the test set,
    # plus detailed precision / recall / F1 metrics
    print("Gradient boosting accuracy:", gbc.score(X_test, y_test))
    print(classification_report(gbc_y_pred, y_test))

    users = pd.read_csv('values.csv')

    # Check whether the records are machines or humans
    X = users[['similarity', 'platform', 'reputation', 'entropy']]
    X = vec.transform(X.to_dict(orient='records'))
    print(rfc.predict(X))

    self.dtc = dtc
    self.rfc = rfc
    self.gbc = gbc
class Gdbc1Model: def __init__(self): self.model = GradientBoostingClassifier(max_features=0.6, learning_rate=0.05, max_depth=5, n_estimators=300) def fit(self,x,y): self.model.fit(x,y) def predict(self,X): return self.model.predict(X)
def GradientBoosting(X_train, y_train, X_test):
    #import libraries
    from sklearn.ensemble import GradientBoostingClassifier  #for classification
    from sklearn.ensemble import GradientBoostingRegressor   #for regression
    #use GBM function (the split is passed in explicitly rather than read from globals)
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    return predicted
def gradient_boosting_classify(my_train_data, my_train_label, my_test_data, estimators):
    # assumes `from sklearn.model_selection import cross_val_score`
    # (the old sklearn.cross_validation module has been removed)
    clf = GradientBoostingClassifier(n_estimators=estimators)
    scores = cross_val_score(clf, my_train_data, my_train_label, cv=5)
    print("gradient boosting(%d) accuracy: %0.3f (+/- %0.3f)" % (estimators, scores.mean(), scores.std() * 2))
    clf.fit(my_train_data, my_train_label)
    my_test_label = clf.predict(my_test_data)
    file_name = "gradient_boosting_%d.csv" % estimators
    data_storer.save_data(my_test_label, file_name)
def gradient_boosting_classifier(x_train, y_train, x_test, y_test, num_tree): model = Gbc(loss='deviance', learning_rate=0.2, n_estimators=num_tree, subsample=1.0, min_samples_split=2, min_samples_leaf=10, min_weight_fraction_leaf=0.0, max_depth=5, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False) model.fit(x_train, y_train) expected = y_test predicted = model.predict(x_test) return expected, predicted
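# Minimal usage sketch for the function above, assuming the alias
# `from sklearn.ensemble import GradientBoostingClassifier as Gbc` and an
# existing train/test split; the report step is an illustrative addition.
from sklearn.metrics import classification_report

expected, predicted = gradient_boosting_classifier(x_train, y_train, x_test, y_test, num_tree=100)
print(classification_report(expected, predicted))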
print(msg)

# GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier selected

######## Run a basic GradientBoostingClassifier ########
GBC = GradientBoostingClassifier(random_state=10)

# Scaling
steps = [('scaler', scaler), ('GBC', GBC)]

# Pipelining: fit the pipeline so the scaler is actually applied before the model
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_predict_prob = pipeline.predict_proba(X_test)[:, 1]

cm = confusion_matrix(y_test, y_pred)
print(cm)

# Print accuracy of the model
print('Score: {}'.format(accuracy_score(y_pred, y_test)))

# Generate ROC curve values: fpr, tpr, thresholds
fpr_gbc, tpr_gbc, thresholds_gbc = roc_curve(y_test, y_predict_prob)
auc_gbc = roc_auc_score(y_test, y_predict_prob)  # AUC from probabilities, not hard labels
print("AUC: ", auc_gbc)

#Score: 0.7875964036619049
#AUC: 0.7886511356295131
x_train, x_test, y_train, y_test = train_test_split(df_cp, train_Y, test_size=0.25, random_state=4)

########## model start
from sklearn.ensemble import GradientBoostingClassifier

gdbt = GradientBoostingClassifier(learning_rate=0.01)

# Train the model
gdbt.fit(x_train, y_train)

# Predict on the test set
y_pred = gdbt.predict(x_test)
y_pred_proba = gdbt.predict_proba(x_test)[:, 1]
########## model end

########## model evaluation start
from sklearn import datasets, metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

check_view = pd.DataFrame({'pred_poi': y_pred_proba, 'poi': y_test})
check_view = check_view.sort_values(by=['pred_poi'])

acc = accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
def main():
    clf = GradientBoostingClassifier(n_estimators=1000, min_samples_split=15, learning_rate=0.1, max_depth=160)
    feature_set_id = '59'
    feature_sets_file = args.feature_sets_log_file
    feature_set_dict = {}
    with open(feature_sets_file, 'r') as stream:
        feature_set_dict = yaml.safe_load(stream)
    feature_set = feature_set_dict[feature_set_id]
    engine = model_pipeline_script.get_engine()
    con = engine.connect()
    if args.prediction_table != '':
        contract_flag = True
    else:
        contract_flag = False
    contracts_data = pd.read_sql(args.training_table, engine)
    if contract_flag:
        prediction_data = pd.read_sql(args.prediction_table, engine)
    print(contracts_data.columns)

    #process training data
    contracts_data['amt_standardized'] = contracts_data['amount_standardized']
    contracts_data['contract_signing_date'] = pd.to_datetime(
        contracts_data['contract_signing_date'])

    #Subsetting on only main allegation outcomes
    train_data = contracts_data[
        (contracts_data['allegation_outcome'] == 'Substantiated') |
        (contracts_data['allegation_outcome'] == 'Unfounded') |
        (contracts_data['allegation_outcome'] == 'Unsubstantiated')]

    train_data, col_group_dict_train = model_pipeline_script.join_features(
        engine, con, contracts_data, args.train_table_id)
    col_group_dict_train, col_group_keys_train = model_pipeline_script.define_feature_sets(
        col_group_dict_train)

    if contract_flag:
        #process prediction data
        prediction_data['amt_standardized'] = prediction_data[
            'amount_standardized']
        prediction_data['contract_signing_date'] = pd.to_datetime(
            prediction_data['contract_signing_date'])
        prediction_data['allegation_category'] = args.allegation_category
        prediction_data, col_group_dict_predict = model_pipeline_script.join_features(
            engine, con, prediction_data, args.predict_table_id)
        col_group_dict_predict, col_group_keys_predict = model_pipeline_script.define_feature_sets(
            col_group_dict_predict)

    train_df = train_data[train_data['allegation_outcome'].notnull()]
    if not contract_flag:
        predict_df = train_data[train_data['allegation_outcome'].isnull()]
        predict_df.drop('allegation_outcome', axis=1, inplace=True)
    else:
        predict_df = prediction_data

    feature_set_new = []
    for feat_set in feature_set:
        if 'cntrcts_splr_ftr_set_train' in feat_set:
            feat_set = feat_set.replace(
                'cntrcts_splr_ftr_set_train',
                'cntrcts_splr_ftr_set_' + args.train_table_id)
        feature_set_new.append(feat_set)
    feature_set = feature_set_new

    df_features_train, y_train = model_pipeline_script.select_features(
        train_df, col_group_dict_train, feature_set)
    print('feat_sets:')
    if args.predict_table_id != '':
        feature_set_new = []
        for feat_set in feature_set:
            print(feat_set)
            if 'cntrcts_splr_ftr_set_' + args.train_table_id in feat_set:
                feat_set = feat_set.replace(
                    'cntrcts_splr_ftr_set_' + args.train_table_id,
                    'cntrcts_splr_ftr_set_' + args.predict_table_id)
            feature_set_new.append(feat_set)
        feature_set = feature_set_new
    print('shape: ')
    print(predict_df.shape, feature_set)

    if contract_flag:
        df_features_predict, y_predict = model_pipeline_script.select_features(
            predict_df, col_group_dict_predict, feature_set)
    else:
        df_features_predict, y_predict = model_pipeline_script.select_features(
            predict_df, col_group_dict_train, feature_set)
    print(df_features_predict.shape)

    df_to_write = df_features_train.merge(pd.DataFrame(y_train), left_index=True, right_index=True)
    df_to_write.to_csv('features_and_outcomes.csv')

    matching_cols = [
        val for val in df_features_train.columns
        if val in set(df_features_predict.columns)
    ]
    print(len(matching_cols), len(df_features_train.columns), len(df_features_predict.columns))
    df_features_train = df_features_train[matching_cols]
    df_features_predict = df_features_predict[matching_cols]

    x_train = np.array(df_features_train).astype(float)
    y_train = np.array(y_train)
    x_predict = np.array(df_features_predict).astype(float)

    print('Fitting....')
    clf.fit(x_train, y_train)
    print('Predicting...')
    y_pred = clf.predict(x_predict)
    y_proba = clf.predict_proba(x_predict).T[1]

    #code for printing out top features
    #try:
    #    print('Feature importance...')
    #    print(df_features_train.columns, df_features_train.shape)
    #    top_features = model_pipeline_script.get_feature_importance(clf, x_train, y_train, df_features_train.columns, nfeatures=50)
    #    print(top_features)
    #    #feat_idx = []
    #    #for feat in top_features:
    #    #    print(feat)
    #    # model_pipeline_script.decision_surface_plot(clf, df_features_train, y_train, top_features)
    #except IOError:
    #    pass

    #code for plotting distribution of prediction scores
    # plt.hist(y_proba, bins=30)
    # if contract_flag:
    #     plt.title('Prediction Scores on Contracts')
    # else:
    #     plt.title('Prediction Scores on Uninvestigated Complaints')
    # plt.xlabel('Prediction Score')
    # if contract_flag:
    #     plt.ylabel('Number of Contracts')
    # else:
    #     plt.ylabel('Number of Complaints')
    # plt.show()

    prediction_data = predict_df
    prediction_data['prediction_score'] = y_proba
    grouped = prediction_data[[
        'country', 'prediction_score'
    ]].groupby('country').aggregate(['mean', 'median', 'std', 'count'])
    grouped.columns = [' '.join(col).strip() for col in grouped.columns.values]
    # print(prediction_data.columns)
    # prediction_data[['country','prediction_score']].to_sql('prediction_scores_complaints_by_country_nocountryfeatures', engine, if_exists='replace')

    if contract_flag:
        output_df = prediction_data[[
            'wb_contract_number', 'fiscal_year', 'region', 'country',
            'project_id', 'project_name', 'contract_description', 'supplier',
            'borrower_contract_reference_number', 'amount', 'prediction_score'
        ]]
    else:
        output_df = prediction_data[[
            'wb_contract_number', 'fiscal_year', 'region', 'country',
            'project_id', 'project_name', 'contract_description', 'supplier',
            'borrower_contract_reference_number', 'amount',
            'allegation_category', 'prediction_score'
        ]]

    if '.csv' not in args.output_file:
        output_file = args.output_file + '.csv'
        output_table = args.output_file
    else:
        output_file = args.output_file
        output_table = re.sub(r'\.csv$', '', args.output_file)

    output_table_array = output_table.split("/")
    print(output_table_array)
    output_table = output_table_array[len(output_table_array) - 1]
    output_df.to_csv(output_file, encoding='utf-8')
    if len(output_table) > 63:
        output_table = output_table[:63]
    output_df.to_sql(output_table, engine, if_exists='replace')
# Training and testing
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

docs_train, docs_test, y_train, y_test = train_test_split(
    twitter_tfidf, twitter_train['rank'], train_size=.90, test_size=.1, random_state=685)

from sklearn.ensemble import GradientBoostingClassifier, RandomForestRegressor
clf = GradientBoostingClassifier().fit(docs_train, y_train)
y_pred = clf.predict(docs_test)

stop = timeit.default_timer()
print('Time: ', stop - start)

# from joblib import dump, load
# dump(clf, 'twitchsentiment.chatmodel')

from sklearn.metrics import accuracy_score  # explicit import instead of relying on the bare sklearn namespace
print(accuracy_score(y_test, y_pred))

#Testing
import csv

reviews_new = []
with open("../scrapedchat/a_seagull.csv", 'r', encoding='utf8') as csvFile:
    reader = csv.reader(csvFile)
    for row in reader:
df["Cabin"].fillna("N", inplace=True) df["Embarked"].fillna("N", inplace=True) titanic_encode(df, ["Cabin", "Embarked", "Sex", "Ticket"]) df.loc[df["Age"].isnull(), "Age"] = df["Age"].mean() df = df.drop(["Name", "Ticket"], axis=1) input_data = df.drop(["Survived"], axis=1) output_data = df["Survived"] model = GradientBoostingClassifier() model.fit(input_data, output_data) test = pd.read_csv("test.csv") test["Cabin"].fillna("N", inplace=True) test["Embarked"].fillna("N", inplace=True) titanic_encode(test, ["Cabin", "Embarked", "Sex", "Ticket"]) test.loc[test["Age"].isnull(), "Age"] = test["Age"].mean() test.loc[test["Fare"].isnull(), "Fare"] = test["Fare"].mean() test = test.drop(["Name", "Ticket"], axis=1) print(model.predict(test)) submit = pd.DataFrame({ "PassengerId": test["PassengerId"], "Survived": model.predict(test) }) submit.to_csv("submit.csv", index=False)
class GradientBoostingClassifier: def __init__(self, loss, learning_rate, n_estimators, subsample, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_depth, criterion, max_features, max_leaf_nodes, min_impurity_decrease, random_state=None, verbose=0, **kwargs): self.loss = loss self.learning_rate = learning_rate self.n_estimators = n_estimators self.subsample = subsample self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_depth = max_depth self.criterion = criterion self.max_features = max_features self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease self.random_state = random_state self.verbose = verbose self.estimator = None self.fully_fit_ = False def fit(self, X, y, sample_weight=None): from sklearn.ensemble import GradientBoostingClassifier # Special fix for gradient boosting! if isinstance(X, np.ndarray): X = np.ascontiguousarray(X, dtype=X.dtype) if self.estimator is None: self.learning_rate = float(self.learning_rate) self.n_estimators = int(self.n_estimators) self.subsample = float(self.subsample) self.min_samples_split = int(self.min_samples_split) self.min_samples_leaf = int(self.min_samples_leaf) self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf) if check_none(self.max_depth): self.max_depth = None else: self.max_depth = int(self.max_depth) self.max_features = float(self.max_features) if check_none(self.max_leaf_nodes): self.max_leaf_nodes = None else: self.max_leaf_nodes = int(self.max_leaf_nodes) self.min_impurity_decrease = float(self.min_impurity_decrease) self.verbose = int(self.verbose) self.estimator = GradientBoostingClassifier( loss=self.loss, learning_rate=self.learning_rate, n_estimators=self.n_estimators, subsample=self.subsample, min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, min_weight_fraction_leaf=self.min_weight_fraction_leaf, max_depth=self.max_depth, criterion=self.criterion, max_features=self.max_features, max_leaf_nodes=self.max_leaf_nodes, random_state=self.random_state, verbose=self.verbose, warm_start=True, ) self.estimator.fit(X, y, sample_weight=sample_weight) return self def configuration_fully_fitted(self): if self.estimator is None: return False return not len(self.estimator.estimators_) < self.n_estimators def predict(self, X): if self.estimator is None: raise NotImplementedError return self.estimator.predict(X) @staticmethod def get_cs(): cs = ConfigurationSpace() loss = Constant("loss", "deviance") learning_rate = UniformFloatHyperparameter( name="learning_rate", lower=0.01, upper=1, default_value=0.1, log=True) # n_estimators = UniformIntegerHyperparameter( # "n_estimators", 50, 500, default_value=100) n_estimators = Constant("n_estimators", 100) max_depth = UniformIntegerHyperparameter( name="max_depth", lower=1, upper=8, default_value=3) criterion = CategoricalHyperparameter( 'criterion', ['friedman_mse', 'mse'], default_value='mse') min_samples_split = UniformIntegerHyperparameter( name="min_samples_split", lower=2, upper=20, default_value=2) min_samples_leaf = UniformIntegerHyperparameter( name="min_samples_leaf", lower=1, upper=20, default_value=1) min_weight_fraction_leaf = UnParametrizedHyperparameter("min_weight_fraction_leaf", 0.) 
subsample = UniformFloatHyperparameter( name="subsample", lower=0.01, upper=1.0, default_value=1.0) max_features = UniformFloatHyperparameter( "max_features", 0.1, 1.0, default_value=1) max_leaf_nodes = UnParametrizedHyperparameter( name="max_leaf_nodes", value="None") min_impurity_decrease = UnParametrizedHyperparameter( name='min_impurity_decrease', value=0.0) cs.add_hyperparameters([loss, learning_rate, n_estimators, max_depth, criterion, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, subsample, max_features, max_leaf_nodes, min_impurity_decrease]) return cs
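# A hedged sketch of exercising the configuration space above: sample one
# configuration and feed its values to the wrapper (which shadows sklearn's
# class name). Assumes the ConfigSpace package plus some (X, y) arrays; on
# newer ConfigSpace versions, dict(config) replaces config.get_dictionary().
cs = GradientBoostingClassifier.get_cs()      # the wrapper defined above, not sklearn's
config = cs.sample_configuration()
clf = GradientBoostingClassifier(**config.get_dictionary())
clf.fit(X, y)
print(clf.predict(X[:5]))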
print('')

# Random forest
print('Random Forest:')
rfc = RandomForestClassifier(random_state=2018)
rfc.fit(X_train_std, y_train)
rfc_predict = rfc.predict(X_test_std)
rfc_predict_proba = rfc.predict_proba(X_test_std)[:, 1]
get_scores(y_test, rfc_predict, rfc_predict_proba)
print('')

# GBDT
print('GBDT:')
gdbt = GradientBoostingClassifier(random_state=2018)
gdbt.fit(X_train_std, y_train)
gdbt_predict = gdbt.predict(X_test_std)
gdbt_predict_proba = gdbt.predict_proba(X_test_std)[:, 1]
get_scores(y_test, gdbt_predict, gdbt_predict_proba)
print('')

# XGBoost
print('XGBoost:')
xgbs = XGBClassifier(random_state=2018)
xgbs.fit(X_train_std, y_train)
xgbs_predict = xgbs.predict(X_test_std)
xgbs_predict_proba = xgbs.predict_proba(X_test_std)[:, 1]
get_scores(y_test, xgbs_predict, xgbs_predict_proba)
print('')

# LightGBM
print('LightGBM:')
train_file[1], normalize=False) trainX, validX, trainY, validY = utils.train_test_split( trainX, trainY, 0.1) print( f'\033[32;1mtrainX: {trainX.shape}, trainY: {trainY.shape}, validX: {validX.shape}, validY: {validY.shape}\033[0m' ) if training: model = GradientBoostingClassifier( learning_rate=0.1, n_estimators=200, max_depth=3, random_state=880301) #, n_iter_no_change=10, tol=1e-4) model.fit(trainX, trainY.ravel()) utils.save_model(model_path, model) #a = model.feature_importances_[1:].reshape(-1, 9) #for i in a: # print(('%.3f '*9) % tuple(i)) else: model = utils.load_model(model_path) if test: testX = utils.load_test_data(test[0], mean, std) utils.generate_csv(model.predict(testX), test[1]) else: print( f'\033[32;1mTraining score: {model.score(trainX, trainY)}\033[0m') print( f'\033[32;1mValidaiton score: {model.score(validX, validY)}\033[0m' )
def GBC(X_train, Y_train, X_test): clf = GradientBoostingClassifier() clf.fit(X_train, Y_train) pre = clf.predict(X_test.toarray()) return pre
x = data.iloc[:, 1:] y = data['speed'] # encode string values as integer x = encodeData(x) y = mapSpeed(y) if training: x = predictInjSeverity(x) return x, y if __name__ == '__main__': # if len(sys.argv) != 3: # print("Bad argument list, enter in following form:") # print("python <script_name>.py <train_set_path> <test_set_path>") # exit() # X_train, y_train = read(sys.argv[1], True) # X_test, y_test = read(sys.argv[2]) X_train, y_train = read("./resources/train.csv", True) X_test, y_test = read("./resources/z4_test.csv") clf = GradientBoostingClassifier(n_estimators=100) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) f1 = f1_score(y_test, y_pred, average='micro') print(f1)
gb_GS.best_estimator_
# Best parameters found by the grid search:
# {'learning_rate': 0.1, 'max_depth': 20, 'max_features': 20,
#  'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
gbcs = GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                  learning_rate=0.1, loss='deviance', max_depth=20,
                                  max_features=20, max_leaf_nodes=None,
                                  min_impurity_decrease=0.0, min_impurity_split=None,
                                  min_samples_leaf=2, min_samples_split=2,
                                  min_weight_fraction_leaf=0.0, n_estimators=100,
                                  n_iter_no_change=None, presort='auto',
                                  random_state=None, subsample=1.0, tol=0.0001,
                                  validation_fraction=0.1, verbose=0, warm_start=False)
gbcs.fit(X_train, y_train)
gbcs.score(X_test, y_test)
#0.9891213389121339

predicted = gbcs.predict(X_test)

pred_probs = gbcs.predict_proba(X_test)[:, 1]
threshold = 0.5
predicted = pred_probs >= threshold
accuracy = accuracy_score(y_test, predicted)
precision = precision_score(y_test, predicted)
recall = recall_score(y_test, predicted)
dt_predictions = dt.predict(X_test) dt_data = pd.read_csv('test.csv') dt_data.insert((dt_data.shape[1]),'Survived',dt_predictions) dt_data.to_csv('Titanic_DecisionTrees.csv') """ Gradient Boost """ # Instantiate our model gb = GradientBoostingClassifier() gb.fit(X_train, Y_train) gb_predictions = gb.predict(X_test) gb_data = pd.read_csv('test.csv') gb_data.insert((gb_data.shape[1]),'Survived',gb_predictions) gb_data.to_csv('Titanic_GradientBoost.csv') """ XGBoost """ # Instantiate our model xg = XGBClassifier(learning_rate=0.02, n_estimators=750, max_depth= 3, min_child_weight= 1, colsample_bytree= 0.6, gamma= 0.0, reg_alpha= 0.001, subsample= 0.8 )
# print ("precision" , "recall", "fscore", "support") # print ("0 unrelated: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[0])) # print ("1 agree: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[1])) # print ("2 disagree: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[2])) # print ("3 discuss: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[3])) ### -------------------LogisticRegression------------------- # LogisticRegression = LogisticRegression() # LogisticRegression.fit(X_train,y_train) # y_Pred = LogisticRegression.predict(X_test) # print ("LogisticRegression") # print ("accuracy:",LogisticRegression.score(X_test,y_test)) # print ("confusion_matrix:\n",confusion_matrix(y_test, y_Pred)) # print ("precision" , "recall", "fscore", "support") # print ("0 unrelated: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[0])) # print ("1 related: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[1])) ## -------------------- GradientBoosting -------------------- GradientBoosting = GradientBoostingClassifier() GradientBoosting.fit(X_train,y_train) y_Pred = GradientBoosting.predict(X_test) print ("GradientBoosting") print ("accuracy:",GradientBoosting.score(X_test,y_test)) print ("confusion_matrix:\n",confusion_matrix(y_test, y_Pred)) print ("precision" , "recall", "fscore", "support") print ("0 unrelated: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[0])) print ("1 related: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[1])) print ("2 disagree: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[2])) print ("3 discuss: ",precision_recall_fscore_support(y_test, y_Pred, average='micro', labels=[3]))
labels_test = [] for ii in train_idx: features_train.append(features[ii]) labels_train.append(labels[ii]) for jj in test_idx: features_test.append(features[jj]) labels_test.append(labels[jj]) clf.fit(features_train, labels_train) clf2.fit(features_train, labels_train) clf3.fit(features_train, labels_train) clf4.fit(features_train, labels_train) clfvote.fit(features_train, labels_train) predictions1 = clf.predict(features_test) predictions2 = clf2.predict(features_test) predictions3 = clf3.predict(features_test) predictions4 = clf4.predict(features_test) predictions = clfvote.predict(features_test) clf_f1.append(f1_score(labels_test, predictions1)) clf2_f1.append(f1_score(labels_test, predictions2)) clf3_f1.append(f1_score(labels_test, predictions3)) clf4_f1.append(f1_score(labels_test, predictions4)) clfvote_f1.append(f1_score(labels_test, predictions)) # Added after GaussianNB() known to be best clf to evaluate for prediction, truth in zip(predictions1, labels_test): if prediction == 0 and truth == 0: true_negatives += 1 elif prediction == 0 and truth == 1:
import numpy as np import pandas as pd from sklearn.ensemble import GradientBoostingClassifier from sklearn.model_selection import train_test_split # NOTE: Make sure that the class is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=42) # Average CV score on the training set was:0.8292963321682738 exported_pipeline = GradientBoostingClassifier(learning_rate=0.5, max_depth=4, max_features=0.05, min_samples_leaf=8, min_samples_split=12, n_estimators=100, subsample=0.9500000000000001) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(
    tpot_data.index,
    stratify=tpot_data['class'].values,
    train_size=0.75,
    test_size=0.25)

result1 = tpot_data.copy()

# Perform classification with a gradient boosting classifier
gbc1 = GradientBoostingClassifier(learning_rate=0.49,
                                  max_features=1.0,
                                  min_weight_fraction_leaf=0.09,
                                  n_estimators=500,
                                  random_state=42)
gbc1.fit(result1.loc[training_indices].drop('class', axis=1).values,
         result1.loc[training_indices, 'class'].values)
result1['gbc1-classification'] = gbc1.predict(
    result1.drop('class', axis=1).values)
def pca(x, n_feature):
    mean_x = np.mean(x, 0)
    x = x - mean_x  # center without mutating the caller's array
    # eigh is the right choice for the symmetric matrix x^T x (real eigenvalues)
    eig, vec = np.linalg.eigh(np.dot(x.T, x))
    idx = np.argsort(-eig)
    W = vec[:, idx[:n_feature]]
    new_x = np.dot(x, W)
    return new_x


bone_data = pd.read_csv('all_bone_info_df.csv')
features_list = list(bone_data.columns)[1:]
features_list.remove('class_id')
features_list.remove('target')
x = bone_data[features_list]
y = bone_data[['target']]
PCA_x = pca(x.values, 10)

x_train, x_test, y_train, y_test = train_test_split(PCA_x, y, test_size=0.2, random_state=1)
gbdt = GradientBoostingClassifier(random_state=3)
gbdt.fit(x_train, y_train.values.ravel())  # ravel to a 1-D label array
y_pred = gbdt.predict(x_test)
# print(y_pred.dtype, y_test.values.dtype)
print("accuracy: %.4g" % (metrics.accuracy_score(y_test, y_pred)))
print(len(features_list))
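# Equivalent dimensionality reduction with scikit-learn's PCA instead of the
# hand-rolled eigendecomposition above; a sketch under the same 10-component
# assumption (PCA centers the data internally).
from sklearn.decomposition import PCA

pca_model = PCA(n_components=10)
PCA_x_sk = pca_model.fit_transform(x.values)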
# Load data
iris_dataset = load_iris()
data, target, target_names = iris_dataset["data"], iris_dataset[
    "target"], iris_dataset["target_names"]

# Instantiate model
model = GradientBoostingClassifier()

# Training and validation split: shuffle features and labels with the SAME
# permutation so the (x, y) pairs stay aligned
perm = np.random.permutation(len(data))
data, target = data[perm], target[perm]
train_x, train_y = data[:100], target[:100]
val_x, val_y = data[100:], target[100:]

# Train and evaluate models
model.fit(train_x, train_y)
print("MSE:", mean_squared_error(model.predict(val_x), val_y))

# Save the model and label to file
with open("/tmp/iris_model_logistic_regression.pkl", "wb") as f:
    pickle.dump(model, f)

with open("/tmp/iris_labels.json", "w") as f:
    json.dump(target_names.tolist(), f)
# __doc_train_model_end__

# __doc_define_servable_begin__
class BoostingModel:
    def __init__(self):
        with open("/tmp/iris_model_logistic_regression.pkl", "rb") as f:
            self.model = pickle.load(f)
        with open("/tmp/iris_labels.json") as f:
# training print(train_data.shape) skf = StratifiedKFold(n_splits=5) eval_result = np.zeros((train_data.shape[0], 1)) predict_label = np.zeros((predict_data.shape[0], 2)) i = 0 for train_index, eval_index in skf.split(train_data, train_label): print('start ', i) i += 1 split_train, split_train_label = train_data.iloc[ train_index], train_label.iloc[train_index] eval_data, eval_label = train_data.iloc[eval_index], train_label.iloc[ eval_index] classifier = GradientBoostingClassifier(n_estimators=500) classifier.fit(split_train, split_train_label) eval_result[eval_index] = classifier.predict(eval_data).reshape( eval_data.shape[0], 1) predict_label += classifier.predict_proba(predict_data).reshape( predict_data.shape[0], 2) # test accuracy print('ac ', accuracy_score(train_label, eval_result)) print('precision ', precision_score(train_label, eval_result)) print('recall ', recall_score(train_label, eval_result)) print('f1_score ', f1_score(train_label, eval_result)) # predict predict_label = np.argmax(predict_label, axis=1) predict_label = pd.DataFrame(predict_label, columns=['income']) predict_label = pd.DataFrame(predict_label['income'].map( lambda item: '>50K' if item == 1.0 else '<=50K'), columns=['income'])
print("start fitting")
clf2.fit(feat, la)
print("saving model")
# from sklearn.externals import joblib
# joblib.dump(clf2, '../gbdt_feat/model/gbdt.model')
# jbdt = joblib.load('../gbdt_feat/model/gbdt.model')
# for m in range(len(model.feature_importances_)):
#     if model.feature_importances_[m] > 0.05:
#         print("feature_importance", m, model.feature_importances_[m])
print("loading test data")
test = np.loadtxt(open("../gbdt_feat/gbdt2_3_class_feat_online.csv", "rb"), delimiter=",", skiprows=0)
print("predicting")
pre = clf2.predict(test)
f = open("../submit/gbdt2_3_result_classifer_online.csv", "w", newline='')  # text mode for the csv writer
write = csv.writer(f)
write.writerow(["passengercount", "WIFIAPTag", "slice10min"])
for i in range(len(pre)):
    pre_date = "2016-09-25-"
    wifiname = wifi_name_dict[int(test[i][1])]
    slice10h = 15 + int(test[i][2] - 1) // 6
    slice10m = int((test[i][2] - 1) % 6)
    pre_data = pre_date + str(slice10h) + "-" + str(slice10m)
    write.writerow([str(pre[i]), wifiname, pre_data])
f.close()
for i in range(len(clf2.feature_importances_)):
    print(clf2.feature_importances_[i])
X = train[predictors]
scaler = StandardScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=predictors)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

gbm0 = GradientBoostingClassifier(random_state=10)

#Fit the algorithm on the data
gbm0.fit(train_X, train_y)

#Predict validation set:
train_predictions = gbm0.predict(val_X)
train_predprob = gbm0.predict_proba(val_X)[:, 1]

#Perform cross-validation:
cv_score = model_selection.cross_val_score(gbm0, train_X, train_y, cv=5, scoring='roc_auc')

#Print model report:
print("\nModel Report")
print("Accuracy : %.4g" % metrics.accuracy_score(val_y.values, train_predictions))
print("AUC Score (Validation): %f" % metrics.roc_auc_score(val_y, train_predprob))
# In[ ]: my_model = XGBClassifier(n_estimators=150, learning_rate=0.25) my_model.fit(df.values, pred, verbose=True) # In[ ]: predictions = my_model.predict(test.values) col = pd.Series(predictions) final_df = pd.DataFrame({"PassengerId": c1, "Survived": col}) final_df.to_csv("XGBSub.csv", index=False) # In[ ]: final_df.sample(19) # In[ ]: gbc = GradientBoostingClassifier(n_estimators=150, learning_rate=1, max_depth=3, random_state=0).fit(df.values, pred) x = gbc.predict(test.values) c4 = pd.Series(list(x)) final_df = pd.DataFrame({"PassengerId": c1, "Survived": c4}) final_df.to_csv("GBMSub.csv", index=False) # In[ ]: # In[ ]:
vec = DictVectorizer() X = vec.fit_transform(X).toarray() import random random.seed(1) #Splitting the data for training and testing from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 1234) from sklearn.metrics import accuracy_score from sklearn.ensemble import GradientBoostingClassifier model = GradientBoostingClassifier() model.fit(X_train, y_train) predicted= model.predict(X_test) print ("Model accuracy is %.2f" % accuracy_score(predicted, y_test)) location =r"C:\Users\Latoya Clarke\Desktop\Data for Analysis\Loan Prediction\test.csv" loan_test = pd.read_csv(location) loan_test['Gender'] = loan_test['Gender'].fillna('Male') loan_test['Married'] = loan_test['Married'].fillna('Yes') loan_test['Dependents'] = loan_test['Dependents'].fillna(0) loan_test['Self_Employed'] = loan_test['Self_Employed'].fillna('No') loan_test['LoanAmount'] = loan_test['LoanAmount'].fillna(round(loan_test['LoanAmount'].mean(),1)) loan_test['Loan_Amount_Term'] = loan_test['Loan_Amount_Term'].fillna(round(loan_test['Loan_Amount_Term'].mean(),1)) loan_test['Credit_History'] = loan_test['Credit_History'].fillna(round(loan_test['Credit_History'].mean(),0)) loan_selected_1 = loan_test.drop(['Loan_ID'], axis = 1) X_1= loan_selected_1.to_dict(orient='records')
#Ensemble Classifier from sklearn.ensemble import VotingClassifier # estimators=[('gnb', gnb), ('rf', rf), ('log_reg', logreg),('decesiontree',dt),('gradientBoost',gb_clf),('gaussian',gpc)] estimators = [('decesiontree', dt), ('gradientBoost', gb_clf), ('gaussian', gpc)] ensemble = VotingClassifier(estimators, voting='hard') ensemble.fit(X_train[:100], y_train[:100]) print('Ensemble: ' + str(ensemble.score(X_test, y_test)) + "\n") # y_pred_class1 = gnb.predict(X_test) # y_pred_class2= rf.predict(X_test) # y_pred_class3 = logreg.predict(X_test) y_pred_class4 = dt.predict(X_test) y_pred_class5 = gb_clf.predict(X_test) y_pred_class6 = gpc.predict(X_test) y_test_le = le.fit_transform(y_test) # y_pred_class1_le = le.fit_transform(y_pred_class1) # y_pred_class2_le = le.fit_transform(y_pred_class2) # y_pred_class3_le = le.fit_transform(y_pred_class3) y_pred_class4_le = le.fit_transform(y_pred_class4) y_pred_class5_le = le.fit_transform(y_pred_class5) y_pred_class6_le = le.fit_transform(y_pred_class6) #GNB class1_tp = 0 class1_fn = 0 class1_fp = 0
for fold in fold_stances: ids = list(range(len(folds))) del ids[fold] X_train = np.vstack(tuple([Xs[i] for i in ids])) y_train = np.hstack(tuple([ys[i] for i in ids])) X_test = Xs[fold] y_test = ys[fold] clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True) clf.fit(X_train, y_train) predicted = [LABELS[int(a)] for a in clf.predict(X_test)] actual = [LABELS[int(a)] for a in y_test] fold_score, _ = score_submission(actual, predicted) max_fold_score, _ = score_submission(actual, actual) score = fold_score / max_fold_score print("Score for fold " + str(fold) + " was - " + str(score)) if score > best_score: best_score = score best_fold = clf #Final result: test_data is a dataframe # test_data.to_csv('answer.csv', index=False, encoding='utf-8') # From pandas library
# Each learner aims to reduce the residuals (errors) produced by the previous learner.
# The two main hyper-parameters are:
#
# - The **learning rate** (*lr*) controls over-fitting:
#   decreasing the *lr* limits the capacity of a learner to overfit the residuals, i.e.,
#   it slows down the learning speed and thus increases the **regularisation**.
#
# - The **sub-sampling fraction** controls the fraction of samples to be used for
#   fitting the learners. Values smaller than 1 lead to **Stochastic Gradient Boosting**.
#   It thus controls over-fitting by reducing variance and increasing bias.
#
# .. figure:: ../images/gradient_boosting.png
#    :width: 500
#    :alt: Gradient boosting.
#
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, subsample=0.5, random_state=0)
gb.fit(X_train, y_train)

y_pred = gb.predict(X_test)
y_prob = gb.predict_proba(X_test)[:, 1]

print("bAcc: %.2f, AUC: %.2f " % (
    metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred),
    metrics.roc_auc_score(y_true=y_test, y_score=y_prob)))
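# A small sketch of the learning-rate / regularisation trade-off described
# above: staged_predict scores the ensemble after each boosting stage, so two
# learning rates can be compared without refitting per stage. Reuses the
# X_train/X_test split from the snippet above.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score

for lr in (1.0, 0.1):
    gb_lr = GradientBoostingClassifier(n_estimators=100, learning_rate=lr,
                                       subsample=0.5, random_state=0).fit(X_train, y_train)
    staged = [balanced_accuracy_score(y_test, y_stage)
              for y_stage in gb_lr.staged_predict(X_test)]
    print("lr=%.1f: best bAcc %.2f at stage %d" % (lr, max(staged), int(np.argmax(staged)) + 1))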
if record[10] != 20 or age_of20 < 25000: training.append(list(record[0:10])) label_of_training.append(record[10]) counter = counter + 1 starttime = time.time() clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0) clf = clf.fit(training, label_of_training) endtime = time.time() TP = 0 ### correctly predicted cases whose age are below 40 FN = 0 ### cases mis-predicted age > 40 FP = 0 ### cases mis-predicted age < 40 TN = 0 ### correctly predicted cases whose age > 40 count = 0 for elem in clf.predict(testing): if label_of_testing[count] <= 40 and elem <= 40: TP = TP + 1 elif label_of_testing[count] <= 40 and elem > 40: FN = FN + 1 elif label_of_testing[count] > 40 and elem <= 40: FP = FP + 1 else: TN = TN + 1 count = count + 1 print("Accuracy Rate: ", (TN + TP)/len(testing)) print("Precision Rate: ", TP/(TP + FP)) print("Recall Rate: ", TP/(TP + FN)) print("Model Construction Time: ", (endtime - starttime), " sec")
import sys import numpy as np import pandas as pd from sklearn.ensemble import GradientBoostingClassifier if __name__ == '__main__': X_train = pd.read_csv(sys.argv[1]).values y_train = pd.read_csv(sys.argv[2]).values.reshape(-1) X_test = pd.read_csv(sys.argv[3]).values print("X_train:", X_train.shape, end=' / ') print("y_train:", y_train.shape, end=' / ') print("X_test:", X_test.shape) gbc = GradientBoostingClassifier(n_estimators=700) gbc.fit(X_train, y_train) y_predict = gbc.predict(X_test) print("y_predict:", y_predict, "/ shape:", y_predict.shape) data = np.c_[np.arange(len(y_predict)) + 1, y_predict] fo = open(sys.argv[4], 'w') fo.write(pd.DataFrame(data, columns=['id', 'label']).to_csv(index=False)) fo.close()
#grid_search.fit(X_train, Y_train) #grid_search.best_params_ # Random Forests random_forest = RandomForestClassifier(n_estimators=100) #random_forest = RandomForestClassifier(n_estimators=100, # criterion='entropy', # max_depth=10, # max_features='sqrt', # min_samples_split=5) random_forest.fit(X_train, Y_train) Y_pred_1 = random_forest.predict(X_test) #grid_2 = { "loss" : ["deviance","exponential"], # "n_estimators" : [100], # "max_features" : ['sqrt','log2',0.2,0.5,0.8]} #GB=GradientBoostingClassifier() #grid_search = sklearn.model_selection.GridSearchCV(GB, grid_2, n_jobs=-1, cv=5) #grid_search.fit(X_train, Y_train) #grid_search.best_params_ random_forest.score(X_train, Y_train) #gradient_boost = GradientBoostingClassifier(n_estimators=100,loss='exponential',max_features='log2') gradient_boost = GradientBoostingClassifier(n_estimators=100) gradient_boost.fit(X_train, Y_train) Y_pred_2 = gradient_boost.predict(X_test) gradient_boost.score(X_train, Y_train)
loss='deviance', max_depth=50, max_features=2, max_leaf_nodes=100, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=.2, n_estimators=100, presort='auto', random_state=None, subsample=1.0, verbose=1, warm_start=False) model.fit(x_train, y_train) res22 = model.predict([x_train[0]]) import RPi.GPIO as GPIO from time import sleep GPIO.setwarnings(False) GPIO.setmode(GPIO.BOARD) GPIO.setup(3, GPIO.OUT, initial=GPIO.LOW) GPIO.setup(5, GPIO.OUT, initial=GPIO.LOW) GPIO.setup(7, GPIO.OUT, initial=GPIO.LOW) if res22[0] == 0: GPIO.output(3, GPIO.HIGH) elif res22[0] == 1: GPIO.output(5, GPIO.HIGH) else: