def do_training(processed_train_csv_file):
    ## Processed train samples reading
    # read saved processed train samples from the given csv file
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    # replace inf with nan, then nan with 0
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)

    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # the samples were sorted earlier; shuffling them here gives better results
    random.shuffle(processed_train_samples_index_lst)

    # organize new train samples and targets
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # Model training
    # 1 Random Forest Classifier
    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2,
                                           n_jobs=-1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Training Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2,
                                               learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("Training SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print("Saving the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
def GN(pth):
    # load the precomputed bag-of-features training descriptors
    # (image_paths, train_labels, img_classes and test are expected to be
    # defined at module level by the surrounding script)
    train_desc = np.load(pth + '/training_features.npy')
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * nbr_occurences + 1)), 'float32')
    # Scaling the words
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)
    modelGN = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                         max_depth=1, random_state=0)
    modelGN.fit(train_desc, np.array(train_labels))
    joblib.dump((modelGN, img_classes, stdSlr), pth + "/gn-bof.pkl", compress=3)
    test(pth, "gn-")
def trainGBT(requestsQ, responsesQ):
    while True:
        args = requestsQ.get()
        if args[0] == 'KILL':
            break
        vectors = args[1]
        # hyperparameters expected in the order: learningRate, maxTrees, minSplitSize, maxDepth
        hyperparams = args[2]
        model = GradientBoostingClassifier(learning_rate=hyperparams[0],
                                           n_estimators=hyperparams[1],
                                           min_samples_split=hyperparams[2],
                                           max_depth=hyperparams[3])
        model.fit(vectors['Xtrain'], vectors['Ytrain'])
        score = accuracy_score(vectors['Ytest'], model.predict(vectors['Xtest']))
        responsesQ.put((model, score), True)
    return 0
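# A minimal usage sketch (not from the original source), assuming trainGBT runs as a worker
# process fed by multiprocessing queues; Xtr/ytr/Xte/yte are hypothetical train/test splits.
from multiprocessing import Process, Queue

requestsQ, responsesQ = Queue(), Queue()
worker = Process(target=trainGBT, args=(requestsQ, responsesQ))
worker.start()

vectors = {'Xtrain': Xtr, 'Ytrain': ytr, 'Xtest': Xte, 'Ytest': yte}  # hypothetical splits
# hyperparameters in the order the worker expects: learningRate, maxTrees, minSplitSize, maxDepth
requestsQ.put(('TRAIN', vectors, [0.1, 100, 2, 3]))
model, score = responsesQ.get()

requestsQ.put(('KILL',))  # sentinel that stops the worker loop
worker.join()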
def __init__(self, verbose=1, n_estimators=200, max_depth=8, min_samples_leaf=10000):
    self.classifier = GradientBoostingClassifier(**{'verbose': verbose,
                                                    'n_estimators': n_estimators,
                                                    'max_depth': max_depth,
                                                    'min_samples_leaf': min_samples_leaf})
    self.name = "gb_n{n}_md{md}_ms{ms}".format(**{"n": n_estimators,
                                                  "md": max_depth,
                                                  "ms": min_samples_leaf})
class MyGradientBoostingClassifier(BaseClassifier):

    def __init__(self, verbose=1, n_estimators=200, max_depth=8, min_samples_leaf=10000):
        self.classifier = GradientBoostingClassifier(**{'verbose': verbose,
                                                        'n_estimators': n_estimators,
                                                        'max_depth': max_depth,
                                                        'min_samples_leaf': min_samples_leaf})
        self.name = "gb_n{n}_md{md}_ms{ms}".format(**{"n": n_estimators,
                                                      "md": max_depth,
                                                      "ms": min_samples_leaf})

    def get_name(self):
        return self.name

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self):
        return self.classifier.feature_importances_
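# A minimal sketch of how the wrapper might be used; X_train, y_train and X_test are
# hypothetical arrays not defined in the original snippet.
clf = MyGradientBoostingClassifier(n_estimators=200, max_depth=8, min_samples_leaf=10000)
print(clf.get_name())            # e.g. "gb_n200_md8_ms10000"
clf.fit(X_train, y_train)
proba = clf.predict_proba(X_test)
importances = clf.get_feature_importances()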
features_data_count = X.count()
missing = features_data_count[features_data_count < matches_count]
missing = missing.apply(lambda x: "missing {} of {}".format(matches_count - x, matches_count))
print(missing)
X = X.fillna(0)

# ===================== GradientBoosting ==============================
size = 0
score = 0
for forest_size in [10, 20, 30, 50, 150, 300]:
    start_time = datetime.datetime.now()
    clf = GradientBoostingClassifier(n_estimators=forest_size)
    k_folder = KFold(X.shape[0], n_folds=5, shuffle=True)
    scores = cross_val_score(clf, X=X, y=y, cv=k_folder, scoring='roc_auc')
    current_score = np.mean(scores)
    print("for {} trees the mean score was {} and time elapsed {}".format(
        forest_size, current_score, datetime.datetime.now() - start_time))
    if score < current_score:
        score = current_score
        size = forest_size
print("best score was for forest size {}: {}".format(size, score))

# =================== LogisticRegression =================
features = X

def train_logistic(features, target, label):
    scaler = StandardScaler()
    features = scaler.fit_transform(features)
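# A hedged sketch of how a helper like train_logistic might continue (its original body is
# truncated above); it mirrors the GradientBoosting block by scoring LogisticRegression with
# 5-fold ROC AUC. The C grid, prints and return value are assumptions, not the original code.
def train_logistic_sketch(features, target, label):
    scaler = StandardScaler()
    features = scaler.fit_transform(features)
    best_c, best_score = None, 0
    for c in [0.01, 0.1, 1.0, 10.0]:
        clf = LogisticRegression(C=c)
        k_folder = KFold(features.shape[0], n_folds=5, shuffle=True)
        current_score = np.mean(cross_val_score(clf, X=features, y=target,
                                                cv=k_folder, scoring='roc_auc'))
        print("{}: C={} gives mean ROC AUC {}".format(label, c, current_score))
        if current_score > best_score:
            best_score, best_c = current_score, c
    return best_c, best_score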
# <codecell>
df2 = df[selected]

# <codecell>
X, y = shuffle(df2[possible_features], df2.bad)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# <codecell>
params = {'init': LogOddsEstimator(), 'n_estimators': 5, 'max_depth': 6,
          'learning_rate': 0.1, 'loss': 'bdeviance'}
clf = GradientBoostingClassifier(**params)

# <codecell>
clf = clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

# <codecell>
clf.feature_importances_

# <codecell>
print("Mean Squared Error")
mse = mean_squared_error(y_test, predicted)
print("MSE: %.4f" % mse)
def trainModel(param, feat_folder, feat_name):
    # read data from folder
    print('now we read data from folder: %s' % feat_folder)
    # start cross-validation
    print('now we need to generate cross_validation')
    accuracy_cv = []
    for i in range(0, 2):
        print('this is run %d of the cross-validation' % (i + 1))
        testIndex = loadCVIndex("%s/test.run%d.txt" % ("../data/feat/combine", (i + 1)))
        # if we use xgboost to train the model, we need the svmlight format
        if param['task'] in ['regression']:
            # with xgb we dump the file per CV run and read it back
            train_data = xgb.DMatrix("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            valid_data = xgb.DMatrix("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            watchlist = [(train_data, 'train'), (valid_data, 'valid')]
            bst = xgb.train(param, train_data, int(param['num_round']), watchlist)
            pred = bst.predict(valid_data)
        elif param['task'] in ['clf_skl_lr']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            clf = LogisticRegression()
            clf.fit(train_data, train_label)
            pred = clf.predict(test_data)
        elif param['task'] == "reg_skl_rf":
            ## regression with sklearn random forest regressor
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                       max_features=param['max_features'],
                                       n_jobs=param['n_jobs'],
                                       random_state=param['random_state'])
            rf.fit(train_data, train_label)
            pred = rf.predict(test_data)
        elif param['task'] == "reg_skl_etr":
            ## regression with sklearn extra trees regressor
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                      max_features=param['max_features'],
                                      n_jobs=param['n_jobs'],
                                      random_state=param['random_state'])
            etr.fit(train_data, train_label)
            pred = etr.predict(test_data)
        elif param['task'] in ['reg_skl_gbm']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            gbm.fit(train_data, train_label)
            pred = gbm.predict(test_data)
        elif param['task'] in ['reg_skl_ridge']:
            train_data, train_label = load_svmlight_file("%s/run%d/train.svm.txt" % (feat_folder, (i + 1)))
            test_data, test_label = load_svmlight_file("%s/run%d/test.svm.txt" % (feat_folder, (i + 1)))
            train_data = train_data.tocsr()
            test_data = test_data.tocsr()
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data, train_label)
            predraw = ridge.predict(test_data)
            print(predraw)
            predrank = predraw.argsort().argsort()
            trainIndex = loadCVIndex("%s/train.run%d.txt" % ("../data/feat/combine", (i + 1)))
            cdf = creatCDF(train, trainIndex)
            pred = getScore(predrank, cdf)
            print(pred)
        """
        elif param['task'] in ['regression']:
        elif param['task'] in ['reg_skl_gbm']:
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            feat_names.remove('cid')
            gbm.fit(train_data[feat_names], train_data['cid'])
            pred = gbm.predict(valid_data[feat_names])
        elif param['task'] in ['reg_skl_ridge']:
            feat_names.remove('cid')
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data[feat_names], train_data['cid'])
            pred = ridge.predict(valid_data[feat_names])
        """
        # now we use the accuracy to evaluate the model
        acc = accuracy_model(pred, train.iloc[testIndex]['cid'])
        print("the model accuracy: %s" % acc)
        accuracy_cv.append(acc)
    # aggregate the cross-validation accuracy
    accuracy_cv_mean = np.mean(accuracy_cv)
    accuracy_cv_std = np.std(accuracy_cv)
    print('the mean accuracy: %.6f' % accuracy_cv_mean)
    return {'loss': -accuracy_cv_mean,
            'attachments': {'std': accuracy_cv_std},
            'status': STATUS_OK}
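# A minimal sketch (an assumption, not from the original repo) of driving trainModel with
# hyperopt, since it already returns the {'loss', 'status', 'attachments'} dict that fmin
# expects. The search space below is hypothetical, and feat_folder / feat_name are assumed
# to be defined by the surrounding script.
from hyperopt import fmin, tpe, hp, Trials

space = {
    'task': 'reg_skl_gbm',
    'n_estimators': hp.choice('n_estimators', [100, 200, 300, 500]),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'max_features': hp.uniform('max_features', 0.5, 1.0),
    'max_depth': hp.choice('max_depth', [3, 4, 5, 6, 7, 8]),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'random_state': 2015,
}

trials = Trials()
best = fmin(lambda param: trainModel(param, feat_folder, feat_name),
            space, algo=tpe.suggest, max_evals=50, trials=trials)
print(best)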
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas

import output_coursera as coursera
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.ensemble.forest import RandomForestClassifier

data = pandas.read_csv('gbm-data.csv')
X = data[data.columns[1:]].values
y = data[data.columns[0]].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

clf = GradientBoostingClassifier(n_estimators=250, random_state=241, verbose=True)
sigmoid = np.vectorize(lambda x: 1 / (1 + math.exp(-x)))

coursera.output('overfitting.txt', 'overfitting')

looses = {}

def plot_score(test_predictions, y_test, train_predictions, y_train, color, learning_rate):
    test_loss = [log_loss(y_test, pred) for pred in test_predictions]
    train_loss = [log_loss(y_train, pred) for pred in train_predictions]
    plt.plot(test_loss, color, linewidth=2)
    plt.plot(train_loss, color + '--', linewidth=2)
    looses[learning_rate] = test_loss
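# A possible continuation (a sketch, not part of the original snippet): fit the classifier
# for each learning rate, turn the staged decision-function scores into probabilities with
# the sigmoid defined above, and hand them to plot_score. The learning-rate list and colors
# are assumptions.
for color, learning_rate in [('r', 1.0), ('g', 0.5), ('b', 0.3), ('y', 0.2), ('k', 0.1)]:
    clf = GradientBoostingClassifier(n_estimators=250, random_state=241,
                                     verbose=True, learning_rate=learning_rate)
    clf.fit(X_train, y_train)
    # staged_decision_function yields the raw scores after each boosting iteration
    test_predictions = [sigmoid(raw.ravel()) for raw in clf.staged_decision_function(X_test)]
    train_predictions = [sigmoid(raw.ravel()) for raw in clf.staged_decision_function(X_train)]
    plot_score(test_predictions, y_test, train_predictions, y_train, color, learning_rate)
plt.show()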