Example #1
import random

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier

import data_io  # project-specific helper module for persisting models


def do_training(processed_train_csv_file):
    ## Processed train samples reading
    # read saved processed train samples from the given csv file
    processed_train_samples = pd.read_csv(processed_train_csv_file)

    # inf to nan
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    # nan to 0
    processed_train_samples = processed_train_samples.fillna(value=0)

    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # the samples were sorted earlier, so shuffling them here gives better results
    random.shuffle(processed_train_samples_index_lst)

    # organize new train samples and targets
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # Model training
    # 1 Random Forest Classifier

    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150,
                                           verbose=2,
                                           n_jobs=-1,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)

    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Gradient Boosting  Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150,
                                               verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting  Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2,
                                   n_jobs=-1)
    sgd_classifier.fit(features, labels)

    print("saved the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
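
For reference, a minimal sketch of how the models saved above could be loaded back and applied to a processed test csv. data_io.save_model is project-specific; the sketch assumes it persists the estimators with joblib/pickle, and predict_booking_proba and processed_test_csv_file are illustrative names, not part of the original code.

import joblib  # assumes data_io.save_model serialized the fitted estimator with joblib/pickle
import numpy as np
import pandas as pd

def predict_booking_proba(processed_test_csv_file, model_path='rf_classifier.pkl'):
    # mirror the training-time cleaning: inf -> nan -> 0
    samples = pd.read_csv(processed_test_csv_file)
    samples = samples.replace([np.inf, -np.inf], np.nan).fillna(value=0)

    classifier = joblib.load(model_path)
    # probability of booking_bool == 1 for each test row
    return classifier.predict_proba(samples.values)[:, 1]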
Example #2
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier


def GN(pth):
    # image_paths, train_labels, img_classes and test() are globals defined elsewhere in the original source file
    train_desc = np.load(pth + '/training_features.npy')

    # idf reweighting of the visual-word counts (computed here but not used further in this snippet)
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * nbr_occurences + 1)), 'float32')

    # Scaling the words
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)

    modelGN = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                         max_depth=1, random_state=0)
    modelGN.fit(train_desc, np.array(train_labels))
    joblib.dump((modelGN, img_classes, stdSlr), pth + "/gn-bof.pkl", compress=3)
    test(pth, "gn-")
Example #3
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score


def trainGBT(requestsQ, responsesQ):
    while True:
        args = requestsQ.get()
        if args[0] == 'KILL':
            break

        vectors = args[1]     
        # expected in the order of learningRate, maxTrees, minSplitSize, maxDepth
        hyperparams = args[2]

        model = GradientBoostingClassifier(learning_rate=hyperparams[0],
                                           n_estimators=hyperparams[1],
                                           min_samples_split=hyperparams[2],
                                           max_depth=hyperparams[3])

        model.fit(vectors['Xtrain'], vectors['Ytrain'])
        score = accuracy_score(vectors['Ytest'], model.predict(vectors['Xtest']))        
        responsesQ.put((model, score), True)

    return 0
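
A hedged driver sketch for the worker above, assuming a request tuple is (tag, vectors, hyperparams) with hyperparams ordered as the comment says (learning_rate, n_estimators/maxTrees, min_samples_split, max_depth); the toy data and the 'TRAIN' tag are illustrative.

from multiprocessing import Process, Queue

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    # toy data just to exercise the worker
    X, y = make_classification(n_samples=500, n_features=10, random_state=0)
    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=0)
    vectors = {'Xtrain': Xtr, 'Ytrain': ytr, 'Xtest': Xte, 'Ytest': yte}

    requestsQ, responsesQ = Queue(), Queue()
    worker = Process(target=trainGBT, args=(requestsQ, responsesQ))
    worker.start()

    # hyperparams ordered as (learning_rate, n_estimators, min_samples_split, max_depth)
    for hyperparams in [(0.1, 100, 2, 3), (0.05, 200, 10, 4)]:
        requestsQ.put(('TRAIN', vectors, hyperparams))
    results = [responsesQ.get() for _ in range(2)]  # (model, accuracy) pairs, one per request

    requestsQ.put(('KILL',))  # shut the worker loop down
    worker.join()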
Example #4
def __init__(self, verbose=1, n_estimators=200, max_depth=8, min_samples_leaf=10000):
    self.classifier = GradientBoostingClassifier(**{'verbose': verbose,
                                                    'n_estimators': n_estimators,
                                                    'max_depth': max_depth,
                                                    'min_samples_leaf': min_samples_leaf})
    self.name = "gb_n{n}_md{md}_ms{ms}".format(
        **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
    )
Example #5
class MyGradientBoostingClassifier(BaseClassifier):
    def __init__(self, verbose=1, n_estimators=200, max_depth=8, min_samples_leaf=10000):
        self.classifier = GradientBoostingClassifier(**{'verbose': verbose,
                                                        'n_estimators': n_estimators,
                                                        'max_depth': max_depth,
                                                        'min_samples_leaf': min_samples_leaf})
        self.name = "gb_n{n}_md{md}_ms{ms}".format(
            **{"n": n_estimators, "md": max_depth, "ms": min_samples_leaf}
        )

    def get_name(self):
        return self.name

    def fit(self, X, y):
        return self.classifier.fit(X, y)

    def predict_proba(self, X):
        return self.classifier.predict_proba(X)

    def get_feature_importances(self):
        return self.classifier.feature_importances_
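
A short usage sketch for the wrapper class above; the synthetic data, the smaller hyperparameters, and the assumption that BaseClassifier needs no extra constructor arguments are illustrative.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

clf = MyGradientBoostingClassifier(verbose=0, n_estimators=50, max_depth=3, min_samples_leaf=20)
clf.fit(X_train, y_train)

proba = clf.predict_proba(X_test)            # shape (n_samples, 2): P(class 0), P(class 1)
print(clf.get_name())                        # "gb_n50_md3_ms20"
print(clf.get_feature_importances()[:5])     # impurity-based importances of the wrapped model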
Example #6
# X, y and matches_count are defined earlier in the original script (not shown here)
features_data_count = X.count()
missing = features_data_count[features_data_count < matches_count]
missing = missing.apply(lambda x: "missing {} of {}".format(matches_count - x, matches_count))
print(missing)


X = X.fillna(0)


# ===================== GradientBoosting ==============================

size = 0
score = 0
for forest_size in [10, 20, 30, 50, 150, 300]:
    start_time = datetime.datetime.now()
    clf = GradientBoostingClassifier(n_estimators=forest_size)
    k_folder = KFold(X.shape[0], n_folds=5, shuffle=True)
    scores = cross_val_score(clf, X=X, y=y, cv=k_folder, scoring='roc_auc')
    current_score = np.mean(scores)
    print("for {} trees mean score has been {} and time elapsed {}".format(forest_size, current_score, datetime.datetime.now() - start_time))
    if score < current_score:
        score = current_score
        size = forest_size

print("best score was for {} forest size: {}".format(size, score))

# ===================LogisticRegression=================
features = X
def train_logistic(features, target, label):
    scaler = StandardScaler()
    features = scaler.fit_transform(features)
Example #7
# <codecell>

df2 = df[selected]

# <codecell>

X, y = shuffle(df2[possible_features], df2.bad)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# <codecell>

# note: LogOddsEstimator and loss='bdeviance' come from an old scikit-learn API;
# current releases spell the binary log-loss as loss='log_loss' and no longer expose LogOddsEstimator
params = {'init': LogOddsEstimator(), 'n_estimators': 5, 'max_depth': 6, 'learning_rate': 0.1, 'loss': 'bdeviance'}
clf = GradientBoostingClassifier(**params)

# <codecell>

clf = clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

# <codecell>

clf.feature_importances_

# <codecell>

print "Mean Squared Error"
mse = mean_squared_error(y_test, predicted)
print("MSE: %.4f" % mse)
Example #8
def trainModel(param, feat_folder, feat_name):
    # read data from folder
    print('now we read data from folder: %s' % feat_folder)

    # start cross-validation
    print('now we need to generate the cross-validation runs')
    accuracy_cv = []

    for i in range(0, 2):
        print('this is run %d of the cross-validation' % (i + 1))
        testIndex = loadCVIndex("%s/test.run%d.txt"%("../data/feat/combine",(i+1)))
        # if we use xgboost to train the model, we need the svmlight format
        if param['task'] in ['regression']:
            # with xgb the CV splits were dumped to svmlight files, so we read them back here
            train_data = xgb.DMatrix("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            valid_data = xgb.DMatrix("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            watchlist = [(train_data,'train'),(valid_data,'valid')]
            bst = xgb.train(param,train_data,int(param['num_round']),watchlist)
            pred = bst.predict(valid_data)
        
        elif param['task'] in ['clf_skl_lr']:
            train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            train_data  = train_data.tocsr()
            test_data = test_data.tocsr()
            clf = LogisticRegression()
            clf.fit(train_data,train_label)
            pred = clf.predict(test_data)
        
        elif param['task'] == "reg_skl_rf":
                    ## regression with sklearn random forest regressor
                    train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
                    test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
                    rf = RandomForestRegressor(n_estimators=param['n_estimators'],
                                               max_features=param['max_features'],
                                               n_jobs=param['n_jobs'],
                                               random_state=param['random_state'])
                    rf.fit(train_data, test_label)
                    pred = rf.predict(test_data)
        
        elif param['task'] == "reg_skl_etr":
                    ## regression with sklearn extra trees regressor
                    train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
                    test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
                    etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                                              max_features=param['max_features'],
                                              n_jobs=param['n_jobs'],
                                              random_state=param['random_state'])
                    etr.fit(train_data,test_label)
                    pred = etr.predict(test_data)
                    
        elif param['task'] in ['reg_skl_gbm'] :
            train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            # feat_names.remove('cid')  # leftover from the DataFrame-based variant (commented block below); feat_names is undefined in this branch
            gbm.fit(train_data,train_label)
            pred = gbm.predict(test_data) 
        
        elif param['task'] in ['reg_skl_ridge']:
            train_data,train_label = load_svmlight_file("%s/run%d/train.svm.txt"%(feat_folder,(i+1)))
            test_data,test_label = load_svmlight_file("%s/run%d/test.svm.txt"%(feat_folder,(i+1)))
            train_data  = train_data.tocsr()
            test_data = test_data.tocsr()
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data,train_label)
            
            predraw = ridge.predict(test_data)
            print(predraw)
            predrank = predraw.argsort().argsort()
            trainIndex = loadCVIndex("%s/train.run%d.txt"%("../data/feat/combine",(i+1)))
            cdf = creatCDF(train, trainIndex)
            pred = getScore(predrank,cdf)
            print(pred)
            
        """
        elif param['task'] in ['regression']:
            
            
        
        elif param['task'] in ['reg_skl_gbm'] :
            gbm = GradientBoostingClassifier(n_estimators=int(param['n_estimators']),
                                             learning_rate=param['learning_rate'],
                                             max_features=param['max_features'],
                                             max_depth=param['max_depth'],
                                             subsample=param['subsample'],
                                             random_state=param['random_state'])
            feat_names.remove('cid')
            gbm.fit(train_data[feat_names],train_data['cid'])
            pred = gbm.predict(valid_data[feat_names])
        elif param['task'] in ['reg_skl_ridge']:
            feat_names.remove('cid')
            ridge = Ridge(alpha=param["alpha"], normalize=True)
            ridge.fit(train_data[feat_names],train_data['cid'])
            pred = ridge.predict(valid_data[feat_names])
        """
        # now we use the accuracy to evaluate the model
        acc = accuracy_model(pred, train.iloc[testIndex]['cid'])
        print("the model accuracy: %s" % acc)
        accuracy_cv.append(acc)

    # here we compute the mean and std of the cross-validation accuracy
    accuracy_cv_mean = np.mean(accuracy_cv)
    accuracy_cv_std = np.std(accuracy_cv)
    print('the mean cv accuracy: %.6f' % accuracy_cv_mean)
    return {'loss':-accuracy_cv_mean,'attachments':{'std':accuracy_cv_std},'status': STATUS_OK}
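
The dictionary returned above ({'loss': -accuracy_cv_mean, 'status': STATUS_OK, ...}) is the objective format hyperopt expects, so trainModel is presumably driven by fmin. A hedged sketch of such a driver; the search space, max_evals and the feat_folder/feat_name values are illustrative, not the project's real configuration.

import numpy as np
from hyperopt import fmin, tpe, hp, Trials

# illustrative search space for the 'reg_skl_gbm' branch of trainModel
space = {
    'task': 'reg_skl_gbm',
    'n_estimators': hp.quniform('n_estimators', 100, 500, 50),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
    'max_features': hp.uniform('max_features', 0.5, 1.0),
    'max_depth': hp.choice('max_depth', [3, 4, 5, 6, 7, 8]),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'random_state': 2015,
}

feat_folder, feat_name = "../data/feat/combine", "combine"  # illustrative values

trials = Trials()
best = fmin(fn=lambda param: trainModel(param, feat_folder, feat_name),
            space=space, algo=tpe.suggest, max_evals=50, trials=trials)
print('best hyperparameters: %s' % best)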
Example #9
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas
import output_coursera as coursera
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
# import from the public sklearn.ensemble path rather than the private submodules
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

data = pandas.read_csv('gbm-data.csv')
X = data[data.columns[1:]].values
y = data[data.columns[0]].values


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=241)

clf = GradientBoostingClassifier(n_estimators=250, random_state=241, verbose=True)


sigmoid = np.vectorize(lambda x: (1 / (1 + math.exp(-x))))


coursera.output('overfitting.txt', 'overfitting')

losses = {}
def plot_score(test_predictions, y_test, train_predictions, y_train, color, learning_rate):
    test_loss = [log_loss(y_test, pred) for pred in test_predictions]
    train_loss = [log_loss(y_train, pred) for pred in train_predictions]

    plt.plot(test_loss, color, linewidth=2)
    plt.plot(train_loss, color+'--', linewidth=2)
    losses[learning_rate] = test_loss
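
plot_score above expects one predicted-probability vector per boosting stage, which is exactly what staged_decision_function run through the sigmoid gives; a minimal sketch of the loop this helper appears to be written for (the learning rates and line colors are illustrative):

for learning_rate, color in zip([1, 0.5, 0.3, 0.2, 0.1], ['r', 'g', 'b', 'm', 'k']):
    clf = GradientBoostingClassifier(n_estimators=250, learning_rate=learning_rate,
                                     random_state=241, verbose=False)
    clf.fit(X_train, y_train)
    # one probability vector per boosting iteration, for the test and train sets
    test_predictions = [sigmoid(scores).ravel() for scores in clf.staged_decision_function(X_test)]
    train_predictions = [sigmoid(scores).ravel() for scores in clf.staged_decision_function(X_train)]
    plot_score(test_predictions, y_test, train_predictions, y_train, color, learning_rate)

plt.show()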