def train_model(train, model=Model.DECISION_TREE, seed=None):
    print("Training model using regressor: {}".format(model.name))
    train_dropped = train.drop('unit_sales', axis=1)
    target = train['unit_sales']

    if model == Model.RANDOM_FOREST:
        params = {'n_estimators': 10}
        clf = ensemble.RandomForestRegressor(random_state=seed, **params)
    elif model == Model.ADABOOST:
        params = {'n_estimators': 50, 'learning_rate': 1.0, 'loss':'linear'}
        clf = ensemble.AdaBoostRegressor(random_state=seed, **params)
    elif model == Model.GRADIENT_BOOST:
        params = {'n_estimators': 200, 'max_depth': 4}
        clf = ensemble.GradientBoostingRegressor(random_state=seed, **params)
    else:
        params = {'criterion': 'mse'}  # renamed to 'squared_error' in newer scikit-learn
        clf = tree.DecisionTreeRegressor(random_state=seed, **params)

    trained_model = clf.fit(train_dropped, target)
    return trained_model, params
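A minimal usage sketch of train_model, assuming a hypothetical Model enum (the excerpt references one but does not show it) and a toy pandas DataFrame with a unit_sales target column:

import enum

import pandas as pd
from sklearn import ensemble, tree


class Model(enum.Enum):  # hypothetical stand-in for the enum the excerpt assumes
    DECISION_TREE = 'decision_tree'
    RANDOM_FOREST = 'random_forest'
    ADABOOST = 'adaboost'
    GRADIENT_BOOST = 'gradient_boost'


df = pd.DataFrame({'feature': [1.0, 2.0, 3.0, 4.0],
                   'unit_sales': [10.0, 21.0, 29.0, 40.0]})
trained, params = train_model(df, model=Model.ADABOOST, seed=42)
print(params)  # {'n_estimators': 50, 'learning_rate': 1.0, 'loss': 'linear'}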
Example No. 2
    def regression_analysis(self):

        tmp = dict()
        # linear models
        tmp['logic'] = feature_selection.RFECV(lm.LinearRegression(), cv=5, n_jobs=self.n_jobs).fit(self.x, self.y).ranking_
        tmp['ridge'] = feature_selection.RFECV(lm.Ridge(), cv=5, n_jobs=self.n_jobs).fit(self.x, self.y).ranking_
        tmp['SGD'] = feature_selection.RFECV(lm.SGDRegressor(), cv=5, n_jobs=self.n_jobs).fit(self.x, self.y).ranking_
        tmp['lm_svm'] = feature_selection.RFECV(svm.LinearSVR(), cv=5, n_jobs=self.n_jobs).fit(self.x, self.y).ranking_

        # non-linear models
        tmp['ADABoost'] = feature_selection.RFECV(ensemble.AdaBoostRegressor(), cv=5, n_jobs=self.n_jobs).fit(self.x, self.y).ranking_
        tmp['RandomForest'] = feature_selection.RFECV(ensemble.RandomForestRegressor(), cv=5, n_jobs=self.n_jobs).fit(self.x, self.y).ranking_

        # statistical tests
        fscore = feature_selection.f_regression(self.x, self.y)
        tmp['f_score'] = fscore[0]
        tmp['f_pval'] = fscore[1]
        tmp['MIC'] = feature_selection.mutual_info_regression(self.x, self.y)  # mutual information, despite the 'MIC' key

        return tmp
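For reference, RFECV.ranking_ assigns 1 to every feature kept by the final selection and larger integers to features eliminated earlier. A standalone sketch on synthetic data (independent of the class above):

import numpy as np
from sklearn import feature_selection
from sklearn import linear_model as lm

rng = np.random.RandomState(0)
x = rng.rand(200, 5)
y = 3.0 * x[:, 0] + 2.0 * x[:, 1] + 0.01 * rng.rand(200)  # only the first two features matter
ranking = feature_selection.RFECV(lm.LinearRegression(), cv=5).fit(x, y).ranking_
print(ranking)  # informative features receive rank 1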
Example No. 3
def test_AdaBoostRegressor_learning_rate(*data):
    x_train, x_test, y_train, y_test = data
    learning_rates = np.linspace(0.01, 1)
    fig = plt.figure()
    train_scores = []
    test_scores = []
    ax = fig.add_subplot(1, 1, 1)
    for learning_rate in learning_rates:
        regr = ensemble.AdaBoostRegressor(learning_rate=learning_rate)
        regr.fit(x_train, y_train)
        train_scores.append(regr.score(x_train, y_train))
        test_scores.append(regr.score(x_test, y_test))
    ax.plot(learning_rates, train_scores, label="train_score")
    ax.plot(learning_rates, test_scores, label="test_score")
    ax.set_xlabel("learning_rate")
    ax.set_ylabel("score")
    ax.set_title("AdaBoostRegression")
    ax.legend(loc='best')
    ax.set_ylim(0, 1)
    plt.show()
Example No. 4
def test_AdaBoostRegressor(*data):
    x_train, x_test, y_train, y_test = data
    regr = ensemble.AdaBoostRegressor()
    regr.fit(x_train, y_train)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    estimators_num = len(regr.estimators_)
    x = range(1, estimators_num + 1)
    ax.plot(list(x),
            list(regr.staged_score(x_train, y_train)),
            label="train_score")
    ax.plot(list(x),
            list(regr.staged_score(x_test, y_test)),
            label="test_score")
    ax.set_xlabel("estimators_num")
    ax.set_ylabel("score")
    ax.set_ylim(0, 1)
    ax.legend(loc='best')
    ax.set_title("AdaBoost_regression")
    plt.show()
Example No. 5
import numpy as np
from sklearn import ensemble, metrics, neural_network
from sklearn import linear_model as sklinear
from sklearn.model_selection import train_test_split


def cross_validate_model(X_train, Y_train):
    """
    Cross-validate several models and return the index of the best one.
    """
    # Split the data into training and testing halves
    x_train, x_test, y_train, y_test = train_test_split(X_train,
                                                        Y_train,
                                                        test_size=0.5,
                                                        random_state=42)

    # The regression methods to compare
    clf_random_forest = ensemble.RandomForestRegressor(n_estimators=50)
    clf_adaboost_reg = ensemble.AdaBoostRegressor(n_estimators=50)
    clf_lasso_larscv = sklinear.LassoLarsCV(cv=9)
    clf_ridge = sklinear.RidgeCV()
    clf_elastic_net = sklinear.ElasticNet()
    clf_extra_tree = ensemble.ExtraTreesRegressor(n_estimators=50)
    clf_mlpr = neural_network.MLPRegressor(solver='adam')

    # Collect the methods in a list, which is more amenable to looping
    methods = [
        clf_random_forest, clf_adaboost_reg, clf_lasso_larscv, clf_ridge,
        clf_elastic_net, clf_extra_tree, clf_mlpr
    ]
    methods_label = [
        'clf_random_forest', 'clf_adaboost_reg', 'clf_lasso_larscv',
        'clf_ridge', 'clf_elastic_net', 'clf_extra_tree', 'clf_mlpr'
    ]

    method_mse = np.zeros((len(methods), 1))
    # Fit and predict with each method
    for i in range(len(methods)):
        methods[i].fit(x_train, y_train)
        method_predict = methods[i].predict(x_test)
        method_mse[i] = metrics.mean_squared_error(y_test, method_predict)
        print('MSE for %s during cross-validation: %f' %
              (methods_label[i], method_mse[i]))

    # Return the index of the method with the minimum MSE
    return np.argmin(method_mse)
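A toy invocation; the returned value is an index into the function's internal methods list (random data here, purely illustrative):

import numpy as np

X = np.random.rand(200, 6)
y = np.random.rand(200)
best_index = cross_validate_model(X, y)
print('Index of the method with the lowest MSE:', best_index)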
Example No. 6
def test_AdaBoostRegressor(*data):
    '''
    Test the regressor as the number of base regressors grows.
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    regr = ensemble.AdaBoostRegressor()
    regr.fit(X_train, y_train)
    ## graph
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    estimators_num = len(regr.estimators_)
    X = range(1, estimators_num + 1)
    ax.plot(list(X), list(regr.staged_score(X_train, y_train)), label="Training score")
    ax.plot(list(X), list(regr.staged_score(X_test, y_test)), label="Testing score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("AdaBoostRegressor")
    plt.show()
Example No. 7
    def fit_and_predict(self):
        """
        Insert your models here to do the work of predicting on the training set.
        """

        lifted_train_data, lifted_test_data = self.lift_data(
            self.X_tr, self.test_set)
        lifted_train_data, lifted_test_data = self.standardize_data(
            lifted_train_data, lifted_test_data)
        self.X_tr, self.test_set = self.standardize_data(
            self.X_tr, self.test_set)

        # Some reasonable models to try (not quite as strong as GradientBoostingRegressor):
        # multi_Ada, multi_ridge_chain, and tree. All of them predict latitude and
        # longitude at the same time rather than one after the other; wrap a base model
        # in multioutput.MultiOutputRegressor() or multioutput.RegressorChain() to get
        # multiple outputs at once. Tree models don't need the lifted data; only ridge
        # should be fit on lifted data. multi_grad is a GradientBoostingRegressor that
        # does fairly well.
        ridge = linear_model.Ridge(alpha=2.0, tol=0.0001)
        multi_ridge_chain = multioutput.RegressorChain(base_estimator=ridge,
                                                       order=[0, 1])
        tree = sklearn.tree.DecisionTreeRegressor(max_depth=9,
                                                  min_samples_split=200,
                                                  min_samples_leaf=100)

        Ada_tree = sklearn.tree.DecisionTreeRegressor(max_depth=2,
                                                      min_samples_split=100,
                                                      min_samples_leaf=50)
        AdaBoost = ensemble.AdaBoostRegressor(base_estimator=Ada_tree,  # 'estimator' in scikit-learn >= 1.2
                                              learning_rate=0.02,
                                              n_estimators=60)
        multi_Ada = multioutput.MultiOutputRegressor(estimator=AdaBoost,
                                                     n_jobs=3)

        gradient_boost = ensemble.GradientBoostingRegressor(
            learning_rate=0.065, n_estimators=500)
        multi_grad = multioutput.MultiOutputRegressor(estimator=gradient_boost,
                                                      n_jobs=3)

        self.print_txt()
Example No. 8
import numpy as np
from sklearn import ensemble, tree
from sklearn.linear_model import LinearRegression


def findNextTick(df, model_type):
    df["nextClose"] = df["High"].shift(-1)
    #df["nextTime"] = df["time"].shift(-1)
    df["nextIndex"] = df.index
    df["nextIndex"] = df["nextIndex"].shift(-1)
    df.at[len(df) - 1, 'nextIndex'] = df.iloc[len(df) - 2]["nextIndex"] + 1
    df = df[0:len(df) - 2]
    #df.to_csv("test3.csv")
    X_pred = df[-1:].drop(["nextClose"], axis=1)
    print(X_pred)
    df = df[0:-1]
    X = df.drop(["nextClose"], axis=1)
    #X.to_csv("test4.csv")
    y = df["nextClose"]
    r1 = LinearRegression(n_jobs=-1)
    r2 = tree.DecisionTreeRegressor()
    r3 = ensemble.RandomForestRegressor(n_jobs=-1)
    r4 = ensemble.AdaBoostRegressor()
    r5 = ensemble.BaggingRegressor(n_jobs=-1)
    r6 = ensemble.GradientBoostingRegressor()
    estimators = [('r1', r1), ('r2', r2), ('r3', r3), ('r4', r4), ('r5', r5),
                  ('r6', r6)]
    if model_type == 0:
        regressor = ensemble.StackingRegressor(
            estimators=estimators,
            final_estimator=ensemble.RandomForestRegressor(n_estimators=100,
                                                           random_state=42,
                                                           n_jobs=-1))
    elif model_type == 1:
        regressor = ensemble.VotingRegressor(estimators=estimators)
    else:
        raise ValueError("model_type must be 0 (stacking) or 1 (voting)")
    regressor.fit(X, y)  #training the algorithm
    y_pred = list(regressor.predict(X_pred))
    y_pred.insert(0, X_pred.iloc[0]["High"])
    y_pred = np.asarray(y_pred)
    x_predTime = list(X_pred.index)
    x_predTime.append(x_predTime[0] + 1)
    x_predTime = np.asarray(x_predTime)
    print(y_pred)
    print(x_predTime)
    return {"Y": y_pred, "X": x_predTime}
Example No. 9
    def __init__(self, df, run_prefix, max_iter, cv_count):
        self.run_prefix = run_prefix
        self.max_iter = max_iter
        self.cv_count = cv_count
       
        self.y_tune = df.PHENO
        self.X_tune = df.drop(columns=['PHENO'])
        self.IDs_tune = self.X_tune.ID
        self.X_tune = self.X_tune.drop(columns=['ID'])

        best_algo_name_in = run_prefix + '.best_algorithm.txt'
        best_algo_df = pd.read_csv(best_algo_name_in, header=None, index_col=False)
        self.best_algo = str(best_algo_df.iloc[0,0])

        self.algorithms = [
            linear_model.LinearRegression(),
            ensemble.RandomForestRegressor(),
            ensemble.AdaBoostRegressor(),
            ensemble.GradientBoostingRegressor(),
            linear_model.SGDRegressor(),
            svm.SVR(),
            neural_network.MLPRegressor(),
            neighbors.KNeighborsRegressor(),
            ensemble.BaggingRegressor(),
            xgboost.XGBRegressor()
        ]

        # Initialize a few variables we will be using later 
        self.log_table = None
        self.best_algo_name_in = None
        self.best_algo_df = None
        self.hyperparameters = None
        self.scoring_metric = None
        self.cv_tuned = None 
        self.cv_baseline = None 
        self.algo = None
        self.searchCVResults = None
        self.rand_search = None
        self.algo_tuned = None
        self.tune_out = None
Example No. 10
def modelAlgorithm(fileName, forecast_out, jiekou):
    X_train, X_test, y_train, y_test = load_data(fileName, forecast_out)
    if jiekou == 'LR':
        clf = LinearRegression()
    elif jiekou == 'KNN':
        clf = neighbors.KNeighborsRegressor()
    elif jiekou == 'DTR':
        clf = DecisionTreeRegressor()
    elif jiekou == 'SVM':
        clf = svm.SVR()
    elif jiekou == 'RFR':
        clf = ensemble.RandomForestRegressor(n_estimators=20)  # 20 decision trees here
    elif jiekou == 'AdaBoost':
        clf = ensemble.AdaBoostRegressor(n_estimators=50)
    elif jiekou == 'GBRT':
        clf = ensemble.GradientBoostingRegressor(n_estimators=100)  # 100 weak learners
    else:
        clf = LinearRegression()

    accuracy, y_test, result, r1, r2, r3 = try_different_method(
        clf, X_train, X_test, y_train, y_test)
    return (clf, accuracy, y_test, result, r1, r2, r3)
Example No. 11
def train(num, X_train, y_train, X_test, y_test):
    if num == 1:
        model = tree.DecisionTreeRegressor()
    elif num == 2:
        model = svm.SVR()
    elif num == 3:
        model = LinearRegression()
    elif num == 4:
        model = neighbors.KNeighborsRegressor(n_neighbors=11)
    elif num == 5:
        model = ensemble.RandomForestRegressor(n_estimators=100)
    elif num == 6:
        model = ensemble.AdaBoostRegressor(n_estimators=100)
    elif num == 7:
        model = ensemble.GradientBoostingRegressor(n_estimators=100)
    elif num == 8:
        model = ensemble.BaggingRegressor()
    elif num == 9:
        model = ExtraTreeRegressor()
    else:
        raise ValueError("num must be between 1 and 9")
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    return rmse(np.array(y_test), np.array(pred)), r_squared(np.array(y_test), np.array(pred))
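The excerpt assumes rmse and r_squared helpers that are not shown; minimal versions consistent with the call sites might look like this:

import numpy as np


def rmse(y_true, y_pred):
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))


def r_squared(y_true, y_pred):
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return float(1.0 - ss_res / ss_tot)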
Example No. 12
def test_AdaBoostRegressor(*data):
    '''
    Test the usage of AdaBoostRegressor: plot how its predictive performance
    changes with the number of base regressors.

    :param data: variadic argument; a tuple whose elements are, in order:
        training samples, test samples, training targets, test targets
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    regr = ensemble.AdaBoostRegressor()
    regr.fit(X_train, y_train)
    ## plotting
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    estimators_num = len(regr.estimators_)
    X = range(1, estimators_num + 1)
    ax.plot(list(X), list(regr.staged_score(X_train, y_train)), label="Training score")
    ax.plot(list(X), list(regr.staged_score(X_test, y_test)), label="Testing score")
    ax.set_xlabel("estimator num")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("AdaBoostRegressor")
    plt.show()
Example No. 13
def main():
    df = pd.read_csv('./Testing_Oceans_data.csv')
    # DataFrame.convert_objects was removed from pandas; coerce to numeric instead
    df = df.apply(pd.to_numeric, errors='coerce')

    prediction_label = 'Sound_Velocity(m/s)'

    X = np.array(df.drop([prediction_label], axis=1))
    y = np.array(df[prediction_label])

    # sklearn.cross_validation was removed; train_test_split now lives in model_selection
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2)

    evaluations = [
        ('Elastic Net', linear_model.ElasticNet(alpha=0.1), X_train, y_train,
         X_test, y_test),
        ('Lasso', linear_model.Lasso(alpha=0.1), X_train, y_train, X_test,
         y_test),
        ('Ridge', linear_model.Ridge(alpha=.1), X_train, y_train, X_test,
         y_test),
        ('Ensemble Random Forest', ensemble.RandomForestRegressor(), X_train,
         y_train, X_test, y_test),
        ('Ensemble Extra Trees', ensemble.ExtraTreesRegressor(), X_train,
         y_train, X_test, y_test),
        ('Ensemble Bagging Regressor', ensemble.BaggingRegressor(), X_train,
         y_train, X_test, y_test),
        ('Ensemble Gradient Boosting Regressor',
         ensemble.GradientBoostingRegressor(), X_train, y_train, X_test,
         y_test),
        ('Ensemble Ada Boost Regressor', ensemble.AdaBoostRegressor(), X_train,
         y_train, X_test, y_test),
        ('SVR Kernel Linear', svm.SVR(kernel='linear'), X_train, y_train,
         X_test, y_test),
        ('SVR Kernel RBF', svm.SVR(kernel='rbf'), X_train, y_train, X_test,
         y_test)
    ]

    for evaluation in evaluations:
        evaluate(*evaluation)
Example No. 14
    def __init__(self, trainFilename, testFilename, resultsDir):
        # assert len(trainFilenames) == len(testFilenames)
        self.resultsDir = resultsDir
        #ntrees = 1000
        self.trainFilename = trainFilename
        self.testFilename = testFilename
        self.regressors = {
            'lm': MultiOutputRegressor(linear_model.LinearRegression()),
            'rg': MultiOutputRegressor(linear_model.Ridge()),
            'svm': MultiOutputRegressor(svm.SVR(kernel='rbf')),
            'gp': MultiOutputRegressor(gaussian_process.GaussianProcessRegressor()),
            'knn': MultiOutputRegressor(neighbors.KNeighborsRegressor(n_neighbors=5)),
            'dt': MultiOutputRegressor(tree.DecisionTreeRegressor()),
            'br': MultiOutputRegressor(ensemble.BaggingRegressor(n_jobs=-1)),
            'etr': MultiOutputRegressor(ensemble.ExtraTreesRegressor(n_jobs=-1)),
            'rfr': MultiOutputRegressor(ensemble.RandomForestRegressor(n_jobs=-1)),
            'abr': MultiOutputRegressor(ensemble.AdaBoostRegressor()),
            'gbr': MultiOutputRegressor(ensemble.GradientBoostingRegressor()),
            'xgb': MultiOutputRegressor(xgboost.XGBRegressor()),
            'dl': None
        }
        self.load_data()
        self.preprocess_data()
        for key in self.regressors.keys():
            self.fit_model(key)
Example No. 15
def run(model_type):
    if model_type == 'decision_tree':
        from sklearn import tree
        model = tree.DecisionTreeRegressor()
    elif model_type == 'linear':
        from sklearn import linear_model
        model = linear_model.LinearRegression()
    elif model_type == 'svm':
        from sklearn import svm
        model = svm.SVR()
    elif model_type == 'KNN':
        from sklearn import neighbors
        model = neighbors.KNeighborsRegressor()
    elif model_type == 'random_forest':
        from sklearn import ensemble
        model = ensemble.RandomForestRegressor(n_estimators=20)
    elif model_type == 'adaboost':
        from sklearn import ensemble
        model = ensemble.AdaBoostRegressor(n_estimators=50)
    elif model_type == 'extra_tree':
        from sklearn.tree import ExtraTreeRegressor
        model = ExtraTreeRegressor()
    else:
        raise ValueError('unknown model type: {}'.format(model_type))
    method(model, model)
Example No. 16
def train():
    mean = []
    std = []
    kfold = KFold(n_splits=5)  # split the data into 5 equal parts
    models_name = ['RandomForestRegressor', 'GradientBoostingRegressor', 'AdaBoostRegressor']

    models = [ensemble.RandomForestRegressor(n_estimators=20),
              ensemble.GradientBoostingRegressor(n_estimators=50),
              ensemble.AdaBoostRegressor(n_estimators=50)]
    for name, model in zip(models_name, models):
        cv_result = cross_val_score(model, features, labels, cv=kfold, scoring=pearson_score)
        print('=============================================')
        print(name)
        print(cv_result)
        print('mean: ', cv_result.mean())
        print('std: ', cv_result.std())
        mean.append(cv_result.mean())
        std.append(cv_result.std())
Example No. 17
def get_model(model_name):
    model_dict = {
        # Regression
        "model_DecisionTreeRegressor":
        tree.DecisionTreeRegressor(),  # decision tree
        "model_LinearRegression":
        linear_model.LinearRegression(),  # linear regression
        "model_SVR":
        svm.SVR(),  # SVM
        "model_KNeighborsRegressor":
        neighbors.KNeighborsRegressor(),  # KNN
        "model_RandomForestRegressor":
        ensemble.RandomForestRegressor(n_estimators=20),  # random forest with 20 trees
        "model_AdaBoostRegressor":
        ensemble.AdaBoostRegressor(n_estimators=50),  # AdaBoost with 50 base estimators
        "model_GradientBoostingRegressor":
        ensemble.GradientBoostingRegressor(
            n_estimators=100),  # GBRT with 100 trees
        "model_BaggingRegressor":
        BaggingRegressor(),  # bagging regression
        "model_ExtraTreeRegressor":
        ExtraTreeRegressor(),  # extremely randomized tree regression
        # Classification
        "model_LogisticRegression_weight":
        LogisticRegression(C=1000, class_weight={
            0: 0.8,
            1: 0.2
        }),  # logistic regression with class weights
        "model_LogisticRegression":
        LogisticRegression(C=1000),  # logistic regression (unweighted)
        "model_SVC":
        svm.SVC(class_weight="balanced"),  # support vector machine
        "model_RandomForestClassifier":
        RandomForestClassifier(n_estimators=7, class_weight="balanced")  # random forest
    }

    return model_dict[model_name]
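A minimal lookup sketch for get_model (the excerpt's own imports — tree, linear_model, svm, neighbors, ensemble, BaggingRegressor, ExtraTreeRegressor, LogisticRegression, RandomForestClassifier — are assumed to be in scope):

reg = get_model("model_AdaBoostRegressor")
reg.fit([[0.0], [1.0], [2.0], [3.0]], [0.1, 1.1, 1.9, 3.2])  # toy 1-D data
print(reg.predict([[1.5]]))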
Example No. 18
def boost_test(train_set,
               label_train,
               validation_set,
               label_validation,
               depth=8):
    print("Boosting test")

    boost_clf = ensemble.AdaBoostRegressor(tree.DecisionTreeRegressor(
        criterion='mse',  # 'squared_error' in newer scikit-learn
        max_depth=25),
                                           n_estimators=600,
                                           learning_rate=1.5)
    grad_boost_clf = ensemble.GradientBoostingRegressor(max_depth=7,
                                                        n_estimators=600,
                                                        learning_rate=0.05)
    boost_clf.fit(train_set, label_train)
    grad_boost_clf.fit(train_set, label_train)
    ada_computed_values = boost_clf.predict(validation_set)
    mean_absolute_error = sklearn.metrics.mean_absolute_error(
        label_validation, ada_computed_values, multioutput='uniform_average')
    print("ada mean error is : ", mean_absolute_error)
    mean_max_absolute_error_arrays = np.mean(
        np.amax(np.absolute(np.subtract(ada_computed_values,
                                        label_validation)),
                axis=1))
    print("ada mean max error per row is : ", mean_max_absolute_error_arrays)

    grad_computed_values = grad_boost_clf.predict(validation_set)
    mean_absolute_error = sklearn.metrics.mean_absolute_error(
        label_validation, grad_computed_values, multioutput='uniform_average')
    print("grad mean error is : ", mean_absolute_error)
    mean_max_absolute_error_arrays = np.mean(
        np.amax(np.absolute(np.subtract(grad_computed_values,
                                        label_validation)),
                axis=1))
    print("grad mean max error per row is : ", mean_max_absolute_error_arrays)
    return boost_clf, grad_boost_clf
Example No. 19
def test_AdaBoostRegressor_learning_rate(*data):
    '''
    Test performance across different learning rates.
    :param data: train_data, test_data, train_value, test_value
    :return: None
    '''
    X_train, X_test, y_train, y_test = data
    learning_rates = np.linspace(0.01, 1)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    training_scores = []
    testing_scores = []
    for learning_rate in learning_rates:
        regr = ensemble.AdaBoostRegressor(learning_rate=learning_rate, n_estimators=500)
        regr.fit(X_train, y_train)
        training_scores.append(regr.score(X_train, y_train))
        testing_scores.append(regr.score(X_test, y_test))
    ax.plot(learning_rates, training_scores, label="Training score")
    ax.plot(learning_rates, testing_scores, label="Testing score")
    ax.set_xlabel("learning rate")
    ax.set_ylabel("score")
    ax.legend(loc="best")
    ax.set_title("AdaBoostRegressor")
    plt.show()
Example No. 20
#### 3.2 Linear regression ####
from sklearn import linear_model
model_LinearRegression = linear_model.LinearRegression()
#### 3.3 SVM regression ####
from sklearn import svm
model_SVR = svm.SVR()
#### 3.4 KNN regression ####
from sklearn import neighbors
model_KNeighborsRegressor = neighbors.KNeighborsRegressor()
#### 3.5 Random forest regression ####
from sklearn import ensemble
model_RandomForestRegressor = ensemble.RandomForestRegressor(
    n_estimators=20)  # 20 decision trees here
#### 3.6 AdaBoost regression ####
from sklearn import ensemble
model_AdaBoostRegressor = ensemble.AdaBoostRegressor(
    n_estimators=50)  # 50 base estimators here
#### 3.7 GBRT regression ####
from sklearn import ensemble
model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(
    n_estimators=100)  # 100 decision trees here
#### 3.8 Bagging regression ####
from sklearn.ensemble import BaggingRegressor
model_BaggingRegressor = BaggingRegressor()
#### 3.9 ExtraTree (extremely randomized tree) regression ####
from sklearn.tree import ExtraTreeRegressor
model_ExtraTreeRegressor = ExtraTreeRegressor()
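A sketch of how such a model zoo is typically exercised, assuming hypothetical X_train/X_test/y_train/y_test splits:

models = {
    'LinearRegression': model_LinearRegression,
    'SVR': model_SVR,
    'KNN': model_KNeighborsRegressor,
    'RandomForest': model_RandomForestRegressor,
    'AdaBoost': model_AdaBoostRegressor,
    'GBRT': model_GradientBoostingRegressor,
    'Bagging': model_BaggingRegressor,
    'ExtraTree': model_ExtraTreeRegressor,
}
for name, m in models.items():
    m.fit(X_train, y_train)
    print(name, m.score(X_test, y_test))  # R^2 on the held-out split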



def getHeadFromFile():
Example No. 21
# #### 3.2 Linear regression ####
# from sklearn import linear_model
# model_maps['linear'] = linear_model.LinearRegression()
# #### 3.3 SVM regression ####
# from sklearn import svm
# model_maps['svr'] = svm.SVR()
# #### 3.4 KNN regression ####
# from sklearn import neighbors
# model_maps['knn'] = neighbors.KNeighborsRegressor()
#### 3.5 Random forest regression ####
from sklearn import ensemble
model_maps['random_forest'] = ensemble.RandomForestRegressor(
    n_estimators=10)  # 10 decision trees here
#### 3.6 AdaBoost regression ####
from sklearn import ensemble
model_maps['adaboost'] = ensemble.AdaBoostRegressor(
    n_estimators=50)  # 50 base estimators here
#### 3.7 GBRT regression ####
from sklearn import ensemble
model_maps['gradient_boosting'] = ensemble.GradientBoostingRegressor(
    n_estimators=100)  # 100 decision trees here
#### 3.8 Bagging regression ####
from sklearn.ensemble import BaggingRegressor
model_maps['bagging'] = BaggingRegressor()
#### 3.9 ExtraTree (extremely randomized tree) regression ####
from sklearn.tree import ExtraTreeRegressor
model_maps['extra_tree'] = ExtraTreeRegressor()


def get_score(data_path, model_name):
    data = np.memmap(data_path, dtype='float64', mode='r')
    data = np.array(data.reshape((int(len(data) / 46), 46)))
Example No. 22
def train_test_all_regressors_with_cross_validation(X, y, seed=SEED):
    """
    Train, test and print the results of most of the regressors available in
    sklearn, using cross-validation.

    Args:
        X (DataFrame): matrix with the features
        y (Series): target values
        seed (int): random seed for estimators that accept one
    """
    import pandas as pd

    assert isinstance(X, pd.core.frame.DataFrame)
    assert isinstance(y, pd.core.series.Series)
    assert isinstance(seed, int)

    from sklearn import linear_model
    from sklearn import tree
    from sklearn import ensemble
    from sklearn import neighbors
    from sklearn import neural_network

    from sklearn.model_selection import cross_val_score

    models = []
    models.append(("BayesianRidge", linear_model.BayesianRidge()))
    models.append(("ElasticNet", linear_model.ElasticNet()))
    models.append(("HuberRegressor", linear_model.HuberRegressor()))
    models.append(("Lars", linear_model.Lars()))
    models.append(("Lasso", linear_model.Lasso()))
    models.append(("LassoLars", linear_model.LassoLars()))
    models.append(("LinearRegression", linear_model.LinearRegression()))
    models.append(("OrthogonalMatchingPursuit",
                   linear_model.OrthogonalMatchingPursuit()))
    models.append(("PassiveAggressiveRegressor",
                   linear_model.PassiveAggressiveRegressor()))
    models.append(("Ridge", linear_model.Ridge()))
    models.append(("SGDRegressor", linear_model.SGDRegressor()))
    models.append(
        ("AdaBoostRegressor", ensemble.AdaBoostRegressor(random_state=seed)))
    models.append(
        ("BaggingRegressor", ensemble.BaggingRegressor(random_state=seed)))
    models.append(("ExtraTreesRegressor",
                   ensemble.ExtraTreesRegressor(random_state=seed)))
    models.append(("GradientBoostingRegressor",
                   ensemble.GradientBoostingRegressor(random_state=seed)))
    models.append(("RandomForestRegressor",
                   ensemble.RandomForestRegressor(random_state=seed)))
    models.append(("DecisionTreeRegressor",
                   tree.DecisionTreeRegressor(random_state=seed)))
    models.append(("KNeighborsRegressor", neighbors.KNeighborsRegressor()))
    models.append(("MLPRegressor", neural_network.MLPRegressor()))

    best_rmse = float('inf')
    best_model = ''

    for name, model in models:
        print(
            '------------------------------------------------------------------------------'
        )
        print(name)
        print(
            '------------------------------------------------------------------------------'
        )

        scores = cross_val_score(model,
                                 X,
                                 y,
                                 scoring='neg_root_mean_squared_error',
                                 cv=5)
        scores = -scores
        scores_mean = scores.mean()
        scores_std = scores.std()
        print("RMSE: %0.3f (+/- %0.2f)" % (scores_mean, scores_std * 2))

        if scores_mean < best_rmse:
            best_rmse = scores_mean
            best_model = name

    print(
        '------------------------------------------------------------------------------'
    )
    print('Best model: ' + best_model)
    print('Best RMSE: ' + str(best_rmse))
    print(
        '------------------------------------------------------------------------------'
    )
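A toy invocation of the function above (SEED is a module-level constant the excerpt assumes; random data, purely illustrative):

import numpy as np
import pandas as pd

SEED = 42  # stand-in for the module-level constant
X = pd.DataFrame(np.random.rand(100, 4), columns=list('abcd'))
y = pd.Series(2.0 * X['a'] + X['b'])
train_test_all_regressors_with_cross_validation(X, y, seed=SEED)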
Example No. 23
from sklearn import datasets
from sklearn import ensemble
from sklearn import neighbors, svm, tree
from sklearn import metrics
from sklearn import model_selection

# Data: Boston housing prices (load_boston was removed in scikit-learn 1.2)
boston = datasets.load_boston()
X = boston.data
y = boston.target
# Score
mse = metrics.make_scorer(metrics.mean_squared_error)
""" 1. Boosting """
''' 1.1 Adaboost '''
base = neighbors.KNeighborsRegressor(n_neighbors=3)
model = ensemble.AdaBoostRegressor(base_estimator=base,  # 'estimator' in scikit-learn >= 1.2
                                   n_estimators=30,
                                   learning_rate=0.1,
                                   random_state=1)
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)  # random_state requires shuffle=True in newer scikit-learn
result = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=mse)
print(f'RMSE of Adaboost: {result.mean()**0.5:.2f}')
''' 1.2 GradientBoostingRegressor (GBRT) '''
model = ensemble.GradientBoostingRegressor(n_estimators=30, random_state=1)
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=2)
result = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=mse)
print(f'RMSE of GBRT: {result.mean()**0.5:.2f}')
""" 2. Bagging """
''' 2.1 BaggingRegressor '''
base = tree.DecisionTreeRegressor()
model = ensemble.BaggingRegressor(base_estimator=base,
                                  n_estimators=10,
                                  random_state=1)
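Since load_boston was removed in scikit-learn 1.2, here is a sketch of the same Adaboost evaluation on the California housing data (assumes scikit-learn >= 1.2, where base_estimator is spelled estimator):

from sklearn import datasets, ensemble, metrics, model_selection, neighbors

X, y = datasets.fetch_california_housing(return_X_y=True)
base = neighbors.KNeighborsRegressor(n_neighbors=3)
model = ensemble.AdaBoostRegressor(estimator=base, n_estimators=30,
                                   learning_rate=0.1, random_state=1)
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)
mse = metrics.make_scorer(metrics.mean_squared_error)
result = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=mse)
print(f'RMSE of Adaboost: {result.mean()**0.5:.2f}')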
Example No. 24
#MLP
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(hidden_layer_sizes=(200, 96),
                   solver='lbfgs',
                   activation='relu',
                   alpha=0.1,
                   learning_rate_init=0.01)

# Ensemble methods
# Random forest
from sklearn import ensemble
rf = ensemble.RandomForestRegressor(n_estimators=400)  # 400 decision trees here

#AdaBoost
ada = ensemble.AdaBoostRegressor(n_estimators=50)

#GBRT
gbrt = ensemble.GradientBoostingRegressor(n_estimators=100)

methods = [{
    "linear": linear_reg,
    "tree": tree_reg,
    "MLP": mlp,
    "svr": svr,
    "KNN": knn,
    "RandomForest": rf,
    "Adaboost": ada,
    "GBRT": gbrt
}]
Example No. 25
def train_test_all_regressors(X_train, X_test, y_train, y_test, seed=SEED):
    """
    Train, test and print the results of most available regressors presented in sklearn.

    Args:
        X_train (matrix): matrix with features of the training set
        y_train (list): list of values of target of the training set
        X_test (matrix): matrix with features of the test set
        y_test (list): list of values of target of the test set
    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test, pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.series.Series)
    assert isinstance(y_test, pd.core.series.Series)
    assert isinstance(seed, int)

    from sklearn import linear_model
    from sklearn import tree
    from sklearn import ensemble
    from sklearn import neighbors
    from sklearn import neural_network

    models = []
    models.append(("BayesianRidge", linear_model.BayesianRidge()))
    models.append(("ElasticNet", linear_model.ElasticNet()))
    models.append(("HuberRegressor", linear_model.HuberRegressor()))
    models.append(("Lars", linear_model.Lars()))
    models.append(("Lasso", linear_model.Lasso()))
    models.append(("LassoLars", linear_model.LassoLars()))
    models.append(("LinearRegression", linear_model.LinearRegression()))
    models.append(("OrthogonalMatchingPursuit",
                   linear_model.OrthogonalMatchingPursuit()))
    models.append(("PassiveAggressiveRegressor",
                   linear_model.PassiveAggressiveRegressor()))
    models.append(("Ridge", linear_model.Ridge()))
    models.append(("SGDRegressor", linear_model.SGDRegressor()))
    models.append(
        ("AdaBoostRegressor", ensemble.AdaBoostRegressor(random_state=seed)))
    models.append(
        ("BaggingRegressor", ensemble.BaggingRegressor(random_state=seed)))
    models.append(("ExtraTreesRegressor",
                   ensemble.ExtraTreesRegressor(random_state=seed)))
    models.append(("GradientBoostingRegressor",
                   ensemble.GradientBoostingRegressor(random_state=seed)))
    models.append(("RandomForestRegressor",
                   ensemble.RandomForestRegressor(random_state=seed)))
    models.append(("DecisionTreeRegressor",
                   tree.DecisionTreeRegressor(random_state=seed)))
    models.append(("KNeighborsRegressor", neighbors.KNeighborsRegressor()))
    models.append(("MLPRegressor", neural_network.MLPRegressor()))

    best_mean_absolute_percentage_error = float('inf')
    best_model = ''

    for name, model in models:
        print(
            '------------------------------------------------------------------------------'
        )
        print(name)
        print(
            '------------------------------------------------------------------------------'
        )

        model.fit(X_train, y_train)

        print('Training Set')
        y_pred = model.predict(X_train)
        print_results(y_train, y_pred)

        print('Testing Set')
        y_pred = model.predict(X_test)
        print_results(y_test, y_pred)

        mean_absolute_percentage_error_value = mean_absolute_percentage_error(
            y_test, y_pred)
        if mean_absolute_percentage_error_value < best_mean_absolute_percentage_error:
            best_mean_absolute_percentage_error = mean_absolute_percentage_error_value
            best_model = name

    print(
        '------------------------------------------------------------------------------'
    )
    print('Best model: ' + best_model)
    print('Best mean absolute percentage error: ' +
          str(best_mean_absolute_percentage_error))
    print(
        '------------------------------------------------------------------------------'
    )
Example No. 26
# Level 2 Score: 

clf = ensemble.RandomForestRegressor(n_estimators=int(nET * 1.5), max_features=30, max_depth=23, n_jobs=-1, random_state=rnd, verbose=0)
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5, seed=rnd, category="regressor", filename = "RFR", setused=setused, tag = '30_23')


# Level 2 Score: 

clf = ensemble.AdaBoostClassifier(random_state=rnd, learning_rate=0.4)  # AdaBoostClassifier takes no 'loss' parameter; that belongs to the regressor
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5, seed=rnd, category="classifier", filename = "AdaClass", setused=setused)


# Level 2 Score: 

clf = ensemble.AdaBoostRegressor(random_state=rnd, learning_rate=0.4, loss='linear')     
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5, seed=rnd, category="regressor", filename = "AdaReg", setused=setused)


# Level 2 Score: 

clf = linear_model.LinearRegression()
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5, seed=rnd, category="regressor", filename = "LinReg", setused=setused)


# Level 2 Score: 

clf = linear_model.LogisticRegression(solver='sag', random_state=rnd, verbose=0, n_jobs=-1)
model_sum = blend_proba(clf=clf, X_train=train, y=target, X_test=test, nfolds=5, seed=rnd, category="classifier", filename = "LogReg", setused=setused)

Example No. 27
import sklearn.datasets as sd  # used below but missing from the original imports
import sklearn.utils as su
import sklearn.tree as st
import sklearn.ensemble as se
import sklearn.metrics as sm
import matplotlib.pyplot as mp
boston = sd.load_boston()  # removed in scikit-learn 1.2
fn = boston.feature_names
print(fn)
# shuffle the data
x, y = su.shuffle(boston.data, boston.target, random_state=7)
train_size = int(len(x) * 0.8)
train_x, test_x, train_y, test_y = x[:train_size], x[
    train_size:], y[:train_size], y[train_size:]
# AdaBoost ensemble of 400 depth-4 decision trees
model = se.AdaBoostRegressor(st.DecisionTreeRegressor(max_depth=4),
                             n_estimators=400,
                             random_state=7)
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)
print(sm.r2_score(test_y, pred_test_y))
# get feature importances from the regressor
fi = model.feature_importances_
print(fi)
print(max(fi))
print(boston.feature_names[list(fi).index(max(fi))])
# visualize the ranking of feature importances
mp.figure('Feature Importance', facecolor='lightgray')
mp.title('Feature Importance', fontsize=20)
mp.xlabel('Feature', fontsize=14)
mp.ylabel('Importance', fontsize=14)
mp.tick_params(labelsize=10)
Example No. 28
rf_estimator = ensemble.RandomForestRegressor(random_state=100)
rf_grid = {
    'n_estimators': list(range(100, 501, 200)),
    'max_features': [14, 16, 18, 20],
    'max_depth': [3, 5, 7]
}
rf_selector = get_best_model(rf_estimator,
                             rf_grid,
                             X_train,
                             y_trans,
                             scoring=scoring)
plot_feature_importances(rf_selector, X_train, 50)
important_features_rf = get_important_features(rf_selector, X_train)

dt_estimator = tree.DecisionTreeRegressor()
ada_estimator = ensemble.AdaBoostRegressor(dt_estimator)
ada_grid = {
    'n_estimators': list(range(100, 501, 200)),
    'learning_rate': [0.1, 1.0],
    'base_estimator__max_depth': [1, 3, 5]  # 'estimator__max_depth' in scikit-learn >= 1.2
}
ada_selector = get_best_model(ada_estimator,
                              ada_grid,
                              X_train,
                              y_trans,
                              scoring=scoring)
plot_feature_importances(ada_selector, X_train, 50)
important_features_ada = get_important_features(ada_selector, X_train)

important_features = set(important_features_lasso) | set(
    important_features_rf) | set(important_features_ada)
Example No. 29
    model_name = args.model_name
    model_dir = os.path.join(args.root, "model")  # get model dir
    data_dir = os.path.join(args.root, "data")  # get data dir

    data_path = os.path.join(data_dir, args.inFile)
    print('load data from ' + data_path)
    
    data = pickle.load(open(data_path, 'rb'))
    out_path = os.path.join(data_dir, args.outFileName + '.csv')
    assert 'data' in data
    if args.train:
        ratio = args.ratio
        regr = ensemble.AdaBoostRegressor(n_estimators=args.n_estimators, learning_rate=args.learning_rate)

        assert 'target' in data

        features = data['data']
        labels = data['target']

        rs = ShuffleSplit(n_splits=1, test_size=ratio)
        train_index, val_index = next(rs.split(features, labels))

        x_train = features[train_index]
        x_test = features[val_index]

        y_train = labels[train_index]
        y_test = labels[val_index]
ESTIMATORS = {
    "Linear Regression":
    linear_model.LinearRegression(),
    "Lasso Regression":
    linear_model.Lasso(alpha=0.5),
    "Elastic Net":
    linear_model.ElasticNet(alpha=0.5, l1_ratio=0.7),
    "Ridge":
    linear_model.Ridge(fit_intercept=False),
    "Lasso Lars":
    linear_model.LassoLars(alpha=0.5),
    "Bayesian Ridge":
    linear_model.BayesianRidge(compute_score=True),
    "AdaBoost":
    ensemble.AdaBoostRegressor(),
    "Bagging":
    ensemble.BaggingRegressor(),
    "Extra trees":
    ensemble.ExtraTreesRegressor(n_estimators=10,
                                 max_features=32,
                                 random_state=0),
    "K -nn":
    KNeighborsRegressor(),
}
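A typical way such an estimator map is consumed (hypothetical X_train/y_train/X_test arrays; the linear_model and KNeighborsRegressor imports are assumed by the excerpt):

for name, estimator in ESTIMATORS.items():
    estimator.fit(X_train, y_train)
    print(name, estimator.predict(X_test)[:3])  # first few predictions per model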

ESTIMATORS_SINGLE = {
    "Linear Regression":
    linear_model.LinearRegression(),
    "Lasso Regression":
    linear_model.Lasso(alpha=0.5),