Example #1
def mul_dtree(X, Y2):
    forest = ExtraTreesRegressor(n_estimators=5,
                             compute_importances=True,
                             random_state=0)
    forest.fit(X[:200], Y2[:200])
    forest.predict(X[200:])
    print Y2[200:]
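Note: compute_importances=True was removed from scikit-learn long ago; on current releases the constructor above raises a TypeError, and importances are instead always exposed on the fitted estimator. A minimal sketch of the same flow against the current API (the name mul_dtree_modern is purely illustrative):

from sklearn.ensemble import ExtraTreesRegressor

def mul_dtree_modern(X, Y2):
    # compute_importances is gone; feature_importances_ is populated by fit()
    forest = ExtraTreesRegressor(n_estimators=5, random_state=0)
    forest.fit(X[:200], Y2[:200])
    pred = forest.predict(X[200:])
    print(pred)
    print(Y2[200:])
    print(forest.feature_importances_)
    return pred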
Example #2
def main():
    for ind in range(1, 15+1):
        print "TrainingSet/ACT%d_competition_training.csv" % ind
        #read in  data, parse into training and target sets
        cols, molecules1, train = read_data("../TrainingSet/ACT%d_competition_training.csv" % ind)
        target = np.array( [x[0] for x in train] )

        #load train
        train = filter_cols(train, cols, "../selected/cor9/selected_%d.txt" % ind)
        train = np.array(train)
        #print("Train: ", len(train), " cols:", len(train[0]))

        # seeds used: orig=1279, cor8=1278, cor9=1277
        cfr = ExtraTreesRegressor(n_estimators=2000, max_features=(len(train[0])//3), n_jobs=8, random_state=1277)
                                  #min_samples_leaf=2, min_samples_split=2, random_state=1279)
        rf = cfr.fit(train, target)

        #predict train
        pred = rf.predict(train)
        write_file("erStacking/cor9/er_stacking_%d.csv" % ind, molecules1, pred)

        #load test
        cols, molecules2, test = read_data("../TestSet/ACT%d_competition_test.csv" % ind)
        test = filter_cols(test, cols, "../selected/cor9/selected_%d.txt" % ind)
        test = np.array(test)

        #predict test
        pred = rf.predict(test)
        write_file("erStacking/test/cor9/er_submission_%d.csv" % ind, molecules2, pred)
Example #3
def predict_with_one(X, out_file_name):
    n_samples, n_features = X.shape
    iter_num = 3
    div = ShuffleSplit(n_samples, n_iter=iter_num, test_size=0.2, random_state=0)
    model = ExtraTreesRegressor(n_estimators=5)
    score_matrix = np.zeros((n_features, n_features))

    t = time()
    round_num = 0
    for train, test in div:
        round_num += 1
        train_samples = X[np.array(train)]
        test_samples = X[np.array(test)]
        for i in range(n_features):
            for j in range(n_features):
                X_train = train_samples[:, i:i+1]
                X_test = test_samples[:, i:i+1]
                y_train = train_samples[:, j]
                y_test = test_samples[:, j]
        # for i in range(len(fl)):
        #     for j in range(len(fl)):
        #         if fl[j][1]-fl[j][0] != 1:
        #             continue
        #         X_train = train_samples[:, fl[i][0]:fl[i][1]]
        #         X_test = test_samples[:, fl[i][0]:fl[i][1]]
        #         y_train = train_samples[:, fl[j][0]]
        #         y_test = test_samples[:, fl[j][0]]
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                mae = mean_absolute_error(y_test, y_pred)
                score_matrix[i, j] += mae
                print('Round', round_num, '|', i, j, mae, time()-t)
    np.savetxt(os.path.join(CODE_PATH, out_file_name),
               score_matrix/iter_num, fmt='%.3f', delimiter=',')
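The splitter above is the pre-0.18 ShuffleSplit(n_samples, n_iter=...) from sklearn.cross_validation, which no longer exists. A rough equivalent of the splitting loop against sklearn.model_selection (placeholder data only, to keep the sketch self-contained):

import numpy as np
from sklearn.model_selection import ShuffleSplit

X = np.random.rand(100, 5)  # placeholder matrix standing in for the real data
splitter = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
for train_idx, test_idx in splitter.split(X):
    train_samples, test_samples = X[train_idx], X[test_idx]
    # ... per-feature fit/predict/MAE scoring as in predict_with_one above ...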
Example #4
    def fit(self, X, y, weights = None, **kwargs):
        if weights is None: weights = np.ones(y.shape[0])
        data = np.hstack((y.reshape(y.shape[0],1),X))
        
        S = wcov(data, weights)
        corr = wcorr(data, weights)
        wsd = np.sqrt(S.diagonal())
        
        ExtraTrees = ExtraTreesRegressor(**kwargs)
        ExtraTrees.fit(X,y, sample_weight=weights)
        
        Rsquare = ( S[0,1:].dot(np.linalg.inv(S[1:,1:]).dot(S[1:,0])) )/S[0,0]
        
        # assign proportion of Rsquare to each covariate dep. on importance
        self.importances = ExtraTrees.feature_importances_ * Rsquare 
        model = self.constrained_optimization( corr )
        
        if self.fit_intercept:
            w = np.diagflat( weights/np.sum(weights),k=0)
            wmean = np.sum(w.dot(data), axis=0)
            self.intercept_ = wmean[0] - wsd[0]*np.sum(wmean[1:]*model.x/wsd[1:])

        self.coef_ = wsd[0]*model.x/wsd[1:] 
        
        return self
Example #5
 def fit(self, X, y, **kwargs):
     for key, value in kwargs.iteritems():
         if key in self.INITPARAMS.keys():
             self.INITPARAMS[key] = value
     model = ExtraTreesRegressor(**self.INITPARAMS)
     model.fit(X, y)
     self.model = model
Example #6
def main():
    for ind in range(1, 15+1):
    #for ind in [3,4,5,7,9,11,12,13,14,15]: # no 1,2,6,8,10
        print "TrainingSet/ACT%d_competition_training.csv" % ind
        #read in  data, parse into training and target sets
        cols, train = read_data("../TrainingSet/ACT%d_competition_training.csv" % ind)
        target = np.array( [x[0] for x in train] )

        train = filter_cols(train, cols, "../selected/selected_%d.txt" % ind)
        #print("Train: ", len(train), " cols:", len(train[0]))
        train = np.array( train )

        #In this case we'll use an extra-trees regressor, but this could be any regressor
        cfr = ExtraTreesRegressor(n_estimators=1000, max_features=(len(train[0])//3), n_jobs=8, random_state=1279)

        #Simple K-Fold cross validation. 10 folds.
        cv = cross_validation.KFold(len(train), k=10, indices=False, shuffle=True)

        #iterate through the training and test cross validation segments and
        #run the classifier on each one, aggregating the results into a list
        results = []
        for traincv, testcv in cv:
            ft = cfr.fit(train[traincv], target[traincv])
            score = ft.score(train[testcv], target[testcv])
            results.append(score)
            print "\tFold %d: %f" % (len(results), score)

        #print out the mean of the cross-validated results
        print "Results: " + str( np.array(results).mean() )
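cross_validation.KFold(n, k=..., indices=False) was removed from scikit-learn long ago. The same 10-fold loop written against sklearn.model_selection.KFold, as a sketch (the cv_score wrapper is only illustrative; scoring is the regressor's default R^2, as in the original):

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import KFold

def cv_score(train, target, seed=1279):
    cfr = ExtraTreesRegressor(n_estimators=1000,
                              max_features=train.shape[1] // 3,
                              n_jobs=8, random_state=seed)
    results = []
    kf = KFold(n_splits=10, shuffle=True, random_state=seed)
    for train_idx, test_idx in kf.split(train):
        fitted = cfr.fit(train[train_idx], target[train_idx])
        results.append(fitted.score(train[test_idx], target[test_idx]))
    return np.mean(results)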
Example #7
    def build_models(self):

        self.remove_columns(
            [
                "institute_latitude",
                "institute_longitude",
                "institute_state",
                "institute_country",
                "var10",
                "var11",
                "var12",
                "var13",
                "var14",
                "var15",
                "instructor_past_performance",
                "instructor_association_industry_expert",
                "secondary_area",
                "var24",
            ]
        )

        model1 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=200, subsample=0.8)
        model2 = RandomForestRegressor(n_estimators=50)
        model3 = ExtraTreesRegressor(n_estimators=50)

        model1.fit(self.X, self.y)
        model2.fit(self.X, self.y)
        model3.fit(self.X, self.y)

        return [model1, model2, model3]
Example #8
    def fit(self,data_train,target):
        self.target_train = target
        self.catcol = data_train.filter(like='var').columns.tolist()
        #start_gbr_tr = time.clock()
        self.gbr = GradientBoostingRegressor(n_estimators =self.nest,max_depth=7)
        self.gbr.fit(data_train,self.target_train)
        self.transformed_train_gbr = self.gbr.transform(data_train,threshold="0.35*mean")
        self.gbr_tr_fit = GradientBoostingRegressor(n_estimators =self.nest,max_depth=7)
        self.gbr_tr_fit.fit(self.transformed_train_gbr,self.target_train)
        #end_gbr_tr = time.clock()
        #print >> log, "time_gbr_tr = ", end_gbr_tr-start_gbr_tr

        #start_xfr_tr = time.clock()
        self.xfr= ExtraTreesRegressor(n_estimators =self.nest,max_depth=7)
        self.xfr.fit(data_train,self.target_train)
        self.transformed_train_xfr = self.xfr.transform(data_train,threshold="0.35*mean")
        self.xfr_tr_fit = ExtraTreesRegressor(n_estimators =self.nest,max_depth=7)
        self.xfr_tr_fit.fit(self.transformed_train_xfr,self.target_train)
        #end_xfr_tr = time.clock()
        #print >> log, "time_xfr_tr = ", end_xfr_tr-start_xfr_tr

        #start_gbr_cat = time.clock()
        self.gbr_cat_fit = GradientBoostingRegressor(n_estimators =self.nest,max_depth=7)
        self.gbr_cat_fit.fit(data_train[self.catcol],self.target_train)
        #end_gbr_cat = time.clock()
        #print >> log, "time_gbr_cat = ", end_gbr_cat-start_gbr_cat

        #start_xfr_cat = time.clock()
        self.xfr_cat_fit = ExtraTreesRegressor(n_estimators =self.nest,max_depth=7)
        self.xfr_cat_fit.fit(data_train[self.catcol],self.target_train)
        #end_xfr_cat = time.clock()
        #print >> log, "time_xfr_cat = ", end_xfr_cat-start_xfr_cat
        return self
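The transform(data, threshold=...) calls on the fitted ensembles above use an old scikit-learn shortcut for model-based feature selection that has since been removed; SelectFromModel is the replacement (the same pattern appears in a later example on this page). A hedged sketch, with select_by_importance as an illustrative name:

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

def select_by_importance(X, y, n_estimators=100, threshold="0.35*mean"):
    forest = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=7)
    forest.fit(X, y)
    # keep only the columns whose importance clears the threshold
    selector = SelectFromModel(forest, threshold=threshold, prefit=True)
    return selector.transform(X), selector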
Example #9
def do_etrees(filename):
    df, Y = create_merged_dataset(filename)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5, random_state=SEED)
    X = df.drop(['driver', 'trip'], 1)
    etree.fit(X, Y)
    probs = etree.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200], 'trip': df['trip'][:200], 'probs': probs})
Example #10
def cal_important_features(batch=10, threshold=1e-4):
  X_samples, Y_samples, scaler = dat.data_prepare('ocpm', 'lifetime_ecpm', outlier=0.05)
  tot_goot_atrs = {}
  for a in ATRS[5:]: tot_goot_atrs[a] = {}
  for i in np.arange(1,batch+1):
    Ts = timeit.default_timer()
    model = ExtraTreesRegressor(n_jobs=6)
    model.fit(X_samples, Y_samples)
    print "Totally %i features." % len(model.feature_importances_)
    print "[Labels] %i categories, %i interests, %i client_names, %i auto_tags" % (num.categories_len, num.interests_len, num.client_names_len, num.auto_tags_len)
    good_atrs = show_important_features(model.feature_importances_, threshold)
    for a in reversed(ATRS[5:]):
      for b in good_atrs[a]:
        if b in tot_goot_atrs[a]:
          tot_goot_atrs[a][b] += 1
        else:
          tot_goot_atrs[a][b] = 1
    print "%i batch finished in %.1f secs." % (i, (timeit.default_timer() - Ts))
    print "------------------------------------------------"
  # show performances
  for atr in reversed(ATRS[5:]):
    print "-------[%s]-----------------------" % atr
    for j in np.arange(1,batch+1):
      good_keys = [k for k,v in tot_goot_atrs[atr].items() if (v >= j)]
      print "%i keys occur >= %i times." % (len(good_keys), j)
  return tot_goot_atrs
Example #11
def build_extra_tree_regressor(X_test, X_train_full, y_train_full):


    print "Building ExtraTrees regressor..."
    etr = ExtraTreesRegressor(n_estimators=500)
    etr.fit(X_train_full, y_train_full)
    etr_predict = etr.predict(X_test)

    return etr_predict
Example #12
 def classify(self):
     """Perform regression with extremely randomized trees"""
     clf = ETRegressor(n_estimators=500, min_samples_split=5, min_samples_leaf=2)
     #pca = PCA(n_components = 400)
     #self._ClassifyDriver__traindata = pca.fit_transform(self._ClassifyDriver__traindata)
     #self._ClassifyDriver__testdata = pca.transform(self._ClassifyDriver__testdata)
     #print self._ClassifyDriver__traindata.shape
     clf.fit(self._ClassifyDriver__traindata, self._ClassifyDriver__trainlabels)
     self._ClassifyDriver__y = clf.predict(self._ClassifyDriver__testdata)
Example #13
def get_forest(X_names=Xs, y_names=ys, num_trees=256, data=data):
    forest = ExtraTreesRegressor(
        n_estimators=num_trees, n_jobs=62, bootstrap=True)
    X = data.loc[:, [i for i in X_names]]
    y = data.loc[:, [i for i in y_names]]
    start = time()
    rfr = forest.fit(X, y)
    end = time()
    return(rfr, end-start)
def reg_skl_etr(param, data):
    [X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv] = data
    etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
                              max_features=param['max_features'],
                              n_jobs=param['n_jobs'],
                              random_state=param['random_state'])
    etr.fit(X_tr, y_reg_tr)
    pred = etr.predict(X_cv)
    RMSEScore = getscoreRMSE(y_reg_cv, pred)
    return RMSEScore, pred
Example #15
def extra_trees_regressor(x, y, n_estimators, max_depth):
    kf = KFold(len(x), n_folds=3)
    scores = []
    for train_index, test_index in kf:
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
        clf.fit(X_train, y_train)
        scores.append(mean_squared_error(clf.predict(X_test), y_test) ** 0.5)
    return np.mean(scores)
Example #16
class MyExtraTreeReg(MyRegressor):
    def __init__(self, params=dict()):
        self._params = params
        self._extree = ExtraTreesRegressor(**(self._params))

    def update_params(self, updates):
        self._params.update(updates)
        self._extree = ExtraTreesRegressor(**(self._params))

    def fit(self, Xtrain, ytrain):
        self._extree.fit(Xtrain, ytrain)

    def predict(self, Xtest, option = None):
      return self._extree.predict(Xtest)

    def plt_feature_importance(self, fname_list, f_range = list()):
        importances = self._extree.feature_importances_

        std = np.std([tree.feature_importances_ for tree in self._extree.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]

        fname_array = np.array(fname_list)

        if not f_range:
            f_range = range(indices.shape[0])

        n_f = len(f_range)

        plt.figure()
        plt.title("Extra Tree Feature importances")
        plt.barh(range(n_f), importances[indices[f_range]],
               color="b", xerr=std[indices[f_range]], ecolor='k',align="center")
        plt.yticks(range(n_f), fname_array[indices[f_range]])
        plt.ylim([-1, n_f])
        plt.show()


    def list_feature_importance(self, fname_list, f_range = list(), return_list = False):
        importances = self._extree.feature_importances_
        indices = np.argsort(importances)[::-1]

        print 'Extra tree feature ranking:'

        if not f_range :
            f_range = range(indices.shape[0])

        n_f = len(f_range)

        for i in range(n_f):
            f = f_range[i]
            print '{0:d}. feature[{1:d}]  {2:s}  ({3:f})'.format(f + 1, indices[f], fname_list[indices[f]], importances[indices[f]])

        if return_list:
            return [indices[f_range[i]] for i in range(n_f)]
Example #17
def algorithm_ExtraTrees(X_train,Y_train,X_validation,Y_validation, seed=7):


    # train the model
    scaler = StandardScaler().fit(X_train)
    rescaledX = scaler.transform(X_train)
    gbr = ExtraTreesRegressor(n_estimators=80)
    gbr.fit(X=rescaledX, y=Y_train)
    # evaluate the model on the validation set
    rescaledX_validation = scaler.transform(X_validation)
    predictions = gbr.predict(rescaledX_validation)
    print(mean_squared_error(Y_validation, predictions))
Example #18
def dummie_columns_extra_trees(train, test):
    from sklearn.ensemble import ExtraTreesRegressor
    print "-- {} --".format("Extremely Randomized Trees Regression using all but remarks")
    predicting_columns = list(train._get_numeric_data().columns.values)
    predicting_columns.remove("LISTPRICE")
    predicting_columns.remove("SOLDPRICE")
    rf = ExtraTreesRegressor(
        n_estimators=300, n_jobs=-1)
    rf.fit(train[predicting_columns], train["SOLDPRICE"])
    score = rf.score(test[predicting_columns], test["SOLDPRICE"])
    predictions = rf.predict(test[predicting_columns])
    sample_predictions(test, predictions)
    print "Accuracy: {}\n".format(score)
    return score, predictions
Example #19
def simple_extremely_random_trees(data_train_x, data_test_x, data_train_y, data_test_y):
    from sklearn.ensemble import ExtraTreesRegressor
    print "-- {} --".format("Extremely Randomized Trees Regression using all but remarks")
    rf = ExtraTreesRegressor(
        n_estimators=300,
        n_jobs=-1
    )
    rf.fit(data_train_x, data_train_y)
    sample_predictions(rf.predict(data_test_x), data_test_y)
    score = rf.score(data_test_x, data_test_y)
    cross_validated_scores = cross_val_score(
        rf, data_test_x, data_test_y, cv=5)
    print "R^2 score: {}".format(score)
    print "R^2 across 5 folds: {}".format(cross_validated_scores)
    print "95%% Confidence Interval: %0.3f (+/- %0.3f)\n" % (cross_validated_scores.mean(), cross_validated_scores.std() * 1.96)
Example #20
def baseline_extra(train_x, train_y,
                   test_x, test_y, n, d,
                   result_path="review_baseline_extra.txt"):
    predict = []
    clf = ExtraTreesRegressor(n_estimators=n,
                              max_depth=d,
                              random_state=0)
    clf = clf.fit(train_x, train_y)
    predict = clf.predict(test_x).tolist()
    result = pd.DataFrame([], columns=['review_count', 'predict'])
    result['review_count'] = test_y
    result['predict'] = predict
    result.to_csv(result_path, index=False)
    rmse = mean_squared_error(predict, test_y) ** 0.5
    return rmse
def main():
    # X,Y = make_top_dataset(100000,30)
    X, Y = make_friedman1_random_attr(n_samples=100000, n_features=10)
    tX, tY = make_friedman1_random_attr(n_samples=100, n_features=10)

    start_time = time.time()

    ext = ETRs(max_features=None, n_estimators=100, min_samples_split=1, n_jobs=-1)
    # ext = RFR(max_features=None, n_estimators=100, min_samples_split=1, n_jobs=-1)
    ext.fit(X, Y)

    elapsed_time = time.time() - start_time
    print elapsed_time

    print score(ext, tX, tY)
Example #22
    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            num_features = X.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(X.shape[1] / 2), max_features))
            self.estimator = ETR(
                n_estimators=0, criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
                oob_score=self.oob_score, n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                warm_start=True
            )
        tmp = self.estimator  # TODO copy ?
        tmp.n_estimators += n_iter
        tmp.fit(X, y,)
        self.estimator = tmp
        return self
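The class above relies on warm_start=True: each call to fit keeps the trees already grown and only adds the newly requested estimators. A minimal sketch of that pattern with placeholder data:

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

X = np.random.rand(200, 8)   # placeholder data
y = np.random.rand(200)

est = ExtraTreesRegressor(n_estimators=0, warm_start=True, random_state=0)
for _ in range(5):           # five iterative_fit-style increments
    est.n_estimators += 10
    est.fit(X, y)
print(len(est.estimators_))  # 50 trees after five increments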
Example #23
def trainRegressorsAndSave(computeScore=False):
    for db in dbs:
        if (not os.path.exists("clfs/" + db)):
            clf = ExtraTreesRegressor(n_estimators=500, random_state=1, n_jobs=-1)
            saveTrainedClassifier(db, clf)
        elif (computeScore):
            clf = joblib.load("clfs/" + db)

        if (computeScore):
            print("Loading test data...")
            loaded = loadDB(db + ".csv")
            X_test = loaded[:, 0:-1]
            y_test = loaded[:, -1]

            print("Normalized score is {}".format(clf.score(X_test, y_test)))
            X_test = y_test = 0
Example #24
def run():
    cycles = load_and_munge_training_data('train.csv')
    inputs = ['holiday', 'workingday', 'temp', 'atemp',
              'humidity', 'windspeed', 'month', 'hour']

    x_train, x_test, y_train, y_test = train_test_split(cycles[inputs],
                                                        cycles['count'],
                                                        test_size=0.25)
    scaler_x = StandardScaler().fit(x_train)
    scaler_y = StandardScaler().fit(y_train)
    x_train  = scaler_x.transform(x_train)
    y_train  = scaler_y.transform(y_train)
    x_test   = scaler_x.transform(x_test)
    y_test   = scaler_y.transform(y_test)

    techniques = {}

    clf_sgd = linear_model.SGDRegressor(loss='squared_loss', penalty=None)
    clf_sgd.fit(x_train, y_train)
    techniques['Linear - no penalty'] = evaluate(clf_sgd, x_train, y_train)

    clf_sgd1 = linear_model.SGDRegressor(loss='squared_loss', penalty='l2')
    clf_sgd1.fit(x_train, y_train)
    techniques['Linear - squared sums of the coefficients penalisation'] = \
        evaluate(clf_sgd1, x_train, y_train)

    clf_svr = svm.SVR(kernel='linear')
    clf_svr.fit(x_train, y_train)
    techniques['SVR - linear'] = evaluate(clf_svr, x_train, y_train)

    clf_svr_poly = svm.SVR(kernel='poly')
    clf_svr_poly.fit(x_train, y_train)
    techniques['SVR - poly'] = evaluate(clf_svr_poly, x_train, y_train)

    clf_svr_rbf = svm.SVR(kernel='rbf')
    clf_svr_rbf.fit(x_train, y_train)
    techniques['SVR - RBF'] = evaluate(clf_svr_rbf, x_train, y_train)

    clf_et = ExtraTreesRegressor(n_estimators=10, compute_importances=True)
    clf_et.fit(x_train, y_train)
    techniques['Random forest'] = evaluate(clf_et, x_train, y_train)

    clf_lr = LinearRegression()
    clf_lr.fit(x_train, y_train)
    techniques['Linear regression'] = evaluate(clf_lr, x_train, y_train)

    return sorted(techniques.iteritems(), key=operator.itemgetter(1))
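One caveat if this example is rerun on a current scikit-learn: StandardScaler now requires 2-D input, so scaling the 'count' target vector needs an explicit reshape. A small sketch of just that step, with placeholder values:

import numpy as np
from sklearn.preprocessing import StandardScaler

y_train = np.array([16.0, 40.0, 32.0, 13.0])             # placeholder counts
scaler_y = StandardScaler().fit(y_train.reshape(-1, 1))
y_train_scaled = scaler_y.transform(y_train.reshape(-1, 1)).ravel()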
 def train(self):
     print "start ert"
     self.model = ExtraTreesRegressor(n_jobs=self.prms["n_jobs"],
                                      verbose=1,
                                      random_state=self.prms["random_state"],
                                      n_estimators=int(self.prms["n_estimators"]),
                                      max_features=self.prms["max_features"])
     self.model.fit(self.data_tr.values, self.labels_tr)
Example #26
def predict_for(output, cycles, tests, raw_tests, inputs):
    x_train, x_test, y_train, y_test = train_test_split(cycles[inputs],
                                                        cycles[output],
                                                        test_size=0.25,
                                                        random_state=33)
    scaler_x  = StandardScaler().fit(x_train)
    scaler_t  = StandardScaler().fit(tests)
    x_train   = scaler_x.transform(x_train)
    x_test    = scaler_x.transform(x_test)
    tests     = scaler_t.transform(tests)

    clf_et = ExtraTreesRegressor(n_estimators=10,
                                 compute_importances=True, random_state=42)
    clf_et.fit(x_train, y_train)

    ps = clf_et.predict(tests)
    return {dt: int(round(p)) for dt, p in zip(raw_tests['datetime'], ps)}
Example #27
def baseline_extra_leave_one_out(train_raw_x, test_raw_x, test_ids, n=40, d=40, result_path="baseline_extra.txt"):
    predict = []
    for test_id in test_ids:
        train_x = train_raw_x[train_raw_x.business_id != test_id]
        train_y = train_raw_x[train_raw_x.business_id != test_id].stars.as_matrix()
        train_x = train_x.drop(["business_id", "stars"], 1).as_matrix()
        clf = ExtraTreesRegressor(n_estimators=n, max_depth=d, random_state=0)
        clf = clf.fit(train_x, train_y)
        test_x = test_raw_x[test_raw_x.business_id == test_id]
        test_x = test_x.drop(["business_id", "stars"], 1).as_matrix()
        predict.append(clf.predict(test_x)[0])
    result = pd.DataFrame([], columns=["stars", "predict"])
    result["stars"] = test_raw_x.stars
    result["predict"] = predict
    result = result.sort("stars", ascending=0)
    result.to_csv(result_path, index=False)
    rmse = mean_squared_error(predict, test_raw_x.stars.as_matrix()) ** 0.5
    return rmse
def buildModelOheETR(train_data, eval_data, train_labels, seed):
    train_data = sparse.csr_matrix(train_data)
    eval_data = sparse.csr_matrix(eval_data)
    clf = ExtraTreesRegressor(n_estimators=500, max_depth=38, min_samples_leaf=2,min_samples_split=6,\
        max_features='auto', n_jobs=-1, random_state=seed, verbose=1)
    clf.fit(train_data, train_labels)
    preds = clf.predict(eval_data)
    preds = np.expm1(preds)

    # transform -ve preds to 0
    for i in range(preds.shape[0]):
        if preds[i] < 0:
            preds[i] = 0
            
    # convert back to log1p
    preds = np.log1p(preds)
            
    return (clf, preds)
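The clamp-then-relog step above can also be written without the explicit loop; an equivalent sketch on a stand-alone array:

import numpy as np

preds = np.array([-0.2, 0.1, 1.5])                      # example log1p-space predictions
preds = np.log1p(np.clip(np.expm1(preds), 0.0, None))   # floor negative back-transformed values at 0, then re-log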
    def fit(self, X, Y):
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel

        self.n_estimators = int(self.n_estimators)
        self.min_samples_leaf = int(self.min_samples_leaf)
        self.min_samples_split = int(self.min_samples_split)
        self.max_features = float(self.max_features)
        self.bootstrap = check_for_bool(self.bootstrap)
        self.n_jobs = int(self.n_jobs)
        self.verbose = int(self.verbose)

        if check_none(self.max_leaf_nodes):
            self.max_leaf_nodes = None
        else:
            self.max_leaf_nodes = int(self.max_leaf_nodes)

        if check_none(self.max_depth):
            self.max_depth = None
        else:
            self.max_depth = int(self.max_depth)

        self.min_weight_fraction_leaf = float(self.min_weight_fraction_leaf)

        num_features = X.shape[1]
        max_features = int(
            float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(X.shape[1] / 2), max_features))
        estimator = ExtraTreesRegressor(
            n_estimators=self.n_estimators, criterion=self.criterion,
            max_depth=self.max_depth, min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf, bootstrap=self.bootstrap,
            max_features=max_features, max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score, n_jobs=self.n_jobs, verbose=self.verbose,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            random_state=self.random_state)

        estimator.fit(X, Y)
        self.preprocessor = SelectFromModel(estimator=estimator,
                                            threshold='mean',
                                            prefit=True)

        return self
Example #30
def get_regressor(x, y, n_estimators=1500, n_tries=5,
                  verbose=False):
    """Calculate an ExtraTreesRegressor on predictor and target variables

    Parameters
    ----------
    x : numpy.array
        Predictor vector
    y : numpy.array
        Target vector
    n_estimators : int, optional
        Number of estimators to use
    n_tries : int, optional
        Number of attempts to calculate regression
    verbose : bool, optional
        If True, output progress statements

    Returns
    -------
    classifier : sklearn.ensemble.ExtraTreesRegressor
        The classifier with the highest out of bag scores of all the
        attempted "tries"
    oob_scores : numpy.array
        Out of bag scores of the classifier
    """
    if verbose:
        sys.stderr.write('Getting regressor\n')
    clfs = []
    oob_scores = []

    for i in range(n_tries):
        if verbose:
            sys.stderr.write('%d.' % i)

        clf = ExtraTreesRegressor(n_estimators=n_estimators, oob_score=True,
                                  bootstrap=True, max_features='sqrt',
                                  n_jobs=1, random_state=i).fit(x, y)
        clfs.append(clf)
        oob_scores.append(clf.oob_score_)
    clf = clfs[np.argmax(oob_scores)]
    clf.feature_importances = pd.Series(clf.feature_importances_,
                                        index=x.columns)

    return clf, oob_scores
def hyperopt_obj(param, feat_folder, feat_name, trial_counter):
    global loaded
    global X_train, labels_train, X_valid, labels_valid, numTrain, numValid, cdf_valid, Y_valid
    log_loss_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
    year = datetime.datetime.now().year
    for run in range(1, config.n_runs + 1):  # range(start, end) includes start, excludes end
        for fold in range(1, config.n_folds + 1):
            rng = np.random.RandomState(datetime.datetime.now().year +
                                        1000 * run + 10 * fold)
            path = "%s/Run%d/Fold%d" % (feat_folder, run, fold)
            save_path = "%s/Run%d/Fold%d" % (output_path, run, fold)
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            # feat: combine feat file
            feat_train_path = "%s/train.feat" % path
            feat_valid_path = "%s/valid.feat" % path
            # # weight
            # weight_train_path = "%s/train.feat.weight" % path
            # weight_valid_path = "%s/valid.feat.weight" % path
            # info
            info_train_path = "%s/train.info" % path
            info_valid_path = "%s/valid.info" % path
            # cdf
            cdf_valid_path = "%s/valid.cdf" % path
            # raw prediction path (rank)
            raw_pred_valid_path = "%s/valid.raw.pred.%s_[Id@%d].csv" % (
                save_path, feat_name, trial_counter)  #
            rank_pred_valid_path = "%s/valid.pred.%s_[Id@%d].csv" % (
                save_path, feat_name, trial_counter)  #
            if loaded is None:
                X_train, labels_train, X_valid, labels_valid, numTrain, numValid, cdf_valid, Y_valid = load_data(
                    run, fold)

            # ## make evalerror funcs (evaluation functions)
            # evalerror_regrank_valid = lambda preds,dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_valid)
            # evalerror_softmax_valid = lambda preds,dtrain: evalerror_softmax_cdf(preds, dtrain, cdf_valid)
            # evalerror_softkappa_valid = lambda preds,dtrain: evalerror_softkappa_cdf(preds, dtrain, cdf_valid)
            # evalerror_ebc_valid = lambda preds,dtrain: evalerror_ebc_cdf(preds, dtrain, cdf_valid, ebc_hard_threshold)
            # evalerror_cocr_valid = lambda preds,dtrain: evalerror_cocr_cdf(preds, dtrain, cdf_valid)

            ##############
            ## Training ##
            ##############
            ## you can use bagging to stabilize the predictions
            preds_bagging = np.zeros((numValid, bagging_size), dtype=float)
            for n in range(bagging_size):
                if bootstrap_replacement:
                    sampleSize = int(
                        numTrain *
                        bootstrap_ratio)  # bootstrap_ratio: fraction of training samples to draw
                    index_base = rng.randint(numTrain, size=sampleSize)
                    index_meta = [
                        i for i in range(numTrain) if i not in index_base
                    ]
                else:
                    randnum = rng.uniform(size=numTrain)  # draw uniform random numbers in [0, 1)
                    index_base = [
                        i for i in range(numTrain)
                        if randnum[i] < bootstrap_ratio
                    ]
                    index_meta = [
                        i for i in range(numTrain)
                        if randnum[i] >= bootstrap_ratio
                    ]

                # if using an xgboost booster, convert the data to DMatrix format first
                if "booster" in param:
                    dvalid_base = xgb.DMatrix(
                        X_valid, label=labels_valid)  # , weight=weight_valid
                    dtrain_base = xgb.DMatrix(
                        X_train[index_base], label=labels_train[index_base]
                    )  # , weight=weight_train[index_base]

                    watchlist = []
                    if verbose_level >= 2:
                        watchlist = [(dtrain_base, 'train'),
                                     (dvalid_base, 'valid')]

                ## various models
                if param["task"] in ["regression", "ranking"]:
                    ## regression & pairwise ranking with xgboost
                    bst = xgb.train(
                        param, dtrain_base, param['num_round'],
                        watchlist)  # , feval=evalerror_regrank_valid
                    pred = bst.predict(dvalid_base)

                elif param["task"] in ["classification"]:
                    ## classification with xgboost
                    bst = xgb.train(
                        param, dtrain_base, param['num_round'],
                        watchlist)  # , feval=evalerror_regrank_valid
                    pred = bst.predict(dvalid_base)

                elif param["task"] in ["softmax"]:
                    ## softmax regression with xgboost
                    bst = xgb.train(
                        param, dtrain_base, param['num_round'],
                        watchlist)  # , feval=evalerror_softmax_valid
                    pred = bst.predict(dvalid_base)
                    w = np.asarray(range(1, numValid))
                    pred = pred * w[np.newaxis, :]  # w[np.newaxis, :] is a 1 x n row vector, broadcast against pred (w itself is 1-D)
                    pred = np.sum(pred, axis=1)

                elif param["task"] in ["softkappa"]:
                    ## softkappa with xgboost (custom objective)
                    # obj = lambda preds, dtrain: softkappaObj(preds, dtrain, hess_scale=param['hess_scale'])
                    bst = xgb.train(
                        param, dtrain_base, param['num_round'], watchlist
                    )  # , obj=obj, feval=evalerror_softkappa_valid
                    pred = softmax(bst.predict(dvalid_base))
                    w = np.asarray(range(1, numValid))
                    pred = pred * w[np.newaxis, :]
                    pred = np.sum(pred, axis=1)

                elif param["task"] in ["ebc"]:
                    ## ebc with xgboost (custom objective)
                    # obj = lambda preds, dtrain: ebcObj(preds, dtrain)
                    bst = xgb.train(
                        param, dtrain_base, param['num_round'],
                        watchlist)  # , obj=obj, feval=evalerror_ebc_valid
                    pred = sigmoid(bst.predict(dvalid_base))
                    pred = applyEBCRule(pred,
                                        hard_threshold=ebc_hard_threshold)

                elif param["task"] in ["cocr"]:
                    ## cocr with xgboost (custom objective)
                    # obj = lambda preds, dtrain: cocrObj(preds, dtrain)
                    bst = xgb.train(
                        param, dtrain_base, param['num_round'],
                        watchlist)  # , obj=obj, feval=evalerror_cocr_valid
                    pred = bst.predict(dvalid_base)
                    pred = applyCOCRRule(pred)

                elif param['task'] == "reg_skl_rf":
                    ## regression with sklearn random forest regressor
                    rf = RandomForestRegressor(
                        n_estimators=param['n_estimators'],
                        max_features=param['max_features'],
                        n_jobs=param['n_jobs'],
                        random_state=param['random_state'])
                    rf.fit(X_train[index_base], labels_train[index_base]
                           )  # , sample_weight=weight_train[index_base]
                    pred = rf.predict(X_valid)

                elif param['task'] == "reg_skl_etr":
                    ## regression with sklearn extra trees regressor
                    etr = ExtraTreesRegressor(
                        n_estimators=param['n_estimators'],
                        max_features=param['max_features'],
                        n_jobs=param['n_jobs'],
                        random_state=param['random_state'])
                    etr.fit(X_train[index_base], labels_train[index_base]
                            )  # , sample_weight=weight_train[index_base]
                    pred = etr.predict(X_valid)

                elif param['task'] == "reg_skl_gbm":
                    ## regression with sklearn gradient boosting regressor
                    gbm = GradientBoostingRegressor(
                        n_estimators=param['n_estimators'],
                        max_features=param['max_features'],
                        learning_rate=param['learning_rate'],
                        max_depth=param['max_depth'],
                        subsample=param['subsample'],
                        random_state=param['random_state'])
                    gbm.fit(X_train.toarray()[index_base],
                            labels_train[index_base]
                            )  # , sample_weight=weight_train[index_base]
                    pred = gbm.predict(X_valid.toarray())

                elif param['task'] == "clf_skl_lr":
                    ## classification with sklearn logistic regression
                    lr = LogisticRegression(penalty="l2",
                                            dual=True,
                                            tol=1e-5,
                                            C=param['C'],
                                            fit_intercept=True,
                                            intercept_scaling=1.0,
                                            class_weight='auto',
                                            random_state=param['random_state'])
                    lr.fit(X_train[index_base], labels_train[index_base])
                    pred = lr.predict_proba(X_valid)
                    w = np.asarray(range(1, numValid))
                    pred = pred * w[np.newaxis, :]
                    pred = np.sum(pred, axis=1)

                elif param['task'] == "reg_skl_svr":
                    ## regression with sklearn support vector regression
                    X_train, X_valid = X_train.toarray(), X_valid.toarray()
                    scaler = StandardScaler()
                    X_train[index_base] = scaler.fit_transform(
                        X_train[index_base])
                    X_valid = scaler.transform(X_valid)
                    svr = SVR(C=param['C'],
                              gamma=param['gamma'],
                              epsilon=param['epsilon'],
                              degree=param['degree'],
                              kernel=param['kernel'])
                    svr.fit(X_train[index_base], labels_train[index_base]
                            )  # , sample_weight=weight_train[index_base]
                    pred = svr.predict(X_valid)

                elif param['task'] == "reg_skl_ridge":
                    ## regression with sklearn ridge regression
                    ridge = Ridge(alpha=param["alpha"], normalize=True)
                    ridge.fit(X_train[index_base], labels_train[index_base]
                              )  # , sample_weight=weight_train[index_base]
                    pred = ridge.predict(X_valid)

                elif param['task'] == "reg_skl_lasso":
                    ## regression with sklearn lasso
                    lasso = Lasso(alpha=param["alpha"], normalize=True)
                    lasso.fit(X_train[index_base], labels_train[index_base])
                    pred = lasso.predict(X_valid)

                elif param['task'] == 'reg_libfm':
                    ## regression with factorization machine (libfm)
                    ## to array
                    X_train = X_train.toarray()
                    X_valid = X_valid.toarray()

                    ## scale
                    scaler = StandardScaler()
                    X_train[index_base] = scaler.fit_transform(
                        X_train[index_base])
                    X_valid = scaler.transform(X_valid)

                    ## dump feat
                    dump_svmlight_file(X_train[index_base],
                                       labels_train[index_base],
                                       feat_train_path + ".tmp")
                    dump_svmlight_file(X_valid, labels_valid,
                                       feat_valid_path + ".tmp")

                    ## train fm
                    cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % ( \
                                libfm_exe, feat_train_path+".tmp", feat_valid_path+".tmp", raw_pred_valid_path, \
                                param['dim'], param['iter'])
                    os.system(cmd)
                    os.remove(feat_train_path + ".tmp")
                    os.remove(feat_valid_path + ".tmp")

                    ## extract libfm prediction
                    pred = np.loadtxt(raw_pred_valid_path, dtype=float)
                    ## labels are in [0,1,2,3]
                    pred += 1

                # elif param['task'] == "reg_keras_dnn":
                #     ## regression with keras' deep neural networks
                #     model = Sequential()
                #     ## input layer
                #     model.add(Dropout(param["input_dropout"]))
                #     ## hidden layers
                #     first = True
                #     hidden_layers = param['hidden_layers']
                #     while hidden_layers > 0:
                #         if first:
                #             dim = X_train.shape[1]
                #             first = False
                #         else:
                #             dim = param["hidden_units"]
                #         model.add(Dense(dim, param["hidden_units"], init='glorot_uniform'))
                #         if param["batch_norm"]:
                #             model.add(BatchNormalization((param["hidden_units"],)))
                #         if param["hidden_activation"] == "prelu":
                #             model.add(PReLU((param["hidden_units"],)))
                #         else:
                #             model.add(Activation(param['hidden_activation']))
                #         model.add(Dropout(param["hidden_dropout"]))
                #         hidden_layers -= 1
                #
                #     ## output layer
                #     model.add(Dense(param["hidden_units"], 1, init='glorot_uniform'))
                #     model.add(Activation('linear'))
                #
                #     ## loss
                #     model.compile(loss='mean_squared_error', optimizer="adam")
                #
                #     ## to array
                #     X_train = X_train.toarray()
                #     X_valid = X_valid.toarray()
                #
                #     ## scale
                #     scaler = StandardScaler()
                #     X_train[index_base] = scaler.fit_transform(X_train[index_base])
                #     X_valid = scaler.transform(X_valid)
                #
                #     ## train
                #     model.fit(X_train[index_base], labels_train[index_base],
                #                 nb_epoch=param['nb_epoch'], batch_size=param['batch_size'],
                #                 validation_split=0, verbose=0)
                #
                #     ##prediction
                #     pred = model.predict(X_valid, verbose=0)
                #     pred.shape = (X_valid.shape[0],)

                elif param['task'] == "reg_rgf":
                    ## regression with regularized greedy forest (rgf)
                    ## to array
                    X_train, X_valid = X_train.toarray(), X_valid.toarray()

                    train_x_fn = feat_train_path + ".x"
                    train_y_fn = feat_train_path + ".y"
                    valid_x_fn = feat_valid_path + ".x"
                    valid_pred_fn = feat_valid_path + ".pred"

                    model_fn_prefix = "rgf_model"

                    np.savetxt(train_x_fn,
                               X_train[index_base],
                               fmt="%.6f",
                               delimiter='\t')
                    np.savetxt(train_y_fn,
                               labels_train[index_base],
                               fmt="%d",
                               delimiter='\t')
                    np.savetxt(valid_x_fn, X_valid, fmt="%.6f", delimiter='\t')
                    # np.savetxt(valid_y_fn, labels_valid, fmt="%d", delimiter='\t')

                    pars = [
                        "train_x_fn=",
                        train_x_fn,
                        "\n",
                        "train_y_fn=",
                        train_y_fn,
                        "\n",
                        #"train_w_fn=",weight_train_path,"\n",
                        "model_fn_prefix=",
                        model_fn_prefix,
                        "\n",
                        "reg_L2=",
                        param['reg_L2'],
                        "\n",
                        #"reg_depth=", 1.01, "\n",
                        "algorithm=",
                        "RGF",
                        "\n",
                        "loss=",
                        "LS",
                        "\n",
                        #"opt_interval=", 100, "\n",
                        "valid_interval=",
                        param['max_leaf_forest'],
                        "\n",
                        "max_leaf_forest=",
                        param['max_leaf_forest'],
                        "\n",
                        "num_iteration_opt=",
                        param['num_iteration_opt'],
                        "\n",
                        "num_tree_search=",
                        param['num_tree_search'],
                        "\n",
                        "min_pop=",
                        param['min_pop'],
                        "\n",
                        "opt_interval=",
                        param['opt_interval'],
                        "\n",
                        "opt_stepsize=",
                        param['opt_stepsize'],
                        "\n",
                        "NormalizeTarget"
                    ]
                    pars = "".join([str(p) for p in pars])

                    rfg_setting_train = "./rfg_setting_train"
                    with open(rfg_setting_train + ".inp", "wb") as f:
                        f.write(pars)

                    ## train fm
                    cmd = "perl %s %s train %s >> rgf.log" % (
                        call_exe, rgf_exe, rfg_setting_train)
                    #print cmd
                    os.system(cmd)

                    model_fn = model_fn_prefix + "-01"
                    pars = [
                        "test_x_fn=", valid_x_fn, "\n", "model_fn=", model_fn,
                        "\n", "prediction_fn=", valid_pred_fn
                    ]

                    pars = "".join([str(p) for p in pars])

                    rfg_setting_valid = "./rfg_setting_valid"
                    with open(rfg_setting_valid + ".inp", "wb") as f:
                        f.write(pars)
                    cmd = "perl %s %s predict %s >> rgf.log" % (
                        call_exe, rgf_exe, rfg_setting_valid)
                    #print cmd
                    os.system(cmd)

                    pred = np.loadtxt(valid_pred_fn, dtype=float)

                ## weighted averaging over different models
                pred_valid = pred
                ## this bagging iteration
                preds_bagging[:, n] = pred_valid  # column n holds this bagging round's predictions
                pred_raw = np.mean(preds_bagging[:, :(n + 1)],
                                   axis=1)  # row-wise mean over the bagging rounds so far
                # pred_rank = pred_raw.argsort().argsort()    # double argsort yields each prediction's rank
                # pred_score, cutoff = getScore(pred_rank, cdf_valid, valid=True) # map ranks to scores via the cdf
                # kappa_valid = quadratic_weighted_kappa(pred_score, Y_valid) # quadratic weighted kappa
                log_loss_valid = elementwise.log_loss(Y_valid, pred_raw)
                print('Y_valid mean:', np.mean(Y_valid))
                print('pred_raw mean:', np.mean(pred_raw))
                if (n + 1) != bagging_size:
                    print(
                        "              {:>3}   {:>3}   {:>3}   {:>6}   {} x {}"
                        .format(run, fold, n + 1, np.round(log_loss_valid, 6),
                                X_train.shape[0], X_train.shape[1]))
                else:
                    print(
                        "                    {:>3}       {:>3}      {:>3}    {:>8}  {} x {}"
                        .format(run, fold, n + 1, np.round(log_loss_valid, 6),
                                X_train.shape[0], X_train.shape[1]))
            log_loss_cv[run - 1, fold - 1] = log_loss_valid
            ## save the raw per-row predictions for this fold
            dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_raw})
            dfPred.to_csv(raw_pred_valid_path,
                          index=False,
                          header=True,
                          columns=["target", "prediction"])
            # save the cdf-based predictions derived from the ranked raw predictions
            # dfPred = pd.DataFrame({"target": Y_valid, "prediction": pred_rank})
            # dfPred.to_csv(rank_pred_valid_path, index=False, header=True, columns=["target", "prediction"])

    log_loss_cv_mean = np.mean(log_loss_cv)
    log_loss_cv_std = np.std(log_loss_cv)
    if verbose_level >= 1:
        print("              Mean: %.6f" % log_loss_cv_mean)
        print("              Std: %.6f" % log_loss_cv_std)

    ####################
    #### Retraining ####
    ####################
    #### all the path
    # path = "%s/All" % (feat_folder)
    # save_path = "%s/All" % output_path
    # subm_path = "%s/Subm" % output_path
    # if not os.path.exists(save_path):
    #     os.makedirs(save_path)
    # if not os.path.exists(subm_path):
    #     os.makedirs(subm_path)
    # # feat
    # feat_train_path = "%s/train.feat" % path
    # feat_test_path = "%s/test.feat" % path
    # # weight
    # # weight_train_path = "%s/train.feat.weight" % path
    # # info
    # info_train_path = "%s/train.info" % path
    # info_test_path = "%s/test.info" % path
    # # cdf
    # cdf_test_path = "%s/test.cdf" % path
    # # raw prediction path (rank)
    # raw_pred_test_path = "%s/test.raw.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)
    # rank_pred_test_path = "%s/test.pred.%s_[Id@%d].csv" % (save_path, feat_name, trial_counter)
    # # submission path (is_duplicate as in [0, 1])
    # subm_path = "%s/test.pred.%s_[Id@%d]_[Mean%.6f]_[Std%.6f].csv" % (subm_path, feat_name, trial_counter, log_loss_cv_mean, log_loss_cv_std)
    #
    # #### load data
    # ## load feat
    # X_train, labels_train = load_svmlight_file(feat_train_path)
    # X_test, labels_test = load_svmlight_file(feat_test_path)
    # if X_test.shape[1] < X_train.shape[1]:
    #     X_test = hstack([X_test, np.zeros((X_test.shape[0], X_train.shape[1]-X_test.shape[1]))])
    # elif X_test.shape[1] > X_train.shape[1]:
    #     X_train = hstack([X_train, np.zeros((X_train.shape[0], X_test.shape[1]-X_train.shape[1]))])
    # X_train = X_train.tocsr()
    # X_test = X_test.tocsr()
    # ## load train weight
    # # weight_train = np.loadtxt(weight_train_path, dtype=float)
    # ## load test info
    # info_train = pd.read_csv(info_train_path)
    # numTrain = info_train.shape[0]
    # info_test = pd.read_csv(info_test_path)
    # numTest = info_test.shape[0]
    # id_test = info_test["id"]
    #
    # ## load cdf
    # cdf_test = np.loadtxt(cdf_test_path, dtype=float)
    # # ## evaluation functions
    # # evalerror_regrank_test = lambda preds,dtrain: evalerror_regrank_cdf(preds, dtrain, cdf_test)
    # # evalerror_softmax_test = lambda preds,dtrain: evalerror_softmax_cdf(preds, dtrain, cdf_test)
    # # evalerror_softkappa_test = lambda preds,dtrain: evalerror_softkappa_cdf(preds, dtrain, cdf_test)
    # # evalerror_ebc_test = lambda preds,dtrain: evalerror_ebc_cdf(preds, dtrain, cdf_test, ebc_hard_threshold)
    # # evalerror_cocr_test = lambda preds,dtrain: evalerror_cocr_cdf(preds, dtrain, cdf_test)
    #
    # ## bagging
    # preds_bagging = np.zeros((numTest, bagging_size), dtype=float)
    # for n in range(bagging_size):
    #     if bootstrap_replacement:
    #         sampleSize = int(numTrain*bootstrap_ratio)
    #         #index_meta = rng.randint(numTrain, size=sampleSize)
    #         #index_base = [i for i in range(numTrain) if i not in index_meta]
    #         index_base = rng.randint(numTrain, size=sampleSize)
    #         index_meta = [i for i in range(numTrain) if i not in index_base]
    #     else:
    #         randnum = rng.uniform(size=numTrain)
    #         index_base = [i for i in range(numTrain) if randnum[i] < bootstrap_ratio]
    #         index_meta = [i for i in range(numTrain) if randnum[i] >= bootstrap_ratio]
    #
    #     # if using an xgboost booster, convert the data to DMatrix format first
    #     if "booster" in param:
    #         dtest = xgb.DMatrix(X_test, label=labels_test)
    #         dtrain = xgb.DMatrix(X_train[index_base], label=labels_train[index_base])   # , weight=weight_train[index_base]
    #
    #         watchlist = []
    #         if verbose_level >= 2:
    #             watchlist  = [(dtrain, 'train')]
    #
    #     ## train
    #     if param["task"] in ["regression", "ranking"]:
    #         bst = xgb.train(param, dtrain, param['num_round'], watchlist)   # , feval=evalerror_regrank_test
    #         pred = bst.predict(dtest)
    #
    #     elif param["task"] in ["softmax"]:
    #         bst = xgb.train(param, dtrain, param['num_round'], watchlist)   # , feval=evalerror_softmax_test
    #         pred = bst.predict(dtest)
    #         w = np.asarray(range(1,numValid))
    #         pred = pred * w[np.newaxis,:]
    #         pred = np.sum(pred, axis=1)
    #
    #     elif param["task"] in ["softkappa"]:
    #         #  custom objective
    #         # obj = lambda preds, dtrain: softkappaObj(preds, dtrain, hess_scale=param['hess_scale'])
    #         bst = xgb.train(param, dtrain, param['num_round'], watchlist)   # , obj=obj, feval=evalerror_softkappa_test
    #         pred = softmax(bst.predict(dtest))
    #         w = np.asarray(range(1,numValid))
    #         pred = pred * w[np.newaxis,:]
    #         pred = np.sum(pred, axis=1)
    #
    #     elif param["task"]  in ["ebc"]:
    #         #  custom objective
    #         # obj = lambda preds, dtrain: ebcObj(preds, dtrain)
    #         bst = xgb.train(param, dtrain, param['num_round'], watchlist)   # , obj=obj, feval=evalerror_ebc_test
    #         pred = sigmoid(bst.predict(dtest))
    #         pred = applyEBCRule(pred, hard_threshold=ebc_hard_threshold)
    #
    #     elif param["task"]  in ["cocr"]:
    #         #  custom objective
    #         obj = lambda preds, dtrain: cocrObj(preds, dtrain)
    #         bst = xgb.train(param, dtrain, param['num_round'], watchlist)   # , obj=obj, feval=evalerror_cocr_test
    #         pred = bst.predict(dtest)
    #         pred = applyCOCRRule(pred)
    #
    #     elif param['task'] == "reg_skl_rf":
    #         ## random forest regressor
    #         rf = RandomForestRegressor(n_estimators=param['n_estimators'],
    #                                    max_features=param['max_features'],
    #                                    n_jobs=param['n_jobs'],
    #                                    random_state=param['random_state'])
    #         rf.fit(X_train[index_base], labels_train[index_base]) # , sample_weight=weight_train[index_base]
    #         pred = rf.predict(X_test)
    #
    #     elif param['task'] == "reg_skl_etr":
    #         ## extra trees regressor
    #         etr = ExtraTreesRegressor(n_estimators=param['n_estimators'],
    #                                   max_features=param['max_features'],
    #                                   n_jobs=param['n_jobs'],
    #                                   random_state=param['random_state'])
    #         etr.fit(X_train[index_base], labels_train[index_base])    # , sample_weight=weight_train[index_base]
    #         pred = etr.predict(X_test)
    #
    #     elif param['task'] == "reg_skl_gbm":
    #         ## gradient boosting regressor
    #         gbm = GradientBoostingRegressor(n_estimators=param['n_estimators'],
    #                                         max_features=param['max_features'],
    #                                         learning_rate=param['learning_rate'],
    #                                         max_depth=param['max_depth'],
    #                                         subsample=param['subsample'],
    #                                         random_state=param['random_state'])
    #         gbm.fit(X_train.toarray()[index_base], labels_train[index_base])  #, sample_weight=weight_train[index_base]
    #         pred = gbm.predict(X_test.toarray())
    #
    #     elif param['task'] == "clf_skl_lr":
    #         lr = LogisticRegression(penalty="l2", dual=True, tol=1e-5,
    #                                 C=param['C'], fit_intercept=True, intercept_scaling=1.0,
    #                                 class_weight='auto', random_state=param['random_state'])
    #         lr.fit(X_train[index_base], labels_train[index_base])
    #         pred = lr.predict_proba(X_test)
    #         w = np.asarray(range(1,numValid))
    #         pred = pred * w[np.newaxis,:]
    #         pred = np.sum(pred, axis=1)
    #
    #     elif param['task'] == "reg_skl_svr":
    #         ## regression with sklearn support vector regression
    #         X_train, X_test = X_train.toarray(), X_test.toarray()
    #         scaler = StandardScaler()
    #         X_train[index_base] = scaler.fit_transform(X_train[index_base])
    #         X_test = scaler.transform(X_test)
    #         svr = SVR(C=param['C'], gamma=param['gamma'], epsilon=param['epsilon'],
    #                                 degree=param['degree'], kernel=param['kernel'])
    #         svr.fit(X_train[index_base], labels_train[index_base])    # , sample_weight=weight_train[index_base]
    #         pred = svr.predict(X_test)
    #
    #     elif param['task'] == "reg_skl_ridge":
    #         ridge = Ridge(alpha=param["alpha"], normalize=True)
    #         ridge.fit(X_train[index_base], labels_train[index_base])  # , sample_weight=weight_train[index_base]
    #         pred = ridge.predict(X_test)
    #
    #     elif param['task'] == "reg_skl_lasso":
    #         lasso = Lasso(alpha=param["alpha"], normalize=True)
    #         lasso.fit(X_train[index_base], labels_train[index_base])
    #         pred = lasso.predict(X_test)
    #
    #     elif param['task'] == 'reg_libfm':
    #         ## to array
    #         X_train, X_test = X_train.toarray(), X_test.toarray()
    #
    #         ## scale
    #         scaler = StandardScaler()
    #         X_train[index_base] = scaler.fit_transform(X_train[index_base])
    #         X_test = scaler.transform(X_test)
    #
    #         ## dump feat
    #         dump_svmlight_file(X_train[index_base], labels_train[index_base], feat_train_path+".tmp")
    #         dump_svmlight_file(X_test, labels_test, feat_test_path+".tmp")
    #
    #         ## train fm
    #         cmd = "%s -task r -train %s -test %s -out %s -dim '1,1,%d' -iter %d > libfm.log" % ( \
    #                     libfm_exe, feat_train_path+".tmp", feat_test_path+".tmp", raw_pred_test_path, \
    #                     param['dim'], param['iter'])
    #         os.system(cmd)
    #         os.remove(feat_train_path+".tmp")
    #         os.remove(feat_test_path+".tmp")
    #
    #         ## extract libfm prediction
    #         pred = np.loadtxt(raw_pred_test_path, dtype=float)
    #         ## labels are in [0,1,2,3]
    #         pred += 1
    #
    #     elif param['task'] == "reg_keras_dnn":
    #         ## regression with keras deep neural networks
    #         model = Sequential()
    #         ## input layer
    #         model.add(Dropout(param["input_dropout"]))
    #         ## hidden layers
    #         first = True
    #         hidden_layers = param['hidden_layers']
    #         while hidden_layers > 0:
    #             if first:
    #                 dim = X_train.shape[1]
    #                 first = False
    #             else:
    #                 dim = param["hidden_units"]
    #             model.add(Dense(dim, param["hidden_units"], init='glorot_uniform'))
    #             if param["batch_norm"]:
    #                 model.add(BatchNormalization((param["hidden_units"],)))
    #             if param["hidden_activation"] == "prelu":
    #                 model.add(PReLU((param["hidden_units"],)))
    #             else:
    #                 model.add(Activation(param['hidden_activation']))
    #             model.add(Dropout(param["hidden_dropout"]))
    #             hidden_layers -= 1
    #
    #         ## output layer
    #         model.add(Dense(param["hidden_units"], 1, init='glorot_uniform'))
    #         model.add(Activation('linear'))
    #
    #         ## loss
    #         model.compile(loss='mean_squared_error', optimizer="adam")
    #
    #         ## to array
    #         X_train = X_train.toarray()
    #         X_test = X_test.toarray()
    #
    #         ## scale
    #         scaler = StandardScaler()
    #         X_train[index_base] = scaler.fit_transform(X_train[index_base])
    #         X_test = scaler.transform(X_test)
    #
    #         ## train
    #         model.fit(X_train[index_base], labels_train[index_base],
    #                     nb_epoch=param['nb_epoch'], batch_size=param['batch_size'], verbose=0)
    #
    #         ##prediction
    #         pred = model.predict(X_test, verbose=0)
    #         pred.shape = (X_test.shape[0],)
    #
    #     elif param['task'] == "reg_rgf":
    #         ## to array
    #         X_train, X_test = X_train.toarray(), X_test.toarray()
    #
    #         train_x_fn = feat_train_path+".x"
    #         train_y_fn = feat_train_path+".y"
    #         test_x_fn = feat_test_path+".x"
    #         test_pred_fn = feat_test_path+".pred"
    #
    #         model_fn_prefix = "rgf_model"
    #
    #         np.savetxt(train_x_fn, X_train[index_base], fmt="%.6f", delimiter='\t')
    #         np.savetxt(train_y_fn, labels_train[index_base], fmt="%d", delimiter='\t')
    #         np.savetxt(test_x_fn, X_test, fmt="%.6f", delimiter='\t')
    #         # np.savetxt(valid_y_fn, labels_valid, fmt="%d", delimiter='\t')
    #
    #
    #         pars = [
    #             "train_x_fn=",train_x_fn,"\n",
    #             "train_y_fn=",train_y_fn,"\n",
    #             #"train_w_fn=",weight_train_path,"\n",
    #             "model_fn_prefix=",model_fn_prefix,"\n",
    #             "reg_L2=", param['reg_L2'], "\n",
    #             #"reg_depth=", 1.01, "\n",
    #             "algorithm=","RGF","\n",
    #             "loss=","LS","\n",
    #             "test_interval=", param['max_leaf_forest'],"\n",
    #             "max_leaf_forest=", param['max_leaf_forest'],"\n",
    #             "num_iteration_opt=", param['num_iteration_opt'], "\n",
    #             "num_tree_search=", param['num_tree_search'], "\n",
    #             "min_pop=", param['min_pop'], "\n",
    #             "opt_interval=", param['opt_interval'], "\n",
    #             "opt_stepsize=", param['opt_stepsize'], "\n",
    #             "NormalizeTarget"
    #         ]
    #         pars = "".join([str(p) for p in pars])
    #
    #         rfg_setting_train = "./rfg_setting_train"
    #         with open(rfg_setting_train+".inp", "wb") as f:
    #             f.write(pars)
    #
    #         ## train fm
    #         cmd = "perl %s %s train %s >> rgf.log" % (
    #                 call_exe, rgf_exe, rfg_setting_train)
    #         #print cmd
    #         os.system(cmd)
    #
    #
    #         model_fn = model_fn_prefix + "-01"
    #         pars = [
    #             "test_x_fn=",test_x_fn,"\n",
    #             "model_fn=", model_fn,"\n",
    #             "prediction_fn=", test_pred_fn
    #         ]
    #
    #         pars = "".join([str(p) for p in pars])
    #
    #         rfg_setting_test = "./rfg_setting_test"
    #         with open(rfg_setting_test+".inp", "wb") as f:
    #             f.write(pars)
    #         cmd = "perl %s %s predict %s >> rgf.log" % (
    #                 call_exe, rgf_exe, rfg_setting_test)
    #         #print cmd
    #         os.system(cmd)
    #
    #         pred = np.loadtxt(test_pred_fn, dtype=float)
    #
    #     ## weighted averaging over different models
    #     pred_test = pred
    #     preds_bagging[:,n] = pred_test
    # pred_raw = np.mean(preds_bagging, axis=1)
    # pred_rank = pred_raw.argsort().argsort()
    # #
    # ## write
    # output = pd.DataFrame({"id": id_test, "prediction": pred_raw})
    # output.to_csv(raw_pred_test_path, index=False)
    #
    # ## write
    # output = pd.DataFrame({"id": id_test, "prediction": pred_rank})
    # output.to_csv(rank_pred_test_path, index=False)
    #
    # ## write score
    # pred_score = getScore(pred, cdf_test)
    # output = pd.DataFrame({"id": id_test, "prediction": pred_score})
    # output.to_csv(subm_path, index=False)
    # #"""

    return log_loss_cv_mean, log_loss_cv_std
Exemplo n.º 32
0
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -1814.3293695408152
exported_pipeline = make_pipeline(
    MaxAbsScaler(),
    ExtraTreesRegressor(bootstrap=True, max_features=0.35000000000000003, min_samples_leaf=1, min_samples_split=14, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
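# A minimal, optional follow-up (not part of the TPOT export): score the held-out
# predictions, assuming the numeric regression target described in the note above.
from sklearn.metrics import mean_squared_error, r2_score

print("Holdout MSE:", mean_squared_error(testing_target, results))
print("Holdout R^2:", r2_score(testing_target, results))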
Exemplo n.º 33
0
        predictions = np.column_stack([regr.predict(X) for regr in self.regr_])
        return np.mean(predictions, axis=1)


en = make_pipeline(RobustScaler(), SelectFromModel(Lasso(alpha=0.03)),
                   ElasticNet(alpha=0.001, l1_ratio=0.1))

rf = RandomForestRegressor(n_estimators=250,
                           n_jobs=4,
                           min_samples_split=25,
                           min_samples_leaf=25,
                           max_depth=3)

et = ExtraTreesRegressor(n_estimators=100,
                         n_jobs=4,
                         min_samples_split=25,
                         min_samples_leaf=35,
                         max_features=150)

xgbm = xgb.sklearn.XGBRegressor(max_depth=4,
                                learning_rate=0.005,
                                subsample=0.9,
                                base_score=y_mean,
                                objective='reg:linear',
                                n_estimators=1000)

stack_avg = StackingCVRegressorAveraged((en, rf, et),
                                        ElasticNet(l1_ratio=0.1, alpha=1.4))

stack_with_feats = StackingCVRegressorRetrained((en, rf, et),
                                                xgbm,
Exemplo n.º 34
0
def perishing_mother_wife(passenger):
    surname, Pclass, person = passenger
    return 1.0 if (surname in perishing_female_surnames) else 0.0
full_data['perishing_mother_wife'] = full_data[['surname', 'Pclass', 'person']].apply(perishing_mother_wife, axis=1)

#### Surviving Males
surviving_male_surnames = list(set(full_data[(full_data.male_adult == 1.0) &
                                     (full_data.Survived == 1.0) &
                                     ((full_data.Parch > 0) | (full_data.SibSp > 0))]['surname'].values))
def surviving_father_husband(passenger):
    surname, Pclass, person = passenger
    return 1.0 if (surname in surviving_male_surnames) else 0.0
full_data['surviving_father_husband'] = full_data[['surname', 'Pclass', 'person']].apply(surviving_father_husband, axis=1)

classers = ['Fare','Parch','Pclass','SibSp','TitleCat','CabinCat','Sex_female','Sex_male', 'EmbarkedCat', 'FamilySize', 'NameLength', 'FamilyId']
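# fit an extra-trees regressor on passengers whose Age is known, then predict (impute) the missing ages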
age_et = ExtraTreesRegressor(n_estimators=200)
X_train = full_data.loc[full_data.Age.notnull(),classers]
Y_train = full_data.loc[full_data.Age.notnull(),['Age']]
X_test = full_data.loc[full_data.Age.isnull(),classers]
age_et.fit(X_train,np.ravel(Y_train))
age_preds = age_et.predict(X_test)
full_data.loc[full_data.Age.isnull(),['Age']] = age_preds

######################################################################
######################################################################

print('Building Model...')

#### Model Build - Random Forest (Categorical Features)
model_dummys = ['Age','male_adult', 'female_adult', 'child','perishing_mother_wife','surviving_father_husband','Fare','Parch','Pclass','SibSp','TitleCat','CabinCat','Sex_female','Sex_male', 'EmbarkedCat', 'FamilySize', 'NameLength', 'FamilyId']
model_rf = RandomForestClassifier(n_estimators=300, min_samples_leaf=4, class_weight={0:0.745,1:0.255})
Exemplo n.º 35
0
def ETCf():
    import pandas as pd
    import numpy as np
    import seaborn as sns
    from datetime import datetime
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.ensemble import ExtraTreesRegressor
    from xgboost import XGBClassifier
    from sklearn.svm import LinearSVC
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import r2_score

    data1 = pd.read_excel("littoralis1516.xlsx")

    data1.head()

    data1.drop(["Date", "Larves"], axis=1, inplace=True)

    data1.head()

    data1 = data1.astype(float)

    data1["GDD"] = data1.Temp.astype(float) - 10

    data1.head()

    x = data1.iloc[:, 0].values

    y = data1.iloc[:, 1:8].values

    x

    y

    xtrain, xtest, ytrain, ytest = train_test_split(y,
                                                    x,
                                                    test_size=0.2,
                                                    random_state=0)

    #regressor=LinearRegression()
    #regressor=RandomForestRegressor(n_estimators=10,random_state=0,max_depth=20)                                     #max depth=10
    regressor = ExtraTreesRegressor(n_estimators=100,
                                    random_state=0,
                                    max_depth=10,
                                    min_samples_split=5)  #max depth=5
    #regressor=XGBClassifier()
    #regressor=LinearSVC()
    #regressor = LogisticRegression()

    regressor.fit(xtrain, ytrain)

    y_pred = regressor.predict(xtest)

    data1_cmp = pd.DataFrame(list(zip(y_pred, ytest)))

    data1_cmp['Difference'] = abs(data1_cmp[0] - data1_cmp[1])

    data1_cmp.rename(columns={0: "Predicted", 1: "Actual"}, inplace=True)

    data1_cmp.head()
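    # note: despite its name, 'MAPE' below is the mean absolute difference (MAE), not a percentage error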

    MAPE = data1_cmp['Difference'].mean()
    x000 = float("{:.5f}".format(MAPE))
    print("MAPE: %.5f" % (MAPE))
    Error = np.mean(data1_cmp["Difference"]) / np.mean(data1_cmp["Actual"])
    x11 = Error * 100
    x111 = float("{:.2f}".format(x11))
    print("Error: %.2f%%" % (Error * 100))

    Accuracy = accuracy_score((ytest * 100).astype(int),
                              (y_pred * 100).astype(int))
    #Accuracy = r2_score(ytest,y_pred)

    print("Accuracy: %.2f%%" % (Accuracy * 100.0))
    x22 = Accuracy * 100
    x222 = float("{:.2f}".format(x22))
    #plt.plot(data1_cmp.Actual, color="r")
    #plt.plot(data1_cmp.Predicted, color ="b")
    global Label11
    Label11 = Label(root, text="MAPE=")
    global Label12
    Label12 = Label(root, text=x000)

    global Label21
    Label21 = Label(root, text="Error=")
    global Label22
    Label22 = Label(root, text=x111)

    global Label31
    Label31 = Label(root, text="Accuracy=")
    global Label32
    Label32 = Label(root, text=x222)

    Label11.grid(row=10, column=5)
    Label12.grid(row=10, column=6)

    Label21.grid(row=11, column=5)
    Label22.grid(row=11, column=6)

    Label31.grid(row=12, column=5)
    Label32.grid(row=12, column=6)
    ETC['state'] = DISABLED
Exemplo n.º 36
0
    'property_type', 'room_type', 'bed_type', 'cancellation_policy'
]
base_airbnb_cod = pd.get_dummies(data=base_airbnb_cod,
                                 columns=colunas_categorias)
print(base_airbnb_cod.head())


#Prediction model
def avaliar_modelo(nome_modelo, y_test, previsao):
    r2 = r2_score(y_test, previsao)
    RMSE = np.sqrt(mean_squared_error(y_test, previsao))
    return f'Model {nome_modelo}:\nR2:{r2:.2%}\nRMSE:{RMSE:.2f}'


modelo_rf = RandomForestRegressor()
modelo_lr = LinearRegression()
modelo_et = ExtraTreesRegressor()
modelos = {
    'RandomForest': modelo_rf,
    'LinearRegression': modelo_lr,
    'ExtraTreesRegressor': modelo_et
}
y = base_airbnb_cod['price']
x = base_airbnb_cod.drop('price', axis=1)
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=10)
for nome_modelo, modelo in modelos.items():
    #train
    modelo.fit(X_train, y_train)
    #test
    previsao = modelo.predict(X_test)
    print(avaliar_modelo(nome_modelo, y_test, previsao))
Exemplo n.º 37
0
print(a.columns)
final_dataset = a[[
    'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Fuel_Type',
    'Seller_Type', 'Transmission', 'Owner'
]]
final_dataset["current_year"] = 2020
final_dataset[
    "new_year"] = final_dataset["current_year"] - final_dataset["Year"]
final_dataset.drop(columns=['Year', 'current_year'], inplace=True)
final_dataset = pd.get_dummies(final_dataset, drop_first=True)
print(final_dataset)
x = final_dataset.iloc[:, 1:]
y = final_dataset.iloc[:, 0]
#feature importance
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor()
model.fit(x, y)
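# model.feature_importances_ can now be inspected (e.g. plotted) to rank the predictors; this snippet only fits the model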
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
from sklearn.ensemble import RandomForestRegressor
r = RandomForestRegressor()
from sklearn.model_selection import RandomizedSearchCV

#Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# Number of features to consider at every split
Exemplo n.º 38
0
train = pd.read_csv("../../input/train.csv") # read train data
test = pd.read_csv("../../input/test.csv") # read test data

# build a model library (can be improved)
base_models = [
        RandomForestRegressor(
            n_jobs=1, random_state=0,
            n_estimators=500, max_features=14
        ),
        RandomForestRegressor(
            n_jobs=1, random_state=0,
            n_estimators=500, max_features=20,
            max_depth=7
        ),
        ExtraTreesRegressor(
            n_jobs=1, random_state=0,
            n_estimators=500, max_features=15
        ),
        ExtraTreesRegressor(
            n_jobs=1, random_state=0,
            n_estimators=500, max_features=20
        ),
        GradientBoostingRegressor(
            random_state=0,
            n_estimators=500, max_features=10, max_depth=6,
            learning_rate=0.05, subsample=0.8
        ),
        GradientBoostingRegressor(
            random_state=0,
            n_estimators=500, max_features=15, max_depth=6,
            learning_rate=0.05, subsample=0.8
        ),
Exemplo n.º 39
0
        #print('vif :', vif)
        print('dropping ' + X[list_factors].columns[maxloc] + ' at index:  ' +
              str(maxloc))
        del list_factors[maxloc]
    else:
        break
print('Final variables:', list_factors)

X = X[list_factors]

# ensembles
ensembles = []
ensembles.append(('AB', AdaBoostRegressor()))
ensembles.append(('GBM', GradientBoostingRegressor()))
ensembles.append(('RF', RandomForestRegressor()))
ensembles.append(('ET', ExtraTreesRegressor()))

r2_results = []
mse_results = []
names = []

# evaluate model
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=200)

for name, model in ensembles:
    fs = SelectKBest(score_func=f_regression)
    pipeline = Pipeline(steps=[('anova', fs), ('model', model)])
    # define the grid
    grid = {
        'anova__k': [i + 1 for i in range(X.shape[1])],
        'model__n_estimators': randint(10, 400)
Exemplo n.º 40
0
    # Plot the estimated stability scores for a given alpha

    # Use 6-fold cross-validation rather than the default 3-fold: it leads to
    # a better choice of alpha.
    # Suppress the user warnings; they are not needed for this example,
    # which is specifically set up to be challenging.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        lars_cv = LassoLarsCV(cv=6).fit(X, y)

    # Run the RandomizedLasso: we use paths going down to 0.1*alpha_max
    # to avoid exploring the regime in which very noisy variables enter
    # the model
    alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
    clf = RandomizedLasso(alpha=alphas, random_state=42).fit(X, y)
    trees = ExtraTreesRegressor(100, compute_importances=True).fit(X, y)
    # Compare with F-score
    F, _ = f_regression(X, y)

    pl.figure()
    for name, score in [
        ('F-test', F),
        ('Stability selection', clf.scores_),
        ('Lasso coefs', np.abs(lars_cv.coef_)),
        ('Trees', trees.feature_importances_),
    ]:
        precision, recall, thresholds = precision_recall_curve(
            coef != 0, score)
        pl.semilogy(np.maximum(score / np.max(score), 1e-4),
                    label="%s. AUC: %.3f" % (name, auc(recall, precision)))
Exemplo n.º 41
0
X_scaled=pca.fit_transform(X_scaled)
test_X_scaled = pca.transform(test_X_scaled)
print(X_scaled.shape, test_X_scaled.shape)

'''modeling & evaluation'''
#34
# define cross validation strategy
def rmse_cv(model,X,y):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5))
    return rmse

#35
#We choose 13 models and use 5-fold cross-validation to evaluate these models.
models = [LinearRegression(),Ridge(),Lasso(alpha=0.01,max_iter=10000),RandomForestRegressor(),GradientBoostingRegressor(),SVR(),LinearSVR(),
          ElasticNet(alpha=0.001,max_iter=10000),SGDRegressor(),BayesianRidge(),KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
          ExtraTreesRegressor(),XGBRegressor()]

#36
names = ["LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR", "Ela","SGD","Bay","Ker","Extra","Xgb"]
for name, model in zip(names, models):
    score = rmse_cv(model, X_scaled, y_log)
    print("{}: {:.6f}, {:.4f}".format(name,score.mean(),score.std()))

#37
#Next we do some hyperparameter tuning. First we define a grid-search helper.
class grid():
    def __init__(self, model):
        self.model = model

    def grid_get(self, X, y, param_grid):
        grid_search = GridSearchCV(self.model, param_grid, cv=5, scoring="neg_mean_squared_error")
Exemplo n.º 42
0
# forest = RandomForestRegressor( n_estimators=100,
#                                 max_depth=10,
#                                 n_jobs=-1 )

# forest = RegressionForest( n_estimators=30,
#                            min_items=5,
#                            max_depth=30,
#                            nb_tests=1000,
#                            test="axis",
#                            verbose=False)
# print forest.get_params()

forest = ExtraTreesRegressor(
    n_estimators=2000,
    min_samples_leaf=3,
    max_depth=60,
    #  bootstrap=True,
    n_jobs=-1)

forest.fit(all_points, all_responses)

#param_name = "max_depth"
#param_range = np.logspace(0, 2, 10)
#param_range = [60]

# param_name = "min_samples_leaf"
# param_range = np.logspace(0, 2, 5)

#param_name = "bootstrap"
#param_range = np.logspace(-1, 0, 10)
Exemplo n.º 43
0
import numpy as np

from sklearn.ensemble import ExtraTreesRegressor, VotingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(VotingClassifier([("est", ElasticNet(alpha=1.0, l1_ratio=0.84))]), FunctionTransformer(lambda X: X)),
    VarianceThreshold(threshold=26.0),
    ExtraTreesRegressor(max_features=0.6900000000000001, n_estimators=500)
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
Exemplo n.º 44
0
Y_train = ss_Y.fit_transform(Y_train.reshape(-1, 1))
Y_test = ss_Y.transform(Y_test.reshape(-1, 1))
# reshape the 1-D target Y into a 2-D array

rfr = RandomForestRegressor()
rfr.fit(X_train, Y_train.ravel())
rfr_Y_predict = rfr.predict(X_test)
# use the random forest regression model

print 'R-squared of RandomForestRegressor:', rfr.score(X_test, Y_test)
print 'the mean squared of RandomForestRegressor:', mean_squared_error(
    ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rfr_Y_predict))
print 'the mean absolute squared of RandomForestRegressor:', mean_absolute_error(
    ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rfr_Y_predict))

etr = ExtraTreesRegressor()
etr.fit(X_train, Y_train.ravel())
etr_Y_predict = etr.predict(X_test)
# use the extra trees regression model

print 'R-squared of ExtraTreesRegressor:', etr.score(X_test, Y_test)
print 'the mean squared of ExtraTreesRegressor:', mean_squared_error(
    ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(etr_Y_predict))
print 'the mean absolute squared of ExtraTreesRegressor:', mean_absolute_error(
    ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(etr_Y_predict))

print np.sort(zip(etr.feature_importances_, boston.feature_names), axis=0)
# use the trained extra trees model to report each feature's importance for the prediction target

gbr = GradientBoostingRegressor()
gbr.fit(X_train, Y_train.ravel())
Exemplo n.º 45
0
#3.5 Random forest
# tune hyperparameters and build the model
rfr = RandomForestRegressor(n_estimators=300, n_jobs=-1)
param_grid = {  #'min_samples_leaf':[3,5,10],
    #'max_depth':[20,30,40,55],
    'max_features': [20, 40, 60, 80]
}
rfr = gscvr(x_train, y_train, rfr, param_grid)
# cross-validation accuracy & learning curve
rmse_cv(rfr, x_train, y_train)
plot_learning_curve_r(x_train, y_train, rfr, 'rfr learning_curve')

#3.6 Extra trees
# tune hyperparameters and build the model
etr = ExtraTreesRegressor(n_estimators=500, n_jobs=-1)
param_grid = {  #'min_samples_leaf':[3,5,10],
    'max_depth': [3, 5, 8],
    'max_features': [40, 60, 80, 120, 160]
}
etr = gscvr(x_train, y_train, etr, param_grid)
# cross-validation accuracy & learning curve
rmse_cv(etr, x_train, y_train)
plot_learning_curve_r(x_train, y_train, etr, 'etr learning_curve')

#3.7 XGBoost
# tune hyperparameters and build the model
xgbr = XGBRegressor(
    colsample_bytree=0.6,
    learning_rate=0.07,
    min_child_weight=1.5,
Exemplo n.º 46
0
            R = np.clip(R, -1, 1)

        toc('Number of non-zero feature: %s' % np.count_nonzero(np.mean(F[:-1], axis=0)))
        tic('Keeping NZV features')
        support = np.var(F, axis=0) != 0  # Keep only features with nonzero variance
        toc('Using %s features' % support.sum())

        if args.rfs:
            log('Filtering out ZV features')
            F = F[:, support]
            FF = FF[:, support]

            tic('Running RFS')
            ifs_estimator_params = {'n_estimators': ifs_nb_trees,
                                    'n_jobs': -1}
            ifs_params = {'estimator': ExtraTreesRegressor(**ifs_estimator_params),
                          'n_features_step': 1,
                          'cv': None,
                          'scale': True,
                          'verbose': 1,
                          'significance': ifs_significance}
            ifs = IFS(**ifs_params)
            features_names = np.array(map(str, range(F.shape[1])) + ['A'])
            rfs_params = {'feature_selector': ifs,
                          'features_names': features_names,
                          'verbose': 1}
            rfs = RFS(**rfs_params)
            rfs.fit(F, A, FF, R)

            # Process support
            support_rfs = rfs.get_support()
Exemplo n.º 47
0
targetActions[actions == -1] = 0
targetActions[actions[:,0] == 1] = [1,0]

# Upsample folds
upsampleRatio = 10
foldMask = actions[:,0] == 1
foldFeatures = np.tile(features[foldMask], (upsampleRatio,1))
foldTargetActions = np.tile(targetActions[foldMask], (upsampleRatio,1))

x = np.row_stack((features[rndPlayerMask],foldFeatures))
y = np.row_stack((targetActions[rndPlayerMask],foldTargetActions))
shuffler = np.arange(len(x))
np.random.shuffle(shuffler)

#regressorOld = copy.deepcopy(regressor)
regressor = ExtraTreesRegressor(n_estimators=100, min_samples_leaf=10, min_samples_split=4, 
                                verbose=2, n_jobs=-1)
regressor.fit(x[shuffler], y[shuffler])



# %%


nGames = 5000
callPlayerIdx = 0
aiPlayerIdx = 1
seed = 76

initGameStates, initStacks = initRandomGames(nGames, seed=seed)
smallBlinds = initGameStates.boards[:,1]
equities = getEquities(initGameStates, seed=seed)
Exemplo n.º 48
0
def train_test_regression(x_train, y_train, x_test, y_test):
    """
    Train and test a number of regression models using a train/test split of a single dataset, and log/report scores.
    Each regression model used will use its default initialization parameters.

    :param x_train:
    :param y_train:
    :param x_test:
    :param y_test:
    :return: None
    """

    # a dictionary of model names to scores we'll populate and return
    model_scores = {}

    # create and train a linear regression model
    model = LinearRegression()
    model.fit(x_train, y_train)
    model_scores["LinearRegression"] = model.score(x_test, y_test)

    # create and train a ridge regression model
    model = Ridge()
    model.fit(x_train, y_train)
    model_scores["Ridge"] = model.score(x_test, y_test)

    # create and train a random forest regression model
    for trees in [3, 10, 20, 100, 250]:
        model = RandomForestRegressor(n_estimators=trees)
        model.fit(x_train, y_train)
        score = model.score(x_test, y_test)
        _logger.info("Random Forest (trees={t}) score: {result}".format(
            t=trees, result=score))

    # create and train a K-neighbors regression model
    for k in [1, 3, 5, 10, 20]:
        model = KNeighborsRegressor(n_neighbors=k)
        model.fit(x_train, y_train)
        score = model.score(x_test, y_test)
        _logger.info("K-Neighbors (k={k}) score: {result}".format(
            k=k, result=score))

    # # create and train an Ada boost regression model, trying various estimators and learning rate parameters
    # for estimators in [1, 3, 5, 10, 20]:
    #     for rate in [0.01, 0.1, 1, 5, 12]:
    #         model = AdaBoostRegressor(n_estimators=estimators, learning_rate=rate)
    #         model.fit(x_train, y_train)
    #         score = model.score(x_test, y_test)
    #         _logger.info("Ada Boost (estimators={n}, learning rate={r}) score: {result}".format(n=estimators,
    #                                                                                             r=rate,
    #                                                                                             result=score))

    # # create and train a bagging regression model
    # model = BaggingRegressor()
    # model.fit(x_train, y_train)
    # score = model.score(x_test, y_test)
    # _logger.info("Bagging score: {result}".format(result=score))

    # create and train an extra trees regression model
    for trees in [3, 6, 10, 20]:
        model = ExtraTreesRegressor(n_estimators=trees)
        model.fit(x_train, y_train)
        score = model.score(x_test, y_test)
        _logger.info("Extra Trees (trees={t}) score: {result}".format(
            t=trees, result=score))

    # create and train a support vector regression model with a linear kernel
    model = SVR(kernel='linear', C=1e3)
    model.fit(x_train, y_train.flatten())
    score = model.score(x_test, y_test)
    _logger.info("SVR (linear) score: {result}".format(result=score))

    # create and train a support vector regression model with a polynomial kernel
    model = SVR(kernel='poly', C=1e3, degree=2)
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    _logger.info("SVR (polynomial) score: {result}".format(result=score))

    # create and train a support vector regression model with an RBF kernel
    model = SVR(kernel='rbf', C=1e3, gamma=0.1)
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    _logger.info("SVR (RBF) score: {result}".format(result=score))
Exemplo n.º 49
0
    'max_depth': None,
    'min_samples_leaf': 40,
    'min_samples_split': 40
}

args_rf2 = {
    'n_estimators': 1000,
    'max_depth': None,
    'min_samples_leaf': 40,
    'min_samples_split': 40
}

model_lasso1 = linear_model.Lasso(**args_lasso1)
model_lasso2 = linear_model.Lasso(**args_lasso2)

model_rf1 = ExtraTreesRegressor(**args_rf1)
model_rf2 = ExtraTreesRegressor(**args_rf2)

model_nn1 = CL2020.NeuralNet1(101)
model_nn2 = CL2020.NeuralNet2(100)

# I collect the models into a dictionary so that they can be easily iterated over
models = {
    'lasso': [model_lasso1, model_lasso2],
    'rf': [model_rf1, model_rf2],
    'nn': [model_nn1, model_nn2]
}

# This dictionary defines which ML methods use added basis functions and which
# do not. An option is included in the fit method of DDMLCT to generate the
# basis functions.
Exemplo n.º 50
0
            num_acc += 1
    return num_acc / len(y_pred)


X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y_time,
                                                    test_size=0.3,
                                                    random_state=0)

Regressor = {
    'Random Forest Regressor':
    RandomForestRegressor(n_estimators=200),
    'Gradient Boosting Regressor':
    GradientBoostingRegressor(n_estimators=500),
    'ExtraTrees Regressor':
    ExtraTreesRegressor(n_estimators=500, min_samples_split=5),
    'Bayesian Ridge':
    BayesianRidge(),
    'Elastic Net CV':
    ElasticNetCV()
}

for name, clf in Regressor.items():
    print(name)
    clf.fit(X_train, y_train)

    print('acc', clf.score(X_test, y_test))
    #print('new_acc',get_acc(y_test,clf.predict(X_test),10))

#         print(f'R2: {r2_score(y_test, clf.predict(X_test)):.2f}')
#         print(f'MAE: {mean_absolute_error(y_test, clf.predict(X_test)):.2f}')
Exemplo n.º 51
0
from sklearn.ensemble import GradientBoostingRegressor
import sys
sys.path.append('../tools')
from tools import get_result

day_time = '_02_16_3'

train_x = pd.read_csv('../train_0/train_x' + day_time + '.csv')
train_y = pd.read_csv('../train_0/train_y' + day_time + '.csv')
test_x = pd.read_csv('../test_0/test_x' + day_time + '.csv')

#RF = RandomForestRegressor(n_estimators=1200,random_state=1,n_jobs=-1,min_samples_split=2,min_samples_leaf=2,max_depth=25)
#RF.fit(train_x,train_y)
#pre = (RF.predict(test_x)).round()

ET = ExtraTreesRegressor(n_estimators=1200,
                         random_state=1,
                         n_jobs=-1,
                         min_samples_split=2,
                         min_samples_leaf=2,
                         max_depth=25,
                         max_features=270)
ET.fit(train_x, train_y)
pre = (ET.predict(test_x)).round()

result = get_result(pre)

result.to_csv('../results/result' + day_time + '.csv',
              index=False,
              header=False)
Exemplo n.º 52
0
x_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv")

y_train0 = y_train.drop('id', axis=1)
x_train0 = x_train.drop('id', axis=1)

for n_estimators, max_iter in [(e, i) for e in [10, 100] for i in [10, 100]]:

    x_train = x_train0
    y_train = y_train0

    # 1. Missing Values
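    # note: IterativeImputer is experimental in scikit-learn and needs
    # 'from sklearn.experimental import enable_iterative_imputer' before it can be imported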
    est = ExtraTreesRegressor(n_estimators=n_estimators,
                              random_state=42,
                              max_features='sqrt',
                              n_jobs=10,
                              verbose=0)
    imputer = IterativeImputer(estimator=est,
                               max_iter=max_iter,
                               tol=0.001,
                               n_nearest_features=100,
                               initial_strategy='median',
                               imputation_order='ascending',
                               verbose=2,
                               random_state=0)
    x_train_filled = imputer.fit_transform(x_train)
    x_train = pd.DataFrame(x_train_filled)

    # 2. Outlier detection
Exemplo n.º 53
0
ss_X=StandardScaler()
ss_y=StandardScaler()

X_train=ss_X.fit_transform(X_train)
X_test=ss_X.transform(X_test)
y_train=ss_y.fit_transform(y_train.reshape(-1,1))
y_test=ss_y.transform(y_test.reshape(-1,1))

### 3. Regression prediction
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor

rfr=RandomForestRegressor()
rfr.fit(X_train,y_train)
rfr_y_predict=rfr.predict(X_test)

etr=ExtraTreesRegressor()
etr.fit(X_train,y_train)
etr_y_predict=etr.predict(X_test)

gbr=GradientBoostingRegressor()
gbr.fit(X_train,y_train)
gbr_y_predict=gbr.predict(X_test)

#### 4. Performance evaluation
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

print('R-squared of RandomForestRegressor:',rfr.score(X_test,y_test))
print('MSE of RandomForestRegressor:',mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(rfr_y_predict)))
print('MAE of RandomForestRegressor:',mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(rfr_y_predict)))
Exemplo n.º 54
0
    RandomForestRegressor(n_estimators=150,
                          max_depth=8,
                          min_samples_leaf=4,
                          n_jobs=-1,
                          random_state=882),
    Pipeline([('feature_selection', SelectFromModel(LinearSVC(penalty="l2"))),
              ('Regression',
               RandomForestRegressor(n_estimators=200,
                                     max_depth=8,
                                     min_samples_leaf=4,
                                     max_features=0.4,
                                     n_jobs=-1,
                                     random_state=0))]),
    ExtraTreesRegressor(n_estimators=10,
                        criterion='mse',
                        max_depth=8,
                        min_samples_split=4,
                        min_samples_leaf=2,
                        warm_start=False),
]


def train_xgboost():
    df = pd.read_csv('/home/kshitij/Desktop/Dataset/stage1_labels.csv')

    x = []
    y = []
    did = df['id'].tolist()
    cancer = df['cancer'].tolist()
    for i in range(len(df)):
        g = []
        if os.path.isfile('/home/kshitij/Desktop/Dataset/stage1/%s.npy' %
Exemplo n.º 55
0
#run python3
import pandas as pd
import numpy as np
conc = pd.read_csv('concrete.csv')
from sklearn.model_selection import KFold
y = np.array(conc['strength'])
X = np.array(conc.drop(['strength'], axis=1))
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from mlxtend.regressor import StackingRegressor
rf = RandomForestRegressor(n_estimators=54, max_depth=None, random_state=7)
ext = ExtraTreesRegressor(n_estimators=84, min_samples_split=2, random_state=7)
clf = StackingRegressor(regressors=[ext], meta_regressor=rf)
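# mlxtend's StackingRegressor fits the ExtraTrees base model first and then trains the
# RandomForest meta-regressor on its predictions (a sketch of the intent; see the mlxtend docs for details)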

scores = []
for train, test in kfold.split(X, y):
    clf.fit(X[train], y[train])
    score = clf.score(X[test], y[test])
    print(score)
    scores.append(score)
print("%.3f%% (+/- %.3f)" % (np.mean(scores), np.std(scores)))
Exemplo n.º 56
0
    [LogisticRegression(random_state=42)],
    [OneVsRestClassifier(LogisticRegression(random_state=42))],
    [SGDClassifier(random_state=42)],
    [SVC(kernel='linear', random_state=42)],
    [NuSVC(kernel='linear', random_state=42)],
])
def test_explain_clf_binary_iris(clf, iris_train_binary):
    X, y, feature_names = iris_train_binary
    clf.fit(X, y)
    assert_explain_prediction_single_target(clf, X, feature_names)
    assert_correct_class_explained_binary(clf, X)


@pytest.mark.parametrize(['reg'], [
    [DecisionTreeRegressor(random_state=42)],
    [ExtraTreesRegressor(random_state=42)],
    [RandomForestRegressor(random_state=42)],
])
def test_explain_tree_regressor_multitarget(reg):
    X, y = make_regression(n_samples=100,
                           n_targets=3,
                           n_features=10,
                           random_state=42)
    reg.fit(X, y)
    res = explain_prediction(reg, X[0])
    for expl in format_as_all(res, reg):
        for target in ['y0', 'y1', 'y2']:
            assert target in expl
        assert 'BIAS' in expl
        assert any('x%d' % i in expl for i in range(10))
    check_targets_scores(res)
Exemplo n.º 57
0
# use ExtraTrees regression for a collinearity check and drop variables
data_start = data_drop[col_temp + col_hum + col_weather + col_target]
train, test = train_test_split(data_start, test_size=0.25, random_state=40)
# standardize the data
train_standed = pd.DataFrame(StandardScaler().fit_transform(train),
                             columns=train.columns,
                             index=train.index)
test_standed = pd.DataFrame(StandardScaler().fit_transform(test),
                            columns=test.columns,
                            index=test.index)
x_train = train_standed[col_temp + col_hum + col_weather]
y_train = train_standed[col_target]
x_test = test_standed[col_temp + col_hum + col_weather]
y_test = test_standed[col_target]
# ExtraTrees regression model
etr = ExtraTreesRegressor()
vif_data = pd.Series([
    variance_inflation_factor(x_train.values.astype(np.float), i)
    for i in range(x_train.shape[1])
],
                     index=x_train.columns,
                     name='vif')
# collinearity check and variable removal
while (vif_data > 10).sum() > 0:
    etr.fit(x_train[vif_data.index], y_train)
    # get the feature importance coefficients
    selector_data = pd.Series(etr.feature_importances_,
                              index=vif_data.index,
                              name='etr')
    select_etr = np.abs(selector_data).sort_values(ascending=False)
    etr_vif_data = pd.concat([select_etr, vif_data], join='inner', axis=1)
Exemplo n.º 58
0
    #
    #n_components = 30
    #pca = PCA(n_components=n_components)
    #X = pca.fit_transform(X)

    print(X.shape)
    print(type(X))
    print(y.shape)
    print(type(y))
    print(y.shape)

    estimator = Ridge()
    #selector = RFECV(estimator, step=1, cv=5)

    selector = ExtraTreesRegressor(n_estimators=50)
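    # note: calling .transform()/.n_features_ directly on the fitted regressor relies on an old
    # scikit-learn API; newer versions use sklearn.feature_selection.SelectFromModel for this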
    selector = selector.fit(X, y)
    print("Optimal number of features : %d" % selector.n_features_)
    X = selector.transform(X)
    print(X.shape)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42)

    # build a classifier
    clf = RandomForestRegressor(n_estimators=20)
    # use a full grid over all parameters
    param_grid = {
        "max_depth": [3, None],
Exemplo n.º 59
0
        }
        stds = {
            key + '_std': np.std(value)
            for key, value in pipeline_results[pipeline_name].items()
        }
        means.update(stds)
        means['pipeline_name'] = pipeline_name
        results.append(means)

    return pd.DataFrame(results)


# non-default parameters are from https://arxiv.org/pdf/1708.05070.pdf
estimators = {
    'extra_trees_regressor': [
        ('extra_trees_regressor', ExtraTreesRegressor()),
    ],
    'gradient_boosting_regressor':
    [('gradient_boosting_regressor', GradientBoostingRegressor())],
    'random_forest_regressor':
    [('random_forest_regressor', RandomForestRegressor())],
    'knn_regressor': [('standard_scaler', StandardScaler()),
                      ('knn_regressor', KNeighborsRegressor())],
    'xgb_regressor': [('xgb_regressor', XGBRegressor())],
    'lightgbm_regressor': [('lightgbm_regressor', LightGBMRegressor())],
    'catboost_regressor': [('catboost_regressor', CatBoostRegressor())],
    'lasso_regressor': [('standard_scaler', StandardScaler()),
                        ('lasso_regressor', Lasso())],
    'ridge_regressor': [('ridge_regressor', Ridge())],
    'elastic_net_regressor': [('elastic_net_regressor', ElasticNet())],
    'sgd_regressor': [('sgd_regressor', SGDRegressor())],
Exemplo n.º 60
0
y_test = ss_y.transform(y_test.reshape(-1,1)) # change 2

# use a single decision tree regression model
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr.fit(X_train,y_train)
dtr_y = dtr.predict(X_test)

from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor
# use a random forest model
rfr = RandomForestRegressor()
rfr.fit(X_train,y_train)
rfr_y = rfr.predict(X_test)

# use an extra trees model
etr = ExtraTreesRegressor()
etr.fit(X_train,y_train)
etr_y = etr.predict(X_test)

# use a gradient boosting model
gbr = GradientBoostingRegressor()
gbr.fit(X_train,y_train)
gbr_y = gbr.predict(X_test)

# evaluate the single decision tree predictions
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
print('R_squared value of DecisionTreeRegressor is ',dtr.score(X_test,y_test))
print('The mean squared error of DecisionTreeRegressor is ',mean_squared_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(dtr_y)))
print('The mean absolute error of DecisionTreeRegressor is ',mean_absolute_error(ss_y.inverse_transform(y_test),ss_y.inverse_transform(dtr_y)))

# evaluate the random forest predictions