Example #1
    def bagging_regressor(self, data):
        train, validacion = data
        x_tr, y_tr = train
        x_val, y_val = validacion
        #print("El set de train tiene {} filas y {} columnas".format(x_tr.shape[0],x_tr.shape[1]))
        #print("El set de validacion tiene {} filas y {} columnas".format(x_val.shape[0],x_val.shape[1]))

        print('Start training BaggingRegressor...')
        start_time = self.timer()

        bg = BaggingRegressor(oob_score=True, verbose=1)
        bg.fit(x_tr, y_tr)
        print("The R2 is: {}".format(bg.score(x_tr, y_tr)))
        self.timer(start_time)

        print("Making prediction on validation data")
        y_val = np.expm1(y_val)
        y_val_pred = np.expm1(bg.predict(x_val))
        mae = mean_absolute_error(y_val, y_val_pred)
        print("El mean absolute error de es {}".format(mae))

        print('Saving model into a pickle')
        os.makedirs('pickles', exist_ok=True)

        with open('pickles/bg.pkl', 'wb') as f:
            pickle.dump(bg, f)

        print('Making prediction and saving into a csv')
        y_test = bg.predict(self.x_test)

        return y_test
Example #2
def model_fit_rf_bagging():

	def in_limits(x):
		if x<1: return 1
		if x>3: return 3
		return x

	print "STARTING MODEL"
	X = full_data[['count_words','count_digits','match_d_title','match_d_description','match_w_title','match_w_description','match_d_attribute','match_w_attribute']].values
	y = full_data['relevance'].values
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
	
	rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
	clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
	clf.fit(X_train, y_train)
	y_pred = clf.predict(X_test)

	in_limits = np.vectorize(in_limits, otypes=[float])
	y_pred = in_limits(y_pred)
	RMSE = mean_squared_error(y_test, y_pred)**0.5
	print "RMSE: ",RMSE

	# for the submission
	real_X_test = real_full_test[['count_words','count_digits','match_d_title','match_d_description','match_w_title','match_w_description','match_d_attribute','match_w_attribute']].values
	test_pred = clf.predict(real_X_test)
	test_pred = in_limits(test_pred)

	return test_pred
Example #3
class LinReg:
    lr = None
    sc = None
    X_train = None
    y_train = None

    def __init__(self):
        pass

    def train(self, X_train, y_train, model_num):
        # Scale features
        self.sc = StandardScaler()
        self.X_train = self.sc.fit_transform(X_train)
        self.y_train = y_train
        base_model = Ridge(fit_intercept=True)
        self.lr = BaggingRegressor(base_estimator=base_model,
                                   n_estimators=model_num).fit(
                                       self.X_train, self.y_train)

    def predict(self, x_test, retstd=True):
        x_pred = self.sc.transform(x_test)
        if retstd is False:
            return self.lr.predict(x_pred)
        error = []
        for x in range(len(x_pred)):
            preds = []
            for pred in self.lr.estimators_:
                preds.append(pred.predict([x_pred[x]])[0])
            error.append(statistics.stdev(preds))
        error = np.array(error)
        return self.lr.predict(x_pred), error
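A note on the loop above: it calls predict once per row per base estimator. Since the class keeps BaggingRegressor's default max_features=1.0 (every member sees all columns), the same per-sample spread can be computed in one vectorized pass. A minimal sketch under that assumption (ensemble_std is a hypothetical helper; ddof=1 matches statistics.stdev):

import numpy as np

def ensemble_std(bagging_model, X):
    # Stack per-estimator predictions into shape (n_estimators, n_samples).
    preds = np.stack([est.predict(X) for est in bagging_model.estimators_])
    # Sample standard deviation across estimators; ddof=1 matches statistics.stdev.
    return preds.std(axis=0, ddof=1)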
Example #4
class MFBaggingRegressorStacked(MFModel):
    """Stacked Gradient Boosting Regression predictor."""
    def __init__(self, **args):
        """Init model."""
        self.model_lf = BaggingRegressor(**copy.deepcopy(args))
        self.model_hf = BaggingRegressor(**copy.deepcopy(args))

    def fit(self, X_train_lf, y_train_lf, X_train_hf, y_train_hf):
        """Fits a model to low- and high- fidelity samples using stacking scheme for BaggingRegressor."""
        self.model_lf.fit(X_train_lf, y_train_lf)
        X_train_hf = np.hstack(
            (X_train_hf, self.model_lf.predict(X_train_hf).reshape(-1, 1)))
        self.model_hf.fit(X_train_hf, y_train_hf)

    def predict_hf(self, X):
        """Predict low-fidelity values."""
        y_pred_lf = self.model_lf.predict(X)
        X = np.hstack((X, y_pred_lf.reshape(-1, 1)))

        base_preds = [e.predict(X) for e in self.model_hf.estimators_]

        y_pred_hf = np.mean(base_preds, axis=0)

        rho = linregress(y_pred_lf, y_pred_hf)[0]  # get slope

        return rho, y_pred_hf, np.std(base_preds, axis=0)

    def predict_lf(self, X):
        """Predict low-fidelity values."""
        base_preds = [e.predict(X) for e in self.model_lf.estimators_]

        return np.mean(base_preds, axis=0), np.std(base_preds, axis=0)
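A minimal usage sketch for the multi-fidelity wrapper above, on synthetic 1-D data. The fidelity relationship and all parameter values are made up for illustration, and the imports the class itself relies on (copy, numpy, scipy.stats.linregress, sklearn's BaggingRegressor) are assumed to be in scope:

import numpy as np

rng = np.random.default_rng(0)
X_lf = rng.uniform(0.0, 1.0, (200, 1))              # plentiful cheap low-fidelity samples
y_lf = np.sin(8.0 * X_lf[:, 0])                     # biased low-fidelity response
X_hf = rng.uniform(0.0, 1.0, (20, 1))               # scarce expensive high-fidelity samples
y_hf = np.sin(8.0 * X_hf[:, 0]) + 0.2 * X_hf[:, 0]  # high-fidelity ground truth

model = MFBaggingRegressorStacked(n_estimators=50)
model.fit(X_lf, y_lf, X_hf, y_hf)

# slope between fidelities, mean prediction, and ensemble spread
rho, y_pred, y_std = model.predict_hf(rng.uniform(0.0, 1.0, (50, 1)))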
Example #5
def bagging_model(df_dict,filename):
    result_bagging=pd.DataFrame()
    for i in range(36):
        df=df_dict['Model'+str(i+1)]
        mae_gs=pd.DataFrame()
        for feature in [int(x) for x in np.linspace(start = 1, stop = 10, num = 5)]:
            for sample in [int(x) for x in np.linspace(start = 1, stop = 10, num = 5)]:
                for estimator in [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]:
                    bagging=BaggingRegressor(max_features=feature,
                                             max_samples=sample,
                                             n_estimators=estimator)
                    bagging.fit(split_data(df)[0],split_data(df)[1])
                    bagging_ypred=bagging.predict(split_data(df)[2])
                    bagging_mae=MAE(split_data(df)[3],bagging_ypred)
                    # DataFrame.append was removed in pandas 2.0; use pd.concat instead
                    mae_gs = pd.concat([mae_gs,
                                        pd.DataFrame({'max_features': [feature],
                                                      'max_samples': [sample],
                                                      'n_estimators': [estimator],
                                                      'valid_mae': [bagging_mae]})],
                                       ignore_index=True)
        parameter_best=mae_gs.loc[mae_gs['valid_mae'].idxmin(),:]
        bagging_best=BaggingRegressor(max_features=int(parameter_best['max_features']),
                                      max_samples=int(parameter_best['max_samples']),
                                      n_estimators=int(parameter_best['n_estimators']))
        bagging_best.fit(split_data(df)[0],split_data(df)[1])
        bagging_ypred=bagging_best.predict(split_data(df)[4])
        mae_bagging=pd.Series(MAE(split_data(df)[5],bagging_ypred))
        df = pd.concat([parameter_best, mae_bagging])
        print(i)
        result_bagging = pd.concat([result_bagging, df.to_frame().T], ignore_index=True)
        #pickle.dump(bagging_best,open(path+ '/model_bagging/' + filename + '_bagging_%s.sav'%('df'+str(i)),'wb')) 
    result_bagging.columns=['mae','max_features','max_samples','n_estimators','valid_mae']
    return result_bagging
Example #6
def Bagging_reg(X,y,X_test):
    BR = BaggingRegressor(SVR(kernel='linear',epsilon=0.01),n_estimators=5,warm_start=True)
    #BR = BaggingRegressor(base_estimator=LinearRegression(fit_intercept=False),n_estimators=5)
    BR.fit(X, y)
    y_BR_pred = BR.predict(X_test)
    pred_BR = pd.DataFrame(y_BR_pred)
    y_BR_test = BR.predict(X)
    test_BR = pd.DataFrame(y_BR_test)
    return pred_BR,test_BR
Example #7
class ModelSelectionRegressor(BaseEstimator, RegressorMixin):
    def __init__(self,
                 objective,
                 optimizer,
                 bagging_params=None,  # avoid a mutable default argument
                 random_state=None):
        self.objective = objective
        self.optimizer = optimizer
        self.bagging_params = bagging_params
        self.is_frozen = False
        self.random_state = random_state

    def freeze(self):
        self.is_frozen = True

    def fit(self, X, y):
        self.objective.X = X
        self.objective.y = y

        if not self.is_frozen:
            self.best_parameters = self.optimizer.execute(self.objective)

        #logging.info('best_parameters: %s' % str(self.best_parameters))

        if not self.bagging_params:
            self.model = self.objective.instantiate(self.best_parameters)
        else:
            self.model = BaggingRegressor(
                base_estimator=self.objective.instantiate(
                    self.best_parameters),
                **(self.bagging_params),
                random_state=self.random_state)

        self.model.fit(X[:, self.best_parameters['features']], y)

        return self

    def predict(self, X):
        return self.model.predict(X[:, self.best_parameters['features']])

    def predict_std(self, X):
        if isinstance(self.model, BaggingRegressor):
            ens_preds = []
            for e in self.model.estimators_:
                ens_preds.append(
                    e.predict(X[:, self.best_parameters['features']]))
            ens_preds = np.stack(ens_preds, axis=1)
            return self.model.predict(
                X[:, self.best_parameters['features']]), np.std(ens_preds,
                                                                axis=1)
        else:
            return self.model.predict(
                X[:,
                  self.best_parameters['features']]), np.repeat(0.0, len(X))

    def score(self):
        return self.best_parameters['score']
Example #8
def runEnsembleBaggingLR(X_train, y_train, X_test, y_test):
    # Bagging ensemble of KNN base regressors
    bag_knn = BaggingRegressor(
        base_estimator=KNeighborsRegressor(),
        random_state=1,
    ).fit(X_train, y_train)

    y_pred = bag_knn.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print('\nEnsemble Bagging using KNN Regression:')
    print('MSE = ', mse)
    print('R^2: ', r2)

    return bag_knn, mse, r2
Example #9
def estimate_forest_variance(X, y, runs=100, **kwargs):

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    h_bar = BaggingRegressor(base_estimator=RandomForestRegressor(
        n_estimators=100, **kwargs),
                             n_estimators=runs,
                             n_jobs=1,
                             max_samples=0.8)
    h_bar.fit(X_train, y_train)

    h_bar_preds = h_bar.predict(X_test)

    estimators_preds = []
    for tree in h_bar.estimators_:
        estimators_preds.append(tree.predict(X_test))

    estimators_preds = np.array(estimators_preds)
    var = np.mean((estimators_preds - h_bar_preds)**2)

    return var
Example #10
def avmPredict(params):
    town = getPlace(params['lat'], params['long'])[0]

    x, y, z = getXYZ(params['lat'], params['long'])

    r = 1.0

    data = []
    target = []
    header = []

    with open('../../../data/working22.csv') as f:

        f = csv.reader(f)
        header = next(f)

        for row in f:
            # wrap map() in list() so the features can be indexed below (map is lazy in Python 3)
            t = (list(map(float, row[:3] + row[4:])), float(row[3]))

            if weightF([x, y, z], t[0][0:3], r):
                data.append(t[0])
                target.append(t[1])

    ensemble = BaggingRegressor()
    ensemble.fit(data, target)

    test = createTest(params)
    return ensemble.predict(test)
Example #11
    def Breg(self, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, 
             bootstrap_features=False, warm_start=False, n_jobs=None, random_state=None, verbose=0):

        model = BaggingRegressor(n_estimators=n_estimators, max_samples=max_samples, 
                                 max_features=max_features,bootstrap=bootstrap,
                                 bootstrap_features=bootstrap_features, warm_start=warm_start, n_jobs=n_jobs,
                                 random_state=random_state, verbose=verbose)
        kf = KFold(n_splits=self.cv)
        test_result = {'RMSE': [], 'R2': [], 'MAE': []}
        for train_index, test_index in kf.split(self.data.copy()):
            train_X = self.data.copy().drop(
                [self.name_target], axis=1).iloc[train_index, :]
            train_y = self.data.copy().loc[train_index, [self.name_target]]
            test_X = self.data.copy().drop(
                [self.name_target], axis=1).iloc[test_index, :]
            test_y = self.data.copy().loc[test_index, [self.name_target]]
            model.fit(train_X, train_y)
            # evaluate on the held-out fold
            y_pred = model.predict(test_X)
            test_result['RMSE'].append(
                np.sqrt(metrics.mean_squared_error(test_y, y_pred)))
            test_result['R2'].append(metrics.r2_score(test_y, y_pred))
            test_result['MAE'].append(
                metrics.mean_absolute_error(test_y, y_pred))

        for key, values in test_result.items():
            test_result[key] = np.array(values).mean()

        return model, test_result, train_X,test_X,test_y
Example #12
def fitPredict():

    df_train = pd.read_pickle('../../resources/data/dframes/train_df.pickle')
    print('Loaded train df')
    df_test = pd.read_pickle('../../resources/data/dframes/test_df.pickle')
    print('Loaded test df')

    id_test = df_test['id']

    y_train = df_train['relevance'].values
    X_train = df_train.drop(['id', 'relevance'], axis=1).values
    X_test = df_test.drop(['id', 'relevance'], axis=1).values

    rf = RandomForestRegressor(n_estimators=15, max_depth=None, random_state=0)
    clf = BaggingRegressor(rf,
                           n_estimators=45,
                           max_samples=0.1,
                           random_state=25)
    print('Fitting')
    clf.fit(X_train, y_train)
    print('Predicting')
    y_pred = clf.predict(X_test)

    pd.DataFrame({
        "id": id_test,
        "relevance": y_pred
    }).to_csv('../../resources/results/submission.csv', index=False)
Example #13
class BaggingLearner:
    # see more here: http://scikit-learn.org/stable/modules/ensemble.html#bagging
    def __init__(self, estimator):
        self.estimator = estimator
        self.model = BaggingRegressor(self.estimator.model)

    def fit(self, X, y):
        if isinstance(y, pd.DataFrame):
            y = y.iloc[:, 0]
        self.model.fit(X, y)

    def predict(self, X):
        prediction = (pd.DataFrame(index=X.index, data={self.GetName():self.model.predict(X)}))
        return prediction

    def GetName(self):
        return "BaggingLearner (" + self.estimator.GetName() + ")"

    def Save(self, filename):
        self.estimator.Save(filename + '-estimator.pkl')
        joblib.dump(self.model, filename + '-model.pkl')
        joblib.dump(self, filename + '.pkl')

    def Load(self, filename):
        self.estimator.Load(filename + '-estimator.pkl')
        self.model = joblib.load(filename + '-model.pkl')
Example #14
class BAGGING():
    """docstring for ClassName"""
    def __init__(self, BaggingRegressor, N):
        self.cores_number = int(np.ceil(multiprocessing.cpu_count()/N))
        self.model = BaggingRegressor(
                 base_estimator=None, 
                 bootstrap=True,
                 bootstrap_features=False, 
                 max_features=1.0, 
                 max_samples=1.0,
                 n_estimators=10, 
                 n_jobs= self.cores_number, 
                 oob_score=False, 
                 random_state=None,
                 verbose=0, 
                 warm_start=False)


        print("Bagging Cores: ", self.cores_number)

    def fit(self, X_train, y_train, X_test, y_test, error_type = "MAE"):

        error_dict = {"MSE":"rmse", "R2":{"l1","l2"}, "MAE":"mae","LOGLOSS": "multi_logloss" }
        error_metric = error_dict[error_type]
        self.model.fit(X_train, y_train )

    def predict(self, X_test):
        prediction = self.model.predict(X_test)
        return prediction
Example #15
class _BaggingRegressorImpl:
    def __init__(
        self,
        base_estimator=None,
        n_estimators=10,
        *,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        oob_score=False,
        warm_start=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
    ):
        estimator_impl = base_estimator

        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "max_samples": max_samples,
            "max_features": max_features,
            "bootstrap": bootstrap,
            "bootstrap_features": bootstrap_features,
            "oob_score": oob_score,
            "warm_start": warm_start,
            "n_jobs": n_jobs,
            "random_state": random_state,
            "verbose": verbose,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y, sample_weight=None):
        if isinstance(X, pd.DataFrame):
            feature_transformer = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=X.columns),
                inverse_func=None,
                check_inverse=False,
            )
            self._hyperparams["base_estimator"] = (
                feature_transformer >> self._hyperparams["base_estimator"]
            )
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.fit(X, y, sample_weight)

        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
Example #16
def estimate_tree_variance(X, y, runs=100, **kwargs):

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    imputer = SimpleImputer(strategy='mean')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    """
    The BaggingRegressor here estimates the average of multiple decision trees
    each trained on a random sub-sample of the training set (~80% of it) by
    averaging their predictions on each testing sample as the final prediction
    """
    h_bar = BaggingRegressor(base_estimator=DecisionTreeRegressor(
        random_state=42, **kwargs),
                             n_estimators=runs,
                             n_jobs=6,
                             max_samples=0.8,
                             random_state=42)
    h_bar.fit(X_train, y_train)

    h_bar_preds = h_bar.predict(X_test)
    """
    Here we access each individual tree and take its prediction on the test samples
    """
    estimators_preds = []
    for tree in h_bar.estimators_:
        estimators_preds.append(tree.predict(X_test))
    """
    Here we simply implement the variance definition
    Var(h) = E[E_x[(h(x) - h_bar(x)) ^ 2]]
    """
    estimators_preds = np.array(estimators_preds)
    var = np.mean((estimators_preds - h_bar_preds)**2)

    return var
Example #17
def bagging_logistic_regression():
    data_train = pd.read_csv(TRAIN_FILE_PATH)
    # fit a logistic regression model
    raw_data_train, processer = process_train_data(data_train)
    data_train = raw_data_train.to_numpy()
    y = data_train[:, 0]  # the Survived column
    X = data_train[:, 1:]  # the feature columns

    clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    bagging_clf = BaggingRegressor(clf,
                                   n_estimators=20,
                                   max_samples=0.8,
                                   max_features=1,
                                   bootstrap=True,
                                   bootstrap_features=False,
                                   n_jobs=-1)
    # the ensemble must be fitted before it can predict
    bagging_clf.fit(X, y)

    # apply the same preprocessing to the test data
    raw_data_test = pd.read_csv(TEST_FILE_PATH)
    data_test = process_test_data(raw_data_test, processer)

    # predict
    predictions = bagging_clf.predict(data_test.to_numpy())
    predict_result = pd.DataFrame(
        dict(PassengerId=raw_data_test.PassengerId.to_numpy(),
             Survived=predictions.astype(np.int32)))
    # predict_result.to_csv(RESULT_OUTPUT_PATH, index=False)
    print(predict_result)
Example #18
class BaggingRegressorPrim(primitive):
    def __init__(self, random_state=0):
        super(BaggingRegressorPrim, self).__init__(name='BaggingRegressor')
        self.hyperparams = []
        self.type = 'Regressor'
        self.description = "A Bagging regressor. A Bagging regressor is an ensemble meta-estimator that fits base regressors each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it. This algorithm encompasses several works from the literature. When random subsets of the dataset are drawn as random subsets of the samples, then this algorithm is known as Pasting [1]. If samples are drawn with replacement, then the method is known as Bagging [2]. When random subsets of the dataset are drawn as random subsets of the features, then the method is known as Random Subspaces [3]. Finally, when base estimators are built on subsets of both samples and features, then the method is known as Random Patches [4]."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = BaggingRegressor(random_state=random_state, n_jobs=5)
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.model.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        output['predictions'] = self.model.predict(output['X'])
        output['X'] = pd.DataFrame(output['predictions'],
                                   columns=[self.name + "Pred"])
        final_output = {0: output}
        return final_output
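The description quoted above distinguishes four ensemble flavours by how subsets are drawn. A sketch of how each one maps onto BaggingRegressor parameters (the settings are illustrative, not tuned):

from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

base = DecisionTreeRegressor()
# Pasting: random subsets of samples, drawn without replacement.
pasting = BaggingRegressor(base, max_samples=0.5, bootstrap=False)
# Bagging: random subsets of samples, drawn with replacement.
bagging = BaggingRegressor(base, max_samples=0.5, bootstrap=True)
# Random Subspaces: random subsets of features, all samples.
subspaces = BaggingRegressor(base, max_features=0.5, bootstrap=False)
# Random Patches: random subsets of both samples and features.
patches = BaggingRegressor(base, max_samples=0.5, max_features=0.5,
                           bootstrap=True, bootstrap_features=True)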
Example #20
def KNeighborsBagging(neigh):
    kn1 = neighbors.KNeighborsRegressor(n_neighbors=neigh, weights='uniform')
    kn2 = neighbors.KNeighborsRegressor(n_neighbors=neigh, weights='distance')
    bgg = BaggingRegressor(kn1,
                           n_estimators=10,
                           max_samples=0.7,
                           max_features=0.9,
                           verbose=0)  #, max_features=0.5

    bgg.fit(X_train, y_train)

    print(bgg.score(X_train, y_train))

    y_pred = bgg.predict(X_test)

    # Generate ROC curve values: fpr, tpr, thresholds
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)

    # Plot ROC curve
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()
Example #21
class Predictor(BaseEstimator):
    '''Predictor: modify this class to create a predictor of
    your choice. This could be your own algorithm, or one of the scikit-learn
    models, for which you choose the hyper-parameters.'''
    def __init__(self):
        '''This method initializes the predictor.'''
        self.mod = BaggingRegressor(base_estimator=RandomForestRegressor(
            n_estimators=50))
        print("PREDICTOR=" + self.mod.__str__())

    def fit(self, X, y):
        ''' This is the training method: parameters are adjusted with training data.'''
        self.mod = self.mod.fit(X, y)
        return self

    def predict(self, X):
        ''' This is called to make predictions on test data. Predicted classes are output.'''
        return self.mod.predict(X)

    def save(self, path="./"):
        pickle.dump(self, open(path + '_model.pickle', "w"))

    def load(self, path="./"):
        self = pickle.load(open(path + '_model.pickle'))
        return self
Example #22
def train_model(train, test, labels):
    rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=10)
    #rf = RandomForestRegressor(n_estimators=45, max_depth=9, random_state=10)
    clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.2, random_state=25)
    clf.fit(train, labels)
    #clf = SVR(C=1.0, epsilon=0.2)
    #clf.fit(train, labels)
    #clf = GaussianNB()
    #clf.fit(train, labels)
    print "Good!"
    predictions = clf.predict(test)
    print predictions.shape
    predictions = pd.DataFrame(predictions, columns = ['relevance'])
    print "Good again!"
    print "Predictions head -------"
    print predictions.head()
    print predictions.shape
    print "TEST head -------"
    print test.head()
    print test.shape
    #test['id'].to_csv("TEST_TEST.csv",index=False)
    #predictions.to_csv("PREDICTIONS.csv",index=False)
    #test = test.reset_index()
    #predictions = predictions.reset_index()
    #test = test.groupby(level=0).first()
    #predictions = predictions.groupby(level=0).first()
    predictions = pd.concat([test['id'],predictions], axis=1, verify_integrity=False)
    print(predictions)
    return predictions
Example #23
def run_gpr(down_station, input_list, include_time, sample_size, network_type,
            include_diff, n_estimators, b):
    start_time_run = time.time()

    result_dir = util.get_result_dir(down_station, network_type, n_estimators,
                                     b, sample_size)
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)

    (y_train, x_train, y_cv, x_cv, _, _, _, _, train_y_max, train_y_min, _, _,
     _, _, _) = data.construct(down_station, input_list, include_time,
                               sample_size, network_type)

    # n_estimators = 50
    gpr = BaggingRegressor(GaussianProcessRegressor(copy_X_train=False),
                           max_samples=1.0 / n_estimators,
                           n_estimators=n_estimators,
                           n_jobs=1)
    # svr = SVR(C=_C, epsilon=_epsilon, verbose=True, cache_size=1024) # No bagging
    # gpr = GaussianProcessRegressor(copy_X_train=False)

    gpr.fit(x_train, y_train)
    util.save_sklearn_model(gpr, result_dir)

    y_cv_pred = gpr.predict(x_cv)

    predict.plot_prediction(y_cv_pred, result_dir, y_cv, train_y_max,
                            train_y_min)

    elapsed_time_run = time.time() - start_time_run
    print(
        time.strftime("Fitting time : %H:%M:%S",
                      time.gmtime(elapsed_time_run)))
Example #24
def procedureA(goldenFlag=False):
    # Trains and generates a prediction file
    # Uses hard heuristic for buy_or_not

    popFlag = True
    X, Y = getDataXY(currYearFlag=False, popFlag=popFlag)
    X, Y = shuffle(X, Y, random_state=0)

    if popFlag:
        encoder = oneHot(X[:, 2:])
        Xt = encoder.transform(X[:, 2:])
        Xt = np.hstack((X[:, :2], Xt))
    else:
        encoder = oneHot(X)
        Xt = encoder.transform(X)

    buySet = set()
    for i in range(X.shape[0]):
        tmpTup = (X[i][0], X[i][2])
        buySet.add(tmpTup)
    # Y_buy = [1] * Xt.shape[0]

    min_max_scaler = preprocessing.MinMaxScaler()

    # Xt = min_max_scaler.fit_transform(Xt)

    if goldenFlag:
        print(Xt.shape)
        Xt = getGoldenX(Xt, 2, 2 + encoder.feature_indices_[1],
                        2 + encoder.feature_indices_[0],
                        2 + min(9, encoder.feature_indices_[1]))

    split = 0.9
    X_train, X_test = Xt[:(int(Xt.shape[0] * split)), :], Xt[int(Xt.shape[0] *
                                                                 split):, :]
    Y_train, Y_test = Y[:(int(Y.shape[0] * split)), :], Y[int(Y.shape[0] *
                                                              split):, :]
    Y_train = Y_train.ravel()
    Y_test = Y_test.ravel()

    print(X_train.shape)
    print(X_test.shape)

    # clf = Ridge(alpha = 100)
    # clf = SVR(C = 10.0, kernel = 'poly', degree = 2)
    # clf = LinearSVR(C = 1.0)
    clf = BaggingRegressor(DecisionTreeRegressor(),
                           n_estimators=125,
                           n_jobs=4,
                           random_state=0)
    # clf = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators = 100)
    # clf = DecisionTreeRegressor()
    # clf = RandomForestRegressor(random_state = 0, n_estimators = 200, n_jobs = 4)
    clf.fit(X_train, Y_train.ravel())

    Y_pred = clf.predict(X_test)
    evaluatePred(Y_pred, Y_test)

    return clf, encoder, min_max_scaler
Example #25
def random_forest(X,Y,Xt):
    print('learn')    
    rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
    clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
    clf.fit(X, Y)
    print('predict')
    Yp_clamped = clf.predict(Xt)
    return Yp_clamped
Example #26
def bagging_regressor_train(X_train, y_train, X_valid, y_valid):
    model = BaggingRegressor(base_estimator=SVR(),
                             n_estimators=10,
                             random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    valid_score = np.sqrt(mean_squared_error(y_valid, y_pred))
    return model, valid_score
Example #27
def run():
    print("Bagged Decision Tree Regression started...")

    #Preparing Training data
    dir_path = ""
    train_file_path = dir_path + "train.csv"
    train_file = read_csv(train_file_path, skiprows=1, header=None)

    train_file = train_file.drop(train_file.columns[0], axis=1)
    train_file = train_file.values

    #Combining previous 5 time step data into one row
    train_X_temp = train_file[5:50000, :-1]
    train_Y = train_file[6:50001, -1]
    train_X = np.zeros((train_X_temp.shape[0], 8 * 5))
    for i in range(train_X_temp.shape[0]):
        for j in range(5):
            for k in range(8):
                train_X[i][j * 8 + k] = train_X_temp[i - j][k]

    #Preparing testing data
    test_file_name = dir_path + "test2.csv"
    test_file = read_csv(test_file_name, skiprows=1, header=None)
    test_file = test_file.values
    test_X = np.array(test_file[:, :-1])
    test_y = test_file[:, -1]

    # print "\nSimple Decison Tree:"
    # dec_tree = DecisionTreeRegressor(max_depth = 5)
    # dec_tree.fit(train_X, train_Y)
    # prediction = dec_tree.predict(test_X)
    # print "Predictions: \n",prediction
    # print "Score: ",dec_tree.score(test_X,test_y)

    # print "\nADABoost Decision Tree:"
    # ada_boost = AdaBoostRegressor(DecisionTreeRegressor(max_depth = 5),n_estimators = 10)
    # ada_boost.fit(train_X, train_Y)
    # prediction = ada_boost.predict(test_X)
    # print "Predictions: \n",prediction
    # print "Score: ",ada_boost.score(test_X,test_y)

    #Model training and prediction
    print "\nBagged Decision Tree:"
    start = time.time()
    bag_reg = BaggingRegressor(DecisionTreeRegressor(),
                               n_jobs=2,
                               random_state=0).fit(train_X, train_Y)

    #bag_reg.set_params(n_jobs=1)
    #Calculating and printing Results
    prediction = bag_reg.predict(test_X)
    mse = np.mean((prediction - test_y)**2)
    print "MSE: ", mse
    # print "Predictions: \n",prediction
    print "Score: ", bag_reg.score(test_X, test_y)

    print "Time: ", (time.time() - start)
    print "Decision Tree Regressor done...\n"
Example #28
def test_sparse_regression():
    # Check regression for various parameter settings on sparse input.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)

    class CustomSVR(SVR):
        """SVC variant that records the nature of the training set"""

        def fit(self, X, y):
            super().fit(X, y)
            self.data_type_ = type(X)
            return self

    parameter_sets = [
        {"max_samples": 0.5,
         "max_features": 2,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_samples": 1.0,
         "max_features": 4,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_features": 2,
         "bootstrap": False,
         "bootstrap_features": True},
        {"max_samples": 0.5,
         "bootstrap": True,
         "bootstrap_features": False},
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:

            # Trained on sparse format
            sparse_classifier = BaggingRegressor(
                base_estimator=CustomSVR(),
                random_state=1,
                **params
            ).fit(X_train_sparse, y_train)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_results = BaggingRegressor(
                base_estimator=CustomSVR(),
                random_state=1,
                **params
            ).fit(X_train, y_train).predict(X_test)

            sparse_type = type(X_train_sparse)
            types = [i.data_type_ for i in sparse_classifier.estimators_]

            assert_array_almost_equal(sparse_results, dense_results)
            assert all([t == sparse_type for t in types])
Example #29
def test_sparse_regression():
    # Check regression for various parameter settings on sparse input.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(diabetes.data[:50],
                                                        diabetes.target[:50],
                                                        random_state=rng)

    class CustomSVR(SVR):
        """SVC variant that records the nature of the training set"""

        def fit(self, X, y):
            super().fit(X, y)
            self.data_type_ = type(X)
            return self

    parameter_sets = [
        {"max_samples": 0.5,
         "max_features": 2,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_samples": 1.0,
         "max_features": 4,
         "bootstrap": True,
         "bootstrap_features": True},
        {"max_features": 2,
         "bootstrap": False,
         "bootstrap_features": True},
        {"max_samples": 0.5,
         "bootstrap": True,
         "bootstrap_features": False},
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:

            # Trained on sparse format
            sparse_classifier = BaggingRegressor(
                base_estimator=CustomSVR(),
                random_state=1,
                **params
            ).fit(X_train_sparse, y_train)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_results = BaggingRegressor(
                base_estimator=CustomSVR(),
                random_state=1,
                **params
            ).fit(X_train, y_train).predict(X_test)

            sparse_type = type(X_train_sparse)
            types = [i.data_type_ for i in sparse_classifier.estimators_]

            assert_array_almost_equal(sparse_results, dense_results)
            assert all([t == sparse_type for t in types])
Example #30
def test_parallel_regression():
    # Check parallel regression.
    rng = check_random_state(0)

    X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=rng)

    ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit(X_train, y_train)

    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=1, random_state=0).fit(X_train, y_train)

    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3)
Example #31
def train_and_validate(X_inp, Y_inp, seed):

    # To compensate inbalance, we need to define different test size on each
    # error visibility level
    #
    testsizes = [
        0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.5, 0.5, 0.5, 0.5,
        0.5, 0.5, 0.5, 0.5, 0.5, 0.5
    ]

    X_train = []
    Y_train = []
    X_validation = []
    Y_validation = []

    for i in range(0, 20):
        X_this = [
            X_inp[j, :] for j in range(0, len(Y_inp))
            if Y_inp[j] > i / 20 - 0.001 and Y_inp[j] < i / 20 + 0.001
        ]
        Y_this = [
            Y_inp[j] for j in range(0, len(Y_inp))
            if Y_inp[j] > i / 20 - 0.001 and Y_inp[j] < i / 20 + 0.001
        ]
        X_train_this, X_validation_this, Y_train_this, Y_validation_this = model_selection.train_test_split(
            X_this, Y_this, test_size=testsizes[i], random_state=seed)

        Y_train.extend(Y_train_this)
        X_train.extend(X_train_this)
        Y_validation.extend(Y_validation_this)
        X_validation.extend(X_validation_this)

    # =====================================================================
    # Regression training here. You can just use whatever regression model
    # you like.
    #
    #

    # First, scale the features
    scaler = MinMaxScaler(feature_range=(0.0001, 1))
    X_train = scaler.fit_transform(X_train)

    # Then, define the model and the parameters
    #model = GradientBoostingRegressor()
    #model = MLPRegressor()
    #model = SVR()
    #model = RandomForestRegressor()
    model = BaggingRegressor(
        base_estimator=ensemble.GradientBoostingRegressor())
    model.fit(X_train, Y_train)

    # =====================================================================
    # Validation part starts here. Nothing very special.
    #
    X_validation = scaler.transform(X_validation)
    Y_pred = model.predict(X_validation)
    return Y_pred, Y_validation
Example #32
def HHT_MARS_TEST(series, regressors=4, delay=1, N=2000):
    series = series[len(series) - 2000:]
    series = np.array(series)
    series = series.reshape(-1, 1)

    D = regressors  # number of regressors
    T = delay  # delay
    N = N
    series = series[500:]
    data = np.zeros((N - 500 - T - (D - 1) * T, D))
    lbls = np.zeros((N - 500 - T - (D - 1) * T, ))

    for t in range((D - 1) * T, N - 500 - T):
        data[t - (D - 1) * T, :] = [
            series[t - 3 * T], series[t - 2 * T], series[t - T], series[t]
        ]
        lbls[t - (D - 1) * T] = series[t + T]
    trnData = data[:lbls.size - round(lbls.size * 0.3), :]
    trnLbls = lbls[:lbls.size - round(lbls.size * 0.3)]
    chkData = data[lbls.size - round(lbls.size * 0.3):, :]
    chkLbls = lbls[lbls.size - round(lbls.size * 0.3):]

    aa = np.array(chkLbls[-4:]).reshape(1, -1)
    chkData = np.append(chkData, aa, axis=0)

    mars = Earth()
    mars.fit(trnData, trnLbls)
    boosted_mars = AdaBoostRegressor(base_estimator=mars,
                                     n_estimators=25,
                                     learning_rate=0.1,
                                     loss='exponential')
    bag = BaggingRegressor(base_estimator=mars, n_estimators=25)
    bag.fit(trnData, trnLbls)
    boosted_mars.fit(trnData, trnLbls)
    pred2 = bag.predict(chkData)
    oos_preds = boosted_mars.predict(chkData)

    stack_predict = np.vstack([oos_preds, pred2]).T

    params_xgd = {
        'max_depth': 7,
        'objective': 'reg:squarederror',  # 'reg:linear' is the deprecated alias
        'learning_rate': 0.05,
        'n_estimators': 10000
    }
    clf = xgb.XGBRegressor(**params_xgd)
    clf.fit(stack_predict[:-1, :],
            chkLbls,
            eval_set=[(stack_predict[:-1, :], chkLbls)],
            eval_metric='rmse',
            early_stopping_rounds=20,
            verbose=False)

    xgb_pred = clf.predict(stack_predict)

    return xgb_pred
Example #33
def Bagging(x_train, y_train, x_test, y_test):
    estimator = BaggingRegressor(n_estimators=1000, random_state=0, n_jobs=-1)
    estimator.fit(x_train, y_train)
    train_r2 = estimator.score(x_train, y_train)
    y_pred = estimator.predict(x_test)
    mse_score = mse(y_test, y_pred)
    print("mse_score: " + str(mse_score))
    r2_score = r2(y_test, y_pred)
    print("r2_score: " + str(r2_score))
    print("train_r2_score: " + str(train_r2))
Example #34
def main():
    arg = args()
    train = rw.read(arg.train)
    test = rw.read(arg.test)
    X = train.loc[:, train.columns != 'Market Share_total']
    y = train['Market Share_total']
    bagging_regressor = BaggingRegressor()
    bagging_regressor.fit(X, y)
    predictions = bagging_regressor.predict(test)
    rw.write(predictions, "test_results.csv")
Example #35
def Bagging(Xtrain, Ytrain, Xtest, Ytest):
    """
	Apply the extra trees regressor
	"""
    from sklearn.ensemble import BaggingRegressor
    print('\nBagging regressor:')

    clf = BaggingRegressor(n_estimators=100, n_jobs=-1).fit(Xtrain, Ytrain)
    print('Training R^2: {0}'.format(clf.score(Xtrain, Ytrain)))

    #find the training error
    prediction = clf.predict(Xtrain)
    Etrain = error(prediction, Ytrain)
    print('Training error: {0}'.format(Etrain))

    #find the test error
    prediction = clf.predict(Xtest)
    Etest = error(prediction, Ytest)
    print('Test error: {0}'.format(Etest))
Example #37
def get_bagging_prediction(X_train, y_train, X_test, X_valid=None, GS=False):
    if not GS:
        rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
        clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        if X_valid is None:
            return y_pred
        else:
            return y_pred, clf.predict(X_valid)
    else:
        rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
        clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
        # nested params must be addressed through the bagger's base_estimator prefix
        param_grid = {'base_estimator__max_features': [10], 'base_estimator__max_depth': [20]}
        model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=2, verbose=VERBOSE, scoring=RMSE)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        if X_valid is None:
            return y_pred
        else:
            return y_pred, model.predict(X_valid)
Example #38
def runTests():

    # Generate the training samples, extract training features and target
    trainSamples = GenSamples(numSamples)
    trainFeatures = extractFeatures(trainSamples)
    trainPred = extractPred(trainSamples)

    # Generate the test samples, extract test features and target
    testSamples = GenSamples(numTestSamples)
    testFeatures = extractFeatures(testSamples)
    testPred = extractPred(testSamples)

    R2List = OrderedDict()
    R2List['TrainROI'] = []
    R2List['TestROI'] = []
    print('Running Tests: ')
    for i in range(numTests):
        # Bootstrap is True by default i.e., sampling with replacement
        # Bootstrap features is False by default i.e., all features used
        classifier = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                      n_estimators=numTrees,
                                      max_samples=int(0.5 * numSamples),
                                      max_features=1)

        classifier.fit(trainFeatures, trainPred)
        predictROI = {}
        predictROI['Training'] = classifier.predict(trainFeatures)
        predictROI['Test'] = classifier.predict(testFeatures)

        R2 = {}
        R2['Train'] = r2_score(trainPred, predictROI['Training'])
        R2['Test'] = r2_score(testPred, predictROI['Test'])

        R2List['TrainROI'].append(R2['Train'])
        R2List['TestROI'].append(R2['Test'])

    print('Best Train ROI: ', max(R2List['TrainROI']))
    print('Best Test ROI: ', max(R2List['TestROI']))
Example #39
def test_single_estimator():
    # Check singleton ensembles.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    clf1 = BaggingRegressor(base_estimator=KNeighborsRegressor(),
                            n_estimators=1,
                            bootstrap=False,
                            bootstrap_features=False,
                            random_state=rng).fit(X_train, y_train)

    clf2 = KNeighborsRegressor().fit(X_train, y_train)

    assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test))
Example #40
def train_model(training, testing, window=5, n=5):
	X_train, y_train = prepare_data(training)
	X_test, y_test = prepare_data(testing)
	rf = RandomForestRegressor()
	rf.fit(X_train, y_train)
	predrf = rf.predict(X_test)
	print "mse for random forest regressor: ", mean_squared_error(predrf, y_test)

	gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.025)
	gb.fit(X_train, y_train)
	predgb = gb.predict(X_test)
	print "mse for gradient boosting regressor: ", mean_squared_error(predgb, y_test)
	## plot feature importance using GBR results
	fx_imp = pd.Series(gb.feature_importances_, index=['bb', 'momentum', 'sma', 'volatility'])
	fx_imp /= fx_imp.max()  # normalize
	fx_imp = fx_imp.sort_values()  # Series.sort() was removed; sort_values returns a sorted copy
	ax = fx_imp.plot(kind='barh')
	fig = ax.get_figure()
	fig.savefig("output/feature_importance.png")

	adb = AdaBoostRegressor(DecisionTreeRegressor())
	adb.fit(X_train, y_train)
	predadb = adb.predict(X_test)
	print "mse for adaboosting decision tree regressor: ", mean_squared_error(predadb, y_test)

	scale = StandardScaler()
	scale.fit(X_train)
	X_trainscale = scale.transform(X_train)
	X_testscale = scale.transform(X_test)

	knn = BaggingRegressor(KNeighborsRegressor(n_neighbors=10), max_samples=0.5, max_features=0.5)
	knn.fit(X_trainscale, y_train)
	predknn = knn.predict(X_testscale)
	print "mse for bagging knn regressor: ", mean_squared_error(predknn, y_test)

	pred_test = 0.1*predrf+0.2*predgb+0.1*predadb+0.6*predknn
	print "mse for ensemble all the regressors: ", mean_squared_error(pred_test, y_test)
	result = testing.copy()
	result.ix[5:-5, 'trend'] = pred_test
	result.ix[10:, 'pred'] = pred_test * result.ix[5:-5, 'IBM'].values
	result.ix[:-5, 'pred_date'] = result.index[5:]

	return result
Example #41
def procc_modelfusion(df_test, data_test):
    from sklearn.ensemble import BaggingRegressor
    from sklearn import linear_model
    train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
    train_np = train_df.to_numpy()

    # y is the Survived outcome
    y = train_np[:, 0]

    # X holds the feature values
    X = train_np[:, 1:]

    # feed the classifier into a BaggingRegressor
    clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
    bagging_clf = BaggingRegressor(clf, n_estimators=10, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)
    bagging_clf.fit(X, y)

    test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
    predictions = bagging_clf.predict(test)
    result = pd.DataFrame({'PassengerId': data_test['PassengerId'].to_numpy(), 'Survived': predictions.astype(np.int32)})
    result.to_csv("logistic_regression_predictions3.csv", index=False)
Example #42
class Regressor(BaseEstimator):
    def __init__(self):
        # self.clf = GradientBoostingRegressor(n_estimators=200, max_features="sqrt", max_depth=5)
        # self.clf = LinearRegression()
        self.clf = BaggingRegressor(LinearRegression())
        # self.clf = GaussianProcess(theta0=4)
        # self.sp = RandomizedLasso()
        self.sp = SparseRandomProjection(n_components=5)
        # self.sp = TruncatedSVD()
        # self.sp = KernelPCA(n_components=3, tol=0.0001, kernel="poly")
        # self.clf = ExtraTreesRegressor(n_estimators=200, max_features="sqrt", max_depth=5)

    def fit(self, X, y):
        # print(self.sp)
        # Xr = self.sp.fit_transform(X, y)
        self.clf.fit(X, y.ravel())

    def predict(self, X):
        # Xr = self.sp.transform(X)
        return self.clf.predict(X)
Example #43
File: ensemble.py  Project: smly/ume
class BaggingRegressor(BaseEstimator):
    """
    Usage:

    ```
    "model": {
        "class": "ume.ensemble.BaggingRegressor",
        "params": {
            "base_estimator": {
                "class": "sklearn.svm.SVR",
                "params": {
                    "kernel": "rbf",
                    "degree": 1,
                    "C": 1000000.0,
                    "epsilon": 0.01,
                },
            },
            "bag_kwargs": {
                "n_estimators": 100,
                "n_jobs": 5,
                "max_samples": 0.9,
            },
        }
    }
    ```
    """
    def __init__(self, base_estimator=None, bag_kwargs=None):
        klass = dynamic_load(base_estimator['class'])
        svr_reg = klass(**base_estimator['params'])
        self.__clf = SK_BaggingRegressor(base_estimator=svr_reg, **bag_kwargs)

    def fit(self, X, y):
        return self.__clf.fit(X, y)

    def predict(self, X):
        return self.__clf.predict(X)
Example #44
sorted_pairs = sorted(pairs, key=lambda pair: pair[1])
features_sorted, featImportances_sorted = zip(*sorted_pairs)
fig, ax = plt.subplots()
plt.barh(pos, featImportances_sorted, 1, color = "blue")
plt.yticks(pos,features_sorted)
ax.set_title('Gradient Boosting: Relative Feature Importance')

#Tree Bagging
TreeBagger=BaggingRegressor()
TreeBagger.fit(Xtrain, Ytrain)
fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)
ax1.plot_date(dates, modeldata.Load[45000:50000], 'r-',tz=None, xdate=True,
          ydate=False, label='Actual Load')
ax1.set_title('Tree Bagging: Actual and Predicted Loads')          
plt.plot(dates, TreeBagger.predict(Xtest), 'g-',label='Predicted Load')
ax1.legend()
ax2 = fig.add_subplot(2, 1, 2)
ax2.plot_date(dates, modeldata.Load[45000:50000]-TreeBagger.predict(Xtest), 'r-',tz=None, xdate=True,
          ydate=False)
ax2.set_title('Error between actual and predicted loads, MW')

MSEs_Bagging=[mean_squared_error(Ytest, TreeBagger.predict(Xtest)), mean_squared_error(Ytrain, TreeBagger.predict(Xtrain))]

#Model Comparison: Bar charts
fig, ax = plt.subplots()
width=.3
rects1 = ax.bar([0,1,2], [MSEs_Boost[0],MSEs_lm[0], MSEs_Bagging[0]], width, color='y')
rects2 = ax.bar([width, width+1, width+2], [MSEs_Boost[1],MSEs_lm[1], MSEs_Bagging[1]], width, color='b')
ax.set_xticks([width, width+1, width+2])
ax.set_xticklabels(('Gradient Boosting', 'Linear Model', 'Tree Bagging'))
Example #45
# define the training and testing sets
X_train = train.iloc[:, 1:]
y_train = train.iloc[:, 0]
X_test = test.iloc[:, 1:]
y_test = test.iloc[:, 0]


# instruct BaggingRegressor to use DecisionTreeRegressor as the "base estimator"
from sklearn.ensemble import BaggingRegressor
bagreg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=500, bootstrap=True, oob_score=True, random_state=1)


# fit and predict
bagreg.fit(X_train, y_train)
y_pred = bagreg.predict(X_test)
y_pred


# calculate RMSE
np.sqrt(metrics.mean_squared_error(y_test, y_pred))


# ## Estimating out-of-sample error
# 
# For bagged models, out-of-sample error can be estimated without using **train/test split** or **cross-validation**!
# 
# On average, each bagged tree uses about **two-thirds** of the observations. For each tree, the **remaining observations** are called "out-of-bag" observations.

# show the first bootstrap sample
samples[0]
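Since bagreg above was fitted with oob_score=True, the out-of-bag estimate is already available; a short sketch of reading it (oob_score_ is the R^2 on out-of-bag samples, and oob_prediction_ holds the per-sample out-of-bag predictions):

# R^2 estimated on out-of-bag observations, no test set needed
print(bagreg.oob_score_)

# out-of-bag RMSE, comparable to the train/test-split RMSE above
np.sqrt(metrics.mean_squared_error(y_train, bagreg.oob_prediction_))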
Example #46
# Everyone has seen quiz shows where the contestant polls the studio audience and takes
#     the most voted answer as their own. Each person gives a judgement, and in the end
#     we trust the answer held by the majority.
# To put it more plainly: you are close with the maths ace in your class and "model" your
#     homework on his every time, so in the vast majority of cases he is right and so are you.
#     Then one day the ace has an off day, his hand slips, he writes one number wrong, and... well, you can only be wrong along with him.
# Now consider another scenario: you are on good terms with five maths aces, you take all of
#     their homework, compare, and then "do it yourself". If one of them slips up some day
#     and gets it wrong, but the other four got it right, you would surely trust the answer of the four, right?
# That is roughly what the simplest model ensembling means. For a classification problem, say, when we have a pile of
#     classifiers trained on the same dataset (logistic regression, SVM, KNN, random forest,
#     neural networks), we let each of them make its own call, tally the votes, and take the majority as the final result.

# Ensembling alleviates the overfitting produced during training fairly well, and so tends to help the accuracy of the result.
# Back to our problem: so far we have only covered logistic regression, so if we still want to use this ensembling idea to improve our result, what can we do?
# Since there is no choice of model this time, let's tinker with the data instead. Think about it: if a model overfits, it must be fitting our training set too closely.
# So let's simply not use the whole training set: each time we take a subset of it for training. This way,
#     although we use the same learning algorithm, the models we obtain differ; and because no sub-dataset
#     is complete, even if overfitting occurs, it occurs on a sub-training set rather than on the full data,
#     so ensembling these models may help the final result. Yes, this is the commonly used Bagging.
# We use the Bagging in scikit-learn to implement the idea above; the process is very simple. Code follows:
from sklearn.ensemble import BaggingRegressor
train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
train_np = train_df.to_numpy()
y = train_np[:, 0]  # y is the Survived outcome
X = train_np[:, 1:]  # X holds the feature values
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)  # to be fed into the BaggingRegressor
bagging_clf = BaggingRegressor(clf, n_estimators=20, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_clf.fit(X, y)
test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass.*|Mother|Child|Family|Title')
predictions = bagging_clf.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].to_numpy(), 'Survived': predictions.astype(np.int32)})
result.to_csv("./tmp_dataset/Kaggle-Titanic/result.csv", index=False)
# 0.75598; even lower this time, perhaps BaggingRegressor was unlucky with its random subsets
# The previous result matched the blog author's exactly; this second one no longer does
Example #47
model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=5, verbose=0, scoring=RMSE)
errors = []
X_train = df.drop(['product_uid', 'id', 'relevance'], axis=1).values
y_train = df['relevance'].values
model.fit(X_train, y_train)

print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

del X_train, y_train



kf = KFold(df.shape[0], n_folds=K_fold)
for train_index, test_index in kf:
    train_set = df.iloc[train_index]
    test_set = df.iloc[test_index]

    y_train = train_set['relevance'].values
    X_train = train_set.drop(['product_uid', 'id', 'relevance'], axis=1).values
    y_test = test_set['relevance'].values
    X_test = test_set.drop(['product_uid', 'id', 'relevance'], axis=1).values

    clf2.fit(X_train,y_train)

    result = clf2.predict(X_test)
    error = np.sqrt(mean_squared_error(result,y_test))
    errors.extend([error])
print np.mean(errors)
Exemplo n.º 48
0
rf = RandomForestRegressor()
br = BaggingRegressor(rf)

pipe = pipeline.Pipeline([('rf', rf), ('br', br)])

parameters = dict(rf__n_estimators=[5, 10, 15, 20], rf__max_depth=[2, 4, 6, 8, 10], rf__random_state=[0, 5, 10, 15],
	br__n_estimators=[5, 15, 25, 35, 45, 55], br__max_samples=[0.1, 0.2, 0.3], br__random_state=[0, 5, 10, 15, 20, 25, 30])
model = grid_search.GridSearchCV(pipe, parameters)
model.fit(features_train, labels_train)

print("Best parameters:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

#Best parameters:
#{'br__max_samples': 0.1, 'br__n_estimators': 45, 'rf__max_depth': 6, 'br__random_state': 25, 'rf__random_state': 0, 'rf__n_estimators': 5}
#Best CV score: 0.13390585367

pred = model.predict(features_test)
"""

# Use the best parameters from gridsearch
rf = RandomForestRegressor(n_estimators=5, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

# Write predicted numbers to submission.csv file
pd.DataFrame({"id": id_test, "relevance": pred}).to_csv('submission.csv',index=False)
Exemplo n.º 49
0
        
        lm_bagged = BaggingRegressor(
          base_estimator = lm, 
          n_estimators = 75, 
          max_samples = n_samp, 
          max_features = n_feat,
          bootstrap = True, 
          oob_score = False, 
          warm_start = False, 
          n_jobs = -1
        )
        
        log_bagged = BaggingClassifier(
          base_estimator = log, 
          n_estimators = 75, 
          max_samples = n_samp, 
          max_features = n_feat,
          bootstrap = True, 
          oob_score = False, 
          warm_start = False, 
          n_jobs = -1
        )
        
        lm_bagged.fit(X = train[features], y = train['y'])
        log_bagged.fit(X = train[features], y = train['y'])        
        lm_bagged_preds = lm_bagged.predict(X = test[features])
        log_bagged_preds = log_bagged.predict_proba(X = test[features])
        
        write_function(lm_bagged_preds, '/tmp/lm_bagged_preds_nsamp-%s_nfeat-%s.txt' % (n_samp, n_feat))
        write_function(second_pos_clip(log_bagged_preds), '/tmp/log_bagged_preds_nsamp-%s_nfeat-%s.txt' % (n_samp, n_feat))
#RMSE :	0.508722146314
#trainSetFeatures = SelectKBest(f_regression, k=180).fit_transform(trainSetFeatures, trainSetLabels)
#testSetFeatures = SelectKBest(f_regression, k=180).fit_transform(testSetFeatures, testSetLabels)

#RMSE : 0.508874063656
#trainSetFeatures = SelectKBest(f_regression, k=200).fit_transform(trainSetFeatures, trainSetLabels)
#testSetFeatures = SelectKBest(f_regression, k=200).fit_transform(testSetFeatures, testSetLabels)

#RMSE :0.512003679803
#trainSetFeatures = SelectKBest(f_regression, k=220).fit_transform(trainSetFeatures, trainSetLabels)
#testSetFeatures = SelectKBest(f_regression, k=220).fit_transform(testSetFeatures, testSetLabels)
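
# Note on the commented experiments above (a sketch): calling fit_transform on the
# test set separately refits the selector using test labels. The usual pattern is
# to fit SelectKBest on the training data only and reuse the fitted selector:
from sklearn.feature_selection import SelectKBest, f_regression
selector = SelectKBest(f_regression, k=200).fit(trainSetFeatures, trainSetLabels)
trainSetFeatures = selector.transform(trainSetFeatures)
testSetFeatures = selector.transform(testSetFeatures)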


print "\nBegin training..."
#train the model
random_forest_regressor = RandomForestRegressor(n_estimators=15, max_depth=100, random_state=0)
bagging_regressor = BaggingRegressor(random_forest_regressor, n_estimators=45, max_samples=0.1, random_state=25)
bagging_regressor.fit(trainSetFeatures, trainSetLabels)

print "\nBegin prediction..."
#make the prediction on the test set
predictedLabels = bagging_regressor.predict(testSetFeatures)

print "\nOutput the result..."
#output the prediction
testSetId = testSet['id']
pd.DataFrame({"id": testSetId, "relevance": predictedLabels}).to_csv('IOFolder/random_forest_results.csv', index=False)

print "RMSE :\t", utils.getRMSE(testSetLabels, predictedLabels)
    plt.legend(loc='upper right')
    plt.grid(b=True)

    plt.subplot(132)
    t = np.arange(N)
    plt.plot(t, x, 'r-', lw=1, label=u'original data')
    plt.plot(abnormal, x[abnormal], 'go', markeredgecolor='g', ms=3, label=u'outliers')
    plt.legend(loc='upper right')
    plt.title(u'anomaly detection', fontsize=18)
    plt.grid(b=True)

    # prediction
    plt.subplot(133)
    select = np.ones(N, dtype=np.bool)
    select[abnormal] = False
    t = np.arange(N)
    dtr = DecisionTreeRegressor(criterion='mse', max_depth=10)
    br = BaggingRegressor(dtr, n_estimators=10, max_samples=0.3)
    br.fit(t[select].reshape(-1, 1), x[select])
    y = br.predict(np.arange(N).reshape(-1, 1))
    y[select] = x[select]
    plt.plot(x, 'g--', lw=1, label=u'original values')    # original values
    plt.plot(y, 'r-', lw=1, label=u'corrected values')     # corrected values
    plt.legend(loc='upper right')
    plt.title(u'outlier correction', fontsize=18)
    plt.grid(b=True)

    plt.tight_layout(1.5, rect=(0, 0, 1, 0.95))
    plt.suptitle(u'outlier detection and correction in pollution-discharge data', fontsize=22)
    plt.show()
Exemplo n.º 52
0
# Getting Testing Data out of the DF
test_data_frame = data_frame_regression.iloc[num_train:]

# Getting IDs for Testing Data
id_test = test_data_frame['id']

relevance_train = train_data_frame['relevance'].values

# All the Independent Variables in the Regressor
# These are Words in Title, Description, Values
X_train = train_data_frame.drop(['id', 'relevance'], axis=1).values

# Same for Test Data
X_test = test_data_frame.drop(['id', 'relevance'], axis=1).values

# Using RandomForest Regressor
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)

# Using Bagging Regressor
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)

# Fit the Training Data to a Model
clf.fit(X_train, relevance_train)

# Predicting the relevance for Testing Data
relevance_pred = clf.predict(X_test)

# Writing the Relevance Values to Submission.csv
pandas.DataFrame({"id": id_test, "relevance": relevance_pred}).to_csv('submission.csv', index=False)
Exemplo n.º 53
0
for train, test in kf:
	TR.append(train)
	TS.append(test)

A = []
B = []

for k in range(kfcv):
	print k
	X_train = X[TR[k], :]
	y_train = y[TR[k]]
	X_test = X[TS[k], :]
	y_test = y[TS[k]]

	model.fit(X_train, y_train)
	y_predict = model.predict(X_test)

#	plt.subplot(2, 10, k + 1)
#	plt.scatter(y_predict, y_test)
#	plt.xlabel('y_predict')	
#	plt.ylabel('y_true')
#	plt.title('Fold = %d' % (k + 1))

	A.extend(list(y_predict))
	B.extend(list(y_test))

#	mse = mean_squared_error(y_predict, y_test)
#	print 'mse = %f' % mse

mse = mean_squared_error(A, B)
              }

        
model_gbr_allfeatures = grid_search.GridSearchCV(estimator=gbr, param_grid=parameters, n_jobs=-1, cv=2, verbose=20, scoring='mean_squared_error')
model_gbr_allfeatures.fit(X_train, Y_train)
print(model_gbr_allfeatures.best_params_) #'max_depth': 6, 'n_estimators': 500, 'learning_rate': 0.1, 'max_features': 'auto'


predictions_gbr_allfeatures = model_gbr_allfeatures.predict(X_test)
mean_squared_error(Y_test, predictions_gbr_allfeatures) #7.071566



# ensembling the RandomForest model using bagging
bag = BaggingRegressor(rfr, n_estimators=500, max_samples=0.1, random_state=25)
bag.fit(X_train, Y_train)
predictions_rfr_bagging = bag.predict(X_test)
mean_squared_error(Y_test, predictions_rfr_bagging)

# recursive feature selection for the RandomForest
from sklearn.feature_selection import RFECV
rfecv = RFECV(estimator=rfr, step=1, cv=3,
              scoring='mean_squared_error')
rfecv.fit(X_train, Y_train)
print("Optimal number of features : %d" % rfecv.n_features_) 




                     
Exemplo n.º 55
0
#     midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
#     diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
#     return midpoint, diff
#
# plot_learning_curve(model, u"learning  curve", X, Y)


# 6................... model ensembling .....................#


train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
train_np = train_df.as_matrix()

# y is the Survived outcome
y = train_np[:, 0]

x = train_np[:, 1:]


# fit into a BaggingRegressor
model = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
bagging_model = BaggingRegressor(model, n_estimators=20, max_samples=0.8, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1)
bagging_model.fit(x, y)

test = df_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
predictions = bagging_model.predict(test)
result = pd.DataFrame({'PassengerId': data_test['PassengerId'].as_matrix(), 'Survived': predictions.astype(np.int32)})
result.to_csv('./result.csv', index=False)


Exemplo n.º 56
0
if not os.path.exists(out_dir): os.mkdir(out_dir)

filenames = []

for sts12_train_id, sts12_test_id, sts13_test_id, sts14_test_id in id_pairs:
    # combine 2012, 2013 training and test data
    X_sts12_train, y_sts12_train = ntnu_sts12.read_train_data(sts12_train_id, feats)
    X_sts12_test, y_sts12_test = ntnu_sts12.read_test_data(sts12_test_id, feats)
    X_sts13_test, y_sts13_test = sts13.read_test_data(sts13_test_id, feats)
    X_train = np.vstack([X_sts12_train, X_sts12_test, X_sts13_test])
    y_train = np.hstack([y_sts12_train, y_sts12_test, y_sts13_test])

    regressor.fit(X_train, y_train)

    X_test = read_blind_test_data(sts14_test_id, feats)
    y_test = regressor.predict(X_test)

    test_input = read_system_input(test_input_fnames[sts14_test_id])
    postprocess(test_input,  y_test)

    fname =  "{}/STS-en.output.{}.txt".format(out_dir, sts14_test_id)
    write_scores(fname, y_test)
    filenames.append(fname)

descr_fname = "{}/STS-en-{}-{}.description.txt".format(out_dir, GROUP, APPROACH)
open(descr_fname, "w").write(DESCRIPTION)
filenames.append(descr_fname)

filenames = " ".join(filenames)

zipfile = "STS-en-{}-{}.zip".format(GROUP, APPROACH)
Exemplo n.º 57
0
    ]
    full_predictions = []
    for alg, predictors in algorithms:
        if alg == "xgboost_Label":
            full_predictions.append(xgboost_Label(train, test, labels))
        elif alg == "xgboost_Vect":
            full_predictions.append(xgboost_Vect(train, test, labels))
        elif alg == "xgboost_Dummies":
            full_predictions.append(xgboost_Dummies(train, test, labels))
        else:
            if predictors == "dummies":
                print ("Train ", alg.__class__.__name__, " dummies Model ")
                alg = BaggingRegressor(alg)
                alg.fit(train_du, labels)
                print "Prediction :", alg.__class__.__name__, " dummies Model "
                prediction = alg.predict(test_du)
                full_predictions.append(prediction)
            else:
                print ("Train ", alg.__class__.__name__, " Label Model ")
                alg = BaggingRegressor(alg)
                alg.fit(train_rf, labels)
                print "Prediction :", alg.__class__.__name__, " Label Model "
                prediction = alg.predict(test_rf)
                full_predictions.append(prediction)

                # Ensemble models
    RF_label_pred = full_predictions[0]
    RF_dummies_pred = full_predictions[1]
    pred_xgb_dummies = full_predictions[2]
    pred_xgb_Label = full_predictions[3]
    pred_xgb_Vect = full_predictions[4]
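
    # A sketch of one way these five prediction vectors could be combined; the
    # snippet's actual weighting is not shown here, so a plain average is assumed:
    final_pred = (RF_label_pred + RF_dummies_pred + pred_xgb_dummies +
                  pred_xgb_Label + pred_xgb_Vect) / 5.0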
Exemplo n.º 58
0
df_all['letter_in_description'] = df_all['product_info'].map(
        lambda x: str_common_letter(x.split('\t')[0], x.split('\t')[2]))

print("Drop columns that were changed...")
df_all = df_all.drop(['search_term', 'product_title', 'product_description', 'product_info'], axis=1)

# Set up training and test sets
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]

id_test = df_test['id']
y_train = df_train['relevance'].values

# Drop 'id' and 'relevance' columns from the training and test sets
X_train = df_train.drop(['id', 'relevance'], axis=1).values
X_test = df_test.drop(['id', 'relevance'], axis=1).values

# Setup RandomForest and Bagging Regressors
rf = RandomForestRegressor(n_estimators=15, max_depth=6, random_state=0)
clf = BaggingRegressor(rf, n_estimators=45, max_samples=0.1, random_state=25)

# Fit the training data into the regression model using the output values
clf.fit(X_train, y_train)

# Run the prediction
y_pred = clf.predict(X_test)

# Set up our submission Data Frame and write it out (to_csv returns None, so
# keep the frame in its own variable if we want to print it afterwards)
datafr = pd.DataFrame({"id": id_test, "relevance": y_pred})
datafr.to_csv('../dataset/submission.csv', index=False)
print(datafr)
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt

def f(x):
    return 0.5 * np.exp(-(x+3)**2) + np.exp(-x**2) + 0.5 * np.exp(-(x-3)**2)

N = 200 # 200 samples

x_train = np.linspace(-5.5, 5.5, N)
X_train = pd.DataFrame({"x": x_train})
y_train = f(x_train) + (np.random.rand(N) - 0.5) * (2 * 0.05)

dtr = DecisionTreeRegressor(max_depth=5)
br = BaggingRegressor(dtr, n_estimators=200, max_samples=0.2)
br.fit(X_train, y_train)

x_test = np.linspace(x_train.min() * 1.1, x_train.max() * 1.1, 1000)
X_test = pd.DataFrame({"x": x_test})
y_test = f(x_test)
y_predict = br.predict(X_test)

plt.scatter(x_train, y_train)
plt.scatter(x_test, y_test)
plt.scatter(x_test, y_predict)
plt.show()