hour_list.append(date.hour)

rental_data["month"] = np.array(month_list)
rental_data["day"] = np.array(day_list)
rental_data["hour"] = np.array(hour_list)

del rental_data["datetime"]
rental_data = rental_data.iloc[np.random.permutation(len(rental_data))]
rental_counts = rental_data["count"].values

# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import train_test_split
train_data, test_data, train_counts, test_counts = train_test_split(
    rental_data.values, rental_counts, test_size=0.2)

rf = RandomForestRegressor(n_estimators=101)
ada = AdaBoostRegressor(n_estimators=101)
grad = GradientBoostingRegressor(n_estimators=101)
bagging = BaggingRegressor(n_estimators=101)
svr = SVR()

regressors = [rf,ada,grad,bagging,svr]
regressor_names = ["Random Forests","Adaboost Regressor","Gradient Boost Regressor","Bagging Regressor","Support Vector Regressor"]

for regressor,regressor_name in zip(regressors,regressor_names):
    
    regressor.fit(train_data,train_counts)
    predicted_counts = regressor.predict(test_data)
    
    print "-----------------------------------------\n"
    print "Mean Absolute Error for ",regressor_name," : ",metrics.mean_absolute_error(test_counts,predicted_counts)
    print "Median Absolute Error for ",regressor_name," : ",metrics.median_absolute_error(test_counts,predicted_counts)
    print "Mean Squared Error for ",regressor_name," : ",metrics.mean_squared_error(test_counts,predicted_counts)
    print "R2 Score for ",regressor_name, " : ",metrics.r2_score(test_counts,predicted_counts)
Example #2
                                  reducer__svd_solver=['auto'])
reducers_cfg[GenericUnivariateSelect.__name__] = dict(
    reducer__score_func=[f_regression],
    reducer__mode=['k_best'],
    reducer__param=[])
reducers_cfg[RFE.__name__] = dict(reducer__n_features_to_select=[],
                                  reducer__step=[0.1])

#########################
####### Models ##########
#########################
#models = [LinearSVC(),MLPClassifier(),GradientBoostingClassifier(),RandomForestClassifier(),LogisticRegression()]
#models = [AdaBoostClassifier(),BaggingClassifier(),ExtraTreesClassifier(),GradientBoostingClassifier(),RandomForestClassifier(),PassiveAggressiveClassifier(),LogisticRegression(),RidgeClassifier(),SGDClassifier(),GaussianNB(),MultinomialNB(),KNeighborsClassifier(),RadiusNeighborsClassifier(),NearestCentroid(),MLPClassifier(),SVC(),LinearSVC(),NuSVC(),DecisionTreeClassifier(),ExtraTreeClassifier()]
#models_reg = [AdaBoostRegressor(),BaggingRegressor(),ExtraTreesRegressor(),GradientBoostingRegressor(),RandomForestRegressor(),ElasticNet(),HuberRegressor(),Lasso(),LassoLars(),LinearRegression(),PassiveAggressiveRegressor(),Ridge(),SGDRegressor(),OrthogonalMatchingPursuit(),RANSACRegressor(),KNeighborsRegressor(),RadiusNeighborsRegressor(),MLPRegressor(),SVR(),LinearSVR(),NuSVR(),DecisionTreeRegressor(),ExtraTreeRegressor()]
models_reg = [
    BaggingRegressor(),
    ExtraTreesRegressor(),
    GradientBoostingRegressor()
]

models_class = [GradientBoostingClassifier()]
models_class_cfg = {}
models_cfg = {}
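# The `reducer__*` / `model__*` keys above follow scikit-learn's
# `<step>__<param>` convention, so each cfg dict is presumably merged into a
# GridSearchCV param grid for a two-step Pipeline. A minimal sketch of that
# wiring, assuming steps named 'reducer' and 'model' (the step names and the
# synthetic data are illustrative, not from the original script):
from sklearn.datasets import make_regression
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

_X, _y = make_regression(n_samples=200, n_features=20, random_state=0)
_pipe = Pipeline([('reducer', PCA()), ('model', GradientBoostingRegressor())])
_grid = dict(reducer__n_components=[5, 10], model__n_estimators=[50, 100])
GridSearchCV(_pipe, _grid, cv=3).fit(_X, _y)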

# full params - don't work
'''
models_cfg[BaggingClassifier.__name__] = dict(
    model__n_estimators = [10, 50, 100, 130],
    model__bootstrap = [True, False],
    model__bootstrap_features = [True, False],
)
Example #3
def batch_classify(X_train, Y_train, X_test, Y_test, no_classifiers=5, verbose=True):

    dict_models = {}
    for classifier_name, classifier in list(dict_classifiers.items())[:no_classifiers]:
        # time.clock() was removed in Python 3.8; perf_counter() is the replacement
        t_start = time.perf_counter()
        classifier.fit(X_train, Y_train)
        t_end = time.perf_counter()

        t_diff = t_end - t_start
        train_score = classifier.score(X_train, Y_train)
        test_score = classifier.score(X_test, Y_test)

        dict_models[classifier_name] = {'model': classifier, 'train_score': train_score, 'test_score': test_score, 'train_time': t_diff}
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=classifier_name, f=t_diff))
    return dict_models
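# `batch_classify` relies on a module-level `dict_classifiers` mapping that is
# not shown in this excerpt. A minimal sketch of its likely shape (the exact
# entries are an assumption):
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
}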
 
 
 
def display_dict_models(dict_models, sort_by='test_score'):
    cls = [key for key in dict_models.keys()]
    test_s = [dict_models[key]['test_score'] for key in cls]
    training_s = [dict_models[key]['train_score'] for key in cls]
    training_t = [dict_models[key]['train_time'] for key in cls]
    
    df_ = pd.DataFrame(data=np.zeros(shape=(len(cls),4)), columns = ['classifier', 'train_score', 'test_score', 'train_time'])
    for ii in range(0,len(cls)):
        df_.loc[ii, 'classifier'] = cls[ii]
        df_.loc[ii, 'train_score'] = training_s[ii]
        df_.loc[ii, 'test_score'] = test_s[ii]
        df_.loc[ii, 'train_time'] = training_t[ii]
    
    display(df_.sort_values(by=sort_by, ascending=False))

# classification trees

def classification_tree(X_train, y_train):
    tree_clf = DecisionTreeClassifier(max_depth=2)
    tree_clf.fit(X_train, y_train)

    from sklearn.tree import export_graphviz
    export_graphviz(tree_clf, out_file="classification_tree.dot", max_depth=None, feature_names=None,
                    rounded=True,
                    filled=True)
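# The exported .dot file can be rendered with the Graphviz `dot` CLI
# (`dot -Tpng classification_tree.dot -o classification_tree.png`) or, assuming
# the `graphviz` Python package is installed, directly from Python:
import os
import graphviz

if os.path.exists("classification_tree.dot"):
    with open("classification_tree.dot") as f:
        graphviz.Source(f.read()).render("classification_tree", format="png", cleanup=True)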


# ensemble learners

def ensemble_learner(x, y):
    from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC

    log_clf = LogisticRegression()
    # soft voting needs classifiers that implement predict_proba, so use
    # classifier variants instead of RandomForestRegressor/LinearSVR
    rnd_clf = RandomForestClassifier()
    svm_clf = SVC(probability=True)

    voting_clf = VotingClassifier(
        estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
        voting='soft')
    voting_clf.fit(x, y)
    return voting_clf
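# Quick smoke test of the voting ensemble on synthetic data (illustrative only):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

_Xc, _yc = make_classification(n_samples=300, random_state=0)
_Xtr, _Xte, _ytr, _yte = train_test_split(_Xc, _yc, random_state=0)
print("voting accuracy:", ensemble_learner(_Xtr, _ytr).score(_Xte, _yte))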

# bootstrap aggregation
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score

def bagging_regressor(x, y, n_estimators=100):
    # the original `for max_number_samples = len(x):` line is not valid Python;
    # the intent appears to be a simple bag of decision trees
    bag_reg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=n_estimators,
                               max_samples=50, bootstrap=True, n_jobs=-1)
    bag_reg.fit(x, y)  # n_jobs=-1 uses all available processors
    y_pred = bag_reg.predict(x)
    return bag_reg, y_pred
Example #4
nJOBS = 8

cnt = 0
for attrib in attribs:
    workARR = attrib
    label = labels[cnt]
    print(label)

    randForrC = RandomForestClassifier(n_jobs=nJOBS, n_estimators=500)
    randForrR = RandomForestRegressor(n_jobs=nJOBS, n_estimators=500)

    adaBoostC = AdaBoostClassifier(n_estimators=250)
    adaBoostR = AdaBoostRegressor(n_estimators=250)

    bagCoobN = BaggingClassifier(n_estimators=500, n_jobs=nJOBS)
    bagRoobN = BaggingRegressor(n_estimators=500, n_jobs=nJOBS)

    bagCoobY = BaggingClassifier(n_estimators=250,
                                 oob_score=True,
                                 n_jobs=nJOBS)
    bagRoobY = BaggingRegressor(n_estimators=250, oob_score=True, n_jobs=nJOBS)

    bernNB = BernoulliNB()
    gausRidge = linear_model.Ridge(max_iter=1e9, tol=1e-6)

    gradBoostC = GradientBoostingClassifier(n_estimators=500, max_depth=2500)
    gradBoostR = GradientBoostingRegressor(n_estimators=500, max_depth=2500)

    sdgC = linear_model.SGDClassifier(n_jobs=nJOBS)
    sdgR = linear_model.SGDRegressor()
Example #5
# Evaluate performance by averaging the predicted values and comparing the
# average with the target variable of the actual test data
print("RMSE: {}".format(sqrt(mean_squared_error(bagging_predict, test_y)))) # RMSE

## Fit a linear regression model on the training data, then validate on the test data (Scikit-Learn)

from sklearn.linear_model import LinearRegression
regression_model = LinearRegression() # linear regression model
linear_model1 = regression_model.fit(train_x, train_y) # fit the model on the training data
predict1 = linear_model1.predict(test_x) # predict the test data with the fitted model
print("RMSE: {}".format(sqrt(mean_squared_error(predict1, test_y)))) # RMSE result

## Fit a linear regression model with Bagging, then evaluate (5 bootstrap samples)

from sklearn.ensemble import BaggingRegressor
bagging_model = BaggingRegressor(base_estimator = regression_model, # linear regression model
                                 n_estimators = 5, # 5 bootstrap samples
                                 verbose = 1) # show training progress
linear_model2 = bagging_model.fit(train_x, train_y) # train
predict2 = linear_model2.predict(test_x) # predict the test data with the bagged model
print("RMSE: {}".format(sqrt(mean_squared_error(predict2, test_y)))) # RMSE result

## Now let's try more bootstrap samples!

bagging_model2 = BaggingRegressor(base_estimator = regression_model, # linear regression model
                                  n_estimators = 30, # 30 bootstrap samples
                                  verbose = 1) # show training progress
linear_model3 = bagging_model2.fit(train_x, train_y) # train
predict3 = linear_model3.predict(test_x) # predict the test data with the bagged model
print("RMSE: {}".format(sqrt(mean_squared_error(predict3, test_y)))) # RMSE result

## Fit a decision tree model on the training data, then validate on the test data
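# The excerpt stops at the heading above; following the pattern of the linear
# model, the decision-tree step would plausibly read (a sketch, not original code):
from sklearn.tree import DecisionTreeRegressor

decision_tree = DecisionTreeRegressor()  # decision tree model
tree_model = decision_tree.fit(train_x, train_y)  # fit on the training data
predict4 = tree_model.predict(test_x)  # predict the test data
print("RMSE: {}".format(sqrt(mean_squared_error(predict4, test_y))))  # RMSE result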
Example #6
    def __init__(self):
        '''
        Define the classifiers to be used for predicting returns.
        @Classifiers: list of tuples
        @Pipeline: channel of estimators
        @Employs GridSearchCV
        '''
        self.N_NEIGHBORS = 10
        self.KERNELS = ['linear', 'rbf']
        self.GAMMA = [0.0001, 0.001, 0.01, 1]
        self.CRITERION = ['gini', 'entropy']
        self.MAX_DEPTH = 5
        self.MAX_FEATURES = ['auto', 'sqrt', 'log2']
        self.N_VALIDATION = 2
        self.N_COMPONENTS = 2
        self.BEST_ACCURACY = 0.0
        self.BEST_CLASSIFIER = 0
        self.BEST_GRIDSEARCH = ''

        #Support vector regressor
        self.pipe_SVR = Pipeline([('normalizer', StandardScaler()),
                                  ('clf', SVR())])
        self.pipe_SVR_PCA = Pipeline([('normalizer', StandardScaler()),
                                      ('PCA',
                                       PCA(n_components=self.N_COMPONENTS)),
                                      ('clf', SVR())])
        #Adaboost regressor
        self.pipe_AdaBoostRegressor = Pipeline([('normalizer',
                                                 StandardScaler()),
                                                ('clf', AdaBoostRegressor())])
        self.pipe_AdaBoostRegressor_PCA = Pipeline([
            ('normalizer', StandardScaler()),
            ('PCA', PCA(n_components=self.N_COMPONENTS)),
            ('clf', AdaBoostRegressor())
        ])
        #RandomForest Regressor
        self.pipe_RandomForestRegressor = Pipeline([
            ('normalizer', StandardScaler()), ('clf', RandomForestRegressor())
        ])
        self.pipe_RandomForestRegressor_PCA = Pipeline([
            ('normalizer', StandardScaler()),
            ('PCA', PCA(n_components=self.N_COMPONENTS)),
            ('clf', RandomForestRegressor())
        ])
        #Gradient boosting regressor
        self.pipe_GradientBoostingRegressor = Pipeline([
            ('normalizer', StandardScaler()),
            ('clf', GradientBoostingRegressor())
        ])
        self.pipe_GradientBoostingRegressor_PCA = Pipeline([
            ('normalizer', StandardScaler()),
            ('PCA', PCA(n_components=self.N_COMPONENTS)),
            ('clf', GradientBoostingRegressor())
        ])
        #Bagging regressor
        self.pipe_BaggingRegressor = Pipeline([('normalizer',
                                                StandardScaler()),
                                               ('clf', BaggingRegressor())])
        self.pipe_BaggingRegressor_PCA = Pipeline([
            ('normalizer', StandardScaler()),
            ('PCA', PCA(n_components=self.N_COMPONENTS)),
            ('clf', BaggingRegressor())
        ])
        #Extratrees regressor
        self.pipe_ExtraTreesRegressor = Pipeline([
            ('normalizer', StandardScaler()), ('clf', ExtraTreesRegressor())
        ])
        self.pipe_ExtraTreesRegressor_PCA = Pipeline([
            ('normalizer', StandardScaler()),
            ('PCA', PCA(n_components=self.N_COMPONENTS)),
            ('clf', ExtraTreesRegressor())
        ])
        #DecisionTreeRegressor
        self.pipe_DecisionTreeRegressor = Pipeline([
            ('normalizer', StandardScaler()), ('clf', DecisionTreeRegressor())
        ])
        self.pipe_DecisionTreeRegressor_PCA = Pipeline([
            ('normalizer', StandardScaler()),
            ('PCA', PCA(n_components=self.N_COMPONENTS)),
            ('clf', DecisionTreeRegressor())
        ])
        #KNeighborsRegressor
        self.pipe_KNeighborsRegressor = Pipeline([
            ('normalizer', StandardScaler()), ('clf', KNeighborsRegressor())
        ])
        self.pipe_KNeighborsRegressor_PCA = Pipeline([
            ('normalizer', StandardScaler()),
            ('PCA', PCA(n_components=self.N_COMPONENTS)),
            ('clf', KNeighborsRegressor())
        ])
        #RadiusNeighborsRegressor
        self.pipe_RadiusNeighborsRegressor = Pipeline([
            ('normalizer', StandardScaler()),
            ('clf', RadiusNeighborsRegressor())
        ])
        self.pipe_RadiusNeighborsRegressor_PCA = Pipeline([
            ('normalizer', StandardScaler()),
            ('PCA', PCA(n_components=self.N_COMPONENTS)),
            ('clf', RadiusNeighborsRegressor())
        ])
        #LinearRegression
        self.pipe_LinearRegression = Pipeline([('normalizer',
                                                StandardScaler()),
                                               ('clf', LinearRegression())])
        self.pipe_LinearRegression_PCA = Pipeline([
            ('normalizer', StandardScaler()),
            ('PCA', PCA(n_components=self.N_COMPONENTS)),
            ('clf', LinearRegression())
        ])

        #LogisticRegression
        self.pipe_LogisticRegression = Pipeline([
            ('normalizer', StandardScaler()), ('clf', LogisticRegression())
        ])
        self.pipe_LogisticRegression_PCA = Pipeline([
            ('normalizer', StandardScaler()),
            ('PCA', PCA(n_components=self.N_COMPONENTS)),
            ('clf', LogisticRegression())
        ])
        #RANSACRegressor
        self.pipe_RANSACRegressor = Pipeline([('normalizer', StandardScaler()),
                                              ('clf', RANSACRegressor())])
        self.pipe_RANSACRegressor_PCA = Pipeline([
            ('normalizer', StandardScaler()),
            ('PCA', PCA(n_components=self.N_COMPONENTS)),
            ('clf', RANSACRegressor())
        ])
        #Ridge
        self.pipe_Ridge = Pipeline([('normalizer', StandardScaler()),
                                    ('clf', Ridge())])
        self.pipe_Ridge_PCA = Pipeline([('normalizer', StandardScaler()),
                                        ('PCA',
                                         PCA(n_components=self.N_COMPONENTS)),
                                        ('clf', Ridge())])
        #Lasso
        self.pipe_Lasso = Pipeline([('normalizer', StandardScaler()),
                                    ('clf', Lasso())])
        self.pipe_Lasso_PCA = Pipeline([('normalizer', StandardScaler()),
                                        ('PCA',
                                         PCA(n_components=self.N_COMPONENTS)),
                                        ('clf', Lasso())])

        self.pipe_KNN_param = [{
            'clf__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            'clf__leaf_size': [1, 2, 3, 5],
            'clf__weights': ['uniform', 'distance'],
            # needs the clf__ prefix inside a Pipeline; 'kd_tree' is the valid name
            'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        }]

        self.pipe_SVR_params = [{
            'clf__kernel': self.KERNELS,
            'clf__C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            'clf__gamma': self.GAMMA
        }]

        self.pipe_AdaBoostRegressor_param = [{
            'clf__n_estimators': np.arange(1, 50),
            'clf__learning_rate': [0.01, 0.05, 0.1, 0.3, 1],
            'clf__loss': ['linear', 'square', 'exponential']
        }]

        self.pipe_RandomForestRegressor_params = [{
            # self.CRITERION holds gini/entropy, which are classification
            # criteria; the regressor expects regression criteria instead
            # ('mse'/'mae' on scikit-learn < 1.0)
            'clf__criterion': ['squared_error', 'absolute_error'],
            'clf__max_depth': np.arange(2, 10),
            'clf__min_samples_split': np.arange(2, 10, 1),
            'clf__min_samples_leaf': np.arange(2, 10, 1),
            'clf__max_leaf_nodes': np.arange(2, 10, 1)
        }]

        self.pipe_GradientBoostingRegressor_param = [{
            'clf__max_depth': np.arange(2, 10)
        }]

        self.pipe_GaussianNB_params = [{'clf__priors': [None]}]

        self.pipe_GaussianProcessClassifier_params = [{
            'clf__kernel': [1**2 * RBF(1.0)]
        }]

        self.pipe_LogisticRegression_params = [{
            'clf__penalty': ['l1', 'l2'],
            'clf__C': [1.0, 0.5, 0.1],
            'clf__solver': ['liblinear']
        }]

        self.QuadraticDiscriminantAnalysis_params = [{'clf__priors': [None]}]
Example #7
def sklearnBagging(params):
    X = params['X_train']
    Y = params['y_train']
    name = params['name']
    model = BaggingRegressor()
    return MachinLearningModel(model, X, Y, modelType="Linear", name=name)
X_train, y_train = generate(n_samples=n_train, noise=noise)
X_test, y_test = generate(n_samples=n_test, noise=noise)

# One decision tree regressor
dtree = DecisionTreeRegressor().fit(X_train, y_train)
d_predict = dtree.predict(X_test)

plt.figure(figsize=(10, 8))
plt.plot(X_test, f(X_test), 'b')
plt.scatter(X_train, y_train, c='b', s=20)
plt.plot(X_test, d_predict, 'g', lw=2)
plt.xlim([-5, 5])
plt.title("Decision tree, MSE = %.2f" % np.sum((y_test - d_predict)**2))

# Bagging decision tree regressor
bdt = BaggingRegressor(DecisionTreeRegressor()).fit(X_train, y_train)
bdt_predict = bdt.predict(X_test)

plt.figure(figsize=(10, 8))
plt.plot(X_test, f(X_test), 'b')
plt.scatter(X_train, y_train, c='b', s=20)
plt.plot(X_test, bdt_predict, 'y', lw=2)
plt.xlim([-5, 5])
plt.title("Bagging decision tree, MSE = %.2f" % np.sum(
    (y_test - bdt_predict)**2))

# Random forest
rf = RandomForestRegressor(n_estimators=10).fit(X_train, y_train)
rf_predict = rf.predict(X_test)

plt.figure(figsize=(10, 8))
Example #9
del globals()['profiles']
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

seed = 7
np.random.seed(seed)
X_train, X_test, y_train, y_test = train_test_split(likesMAT,
                                                    consARR,
                                                    test_size=1500)

nJOBS = int(sys.argv[1])
nEST = int(sys.argv[2])
bagOUT = BaggingRegressor(n_jobs=nJOBS, n_estimators=nEST, oob_score=True)

#bagOUT.fit(likesMAT, consARR)
bagOUT.fit(X_train, y_train)

y_pred = bagOUT.predict(X_test)
import math
myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("cons, bagOUT:  ", str(nEST), " ", myRMSE)

# joblib.dump(bagOUT, "/Users/jamster/bagOUT-A-cons.xz", compress=9)

# impbagOUT = joblib.load("/Users/jamster/bagOUT-A-cons.xz")
Example #10
    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X. Has to follow the convention 0 for
            normal data, 1 for anomalies.

        sample_weight : array-like, shape (n_samples,) optional
            Array of weights that are assigned to individual samples, typically
            the amount in case of transactions data. Used to grow regression
            trees producing further rules to be tested.
            If not provided, then each sample is given unit weight.

        Returns
        -------
        self : object
            Returns self.
        """

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_ = X.shape[1]

        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        if n_classes < 2:
            raise ValueError("This method needs samples of at least 2 classes"
                             " in the data, but the data contains only one"
                             " class: %r" % self.classes_[0])

        if not isinstance(self.max_depth_duplication, int) \
                and self.max_depth_duplication is not None:
            raise ValueError("max_depth_duplication should be an integer"
                             )
        if not set(self.classes_) == set([0, 1]):
            warn("Found labels %s. This method assumes target class to be"
                 " labeled as 1 and normal data to be labeled as 0. Any label"
                 " different from 0 will be considered as being from the"
                 " target class."
                 % set(self.classes_))
            y = (y > 0)

        # ensure that max_samples is in [1, n_samples]:
        n_samples = X.shape[0]

        if isinstance(self.max_samples, six.string_types):
            raise ValueError('max_samples (%s) is not supported.'
                             'Valid choices are: "auto", int or'
                             'float' % self.max_samples)

        elif isinstance(self.max_samples, INTEGER_TYPES):
            if self.max_samples > n_samples:
                warn("max_samples (%s) is greater than the "
                     "total number of samples (%s). max_samples "
                     "will be set to n_samples for estimation."
                     % (self.max_samples, n_samples))
                max_samples = n_samples
            else:
                max_samples = self.max_samples
        else:  # float
            if not (0. < self.max_samples <= 1.):
                raise ValueError("max_samples must be in (0, 1], got %r"
                                 % self.max_samples)
            max_samples = int(self.max_samples * X.shape[0])

        self.max_samples_ = max_samples

        self.rules_ = {}
        self.estimators_ = []
        self.estimators_samples_ = []
        self.estimators_features_ = []

        # default columns names :
        feature_names_ = [BASE_FEATURE_NAME + x for x in
                          np.arange(X.shape[1]).astype(str)]
        if self.feature_names is not None:
            self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat
                                  for i, feat in enumerate(self.feature_names)}
        else:
            self.feature_dict_ = {BASE_FEATURE_NAME + str(i): feat
                                  for i, feat in enumerate(feature_names_)}
        self.feature_names_ = feature_names_

        clfs = []
        regs = []

        self._max_depths = self.max_depth \
            if isinstance(self.max_depth, Iterable) else [self.max_depth]

        for max_depth in self._max_depths:
            bagging_clf = BaggingClassifier(
                base_estimator=DecisionTreeClassifier(
                    max_depth=max_depth,
                    max_features=self.max_features,
                    min_samples_split=self.min_samples_split),
                n_estimators=self.n_estimators,
                max_samples=self.max_samples_,
                max_features=self.max_samples_features,
                bootstrap=self.bootstrap,
                bootstrap_features=self.bootstrap_features,
                # oob_score=... XXX may be added
                # if selection on tree perf needed.
                # warm_start=... XXX may be added to increase computation perf.
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose)

            bagging_reg = BaggingRegressor(
                base_estimator=DecisionTreeRegressor(
                    max_depth=max_depth,
                    max_features=self.max_features,
                    min_samples_split=self.min_samples_split),
                n_estimators=self.n_estimators,
                max_samples=self.max_samples_,
                max_features=self.max_samples_features,
                bootstrap=self.bootstrap,
                bootstrap_features=self.bootstrap_features,
                # oob_score=... XXX may be added
                # if selection on tree perf needed.
                # warm_start=... XXX may be added to increase computation perf.
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose)

            clfs.append(bagging_clf)
            regs.append(bagging_reg)

        # define regression target:
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            weights = sample_weight - sample_weight.min()
            contamination = float(sum(y)) / len(y)
            y_reg = (
                    pow(weights, 0.5) * 0.5 / contamination * (y > 0) -
                    pow(weights.mean(), 0.5) * (y == 0))
            y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
        else:
            y_reg = y  # same as any other classification bagging

        for clf in clfs:
            clf.fit(X, y)
            self.estimators_ += clf.estimators_
            self.estimators_samples_ += clf.estimators_samples_
            self.estimators_features_ += clf.estimators_features_

        for reg in regs:
            reg.fit(X, y_reg)
            self.estimators_ += reg.estimators_
            self.estimators_samples_ += reg.estimators_samples_
            self.estimators_features_ += reg.estimators_features_

        rules_ = []
        for estimator, samples, features in zip(self.estimators_,
                                                self.estimators_samples_,
                                                self.estimators_features_):

            # Create mask for OOB samples
            mask = ~samples
            if sum(mask) == 0:
                warn("OOB evaluation not possible: doing it in-bag."
                     " Performance evaluation is likely to be wrong"
                     " (overfitting) and selected rules are likely to"
                     " not perform well! Please use max_samples < 1.")
                mask = samples
            rules_from_tree = self._tree_to_rules(
                estimator, np.array(self.feature_names_)[features])

            # XXX todo: idem without dataframe
            X_oob = pandas.DataFrame((X[mask, :])[:, features],
                                     columns=np.array(
                                         self.feature_names_)[features])

            if X_oob.shape[1] > 1:  # otherwise pandas bug (cf. issue #16363)
                y_oob = y[mask]
                y_oob = np.array((y_oob != 0))

                # Add OOB performances to rules:
                rules_from_tree = [(r, self._eval_rule_perf(r, X_oob, y_oob))
                                   for r in set(rules_from_tree)]
                rules_ += rules_from_tree

        # Factorize rules before semantic tree filtering
        rules_ = [
            tuple(rule)
            for rule in
            [Rule(r, args=args) for r, args in rules_]]

        # keep only rules verifying precision_min and recall_min:
        for rule, score in rules_:
            if score[0] >= self.precision_min and score[1] >= self.recall_min:
                if rule in self.rules_:
                    # update the score to the new mean
                    c = self.rules_[rule][2] + 1
                    b = self.rules_[rule][1] + 1. / c * (
                            score[1] - self.rules_[rule][1])
                    a = self.rules_[rule][0] + 1. / c * (
                            score[0] - self.rules_[rule][0])

                    self.rules_[rule] = (a, b, c)
                else:
                    self.rules_[rule] = (score[0], score[1], 1)

        self.rules_ = sorted(self.rules_.items(),
                             key=lambda x: (x[1][0], x[1][1]), reverse=True)

        # Deduplicate the rule using semantic tree
        if self.max_depth_duplication is not None:
            self.rules_ = self.deduplicate(self.rules_)

        self.rules_ = sorted(self.rules_, key=lambda x: - self.f1_score(x))
        self.rules_without_feature_names_ = self.rules_

        # Replace generic feature names by real feature names
        self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                       for rule, perf in self.rules_]

        return self
Example #11
    plt.plot(t, x, 'r-', lw=1, label='original data')
    plt.plot(abnormal,
             x[abnormal],
             'go',
             markeredgecolor='g',
             ms=3,
             label='anomalies')
    plt.legend(loc='upper right')
    plt.title('Anomaly detection', fontsize=18)
    plt.grid(b=True)

    # prediction
    plt.subplot(133)
    select = np.ones(N, dtype=bool)  # np.bool is a removed alias; use the builtin
    select[abnormal] = False
    t = np.arange(N)
    dtr = DecisionTreeRegressor(criterion='mse', max_depth=10)
    br = BaggingRegressor(dtr, n_estimators=10, max_samples=0.3)
    br.fit(t[select].reshape(-1, 1), x[select])
    y = br.predict(np.arange(N).reshape(-1, 1))
    y[select] = x[select]
    plt.plot(x, 'g--', lw=1, label='original values')
    plt.plot(y, 'r-', lw=1, label='corrected values')
    plt.legend(loc='upper right')
    plt.title('Anomaly correction', fontsize=18)
    plt.grid(b=True)

    plt.tight_layout(1.5, rect=(0, 0, 1, 0.95))
    plt.suptitle('Anomaly detection and correction for sewage discharge data', fontsize=22)
    plt.show()
Example #12
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# Step 1: load the data
X, y = load_boston(return_X_y=True)

# Step 2: split the data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=40)

# Step 3: train a BaggingRegressor with a grid search
regression = BaggingRegressor(random_state=40)
param_grid = {
    'base_estimator':
    [DecisionTreeRegressor(criterion='mse', splitter='best')],
    'n_estimators': [x for x in np.arange(10, 101, 30)],
    'max_samples': [0.3, 0.7, 1.0],
    'max_features': [3, 6, 9, 13],
    'bootstrap_features': [True, False]
}
search = GridSearchCV(estimator=regression,
                      param_grid=param_grid,
                      cv=5,
                      refit=True,
                      verbose=1,
                      n_jobs=-1)
search.fit(X_train, y_train)
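# Typical follow-up: inspect the winning configuration and score the refit
# model on the held-out split (standard GridSearchCV usage, not in the original):
print("best params:", search.best_params_)
print("best CV score:", search.best_score_)
print("test R^2:", search.score(X_test, y_test))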
Example #13
def test_sparse_regression():
    # Check regression for various parameter settings on sparse input.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)

    class CustomSVR(SVR):
        """SVC variant that records the nature of the training set"""
        def fit(self, X, y):
            super(CustomSVR, self).fit(X, y)
            self.data_type_ = type(X)
            return self

    parameter_sets = [
        {
            "max_samples": 0.5,
            "max_features": 2,
            "bootstrap": True,
            "bootstrap_features": True
        },
        {
            "max_samples": 1.0,
            "max_features": 4,
            "bootstrap": True,
            "bootstrap_features": True
        },
        {
            "max_features": 2,
            "bootstrap": False,
            "bootstrap_features": True
        },
        {
            "max_samples": 0.5,
            "bootstrap": True,
            "bootstrap_features": False
        },
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:

            # Trained on sparse format
            sparse_classifier = BaggingRegressor(base_estimator=CustomSVR(),
                                                 random_state=1,
                                                 **params).fit(
                                                     X_train_sparse, y_train)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_results = BaggingRegressor(base_estimator=CustomSVR(),
                                             random_state=1,
                                             **params).fit(
                                                 X_train,
                                                 y_train).predict(X_test)

            sparse_type = type(X_train_sparse)
            types = [i.data_type_ for i in sparse_classifier.estimators_]

            assert_array_equal(sparse_results, dense_results)
            assert all([t == sparse_type for t in types])
Example #14
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import operator
import copy

x, y = datahelper.get_xy('data/', num_hours=3, error_minutes=15)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

scores = {}
models = []

for n in range(2, 20):
    estimator = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=4), max_samples=0.5, n_estimators=n)
    estimator.fit(x_train, y_train)
    scores[n] = estimator.score(x_test,y_test)
    models.append(copy.copy(estimator))

sorted_by_scores = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
print('Results of 5 best # of estimators:\n')

for i in range(0, 5):
    n, score = sorted_by_scores[i]
    print("№ estimators = ", n)

    y_predicted = models[n-2].predict(x_test)

    print('R^2 = ' + str(r2_score(y_test, y_predicted)))
    print('RMSE = ' + str(np.sqrt(mean_squared_error(y_test, y_predicted))))
Example #15
import numpy as np
import pandas as pd

from simulator import simulate
from sklearn import metrics
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

np.random.seed(123456)

lr = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=1))

data = pd.read_csv('BTC-USD.csv')
data = data.dropna()
data.Date = pd.to_datetime(data.Date)
data.set_index('Date', drop=True, inplace=True)
diffs = (data.Close.diff()/data.Close).values[1:]

diff_len = len(diffs)



def create_x_data(lags=1):
    diff_data = np.zeros((diff_len, lags))

    for lag in range(1, lags+1):
        this_data = diffs[:-lag]
        diff_data[lag:, lag-1] = this_data

    return  diff_data
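# The snippet ends before `create_x_data` is used; a plausible continuation that
# feeds the lagged features to the bagging model `lr` defined above (the lag
# count and the unshuffled split are illustrative assumptions):
x_data = create_x_data(lags=5)   # lagged returns as features
y_data = diffs                   # next-step return as target
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, shuffle=False)  # keep time order
lr.fit(x_train, y_train)
print("test MSE:", metrics.mean_squared_error(y_test, lr.predict(x_test)))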
Example #16
def test_parallel():
    """Check parallel computations."""
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        # predict_proba
        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict_proba(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y3)

        # decision_function
        ensemble = BaggingClassifier(SVC(), n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        decisions1 = ensemble.decision_function(X_test)
        ensemble.set_params(n_jobs=2)
        decisions2 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions2)

        ensemble = BaggingClassifier(SVC(), n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        decisions3 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions3)

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=n_jobs,
                                    random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=1,
                                    random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y3)
Example #17
y = train_np[:, 0]

# X holds the feature attribute values
X = train_np[:, 1:]

# fit a LogisticRegression, wrapped in a bagging ensemble below
clf = linear_model.LogisticRegression(C=1.0,
                                      penalty='l1',
                                      tol=1e-6,
                                      solver='liblinear',
                                      multi_class='auto',
                                      max_iter=1000)
bagging_clf = BaggingRegressor(clf,
                               n_estimators=20,
                               max_samples=0.8,
                               max_features=1.0,
                               bootstrap=True,
                               bootstrap_features=False,
                               n_jobs=-1)
bagging_clf.fit(X, y)

print(bagging_clf)

test = data_test.filter(regex='Age_.*|SibSp|Parch|Fare_.*|Sex_.*|Pclass_.*')
predictions = bagging_clf.predict(test.values)
result = pd.DataFrame({
    'PassengerId': data_test['PassengerId'].values,
    'Survived': predictions.astype(np.int32)
})
result.to_csv("logistic_regression_predictions.csv", index=False)
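# Since the base estimator is a classifier and the target is binary,
# BaggingClassifier is the more natural meta-estimator here; an equivalent
# sketch with the same parameters:
from sklearn.ensemble import BaggingClassifier

bagging_clf2 = BaggingClassifier(clf,
                                 n_estimators=20,
                                 max_samples=0.8,
                                 max_features=1.0,
                                 bootstrap=True,
                                 bootstrap_features=False,
                                 n_jobs=-1)
bagging_clf2.fit(X, y)
predictions_clf = bagging_clf2.predict(test.values).astype(np.int32)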
Example #18
mlp = MLPRegressor()
mlpFit = mlp.fit(x_train, y_train)

regr = AdaBoostRegressor(random_state=0, n_estimators=100)
regrFit = regr.fit(x_train, y_train)

clfRidge = Ridge(alpha=1.0)
clfRidgeFit = clfRidge.fit(x_train, y_train)

clfBayesian = linear_model.BayesianRidge()
clfBayesianFit = clfBayesian.fit(x_train, y_train)

reg = linear_model.LassoLars(alpha=0.01)
regFit = reg.fit(x_train, y_train)

bag = BaggingRegressor()
bagFit = bag.fit(x_train, y_train)

DT_MAD = mean_absolute_error(y_test, DT_regressionFit.predict(x_test))
SVR_MAD = mean_absolute_error(y_test, svr_regressionFit.predict(x_test))
KNN_MAD = mean_absolute_error(y_test, neighFit.predict(x_test))
MLP_MAD = mean_absolute_error(y_test, mlpFit.predict(x_test))
regr_MAD = mean_absolute_error(y_test, regrFit.predict(x_test))  # AdaBoost predictions
clfRidge_MAD = mean_absolute_error(y_test, clfRidgeFit.predict(x_test))
clfBayesian_MAD = mean_absolute_error(y_test, clfBayesianFit.predict(x_test))
reg_MAD = mean_absolute_error(y_test, regFit.predict(x_test))
bag_MAD = mean_absolute_error(y_test, bagFit.predict(x_test))

print('Regression Tree MAD: ' + str(DT_MAD))
print('Support Vector Regression MAD ' + str(SVR_MAD))
print('KNN MAD ' + str(KNN_MAD))
Example #19
    print('Iteration :',i)
    (XTrain, XTest, YTrain, YTest) = train_test_split(X, Y, test_size=20, random_state=123)    
# Linear Regressions
    lreg = LinearRegression()
    lreg.fit(XTrain, YTrain)
    
    # Decision Tree Regression
    dtr = DecisionTreeRegressor(max_leaf_nodes=34)
    dtr.fit(XTrain, YTrain)
       
    # ADA BOOST REGRESSION
    sen = AdaBoostRegressor(n_estimators=200)
    sen.fit(XTrain, YTrain)         
    
    # Bagging Regression
    breg = BaggingRegressor(n_estimators=100)
    breg.fit(XTrain, YTrain)
    
    # Random Forest
    rfreg = RandomForestRegressor(n_estimators=10)
    rfreg.fit(XTrain, YTrain)
    
    dscores.append(dtr.score(XTest, YTest) * 100)
    bscores.append(breg.score(XTest, YTest) * 100)
    ascores.append(sen.score(XTest, YTest) * 100)
    rscores.append(rfreg.score(XTest, YTest) * 100)
    lscores.append(lreg.score(XTest, YTest) * 100)

plt = matplotlib.pyplot
plt.figure(figsize=(15,15))
plt.scatter(range(num_iters), dscores, color='k', label='Decision Tree Regressor')
    def process(i):
        current_dat = header.iloc[i]
        current_dat_name = current_dat['bench.id']
        # Define Datasets
        # print(str(i)+': '+current_dat_name)
        filename = '/Users/apple/Documents/AD_Datasets/' + dataname + '/benchmarks/' + current_dat_name + '.csv'
        with open(filename, 'r') as csvfile:
            reader = csv.reader(csvfile)
            data = list(reader)
        data = np.array(data)

        X_train = data[1:, 6:].astype('double')
        anomaly_type = data[1:, 5]
        y_label = np.zeros(len(anomaly_type))
        # normal_ind = np.where(anomaly_type == 'nominal')[0]
        anomaly_ind = np.where(anomaly_type == 'anomaly')[0]
        y_label[anomaly_ind] = 1
        # X_normal = X_train[normal_ind,:]
        # X_outlier = X_train[anomaly_ind,:]
        # contamination = len(anomaly_ind)/len(y_label)

        rng = np.random.RandomState(42)

        # BaggedDTM
        #     #################################################################################################
        #     # max_samples = min(2048,X_train.shape[0])
        #     # y = np.random.uniform(size=X_train.shape[0])
        #     # bag_neigh = max(10, int(np.floor(0.03 * max_samples)))
        #     # clf_bagDTM = BaggingRegressor(base_estimator=DTM(n_neighbors=bag_neigh,contamination=0.1),
        #     #                               n_estimators=100, max_samples=max_samples, bootstrap=False, random_state=rng)
        #     # y_score_BDTM = clf_bagDTM.fit(X_train, y).predict(X_train)
        #     # # fpr_DTM, tpr_DTM, thresholds_DTM = roc_curve(y_label, -DTM_score)
        #     # auc_BDTM_score = roc_auc_score(y_label, -y_score_BDTM)
        #     # ap_BDTM_score = average_precision_score(y_label, -y_score_BDTM)

        # sp
        #################################################################################################
        max_samples = min(20, X_train.shape[0])
        y = np.random.uniform(size=X_train.shape[0])
        bag_neigh = 1
        clf_spDTM = BaggingRegressor(base_estimator=DTM(n_neighbors=bag_neigh,
                                                        contamination=0.1),
                                     n_estimators=1,
                                     max_samples=max_samples,
                                     bootstrap=False,
                                     random_state=rng)
        y_score_spDTM = clf_spDTM.fit(X_train, y).predict(X_train)
        auc_spDTM_score = roc_auc_score(y_label, -y_score_spDTM)
        ap_spDTM_score = average_precision_score(y_label, -y_score_spDTM)

        # aNNE
        #################################################################################################
        clf_aNNE = BaggingRegressor(base_estimator=DTM(n_neighbors=bag_neigh,
                                                       contamination=0.1),
                                    n_estimators=100,
                                    max_samples=max_samples,
                                    bootstrap=False,
                                    random_state=rng)
        y_score_aNNE = clf_aNNE.fit(X_train, y).predict(X_train)
        auc_aNNE_score = roc_auc_score(y_label, -y_score_aNNE)
        ap_aNNE_score = average_precision_score(y_label, -y_score_aNNE)

        return [auc_spDTM_score,
                auc_aNNE_score], [ap_spDTM_score, ap_aNNE_score]
Example #21
rand_pred = random_forest.predict(X_test)
print('train score for random_forest:', random_forest.score(X_train, y_train))
print('test score for random_forest:', random_forest.score(X_test, y_test))

#predicted price
y_pred = random_forest.predict(X_test)

#
print("Predicted Prices for Single Family House in Ca", y_pred)

from sklearn.model_selection import cross_val_score
clf = RandomForestRegressor()
scores = cross_val_score(clf, X_test, y_test, cv=5)

#bagging
bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
bg.score(X_train, y_train)
bg.score(X_test, y_test)

# MAE in $
mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is: $", mae)
# checking r^2
from sklearn.metrics import r2_score

print("r_Score:", r2_score(y_test, y_pred))

print("Predicted Prices for Single Family House in Ca", y_test, y_pred)

print("crossvalidation:", scores.mean())
from sklearn.ensemble import RandomForestRegressor

from sklearn.multioutput import MultiOutputRegressor

# to set number of jobs to the number of cores, use n_jobs=-1
model = MultiOutputRegressor(GradientBoostingRegressor(), n_jobs=1).fit(train_scaled[0:-1,:], train_scaled[1:,:])
models.append(model)
modeldims.append(2)
modelnames.append('GradientBoostingRegressor')

model = MultiOutputRegressor(AdaBoostRegressor(), n_jobs=1).fit(train_scaled[0:-1,:], train_scaled[1:,:])
models.append(model)
modeldims.append(2)
modelnames.append('AdaBoostRegressor')

model = MultiOutputRegressor(BaggingRegressor(), n_jobs=1).fit(train_scaled[0:-1,:], train_scaled[1:,:])
models.append(model)
modeldims.append(2)
modelnames.append('BaggingRegressor')

model = MultiOutputRegressor(ExtraTreesRegressor(), n_jobs=1).fit(train_scaled[0:-1,:], train_scaled[1:,:])
models.append(model)
modeldims.append(2)
modelnames.append('ExtraTreesRegressor')

model = MultiOutputRegressor(RandomForestRegressor(), n_jobs=1).fit(train_scaled[0:-1,:], train_scaled[1:,:])
models.append(model)
modeldims.append(2)
modelnames.append('RandomForestRegressor')

model = MultiOutputRegressor(SVR(), n_jobs=1).fit(train_scaled[0:-1,:], train_scaled[1:,:])
Example #23
#
# ## Bagging in scikit-learn
#
# Scikit-learn implements the bagging procedure as a "meta-estimator", that is
# an estimator that wraps another estimator: it takes a base model that is
# cloned several times and trained independently on each bootstrap sample.
#
# The following code snippet shows how to build a bagging ensemble of decision
# trees. We set `n_estimators=100` instead of 3 in our manual implementation
# above to get a stronger smoothing effect.

# %%
from sklearn.ensemble import BaggingRegressor

bagged_trees = BaggingRegressor(
    base_estimator=DecisionTreeRegressor(max_depth=3),
    n_estimators=100,
)
_ = bagged_trees.fit(data_train, target_train)

# %% [markdown]
#
# Let us visualize the predictions of the ensemble on the same test data:
# %%
sns.scatterplot(x=data_train["Feature"],
                y=target_train,
                color="black",
                alpha=0.5)

bagged_trees_predictions = bagged_trees.predict(data_test)
plt.plot(data_test, bagged_trees_predictions)
Example #24
    def linear_regression_algo(self):

        X = []
        Y = []

        with open('../Data/full_table.csv', 'r') as file:
            for line in csv.reader(file, delimiter=','):
                if len(line) == 13:
                    try:
                        zhvi = float(line[5])
                        property_type = line[6]
                        room_type = line[7]
                        accommodates = int(line[8])
                        bathrooms = float(line[9])
                        beds = int(line[10])
                        bed_type = line[11]
                        price = float(line[12])

                        x = {
                            'zhvi': zhvi,
                            'property_type': property_type,
                            'room_type': room_type,
                            'accommodates': accommodates,
                            'bathrooms': bathrooms,
                            'beds': beds,
                            'bed_type': bed_type
                        }

                        y = price

                        X.append(x)
                        Y.append(y)

                    except (ValueError, IndexError):
                        # skip rows with malformed numeric fields
                        pass

        # The DictVectorizer converts data from a dictionary to an array
        vec = DictVectorizer()

        # Convert X to Array
        X = vec.fit_transform(X).toarray()

        # Split X and Y into training and testing sets
        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.33,
                                                            random_state=43)

        # Linear Regression
        model = linear_model.LinearRegression()
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        print('Linear Regression')
        print('Mean Squared Error: {0}'.format(mse))
        print('Mean Absolute Error: {0}'.format(mae))
        print('R2 Score: {0}'.format(r2))

        # With Boosting
        model_boost = AdaBoostRegressor(linear_model.LinearRegression())
        model_boost.fit(X_train, Y_train)
        Y_pred = model_boost.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        print('Linear Regression (with AdaBoost)')
        print('Mean Squared Error: {0}'.format(mse))
        print('Mean Absolute Error: {0}'.format(mae))
        print('R2 Score: {0}'.format(r2))

        # With Bagging
        model_bag = BaggingRegressor(linear_model.LinearRegression())
        model_bag.fit(X_train, Y_train)
        Y_pred = model_bag.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        print('Linear Regression (with Bagging)')
        print('Mean Squared Error: {0}'.format(mse))
        print('Mean Absolute Error: {0}'.format(mae))
        print('R2 Score: {0}'.format(r2))
Example #25
    print("AUC scores of downstream classifiers on test data : ")
    for i in range(0, len(learners)):
        score = learners[i].fit(X_syn, y_syn)
        pred_probs = learners[i].predict_proba(X_test)
        auc_score = roc_auc_score(y_test, pred_probs[:, 1])
        print('-' * 40)
        print('{0}: {1}'.format(names[i], auc_score))

else:
    names = ['Ridge', 'Lasso', 'ElasticNet', 'Bagging', 'MLP']

    learners.append((Ridge()))
    learners.append((Lasso()))
    learners.append((ElasticNet()))
    learners.append((BaggingRegressor()))
    learners.append((MLPRegressor()))

    print("RMSE scores of downstream regressors on test data : ")
    for i in range(0, len(learners)):
        score = learners[i].fit(X_syn, y_syn)
        pred_vals = learners[i].predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, pred_vals))
        print('-' * 40)
        print('{0}: {1}'.format(names[i], rmse))

if opt.model != 'real-data':
    if opt.save_synthetic:

        if not os.path.isdir(opt.output_data_path):
            raise Exception('Output directory does not exist')
Example #26
#### 3.4 KNN regression ####
from sklearn import neighbors

model_KNeighborsRegressor = neighbors.KNeighborsRegressor()
#### 3.5 Random forest regression ####
from sklearn import ensemble

model_RandomForestRegressor = ensemble.RandomForestRegressor(
    n_estimators=20)  # 20 decision trees here
#### 3.6 AdaBoost regression ####
from sklearn import ensemble

model_AdaBoostRegressor = ensemble.AdaBoostRegressor(
    n_estimators=50)  # 50 decision trees here
#### 3.7 GBRT regression ####
from sklearn import ensemble

model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(
    learning_rate=0.2, n_estimators=200)  # 200 decision trees here
#### 3.8 Bagging regression ####
from sklearn.ensemble import BaggingRegressor

model_BaggingRegressor = BaggingRegressor()
#### 3.9 ExtraTree (extremely randomized tree) regression ####
from sklearn.tree import ExtraTreeRegressor

model_ExtraTreeRegressor = ExtraTreeRegressor()

########### 4. Calling the individual methods ##########
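# `try_different_method` and `model_LinearRegression` are defined earlier in the
# original script. A minimal sketch of what such a helper typically does,
# assuming global x_train/x_test/y_train/y_test splits (an assumption):
import numpy as np
import matplotlib.pyplot as plt

def try_different_method(model):
    # fit, score, and plot predicted vs. true values for one model
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    result = model.predict(x_test)
    plt.plot(np.arange(len(result)), y_test, 'go-', label='true value')
    plt.plot(np.arange(len(result)), result, 'ro-', label='predicted value')
    plt.title('score: %f' % score)
    plt.legend()
    plt.show()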
try_different_method(model_LinearRegression)
Example #27
# bootstrap aggregation
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score

def bagging_regressor(x, y, n_estimators=100):
    # the original `for max_number_samples = len(x):` line is not valid Python;
    # the intent appears to be: bag decision trees, and fall back to
    # out-of-bag evaluation when there are at least as many estimators as samples
    if n_estimators < len(y):
        bag_reg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=n_estimators,
                                   max_samples=50, bootstrap=True, n_jobs=-1)
        bag_reg.fit(x, y)  # n_jobs=-1 uses all available processors
        y_pred = bag_reg.predict(x)
        return bag_reg, y_pred
    else:  # out-of-bag evaluation
        bag_reg = BaggingRegressor(DecisionTreeRegressor(), n_estimators=n_estimators,
                                   bootstrap=True, n_jobs=-1, oob_score=True)
        bag_reg.fit(x, y)
        return bag_reg, bag_reg.oob_score_
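# Quick demonstration of the out-of-bag branch on synthetic data (illustrative):
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=80, n_features=5, noise=10.0, random_state=0)
_, oob = bagging_regressor(X_demo, y_demo, n_estimators=100)  # 100 >= 80 -> OOB branch
print("out-of-bag R^2 estimate:", oob)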

# sequential decision tree regressor / gradient boosting

def decision_tree_stacking(x, y):
    """A number of first-level individual learners are generated from the
    training data set and then combined by a meta-learner."""
    tree_reg1 = DecisionTreeRegressor(max_depth=2)
    tree_reg1.fit(x, y)

    # now train a second DecisionTreeRegressor on the residual errors made by the first predictor
    y2 = y - tree_reg1.predict(x)
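    # The excerpt stops here; the standard continuation of this boosting-by-hand
    # pattern fits a second tree on the residuals and sums the stage-wise
    # predictions (a sketch, not the original code):
    tree_reg2 = DecisionTreeRegressor(max_depth=2)
    tree_reg2.fit(x, y2)
    y_pred = tree_reg1.predict(x) + tree_reg2.predict(x)
    return y_pred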
Example #28
    def __init__(self, **args):
        """Init model."""
        self.model_lf = BaggingRegressor(**copy.deepcopy(args))
        self.model_hf = BaggingRegressor(**copy.deepcopy(args))
Example #29
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

train_df = pd.read_csv("/Users/jianjun.yue/PycharmGItHub/data/house_price/model/house_price_train.csv",index_col = 0)
test_df = pd.read_csv("/Users/jianjun.yue/PycharmGItHub/data/house_price/model/house_price_test.csv",index_col = 0)
numeric_cols = train_df.columns[train_df.dtypes != 'object']
y_train=train_df["SalePrice"]
X_train=train_df.drop(['SalePrice'],axis=1)
test_df=test_df.drop(['MSSubClass_90'],axis=1)

ridge = Ridge(alpha = 15)
# Bagging trains many small estimators, each on a random subset of the training
# data, and then combines their outputs (majority vote for classification)
# Bagging is an algorithmic framework rather than a single algorithm
params = [1,10,15,20,25,30,40]
test_scores = []
for param in params:
    clf = BaggingRegressor(base_estimator = ridge,n_estimators = param)
    test_score = np.sqrt(-cross_val_score(clf,X_train,y_train,cv = 10,scoring = 'neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

plt.plot(params,test_scores)
plt.title('n_estimators vs CV Error')
plt.show()

br = BaggingRegressor(base_estimator = ridge,n_estimators = 25)
br.fit(X_train,y_train)
y_final = np.expm1(br.predict(test_df))
    def fit(self,
            label_correctness_file,
            point_labels_file,
            users_file,
            user_quality_features_file='all_users.csv'):
        users = pd.read_csv(users_file)
        users = users.set_index('user_id')
        y_train = users['accuracy']

        users_for_training = users[users['labels_validated'] > 25].index
        self.label_correctness = extract_label_features(
            point_labels_file, label_correctness_file)
        #  Splits the users into training & testing groups
        user_quality_features = pd.read_csv(
            user_quality_features_file).set_index('user_id')
        half = int(len(users_for_training) / 2)
        users_labels_train = users_for_training[:half]
        users_labels_test = users_for_training[half:]

        #         mask = np.random.permutation(np.arange(len(users_for_training)))
        #         users_labels_train = users_for_training[mask[:int(proportion_labels * len(mask))]]
        #         users_labels_test = users_for_training[mask[int(proportion_labels * len(mask)):]]

        train_labels = self.label_correctness.copy()
        train_labels = train_labels[~pd.isna(train_labels['correct'])]
        train_labels = train_labels[~(pd.isna(train_labels[features]).any(
            axis=1))]

        #         en = OrdinalEncoder()
        #         en.fit(pd.concat((train_labels[['CLASS_DESC']], test_labels[['CLASS_DESC']])))
        #         train_labels[['CLASS_DESC']] = en.transform(train_labels[['CLASS_DESC']])

        self.rfe_labels = RFECV(
            estimator=RandomForestClassifier(n_estimators=10),
            step=1,
            cv=StratifiedKFold(5),
            scoring='precision')
        self.clf_labels = RandomForestClassifier(random_state=0,
                                                 n_jobs=-1,
                                                 n_estimators=30)
        self.clf_accuracy = BaggingRegressor(random_state=0,
                                             n_jobs=-1,
                                             n_estimators=30)
        self.rfe_accuracy = RFECV(
            estimator=RandomForestClassifier(n_estimators=10),
            step=1,
            cv=StratifiedKFold(5),
            scoring='f1')

        print('Training label classifier...')
        self.rfe_labels.fit(
            train_labels[train_labels['user_id'].isin(users_labels_train)]
            [features].values, train_labels[train_labels['user_id'].isin(
                users_labels_train)]['correct'].astype(int))

        self.clf_labels.fit(
            train_labels[train_labels['user_id'].isin(users_labels_train)]
            [features].values[:, self.rfe_labels.support_],
            train_labels[train_labels['user_id'].isin(
                users_labels_train)]['correct'].astype(int))

        train_labels = train_labels.join(pd.Series(
            data=self.clf_labels.predict_proba(
                train_labels[train_labels['user_id'].isin(users_labels_test)]
                [features].values[:, self.rfe_labels.support_])[:, 1],
            index=train_labels[train_labels['user_id'].isin(
                users_labels_test)].index).rename('prob'),
                                         how='outer')

        prob_hist_predictions = pd.DataFrame(
            train_labels[train_labels['user_id'].isin(users_labels_test)]
            .groupby('user_id')
            .apply(lambda x: prob_hist(x['prob'].values))
            .rename('prob'))

        prob_hist_predictions = prob_hist_predictions.join(
            user_quality_features)

        print('Training accuracy classifier...')
        self.rfe_accuracy.fit(
            np.concatenate((dearray(prob_hist_predictions['prob']),
                            prob_hist_predictions.drop(columns='prob').values),
                           axis=1),
            y_train.loc[prob_hist_predictions.index] > 65)

        self.clf_accuracy.fit(
            np.concatenate((dearray(prob_hist_predictions['prob']),
                            prob_hist_predictions.drop(columns='prob').values),
                           axis=1)[:, self.rfe_accuracy.support_],
            y_train.loc[prob_hist_predictions.index])
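# `prob_hist` and `dearray` are helpers not shown in this excerpt. Judging from
# their usage above (a per-user histogram of predicted label-correctness
# probabilities, later expanded into feature columns), a plausible sketch
# (purely an assumption about the original implementation) is:
import numpy as np

def prob_hist(probs, bins=10):
    # fixed-length, normalized histogram of one user's label probabilities
    counts, _ = np.histogram(probs, bins=bins, range=(0.0, 1.0))
    return counts / max(len(probs), 1)

def dearray(series):
    # stack a Series of equal-length arrays into a 2-D feature matrix
    return np.stack(series.values)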