Example #1
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- activates IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor as ETR


def imputData(data):
    # Separate columns into numeric and non-numeric
    def splitTypes(dframe):
        objectsName = []
        objects = pd.DataFrame()
        for i in dframe.columns:
            # If column i is not float
            if not (dframe[i].dtype == np.float64):
                # Record its name
                objectsName.append(i)
                # Collect the non-numeric column
                objects = pd.concat([objects, dframe[i]], axis=1)
                # Drop it from the numeric frame
                dframe = dframe.drop(columns=objectsName[-1])
        # Names of the remaining numeric columns
        dframeName = list(dframe.columns)

        return dframeName, dframe, objectsName, objects

    w, x, y, z = splitTypes(data)

    # Extra-Trees regressor used as the imputation estimator
    impute_est = ETR(n_estimators=10, random_state=0)
    # Iterative imputer
    estimator = IterativeImputer(random_state=0, estimator=impute_est)
    # Fit and transform the numeric columns
    impdf = estimator.fit_transform(x)
    # Concatenate the imputed numerics with the non-numeric columns
    imp_data = pd.concat([pd.DataFrame(impdf), z], axis=1)
    # Restore the column names
    imp_data.columns = w + y

    return imp_data
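
A minimal usage sketch for the helper above (the toy frame and column names are illustrative, and the imports added at the top of the example are assumed):

df = pd.DataFrame({
    'a': [1.0, np.nan, 3.0],
    'b': [4.0, 5.0, np.nan],
    'label': ['x', 'y', 'z'],  # non-numeric column, passed through untouched
})
print(imputData(df))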
Example #2
    def iterative_fit(self, X, y, n_iter=1, refit=False):
        from sklearn.ensemble import ExtraTreesRegressor as ETR

        if refit:
            self.estimator = None

        if self.estimator is None:
            num_features = X.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(X.shape[1] / 2), max_features))
            self.estimator = ETR(n_estimators=n_iter,
                                 criterion=self.criterion,
                                 max_depth=self.max_depth,
                                 min_samples_split=self.min_samples_split,
                                 min_samples_leaf=self.min_samples_leaf,
                                 bootstrap=self.bootstrap,
                                 max_features=max_features,
                                 max_leaf_nodes=self.max_leaf_nodes,
                                 oob_score=self.oob_score,
                                 n_jobs=self.n_jobs,
                                 verbose=self.verbose,
                                 random_state=self.random_state,
                                 warm_start=True)
        else:
            self.estimator.n_estimators += n_iter

        self.estimator.fit(X, y)

        return self
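
The warm-start pattern used by iterative_fit can be seen in isolation with a self-contained sketch (synthetic data; not part of the original class):

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor as ETR

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
y = X.sum(axis=1)

est = ETR(n_estimators=10, warm_start=True, random_state=0)
est.fit(X, y)            # grows the first 10 trees
est.n_estimators += 10   # warm_start keeps the existing trees
est.fit(X, y)            # fits only the 10 additional trees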
Example #3
    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            num_features = X.shape[1]
            max_features = int(
                float(self.max_features) * (np.log(num_features) + 1))
            # Use at most half of the features
            max_features = max(1, min(int(X.shape[1] / 2), max_features))
            self.estimator = ETR(n_estimators=0,
                                 criterion=self.criterion,
                                 max_depth=self.max_depth,
                                 min_samples_split=self.min_samples_split,
                                 min_samples_leaf=self.min_samples_leaf,
                                 bootstrap=self.bootstrap,
                                 max_features=max_features,
                                 max_leaf_nodes=self.max_leaf_nodes,
                                 oob_score=self.oob_score,
                                 n_jobs=self.n_jobs,
                                 verbose=self.verbose,
                                 random_state=self.random_state,
                                 warm_start=True)
        tmp = self.estimator  # TODO copy ?
        tmp.n_estimators += n_iter
        tmp.fit(X, y)
        self.estimator = tmp
        return self
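
Note on the TODO: tmp = self.estimator only binds a second name to the same estimator object, so no copy is made and the final self.estimator = tmp assignment is a no-op. An actual copy (copy.deepcopy) is not needed for warm-start fitting to work here.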
Example #4
    def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
        from sklearn.ensemble import ExtraTreesRegressor as ETR

        if refit:
            self.estimator = None

        if self.estimator is None:
            max_features = int(X.shape[1]**float(self.max_features))
            self.estimator = ETR(
                n_estimators=n_iter,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                min_impurity_decrease=self.min_impurity_decrease,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                warm_start=True)

        else:
            self.estimator.n_estimators += n_iter
            self.estimator.n_estimators = min(self.estimator.n_estimators,
                                              self.n_estimators)

        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self
Example #5
    def iterative_fit(self, X, y, n_iter=1, refit=False):
        from sklearn.ensemble import ExtraTreesRegressor as ETR

        if refit:
            self.estimator = None

        if self.estimator is None:

            self.n_estimators = int(self.n_estimators)
            if self.criterion not in ("mse", "friedman_mse", "mae"):
                raise ValueError(
                    "'criterion' is not in ('mse', 'friedman_mse', "
                    "'mae): %s" % self.criterion)

            if check_none(self.max_depth):
                self.max_depth = None
            else:
                self.max_depth = int(self.max_depth)

            if check_none(self.max_leaf_nodes):
                self.max_leaf_nodes = None
            else:
                self.max_leaf_nodes = int(self.max_leaf_nodes)

            self.min_samples_leaf = int(self.min_samples_leaf)
            self.min_samples_split = int(self.min_samples_split)
            self.max_features = float(self.max_features)
            self.min_impurity_decrease = float(self.min_impurity_decrease)
            self.bootstrap = check_for_bool(self.bootstrap)
            self.n_jobs = int(self.n_jobs)
            self.verbose = int(self.verbose)

            self.estimator = ETR(
                n_estimators=n_iter,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=self.max_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_impurity_decrease=self.min_impurity_decrease,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                warm_start=True)
        else:
            self.estimator.n_estimators += n_iter
            self.estimator.n_estimators = min(self.estimator.n_estimators,
                                              self.n_estimators)

        self.estimator.fit(X, y)

        return self
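
check_none and check_for_bool are not defined in the snippet; they come from the surrounding project's utilities. Minimal stand-ins, with behavior assumed from how they are used above:

def check_none(p):
    # Hyperparameters may arrive as the string "None" rather than None.
    return p is None or p == "None"

def check_for_bool(p):
    # Accept real bools as well as their string-encoded forms.
    if isinstance(p, bool):
        return p
    if p in ("True", "False"):
        return p == "True"
    raise ValueError("%s cannot be interpreted as a bool" % str(p))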
Example #6
    def __init__(self,
                 timeseries,
                 dataname,
                 n_estimators=100,
                 criterion='mse',
                 min_samples_leaf=1,
                 min_samples_split=2,
                 max_features=1,
                 bootstrap=False,
                 max_leaf_nodes='None',
                 max_depth='None',
                 min_impurity_decrease=0.0,
                 Window_size=20,
                 Difference=False,
                 time_feature=True,
                 tsfresh_feature=True,
                 forecasting_steps=25,
                 n_splits=5,
                 max_train_size=None,
                 NAN_threshold=0.05):

        if criterion not in ("mse", "friedman_mse", "mae"):
            raise ValueError("'criterion' is not in ('mse', 'friedman_mse', "
                             "'mae): %s" % criterion)
        self.criterion = criterion
        self.n_estimators = int(n_estimators)
        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)
        self.max_features = float(max_features)
        self.bootstrap = bootstrap

        if max_leaf_nodes == "None" or max_leaf_nodes is None:
            self.max_leaf_nodes = None
        else:
            self.max_leaf_nodes = int(max_leaf_nodes)

        if max_depth == "None" or max_depth is None:
            self.max_depth = None
        else:
            self.max_depth = int(max_depth)

        self.min_impurity_decrease = float(min_impurity_decrease)

        self.estimator = ETR(n_estimators=self.n_estimators,
                             criterion=self.criterion,
                             max_depth=self.max_depth,
                             min_samples_split=self.min_samples_split,
                             min_samples_leaf=self.min_samples_leaf,
                             bootstrap=self.bootstrap,
                             max_features=self.max_features,
                             max_leaf_nodes=self.max_leaf_nodes,
                             min_impurity_decrease=self.min_impurity_decrease,
                             warm_start=True)

        super().__init__(timeseries, dataname, Window_size, time_feature,
                         Difference, tsfresh_feature, forecasting_steps,
                         n_splits, max_train_size, NAN_threshold)
Example #7
def get_Qmin(X0, X1, u, cost, T):
    """
    Gets the extra trees regressor model after training through multiple steps.
    Evaluates the suboptimal regressor for the infinite horizon problem. 

    Parameters
    ----------
    X0 : numpy 2D array of (n_samples,n_features)
         Each row indicate the episodes in the training data and columns indicate
         delta and omega in respective columns
    X1 : numpy 2D array of (n_samples,n_features)
         Each row indicate the episodes in the training data and columns indicate
         delta and omega in respective columns
    u : numpy 1D array (n_samples,)
        Each row indicates the episode and contains the control used in the 
        episode
    cost : numpy 1D array (n_samples,)
        Each row indicates the episode and contains the cost incurred in the 
        episode
    T : integer value
        number of time steps to consider.

    Returns
    -------
    TYPE
        DESCRIPTION.

    """
    gamma = 0.95
    n_samples = X0.shape[0]
    Xtrain = np.concatenate((X0, u), axis=1)
    for n in range(T):
        if n == 0:
            ytrain = cost
            REGRESSOR = ETR(n_estimators=50).fit(Xtrain, ytrain)
        else:
            Qdata = np.zeros(shape=(n_samples, 11))
            for i in range(11):
                udata = -i * 0.016 * np.ones(shape=(n_samples, 1))
                Xdata = np.concatenate((X1, udata), axis=1)
                Qdata[:, i] = REGRESSOR.predict(Xdata)
            ytrain = cost + gamma * np.amin(Qdata, axis=1)
            REGRESSOR = ETR(n_estimators=50).fit(Xtrain, ytrain)
    return REGRESSOR
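
An illustrative call with synthetic transitions (numpy as np and the ETR alias are assumed to be imported as in the other examples; note u must be a column vector so the axis=1 concatenation works):

rng = np.random.default_rng(0)
n = 200
X0 = rng.standard_normal((n, 2))             # (delta, omega) at step t
X1 = X0 + 0.1 * rng.standard_normal((n, 2))  # successor states
u = rng.uniform(-0.16, 0.0, size=(n, 1))     # controls, shape (n, 1)
cost = (X0 ** 2).sum(axis=1)                 # per-episode stage cost
Qmin = get_Qmin(X0, X1, u, cost, T=5)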
Example #8
    def __init__(self, df, mode, pca):
        #self.scaler = MinMaxScaler() #StandardScaler()
        y = df.y.values
        x = df[cols].values
        #self.means = x.mean()
        #x = x.fillna(self.means)
        #x = x.values
        self.decomp = False
        self.model = None

        #x = self.scaler.fit_transform(x)
        if pca:
            var_exp_tol = 0.9
            self.decomp = ipca()
            x_mod = self.decomp.fit_transform(x)
            cum_var_exp = self.decomp.explained_variance_ratio_.cumsum()
            self.n = np.arange(0, x.shape[1])[cum_var_exp >= var_exp_tol][0]
            self.n = min(self.n, int(np.sqrt(df.shape[0])))
            x = x_mod[:, 0:self.n]
            print("# PC used:", self.n, "variance explained:", var_exp_tol)
        if mode == 'A':
            self.model = 'linreg'
            self.regr = LinearRegression()
        elif mode == 'B':
            self.model = 'ridge'
            #self.regr = Ridge(alpha = 200,random_state = 52)
            self.regr = RidgeCV(alphas=np.arange(0.1,1.1,0.1)*1E5,\
                        store_cv_values=True)
            #scoring='neg_mean_squared_error')
        elif mode == 'C':
            self.model = 'lasso'
            #self.regr = Lasso(alpha = 1, selection = 'random', random_state = 52)
            self.regr = LassoCV(alphas=np.array([0.1, 1, 10]) * 1E4)
        elif mode == 'D':
            self.model = 'elasticNet'
            #self.regr = ElasticNet(alpha = 1,l1_ratio = 0.2, selection = 'random', random_state = 52)
            self.regr = ElasticNetCV(l1_ratio=np.arange(0.1,1.1)*0.1,\
                        n_alphas=3,alphas = np.array([0.01,0.1,1])*1E7)
        elif mode == 'E':
            self.model = 'ETR'
            self.regr = ETR(n_estimators=500,max_depth=10,\
                            max_features='auto',\
                            bootstrap=True,\
                            criterion='mse',\
                            #verbose=1,\
                            oob_score=True,\
                            n_jobs=4,random_state=50)
            #est = ETR(random_state=50,verbose=1,n_jobs=-1)
            tuned_parameters = [{
                'n_estimators': [10, 100],
                'max_depth': [5, 50]
            }]
            #self.regr = GridSearchCV(estimator=est,\
            #                        param_grid=tuned_parameters,\
            #                        verbose=1)
        self.regr.fit(x, y)
Example #9
def __ensemble_test(type, X_train, X_test, y_train, y_test):
    if type.lower() == 'gbr':
        reg = GBR(n_estimators=100, random_state=1)
    elif type.lower() == 'rfr':
        reg = RFR(n_estimators=100, random_state=1)
    elif type.lower() == 'abr':
        reg = ABR(n_estimators=100, random_state=1)
    elif type.lower() == 'etr':
        reg = ETR(n_estimators=100, random_state=1)
    else:
        raise ValueError("unknown ensemble type: %s" % type)
    reg.fit(X_train, y_train)
    return reg, reg.score(X_test, y_test), reg.feature_importances_
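
An illustrative call (GBR/RFR/ABR/ETR are assumed to be the usual sklearn ensemble regressor aliases imported elsewhere in the module):

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=8, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
reg, r2, importances = __ensemble_test('etr', X_train, X_test, y_train, y_test)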
Example #10
    def fit(self, X, y, sample_weight=None):
        from sklearn.ensemble import ExtraTreesRegressor as ETR

        max_features = int(X.shape[1]**float(self.max_features))
        self.estimator = ETR(
            n_estimators=self.get_max_iter(),
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            bootstrap=self.bootstrap,
            max_features=max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            min_impurity_decrease=self.min_impurity_decrease,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            random_state=self.random_state,
            warm_start=True)

        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self
Example #11
def regression(X,
               Y,
               X_cv,
               Y_cv,
               n_estimators=200,
               criterion='mse',
               max_depth=20):
    ####### Extremely randomized trees (ERT) regressor trained on simulated data
    KK = X.shape[1]
    mse_train = []
    mse_cv = []
    importance = np.zeros(KK)

    regr = ETR(n_estimators=n_estimators,
               criterion=criterion,
               max_depth=max_depth)
    regr.fit(X, Y)
    Y_pred = regr.predict(X)
    mse_train.append(np.sqrt(mse(Y, Y_pred)))  # RMSE on training data, despite the name
    Y_pred = regr.predict(X_cv)
    mse_cv.append(np.sqrt(mse(Y_cv, Y_pred)))  # RMSE on the CV split
    importance[:] = regr.feature_importances_  # computed but not returned

    return regr, mse_train, mse_cv
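
An illustrative call on synthetic data (mse is assumed to be sklearn.metrics.mean_squared_error, as the np.sqrt wrapping suggests; note the default criterion='mse' is only valid on older scikit-learn releases, since the criterion was renamed 'squared_error' in 1.0):

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X_all, Y_all = make_regression(n_samples=300, n_features=10, random_state=0)
X, X_cv, Y, Y_cv = train_test_split(X_all, Y_all, random_state=0)
regr, rmse_train, rmse_cv = regression(X, Y, X_cv, Y_cv, n_estimators=100)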
Example #12
def main():

    ### parsing and Data pre-processing
    # load the provided data
    train_features_path = os.path.join(data_path, 'dengue_features_train.csv')
    train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv')

    ### pre-processing data
    sj_train, iq_train = preprocess_data(train_features_path,
                                         labels_path=train_labels_path)
    #print(sj_train.describe())
    #print(iq_train.describe())

    kf = KFold(n_splits=6)

    sj_model_list = []
    sj_err_list = []
    loop = 1
    for train_index, val_index in kf.split(
            sj_train
    ):  #The index will be split into [train_index] and [val_index]
        X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index]
        sj_etr = ETR(n_estimators=800, max_depth=4, random_state=0, verbose=1)
        sj_etr.fit(X_train.drop('total_cases', axis=1), X_train['total_cases'])
        predictions = sj_etr.predict(X_val.drop('total_cases', axis=1))
        sj_err_list.append(
            eval_measures.meanabs(predictions, X_val.total_cases))
        sj_model_list.append(sj_etr)
        loop += 1
    print(sj_err_list)
    # index of the smallest validation error (despite the variable name)
    argmax = sorted(range(len(sj_err_list)), key=lambda x: sj_err_list[x])[0]
    print(argmax)
    sj_best_model = sj_model_list[argmax]

    iq_model_list = []
    iq_err_list = []
    loop = 1
    for train_index, val_index in kf.split(iq_train):
        X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index]
        iq_etr = ETR(n_estimators=400, max_depth=4, random_state=0)
        iq_etr.fit(X_train.drop('total_cases', axis=1), X_train['total_cases'])
        predictions = iq_etr.predict(X_val.drop('total_cases', axis=1))
        iq_err_list.append(
            eval_measures.meanabs(predictions, X_val.total_cases))
        iq_model_list.append(iq_etr)

        loop += 1
    print(iq_err_list)
    # index of the smallest validation error (despite the variable name)
    argmax = sorted(range(len(iq_err_list)), key=lambda x: iq_err_list[x])[0]
    print(argmax)
    iq_best_model = iq_model_list[argmax]

    ##Accessing testing data
    test_features_path = os.path.join(data_path, 'dengue_features_test.csv')
    sj_test, iq_test = preprocess_data(test_features_path)

    #Calculate the k-fold validation error
    sj_score = []
    for train_index, val_index in kf.split(sj_train):
        X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index]
        train_predict = np.array(
            sj_best_model.predict(X_val.drop('total_cases',
                                             axis=1))).astype(int)
        sj_score.append(eval_measures.meanabs(train_predict,
                                              X_val.total_cases))
    print("Mean of {} cross validation of sj_score is {} (+/- {})".format(
        kf.get_n_splits(sj_train), np.mean(sj_score), np.std(sj_score)))

    iq_score = []
    for train_index, val_index in kf.split(iq_train):
        X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index]
        train_predict = np.array(
            iq_best_model.predict(X_val.drop('total_cases',
                                             axis=1))).astype(int)
        iq_score.append(eval_measures.meanabs(train_predict,
                                              X_val.total_cases))
    print("Mean of {} cross validation of iq_score is {} (+/- {})".format(
        kf.get_n_splits(iq_train), np.mean(iq_score), np.std(iq_score)))

    ##Use the model sj_lr and iq_lr trained before to predict the testing data
    print("Predicting testing data...")
    sj_predictions = sj_best_model.predict(sj_test)
    iq_predictions = iq_best_model.predict(iq_test)
    sj_predictions = np.array(sj_predictions).astype(int)
    iq_predictions = np.array(iq_predictions).astype(int)

    print("Creating submit file...")
    ##Use submission_format as template to write the answer
    sample_path = os.path.join(data_path, 'submission_format.csv')
    submission = pd.read_csv(sample_path, index_col=[0, 1, 2])
    submission.total_cases = np.concatenate([sj_predictions, iq_predictions])
    submission.to_csv("./data/ext_new.csv")
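
A portability note: DataFrame.ix, used throughout these K-fold loops, was removed in pandas 1.0; with the default RangeIndex here, positional .iloc is the drop-in replacement, e.g. X_train, X_val = sj_train.iloc[train_index], sj_train.iloc[val_index].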
Example #13
def main():

    ### parsing and Data pre-processing
    # load the provided data
    train_features_path = os.path.join(data_path, 'dengue_features_train.csv')
    train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv')

    ### pre-processing data
    sj_train, iq_train = preprocess_data(train_features_path,
                                         labels_path=train_labels_path)
    #print(sj_train.describe())
    #print(iq_train.describe())

    ###Define the xgb parameters
    xgb_params = {
        'eta': 0.05,
        'max_depth': 5,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1
    }
    num_boost_rounds = 1000
    ##Use K-fold to create cross validation data
    kf = KFold(n_splits=6)

    ##Do the stacking by adding six columns 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br' which store the training predictions
    sj_train = sj_train.assign(negbi=0)
    sj_train = sj_train.assign(gb=0)
    sj_train = sj_train.assign(xgb=0)
    sj_train = sj_train.assign(abr=0)
    sj_train = sj_train.assign(etr=0)
    sj_train = sj_train.assign(br=0)

    loop = 1
    for train_index, val_index in kf.split(
            sj_train
    ):  #The index will be split into [train_index] and [val_index]
        X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index]
        ###(1)neg_binomial method
        sj_neg_model = get_best_model(X_train, X_val, 'sj')
        predictions_neg = sj_neg_model.predict(X_val).astype(int)
        #Shift the prediction manually
        for i in range(predictions_neg.shape[0] - 1, 3, -1):
            predictions_neg.ix[i] = predictions_neg.ix[i - 4]

        ###(2)gradient boosting method
        sj_gb_model = gradient_boosting(
            X_train.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1),
            X_val.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1))
        predictions_gb = sj_gb_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1)).astype(int)

        ###(3)xgboost method
        dtrain = xgb.DMatrix(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        dval = xgb.DMatrix(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))
        sj_xgb_model = xgb.train(dict(xgb_params, silent=0),
                                 dtrain,
                                 num_boost_round=num_boost_rounds)
        predictions_xgb = sj_xgb_model.predict(dval).astype(int)

        ###(4)Adaboost regressor method
        sj_abr_model = ABR(n_estimators=800,
                           learning_rate=0.08,
                           loss='linear',
                           random_state=0)
        sj_abr_model.fit(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        predictions_abr = sj_abr_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))

        ###(5)Extra tree regressor method
        sj_etr_model = ETR(n_estimators=800,
                           max_depth=4,
                           random_state=0,
                           verbose=1)
        sj_etr_model.fit(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        predictions_etr = sj_etr_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))

        ###(6) Bagging Regressor method
        sj_br_model = BR(n_estimators=800,
                         oob_score=False,
                         n_jobs=5,
                         random_state=0,
                         verbose=1)
        sj_br_model.fit(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        predictions_br = sj_br_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))

        ###Store the result in sj_train  predictions_neg -> 'negbi', predictions_gb -> 'gb'
        print(
            "Adding the result of the predictions to sj training data({}/{})".
            format(loop, 6))
        for idx, index in enumerate(val_index):
            sj_train['negbi'].ix[index] = predictions_neg.ix[idx]
            sj_train['gb'].ix[index] = predictions_gb[idx]
            sj_train['xgb'].ix[index] = predictions_xgb[idx]
            sj_train['abr'].ix[index] = predictions_abr[idx]
            sj_train['etr'].ix[index] = predictions_etr[idx]
            sj_train['br'].ix[index] = predictions_br[idx]
        loop += 1

    iq_train = iq_train.assign(negbi=0)
    iq_train = iq_train.assign(gb=0)
    iq_train = iq_train.assign(xgb=0)
    iq_train = iq_train.assign(abr=0)
    iq_train = iq_train.assign(etr=0)
    iq_train = iq_train.assign(br=0)

    loop = 1
    for train_index, val_index in kf.split(iq_train):
        X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index]

        ###(1)neg_binomial method
        iq_neg_model = get_best_model(X_train, X_val, 'iq')
        predictions_neg = iq_neg_model.predict(X_val).astype(int)
        #Shift the prediction manually
        for i in range(predictions_neg.shape[0] - 1, 0, -1):
            predictions_neg.ix[i] = predictions_neg.ix[i - 1]

        ###(2)gradient boosting method
        iq_gb_model = gradient_boosting(
            X_train.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1),
            X_val.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1))
        predictions_gb = iq_gb_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1)).astype(int)

        ###(3)xgb method
        dtrain = xgb.DMatrix(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        dval = xgb.DMatrix(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))
        iq_xgb_model = xgb.train(dict(xgb_params, silent=0),
                                 dtrain,
                                 num_boost_round=num_boost_rounds)
        predictions_xgb = iq_xgb_model.predict(dval).astype(int)

        ###(4)Adaboost regressor method
        iq_abr_model = ABR(n_estimators=800,
                           learning_rate=0.08,
                           loss='linear',
                           random_state=0)
        iq_abr_model.fit(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        predictions_abr = iq_abr_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))

        ###(5)Extra tree regressor method
        iq_etr_model = ETR(n_estimators=800,
                           max_depth=4,
                           random_state=0,
                           verbose=1)
        iq_etr_model.fit(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        predictions_etr = iq_etr_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))

        ###(6) Bagging Regressor method
        iq_br_model = BR(n_estimators=800,
                         oob_score=False,
                         n_jobs=5,
                         random_state=0,
                         verbose=1)
        iq_br_model.fit(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        predictions_br = iq_br_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))

        ###Store the result in iq_train predictions_neg -> 'negbi', predictions_gb -> 'gb'
        print(
            "Adding the result of the predictions to iq training data({}/{})".
            format(loop, 6))
        for idx, index in enumerate(val_index):
            iq_train['negbi'].ix[index] = predictions_neg.ix[idx]
            iq_train['gb'].ix[index] = predictions_gb[idx]
            iq_train['xgb'].ix[index] = predictions_xgb[idx]
            iq_train['abr'].ix[index] = predictions_abr[idx]
            iq_train['etr'].ix[index] = predictions_etr[idx]
            iq_train['br'].ix[index] = predictions_br[idx]
        loop += 1

    ###Now the training data looks like [features, total_cases, negbi, gb, xgb, abr, etr, br]

    ##Accessing testing data
    test_features_path = os.path.join(data_path, 'dengue_features_test.csv')
    sj_test, iq_test = preprocess_data(test_features_path)
    ##Like training, add the six prediction columns to the testing dataframe
    sj_test = sj_test.assign(negbi=0)
    sj_test = sj_test.assign(gb=0)
    sj_test = sj_test.assign(xgb=0)
    sj_test = sj_test.assign(abr=0)
    sj_test = sj_test.assign(etr=0)
    sj_test = sj_test.assign(br=0)

    ##(1)neg_binomial prediction
    sj_predictions_neg = sj_neg_model.predict(sj_test).astype(int)
    for i in range(sj_predictions_neg.shape[0] - 1, 3, -1):
        sj_predictions_neg.ix[i] = sj_predictions_neg.ix[i - 4]
    ##(2)gradient boosting prediction
    sj_predictions_gb = sj_gb_model.predict(
        sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)
    ##(3)xgb prediction
    dtest = xgb.DMatrix(
        sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1))
    sj_predictions_xgb = sj_xgb_model.predict(dtest).astype(int)
    ###(4)Adaboost regressor method
    sj_predictions_abr = sj_abr_model.predict(
        sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)
    ###(5)extra tree regressor method
    sj_predictions_etr = sj_etr_model.predict(
        sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)
    ###(6)bagging regressor method
    sj_predictions_br = sj_br_model.predict(
        sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)

    print("Adding predictions as features to sj testing data...")
    for i in range(len(sj_test['negbi'])
                   ):  #Add the prediction to the corresponding column
        sj_test['negbi'].ix[i] = sj_predictions_neg.ix[i]
        sj_test['gb'].ix[i] = sj_predictions_gb[i]
        sj_test['xgb'].ix[i] = sj_predictions_xgb[i]
        sj_test['abr'].ix[i] = sj_predictions_abr[i]
        sj_test['etr'].ix[i] = sj_predictions_etr[i]
        sj_test['br'].ix[i] = sj_predictions_br[i]

    ##Same process as for city sj
    iq_test = iq_test.assign(negbi=0)
    iq_test = iq_test.assign(gb=0)
    iq_test = iq_test.assign(xgb=0)
    iq_test = iq_test.assign(abr=0)
    iq_test = iq_test.assign(etr=0)
    iq_test = iq_test.assign(br=0)

    ###(1)neg_binomial prediction
    iq_predictions_neg = iq_neg_model.predict(iq_test).astype(int)
    for i in range(iq_predictions_neg.shape[0] - 1, 0, -1):
        iq_predictions_neg.ix[i] = iq_predictions_neg.ix[i - 1]
    ##(2)gradient boosting prediction
    iq_predictions_gb = iq_gb_model.predict(
        iq_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)
    ##(3)xgb prediction
    dtest = xgb.DMatrix(
        iq_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1))
    iq_predictions_xgb = iq_xgb_model.predict(dtest).astype(int)
    ###(4)Adaboost regressor method
    iq_predictions_abr = iq_abr_model.predict(
        iq_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)
    ###(5)extra tree regressor method
    iq_predictions_etr = iq_etr_model.predict(
        iq_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)
    ###(6)bagging regressor method
    iq_predictions_br = iq_br_model.predict(
        iq_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)

    print("Adding predictions as features to iq testing data...")
    for i in range(len(iq_test['negbi'])):
        iq_test['negbi'].ix[i] = iq_predictions_neg.ix[i]
        iq_test['gb'].ix[i] = iq_predictions_gb[i]
        iq_test['xgb'].ix[i] = iq_predictions_xgb[i]
        iq_test['abr'].ix[i] = iq_predictions_abr[i]
        iq_test['etr'].ix[i] = iq_predictions_etr[i]
        iq_test['br'].ix[i] = iq_predictions_br[i]

    ##use new information to run a linear regression
    print("Building linear regression model...")
    #Now the linear regression model uses (X = [features, negbi, gb, xgb], y = total_cases )to train(fit)
    sj_lr = LR()
    sj_lr.fit(sj_train.drop('total_cases', axis=1), sj_train['total_cases'])
    iq_lr = LR()
    iq_lr.fit(iq_train.drop('total_cases', axis=1), iq_train['total_cases'])

    #Calculate the k-fold validation error
    sj_score = []
    for train_index, val_index in kf.split(sj_train):
        X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index]
        train_predict = np.array(
            sj_lr.predict(X_val.drop('total_cases', axis=1))).astype(int)
        sj_score.append(eval_measures.meanabs(train_predict,
                                              X_val.total_cases))
    print("Mean of {} cross validation of sj_score is {} (+/- {})".format(
        kf.get_n_splits(sj_train), np.mean(sj_score), np.std(sj_score)))

    iq_score = []
    for train_index, val_index in kf.split(iq_train):
        X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index]
        train_predict = np.array(
            iq_lr.predict(X_val.drop('total_cases', axis=1))).astype(int)
        iq_score.append(eval_measures.meanabs(train_predict,
                                              X_val.total_cases))
    print("Mean of {} cross validation of iq_score is {} (+/- {})".format(
        kf.get_n_splits(iq_train), np.mean(iq_score), np.std(iq_score)))

    ##Use the model sj_lr and iq_lr trained before to predict the testing data
    print("Predicting testing data...")
    sj_predictions = sj_lr.predict(sj_test)
    iq_predictions = iq_lr.predict(iq_test)
    sj_predictions = np.array(sj_predictions).astype(int)
    iq_predictions = np.array(iq_predictions).astype(int)

    print("Creating submit file...")
    ##Use submission_format as template to write the answer
    sample_path = os.path.join(data_path, 'submission_format.csv')
    submission = pd.read_csv(sample_path, index_col=[0, 1, 2])
    submission.total_cases = np.concatenate([sj_predictions, iq_predictions])
    submission.to_csv("./data/stacking_6_less_feature.csv")
Example #14
                    for min_samples_split in [2]:
                        for min_samples_leaf in [1]:
                            if algorithm_name == "rf":
                                estimator_withParams = RFR(
                                    n_estimators=n_estimators,
                                    max_features="auto",
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    oob_score=False,
                                    n_jobs=-1,
                                    random_state=2017)
                            if algorithm_name == "etr":
                                estimator_withParams = ETR(
                                    n_estimators=n_estimators,
                                    max_features="auto",
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf,
                                    oob_score=False,
                                    n_jobs=-1,
                                    random_state=2017)

                            Model_for_competition(
                                algorithm_name=algorithm_name,
                                estimator_withParams=estimator_withParams,
                                pickle_model_file_name="1.csv",
                                fgfs=fgfs,
                                ReportFolder=ReportFolder,
                                source=source,
                                predictors_type=predictors_type,
                                predictors=predictors,
                                target_variables=target_variables,
                                estimator_output_length=estimator_output_length,
Example #15
def testPCA(components):

    #pca_trans=PCA(n_components=components,random_state=1)
    pca_trans = tsvd(n_components=components, random_state=7, n_iter=10)

    pca_trans.fit(data)

    data2 = pca_trans.transform(data)

    #MinMax Normalizer
    scaler = MinMaxScaler()
    scaler.fit(data2)
    data2 = scaler.transform(data2)

    y["target"] = np.log1p(y["target"])

    #train test split
    x_train, x_test, y_train, y_test = tts(data2, y["target"], test_size=0.20)

    #######################----------Algos--------------------#######################
    ranfor = RFR(n_estimators=500, verbose=0, n_jobs=-1, random_state=7)
    extratrees = ETR(n_estimators=500, random_state=7)
    bagging = BR(ETR(n_estimators=10, random_state=1),
                 n_estimators=100,
                 random_state=7)
    """---XGBOOST---"""
    xgb_train = xgb.DMatrix(x_train, label=y_train)
    xgb_validate = xgb.DMatrix(x_test, label=y_test)
    xgb_test_pred = xgb.DMatrix(x_test)

    param = {}
    param['objective'] = 'reg:linear'
    param['eta'] = 0.001
    param['max_depth'] = 6
    param['alpha'] = 0.001
    param['colsample_bytree'] = 0.6
    param['subsample'] = 0.6
    param['silent'] = 0
    param['nthread'] = 4
    param['random_state'] = 42
    param['eval_metric'] = 'rmse'

    watchlist = [(xgb_train, 'train'), (xgb_validate, 'validation')]
    """-fit-"""
    ranfor.fit(x_train, y_train)
    extratrees.fit(x_train, y_train)

    bst = xgb.train(param,
                    xgb_train,
                    10000,
                    watchlist,
                    early_stopping_rounds=100,
                    verbose_eval=100,
                    maximize=False)

    y_pred = ranfor.predict(x_test)
    y_pred_ada = extratrees.predict(x_test)
    y_pred_xgb = bst.predict(xgb_test_pred, ntree_limit=bst.best_ntree_limit)

    #blending
    blending_X = pd.DataFrame()
    blending_X['xgb'] = bst.predict(xgb.DMatrix(x_train),
                                    ntree_limit=bst.best_ntree_limit)
    blending_X['ExtraTrees'] = extratrees.predict(x_train)
    blending_X['ranfor'] = ranfor.predict(x_train)

    bagging.fit(blending_X, y_train)

    blending_test = pd.DataFrame()
    blending_test['xgb'] = y_pred_xgb
    blending_test['ExtraTrees'] = y_pred_ada
    blending_test['ranfor'] = y_pred

    y_pred_grad = bagging.predict(blending_test)
    ###############################################

    y_pred_2best = (0.6 * y_pred_ada) + (0.4 * y_pred_xgb)

    print("PCA: %s --- Ranfor RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred))))
    print("PCA: %s --- ExtraTrees RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred_ada))))
    print("PCA: %s --- XGBoost RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred_xgb))))

    print("PCA: %s --- blended bagging RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred_grad))))
    print("PCA: %s --- XGBoost+ExtraTrees RMSE is : %s" %
          (components, np.sqrt(mse(y_test, y_pred_2best))))

    return {
        "pca": pca_trans,
        "scaler": scaler,
        "ranfor": ranfor,
        'extratrees': extratrees,
        'bagging': bagging,
        'xgboost': bst
    }
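
The blending step above fits a bagged ExtraTrees meta-learner on stacked base-model predictions. A standalone sketch of the same pattern on synthetic data (note that, as in the original, the meta-features are in-sample predictions, which risks leakage; out-of-fold predictions are the usual remedy):

import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor as BR, ExtraTreesRegressor as ETR
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(7)
X = rng.rand(200, 5)
y = X @ rng.rand(5)
base_etr = ETR(n_estimators=50, random_state=7).fit(X, y)
base_lin = LinearRegression().fit(X, y)
meta_X = pd.DataFrame({'etr': base_etr.predict(X), 'lin': base_lin.predict(X)})
blender = BR(ETR(n_estimators=10, random_state=1), n_estimators=50, random_state=7)
blender.fit(meta_X, y)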
Example #16
def main():

    ### parsing and Data pre-processing
    # load the provided data
    train_features_path = os.path.join(data_path, 'dengue_features_train.csv')
    train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv')

    ### pre-processing data
    sj_train, iq_train = preprocess_data(train_features_path,
                                         labels_path=train_labels_path)
    #print(sj_train.describe())
    #print(iq_train.describe())

    choose = rand.sample(range(0, sj_train.shape[0] - 1), 800)
    val = [i for i in range(sj_train.shape[0]) if i not in choose]
    sj_train_subtrain = sj_train.ix[choose]
    sj_train_subtest = sj_train.ix[val]
    sj_etr = ETR(n_estimators=2000, max_depth=3, criterion='mae', verbose=1)
    sj_etr.fit(sj_train_subtrain.drop('total_cases', axis=1),
               sj_train_subtrain['total_cases'])
    ##The model generated by neg_binomial with the best alpha on the validation set chosen earlier

    kf = KFold(n_splits=12)

    sj_model_list = []
    sj_err_list = []
    loop = 1
    for train_index, val_index in kf.split(
            sj_train
    ):  #The index will be split into [train_index] and [val_index]
        X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index]
        #sj_etr = ETR(n_estimators = 2000,  max_depth = 3,criterion = 'mae',verbose = 1)
        #sj_etr.fit(X_train.drop(['station_avg_temp_c','total_cases'],axis = 1),X_train['total_cases'])
        predictions = sj_etr.predict(X_val.drop('total_cases', axis=1))
        sj_err_list.append(
            eval_measures.meanabs(predictions, X_val.total_cases))
        #sj_model_list.append(sj_etr)
        loop += 1
    print(sj_err_list)

    #argmax = sorted(range(len(sj_err_list)), key=lambda x: sj_err_list[x])[0]
    #print(argmax)

    #sj_best_model = sj_model_list[argmax]
    sj_best_model = sj_etr
    #print(sj_best_model.feature_importances_)

    choose = rand.sample(range(0, iq_train.shape[0] - 1), 400)
    val = [i for i in range(iq_train.shape[0]) if i not in choose]
    iq_train_subtrain = iq_train.ix[choose]
    iq_train_subtest = iq_train.ix[val]
    iq_etr = ETR(n_estimators=2000, max_depth=3, criterion='mae', verbose=1)
    iq_etr.fit(iq_train_subtrain.drop('total_cases', axis=1),
               iq_train_subtrain['total_cases'])

    iq_model_list = []
    iq_err_list = []
    loop = 1
    for train_index, val_index in kf.split(iq_train):
        X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index]
        #iq_etr = ETR(n_estimators = 2000,  max_depth = 3,criterion = 'mae',verbose = 1)
        #iq_etr.fit(X_train.drop(['station_min_temp_c','total_cases'],axis = 1),X_train['total_cases'])
        predictions = iq_etr.predict(X_val.drop('total_cases', axis=1))
        iq_err_list.append(
            eval_measures.meanabs(predictions, X_val.total_cases))
        #iq_model_list.append(iq_etr)

        loop += 1
    print(iq_err_list)
    #argmax = sorted(range(len(iq_err_list)), key=lambda x: iq_err_list[x])[0]
    #print(argmax)
    #iq_best_model = iq_model_list[argmax]
    iq_best_model = iq_etr
    #print(iq_best_model.feature_importances_)
    ##Accessing testing data
    test_features_path = os.path.join(data_path, 'dengue_features_test.csv')
    sj_test, iq_test = preprocess_data(test_features_path)

    #Calculate the k-fold validation error
    sj_score = []
    for train_index, val_index in kf.split(sj_train):
        X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index]
        train_predict = np.array(
            sj_best_model.predict(X_val.drop('total_cases',
                                             axis=1))).astype(int)
        sj_score.append(eval_measures.meanabs(train_predict,
                                              X_val.total_cases))
    print("Mean of {} cross validation of sj_score is {} (+/- {})".format(
        kf.get_n_splits(sj_train), np.mean(sj_score), np.std(sj_score)))

    iq_score = []
    for train_index, val_index in kf.split(iq_train):
        X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index]
        train_predict = np.array(
            iq_best_model.predict(X_val.drop('total_cases',
                                             axis=1))).astype(int)
        iq_score.append(eval_measures.meanabs(train_predict,
                                              X_val.total_cases))
    print("Mean of {} cross validation of iq_score is {} (+/- {})".format(
        kf.get_n_splits(iq_train), np.mean(iq_score), np.std(iq_score)))

    ##Use the model sj_lr and iq_lr trained before to predict the testing data
    print("Predicting testing data...")
    sj_predictions = sj_best_model.predict(sj_test)
    iq_predictions = iq_best_model.predict(iq_test)
    sj_predictions = np.round(sj_predictions).astype(int)
    iq_predictions = np.round(iq_predictions).astype(int)

    print("Creating submit file...")
    ##Use submission_format as template to write the answer
    sample_path = os.path.join(data_path, 'submission_format.csv')
    submission = pd.read_csv(sample_path, index_col=[0, 1, 2])
    submission.total_cases = np.concatenate([[28], [25], [34], sj_predictions,
                                             [8], [6], [10], iq_predictions])
    submission.to_csv("./data/ext_final_new.csv")
Example #17
def main():

    ### parsing and Data pre-processing
    # load the provided data
    train_features_path = os.path.join(data_path, 'dengue_features_train.csv')
    train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv')

    ### pre-processing data
    sj_train, iq_train = preprocess_data(train_features_path,
                                         labels_path=train_labels_path)
    #print(sj_train.describe())
    #print(iq_train.describe())

    ###Define the xgb parameters
    xgb_params = {
        'eta': 0.05,
        'max_depth': 5,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1
    }
    num_boost_rounds = 1000
    ##Use K-fold to create cross validation data
    kf = KFold(n_splits=6)

    ##Do the stacking by adding three columns 'xgb', 'etr', 'br' which store the training predictions
    sj_train = sj_train.assign(xgb=0)
    sj_train = sj_train.assign(etr=0)
    sj_train = sj_train.assign(br=0)

    loop = 1
    for train_index, val_index in kf.split(
            sj_train
    ):  #The index will be split into [train_index] and [val_index]
        X_train, X_val = sj_train.ix[train_index], sj_train.ix[val_index]

        ###(1)xgboost method
        dtrain = xgb.DMatrix(
            X_train.drop(['total_cases', 'xgb', 'etr', 'br'], axis=1),
            X_train['total_cases'])
        dval = xgb.DMatrix(
            X_val.drop(['total_cases', 'xgb', 'etr', 'br'], axis=1))
        sj_xgb_model = xgb.train(dict(xgb_params, silent=0),
                                 dtrain,
                                 num_boost_round=num_boost_rounds)
        predictions_xgb = sj_xgb_model.predict(dval).astype(int)

        ###(2)Extra tree regressor method
        sj_etr_model = ETR(n_estimators=2000, max_depth=3, criterion='mae')
        sj_etr_model.fit(
            X_train.drop(['total_cases', 'xgb', 'etr', 'br'], axis=1),
            X_train['total_cases'])
        predictions_etr = sj_etr_model.predict(
            X_val.drop(['total_cases', 'xgb', 'etr', 'br'], axis=1))

        ###(3) Bagging Regressor method
        sj_br_model = BR(n_estimators=100, max_features=0.6, max_samples=0.6)
        sj_br_model.fit(
            X_train.drop(['total_cases', 'xgb', 'etr', 'br'], axis=1),
            X_train['total_cases'])
        predictions_br = sj_br_model.predict(
            X_val.drop(['total_cases', 'xgb', 'etr', 'br'], axis=1))

        ###Store the results in sj_train: predictions_xgb -> 'xgb', predictions_etr -> 'etr', predictions_br -> 'br'
        print(
            "Adding the result of the predictions to sj training data({}/{})".
            format(loop, 6))
        for idx, index in enumerate(val_index):
            sj_train['xgb'].ix[index] = predictions_xgb[idx]
            sj_train['etr'].ix[index] = predictions_etr[idx]
            sj_train['br'].ix[index] = predictions_br[idx]
        loop += 1

    iq_train = iq_train.assign(xgb=0)
    iq_train = iq_train.assign(etr=0)
    iq_train = iq_train.assign(br=0)

    loop = 1
    for train_index, val_index in kf.split(iq_train):
        X_train, X_val = iq_train.ix[train_index], iq_train.ix[val_index]
        ###(1)xgb method
        dtrain = xgb.DMatrix(
            X_train.drop(['total_cases', 'xgb', 'etr', 'br'], axis=1),
            X_train['total_cases'])
        dval = xgb.DMatrix(
            X_val.drop(['total_cases', 'xgb', 'etr', 'br'], axis=1))
        iq_xgb_model = xgb.train(dict(xgb_params, silent=0),
                                 dtrain,
                                 num_boost_round=num_boost_rounds)
        predictions_xgb = iq_xgb_model.predict(dval).astype(int)

        ###(2)Extra tree regressor method
        iq_etr_model = ETR(n_estimators=2000, max_depth=3, criterion='mae')
        iq_etr_model.fit(
            X_train.drop(['total_cases', 'xgb', 'etr', 'br'], axis=1),
            X_train['total_cases'])
        predictions_etr = iq_etr_model.predict(
            X_val.drop(['total_cases', 'xgb', 'etr', 'br'], axis=1))

        ###(3) Bagging Regressor method
        iq_br_model = BR(n_estimators=800, max_features=0.6, max_samples=0.6)
        iq_br_model.fit(
            X_train.drop(['total_cases', 'xgb', 'etr', 'br'], axis=1),
            X_train['total_cases'])
        predictions_br = iq_br_model.predict(
            X_val.drop(['total_cases', 'xgb', 'etr', 'br'], axis=1))

        ###Store the results in iq_train: predictions_xgb -> 'xgb', predictions_etr -> 'etr', predictions_br -> 'br'
        print(
            "Adding the result of the predictions to iq training data({}/{})".
            format(loop, 6))
        for idx, index in enumerate(val_index):
            iq_train['xgb'].ix[index] = predictions_xgb[idx]
            iq_train['etr'].ix[index] = predictions_etr[idx]
            iq_train['br'].ix[index] = predictions_br[idx]
        loop += 1

    ###Now the training data looks like [features, total_cases, xgb, etr, br]

    ##Accessing testing data
    test_features_path = os.path.join(data_path, 'dengue_features_test.csv')
    sj_test, iq_test = preprocess_data(test_features_path)
    ##Like training, add 'xgb', 'etr', 'br' to the testing dataframe
    sj_test = sj_test.assign(xgb=0)
    sj_test = sj_test.assign(etr=0)
    sj_test = sj_test.assign(br=0)
    ##(1)xgb prediction
    dtest = xgb.DMatrix(sj_test.drop(['xgb', 'etr', 'br'], axis=1))
    sj_predictions_xgb = sj_xgb_model.predict(dtest).astype(int)
    ###(2)extra tree regressor method
    sj_predictions_etr = sj_etr_model.predict(
        sj_test.drop(['xgb', 'etr', 'br'], axis=1)).astype(int)
    ###(3)bagging regressor method
    sj_predictions_br = sj_br_model.predict(
        sj_test.drop(['xgb', 'etr', 'br'], axis=1)).astype(int)

    print("Adding predictions as features to sj testing data...")
    for i in range(len(
            sj_test['xgb'])):  #Add the prediction to the corresponding column
        sj_test['xgb'].ix[i] = sj_predictions_xgb[i]
        sj_test['etr'].ix[i] = sj_predictions_etr[i]
        sj_test['br'].ix[i] = sj_predictions_br[i]

    ##Same process for city iq
    iq_test = iq_test.assign(xgb=0)
    iq_test = iq_test.assign(etr=0)
    iq_test = iq_test.assign(br=0)

    ###(1)xgb prediction
    dtest = xgb.DMatrix(iq_test.drop(['xgb', 'etr', 'br'], axis=1))
    iq_predictions_xgb = iq_xgb_model.predict(dtest).astype(int)
    ###(2)extra tree regressor method
    iq_predictions_etr = iq_etr_model.predict(
        iq_test.drop(['xgb', 'etr', 'br'], axis=1)).astype(int)
    ###(3)bagging regressor method
    iq_predictions_br = iq_br_model.predict(
        iq_test.drop(['xgb', 'etr', 'br'], axis=1)).astype(int)

    print("Adding predictions as features to iq testing data...")
    for i in range(len(iq_test['xgb'])):
        iq_test['xgb'].ix[i] = iq_predictions_xgb[i]
        iq_test['etr'].ix[i] = iq_predictions_etr[i]
        iq_test['br'].ix[i] = iq_predictions_br[i]

    ##use new information to run a linear regression
    '''
	print("Building linear regression model...")
	#Now the linear regression model uses (X = [features, negbi, gb, xgb, abr, etr, br], y = total_cases )to train(fit)
	sj_lr = LR()
	sj_lr.fit(sj_train.drop('total_cases',axis = 1),sj_train['total_cases'])
	iq_lr = LR()
	iq_lr.fit(iq_train.drop('total_cases',axis = 1),iq_train['total_cases'])
	
	#Calculate the k-fold validation error
	sj_score = []
	for train_index, val_index in kf.split(sj_train):
		X_train,X_val = sj_train.ix[train_index], sj_train.ix[val_index]
		train_predict = np.array(sj_lr.predict(X_val.drop('total_cases',axis = 1))).astype(int)
		sj_score.append(eval_measures.meanabs(train_predict, X_val.total_cases))
	print("Mean of {} cross validation of sj_score is {} (+/- {})".format(kf.get_n_splits(sj_train)
																,np.mean(sj_score),np.std(sj_score)))
	
	iq_score = []
	for train_index, val_index in kf.split(iq_train):
		X_train,X_val = iq_train.ix[train_index], iq_train.ix[val_index]
		train_predict = np.array(iq_lr.predict(X_val.drop('total_cases',axis = 1))).astype(int)
		iq_score.append(eval_measures.meanabs(train_predict, X_val.total_cases))
	print("Mean of {} cross validation of iq_score is {} (+/- {})".format(kf.get_n_splits(iq_train)
																,np.mean(iq_score),np.std(iq_score)))
	
	##Use the model sj_lr and iq_lr trained before to predict the testing data
	print("Predicting testing data...")
	sj_predictions = sj_lr.predict(sj_test)
	iq_predictions = iq_lr.predict(iq_test)
	'''
    sj_predictions = []
    iq_predictions = []
    for i in range(len(sj_test['br'])):
        sj_predictions.append(( #sj_test['negbi'].ix[i]+\
               #sj_test['gb'].ix[i]+\
               #sj_test['abr'].ix[i]+\
               sj_test['xgb'].ix[i]+\
               sj_test['etr'].ix[i]+\
               sj_test['br'].ix[i])/3)

    for i in range(len(iq_test['br'])):
        iq_predictions.append(( #iq_test['negbi'].ix[i]+\
              #iq_test['gb'].ix[i]+\
              #iq_test['abr'].ix[i]+\
              iq_test['xgb'].ix[i]+\
             iq_test['etr'].ix[i]+\
             iq_test['br'].ix[i])/3)

    sj_predictions = np.round(sj_predictions).astype(int)
    iq_predictions = np.round(iq_predictions).astype(int)
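
The per-row loops above just average the three stacked prediction columns; inside this function the same result can be had vectorized:

sj_predictions = np.round(sj_test[['xgb', 'etr', 'br']].mean(axis=1)).astype(int)
iq_predictions = np.round(iq_test[['xgb', 'etr', 'br']].mean(axis=1)).astype(int)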

    print("Creating submit file...")
    ##Use submission_format as template to write the answer
    sample_path = os.path.join(data_path, 'submission_format.csv')
    submission = pd.read_csv(sample_path, index_col=[0, 1, 2])
    submission.total_cases = np.concatenate([[28], [25], [34], sj_predictions,
                                             [7], [6], [8], iq_predictions])
    submission.to_csv("./data/stacking_final_new.csv")