示例#1
0
def do_time_ml(ticker):
    """Evaluate a voting ensemble on ``ticker`` using time-ordered CV folds.

    For every fold produced by ``TimeSeriesSplit`` the ensemble is fitted and
    scored twice: once against the ``'<ticker>_target'`` column of ``y`` and
    once against the ``'<ticker>_10d pred'`` column. The accuracy and the
    distribution of predicted labels are printed for both targets.

    On the final fold the ``'<ticker>_10d pred'`` predictions and the matching
    test features are written to ``p_aes_1020_pred.csv`` and
    ``t_aes_1020_pred.csv`` in the current working directory.

    Args:
        ticker: Symbol passed to ``extract_featuresets`` and used to build the
            two target column names.
    """
    X, y = extract_featuresets(ticker)
    num_splits = 5
    target_col = '{}_target'.format(ticker)
    pred_col = '{}_10d pred'.format(ticker)

    tscv = TimeSeriesSplit(n_splits=num_splits)
    # The ensemble needs estimator *instances*; passing the bare classes fails
    # with "TypeError: get_params() missing 1 required positional argument:
    # 'self'".
    clf = VotingClassifier([('lsvc', svm.LinearSVC()),
                            ('knn', neighbors.KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier()),
                            ('gap', GaussianProcessClassifier()),
                            ('bag', BaggingClassifier()),
                            ('nn', MLPClassifier(max_iter=2000))])

    for fold, (train_index, test_index) in enumerate(tscv.split(X), start=1):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        clf.fit(X_train, y_train[target_col])
        confidence = clf.score(X_test, y_test[target_col])
        predictions = clf.predict(X_test)
        print('Accuracy:', confidence)
        print("Predicted Spread:", Counter(predictions))

        clf.fit(X_train, y_train[pred_col])
        confidence = clf.score(X_test, y_test[pred_col])
        predictions = clf.predict(X_test)
        print('p Accuracy:', confidence)
        print("p Predicted Spread:", Counter(predictions))

        # Persist the last fold only. The previous hand-rolled counter was
        # incremented before the comparison, so it fired one fold early.
        if fold == num_splits:
            np.savetxt('p_aes_1020_pred.csv', predictions, delimiter=',')
            np.savetxt('t_aes_1020_pred.csv', X_test, delimiter=',')
示例#2
0
    def calc_density(self, df):
        """Check whether a feature's per-category target mean drifts over time.

        ``df`` is split into 5 time-ordered folds. For each fold the target
        mean and row count per category are computed on the training window
        and on the validation window (categories seen ``thrs`` times or fewer
        in a window are ignored). Categories present in both windows are
        passed row by row to ``self.t_test``; a p-value below 0.01 or above
        0.99 counts as a mismatch.

        From the third fold onwards the training window is randomly
        sub-sampled (without replacement) to ``len(valid) * (fold + 1) / 2``
        rows, so results are not deterministic unless NumPy's global seed is
        fixed.

        Args:
            df: DataFrame with a ``'target'`` column and (presumably) exactly
                one feature column; if there are several, an arbitrary one is
                picked.

        Returns:
            ``(col, True)`` as soon as the number of mismatching categories
            exceeds 10% of the categories compared so far, otherwise
            ``(col, False)``.
        """
        cnt = 0
        n_splits = 5
        # Minimum number of occurrences (exclusive) for a category to be used.
        thrs = 3
        col = list(set(df.columns) - set(['target']))[0]
        tss = TimeSeriesSplit(n_splits=n_splits)
        total_cnt = 0

        def _category_stats(rows):
            """Return per-category target mean ('avg') and count ('cnt')."""
            window = df.iloc[rows]
            counts = window[col].value_counts()
            frequent = counts.loc[counts > thrs].index
            window = window.loc[window[col].isin(frequent)]
            # A dict passed to SeriesGroupBy.agg is a "nested renamer", which
            # pandas >= 1.0 rejects; aggregate by name and rename instead.
            stats = window.groupby(col)['target'].agg(['mean', 'count'])
            return stats.rename(columns={'mean': 'avg', 'count': 'cnt'})

        for idx, (train_idx, valid_idx) in enumerate(tss.split(df)):
            if idx > 1:
                train_len = int(len(valid_idx)*(idx+1)/2)
                train_idx = pd.Index(np.random.choice(train_idx, train_len, replace=False))
            train_mean = _category_stats(train_idx)
            valid_mean = _category_stats(valid_idx)
            # Keep only categories present in both windows; the merge suffixes
            # the columns as avg_x/cnt_x (train) and avg_y/cnt_y (validation).
            prop = pd.merge(train_mean, valid_mean, how='inner', left_index=True, right_index=True)
            total_cnt += len(prop)
            for _, cat in prop.iterrows():
                p_value = self.t_test(cat)
                if p_value < 0.01 or p_value > 0.99:
                    cnt += 1
                if cnt > total_cnt/10:
                    return (col, True)

        return (col, False)
    def do_cross_val(self, X, Y):
        """Run time-series cross-validation and record per-fold diagnostics.

        For each of ``self.n_splits`` folds ``self.model`` is refitted on the
        training rows, and the following are appended to the instance lists:
        residuals (prediction minus truth), true values, row indices and
        predictions for both the training and the test rows, the test MSE
        (``self.mses``) and the test mean absolute percentage error in percent
        (``self.mapes``). A progress line is printed after each fold.

        No transformation is applied to ``Y`` here; any log transform has to
        be done by the caller.

        Args:
            X: Feature container indexable by an integer index array.
            Y: Target container indexable by an integer index array and by a
                single integer.
        """
        tscv = TimeSeriesSplit(n_splits=self.n_splits)
        for fold, (i_train, i_test) in enumerate(tscv.split(X), start=1):
            self.model.fit(X[i_train], Y[i_train])

            # Predict once per subset instead of re-running the model for
            # every metric.
            train_pred = self.model.predict(X[i_train])
            test_pred = self.model.predict(X[i_test])

            self.train_resid_Y.append(
                [y - y_true for y_true, y in zip(Y[i_train], train_pred)])
            self.train_true_Y.append(Y[i_train])
            self.train_X.append(i_train)
            self.train_pred_Y.append(list(train_pred))

            self.test_resid_Y.append(
                [y - y_true for y, y_true in zip(test_pred, Y[i_test])])
            self.test_true_Y.append(Y[i_test])
            self.test_X.append(i_test)
            self.test_pred_Y.append(list(test_pred))

            self.mses.append(mean_squared_error(Y[i_test], test_pred))

            # MAPE needs the absolute value; averaging signed errors lets
            # over- and under-predictions cancel out.
            self.mapes.append(100 * np.mean(
                [abs((y - Y[row]) / Y[row])
                 for row, y in zip(i_test, test_pred)]))

            print(
                f"Fold {fold} complete with: {len(i_train)} training samples and {len(i_test)} test samples."
            )
    def cv_fit(self, n_splits, X_train, _y_train, regressor, **kwargs):
        """Cross-validate a regressor with a time-series split.

        A single ``regressor(**kwargs)`` instance is refitted on the training
        part of every fold and scored on the held-out part.

        Args:
            n_splits: Number of time-series splits.
            X_train: Features, indexable by an integer index array.
            _y_train: Targets, indexable by an integer index array.
            regressor: Estimator class (called with ``**kwargs``).
            **kwargs: Constructor arguments for ``regressor``.

        Returns:
            None. The results are stored on the instance: ``self.name`` (the
            estimator class name), ``self.score`` (mean fold score rounded to
            3 decimals) and ``self.reg`` (the estimator as fitted on the last
            fold).
        """
        tscv = TimeSeriesSplit(n_splits=n_splits)
        scores = []
        reg = regressor(**kwargs)
        # The body used to read ``y_train``, a name that does not exist in
        # this scope (the parameter is ``_y_train``), raising NameError.
        for train_index, test_index in tscv.split(X_train, _y_train):
            X_val_train, X_val_test = X_train[train_index], X_train[test_index]
            y_val_train, y_val_test = _y_train[train_index], _y_train[test_index]
            reg.fit(X_val_train, y_val_train)
            scores.append(reg.score(X_val_test, y_val_test))
        self.name = reg.__class__.__name__
        self.score = round(np.mean(scores), 3)
        self.reg = reg
    def train_arma(self):
        """Fit an ARMA(18, 0) model on the last time-series fold and plot it.

        ``self.dt`` is split into 10 time-ordered folds; only the train/test
        pair of the final fold is used. The model is fitted on the training
        part, the test horizon is forecast, actual and predicted values are
        plotted, and ``self.report_metrics`` is called with the test values
        and the forecast.
        """
        splitter = TimeSeriesSplit(n_splits=10)

        # Each iteration overwrites the previous pair, so after the loop
        # ``train``/``test`` hold the final (largest) fold.
        for fit_rows, holdout_rows in splitter.split(self.dt):
            train = self.dt[fit_rows]
            test = self.dt[holdout_rows]

        results_AR = ARMA(train, order=(18, 0)).fit(disp=-1)

        plt.figure(figsize=(16, 6))
        plt.title('ARMA Model on Aggregate Data')
        plt.plot(train, label='Training Actual Occupancy Rate')
        plt.xlabel('Date')
        plt.ylabel('Percent Occupied')

        # forecast() returns a tuple; element 0 is the point forecast.
        forecast_values = results_AR.forecast(steps=len(test))[0]
        y_pred_AR = pd.Series(forecast_values, index=test.index)

        plt.plot(test, label='Testing Actual Occupancy Rate')
        plt.plot(y_pred_AR,
                 color='purple',
                 label='ARMA Predicted Occupancy Rate')
        plt.legend()
        plt.show()

        print('ARMA Model Metrics on Test Data')
        self.report_metrics(test.squeeze(), y_pred_AR.squeeze())
示例#6
0
    def feed_consumers(self, data, symbols, unprocessed_symbols):
        """Publish every cross-validation fold of every symbol to the queue.

        Symbols listed in ``unprocessed_symbols`` are skipped. For the others,
        ``data[symbol].data`` is split into 10 time-ordered folds; each fold
        is encoded together with the symbol and its 1-based fold number and
        published to the ``'tpot_data'`` routing key. ``self.to_process`` is
        incremented per published message (under ``self.lock``) and
        ``self.done`` is set once everything has been sent.
        """
        logger.info("Sending ready")
        splitter = TimeSeriesSplit(n_splits=10)

        for symbol in symbols:
            if symbol in unprocessed_symbols:
                continue

            fold_iter = splitter.split(data[symbol].data)
            for fold_no, (train_index, test_index) in enumerate(fold_iter,
                                                                start=1):
                X_train = data[symbol].data.values[train_index]
                X_test = data[symbol].data.values[test_index]
                y_train = data[symbol].target[train_index]
                y_test = data[symbol].target[test_index]

                logger.info("processing %s %d", symbol, fold_no)
                payload = encode_data(
                    (symbol, X_train, X_test, y_train, y_test, fold_no))

                # Publishing and the counter update share one critical section.
                with self.lock:
                    self._data.basic_publish(exchange='',
                                             routing_key='tpot_data',
                                             body=payload)
                    self.to_process += 1

        self.done = True
示例#7
0
    def split(self, df, y=None, groups=None):
        """Yield train/test row positions, splitting each group over time.

        Rows of ``df`` are grouped by ``self.groupby``. Every group gets its
        own ``TimeSeriesSplit``; on each round the next fold of every group
        that still has one is taken and the per-group positions are
        concatenated. Iteration stops when no group has folds left.

        Args:
            df: DataFrame to split (validated with ``self._validate_df``).
            y: Optional targets, indexable by an integer position array.
            groups: Ignored; the grouping always comes from ``self.groupby``.

        Yields:
            ``(train_positions, test_positions)`` as integer arrays of row
            positions in ``df``.
        """
        self._validate_df(df)
        # Maps group key -> integer row positions of that group in ``df``.
        group_positions = df.groupby(self.groupby).indices
        splits = {}
        while True:
            X_idxs, y_idxs = [], []
            for key, sub_idx in group_positions.items():
                if key not in splits:
                    sub_df = df.iloc[sub_idx]
                    sub_y = y[sub_idx] if y is not None else None
                    # max_train_size is keyword-only in current scikit-learn.
                    splitter = TimeSeriesSplit(
                        n_splits=self.n_splits,
                        max_train_size=self.max_train_size)
                    splits[key] = splitter.split(sub_df, sub_y)

                try:
                    X_idx, y_idx = next(splits[key])
                except StopIteration:
                    # This group has no folds left; others may still have.
                    continue

                # Fold indices are positions within the group, so indexing
                # ``sub_idx`` maps them straight back to positions in ``df``.
                # This avoids a per-row ``index.get_loc`` lookup, which is
                # slow and does not return a scalar for duplicated labels.
                X_idxs.append(sub_idx[X_idx])
                y_idxs.append(sub_idx[y_idx])

            if not X_idxs:
                break

            yield np.concatenate(X_idxs), np.concatenate(y_idxs)
示例#8
0
def SVM_TimeSplit(X, y, n_splits, algoritmo, name):
    """Produce out-of-fold predictions with a time-series split.

    ``algoritmo`` is refitted on the training rows of each fold and its
    predictions for the test rows are written into a copy of ``y``. Rows that
    never fall into a test fold keep their original ``y`` value.

    Args:
        X: Feature array, indexable by an integer index array.
        y: Target array supporting ``copy``, ``ravel`` and ``round``.
        n_splits: Number of time-series splits.
        algoritmo: Estimator instance with ``fit`` and ``predict``.
        name: Label printed before the run.

    Returns:
        ``(yhat, clf)`` with the out-of-fold predictions and the estimator as
        fitted on the last fold, or ``(0, clf)`` if fitting fails on any fold.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    print(name)
    print(tscv)
    acc = []
    yhat = y.copy()
    clf = algoritmo
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        try:
            clf.fit(X_train, y_train.ravel())
        except Exception:
            # Best effort: signal the failure to the caller with a 0 result.
            return 0, clf
        yhat[test_index] = clf.predict(X_test)
        # Number of correctly classified test rows in this fold.
        acc.append(
            metrics.accuracy_score(yhat[test_index].round(),
                                   y_test.round(),
                                   normalize=False))
    # Return only after every fold has been processed; returning from inside
    # the loop stopped after the first fold.
    return yhat, clf
def nestedCV(model_params, train_dataset, freq, k=3):
    """Score one SARIMAX configuration with time-series cross-validation.

    The index of ``train_dataset`` is replaced in place by a
    ``DatetimeIndex`` with frequency ``freq``. For each of the ``k`` folds a
    SARIMAX model is fitted on the training part and used to predict the
    validation period; the root of the mean squared error is collected.
    Folds whose predictions contain NaN are skipped.

    Args:
        model_params: Sequence whose item 0 is the ``order`` (p, d, q) and
            item 1 the ``seasonal_order`` passed to SARIMAX.
        train_dataset: pandas object holding the series.
        freq: Frequency for the rebuilt ``DatetimeIndex``.
        k: Number of folds.

    Returns:
        Mean of the per-fold errors.
    """
    train_dataset.index = pd.DatetimeIndex(train_dataset.index.values,
                                           freq=freq)
    splitter = TimeSeriesSplit(n_splits=k)
    fold_errors = []

    for fit_rows, holdout_rows in splitter.split(train_dataset):
        cv_train = train_dataset.iloc[fit_rows]
        cv_val = train_dataset.iloc[holdout_rows]

        fitted = sm.tsa.SARIMAX(
            endog=cv_train,
            order=model_params[0],
            seasonal_order=model_params[1],
            enforce_stationarity=False,
            enforce_invertibility=False,
        ).fit()

        # Predict from the first to the last validation timestamp.
        first_stamp = cv_val.index.values[0]
        last_stamp = cv_val.index.values[-1]
        predictions = fitted.predict(first_stamp, last_stamp)

        # The model occasionally yields NaN predictions; skip those folds.
        if np.isnan(predictions.values).any():
            continue

        fold_errors.append(
            math.sqrt(mean_squared_error(cv_val.values, predictions.values)))

    return np.mean(fold_errors)
示例#10
0
def run_test(data):
    """Walk forward over ``data`` predicting one step at a time.

    The module-level ``model`` is refitted on every expanding window (one
    fold per observation after the first) and predicts the next point. The
    predictions are scored with ``eval_metrics`` and the metrics are printed.

    Args:
        data: One-dimensional sequence of observations.

    Returns:
        ``(rmse, mae, r2, profit, accuracy, parameters)`` where
        ``parameters`` is ``model.get_params()``.
    """
    y = np.array(data)[:, np.newaxis]
    N = len(y)
    # Evenly spaced pseudo-feature standing in for time.
    X = np.linspace(-2, 8, N)[:, np.newaxis]

    splitter = TimeSeriesSplit(n_splits=N - 1)
    print("Starting model %s" % str(model))

    predictions, predictors = [], []
    for fit_rows, next_row in splitter.split(y):
        X_next = X[next_row]
        fitted = model.fit(X[fit_rows], y[fit_rows])
        forecast = fitted.predict(X_next)
        predictions.append(forecast[0][0])
        predictors.append(X_next[0][0])

    # NOTE(review): ``predictions`` has one entry per fold (N - 1) while
    # ``y.flatten()`` has N entries - confirm eval_metrics aligns them.
    scores = eval_metrics(y.flatten(), np.array(predictions))
    rmse, mae, r2, profit, accuracy = scores

    print("  Profit: %s" % profit)
    print("  Accuracy: %s" % accuracy)
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    return rmse, mae, r2, profit, accuracy, model.get_params()
示例#11
0
def timeseriesCVscore(params, series, loss_function=mean_squared_error, slen=12):
    """Return the mean cross-validated error of a Holt-Winters model.

    Args:
        params: Three smoothing parameters, unpacked as (alpha, beta, gamma).
        series: pandas object holding the time series.
        loss_function: Callable invoked as ``loss_function(predictions,
            actual)``.
        slen: Season length for the Holt-Winters model.

    Returns:
        Mean of the per-fold losses over 3 time-series folds.
    """
    values = series.values
    alpha, beta, gamma = params

    splitter = TimeSeriesSplit(n_splits=3)
    fold_losses = []

    for fit_rows, holdout_rows in splitter.split(values):
        horizon = len(holdout_rows)
        hw = HoltWinters(series=values[fit_rows], slen=slen,
                         alpha=alpha, beta=beta, gamma=gamma,
                         n_preds=horizon)
        hw.triple_exponential_smoothing()

        # The trailing ``horizon`` entries of the result are the forecast.
        forecast = hw.result[-horizon:]
        fold_losses.append(loss_function(forecast, values[holdout_rows]))

    return np.mean(np.array(fold_losses))
示例#12
0
 def reg2(T):
     """Compute rolling 50-row OLS residuals of ``bench`` on ``T``.

     Prints and increments the module-level counter ``i``. For every fold of
     a ``TimeSeriesSplit`` the last ``window`` training rows of ``T`` (plus a
     constant) are used to regress the matching rows of the outer ``bench``;
     the last residual of that fit is stored at the position of the last
     training row. A final fit over the last ``window`` rows fills the last
     position.

     Presumably ``T`` is a pandas Series (the null count is compared with a
     scalar) - TODO confirm against the caller.

     Returns the residual Series when the final window is not entirely NaN;
     otherwise (or when ``T`` itself is entirely NaN) returns ``T`` unchanged.
     """
     global i
     print(i)
     i += 1
     # Guard against an input that is entirely NaN
     if T.isnull().sum() != T.shape[0]:
         window = 50
         tscv = TimeSeriesSplit(n_splits=T.shape[0] - window + 1)
         new_dd = pd.Series(np.NAN, index=T.index)
         for train_index, test_index in tscv.split(T):
             #print("TRAIN:", train_index[-window:], "TEST:", test_index)
             X, Y = T.iloc[train_index[-window:]], bench.iloc[
                 train_index[-window:]]
             # Guard against a window that is entirely NaN
             if X.isnull().sum() != X.shape[0]:
                 X = sm.add_constant(X)
                 model = OLS(Y, X, missing='drop')
                 results = model.fit()
                 res = results.resid.iloc[-1]
                 new_dd.iloc[train_index[-1]] = res
         # Compute the value for the final row
         X, Y = T.iloc[-window:], bench.iloc[-window:]
         # Guard against a window that is entirely NaN
         if X.isnull().sum() != X.shape[0]:
             X = sm.add_constant(X)
             model = OLS(Y, X, missing='drop')
             results = model.fit()
             res = results.resid.iloc[-1]
             new_dd.iloc[-1] = res
             return new_dd
         else:
             return T
     else:
         return T
示例#13
0
def train_models(x, y, params, folds=5):
    """Train one LightGBM regressor per time-series fold.

    Each model is fitted on the training rows of its fold; the RMSLE on the
    held-out rows is printed. Per-fold data is released and the garbage
    collector is run (its return value is printed) after every fold.

    Args:
        x: Features, indexed with ``.iloc``.
        y: Targets, indexed with ``.iloc``.
        params: Mapping that must provide every hyper-parameter listed in
            ``param_names`` below.
        folds: Number of time-series splits.

    Returns:
        List of fitted models, one per fold.
    """
    # Hyper-parameters forwarded to LGBMRegressor, in this order.
    param_names = (
        "colsample_bytree",
        "learning_rate",
        "max_depth",
        "min_child_samples",
        "min_sum_hessian_in_leaf",
        "n_estimators",
        "num_leaves",
        "reg_alpha",
        "reg_lambda",
        "subsample",
        "tree_learner",
        "boosting_type",
        "objective",
    )

    splitter = TimeSeriesSplit(n_splits=folds, max_train_size=None)
    models = []

    for train_idx, test_idx in splitter.split(x):
        x_tr, x_te = x.iloc[train_idx], x.iloc[test_idx]
        y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]

        model = LGBMRegressor(**{name: params[name] for name in param_names})
        model.fit(x_tr, y_tr)

        preds = model.predict(x_te)
        print(rmsle(y_true=y_te, y_pred=preds))

        models.append(model)
        # Drop the fold's references before collecting to free memory early.
        del x_tr, x_te, y_tr, y_te, model
        print(gc.collect())

    return models
示例#14
0
def fnGridSearchModel(trainX, trainY, nn_model, nb_epoch, batch_size,
                      param_dict):
    """Grid-search network size and dropout with time-series folds.

    A ``KerasRegressor`` wrapping ``nn_model`` is tuned over the number of
    layers, the neurons per layer (1x-3x the module-level ``numFeatures``)
    and the dropout rate. The two time-series folds are passed through
    ``fnSliceOffDataPerBatchSize`` before use.

    Args:
        trainX: Training features.
        trainY: Training targets.
        nn_model: Model-building function handed to ``KerasRegressor``.
        nb_epoch: Forwarded to ``KerasRegressor``.
        batch_size: Batch size, also used when trimming the fold indices.
        param_dict: Not used by this function.

    Returns:
        ``(best_score, best_params)`` from the grid search.
    """
    model = KerasRegressor(build_fn=nn_model,
                           nb_epoch=nb_epoch,
                           batch_size=batch_size,
                           verbose=2)

    param_grid = {
        'nLayers': [3, 4, 5, 6],
        'numNeurons': [factor * numFeatures for factor in range(1, 4, 1)],
        'nDropout': [.2],
    }

    def _trim(rows):
        # Element 0 of the helper's result is the index set used for CV.
        return fnSliceOffDataPerBatchSize(pFeatures=rows,
                                          pBatch_Size=batch_size)[0]

    splitter = TimeSeriesSplit(n_splits=2)
    cv_pairs = [(_trim(train), _trim(test))
                for train, test in splitter.split(trainX)]

    grid = GridSearchCV(estimator=model,
                        param_grid=param_grid,
                        n_jobs=1,
                        cv=cv_pairs)
    grid_result = grid.fit(trainX, trainY)

    print("Best: %f using %s" %
          (grid_result.best_score_, grid_result.best_params_))

    return grid_result.best_score_, grid_result.best_params_
示例#15
0
def timeseriesCVscore(series, model, loss_function, n_splits=3):
    """Return the mean cross-validated forecast error of ``model``.

    :param series: pandas object holding the time series
    :param model: object with ``fit(values)`` and ``predict(steps=...)``
    :param loss_function: callable invoked as ``loss_function(actual,
        predictions)``
    :param n_splits: number of time-series folds
    :return: mean of the per-fold losses
    """
    values = series.values
    splitter = TimeSeriesSplit(n_splits=n_splits)
    fold_losses = []

    # TODO add interface for learning
    for fit_rows, holdout_rows in splitter.split(values):
        # Refit on the history, then forecast exactly the held-out horizon.
        model.fit(values[fit_rows])
        forecast = model.predict(steps=len(holdout_rows))
        fold_losses.append(loss_function(values[holdout_rows], forecast))

    return np.mean(np.array(fold_losses))
def panel_split(n_folds, groups, grouping_var='date_of_transaction'):
    """Generate time-series splits of a panel, split on distinct dates.

    Every distinct value of ``grouping_var`` gets a chronological rank
    (``tsidx``). The ranks are split with ``TimeSeriesSplit`` and each fold is
    expanded back to all panel rows that carry one of its dates, so all rows
    of one date always land on the same side of a split.

    Args:
        n_folds: Number of time-series folds.
        groups: DataFrame containing ``grouping_var``. Its index must be
            unnamed so that ``reset_index`` produces an ``'index'`` column.
        grouping_var: Column holding the date of each row.

    Yields:
        ``(train_indices, test_indices)`` as lists of index labels of
        ``groups`` (equal to row positions for a default RangeIndex).
    """
    # One row per distinct date in chronological order. ``tsidx`` must be the
    # date's rank: the old code kept the original row label of the date's
    # first occurrence, which does not line up with the positional indices
    # that TimeSeriesSplit returns.
    date_idx = (groups[[grouping_var]]
                .drop_duplicates()
                .sort_values(grouping_var)
                .reset_index(drop=True))
    date_idx['tsidx'] = range(len(date_idx))

    by_ticker_index = groups.reset_index().rename({'index': 'panel_index'},
                                                  axis=1)
    by_ticker_index = (pd.merge(
        by_ticker_index, date_idx,
        on=grouping_var).sort_values('panel_index').set_index('panel_index'))

    ticker_range = sorted(by_ticker_index['tsidx'].unique().tolist())

    splits = TimeSeriesSplit(n_splits=n_folds)

    for train_indices, test_indices in splits.split(ticker_range):
        # Translate positions within ``ticker_range`` into tsidx values.
        train_ts = [ticker_range[pos] for pos in train_indices]
        test_ts = [ticker_range[pos] for pos in test_indices]
        row_ts = by_ticker_index['tsidx']
        panel_train_indices = by_ticker_index[row_ts.isin(
            train_ts)].index.tolist()
        panel_test_indices = by_ticker_index[row_ts.isin(
            test_ts)].index.tolist()
        yield panel_train_indices, panel_test_indices
示例#17
0
    def LGB_bayesian(self, learning_rate,
                     num_leaves,
                     bagging_fraction,
                     feature_fraction,
                     min_child_weight,
                     min_data_in_leaf,
                     max_depth,
                     reg_alpha,
                     reg_lambda):
        """Objective for Bayesian optimisation: mean validation AUC.

        A LightGBM binary classifier is trained with the given
        hyper-parameters on each of 5 time-series folds of ``self.train`` /
        ``self.targetCol``, with early stopping on the validation part. The
        ROC AUC of each fold's validation predictions is computed and the
        mean over all folds is returned.

        ``num_leaves``, ``min_data_in_leaf`` and ``max_depth`` are truncated
        to ``int`` because the optimiser proposes floats while LightGBM
        requires integers for them.

        NOTE(review): training data comes from ``self.train`` /
        ``self.targetCol`` while scoring reads ``self.train_df`` /
        ``self.target`` - confirm both pairs refer to the same rows.

        Returns:
            Mean ROC AUC over the folds (higher is better).
        """
        # LightGBM expects the next three parameters to be integers.
        num_leaves = int(num_leaves)
        min_data_in_leaf = int(min_data_in_leaf)
        max_depth = int(max_depth)

        BayesianParams = {
            'num_leaves': num_leaves,
            'min_data_in_leaf': min_data_in_leaf,
            'min_child_weight': min_child_weight,
            'bagging_fraction': bagging_fraction,
            'feature_fraction': feature_fraction,
            'learning_rate' : learning_rate,
            'max_depth': max_depth,
            'reg_alpha': reg_alpha,
            'reg_lambda': reg_lambda,
            'objective': 'binary',
            'save_binary': True,
            'seed': 1337,
            'feature_fraction_seed': 1337,
            'bagging_seed': 1337,
            'drop_seed': 1337,
            'data_random_seed': 1337,
            'boosting_type': 'gbdt',
            'verbose': 1,
            'is_unbalance': False,
            'boost_from_average': True,
            'metric': 'auc'}

        folds = TimeSeriesSplit(n_splits=5)
        features = list(self.train)
        # Out-of-fold predictions, filled in fold by fold.
        oof = np.zeros(len(self.train))
        scores = []

        for fold, (bayesian_tr_idx, bayesian_val_idx) in enumerate(folds.split(self.train, self.targetCol)):
            print('Training on fold {}'.format(fold + 1))

            trn_data = lgb.Dataset(self.train.iloc[bayesian_tr_idx], label=self.targetCol.iloc[bayesian_tr_idx])
            val_data = lgb.Dataset(self.train.iloc[bayesian_val_idx], label=self.targetCol.iloc[bayesian_val_idx])
            clf = lgb.train(BayesianParams, trn_data, 10000, valid_sets=[trn_data, val_data], verbose_eval=1000,
                            early_stopping_rounds=10, categorical_feature=['TransactionHour'])
            oof[bayesian_val_idx] = clf.predict(self.train_df.iloc[bayesian_val_idx][features].values,
                                                num_iteration=clf.best_iteration)
            scores.append(
                roc_auc_score(self.train_df.iloc[bayesian_val_idx][self.target].values, oof[bayesian_val_idx]))

        # Returning from inside the loop evaluated the first fold only.
        return float(np.mean(scores))
示例#18
0
def execute():
    """Grid-search the estimator for every lag/country pair and save results.

    For each combination of ``LAGS`` and ``COUNTRIES`` the data is loaded
    with ``get_data`` and ``CLF`` is tuned over ``parameters`` using 10
    time-series folds and negative mean absolute error as the score. The best
    value of every searched parameter is appended to the module-level
    ``result`` list, which is finally written as JSON to the path stored in
    ``config['gridsearch-parameters']``.
    """
    for lg in LAGS:
        for c in COUNTRIES:

            # get the data
            X, y = get_data(c, lg)

            # define the splits
            tscv = TimeSeriesSplit(n_splits=10)

            # do the grid search. Scoring follows the convention that
            # higher values are better (hence the `neg')
            grid_search = GridSearchCV(CLF, parameters, scoring='neg_mean_absolute_error',
                                       cv=tscv.split(X), n_jobs=-1, verbose=1)

            print("Performing grid search...")
            t0 = time()
            grid_search.fit(X, y)
            print("done in %0.3fs" % (time() - t0))

            # record best result
            par_res = grid_search.best_estimator_.get_params()

            m = {'params': {pn: par_res[pn] for pn in parameters.keys()},
                 'country': c, 'lag': lg}

            result.append(m)

    # Use a context manager so the file is flushed and closed deterministically.
    with open(config['gridsearch-parameters'], 'w') as out_file:
        json.dump(result, out_file)
    print("Done")
示例#19
0
def get_cv_split(y, method, **init_params):
    """Build the list of train/test index pairs for cross-validation.

    Args:
        y: Data to split. For ``method="storms"`` it must expose a ``storms``
            accessor whose ``index`` is used as the group labels.
        method: ``"timeseries"`` for ``TimeSeriesSplit`` or ``"storms"`` for
            ``GroupKFold`` grouped by storm.
        **init_params: Keyword arguments for the splitter's constructor.

    Returns:
        List of ``(train_indices, test_indices)`` tuples.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method == "timeseries":
        splitter = TimeSeriesSplit(**init_params)
        # Time-series splitting does not use group labels.
        groups = None
    elif method == "storms":
        splitter = GroupKFold(**init_params)
        # NOTE: y should be a pandas object with storm index so the groups
        # are set to the storm index.
        groups = y.storms.index
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unknown cv method {method!r}; expected 'timeseries' or 'storms'"
        )

    return list(splitter.split(y, groups=groups))
示例#20
0
文件: explore.py 项目: madvid/ModeTiS
def wrapper_fit_pred_val(serie, order, s_order, **kwargs):
    """Fit, evaluate and validate a SARIMA model over time-series folds.

    ``serie`` is split into 3 folds (10 validation points each, separated
    from the training part by a gap of 10). For every fold the helper
    functions ``model_fit``, ``model_eval``, ``model_pred``, ``model_fcst``
    and ``model_val`` are run in that order on a shared result dictionary.

    Args:
        serie: pandas Series to model.
        order: Non-seasonal order, forwarded to ``model_fit``.
        s_order: Seasonal order, forwarded to ``model_fit``.
        **kwargs: Forwarded to every helper.

    Returns:
        The result dictionary, or ``None`` if any step raised; in that case a
        message naming the model and the last reported stage is printed.
    """
    dct_model = {}
    stop_at = 'fit'
    tscv = TimeSeriesSplit(n_splits=3, test_size=10, gap=10)
    try:
        for train_idx, val_idx in tscv.split(serie):
            print(
                "ajustement:",
                f"[{serie.index[train_idx[0]]} --> {serie.index[train_idx[-1]]}]",
                "-- validation:",
                f"[{serie.index[val_idx[0]]} --> {serie.index[val_idx[-1]]}]")
            s_train = serie.iloc[train_idx]
            s_valid = serie.iloc[val_idx]
            model_fit(dct_model, s_train, order, s_order, **kwargs)
            stop_at = model_eval(dct_model, **kwargs)
            stop_at = model_pred(dct_model, s_train, **kwargs)
            stop_at = model_fcst(dct_model, s_valid, **kwargs)
            stop_at = model_val(dct_model, s_valid, **kwargs)
            # NOTE(review): the division runs once per fold, so earlier
            # contributions are divided repeatedly if model_val accumulates
            # the scores - confirm against model_val.
            for key in dct_model["validation_score"].keys():
                dct_model['validation_score'][key] /= tscv.n_splits
    except Exception:
        # Catch Exception rather than everything so Ctrl-C and SystemExit
        # still stop the program.
        print(
            f"Une exception est survenue pour: SARIMA {order}{s_order} -- [{stop_at}]"
        )
        dct_model = None
    return dct_model
示例#21
0
def testSplit(df):
    '''
    Check that the train/test split is made on dates rather than row counts.

    With the hard-coded ``backtest`` flag set, the unique dates are cut once
    at ``cutoff_date``; otherwise they are split with ``TimeSeriesSplit``.
    The date range of each resulting train and test set is printed.
    '''
    split = 2
    splits = TimeSeriesSplit(max_train_size=4025, n_splits=split)
    dates = np.unique(df.date)
    backtest = 1
    # cutoff_date = '2018-03-23T00:00:00.000000000'
    cutoff_date = np.datetime64('2016-01-04')

    if backtest == 1:
        # Single split: everything before the cutoff trains, the rest tests.
        before = np.where(dates < cutoff_date)[0]
        from_cutoff = np.where(dates >= cutoff_date)[0]
        date_folds = [(before, from_cutoff)]
    else:
        date_folds = splits.split(dates)

    for train_date_index, test_date_index in date_folds:
        train_mask = df.date.isin(dates[train_date_index])
        test_mask = df.date.isin(dates[test_date_index])
        train, test = df[train_mask], df[test_mask]
        print("\ntrain", min(train.date), max(train.date))
        print("test ", min(test.date), max(test.date))
示例#22
0
def grid_search(models, params, X_train, y_train, n_splits=3, n_iter=5):
    """
    Run a randomized search with time-series CV to get the best estimators.

    Models whose search raises are logged and left out of the result.

    :param models: dictionary of models to be used
    :param params: dictionary of models' parameters to be searched, keyed
        like ``models``
    :param X_train: training explanatory features
    :param y_train: training target feature (reshaped to one dimension)
    :param n_splits: number of time-series cross-validation folds
    :param n_iter: number of parameter combinations to sample
    :return: dictionary of best estimators for the models that succeeded
    """
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # no. of crossvalidation
    tscv = TimeSeriesSplit(n_splits=n_splits)

    best_estimators = {}
    for key, model in models.items():
        print(key, ' search')
        try:
            # Materialise the folds so every candidate sees the same splits.
            model_param_search = RandomizedSearchCV(estimator=model, param_distributions=params[key],
                                                    scoring='neg_mean_squared_error',
                                                    n_jobs=4, verbose=0, random_state=42, n_iter=n_iter,
                                                    cv=list(tscv.split(X_train)))

            model_param_search.fit(X_train, y_train.reshape(len(y_train), ))
            best_estimators[key] = model_param_search.best_estimator_

        except Exception as e:
            # Pass values as %-style arguments: an extra positional argument
            # without a placeholder makes the logging module fail to format
            # the record and the message is lost.
            logger.error('Training for %s failed: %s', key, e)

    return best_estimators
def split(
    X: np.ndarray,
    horizon: int = 12,
    min_train_size: int = None,
    max_train_size: int = None,
) -> List[Tuple[List[int]]]:
    """ build train/test index pairs for time series cross-fold validation

    The number of folds is ``len(X) // horizon - 1``. Folds whose training
    set is shorter than the minimum size are dropped, and every test set is
    cut to at most ``horizon`` entries.

    Arguments:
        X {np.ndarray} -- values to split by index

    Keyword Arguments:
        horizon {int} -- length of prediction horizon (default: {12})
        min_train_size {int} -- minimum size for a single training set. If
            None, the prediction horizon is used (default: {None})
        max_train_size {int} -- maximum size for a single training set
            (default: {None})

    Returns:
        List[Tuple[List[int]]] -- list of train/test index tuples
    """
    fold_count = X.shape[0] // horizon - 1
    splitter = TimeSeriesSplit(n_splits=fold_count,
                               max_train_size=max_train_size)
    shortest_allowed = horizon if min_train_size is None else min_train_size

    filtered_splits = [
        (train_index, test_index[:horizon])
        for train_index, test_index in splitter.split(X)
        if len(train_index) >= shortest_allowed
    ]

    print(
        f'The dataset has been split into {len(filtered_splits)} folds for CV')
    return filtered_splits
示例#24
0
def train_model(train_files, labels, model):
    """Fit ``model`` over 8 time-series folds and report its errors.

    For every fold the model is refitted on the training subset; the error
    returned by ``rmse`` is printed for the training and the validation
    subset. Afterwards the mean validation error and that mean divided by the
    standard deviation of all validation predictions are printed.

    Returns:
        The model as fitted on the last fold.
    """
    splitter = TimeSeriesSplit(n_splits=8)
    scores, preds = [], []

    fold_iter = enumerate(splitter.split(train_files, labels), 1)
    for i, (train_index, val_index) in fold_iter:
        # Training and validation subsets of this fold.
        X_train = train_files[train_index]
        y_train = np.take(labels, train_index, axis=0)
        X_val = train_files[val_index]
        y_val = np.take(labels, val_index, axis=0)

        model.fit(X_train, y_train)

        # Error on the data the model has seen vs. the held-out data.
        t_score = rmse(y_train, model.predict(X_train))
        val_preds = model.predict(X_val)
        score = rmse(y_val, val_preds)

        scores.append(score)
        preds.append(val_preds)
        print(f'Fold-{i}: Train_RMSLE: {t_score}, Validation_RMSLE: {score}')

    print(f'Mean_RMSLE of validation set: {np.round(np.mean(scores), 4)}')
    print(
        f'Normalized_RMSLE using Standard Deviation:{np.mean(scores)/np.std(preds)}'
    )
    print('\n===============Finished Training====================\n')

    return model
def cross_valid(model_dict_cv, df_train, target_col, feature_col):
    """Pick the best parameter set per model via time-series CV.

    For every model every parameter combination produced by ``dict_product``
    is scored on 3 time-series folds with the ``'ACC'`` entry of
    ``error_metrics``; the combination with the highest mean score wins.

    Args:
        model_dict_cv: Maps a model tuple (item 1 is the estimator class) to
            its parameter grid.
        df_train: Training frame containing ``'Date'``, the features and the
            target.
        target_col: Name of the target column.
        feature_col: Feature column selector.

    Returns:
        Dict mapping each model tuple to its selected parameters.
    """
    scoring = 'ACC'
    model_dict = {}  # selected parameters per model

    def _fold_score(model, train_index, test_index):
        # Fit on the training rows and score the predictions on the test rows.
        X_train = df_train[feature_col].iloc[train_index]
        X_test = df_train[feature_col].iloc[test_index]
        y_train = df_train[target_col].iloc[train_index]
        y_test = df_train[target_col].iloc[test_index]
        date_test = df_train['Date'].iloc[test_index]

        model.fit(X_train, y_train)
        y_binary = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

        res_df = pd.DataFrame.from_dict({
            'Date': date_test,
            'Regime': y_test,
            'crash_prob': y_prob,
            'crash_binary': y_binary
        })
        return error_metrics(res_df)[scoring]

    for model_tuple, param_grid in model_dict_cv.items():
        # every combination of the parameters for this model
        all_grid = list(dict_product(param_grid))
        cv_res = []
        for param in all_grid:
            # a fresh splitter and a fresh model per parameter combination
            tscv = TimeSeriesSplit(n_splits=3)
            model = model_tuple[1](**param)
            score = [
                _fold_score(model, train_index, test_index)
                for train_index, test_index in tscv.split(df_train)
            ]
            cv_res.append(np.mean(score))
        model_dict[model_tuple] = all_grid[np.argmax(cv_res)]
    return model_dict
示例#26
0
class Predictor(LoggerMixin, object):
    """Wrap a model and evaluate it on ``TimeSeriesSplit`` folds.

    ``predict`` and ``score`` both refit the wrapped model on every fold's
    training part; ``fit`` itself does nothing.
    """

    def __init__(self, model, fold=5, file='temp.log', loglevel=logging.INFO):
        super().__init__(file, loglevel)
        self.logger.debug('Created an instance of %s', self.__class__.__name__)
        self.model = model
        self.splitor = TimeSeriesSplit(n_splits=fold)

    @staticmethod
    def reg_error(y_predicted, y_test):
        """Return the square root of ``mean_squared_log_error`` of the inputs."""
        msle = mean_squared_log_error(y_predicted, y_test)
        return sqrt(msle)

    def fit(self, X, y=None):
        """No-op; training happens per fold in ``predict`` and ``score``."""
        return self

    def predict(self, X, y=None):
        """Refit on each fold and collect the test predictions.

        Returns a DataFrame with one column per fold (0, 1, ...). ``y`` is
        indexed on every fold, so it must actually be supplied.
        """
        predictions = pd.DataFrame([])

        for fold_no, (train_idx, test_idx) in enumerate(self.splitor.split(X)):
            X_train = X[train_idx]
            X_test = X[test_idx]
            y_train = y[train_idx]
            y_test = y[test_idx]
            self.model.fit(X_train, y_train)
            predictions[fold_no] = self.model.predict(X_test)

        return predictions

    def score(self, X, y=None):
        """Return the mean ``reg_error`` over all folds.

        The per-fold error is logged at debug level and printed.
        """
        self.logger.debug('--- score starts.')
        errors = []

        for train_idx, test_idx in self.splitor.split(X):
            X_train = X[train_idx]
            X_test = X[test_idx]
            y_train = y[train_idx]
            y_test = y[test_idx]
            self.model.fit(X_train, y_train)
            error = self.reg_error(self.model.predict(X_test), y_test)
            errors.append(error)

            # the first test index identifies the fold in the log
            self.logger.debug(
                'test idx {}, error: {:.6f}'.format(test_idx[0], error))
            print("---- split: {}".format(error))

        avg_error = mean(errors)
        self.logger.debug('avg error: {:.4f}'.format(avg_error))

        return avg_error
示例#27
0
def tuning_hyper_parameters():
    """Grid-search ``alpha`` for an SGD logistic regression and save the best model.

    Loads the training set from ``path_modeling_dataset``, runs a
    ``GridSearchCV`` over four ``alpha`` values with a 5-fold
    ``TimeSeriesSplit`` and a negated log-loss scorer, prints the CV table
    and the log-loss on the test set, then stores the best estimator as
    ``sgd_lr.pkl`` under ``path_model``.

    Returns
    -------
    None
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.metrics import make_scorer, log_loss
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import TimeSeriesSplit

    # Start the timer and announce the run.
    start = time()
    print('\nStart tuning hyper parameters')

    # Load the training set.
    X_train = load_npz(path_modeling_dataset + npz_X_train)
    y_train = np.load(path_modeling_dataset + npy_y_train)

    # Scorer returns the negated log-loss so that "greater is better" holds.
    # NOTE(review): recent scikit-learn releases rename `needs_proba` and the
    # SGD `loss='log'` option; kept as-is to match the version this file
    # targets -- confirm before upgrading.
    loss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

    tscv = TimeSeriesSplit(n_splits=5)

    # Grid search over alpha = 1e-4 ... 1e-1.
    alphas = np.logspace(-4, -1, 4)
    param_grid = {'alpha': alphas}
    # Pass the splitter itself instead of a one-shot generator, and request
    # train scores explicitly: 'mean_train_score' is read from cv_results_
    # below and is only present when return_train_score=True.
    clf = GridSearchCV(SGDClassifier(loss='log', n_jobs=-1), param_grid, cv=tscv, scoring=loss, n_jobs=-1,
                       return_train_score=True)

    # Train the model.
    clf.fit(X_train, y_train)

    # Print cv_results, with scores flipped back to positive losses.
    cv_results_df = \
        DataFrame(clf.cv_results_)[['rank_test_score', 'param_alpha', 'mean_train_score', 'mean_test_score']]
    cv_results_df.rename(
        columns={'mean_train_score': 'mean_train_loss',
                 'mean_test_score': 'mean_val_loss',
                 'rank_test_score': 'rank_val_loss'},
        inplace=True)
    cv_results_df[['mean_val_loss', 'mean_train_loss']] = -cv_results_df[['mean_val_loss', 'mean_train_loss']]
    print('cv results: ')
    print(cv_results_df)

    # Release the training data before loading the test set.
    del X_train
    del y_train
    gc.collect()

    # Load the test set.
    X_test = load_npz(path_modeling_dataset + npz_X_test)
    y_test = np.load(path_modeling_dataset + npy_y_test)
    # Print the log-loss on the test set (the scorer is negated, so undo it).
    print('logloss in testset: ', -clf.score(X=X_test, y=y_test))

    # Release the test data.
    del X_test
    del y_test
    gc.collect()

    # Persist the best model.
    util.safe_save(path_model, 'sgd_lr.pkl', clf.best_estimator_)

    # Stop the timer and report.
    util.print_stop(start)
示例#28
0
    def fit(self, folds=3, thetas=(-2, -1, 0, 0.25, 0.5, 0.75, 1.25, 1.5, 1.75, 2)):
        """Select and fit the best theta model for every series in ``self.data``.

        Based on Kevin Sheppard's theta-model code. For each column the
        training part (rows up to ``self.train_ix[series] - 1``) is split with
        ``TimeSeriesSplit``; every candidate theta is scored by ``mse`` between
        the held-out values and the forecast, and the theta with the lowest
        mean score is refitted on the whole training part.

        Parameters
        ----------
        folds : int
            Number of ``TimeSeriesSplit`` splits.
        thetas : tuple of float
            Theta values to evaluate.

        Returns
        ----------
        None
            Results are stored in ``self.best_theta`` and ``self.fitted``;
            ``self.fit_success`` is set to True after each series.
        """

        # Time-ordered splitter: every fold trains on an initial segment
        kf = TimeSeriesSplit(n_splits=folds)
        # Optimiser settings for the smoothing fit, identical for every fold
        fit_args = {"disp": False, "iprint": -1, "low_memory": True}

        for series in self.data.columns:
            x = self.data.loc[:self.train_ix[series] - 1, series]

            mspes = {t: np.empty((folds, 1)) for t in thetas}
            # One independent parameter table per fold (previously all folds
            # aliased a single DataFrame object)
            params = {
                fold: pd.DataFrame(None, index=["a0", "b0"], dtype=np.double)
                for fold in range(folds)
            }

            for fold_ix, (tr_ix, te_ix) in enumerate(kf.split(x)):
                # Set up data
                x_tr, x_te = x.iloc[tr_ix], x.iloc[te_ix]

                t = x_tr.shape[0]
                k = x_te.shape[0]

                # The exponential-smoothing part does not depend on theta,
                # so fit and forecast it once per fold
                ses = ExponentialSmoothing(x_tr).fit(**fit_args)
                alpha = ses.params.smoothing_level
                ses_forecast = ses.forecast(k)
                # Trend shape shared by all thetas; scaled by 0.5 * b0 below
                base_trend = np.arange(k) + 1 / alpha - ((1 - alpha) ** t) / alpha

                for theta in thetas:
                    # Estimate the theta model and read back its slope
                    params[fold_ix][theta] = self.estimate(x_tr, theta)
                    b0 = params[fold_ix][theta]["b0"]
                    # Actual forecasting
                    forecast = np.array(ses_forecast + (0.5 * b0) * base_trend)
                    mspes[theta][fold_ix] = mse(x_te, forecast)

            # Average the per-fold errors and keep the best theta
            mean_mspes = {theta: np.mean(errs) for theta, errs in mspes.items()}

            self.best_theta[series] = min(mean_mspes, key=mean_mspes.get)
            self.fitted[series] = self.estimate(x, self.best_theta[series])
            self.fit_success = True
示例#29
0
def q02_data_splitter(path='data/elecdemand.csv'):
    """Return the ``TimeSeriesSplit`` folds for the data set at ``path``.

    The argument used to be overwritten by a hard-coded file name; that file
    is now only the default, so an explicit ``path`` is honoured.

    Parameters
    ----------
    path : str
        Location passed to ``q01_load_data``.

    Returns
    -------
    list of tuple
        ``(train_index, valid_index)`` pairs, one per split (2 splits).
    """
    _, df = q01_load_data(path)
    tscv = TimeSeriesSplit(n_splits=2)
    return list(tscv.split(df))
示例#30
0
 def do_stuff(X, y):
     """Return the mean SMAPE of ``estimator`` over 3 time-series folds.

     ``estimator`` is not defined in this function; it is taken from the
     enclosing scope. It is refitted on each fold's training rows, and each
     fold's score is printed as it is computed.
     """
     tscv = TimeSeriesSplit(n_splits=3)
     score = []
     for train_index, test_index in tscv.split(X):
         estimator.fit(X[train_index], y[train_index])
         score.append(estimator.SMAPE(X[test_index], y[test_index]))
         # function-call form works on both Python 2 and Python 3
         print(score[-1])
     return np.mean(score)
示例#31
0
def test_cv():
    """Check that date-based time-series folds cover the AAPL frame.

    Splits the unique dates with ``TimeSeriesSplit`` and verifies that, for
    the final fold, the rows selected by the train and test dates add up to
    the whole frame.
    """
    pickle_path = os.path.join(root, '..', 'data', 'ta', 'base1', 'AAPL.pkl')
    df = pd.read_pickle(pickle_path)
    assert isinstance(df, pd.DataFrame)

    unique_dates = df["date"].unique()
    df.set_index(["date"], drop=True, inplace=True)
    # selecting by every unique date must return the full frame
    assert df.shape == df.loc[unique_dates.tolist()].shape

    splitter = TimeSeriesSplit(n_splits=5)
    for train_pos, test_pos in splitter.split(unique_dates):
        train_size = len(df.loc[unique_dates[train_pos]])
        test_size = len(df.loc[unique_dates[test_pos]])
    # only the sizes of the last fold are checked
    assert len(df) == train_size + test_size
示例#32
0
 def test_cv(self):
     """Smoke-test ``lgb.cv`` with several fold and metric configurations."""
     features, target = load_boston(True)
     X_train, _, y_train, _ = train_test_split(
         features, target, test_size=0.1, random_state=42)
     lgb_train = lgb.Dataset(X_train, y_train)
     params = {'verbose': -1}
     params_with_metric = {'metric': 'l2', 'verbose': -1}

     # shuffle = False, override metric in params
     lgb.cv(params_with_metric, lgb_train,
            num_boost_round=10, nfold=3, stratified=False, shuffle=False,
            metrics='l1', verbose_eval=False)

     # shuffle = True, callbacks
     decay_lr = lgb.reset_parameter(learning_rate=lambda i: 0.1 - 0.001 * i)
     lgb.cv(params, lgb_train,
            num_boost_round=10, nfold=3, stratified=False, shuffle=True,
            metrics='l1', verbose_eval=False,
            callbacks=[decay_lr])

     # self defined folds
     folds = TimeSeriesSplit(3).split(X_train)
     lgb.cv(params_with_metric, lgb_train,
            num_boost_round=10, folds=folds, stratified=False,
            verbose_eval=False)

     # lambdarank
     here = os.path.dirname(os.path.realpath(__file__))
     X_train, y_train = load_svmlight_file(
         os.path.join(here, '../../examples/lambdarank/rank.train'))
     q_train = np.loadtxt(
         os.path.join(here, '../../examples/lambdarank/rank.train.query'))
     params_lambdarank = {'objective': 'lambdarank', 'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train, group=q_train)
     lgb.cv(params_lambdarank, lgb_train,
            num_boost_round=10, nfold=3, stratified=False,
            metrics='l2', verbose_eval=False)
示例#33
0
def test_time_series_cv():
    """Verify the folds ``TimeSeriesSplit`` produces on a 7-row toy data set."""

    def check_next(split_iter, expected_train, expected_test):
        # pull exactly one fold and compare both index arrays
        train, test = next(split_iter)
        assert_array_equal(train, expected_train)
        assert_array_equal(test, expected_test)

    X = [[first, first + 1] for first in range(1, 14, 2)]

    # Should fail if there are more folds than samples
    too_many_folds = TimeSeriesSplit(n_splits=7).split(X)
    assert_raises_regexp(ValueError, "Cannot have number of folds.*greater",
                         next, too_many_folds)

    tscv = TimeSeriesSplit(2)

    # Manually check that Time Series CV preserves the data
    # ordering on toy datasets: first with six samples ...
    six_sample_folds = tscv.split(X[:-1])
    check_next(six_sample_folds, [0, 1], [2, 3])
    check_next(six_sample_folds, [0, 1, 2, 3], [4, 5])

    # ... then with all seven
    seven_sample_folds = TimeSeriesSplit(2).split(X)
    check_next(seven_sample_folds, [0, 1, 2], [3, 4])
    check_next(seven_sample_folds, [0, 1, 2, 3, 4], [5, 6])

    # Check get_n_splits returns the correct number of splits
    n_splits_actual = len(list(TimeSeriesSplit(2).split(X)))
    assert_equal(n_splits_actual, tscv.get_n_splits())
    assert_equal(n_splits_actual, 2)
def arima_gridsearch_cv(series, cv_splits=2,verbose=True,show_plots=True):
    """Run ``auto_arima`` on every ``TimeSeriesSplit`` training fold.

    Parameters
    ----------
    series : pandas.Series
        Series to model. The error plot adds 1 to the smallest index value,
        so a numeric index is presumably expected -- confirm with callers.
    cv_splits : int
        Number of time-series splits.
    verbose : bool
        Print the fold indices and the best model's summary.
    show_plots : bool
        Show residual, KDE, forecast and error plots for every fold.

    Returns
    -------
    dict
        ``'cv_split_index'`` (list of train/test index dicts),
        ``'all_models'`` and ``'best_models'`` (one entry per fold).
    """
    # prepare train-test split object
    tscv = TimeSeriesSplit(n_splits=cv_splits)

    # initialize variables
    splits = []
    best_models = []
    all_models = []

    # loop through each CV split
    for i, (train_index, test_index) in enumerate(tscv.split(series), start=1):
        print("*"*20)
        print("Iteration {} of {}".format(i,cv_splits))

        # print train and test indices
        if verbose:
            print("TRAIN:", train_index, "TEST:", test_index)
        splits.append({'train':train_index,'test':test_index})

        # split train and test sets; the split yields positions, so select
        # with .iloc (the old .ix indexer no longer exists in pandas)
        train_series = series.iloc[train_index]
        test_series = series.iloc[test_index]

        print("Train shape:{}, Test shape:{}".format(train_series.shape,
              test_series.shape))

        # perform auto arima
        _best_model, _all_models = auto_arima(series=train_series)
        best_models.append(_best_model)
        all_models.append(_all_models)

        # display summary for best fitting model
        if verbose:
            print(_best_model['model_obj'].summary())
        results = _best_model['model_obj']

        if show_plots:
            # show residual plots
            residuals = pd.DataFrame(results.resid)
            residuals.plot()
            plt.title('Residual Plot')
            plt.show()
            residuals.plot(kind='kde')
            plt.title('KDE Plot')
            plt.show()
            print(residuals.describe())

            # show forecast plot
            fig, ax = plt.subplots(figsize=(18, 4))
            fig.autofmt_xdate()
            ax = train_series.plot(ax=ax)
            test_series.plot(ax=ax)
            fig = results.plot_predict(test_series.index.min(),
                                       test_series.index.max(),
                                       dynamic=True,ax=ax,
                                       plot_insample=False)
            plt.title('Forecast Plot ')
            plt.legend()
            plt.show()

            # show error plot: the in-sample fit starts one step after the
            # first observation, so drop the first training value to align
            insample_fit = list(results.predict(train_series.index.min()+1,
                                                train_series.index.max(),
                                                typ='levels'))
            plt.plot((np.exp(train_series.iloc[1:].tolist())-\
                             np.exp(insample_fit)))
            plt.title('Error Plot')
            plt.show()
    return {'cv_split_index':splits,
            'all_models':all_models,
            'best_models':best_models}
示例#35
0
def timeSeriesSplit(cso = False):
    """Train one network per state and ``TimeSeriesSplit`` fold, saving the metrics.

    For each of the five states the monthly training CSVs for 2015-2017 are
    concatenated, scaled by their norm, cut into rows of 84 values
    (36 inputs followed by 48 targets) and shuffled. Every fold of a 5-split
    ``TimeSeriesSplit`` then trains a fresh ``nt.Network`` and writes the
    monitored costs and errors as JSON to ``results_<state>_TS_<fold>CSO``
    or ``...GD``.

    Parameters
    ----------
    cso : bool
        When True, initialise the weights from ``kernelBiasTimeSeries<state>.npy``,
        creating that file with ``net.cso`` first if it does not exist.

    Returns
    -------
    None
    """
    states = ('NSW', 'QLD', 'SA', 'TAS', 'VIC')
    years = ('2015', '2016', '2017')

    # the length of the sequence for predicting the future value
    sequence_length = 84
    x_size = 36
    hidden = 10
    y_size = 48

    # Training frames: columns 1 and 2 of every monthly file, indexed by
    # SETTLEMENTDATE. pd.concat replaces the removed DataFrame.append.
    df = {}
    for st in states:
        monthly = []
        for ye in years:
            for mn in range(1, 13):
                dataset = pd.read_csv('./datasets/train/' + st + '/PRICE_AND_DEMAND_' + ye
                                      + '{:02d}'.format(mn) + '_' + st + '1.csv')
                monthly.append(dataset.iloc[:, 1:3])
        df[st] = pd.concat(monthly).set_index('SETTLEMENTDATE')

    # Test frames are still loaded (a missing file fails early), although the
    # cross-validation below does not use them.
    df_test = {}
    for st in states:
        dataset = pd.read_csv('./datasets/test/' + st + '/PRICE_AND_DEMAND_201801_' + st + '1.csv')
        df_test[st] = dataset.iloc[:, 1:3].set_index('SETTLEMENTDATE')

    X = {}
    y = {}
    for st in states:
        # numpy array, normalized by its norm
        load = np.array(df[st])
        load = load / np.linalg.norm(load)

        # Drop the tail that does not fill a whole sequence. Guard the
        # zero-remainder case: load[:-0] would discard everything.
        remainder = len(load) % sequence_length
        if remainder:
            load = load[:-remainder]
        load = load.reshape(-1, sequence_length)

        # shuffle the training rows
        # NOTE(review): shuffling before TimeSeriesSplit means the folds are
        # no longer in chronological order -- confirm this is intended.
        np.random.shuffle(load)

        # the first x_size columns are the inputs, the remaining y_size
        # columns are the values to predict
        X[st] = load[:, :x_size]
        y[st] = load[:, x_size:]

    tscv = TimeSeriesSplit(n_splits=5)

    for st in states:
        print("State: ", st)
        for i, (train_index, test_index) in enumerate(tscv.split(X[st]), start=1):
            X_train, X_test = X[st][train_index], X[st][test_index]
            y_train, y_test = y[st][train_index], y[st][test_index]

            print("Train and validation from state ", st, " split ", i)
            net = nt.Network([x_size, hidden, y_size], nt.Activation.tanh, nt.QuadraticCost)
            if cso:
                weights_fname = "kernelBiasTimeSeries" + st + ".npy"
                if not path.exists(weights_fname):
                    print("Weights and biases initialization for state ",st, " in progress...")
                    # initialise from a single randomly chosen training row
                    randInt = np.random.randint(X_train.shape[0])
                    net.cso(100,X_train[randInt].reshape(x_size,1),y_train[randInt].reshape(y_size,1),
                                net.multiObjectiveFunction,-0.6,0.6,net.dim ,100)
                    net.set_weight_bias(np.array(net.get_Gbest()))
                    np.save(weights_fname, np.array(net.get_Gbest()))
                net.set_weight_bias(np.load(weights_fname))

            if cso:
                results_fname = "results_" + st + "_TS_" + str(i) + "CSO"
            else:
                results_fname = "results_" + st + "_TS_" + str(i) + "GD"
            num_epochs = 1500
            lmbda = 2

            # 10 and 0.01 are presumably the mini-batch size and learning
            # rate -- confirm against nt.Network.SGD
            evaluation_cost, eval_mape, eval_rmse, eval_mae, training_cost, training_mape, training_rmse, training_mae = net.SGD(
                    X_train.transpose(),y_train.transpose(), num_epochs,
                    10, 0.01,
                    X_test.transpose(), y_test.transpose(),
                    lmbda, monitor_evaluation_cost = True,
                    monitor_evaluation_accuracy = True,
                    monitor_training_cost = True,
                    monitor_training_accuracy = True,
                    output2D = True)

            # the context manager closes the file even if json.dump raises
            with open(results_fname, "w") as f:
                json.dump([evaluation_cost, eval_mape, eval_rmse, eval_mae, training_cost, training_mape, training_rmse, training_mae], f)
示例#36
0
# Build the feature matrices and integer targets for training and testing.
train_val_sample.reset_index(drop=True,inplace=True)
testing_sample.dropna(inplace=True)
# axis is passed by keyword: newer pandas rejects a positional axis in drop()
X_train=train_val_sample.drop(['y','date','device']+removal_list,axis=1)
y_train=train_val_sample['y'].astype(int)
X_test=testing_sample.drop(['y','date','device']+removal_list,axis=1)
y_test=testing_sample['y'].astype(int)

# Class balance of the target; the result is only displayed in an
# interactive session.
y_train.value_counts()

# Create 3 training samples and 3 validation samples.


tscv=TimeSeriesSplit(n_splits=3)
print(tscv)

for train,test in tscv.split(X_train):
    print('%s %s' %(train,test))
    
################################################
#fit the model

#Cross validation and hyper-parameter search


print('running cross validation')

########################################
#XGBoost

clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic')
param_dist_xgb = {'n_estimators': stats.randint(150, 500),