Example #1
def train_and_test_sensor(idx_sensor, id_sensor, n_sensors, use_lat=False):
    X_tr1, y_tr1, X_te1, y_te1 = to_array(X_tr_ord,
                                          y_tr_ord,
                                          X_te_ord,
                                          y_te_ord,
                                          id_sensor=id_sensor)

    if use_lat:
        X_tr2, y_tr2, X_te2, y_te2 = to_array(X_tr_lat,
                                              y_tr_lat,
                                              X_te_lat,
                                              y_te_lat,
                                              id_sensor=id_sensor)

    # Validation using a time-series split (just to obtain several MAE estimates; no hyperparameter optimization for now)
    cv_loss = []
    for tr_idx, va_idx in TimeSeriesSplit(n_splits=5).split(X_tr1):

        if not use_lat:
            train_data = np.atleast_3d(X_tr1[tr_idx])
            validation_data = np.atleast_3d(X_tr1[va_idx])
            model = conv1D_lon(idx_sensor, n_sensors=n_sensors)

        else:
            train_data = [
                np.atleast_3d(X_tr1[tr_idx]),
                np.atleast_3d(X_tr2[tr_idx])
            ]
            validation_data = [
                np.atleast_3d(X_tr1[va_idx]),
                np.atleast_3d(X_tr2[va_idx])
            ]
            model = conv1D_lon_lat(idx_sensor, n_sensors=n_sensors)

        model.compile(opt, loss='mean_absolute_error')
        model.fit(train_data,
                  y_tr1[tr_idx],
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(validation_data, y_tr1[va_idx]),
                  callbacks=[c2, c3],
                  verbose=0)

        cv_loss.append(c3.history['val_loss'][-1])

    # Testing
    if not use_lat:
        train_data = np.atleast_3d(X_tr1)
        validation_data = np.atleast_3d(X_te1)
        model = conv1D_lon(idx_sensor, n_sensors=n_sensors)

    else:
        train_data = [np.atleast_3d(X_tr1), np.atleast_3d(X_tr2)]
        validation_data = [np.atleast_3d(X_te1), np.atleast_3d(X_te2)]
        model = conv1D_lon_lat(idx_sensor, n_sensors=n_sensors)

    model.compile(opt, loss='mean_absolute_error')
    model.fit(train_data,
              y_tr1,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(validation_data, y_te1),
              callbacks=[c2, c3],
              verbose=0)

    test_loss = c3.history['val_loss'][-1]

    #model.save('../models/conv1D_{}_{:1d}.h5'.format(id_sensor, use_lat))

    print('MAE_val ', cv_loss)
    print('MAE_test ', test_loss)

    return test_loss, cv_loss
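
# The builders conv1D_lon / conv1D_lon_lat used above are not shown here. A
# rough, hypothetical sketch of what such a single-input builder could look
# like, assuming a Keras backend (not the original implementation):
def conv1D_lon_sketch(idx_sensor, n_sensors):
    from tensorflow.keras import layers, models
    inp = layers.Input(shape=(None, 1))  # (timesteps, channels), as produced by np.atleast_3d
    x = layers.Conv1D(32, kernel_size=3, activation='relu')(inp)
    x = layers.GlobalAveragePooling1D()(x)
    out = layers.Dense(1)(x)  # single regression output, trained with MAE above
    return models.Model(inp, out)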
Example #2
def modelsearch():
    # get the data
    _, train_df_field2, _, _, _, humidity_field2, _, _ = getdata()
    humidity_field2 = humidity_field2.values.reshape(-1)
    utils.logger.info(train_df_field2.shape)
    utils.logger.info(humidity_field2.shape)

    #rmtree(cachedir)
    cachedir = mkdtemp()  #creates a temporary directory

    pipe = createpipeline(cachedir)

    utils.logger.info(pipe)

    # Evaluate different algorithms using cross-validation(cv)

    methods = []
    #methods.append(('LR', LinearRegression())) #no-good
    #methods.append(('RIDGE', Ridge(random_state=42))) #no-good
    #methods.append(('LASSO', Lasso(random_state=42))) #no-good
    #methods.append(('SGR', SGDRegressor(random_state=42))) #no-good
    methods.append(('SVR', SVR(gamma='auto')))
    methods.append(('KNN', KNeighborsRegressor()))
    methods.append(('MLP',
                    MLPRegressor(random_state=42,
                                 max_iter=2000,
                                 activation="tanh",
                                 shuffle=False)))
    methods.append(('GBR', GradientBoostingRegressor(random_state=42)))
    #methods.append(('CART', DecisionTreeRegressor(random_state=42)))
    #methods.append(('RFR', RandomForestRegressor(random_state=42, n_estimators=200)))
    #methods.append(('ETR', ExtraTreesRegressor(n_estimators=200, random_state=42)))
    #methods.append(('ABR', AdaBoostRegressor(n_estimators=200, random_state=42, base_estimator=RandomForestRegressor(random_state=42, max_depth=3))))
    #methods.append(('ABR.', AdaBoostRegressor(n_estimators=50, random_state=42, base_estimator=LinearRegression())))
    #methods.append(('ABR_', AdaBoostRegressor(n_estimators=50, random_state=42, base_estimator=DecisionTreeRegressor(random_state=42, max_depth=1))))
    #methods.append(('ABR__', AdaBoostRegressor(n_estimators=50, random_state=42, base_estimator=ExtraTreesClassifier(n_estimators=7,max_depth=2, random_state=42))))
    #methods.append(('BR', BaggingRegressor(n_estimators=200, random_state=42, base_estimator=RandomForestRegressor(random_state=42, max_depth=3))))
    #methods.append(('BR.', BaggingRegressor(n_estimators=50, random_state=42, base_estimator=LinearRegression())))
    #methods.append(('BR_', BaggingRegressor(n_estimators=50, random_state=42, base_estimator=DecisionTreeRegressor(random_state=42, max_depth=1))))
    #methods.append(('BR__', BaggingRegressor(n_estimators=50, random_state=42, base_estimator=ExtraTreesClassifier(n_estimators=7,max_depth=2, random_state=42))))
    #base_estimator=LogisticRegression(solver='lbfgs',random_state=42,class_weight=class_weights)
    #base_estimator=DecisionTreeClassifier(random_state=42, max_depth=5, class_weight=class_weights)
    #base_estimator=ExtraTreesClassifier(n_estimators=200,max_depth=5, random_state=42, class_weight=class_weights)

    results = []
    names = []

    for name, method in methods:
        #sKfold = model_selection.StratifiedKFold(n_splits = 2, random_state=42)	# cross-validation
        ts_cv = TimeSeriesSplit(5)  # 5-fold forward chaining
        cv_results = cross_val_score(method,
                                     pipe.fit_transform(train_df_field2),
                                     humidity_field2,
                                     cv=ts_cv,
                                     scoring='neg_mean_squared_error',
                                     verbose=1)
        results.append(cv_results)
        names.append(name)
        utils.logger.info("%s : %s" % (name, cv_results))
    for name, result in zip(names, results):
        msg = "%s: %f mean (+/- %f) std" % (name, result.mean(), result.std())
        utils.logger.info(msg)  # performance of each method
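
# createpipeline() is defined elsewhere in the project. A minimal sketch of a
# plausible version that caches fitted transformers in the temporary
# directory (an assumption, not the original pipeline):
def createpipeline_sketch(cachedir):
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    return Pipeline([('scaler', StandardScaler())], memory=cachedir)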
Example #3
def main():
    r = Reader()
    seasons = [10, 11, 12, 13, 14, 15, 16, 17]
    x_train = {}
    y_train = {}
    x_test = {}
    y_test = {}
    vec = DictVectorizer(sparse=False)
    for season in seasons[:-1]:
        x, y = r.read("data/"+str(season)+".csv")
        x_train[season], y_train[season] = x, y
    x, y = r.read("data/"+str(seasons[-1])+".csv")
    x_test[seasons[-1]], y_test[seasons[-1]] = x, y
    
    #  read pred_data
    x, y = r.read("data/18.csv", interactive=True)
    x_pred = {}
    y_pred = {}
    x_pred[18], y_pred[18] = x, y

   
    #x_test.update(x_train)
    #y_test.update(y_train)
    #x_train, y_train = transform_to_lstm(x_train, y_train)
    #x_test, y_test = transform_to_lstm(x_test, y_test)
    x_train, y_train = dict_list_transform(x_train, y_train)
    x_test, y_test = dict_list_transform(x_test, y_test)
    pred_data, y_pred = dict_list_transform(x_pred, y_pred)

    #print(len(x_test[0]))
    #print(x_data['Marco Reus'], y_data['Marco Reus'])
    x_all = pandas.DataFrame(x_test+x_train+pred_data, columns=['name', 'position', 'age', 'club'])
    x_train = pandas.DataFrame(x_train, columns=['name', 'position', 'age', 'club'])
    x_test = pandas.DataFrame(x_test, columns=['name', 'position', 'age', 'club'])
    pred_data = pandas.DataFrame(pred_data, columns=['name', 'position', 'age', 'club'])
    vec.fit(x_all.to_dict('records'))
    #print(x_test.to_dict('records'))
    #train = pandas.DataFrame(x_train.assign(pts=y_train), columns=['name', 'position', 'age', 'club', 'pts'])
    X_train, X_test = vec.transform(x_train.to_dict('records')), vec.transform(x_test.to_dict('records'))
    pred_data = vec.transform(pred_data.to_dict('records'))
    #DEEP NETWORK
    #print(X_train, y_train)
    #none = vec.vocabulary_['club=None']
    #for p in X_train:
    #    if p[none] == 1:
    #        p = np.full(p.shape, 2)
    X_train = pandas.DataFrame(X_train).values
    X_test = pandas.DataFrame(X_test).values
    y_train = pandas.DataFrame(y_train).values
    y_test = pandas.DataFrame(y_test).values
    
    # lstm reshape
    X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
    X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
    pred_data = pred_data.reshape((pred_data.shape[0], 1, pred_data.shape[1]))
    # init other
    kf = KFold(shuffle=True)
    tscv = TimeSeriesSplit(n_splits=3)
    scaler = StandardScaler()
    
    #init model
    #dnn = KerasRegressor(build_fn=baseline_model, nb_epoch=100, batch_size=5, verbose=0)
    #lstm = KerasRegressor(build_fn=lstm_model, epochs=100, batch_size=24, verbose=1)
    lstm = KerasRegressor(build_fn=lstm_model, epochs=200, batch_size=1, verbose=0)

    svr = SVR()
    lgbm = lgb.LGBMRegressor(boosting_type='dart', num_leaves=40, learning_rate=0.1)

    #get interactive data
    p = Parser()
    #p_int = p.parse_interactive()
    
    run_model("LSTM", lstm, [], X_train, X_test, y_train, y_test, tscv, vec, cv=False, out=False, pred_data=None, price_data=None, hyper=True)
Example #4
def RunRF(Abs_train, Abs_test, X_train, Y_train, X_test, Y_test, name):
    model = RandomForestClassifier(n_estimators=301,
                                   criterion='gini',
                                   max_depth=40,
                                   min_samples_split=2,
                                   min_samples_leaf=10,
                                   min_weight_fraction_leaf=0.0,
                                   max_features='auto',
                                   max_leaf_nodes=None,
                                   min_impurity_decrease=0.00,
                                   min_impurity_split=None,
                                   bootstrap=True,
                                   oob_score=False,
                                   n_jobs=1,
                                   random_state=None,
                                   verbose=0,
                                   warm_start=False,
                                   class_weight={
                                       1: 1,
                                       -1: 1.15
                                   })
    relevant_features = FeatureSelection(X_train, Y_train, model, N_FEATURES)
    cv = TimeSeriesSplit(n_splits=10)
    #	cv = 3
    min_samples_leaf = [50 * i for i in range(1, 7)]
    min_samples_split = [i * 2 for i in range(1, 10)]
    param_grid = [{
        'min_samples_leaf': min_samples_leaf,
        'min_samples_split': min_samples_split
    }]
    clf = model_selection.GridSearchCV(model,
                                       param_grid,
                                       scoring=None,
                                       fit_params=None,
                                       n_jobs=4,
                                       iid=True,
                                       refit='best_score_',
                                       cv=cv,
                                       verbose=0,
                                       pre_dispatch='2*n_jobs',
                                       error_score='raise',
                                       return_train_score='warn')
    clf.fit(X_train, Y_train)
    x = (clf.best_params_)
    print(x)
    #	exit()
    model.set_params(**x)
    model.fit(X_train, Y_train)
    #	model 				= RandomForestClassifier(n_estimators=300, criterion='entropy',random_state = 0)
    X_train_ = X_train[:, relevant_features]
    X_test_ = X_test[:, relevant_features]
    model.fit(X_train_, Y_train)
    pred_test = model.predict(X_test_)
    pred_train = model.predict(X_train_)
    cnf_mat_test = GenerateCnfMatrix(pred_test, Y_test)
    cnf_mat_train = GenerateCnfMatrix(pred_train, Y_train)
    actual_dist = ComputeDistribution(Y_train, Y_test)
    accuracy = ComputeAccuracy(cnf_mat_test, cnf_mat_train, name, actual_dist)
    #	print np.mean(cross_val_score(model, X_train, Y_train, cv=100))
    if CALCULATE_RETURNS == 'y':
        returns = ComputeReturns(Abs_test, Abs_train, pred_test, pred_train,
                                 Y_test, Y_train, name)
    print('------------------------------------------')
    return accuracy[2][0], pred_test
Example #5
    generation_df = get_data(weeks)

    # prepare data
    pre_pipeline = Pipeline([
        ('date_worker', mytransformers.DateTransformer()),
        ('shifter', mytransformers.Shifter())
    ])

    processed_data = pre_pipeline.fit_transform(generation_df, shifter__hours = hours)
    features = processed_data[0]
    labels = processed_data[1]

    # start mlflow run
    with mlflow.start_run():
        # cross validation
        tscv = TimeSeriesSplit(5)
        for train_index, test_index in tscv.split(labels):
            X_train, X_test = features.iloc[train_index], features.iloc[test_index]
            y_train, y_test = labels[train_index], labels[test_index]
            model = Lasso(alpha).fit(X_train, y_train)
            preds = model.predict(X_test)

            rmse, mae, r2 = eval_metrics(y_test, preds)

            mlflow.log_param("alpha", alpha)
            mlflow.log_param("weeks", weeks)
            mlflow.log_param("hours", hours)

            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("r2", r2)
            mlflow.log_metric("mae", mae)
Example #6
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve, auc, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.utils.class_weight import compute_class_weight

# X, y
dfDate = dfClean['Date']
dfY = dfClean['Target']
dfX = dfClean.copy().drop(columns=['Target', 'Date'])

# TS Split
nSplit = 3
cvSplit = TimeSeriesSplit(n_splits=nSplit)

# Class Weights
classWeights = compute_class_weight('balanced', classes=np.unique(dfY), y=dfY.values)
print('The weights are: %s respectively for the classes: %s' % (classWeights, np.unique(dfY)))

# Define Models
nMinSample = 1250
modelRF = RandomForestClassifier(n_estimators=200,
                                 class_weight='balanced_subsample')
modelXGB = xgb.XGBClassifier(learning_rate=0.1,
                             nthread=-2)
# NOTE: sample_weight is a fit() parameter, not a constructor argument;
# map classWeights to per-sample weights and pass them via
# modelXGB.fit(X, y, sample_weight=...).
#modelLogit_L1 = LogisticRegression(penalty='l1', class_weight='balanced', solver='liblinear')
modelLogit_L2 = LogisticRegression(penalty='l2',
Example #7
     'Alcohol_full_bar', 'Alcohol_none', 'Caters_True', 'WiFi_free', 'WiFi_no', 'WiFi_paid', \
     'BikeParking_True', 'NoiseLevel_average', 'NoiseLevel_loud', 'NoiseLevel_quiet', \
     'NoiseLevel_very_loud', 'HasTV_True', 'OutdoorSeating_True', 'RestaurantsTakeOut_True', \
     'RestaurantsReservations_True', 'GoodForKids_True', 'RestaurantsPriceRange2_1', \
     'RestaurantsPriceRange2_2', 'RestaurantsPriceRange2_3', 'RestaurantsPriceRange2_4', \
     'RestaurantsGoodForGroups_True', 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', \
     'Tuesday', 'Wednesday']
train_data.dropna(inplace=True)
labels = ['running_average']

train_data_X = train_data[features]
train_data_y = train_data[labels]

alphas = [10**a for a in range(-5, 2)]
n_splits = 5
ts_CV = TimeSeriesSplit(n_splits=n_splits)

regList, rmse_list_test, rmse_list_train, r2_list_test, r2_list_train = regressionCV_new(
    train_data, features, labels, alphas, n_splits)

best_regMod = regList[np.argmin(rmse_list_test)]
min_rmse = np.amin(rmse_list_test)

features += ['running_average_past_bin']

regList_pr, rmse_list_test_pr, rmse_list_train_pr, r2_list_test_pr, r2_list_train_pr = regressionCV_new(
    train_data, features, labels, alphas, n_splits)

best_regMod_pr = regList_pr[np.argmin(rmse_list_test_pr)]
min_rmse_pr = np.amin(rmse_list_test_pr)
Example #8
X_ext = X_ext.drop([
    'aaa', 'baa', 'wti_mom', 'incl103', 'm1_mom', 'm2_mom', 'usd_mom',
    'ism_inv_mom', 'ism_man_mom', 'ism_prices_mom', 'jobl_claims_mom',
    'gold_mom', 'spx_mom', 'wti_vol', 'spx_vol'
],
                   axis=1)
X_ext = X_ext.dropna()

X_test = X.loc[:datetime(1999, 12, 1), :]
y_test = y.align(X_test, join='inner')[0]

reg1 = make_pipeline(
    StandardScaler(),
    RidgeClassifierCV(fit_intercept=False,
                      normalize=False,
                      cv=TimeSeriesSplit(12)))
reg2 = make_pipeline(StandardScaler(), Perceptron(fit_intercept=False))
reg3 = make_pipeline(StandardScaler(),
                     PassiveAggressiveClassifier(fit_intercept=False))
reg4 = make_pipeline(
    StandardScaler(),
    SGDClassifier(loss='log', penalty='elasticnet', fit_intercept=False))
reg5 = make_pipeline(
    StandardScaler(),
    LogisticRegressionCV(cv=TimeSeriesSplit(12),
                         penalty='l2',
                         fit_intercept=False))
reg6 = make_pipeline(StandardScaler(), SVC(probability=True))
reg7 = make_pipeline(StandardScaler(), GaussianNB())
reg8 = make_pipeline(StandardScaler(), RandomForestClassifier())
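
# A sketch of how these pipelines could then be compared on forward-chaining
# splits (an illustration, not part of the original snippet):
from sklearn.model_selection import cross_val_score
for i, clf in enumerate([reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8], 1):
    scores = cross_val_score(clf, X_test, y_test, cv=TimeSeriesSplit(12))
    print('reg%d mean accuracy: %.3f' % (i, scores.mean()))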
Example #9
from pandas import read_csv
from sklearn.model_selection import TimeSeriesSplit
from matplotlib import pyplot

# load data
series = read_csv('data/Elec_daily_Dmd_2D.csv', header=0, index_col=0)

# dmd2D = ['data','demand']
# series2D = series['dmd2D']

X = series.values
splits = TimeSeriesSplit(n_splits=3)
pyplot.figure(1)
index = 1
for train_index, test_index in splits.split(X):
    train = X[train_index]
    test = X[test_index]
    #train_size = int(len(X)*0.66)
    #train, test = X[0:train_size], X[train_size:len(X)]
    print('Observations: %d' % (len(train) + len(test)))
    print('Training Observations: %d' % (len(train)))
    print('Testing Observations: %d' % (len(test)))
    pyplot.subplot(310 + index)
    pyplot.plot(train)
    pyplot.plot([None for i in train] + [v for v in test])
    #pyplot(xlabel('date'))
    index += 1
pyplot.show()

# #print(series.head())
# # series.plot()
Example #10
def random_forest_randomforward(X_train,
                                y_train,
                                X_test,
                                y_test,
                                n_selected_features=1000,
                                scoring='accuracy',
                                n_iter=1000):

    from sklearn.model_selection import TimeSeriesSplit
    from datetime import datetime as dt
    import random
    import warnings

    warnings.filterwarnings("ignore")

    st_t = dt.now()

    n_samples, n_features = X_train.shape

    n_estimators = [5, 10, 50, 100, 150, 200, 250, 300]
    max_depth = [5, 10, 25, 50, 75, 100]
    min_samples_leaf = [1, 2, 4, 8, 10]
    min_samples_split = [2, 4, 6, 8, 10]
    max_features = ["auto", "sqrt", "log2", None]

    hyperparameter = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'min_samples_split': min_samples_split,
        'max_features': max_features
    }

    cv_timeSeries = TimeSeriesSplit(n_splits=5).split(X_train)
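    # NOTE: .split() returns a one-shot generator that would be exhausted
    # after the first search in the loop below, which is presumably why a
    # plain cv=2 is passed to RandomizedSearchCV instead of cv_timeSeries.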
    base_model_rf = RandomForestClassifier(criterion='gini', random_state=42)
    n_iter_search = 30
    scoring = scoring

    # selected feature set, initialized to be empty
    count = 0
    ddict = {}
    all_F = []
    all_c = []
    all_acc = []
    all_model = []

    while count < n_selected_features:
        #F = []
        max_acc = 0
        for i in range(n_iter):
            col_train = random.sample(list(X_train.columns), count + 1)
            col_train = np.array(col_train)
            X_train_tmp = X_train[col_train]
            acc = 0
            rsearch_cv = RandomizedSearchCV(
                estimator=base_model_rf,
                random_state=42,
                param_distributions=hyperparameter,
                n_iter=n_iter_search,
                #cv=cv_timeSeries,
                cv=2,
                scoring=scoring,
                n_jobs=-1)
            rsearch_cv.fit(X_train_tmp, y_train)
            best_estimator = rsearch_cv.best_estimator_
            y_pred = best_estimator.predict(X_test[col_train])
            acc = metrics.accuracy_score(y_test, y_pred)
            if acc > max_acc:
                max_acc = acc
                idx = col_train
                best_model = best_estimator
        #F.append(idx)
        count += 1

        print("The current number of features: {} - Accuracy: {}%".format(
            count, round(max_acc * 100, 2)))

        all_F.append(idx)
        all_c.append(count)
        all_acc.append(max_acc)
        all_model.append(best_model)

    c = pd.DataFrame(all_c)
    a = pd.DataFrame(all_acc)
    f = pd.DataFrame(all_F)
    f["All"] = f[f.columns[0:]].apply(
        lambda x: ', '.join(x.dropna().astype(str)), axis=1)

    all_info = pd.concat([c, a, f["All"]], axis=1)
    all_info.columns = ['Num_features', 'Accuracy', 'Features']
    all_info = all_info.sort_values(by='Accuracy',
                                    ascending=False).reset_index(drop=True)

    print("The total time for searching subset: {}".format(dt.now() - st_t))

    return all_info, all_model, f
Example #11
def xgboost_forward(X_train,
                    y_train,
                    X_test,
                    y_test,
                    n_selected_features=1000,
                    scoring='accuracy'):
    from sklearn.model_selection import TimeSeriesSplit
    from datetime import datetime as dt
    import random
    import warnings

    warnings.filterwarnings("ignore")

    st_t = dt.now()

    n_samples, n_features = X_train.shape

    n_estimators = [5, 10, 50, 100, 150, 200, 250, 300]
    max_depth = [5, 10, 25, 50, 75, 100]
    min_child_weight = [5, 10, 25, 50, 75, 100]
    gamma = [0.5, 1, 1.5, 2, 5]
    subsample = [0.2, 0.4, 0.6, 0.8, 1]
    colsample_bytree = [0.2, 0.4, 0.6, 0.8, 1]

    hyperparameter = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_child_weight': min_child_weight,
        'gamma': gamma,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree
    }

    cv_timeSeries = TimeSeriesSplit(n_splits=5).split(X_train)
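    # NOTE: as in the previous example, this .split() generator can only be
    # consumed once, which is presumably why cv=2 is used in the search below.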
    xgb = XGBClassifier(learning_rate=0.02,
                        objective='multi:softmax',
                        silent=True,
                        nthread=20)
    n_iter_search = 30
    scoring = scoring

    # selected feature set, initialized to be empty
    F = []
    count = 0
    ddict = {}
    all_F = []
    all_c = []
    all_acc = []
    all_model = []
    while count < n_selected_features:
        max_acc = 0
        for i in X_train.columns:
            if i not in F:
                F.append(i)
                X_train_tmp = X_train[F]
                acc = 0
                rsearch_cv = RandomizedSearchCV(
                    estimator=xgb,
                    random_state=42,
                    param_distributions=hyperparameter,
                    n_iter=n_iter_search,
                    #cv=cv_timeSeries,
                    cv=2,
                    scoring=scoring,
                    n_jobs=-1)
                rsearch_cv.fit(X_train_tmp, y_train)
                best_estimator = rsearch_cv.best_estimator_
                y_pred = best_estimator.predict(X_test[F])
                acc = metrics.accuracy_score(y_test, y_pred)
                F.pop()
                if acc > max_acc:
                    max_acc = acc
                    idx = i
                    best_model = best_estimator

        F.append(idx)
        count += 1

        print("The current number of features: {} - Accuracy: {}%".format(
            count, round(max_acc * 100, 2)))

        all_F.append(np.array(F))
        all_c.append(count)
        all_acc.append(max_acc)
        all_model.append(best_model)

    c = pd.DataFrame(all_c)
    a = pd.DataFrame(all_acc)
    f = pd.DataFrame(all_F)
    f["All"] = f[f.columns[0:]].apply(
        lambda x: ', '.join(x.dropna().astype(str)), axis=1)

    all_info = pd.concat([c, a, f["All"]], axis=1)
    all_info.columns = ['Num_feature', 'Accuracy', 'Feature']
    all_info = all_info.sort_values(by='Accuracy',
                                    ascending=False).reset_index(drop=True)

    print("The total time for searching subset: {}".format(dt.now() - st_t))

    return all_info, all_model, f
Example #12
def get_RandSearchCV(X_train, y_train, X_test, y_test, scoring, type_search,
                     output_file):
    from sklearn.model_selection import TimeSeriesSplit
    from datetime import datetime as dt
    st_t = dt.now()
    # Number of trees used
    n_estimators = [5, 10, 50, 100, 150, 200, 250, 300]
    #n_estimators = list(np.arange(100,1000,50))
    #n_estimators = [1000]

    # Maximum depth of each tree
    max_depth = [5, 10, 25, 50, 75, 100]

    # Minimum number of samples per leaf
    min_samples_leaf = [1, 2, 4, 8, 10]

    # Minimum number of samples to split a node
    min_samples_split = [2, 4, 6, 8, 10]

    # Maximum number of features to consider for making splits
    max_features = ["auto", "sqrt", "log2", None]

    hyperparameter = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'min_samples_split': min_samples_split,
        'max_features': max_features
    }

    cv_timeSeries = TimeSeriesSplit(n_splits=5).split(X_train)
    base_model_rf = RandomForestClassifier(criterion="gini", random_state=42)
    base_model_gb = GradientBoostingClassifier(criterion="friedman_mse",
                                               random_state=42)

    # Run randomzed search
    n_iter_search = 30
    if type_search == "RandomSearchCV-RandomForest":
        rsearch_cv = RandomizedSearchCV(estimator=base_model_rf,
                                        random_state=42,
                                        param_distributions=hyperparameter,
                                        n_iter=n_iter_search,
                                        cv=cv_timeSeries,
                                        scoring=scoring,
                                        n_jobs=-1)
    else:
        rsearch_cv = RandomizedSearchCV(estimator=base_model_gb,
                                        random_state=42,
                                        param_distributions=hyperparameter,
                                        n_iter=n_iter_search,
                                        cv=cv_timeSeries,
                                        scoring=scoring,
                                        n_jobs=-1)

    rsearch_cv.fit(X_train, y_train)
    #f = open("output.txt", "a")
    print("Best estimator obtained from CV data: \n",
          rsearch_cv.best_estimator_,
          file=output_file)
    print("Best Score: ", rsearch_cv.best_score_, file=output_file)
    return rsearch_cv
Example #13
if __name__ == '__main__':
    # load dataset
    data = import_training_set(fast_pc = True)
    data.dropna(inplace=True)
    # set up classifier and pipeline
    bagging = BaggingClassifier(base_estimator=GaussianNB(),
                                n_estimators=25,
                                bootstrap=True,
                                max_samples=0.25,
                                n_jobs=1)

    pipe = Pipeline([('scaler', StandardScaler()),
                     ('reduce_dim', 'passthrough'),
                     ('clf', bagging)])

    # set up param grid
    param_grid = [
            {'reduce_dim': ['passthrough']},

            {'reduce_dim': [PCA()],
             'reduce_dim__n_components': [0.91, 0.93, 0.95, 0.97],
             'clf__max_features' : [0.33,0.66,1.0]},

            {'reduce_dim': [SelectKBest()],
             'reduce_dim__k': [20, 30, 40, 50],
             'clf__max_features' : [0.33,0.66,1.0]}]

    cv = TimeSeriesSplit(n_splits=10, test_size=100000, gap=100000)
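    # NOTE: the test_size and gap arguments of TimeSeriesSplit require
    # scikit-learn >= 0.24.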
    # initiate hyperparameter search
    results = search(data,pipe,param_grid,filepath='Results/bagging_naive_25_4.csv',cv=cv)
Example #14
X = preprocessing.MinMaxScaler().fit_transform(X)
#X = preprocessing.StandardScaler().fit_transform(X)

y=df.loc[:,'Good sell Point?']
# Split train set and test set
xtrain,ytrain=X[:testduration],y[:testduration]
xtest,ytest=X[testduration:],y[testduration:]

Market_GoodRatio = sum(df['Good sell Point?'].iloc[:testduration] == 1) / len(df['Good sell Point?'].iloc[:testduration])  # Good sell point ratio in the market is manually set to nearly 0.5
ResultTable = ResultTable.append({'Stock': stock, 'Method': 'Market Good sell Ratio', 'AvgScores': Market_GoodRatio, 'StdScores': 0}, ignore_index=True)

# Compare and plot the precision rate of each algorithm
index = 0
for method in method_list.loc[0, :]:
    clf = method
    cv = TimeSeriesSplit(n_splits=4)  # Time-series split
    scores = cross_val_score(clf, xtrain, ytrain, cv=cv, scoring='precision')
    print(scores[scores > 0])
    series = {'Stock': stock, 'Method': method_list.columns[index], 'AvgScores': scores[scores > 0].mean(), 'StdScores': scores[scores > 0].std()}
    index = index + 1
    ResultTable = ResultTable.append(series, ignore_index=True)

name_list= ['Market Good sell Ratio']
name_list=np.append(name_list,method_list.columns)
num_list= ResultTable.loc[ResultTable['Stock']==stock]['AvgScores']
plt.barh(range(len(num_list)), num_list,tick_label = name_list)
plt.title(stock+'\nPrecision Rate')
plt.show()
    
#Plot precision rate of each method 
index=0
Example #15
          X_test = df_covid_encoded['2020-10':]
          X_scaler = StandardScaler()
          X_train = X_scaler.fit_transform(X_train)
          X_test = X_scaler.transform(X_test)

          y_train = df_econ_encoded.loc[:'2020-09', column]
          y_test = df_econ_encoded.loc['2020-10':, column]

          ## Fit a Random Forest Regressor and Find the best Parameters
          model = RandomForestRegressor()
          param_search = { 
              'n_estimators': [20, 50, 100],
              'max_features': ['auto', 'sqrt', 'log2'],
              'max_depth' : [i for i in range(5,15)]
          }
          tscv = TimeSeriesSplit(n_splits=3)
          gsearch = GridSearchCV(estimator=model, cv=tscv, param_grid=param_search, scoring = 'r2')
          gsearch.fit(X_train, y_train)
          best_score = gsearch.best_score_
          best_model = gsearch.best_estimator_

          y_true = y_test.values
          y_pred = best_model.predict(X_test)

          print('Regression results Using Random-Forest-Regressor on', column, ' are:')
          regression_results(y_true, y_pred)

sys.stdout = orig_stdout
f.close()

## Preprocess the data using StandardScaler
Example #16
def train_evaluate(parameterization,
                   validation,
                   data_path,
                   n_known_outlier_classes,
                   ratio_known_normal,
                   ratio_known_outlier,
                   ratio_pollution,
                   cfg,
                   n_jobs_dataloader,
                   n_splits=3):

    device = 'cpu'

    period = np.array(
        ['2019-11-08', '2019-11-09', '2019-11-11', '2019-11-12', '2019-11-13'])

    if (validation == 'kfold'):
        split = KFold(n_splits=n_splits)
    elif (validation == 'time_series'):
        split = TimeSeriesSplit(n_splits=n_splits)
    else:
        # Dummy object with a split method that returns the indexes of a 0.8/0.2 train/test split, similar to train_test_split without shuffling
        split = type(
            'obj', (object, ), {
                'split':
                lambda p: [([x for x in range(int(len(p) * 0.8))],
                            [x for x in range(int(len(p) * 0.8), len(p))])]
            })

    test_aucs = []

    for train, test in split.split(period):

        dataset = CICFlowADDataset(
            root=os.path.abspath(data_path),
            n_known_outlier_classes=n_known_outlier_classes,
            ratio_known_normal=ratio_known_normal,
            ratio_known_outlier=ratio_known_outlier,
            train_dates=period[train],
            test_dates=period[test],
            ratio_pollution=ratio_pollution)

        # Initialize DeepSAD model and set neural network phi

        # Log random sample of known anomaly classes if more than 1 class
        if n_known_outlier_classes > 1:
            logger.info('Known anomaly classes: %s' %
                        (dataset.known_outlier_classes, ))

        # Initialize Isolation Forest model
        Isoforest = IsoForest(hybrid=False,
                              n_estimators=int(
                                  parameterization['n_estimators']),
                              max_samples=parameterization['max_samples'],
                              contamination=parameterization['contamination'],
                              n_jobs=4,
                              seed=cfg.settings['seed'])

        # Train model on dataset
        Isoforest.train(dataset,
                        device=device,
                        n_jobs_dataloader=n_jobs_dataloader)

        # Test model
        Isoforest.test(dataset,
                       device=device,
                       n_jobs_dataloader=n_jobs_dataloader)

        test_auc = Isoforest.results['auc_roc']

        test_aucs.append(test_auc)

    reporter(mean_auc=evaluate_aucs(test_aucs=test_aucs))
示例#17
0
def main(argv):
    np.random.seed(1234)

    if len(argv) != 3:
        print("Must be in format: python featurize.py <TICKER> <FORWARD_LAG>")
        exit(0)
    elif not argv[2].isdigit() or int(argv[2]) == 0:
        print("Must be in format: python featurize.py <TICKER> <FORWARD_LAG>")
        exit(0)

    # set relevant vars
    ticker = argv[1]
    forward_lag  = int(argv[2])

    # display ticker info
    print("Ticker = ",ticker)
    print(f"Prediction Window = {forward_lag} days")

    print()

    # read data
    print("Reading data ... ")
    PREFIX = config.gen_prefix(ticker,forward_lag)

    # relevant filenames
    """
    FEATURES_TRAIN_FILENAME = f'../data/processed/{PREFIX}_train_features.csv'
    FEATURES_TEST_FILENAME = f'../data/processed/{PREFIX}_test_features.csv'
    LABELS_TRAIN_FILENAME = f'../data/processed/{PREFIX}_train_labels.csv'
    LABELS_TEST_FILENAME = f'../data/processed/{PREFIX}_test_labels.csv'
    """

    FEATURES_FILENAME = f'../data/processed/{PREFIX}_features.csv'
    LABELS_FILENAME = f'../data/processed/{PREFIX}_labels.csv'

    # load data
    X = pd.read_csv(FEATURES_FILENAME).set_index('date')
    y = pd.read_csv(LABELS_FILENAME).set_index('date')

    """
    X_train = pd.read_csv(FEATURES_TRAIN_FILENAME).set_index('date')
    X_test = pd.read_csv(FEATURES_TEST_FILENAME).set_index('date')
    y_train = pd.read_csv(LABELS_TRAIN_FILENAME).set_index('date')
    y_test = pd.read_csv(LABELS_TEST_FILENAME).set_index('date')
    """

    # Split into train and test data
    X_train,X_test,y_train,y_test = split_train_test(X,y,train_percent=0.7)

    # relevant dates
    print(f"Training data range:\n\t {str(X_train.index[0])[:10]} to {str(X_train.index[-1])[:10]}")
    print(f"Test data range:\n\t {str(X_test.index[0])[:10]} to {str(X_test.index[-1])[:10]}")

    print()

    # reduce number of features
    # NOTE: want to add this step to pipeline
    X_train,X_test = config.reduce_features(X_train,X_test,y_train)
    y_train,y_test = y_train.target.ravel(),y_test.target.ravel()

    # get necessary sizes
    n,d = X_train.shape

    # define cv
    cv_inner = TimeSeriesSplit(n_splits=5)
    #cv_outer = TimeSeriesSplit(n_splits=5)


    # get classifier names
    model_strs = models.MODELS

    # scoring dct to track performance
    scores = {}

    # displays subplot legend
    plot_traces = []
    legend=True

    # best model
    best_model = None

    # run all models on dataset
    for model_str in model_strs:
        print("\nmodel = ",model_str)

        dct = {}

        # define pipeline
        model = getattr(models, model_str)(n,d)

        steps = [('Scaler',preprocessors.Scaler()),(model_str,model)]
        pipe,param_grid = make_pipeline_and_grid(steps)

        # determine which CV to be used
        if model_str in ['linearSVR','Dummy','LinearRegressor','KNN','lr_boost','LSTMRegressor']:
            search = GridSearchCV(pipe,param_grid,
                                  cv=cv_inner,
                                  refit='mean_absolute_error',
                                  iid=False,
                                  scoring=METRICS,
                                  return_train_score=True,
                                  n_jobs=-1)

        else:
            search = RandomizedSearchCV(pipe,param_grid,
                                        cv=cv_inner,
                                        n_iter=20,
                                        iid=False,
                                        random_state=0,
                                        refit='mean_absolute_error',
                                        scoring=METRICS,
                                        return_train_score=True,
                                        n_jobs=-1)
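        # NOTE: the iid argument was deprecated in scikit-learn 0.22 and
        # removed in 0.24; drop it on newer versions.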

        # training
        print("Training Model ... ")
        search.fit(X_train,y_train)
        results = search.cv_results_
        print("Best Parameters: ",search.best_params_)

        # make predictions
        print("Making Predictions ... ")
        y_pred = search.predict(X_test)

        # generate subplot traces
        subplot_traces = gen_subplot(
            X_train.index,
            X_test.index,
            y_train,
            y_test,
            y_pred,
            legend
        )

        # append subplot to plot
        legend=False
        plot_traces.append(subplot_traces)

        # record metrics
        print("Recording Metrics ... \n")

        for met in METRICS:
            dct[f'train_{met}'] = results[f'mean_train_{met}'].mean()

        test_scores = gen_metrics(y_test,y_pred)
        dct.update(test_scores)

        # update scores dict
        scores[model_str] = dct

    # Display results
    scores_df = pd.DataFrame.from_dict(scores,orient='index')
    print(scores_df)
    print("\nBest Model: ",scores_df.test_mean_squared_error.idxmin())
    print("Mean Squared Error: ",scores_df.test_mean_squared_error.min())

    print()

    # show predictions
    # NOTE: this is hacky, need to fix later
    print(f'{forward_lag} day predictions')
    RAW_PREFIX = PREFIX.replace(f'_{forward_lag}','')
    raw_df = pd.read_csv(f'../data/raw/{RAW_PREFIX}_hist.csv').set_index('date')
    prediction_dates = raw_df.index[-forward_lag:]
    for i,date in enumerate(prediction_dates):
        print(f'{date}: {y_pred[-forward_lag+i]:.2f}')

    print()

    # plot results
    print("Plotting Results ... ")
    fig = plot_results(plot_traces,model_strs,ticker)
    fig.show()

    print()

    # log results
    print("Saving Results ... ")
    RESULTS_FILENAME = f'../data/results/{PREFIX}.csv'
    scores_df.to_csv(RESULTS_FILENAME)
    print(RESULTS_FILENAME)
Example #18
    # Divide features and labels
    y = data.pop("offers")
    X = data

    # define polynomial regression degrees
    poly_reg = PolynomialFeatures(degree=2)
    X = pd.DataFrame(poly_reg.fit_transform(X))

    # create pipeline with regressor and scaler
    pipeline = Pipeline([("scaler", RobustScaler()),
                         ("regressor", LinearRegression())])

    # nested cross validation
    tscv = TimeSeriesSplit(n_splits=6,
                           max_train_size=365 * 48,
                           test_size=48 * 30)

    # perform nested cross validation and get results
    y_test, y_pred = utils.my_cross_val_predict(pipeline, X, y, tscv)

    # calculate results
    results = utils.get_results(y_test, y_pred)

    # save results
    with open("results/results_polynomial_regression.json", "w") as f:
        json.dump(results, f)

    utils.plot_results(
        y_test,
        y_pred,
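
# utils.my_cross_val_predict is project-specific and not shown; a plausible
# sketch that concatenates out-of-fold predictions across the TimeSeriesSplit
# folds (an assumption, not the original helper):
def my_cross_val_predict_sketch(estimator, X, y, cv):
    import numpy as np
    y_true, y_hat = [], []
    for train_idx, test_idx in cv.split(X):
        estimator.fit(X.iloc[train_idx], y.iloc[train_idx])
        y_true.append(np.asarray(y.iloc[test_idx]))
        y_hat.append(estimator.predict(X.iloc[test_idx]))
    return np.concatenate(y_true), np.concatenate(y_hat)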
Example #19
def main():
	# Load data
	print("Reading file...")
	calendar = pd.read_csv('Data/calendar.csv')
	sell_prices = pd.read_csv('Data/sell_prices.csv')
	sales_train_validation = pd.read_csv('Data/sales_train_validation.csv')
	submission = pd.read_csv('Data/sample_submission.csv')

	# Reduce memory size
	print("Reducing memory size...")
	calendar = reduce_mem(calendar)
	sell_prices = reduce_mem(sell_prices)
	sales_train_validation = reduce_mem(sales_train_validation)
	submission = reduce_mem(submission)

	# Combine all data into one dataset
	print("Combining data...")
	data = combine_data(calendar, sell_prices, sales_train_validation, submission, nrows = 27500000, merge = True)
	gc.collect()

	# Encoding data
	print("Encoding data...")
	data = data_encoding(data)
	gc.collect()

	# Create new feature
	print("Creating new feature...")
	data = feature_create(data)
	data = reduce_mem(data)
	gc.collect()

	# Train Test split
	x = data[data['date'] <= '2016-04-24']
	y = x.sort_values('date')['demand']
	test = data[(data['date'] > '2016-04-24')]
	x = x.sort_values('date')
	test = test.sort_values('date')
	del data


	# Model parameters setting
	## k-fold using TimeSeriesSplit
	n_fold = 3
	folds = TimeSeriesSplit(n_splits=n_fold)

	## lgb model parameters
	default_params = {"metric": 'rmse',
					  "verbosity": -1,
	}

	params = {'num_leaves': 555,
		  'min_child_weight': 0.034,
		  'feature_fraction': 0.379,
		  'bagging_fraction': 0.418,
		  'min_data_in_leaf': 106,
		  'objective': 'regression', #default
		  'max_depth': -1,
		  'learning_rate': 0.005,
		  "boosting_type": "gbdt", #defaul
		  "bagging_seed": 11,
		  "metric": 'rmse',
		  "verbosity": -1,
		  'reg_alpha': 0.3899,
		  'reg_lambda': 0.648,
		  'random_state': 222,
	}

	# Model training
	columns = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'year', 'month', 'week', 'day', 'dayofweek', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 
			'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag_t28', 'lag_t29', 'lag_t30', 'rolling_mean_t7', 'rolling_std_t7', 'rolling_mean_t30', 'rolling_mean_t90', 
			'rolling_mean_t180', 'rolling_std_t30', 'price_change_t1', 'price_change_t365', 'rolling_price_std_t7', 'rolling_price_std_t30']

	splits = folds.split(x, y)
	y_preds = np.zeros(test.shape[0])
	y_oof = np.zeros(x.shape[0])
	
	feature_importances = pd.DataFrame()
	feature_importances['feature'] = columns
	
	mean_score = []

	print("Start to train...")
	
	for fold_n, (train_index, valid_index) in enumerate(splits):
		print("-" * 20 +"LGB Fold:"+str(fold_n)+ "-" * 20)
		X_train, X_valid = x[columns].iloc[train_index], x[columns].iloc[valid_index]
		y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
		dtrain = lgb.Dataset(X_train, label=y_train)
		dvalid = lgb.Dataset(X_valid, label=y_valid)
		
		clf = lgb.train(params, dtrain, 2500, valid_sets = [dtrain, dvalid], early_stopping_rounds = 50, verbose_eval=100)
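		# NOTE: in LightGBM >= 4.0, early_stopping_rounds and verbose_eval are configured via callbacks instead of train() arguments.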
		feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
		
		y_pred_valid = clf.predict(X_valid, num_iteration=clf.best_iteration)
		y_oof[valid_index] = y_pred_valid
		
		val_score = np.sqrt(metrics.mean_squared_error(y_pred_valid, y_valid))
		print(f'val rmse score is {val_score}')
		
		mean_score.append(val_score)
		y_preds += clf.predict(test[columns], num_iteration=clf.best_iteration)/n_fold
		del X_train, X_valid, y_train, y_valid
		gc.collect()
	
	print('mean rmse score over folds is', np.mean(mean_score))
	test['demand'] = y_preds

	# Submission format
	subs = submit_format(test, submission)
	subs.to_csv('submission.csv',index = False)

	# Plot feature importance
	feature_importances['average'] = feature_importances[[f'fold_{fold_n + 1}' for fold_n in range(folds.n_splits)]].mean(axis=1)
	feature_importances.to_csv('feature_importances.csv')

	plt.figure(figsize=(16, 12))
	sns.barplot(data=feature_importances.sort_values(by='average', ascending=False).head(20), x='average', y='feature');
	plt.title('20 TOP feature importance over {} folds average'.format(folds.n_splits));
Example #20
    def fit(self, ts_df: pd.DataFrame, target_col: str, cv: Optional[int] = None) -> object:
        """
         This builds a VAR model given a multivariate time series data frame with time as the Index.

        :param ts_df The time series data to be used for fitting the model. Note that the input can be
        a data frame with one column or multiple columns, or a multivariate array. However, the first
        column must be the target variable. You must include only Time Series data in it. DO NOT include
        "Non-Stationary" or "Trendy" data. Make sure your Time Series is "Stationary" before you send
        it in!! If not, this will give spurious results.
        :type ts_df pd.DataFrame

        :param target_col The column name of the target time series that needs to be modeled.
        All other columns will be considered as exogenous variables (if applicable to method)
        :type target_col str

        :param cv: Number of folds to use for cross validation.
        Number of observations in the Validation set for each fold = forecast period
        If None, a single fold is used
        :type cv Optional[int]

        :rtype object
        """
        self.original_target_col = target_col
        self.original_preds = [x for x in list(ts_df) if x not in [self.original_target_col]]

        ts_df = ts_df[[self.original_target_col] + self.original_preds]

        self.find_best_parameters(data = ts_df)

        #######################################
        #### Cross Validation across Folds ####
        #######################################

        rmse_folds = []
        norm_rmse_folds = []
        forecast_df_folds = []

        NFOLDS = self.get_num_folds_from_cv(cv)
        #cv = GapWalkForward(n_splits=NFOLDS, gap_size=0, test_size=self.forecast_period)
        #cv = TimeSeriesSplit(n_splits=NFOLDS, test_size=self.forecast_period) ### sklearn version 0.0.24
        max_trainsize = len(ts_df) - self.forecast_period
        try:
            cv = TimeSeriesSplit(n_splits=NFOLDS, test_size=self.forecast_period)  ### test_size requires scikit-learn >= 0.24
        except TypeError:
            cv = TimeSeriesSplit(n_splits=NFOLDS, max_train_size=max_trainsize)

        if isinstance(ts_df, dask.dataframe.core.DataFrame):
            ts_df = ts_df.head(len(ts_df))  ### this converts dask into a pandas dataframe

        for fold_number, (train_index, test_index) in enumerate(cv.split(ts_df)):
            dftx = ts_df.head(len(train_index)+len(test_index))
            ts_train = dftx.head(len(train_index)) ## now train will be the first segment of dftx
            ts_test = dftx.tail(len(test_index)) ### now test will be right after train in dftx

            print(f"\nFold Number: {fold_number+1} --> Train Shape: {ts_train.shape[0]} Test Shape: {ts_test.shape[0]}")

            #########################################
            #### Define the model with fold data ####
            #########################################
            y_train = ts_train.iloc[:, [0, self.best_d]]
            bestmodel = self.get_best_model(y_train)

            ######################################
            #### Fit the model with fold data ####
            ######################################

            if self.verbose >= 1:
                print(f'Fitting best VAR model on Fold: {fold_number+1}')
            try:
                self.model = bestmodel.fit(disp=False)
            except Exception as e:
                print(e)
                print(f'Error: VAR Fit on Fold: {fold_number+1} unsuccessful.')
                return bestmodel, None, np.inf, np.inf

            forecast_df = self.predict(ts_test.shape[0],simple=False)
            forecast_df_folds.append(forecast_df['yhat'].values)

            rmse, norm_rmse = print_dynamic_rmse(ts_test.iloc[:, 0].values, forecast_df['yhat'].values,
                                        ts_train.iloc[:, 0].values)
            rmse_folds.append(rmse)
            norm_rmse_folds.append(norm_rmse)

        norm_rmse_folds2 = np.array(rmse_folds) / ts_df[self.original_target_col].values.std()  # Same as what was there in print_dynamic_rmse()
        self.model.plot_diagnostics(figsize=(16, 12))
        axis = self.model.impulse_responses(12, orthogonalized=True).plot(figsize=(12, 4))
        axis.set(xlabel='Time Steps', title='VAR model Impulse Response Functions')

        ###############################################
        #### Refit the model on the entire dataset ####
        ###############################################
        y_train = ts_df.iloc[:, [0, self.best_d]]
        self.refit(ts_df=y_train)

        # return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds
        return self.model, forecast_df_folds, rmse_folds, norm_rmse_folds2
Example #21
def RunSVM(Abs_train, Abs_test, X_train, Y_train, X_test, Y_test, name):
    model = SVC(C=1000, kernel='rbf', gamma=1)
    gs_ = [1.7**i for i in range(1, 20)]
    gs = [1.0 / i for i in gs_]
    cs = [i * 1000 for i in range(1, 30)] + [i * 10 for i in range(1, 100)]
    #cs = [10**i for i in range(2,4)]
    param_grid = [{'C': cs, 'gamma': gs}]
    clf = model_selection.GridSearchCV(model,
                                       param_grid,
                                       scoring=None,
                                       fit_params=None,
                                       n_jobs=-1,
                                       iid=True,
                                       refit='best_score_',
                                       cv=TimeSeriesSplit(n_splits=2),
                                       verbose=0,
                                       pre_dispatch='2*n_jobs',
                                       error_score='raise',
                                       return_train_score='warn')
    clf.fit(X_train, Y_train)
    x = (clf.best_params_)
    print(x)
    #	print model.get_params()
    model.set_params(**x)
    model.fit(X_train, Y_train)
    actual_dist = ComputeDistribution(Y_train, Y_test)
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    cnf_mat_test = GenerateCnfMatrix(pred_test, Y_test)
    cnf_mat_train = GenerateCnfMatrix(pred_train, Y_train)
    accuracy = ComputeAccuracy(cnf_mat_test, cnf_mat_train, name, actual_dist)
    if CALCULATE_RETURNS == 'y':
        returns = ComputeReturns(Abs_test, Abs_train, pred_test, pred_train,
                                 Y_test, Y_train, name)
    '''gs_ = [0.001,0.005,0.01,0.05,0.1,0.15,0.28,0.75,1]+range(10,140)
					gs = [1.0/i for i in gs_]
					cs = [10,15,50,100,150,500,700,1000,2500,10000]
					c_array = []
					g_array = [] 
					actual_dist_array = []
					predicted_test_array =[]
					predicted_train_array =[]
					predicted_train_acc_array =[]
					predicted_test_acc_array = []
					ret_pt_tot_train =[]
					ret_pt_cor_inc_train=[]
					ret_pt_tot_test=[]
					ret_pt_cor_inc_test=[]
					
					for c in cs:
						for g in gs:
							c_array.append(c)
							g_array.append(g)
							print 'c ' +str(c)
							print 'g ' +str(g)
					#param_grid =  [{ 'C': C_range,'kernel': ['rbf']}]
					#clf = model_selection.GridSearchCV(model, param_grid, cv = TimeSeriesSplit(n_splits = 5))
					#clf.fit(X_train, Y_train)
					#x = (clf.best_params_ )
					#print x
					#model.set_params(**x)
					
							model 			= SVC(C = c, kernel = 'rbf', gamma = g)
				
					
							actual_dist_array.append(list(actual_dist))
							
							predicted_test_array.append(list(accuracy[0]))
							predicted_train_array.append(list(accuracy[1]))
							predicted_test_acc_array.append(list(accuracy[2])) 
							predicted_train_acc_array.append(list(accuracy[3]))
				
							print(' ')
								ret_pt_tot_train.append(list(returns[0]))
								ret_pt_cor_inc_train.append(list(returns[1]))
								ret_pt_tot_test.append(list(returns[2]))
								ret_pt_cor_inc_test.append(list(returns[3]))
							print ('------------------------------------------')
					c_array = np.asarray(c_array).T
					g_array = np.asarray(g_array).T
					predicted_train_array = np.asarray(predicted_train_array).T
					predicted_train_acc_array = np.asarray(predicted_train_acc_array).T
					predicted_test_acc_array = np.asarray(predicted_test_acc_array).T
					predicted_test_array = np.asarray(predicted_test_array).T
					actual_dist_array = np.asarray(actual_dist_array).T
					out = np.vstack((c_array,g_array,actual_dist_array,predicted_train_array,predicted_test_array,predicted_train_acc_array,predicted_test_acc_array,ret_pt_tot_train,ret_pt_cor_inc_train,ret_pt_tot_test,ret_pt_cor_inc_test))
					#out = out.T
					#header = ['c','gamma','dist_plus_actual','dist_minus_act','pred_plus_train','pred_minus_train','pred_plus_test','pred_minus_test','pred_tain_accuracy_tot','pred_train_acc_plus','pred_train_acc_minus','pre_test_acc_tot','pred_test_acc_plus','pred_test_acc_minus','ret_pt_tot_train','ret_pt_tot_plus','ret_pt_train_minus','ret_pt_cor_train','ret_pt_inc_train','rt_pt_tot_test','rt_pt_plus_test','rt_pt_minus_test','rt_pt_cor_test','ret_pt_inc_test']	
					#header = np.asarray(header)
					#out = np.vstack((header,out))
					np.savetxt("c_gaama.csv", out.T, delimiter=",")'''
    return accuracy[2][0], pred_test
Example #22
def function(self):
    self.out_1.val = TimeSeriesSplit()
Example #23
def build_model(
    name: str,
    model_config: dict,
    data_config: Union[GordoBaseDataset, dict],
    metadata: dict,
):
    """
    Build a model and serialize to a directory for later serving.

    Parameters
    ----------
    name: str
        Name of model to be built
    model_config: dict
        Mapping of Model to initialize and any additional kwargs which are to be used in its initialization.
        Example::

          {'type': 'KerasAutoEncoder',
           'kind': 'feedforward_hourglass'}

    data_config: dict
        Mapping of the Dataset to initialize, following the same logic as model_config.
    metadata: dict
        Mapping of arbitrary metadata data.

    Returns
    -------
        Tuple[sklearn.base.BaseEstimator, dict]
    """
    # Get the dataset from config
    logger.debug(f"Initializing Dataset with config {data_config}")

    dataset = (data_config if isinstance(data_config, GordoBaseDataset) else
               _get_dataset(data_config))

    logger.debug("Fetching training data")
    start = time.time()

    X, y = dataset.get_data()

    time_elapsed_data = time.time() - start

    # Get the model and dataset
    logger.debug(f"Initializing Model with config: {model_config}")
    model = serializer.pipeline_from_definition(model_config)

    # Cross validate
    logger.debug(f"Starting to do cross validation")
    start = time.time()

    scores: Dict[str, Any]
    if hasattr(model, "score"):
        cv_scores = cross_val_score(model,
                                    X,
                                    y,
                                    cv=TimeSeriesSplit(n_splits=3))
        scores = {
            "explained-variance": {
                "mean": cv_scores.mean(),
                "std": cv_scores.std(),
                "max": cv_scores.max(),
                "min": cv_scores.min(),
                "raw-scores": cv_scores.tolist(),
            }
        }
    else:
        logger.debug("Unable to score model, has no attribute 'score'.")
        scores = dict()

    cv_duration_sec = time.time() - start

    # Train
    logger.debug("Starting to train model.")
    start = time.time()
    model.fit(X, y)
    time_elapsed_model = time.time() - start

    metadata = {"user-defined": metadata}
    metadata["name"] = name
    metadata["dataset"] = dataset.get_metadata()
    utc_dt = datetime.datetime.now(datetime.timezone.utc)
    metadata["model"] = {
        "model-creation-date": str(utc_dt.astimezone()),
        "model-builder-version": __version__,
        "model-config": model_config,
        "data-query-duration-sec": time_elapsed_data,
        "model-training-duration-sec": time_elapsed_model,
        "cross-validation": {
            "cv-duration-sec": cv_duration_sec,
            "scores": scores
        },
    }

    gordobase_final_step = _get_final_gordo_base_step(model)
    if gordobase_final_step:
        metadata["model"].update(gordobase_final_step.get_metadata())

    return model, metadata
Example #24
    def __call__(self, trial):

        models = [classe.__name__ for classe in Models.__subclasses__()]

        classifier_name = trial.suggest_categorical('classifier', models)
        #n_in = trial.suggest_int('window_neg', -90, -1)
        n_in = trial.suggest_int('window_neg', -7, -7)

        window = WindowProcessor()
        X_lag, y_lag = window.transform(X=self.X,
                                        y=self.y,
                                        n_in=n_in,
                                        n_out=self.n_outs)
        '''Sequential split, since this is a time series'''
        X_lag, X_test_lag, y_lag, y_test_lag = train_test_split(X_lag,
                                                                y_lag,
                                                                test_size=0.20,
                                                                stratify=None,
                                                                shuffle=False)

        if classifier_name == 'MultiLayerPerceptron':

            regressor = MultiLayerPerceptron()

            layers = list()

            # Determine the number of layers
            n_layers = trial.suggest_int(
                'n_layers', regressor.search_space['num_layer'][0],
                regressor.search_space['num_layer'][1])

            # Determine the number of neurons per layer
            for layer in range(n_layers):
                layers.append(
                    trial.suggest_int(
                        'layer_{:}'.format(layer),
                        regressor.search_space['hidden_layer_sizes'][0],
                        regressor.search_space['hidden_layer_sizes'][1]))

            # Determine the L2 penalty (alpha)
            alpha = trial.suggest_loguniform(
                'alpha', regressor.search_space['alpha'][0],
                regressor.search_space['alpha'][1])

            learning_rate_init = trial.suggest_loguniform(
                'learning_rate',
                regressor.search_space['learning_rate_init'][0],
                regressor.search_space['learning_rate_init'][1])

            # Determine the random state
            random_state = trial.suggest_int(
                'random_state', regressor.search_space['random_state'][0],
                regressor.search_space['random_state'][1])

            param_grid = {
                'Mlp__hidden_layer_sizes': [tuple(layers)],
                'Mlp__alpha': [alpha],
                'Mlp__random_state': [random_state],
                'Mlp__learning_rate_init': [learning_rate_init]
            }
        # NOTE: only 'MultiLayerPerceptron' is handled above; any other suggested
        # classifier would leave regressor and param_grid undefined below.
        print('Window Neg: {:}'.format(n_in))
        print('Window Forecast: {:}'.format(self.n_outs))

        for k, v in param_grid.items():
            print(k, v)
        with parallel_backend('threading'):

            grid = GridSearchCV(verbose=1,
                                scoring='neg_mean_absolute_error',
                                estimator=regressor.pipeline,
                                param_grid=param_grid,
                                cv=TimeSeriesSplit(n_splits=5),
                                n_jobs=-1,
                                refit='neg_mean_absolute_error')

            grid.fit(X=X_lag, y=y_lag)

        y_hat_test = grid.predict(X_test_lag)
        y_hat_test = self.y_scaler.inverse_transform(y_hat_test)
        y_test_lag = self.y_scaler.inverse_transform(y_test_lag)

        print('Score cross-val: {:}'.format(grid.best_score_))
        print('Score Test - MAE: {:}'.format(
            mean_absolute_error(y_true=y_test_lag, y_pred=y_hat_test)))
        print('R2 test: {:}'.format(
            r2_score(y_true=y_test_lag,
                     y_pred=y_hat_test,
                     multioutput='uniform_average')))
        print('-' * 100)
        print('\n')
        return mean_absolute_error(y_true=y_test_lag, y_pred=y_hat_test)
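
The __call__ method above is an Optuna objective: each trial suggests hyperparameters, runs a TimeSeriesSplit grid search, and returns the held-out MAE to be minimized. A hedged usage sketch, assuming the enclosing class is named Objective and is constructed with the data and scaler it references (both the class name and the constructor are assumptions, not shown above):

import optuna

# Hypothetical construction; the real class name and constructor are not shown.
objective = Objective(X=X, y=y, n_outs=7, y_scaler=y_scaler)
study = optuna.create_study(direction='minimize')  # minimize the returned test MAE
study.optimize(objective, n_trials=25)
print(study.best_params)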
示例#25
0
def run_board_ensemble(X,
                       y,
                       dic_params_board,
                       time_serie=False,
                       n_splits=5,
                       nbr_train_test_split=3,
                       nbr_to_filter=12,
                       performance=accuracy_score):
    performance_sk = make_scorer(performance)
    start = datetime.now()

    dic_params_board["time_serie"] = time_serie
    dic_params_board["nbr_train_test_split"] = nbr_train_test_split
    dic_params_board["scoring"] = performance_sk

    # Data: use an order-preserving splitter for time series, stratified folds otherwise
    if time_serie:
        splits = TimeSeriesSplit(n_splits=n_splits)
    else:
        splits = StratifiedKFold(n_splits=n_splits)

    ensemble_acc_dic = {}

    clfs_preds = []
    best_clf = []

    mgs_preds = {'cds': [], 'ruler': [], 'perf': []}
    maj_vote_preds = {'cds': [], 'ruler': [], 'perf': []}
    rf_preds = {'cds': [], 'ruler': [], 'perf': []}
    clf2_preds = {'cds': [], 'ruler': [], 'perf': []}

    clfs_acc = []

    y_of_preds = []
    #print(datetime.now())
    for train, test in splits.split(X, y):
        X_train, y_train = X[train], y[train]
        X_validation, y_validation = X[test], y[test]

        y_of_preds = y_of_preds + list(y_validation)

        #Models
        # run block of code and catch warnings
        with warnings.catch_warnings():
            # ignore all caught warnings
            warnings.filterwarnings("ignore")
            board = bo(**dic_params_board)
            train_preds, y_trained = board.fit(X_train,
                                               y_train,
                                               predict_training_probas=True)
            preds = board.predict_probas(X_validation)
            if not clfs_preds:
                clfs_preds = np.argmax(preds, axis=-1).tolist()
            else:
                for ind, pred in enumerate(preds):
                    clfs_preds[ind].extend(np.argmax(pred, axis=-1).tolist())

        clf_time = datetime.now()
        #print("CLF time",clf_time - start, preds.shape[0])
        filters_dico = {}
        #Testing cds filter
        filt = cds_filter(performance)
        filt = filt.selection(y_trained, train_preds)
        filters_dico["cds"] = filt
        cds_time = datetime.now()
        #print("cds time", cds_time - clf_time)

        #Testing cds/perf ruling filter
        filt = ruler_filter(performance)
        filt = filt.selection(y_trained, train_preds)
        filters_dico["ruler"] = filt
        ruler_time = datetime.now()
        #print("ruler time", ruler_time - cds_time)

        #Filter
        filt = perf_filter(performance)
        filt = filt.selection(y_trained, train_preds)
        filters_dico["perf"] = filt
        #Select the best clf at training and record its testing scores
        best_pred = filt.filter(preds, nbr_to_filter=1)
        best_clf.extend(np.argmax(best_pred[0], axis=1))
        best_time = datetime.now()
        #print("best clf time", best_time - ruler_time)

        #For each filter
        for filter_name in filters_dico:
            #Bug fix: use the filter for this iteration rather than whichever
            #one was last assigned to filt (previously always the perf filter)
            filt = filters_dico[filter_name]
            train_preds_filtered = filt.filter(train_preds,
                                               nbr_to_filter=nbr_to_filter)
            preds_filtered = filt.filter(preds, nbr_to_filter=nbr_to_filter)

            #MGS
            #print("MGS")
            mgs = MGS(score_function=performance, n_jobs=-1)
            mgs = mgs.fit(train_preds_filtered, y_trained)
            pred = mgs.predict_proba(preds_filtered)
            mgs_preds[filter_name].extend(np.argmax(pred, axis=1).tolist())
            mgs_time = datetime.now()
            #print("mgs",mgs_time - clf_time)

            #MAJ VOTING
            #print("Maj_voting")
            pred = voting_booth().vote(copy.deepcopy(preds_filtered))
            maj_vote_preds[filter_name].extend(pred.tolist())
            maj_vote_time = datetime.now()
            #print("vote", maj_vote_time - mgs_time)

            #Reshape train_preds_filtered and preds_filtered from (a, b, c) to (a*c, b).
            #a: number of classifiers
            #b: number of events
            #c: probability for each target class
            train_preds_filtered = np.array(
                [x.T for x in train_preds_filtered])
            preds_filtered = np.array([x.T for x in preds_filtered])
            train_preds_filtered = train_preds_filtered.reshape(
                -1, train_preds_filtered.shape[-1])
            preds_filtered = preds_filtered.reshape(-1,
                                                    preds_filtered.shape[-1])

            #RANDOM FOREST
            #print("Random Forest")
            rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
            rf = rf.fit(train_preds_filtered.T, y_trained)
            pred = rf.predict(preds_filtered.T)
            rf_preds[filter_name].extend(pred.tolist())
            rf_time = datetime.now()

            #New set of classifiers:
            with warnings.catch_warnings():
                # ignore all caught warnings
                warnings.filterwarnings("ignore")
                board = bo(**dic_params_board)
                train_preds2, y_trained2 = board.fit(
                    train_preds_filtered.T,
                    y_trained,
                    predict_training_probas=True)
                pred = board.predict_probas(preds_filtered.T)
            #Filter the training to get best one
            filt2 = perf_filter(performance)
            filt2 = filt2.selection(y_trained2, train_preds2)
            #Select the best clf at training
            pred = np.argmax(filt2.filter(pred, nbr_to_filter=1)[0], axis=1)
            clf2_preds[filter_name].extend(pred.tolist())

    ensemble_acc_dic["BestClf"] = performance(y_of_preds, best_clf)
    for filter_name in mgs_preds:
        ensemble_acc_dic["MGS_" + filter_name] = performance(
            y_of_preds, mgs_preds[filter_name])
        ensemble_acc_dic["MajVoting_" + filter_name] = performance(
            y_of_preds, maj_vote_preds[filter_name])
        ensemble_acc_dic["RF_" + filter_name] = performance(
            y_of_preds, rf_preds[filter_name])
        ensemble_acc_dic["BestClf2_" + filter_name] = performance(
            y_of_preds, clf2_preds[filter_name])

    clfs_acc = np.array([performance(y_of_preds, pred) for pred in clfs_preds])

    return clfs_acc, ensemble_acc_dic
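
The transpose-and-reshape step inside the loop turns a (classifiers, events, classes) probability tensor into an (events, classifiers * classes) feature matrix for the second-stage models. A toy sketch of just that step, with assumed shapes:

import numpy as np

preds = np.random.rand(4, 10, 3)                  # 4 classifiers, 10 events, 3 classes
stacked = np.array([p.T for p in preds])          # (4, 3, 10)
stacked = stacked.reshape(-1, stacked.shape[-1])  # (12, 10): one row per classifier/class pair
print(stacked.T.shape)                            # (10, 12): one row per event, as fed to the RF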
示例#26
0
def get_model(data, target, use_ensemble=True):

    params1 = {
        'el__alpha': np.logspace(-5, 2, 30),
        'el__l1_ratio': np.linspace(0, 1, 3),
        'pca__n_components': [2, 5, 10]
    }

    params2 = {
        'rf__n_estimators': range(10, 101, 30),
        'rf__max_depth': [2, 5, 9],
        'pca__n_components': [2, 5, 10]
    }

    params3 = {
        'lgb__learning_rate': np.logspace(-6, 0, 5),
        'lgb__n_estimators': range(10, 101, 30),
        'lgb__max_depth': [6, 9, 12],
        'pca__n_components': [2, 5, 10],
        'lgb__num_leaves': [100]
    }

    rf = Pipeline([('scale', StandardScaler()), ('pca', PCA()),
                   ('rf', RandomForestRegressor())])
    el = Pipeline([('scale', StandardScaler()), ('pca', PCA()),
                   ('el', ElasticNet(max_iter=5000))])
    lgb = Pipeline([('scale', StandardScaler()), ('pca', PCA()),
                    ('lgb', LGBMRegressor())])

    gr_lgb = GridSearchCV(lgb,
                          params3,
                          cv=TimeSeriesSplit(),
                          scoring='neg_mean_squared_error',
                          refit=True)
    gr_lgb.fit(data, target)
    logger.info('Booster params discovered')

    gr_el = GridSearchCV(el,
                         params1,
                         cv=TimeSeriesSplit(),
                         scoring='neg_mean_squared_error',
                         refit=True)
    gr_el.fit(data, target)
    logger.info('ElasticNet params discovered')

    gr_rf = GridSearchCV(rf,
                         params2,
                         cv=TimeSeriesSplit(),
                         scoring='neg_mean_squared_error',
                         refit=True)
    gr_rf.fit(data, target)
    logger.info('RandomForest params discovered')

    res_scores = {
        'elastic': gr_el.best_score_,
        'random_forest': gr_rf.best_score_,
        'lgbm': gr_lgb.best_score_
    }

    res_est = {
        'elastic': gr_el.best_estimator_,
        'random_forest': gr_rf.best_estimator_,
        'lgbm': gr_lgb.best_estimator_
    }
    if use_ensemble:
        estimators = [('elastic', gr_el.best_estimator_),
                      ('random_forest', gr_rf.best_estimator_),
                      ('lgbm', gr_lgb.best_estimator_)]

        stacked = StackingRegressor(estimators=estimators,
                                    final_estimator=RandomForestRegressor(
                                        n_estimators=100, max_depth=3),
                                    passthrough=True)
        stacked.fit(data, target)
        logger.info('Ensemble fitted')
        return stacked
    return res_est[sorted(res_scores, key=lambda x: (-res_scores[x], x))[0]]
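
When use_ensemble is False, the final line returns the estimator with the best (highest, i.e. least negative) neg-MSE score, breaking ties alphabetically. A worked toy example of the sort key (the scores are assumptions for illustration):

res_scores = {'elastic': -3.2, 'random_forest': -2.8, 'lgbm': -2.8}
best = sorted(res_scores, key=lambda x: (-res_scores[x], x))[0]
print(best)  # 'lgbm': tied with random_forest on score, wins the alphabetical tie-break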
示例#27
0
params = [{
    'Switcher__estimator': [RandomForestRegressor()],
    'preprocess__p_text__TFIDF__ngram_range': [(1, 1)],
    'preprocess__p_text__tSVD__n_components': [8, 9, 10],
    'Switcher__estimator__n_estimators': [30, 40],
    'Switcher__estimator__max_depth': [8, 9]
}, {
    'Switcher__estimator': [GradientBoostingRegressor()],
    'preprocess__p_text__TFIDF__ngram_range': [(1, 1)],
    'preprocess__p_text__tSVD__n_components': [8, 9, 10],
    'Switcher__estimator__n_estimators': [30, 40],
    'Switcher__estimator__max_depth': [8, 9],
    'Switcher__estimator__learning_rate': np.logspace(-2, 1, 4)
}]
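
The Switcher__estimator entries in the grids above swap whole estimators in and out of the pipeline during the search; the Switcher step itself is not shown. A minimal sketch of one common design for such a wrapper (the class body below is an assumption, not the original implementation):

from sklearn.base import BaseEstimator, RegressorMixin

class Switcher(BaseEstimator, RegressorMixin):
    # Assumed design: a thin wrapper whose 'estimator' parameter can be swapped
    # by GridSearchCV via the 'Switcher__estimator' key, with nested keys such
    # as 'Switcher__estimator__n_estimators' reaching the wrapped estimator.
    def __init__(self, estimator=None):
        self.estimator = estimator

    def fit(self, X, y=None):
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        return self.estimator.predict(X)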

tscv = TimeSeriesSplit(n_splits=5)
regr = GridSearchCV(p_tot, param_grid=params, cv=tscv, scoring='r2')

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                Y,
                                                test_size=0.25,
                                                shuffle=False)
regr.fit(Xtrain, Ytrain)

print('best params: ', regr.best_params_)
print('best score: ', regr.best_score_)

with open('./pickled_models/RF_all_property_sold_price.pkl', 'wb') as f:
    pickle.dump(regr, f)

Ypred = regr.predict(Xtest)
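
Because refit is enabled (GridSearchCV's default), the pickled search object is ready for inference after reloading; a short sketch using the same path as above:

import pickle

# predict() on the reloaded object delegates to the best estimator found in the search
with open('./pickled_models/RF_all_property_sold_price.pkl', 'rb') as f:
    regr_loaded = pickle.load(f)
Ypred_loaded = regr_loaded.predict(Xtest)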
示例#28
0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

X = np.loadtxt('../datasets/X.csv', delimiter=',')
y = np.loadtxt('../datasets/y.csv', delimiter=',')
'''
INSTRUCTIONS

*   Import TimeSeriesSplit from sklearn.model_selection.
*   Instantiate a time series cross-validation iterator with 10 splits.
*   Iterate through CV splits. On each iteration, visualize the values of the input data that would be used to train the model for that iteration.
'''

# Import TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit

# Create time-series cross-validation object
cv = TimeSeriesSplit(n_splits=10)

# Iterate through CV splits
fig, ax = plt.subplots()

for ii, (tr, tt) in enumerate(cv.split(X, y)):
    # Plot the training data on each iteration, to see the behavior of the CV
    ax.plot(tr, ii + y[tr])

ax.set(title='Training data on each CV iteration', ylabel='CV iteration')

plt.show()
示例#29
0
    return pywt.waverec(coeff, wavelet, mode='per')


# Get training, testing datasets
df_reg = df.drop("GWP", axis=1)
X = df_reg.drop("Discount_off", axis=1)
X = df_reg[["date_delta"]]  # overrides the line above; only date_delta is used as a feature
Y = df_reg['Discount_off']
# denoise discount using wavelet transform
#Y = pd.Series(denoise_signal(Y))
#X_train, X_test, Y_train, Y_test =  train_test_split(X, Y, train_size = 1-365/len(df), shuffle = False)
X_train = load('Data/regression_train_X.npy', allow_pickle=True)
X_test = load('Data/regression_test_X.npy', allow_pickle=True)
Y_train = load('Data/regression_train_y.npy', allow_pickle=True)
Y_test = load('Data/regression_test_y.npy', allow_pickle=True)
time_split = TimeSeriesSplit(n_splits=10)

## train SVM
regressors = [
    svm.SVR(),
    #        linear_model.SGDRegressor(),
    linear_model.BayesianRidge(),
    linear_model.LassoLars(),
    linear_model.ARDRegression(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.TheilSenRegressor(),
    linear_model.LinearRegression()
]

name = [
    'svm.SVR',
示例#30
0
    def cross_cwcf(self,
                   gamma_u,
                   C_u,
                   gamma_l,
                   C_l,
                   d_u=None,
                   d_l=None,
                   cv=3,
                   mu=0.6,
                   eta=10,
                   isplt=False):
        """Compute the cross-validated CWC (coverage-width criterion), PICP
        (prediction-interval coverage probability) and PINEW (prediction-interval width).
        cv: number of folds
        mu: target confidence level for the prediction interval
        eta: penalty factor expressing the algorithm's preference: increasing
            eta favors PICP; decreasing eta favors PINEW
        isplt: whether to plot each fold
        """
        tscv = TimeSeriesSplit(n_splits=cv)
        cwc_l = []
        picp_l = []
        pinew_l = []
        i_cv = 0
        plot_num = cv * 100 + 3 * 10  # cv-row by 3-column subplot grid (e.g. 330 when cv=3)
        for tridx, teidx in tscv.split(self.cv_t):
            cv_tr_u, cv_te_u = self.cv_u[tridx], self.cv_u[teidx]
            cv_tr_l, cv_te_l = self.cv_l[tridx], self.cv_l[teidx]
            cv_tr_t, cv_te_t = self.cv_t[tridx], self.cv_t[teidx]

            svr_u = svm.SVR(gamma=gamma_u, C=C_u)
            svr_l = svm.SVR(gamma=gamma_l, C=C_l)
            pu, pl, d = self.svm_predict(cv_tr_u, cv_te_u, cv_tr_l, cv_te_l,
                                         svr_u, svr_l, d_u, d_l)

            if isplt:
                plt.figure(figsize=(15, 6))
                plt.subplot(plot_num + 1)
                plt.title('cv train %d' % (i_cv))
                plt.plot(cv_tr_u, color='blue', marker='o')
                plt.plot(cv_tr_l, color='blue', marker='o')
                plt.plot(cv_tr_t, color='gray', marker='x')
                plt.subplot(plot_num + 2)
                plt.title('cv test %d' % (i_cv))
                plt.plot(cv_te_u, color='blue', marker='o')
                plt.plot(cv_te_l, color='blue', marker='o')
                plt.plot(cv_te_t, color='gray', marker='x')
                plt.subplot(plot_num + 3)
                plt.title('cv predict %d' % (i_cv))
                plt.plot(pu, color='blue', marker='o')
                plt.plot(pl, color='blue', marker='o')
                plt.plot(cv_te_t[d:], color='gray', marker='x')
            i_cv += 1
            plot_num += 3

            picp = picpf(cv_te_t[d:], pu, pl)
            picp_l.append(picp)
            pinew = pinewf(pu, pl)
            pinew_l.append(pinew)
            cwc = cwcf(picp, pinew, mu, eta)
            cwc_l.append(cwc)
        return np.array(cwc_l).mean(), np.array(picp_l).mean(), np.array(
            pinew_l).mean()
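
picpf, pinewf and cwcf are the author's helpers and are not shown. A hedged sketch of one common formulation of these interval metrics (the coverage-width criterion of Khosravi et al.), offered as an assumption rather than the author's exact definitions:

import numpy as np

def picp_sketch(t, upper, lower):
    # Fraction of true values covered by the predicted interval.
    return np.mean((t >= lower) & (t <= upper))

def pinew_sketch(upper, lower):
    # Mean width of the predicted interval.
    return np.mean(upper - lower)

def cwc_sketch(picp, pinew, mu, eta):
    # Width, inflated exponentially whenever coverage falls below the target mu;
    # larger eta punishes under-coverage more heavily, matching the docstring.
    penalty = np.exp(-eta * (picp - mu)) if picp < mu else 0.0
    return pinew * (1.0 + penalty)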