Example #1
class MIKernelSVR(MIKernelSVM):

    def __init__(self, **parameters):
        svr_params = {
            'kernel' : 'precomputed',
            'max_iter': MAX_ITERS,
        }
        if 'C' in parameters:
            svr_params['C'] = parameters.pop('C')
        if 'nu' in parameters:
            svr_params['nu'] = parameters.pop('nu')
        self.estimator = NuSVR(**svr_params)

        # Get kernel name and pass remaining parameters to kernel
        mi_kernel_name = parameters.pop('kernel')
        self.mi_kernel = kernel.by_name(mi_kernel_name, **parameters)

    def fit(self, X, y):
        X = list(map(np.asarray, X))  # materialize so the data can be reused
        self.fit_data = X
        self.gram_matrix = self.mi_kernel(X, X)
        self.estimator.fit(self.gram_matrix, y)
        return self

    def predict(self, X=None):
        if X is None:
            gram_matrix = self.gram_matrix
        else:
            X = list(map(np.asarray, X))
            gram_matrix = self.mi_kernel(X, self.fit_data)
        return self.estimator.predict(gram_matrix)
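
For context, a minimal sketch of the precomputed-kernel pattern this class wraps (an illustration, not part of the original repo; a plain linear kernel stands in for the multiple-instance kernel):

import numpy as np
from sklearn.svm import NuSVR

rng = np.random.RandomState(0)
X_train, X_test = rng.randn(20, 5), rng.randn(5, 5)
y_train = rng.randn(20)

est = NuSVR(kernel='precomputed')
est.fit(X_train @ X_train.T, y_train)      # Gram matrix: kernel(train, train)
pred = est.predict(X_test @ X_train.T)     # rows: test samples, columns: train samples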
Example #2

    def train(self, x, y, param_names, random_search=100,
              kernel_cache_size=2000, **kwargs):
        if self._debug:
            print "Before preprocessing: 1st sample:\n", x[0]
        start = time.time()
        scaled_x = self._set_and_preprocess(x=x, param_names=param_names)

        # Check that each input is between 0 and 1
        self._check_scaling(scaled_x=scaled_x)

        if self._debug:
            print("Shape of training data: ", scaled_x.shape)
            print("Param names: ", self._used_param_names)
            print("First training sample\n", scaled_x[0])
            print("Encode: ", self._encode)

        # Do a random search
        nu, c, gamma = self._random_search(random_iter=random_search, x=scaled_x,
                                           y=y, kernel_cache_size=kernel_cache_size)

        # Now train model
        try:
            nusvr = NuSVR(gamma=gamma, C=c, nu=nu, random_state=self._rng,
                          cache_size=kernel_cache_size)
            nusvr.fit(scaled_x, y)
            self._model = nusvr
        except Exception as e:
            print("Training failed", e)
            self._model = None
Example #3
    def fit(self, X, Y, W):
        clf = NuSVR(nu=self.nu, C=self.C, kernel=self.kernel, degree=self.degree,
                    gamma=self.gamma, coef0=self.coef0, shrinking=self.shrinking,
                    tol=self.tol, cache_size=self.cache_size,
                    max_iter=self.max_iter)
        if W is not None:
            # the third positional argument to NuSVR.fit is sample_weight
            return NuSVRClassifier(clf.fit(X, Y.reshape(-1), W.reshape(-1)))
        return NuSVRClassifier(clf.fit(X, Y.reshape(-1)))
Example #4
def traindt(x, y):
    global clf

    #print "training surrogate"
    #clft = DecisionTreeRegressor(max_depth=tree_max_depth,splitter='random')
    #clft = RandomForestRegressor()
    #clft = GradientBoostingRegressor(loss='lad',n_estimators=50,learning_rate=0.3,max_depth=2)
    clft = NuSVR(C=1e6)
    clf = clft.fit(x, y)
Example #5

    def _random_search(self, random_iter, x, y, kernel_cache_size):
        # Default Values
        c = 1.0
        gamma = 0.0
        nu = 0.5
        best_score = -sys.maxsize

        if random_iter > 0:
            sys.stdout.write("Do a random search %d times" % random_iter)
            param_dist = {"C": numpy.power(2.0, range(-5, 16)),
                          "gamma": numpy.power(2.0, range(-15, 4)),
                          "nu": uniform(loc=0.0001, scale=1-0.0001)}
            param_list = [{"C": c, "gamma": gamma, "nu": nu}, ]
            param_list.extend(list(ParameterSampler(param_dist,
                                                    n_iter=random_iter-1,
                                                    random_state=self._rng)))
            for idx, d in enumerate(param_list):
                nusvr = NuSVR(kernel='rbf',
                              gamma=d['gamma'],
                              C=d['C'],
                              nu=d['nu'],
                              random_state=self._rng,
                              cache_size=kernel_cache_size)
                train_x, test_x, train_y, test_y = \
                    train_test_split(x, y, test_size=0.5, random_state=self._rng)
                self._check_scaling(scaled_x=train_x)
                nusvr.fit(train_x, train_y)
                sc = nusvr.score(test_x, test_y)
                # Tiny output
                m = "."
                if idx % 10 == 0:
                    m = "#"
                if sc > best_score:
                    m = "<"
                    best_score = sc
                    c = d['C']
                    gamma = d['gamma']
                    nu = d['nu']
                sys.stdout.write(m)
                sys.stdout.flush()
            sys.stdout.write("Using C: %f, nu: %f and Gamma: %f\n" %
                             (c, nu, gamma))
        return nu, c, gamma
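
A rough modern equivalent of the manual loop above, assuming scikit-learn's RandomizedSearchCV is acceptable (an illustrative sketch, not from the original code):

import numpy as np
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import NuSVR

param_dist = {"C": np.power(2.0, np.arange(-5, 16)),
              "gamma": np.power(2.0, np.arange(-15, 4)),
              "nu": uniform(loc=0.0001, scale=1 - 0.0001)}
search = RandomizedSearchCV(NuSVR(kernel='rbf'), param_dist, n_iter=100,
                            random_state=0)
# search.fit(x, y); search.best_params_ then holds the chosen C, gamma and nu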
Example #7
# kernels
Kx = K[testIdx][:, trainIdx]
Kv = K[valIdx][:, trainIdx]
Kt = K[trainIdx][:, trainIdx]
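# For a precomputed kernel, rows index the samples being scored and
# columns index the training samples, hence the [rowIdx][:, trainIdx] slicing.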

#n = len(trainIdx)
#nv = len(valIdx)
#nx = len(testIdx)

# Train Support Vector Regression
# C = 10.^(-2:1:2);
C = [0.1]
for c in C:
    print("C = %f" % c)
    tic = time.time()
    svr = NuSVR(C=c, kernel='precomputed')
    svr.fit(Kt, trainLabels)
    toc = time.time()
    print("train cost %f s" % (toc - tic))
    trainScores = svr.predict(Kt)
    mseTrain = np.mean((trainLabels - trainScores)**2)
    valScores = svr.predict(Kv)
    mseVal = np.mean((valLabels - valScores)**2)
    testScores = svr.predict(Kx)
    mseTest = np.mean((testLabels - testScores)**2)
    print('Train MSE : %g' % mseTrain)
    print('val MSE : %g' % mseVal)
    print('Test MSE : %g' % mseTest)

    # use all samples to train
    svr = NuSVR(C=c, kernel='precomputed')
Example #8
def run(seed):

    # create folders for scores models and preds
    folder_models = './models/domain2_var1/scores/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)

    folder_preds = './predicts/domain2_var1/scores/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Loading data...')

    # load biases
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')

    # load classifier output and add extra site2 ids
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')

    # load competition data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')

    ids_df = ids_df.append(extra_site)
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())

    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')

    pca_df = pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_df = pd.concat((pca_df, part), axis=1)

    # merge data
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])
    pca0_cols = [c for c in pca_cols if 'k0' in c]

    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')

    del loading_df, fnc_df, agg_df, im_df, pca_df
    gc.collect()

    # split train and test
    df.loc[df['Id'].isin(labels_df['Id']), 'is_test'] = 0
    df.loc[~df['Id'].isin(labels_df['Id']), 'is_test'] = 1

    train = df.query('is_test==0')
    del train['is_test']
    test = df.query('is_test==1')
    del test['is_test']
    y = train['domain2_var1'].copy().reset_index(drop=True)
    d21_index = list(train['domain2_var1'].dropna().index)

    # apply biases
    for c in ic_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += ic_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += ic_bias_site[c]

    for c in fnc_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += fnc_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += fnc_bias_site[c]

    for c in pca_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += pca_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += pca_bias_site[c]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # I. Create fnc score
    print('Creating FNC score...')

    # prepare datasets for fnc score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, fnc_cols)

    # define models
    names = ['ENet', 'BRidge']
    names = [name + '_fnc_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
        BayesianRidge()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)

    # save oof, pred, models
    np.save(folder_preds + 'fnc_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'fnc_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # II. Create agg score
    print('Creating AGG score...')

    # prepare datasets for agg score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, agg_cols)

    # define models
    names = ['RGF', 'ENet', 'Huber']
    names = [name + '_agg_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
        HuberRegressor(epsilon=2.5, alpha=1)
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)

    # save oof, pred, models
    np.save(folder_preds + 'agg_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'agg_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # III. Create pca score
    print('Creating PCA score...')

    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, pca_cols)

    # define models
    names = ['ENet', 'BRidge', 'OMP']
    names = [name + '_pca_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)

    # save oof, pred, models
    np.save(folder_preds + 'pca_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'pca_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # IV. Create im score
    print('Creating IM score...')

    # prepare datasets for im score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, im_cols)

    # define models
    names = ['ENet', 'BRidge', 'OMP']
    names = [name + '_im_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)

    # save oof, pred, models
    np.save(folder_preds + 'im_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'im_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # V. Create dl score
    print('Creating DL score...')

    # prepare datasets for dl score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, dl_cols)

    # define models
    names = ['ENet', 'BRidge', 'OMP']
    names = [name + '_dl_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]

    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)

    # save oof, pred, models
    np.save(folder_preds + 'dl_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'dl_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # VI. Training and predicting procedure
    print('Training has started...')

    # add scores
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train.loc[d21_index, prefix + '_score'] = np.load(
            folder_preds + '{}_score_seed{}.npy'.format(prefix, seed))
        test.loc[:, prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # create different datasets
    # linear
    linear_cols = sorted(
        list(set(ic_cols + fnc_cols + pca0_cols) - set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)

    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca0_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_factor=0.2,
                                                  scale_cols=pca0_cols,
                                                  sc=StandardScaler())

    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)

    # learning process on different datasets
    names = ['GP', 'SVM1', 'SVM2', 'Lasso', 'BgR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        GaussianProcessRegressor(DotProduct(), random_state=0),
        NuSVR(C=3, kernel='rbf'),
        NuSVR(C=3, kernel='rbf'),
        Lasso(alpha=0.1, random_state=0),
        BaggingRegressor(Ridge(alpha=1),
                         n_estimators=100,
                         max_samples=0.2,
                         max_features=0.2,
                         random_state=0)
    ]

    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2,
                        names,
                        is_blend=True)

    # rewrite folders for models and preds
    folder_models = './models/domain2_var1/stack/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)

    folder_preds = './predicts/domain2_var1/stack/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)

    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)

    # stacking predictions
    print('Stacking predictions...')
    d21_prediction = pd.DataFrame()
    d21_prediction['Id'] = test['Id'].values
    d21_prediction['pred'] = preds
    d21_prediction.to_csv(folder_preds +
                          'domain2_var1_stack_seed{}.csv'.format(seed),
                          index=False)
    print('domain2_var1 seed pred is saved as',
          folder_preds + 'domain2_var1_stack_seed{}.csv'.format(seed))
Example #9
def train_svr_cpu(X, Y, X_eval, c, kernel='linear', nu=0.5):
    svr = NuSVR(kernel=kernel, C=c, max_iter=100000, nu=nu, gamma='auto')
    svr.fit(X, Y)
    y_pred = svr.predict(X_eval)
    return y_pred
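
A hypothetical call, with the assumed array shapes noted (X, Y and X_eval are not defined in this excerpt):

# X: (n_samples, n_features), Y: (n_samples,), X_eval: (n_eval, n_features)
# y_pred = train_svr_cpu(X, Y, X_eval, c=1.0, kernel='rbf', nu=0.5)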
Example #10
def regress_NuSVR(X_train, X_test, y_train, y_test, C1, nu1):
    nusvr = NuSVR(nu=nu1, C=C1, kernel='rbf', gamma=0.0001, tol=0.001)
    regr_nusvr = prep_process(nusvr, X_train, X_test, y_train, y_test)
    return (regr_nusvr[0], regr_nusvr[1])
Example #11
    def fit(self, xtrain, ytrain, info, learn_hyper=True):

        # prepare training data
        xtrain_data = self.prepare_data(info)
        y_train = np.array(ytrain)

        # learn hyperparameters of the extrapolator by cross validation
        if self.best_hyper is None or learn_hyper:
            # specify model hyper-parameters
            if self.model_name == 'svr':
                C = loguniform(1e-5, 10, self.n_hypers)
                nu = np.random.uniform(0, 1, self.n_hypers)
                gamma = loguniform(1e-5, 10, self.n_hypers)
                hyper = np.vstack([C, nu, gamma]).T
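                # each row of hyper is one (C, nu, gamma) candidate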

            elif self.model_name == 'blr':
                alpha_1 = np.random.uniform(1e-7, 1e-5, self.n_hypers)
                alpha_2 = np.random.uniform(1e-7, 1e-5, self.n_hypers)
                lambda_1 = np.random.uniform(1e-7, 1e-5, self.n_hypers)
                lambda_2 = np.random.uniform(1e-7, 1e-5, self.n_hypers)
                hyper = np.vstack([alpha_1, alpha_2, lambda_1, lambda_2]).T

            elif self.model_name == 'rf':
                n_trees = np.random.randint(10, 800, self.n_hypers)
                frac_feature = np.random.uniform(0.1, 0.5, self.n_hypers)
                hyper = np.vstack([n_trees, frac_feature]).T

            print(f'start CV on {self.model_name}')
            mean_score_list = []
            t_start = time.time()
            for i in range(self.n_hypers):
                # define model
                if self.model_name == 'svr':
                    model = NuSVR(C=hyper[i, 0],
                                  nu=hyper[i, 1],
                                  gamma=hyper[i, 2],
                                  kernel='rbf')

                elif self.model_name == 'blr':
                    model = BayesianRidge(alpha_1=hyper[i, 0],
                                          alpha_2=hyper[i, 1],
                                          lambda_1=hyper[i, 2],
                                          lambda_2=hyper[i, 3])

                elif self.model_name == 'rf':
                    model = RandomForestRegressor(n_estimators=int(hyper[i,
                                                                         0]),
                                                  max_features=hyper[i, 1])

                # perform cross validation to learn the best hyper value
                scores = cross_val_score(model, xtrain_data, y_train, cv=3)
                mean_scores = np.mean(scores)
                mean_score_list.append(mean_scores)

            t_end = time.time()
            best_hyper_idx = np.argmax(mean_score_list)
            best_hyper = hyper[best_hyper_idx]
            max_score = np.max(mean_score_list)
            time_taken = t_end - t_start
            print(
                f'{self.model_name}: '
                f'best_hyper={best_hyper}, score={max_score}, time={time_taken}'
            )
            self.best_hyper = best_hyper

        # fit the extrapolator with the best hyperparameters to the training data
        if self.model_name == 'svr':
            best_model = NuSVR(C=self.best_hyper[0],
                               nu=self.best_hyper[1],
                               gamma=self.best_hyper[2],
                               kernel='rbf')

        elif self.model_name == 'blr':
            best_model = BayesianRidge(alpha_1=self.best_hyper[0],
                                       alpha_2=self.best_hyper[1],
                                       lambda_1=self.best_hyper[2],
                                       lambda_2=self.best_hyper[3])

        elif self.model_name == 'rf':
            best_model = RandomForestRegressor(n_estimators=int(
                self.best_hyper[0]),
                                               max_features=self.best_hyper[1])

        best_model.fit(xtrain_data, y_train)
        self.best_model = best_model
Example #12
def runTcheby():
    global param, approx_pareto_front, archiveOK, NO_FILE_TO_WRITE

    ############################################################################
    # PARAMETER

    # clf = SVR(C=1.0, epsilon=0.1, kernel="rbf")
    clf = NuSVR()
    clf2 = -1
    two_models_bool = False

    isReals = True
    start_fct, nb_functions = param[0:2]
    nb_iterations, neighboring_size = param[2:4]
    init_decisions, problem_size = param[4:6]
    max_decisions_maj, delta_neighbourhood = param[6:8]
    CR, search_space = param[8:10]
    F, distrib_index_n = param[10:12]
    pm, operator_fct = param[12:14]
    nb_samples, training_neighborhood_size = param[14:16]
    strategy, file_to_write = param[16:18]
    filter_strat, free_eval = param[18:20]
    param_print_every, file_to_writeR2 = param[20:22]
    filenameDIR, filenameSCORE = param[22:24]

    nb_objectives = len(start_fct)

    # get the offspring operator functions separately
    crossover_fct, mutation_fct, repair_fct = operator_fct

    best_decisions = copy.deepcopy(init_decisions)

    sampling_param = [
        crossover_fct,
        mutation_fct,
        repair_fct,
        best_decisions,
        F,
        problem_size,
        CR,
        search_space,
        distrib_index_n,
        pm,
    ]

    ############################################################################
    # INITIALISATION

    qual_tools.resetGlobalVariables(filenameDIR, filenameSCORE, nb_iterations, nb_functions)

    eval_to.resetEval()

    # get the direction weights for the starting functions
    directions = dec.getDirections(nb_functions, nb_objectives)

    # init the neighboring constant
    nt.initNeighboringTab(nb_functions, neighboring_size, directions, nb_objectives)

    # give global visibility to best_decisions so the result is available at the end
    approx_pareto_front = best_decisions

    # initial best decisions scores
    best_decisions_scores = [eval_to.free_eval(start_fct, best_decisions[i], problem_size) for i in range(nb_functions)]

    pop_size = nb_functions

    # current optimal scores for both axes
    z_opt_scores = gt.getMinTabOf(best_decisions_scores)

    eval_to.initZstar(z_opt_scores)

    # get the first training part of the item we will learn on
    model_directions = train_to.getDirectionsTrainingMatrix(directions)

    # whether the data should be written to a file
    writeOK = False
    if file_to_write != NO_FILE_TO_WRITE:
        writeOK = True

    writeR2OK = False
    if file_to_writeR2 != NO_FILE_TO_WRITE:
        writeR2OK = True

    ############################################################################
    # MAIN ALGORITHM

    if writeOK:
        iot.printObjectives(file_to_write, eval_to.getNbEvals(), 0, best_decisions_scores, problem_size, nb_objectives)

    # ID list allowing a random pass through the directions in the main loop
    id_directions = [i for i in range(nb_functions)]

    # iterations loop
    for itera in range(nb_iterations):
        if not free_eval:
            # Update model
            training_inputs, training_outputs, training_set_size, training_scores = train_to.getTrainingSet(
                model_directions,
                best_decisions,
                best_decisions_scores,
                eval_to.getZstar_with_decal(),
                strategy,
                nb_functions,
                training_neighborhood_size,
            )

            clf.fit(training_inputs, training_outputs)

        """
        if(writeR2OK and not free_eval):
            training_inputs_tcheby      = eval_to.getManyTcheby(training_inputs, training_scores, eval_to.getZstar_with_decal(), training_set_size)

            random_index = numpy.arange(0,training_set_size)
            numpy.random.shuffle(random_index)
            n_folds = 10
            folds_sizes = (training_set_size // n_folds) * numpy.ones(n_folds, dtype=numpy.int)
            folds_sizes[:training_set_size % n_folds] += 1

            training_inputs_array = numpy.array(training_inputs)
            training_tcheby_array = numpy.array(training_inputs_tcheby)

            R2_cv = []
            MSE_cv = []
            MAE_cv = []
            MDAE_cv = []

            clfCV = NuSVR()

            current = 0
            for fold_size in folds_sizes:
                start, stop = current, current + fold_size
                mask = numpy.ones(training_set_size, dtype=bool)
                mask[start:stop] = 0
                current = stop

                clfCV.fit(training_inputs_array[random_index[mask]], training_tcheby_array[random_index[mask]])

                test_fold_tcheby = training_tcheby_array[random_index[start:stop]]
                test_fold_predict = clfCV.predict(training_inputs_array[random_index[start:stop]])

                R2_cv  .append(r2_score             (test_fold_tcheby, test_fold_predict))
                MSE_cv .append(mean_squared_error   (test_fold_tcheby, test_fold_predict))
                MAE_cv .append(mean_absolute_error  (test_fold_tcheby, test_fold_predict))
                MDAE_cv.append(median_absolute_error(test_fold_tcheby, test_fold_predict))

            R2 = clf.score(training_inputs, training_outputs)
            MSE_cv_mean = numpy.mean(MSE_cv)
            RMSE_cv_mean = math.sqrt(MSE_cv_mean)
            MAE_cv_mean = numpy.mean(MAE_cv)
            MDAE_cv_mean = numpy.mean(MDAE_cv)
            R2_cv_mean = numpy.mean(R2_cv)

            iot.printR2(file_to_writeR2, eval_to.getNbEvals(), itera,  R2, R2_cv_mean, MSE_cv_mean , MAE_cv_mean, MDAE_cv_mean, RMSE_cv_mean, problem_size, print_every=1)

        """

        # random pass through the directions
        random.shuffle(id_directions)

        # functions loop
        for f in id_directions:

            # get the indices of all neighbors within a certain distance of f, including f itself
            f_neighbors, current_neighbourhing_size = nt.getNeighborsOf(f, delta_neighbourhood)

            # get a list of offspring from the neighbors
            list_offspring = samp_to.extended_sampling(f, f_neighbors, sampling_param, nb_samples)

            # apply a filter on the offspring list and select the best one
            filter_param = [
                itera,
                f,
                clf,
                clf2,
                two_models_bool,
                f_neighbors,
                list_offspring,
                model_directions,
                start_fct,
                problem_size,
                eval_to.getZstar_with_decal(),
                best_decisions_scores,
                best_decisions,
                nb_objectives,
            ]
            best_candidate = filt_to.model_based_filtring(filter_strat, free_eval, filter_param)

            # evaluation of the newly made solution
            mix_scores = eval_to.eval(start_fct, best_candidate, problem_size)

            # update the z_star point
            has_changed = eval_to.min_update_Z_star(mix_scores, nb_objectives)

            # retraining of the model with the new z_star
            if has_changed and not free_eval:
                train_to.updateTrainingZstar(eval_to.getZstar_with_decal())
                training_outputs = train_to.retrainSet(
                    training_inputs, training_scores, eval_to.getZstar_with_decal(), training_set_size, nb_objectives
                )
                clf.fit(training_inputs, training_outputs)

            # boolean that is True if the offspring has been added to the archive
            added_to_S = False

            # count how many best decisions have been replaced by the new offspring
            cmpt_best_maj = 0

            # random pass through the neighbors list
            random.shuffle(f_neighbors)

            # loop over the neighbors list
            for j in f_neighbors:

                # stop if the maximum number of replacements is already reached
                if cmpt_best_maj >= max_decisions_maj:
                    break

                # compute g_tcheby
                # wj = (directions[0][j],directions[1][j])
                wj = [directions[obj][j] for obj in range(0, nb_objectives)]
                g_mix = eval_to.g_tcheby(wj, mix_scores, eval_to.getZstar_with_decal())
                g_best = eval_to.g_tcheby(wj, best_decisions_scores[j], eval_to.getZstar_with_decal())

                # if the new solution's g_tcheby is closer to the z_optimal point than the current best of function j
                if g_mix < g_best:
                    cmpt_best_maj += 1
                    best_decisions[j] = best_candidate
                    best_decisions_scores[j] = mix_scores

                    # if we manage the archive and the solution has not been added already
                    if archiveOK and not (added_to_S):
                        arch_to.archivePut(best_candidate, mix_scores)
                        added_to_S = True
        # print("Update", itera, "done.")

        # if managing the archive
        if archiveOK:
            arch_to.maintain_archive()

        # if writing the results to a file
        if writeOK:
            iot.printObjectives(
                file_to_write,
                eval_to.getNbEvals(),
                itera + 1,
                best_decisions_scores,
                problem_size,
                nb_objectives,
                print_every=param_print_every,
            )

        # graphic update
        # yield arch_to.getArchiveScore(), best_decisions_scores, itera+1, eval_to.getNbEvals(), eval_to.getZstar_with_decal(), pop_size, isReals
    if not free_eval and writeR2OK:
        qual_tools.computeQualityEvaluation()
        qual_tools.generateDiffPredFreeFile()
    return
Example #13
print('LinearSVR precision train: {}'.format(lsvr_score_train))
lsvr_score_test = lsvr.score(smr_test.feature_matrix, smr_test.labels)
print('LinearSVR precision test: {}'.format(lsvr_score_test))
print('')

nusvc = NuSVC()
print('NuSVC config:')
print(nusvc.get_params())
nusvc.fit(smr_train.feature_matrix, smr_train.labels)
nusvc_score_train = nusvc.score(smr_train.feature_matrix, smr_train.labels)
print('NuSVC precision train: {}'.format(nusvc_score_train))
nusvc_score_test = nusvc.score(smr_test.feature_matrix, smr_test.labels)
print('NuSVC precision test: {}'.format(nusvc_score_test))
print('')

nusvr = NuSVR()
print('NuSVR config:')
print(nusvr.get_params())
nusvr.fit(smr_train.feature_matrix, smr_train.labels)
nusvr_score_train = nusvr.score(smr_train.feature_matrix, smr_train.labels)
print('NuSVR precision train: {}'.format(nusvr_score_train))
nusvr_score_test = nusvr.score(smr_test.feature_matrix, smr_test.labels)
print('NuSVR precision test: {}'.format(nusvr_score_test))
print('')


dtc = DecisionTreeClassifier()
print('DecisionTreeClassifier config:')
print(dtc.get_params())
dtc.fit(smr_train.feature_matrix, smr_train.labels)
dtc_score_train = dtc.score(smr_train.feature_matrix, smr_train.labels)
Example #14
def nusvrtrain(x, y, pre_x):
    # datscater: project helper that presumably scales x and pre_x together
    x, pre_x = datscater(x, pre_x)
    clf = NuSVR(C=5.0).fit(x, y)
    pred = clf.predict(pre_x)
    return pred
Example #15
import numpy as np
import pickle
from build_database import flux_obj
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from matplotlib import pyplot as plt

with open('database_lat.pkl', 'rb') as f:
    db = pickle.load(f)


print(db.keys())


S = NuSVR(kernel='rbf')

X = []
Y = []
for k in db.keys():
#k = db.keys()[5]
#    print np.array(k)
    t = np.linspace(0,db[k].RES_FINT,db[k].NUM_T)
    #X = np.atleast_2d(t).T
    #Y = np.power(10,db[k].N)
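    # each row of inp is [k[0], k[3], t_i]: two database key parameters tiled over the time grid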
    inp = np.vstack([np.outer(np.array([k[0],k[3]]), np.ones(int(db[k].NUM_T))), t]).T
    X.extend(inp)
    Y.extend(np.power(10,db[k].N))
    #Y.extend(db[k].N)

Example #16
def func_model(X_train, y_train):
    '''
    Train one model on the training data.
    Input: X_train, y_train
    Output: the fitted regressor selected by the global `model` flag
    '''
    global model
    if model == 'XG':
        reg = XGBRegressor()
    elif model == 'RD':
        reg = RidgeCV(alphas=(0.1, 1.0, 10.0), fit_intercept=True, normalize=False, 
              scoring=None, cv=5, gcv_mode=None, store_cv_values=False)
    elif model == 'LS':
        reg = LassoCV(max_iter = 10**8)
    elif model == 'LLS':
        reg = LassoLarsCV()
    elif model == 'ADA':
        reg = AdaBoostRegressor()
    elif model == 'EN':
        reg = ElasticNetCV()
    elif model == 'DT':
        reg = DecisionTreeRegressor(criterion="mse", splitter="best", max_depth=None, 
            min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
            max_features=None, random_state=None, max_leaf_nodes=None, 
            min_impurity_decrease=0.0, min_impurity_split=None)
    elif model == 'SVR':
        reg = SVR()
    elif model == 'KN':
        reg = KNeighborsRegressor(n_neighbors=5, weights="uniform", algorithm="auto", 
            leaf_size=30, p=2, metric="minkowski", metric_params=None)
    elif model == 'BG':
        reg = BaggingRegressor(base_estimator=LassoCV(max_iter = 10**8), n_estimators=10, max_samples=1.0, 
            max_features=1.0, bootstrap=True, bootstrap_features=False, 
            oob_score=False, warm_start=True, random_state=None, verbose=0)
    elif model == 'GB':
        reg = GradientBoostingRegressor(loss="ls", learning_rate=0.1, n_estimators=100, 
            subsample=1.0, criterion="friedman_mse", min_samples_split=2, 
            min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, 
            min_impurity_decrease=0.0, min_impurity_split=None, init=None, 
            random_state=None, max_features=None, alpha=0.9, verbose=0, 
            max_leaf_nodes=None, warm_start=False, 
            validation_fraction=0.1, n_iter_no_change=None, tol=0.0001)
    elif model == 'ET':
        reg = ExtraTreesRegressor()
    elif model == 'RF':
        reg = RandomForestRegressor()
    elif model == 'ST':
        estimators = [
            ('ADA',AdaBoostRegressor()),
            ('LS',LassoCV(max_iter = 10**8)),
            ('LLS',LassoLarsCV()),
            ('RD',RidgeCV()),
            ('XG',XGBRegressor()),
            ('KN',KNeighborsRegressor())
        ]
        reg = StackingRegressor(estimators=estimators)
    elif model == 'NSVR':
        reg = NuSVR()
    elif model == 'ST2':
        estimators = [RidgeCV(), AdaBoostRegressor(), LassoCV(max_iter = 10**8), LassoLarsCV(), XGBRegressor(), KNeighborsRegressor(),ElasticNetCV()]
        reg = StackingCVRegressor(regressors = estimators, meta_regressor = LassoCV(max_iter = 10**8))
    elif model == 'LR':
        reg = LinearRegression()
    elif model == 'NN':
        reg = MLPRegressor(learning_rate = 'adaptive', max_iter = 1000)
    reg.fit(X_train,y_train)
    return reg
Example #17
df = test_regressor(ARDRegression(compute_score=True, copy_X=True), df)
# test_regressor(LogisticRegressionCV(cv=5)) - it's used for classification
df = test_regressor(SGDRegressor(), df)
df = test_regressor(PassiveAggressiveRegressor(), df)
df = test_regressor(RANSACRegressor(), df)
df = test_regressor(TheilSenRegressor(copy_X=True), df)
df = test_regressor(HuberRegressor(), df)
df = test_regressor(AdaBoostRegressor(n_estimators=1000), df)
df = test_regressor(BaggingRegressor(n_estimators=1000), df)
df = test_regressor(ExtraTreesRegressor(n_estimators=1000), df)
df = test_regressor(GradientBoostingRegressor(n_estimators=1000), df)
df = test_regressor(RandomForestRegressor(n_estimators=1000), df)
df = test_regressor(GaussianProcessRegressor(), df)
# df = test_regressor(IsotonicRegression(), df) - has errors
df = test_regressor(LinearSVR(), df)
df = test_regressor(NuSVR(), df)
df = test_regressor(SVR(), df)
df = test_regressor(XGBRegressor(n_estimators=1000), df)

df = test_regressor(lgb.LGBMRegressor(n_estimators=1000), df)
df = test_regressor(CatBoostRegressor(n_estimators=1000), df)
df = test_regressor(DecisionTreeRegressor(max_depth=3), df)
df = test_regressor(KNeighborsRegressor(), df)
# df = test_regressor(RadiusNeighborsRegressor(), df) - also has errors
df = test_regressor(DummyRegressor(), df)

df = test_regressor(
    StackingRegressor(regressors=[
        GradientBoostingRegressor(n_estimators=1000),
        HuberRegressor(),
        RidgeCV(cv=5),
Example #18


xgb_params = {'eta': 0.03,
              'max_depth': 9,
              'subsample': 0.85,
              'objective': 'reg:linear',
              'eval_metric': 'mae',
              'silent': True,
              'nthread': 4}
oof_xgb, prediction_xgb = train_model(X=X_train_scaled, X_test=X_test_scaled, params=xgb_params, model_type='xgb')



model = NuSVR(gamma='scale', nu=0.9, C=10.0, tol=0.01)
oof_svr, prediction_svr = train_model(X=X_train_scaled, X_test=X_test_scaled, params=None, model_type='sklearn', model=model)



model = NuSVR(gamma='scale', nu=0.7, tol=0.01, C=1.0)
oof_svr1, prediction_svr1 = train_model(X=X_train_scaled, X_test=X_test_scaled, params=None, model_type='sklearn', model=model)



params = {'loss_function':'MAE'}
oof_cat, prediction_cat = train_model(X=X_train_scaled, X_test=X_test_scaled, params=params, model_type='cat')

Example #19
class NuSVRScikitTest(unittest.TestCase):
    """
    Unit test class for testing scikit-learn converter.
    """
    @classmethod
    def setUpClass(self):
        """
        Set up the unit test by loading the dataset and training a model.
        """
        if not HAS_SKLEARN:
            return

        self.scikit_model = NuSVR(kernel='linear')
        self.data = load_boston()
        self.scikit_model.fit(self.data['data'], self.data['target'])

    def test_conversion_bad_inputs(self):
        # Error on converting an untrained model
        with self.assertRaises(TypeError):
            model = NuSVR()
            spec = scikit_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion.
        with self.assertRaises(TypeError):
            model = OneHotEncoder()
            spec = scikit_converter.convert(model, 'data', 'out')

    @pytest.mark.slow
    def test_evaluation_stress_test(self):
        self._test_evaluation(allow_slow = True)

    def test_evaluation(self):
        self._test_evaluation(allow_slow = False)


    def _test_evaluation(self, allow_slow):
        """
        Test that the same predictions are made
        """

        # Generate some smallish (some kernels take too long on anything else) random data
        x, y = [], []
        for _ in range(50):
            cur_x1, cur_x2 = random.gauss(2,3), random.gauss(-1,2)
            x.append([cur_x1, cur_x2])
            y.append( 1 + 2*cur_x1 + 3*cur_x2 )

        input_names = ['x1', 'x2']
        df = pd.DataFrame(x, columns=input_names)

        # Parameters to test
        kernel_parameters = [
            {}, {'kernel': 'rbf', 'gamma': 1.2},
            {'kernel': 'linear'},
            {'kernel': 'poly'}, {'kernel': 'poly', 'degree': 2},
            {'kernel': 'poly', 'gamma': 0.75},
            {'kernel': 'poly', 'degree': 0, 'gamma': 0.9, 'coef0': 2},
            {'kernel': 'sigmoid'}, {'kernel': 'sigmoid', 'gamma': 1.3},
            {'kernel': 'sigmoid', 'coef0': 0.8},
            {'kernel': 'sigmoid', 'coef0': 0.8, 'gamma': 0.5},
        ]
        non_kernel_parameters = [{}, {'C': 1}, {'C': 1.5, 'shrinking': True}, {'C': 0.5, 'shrinking': False, 'nu': 0.9}]

        # Test
        for param1 in non_kernel_parameters:
            for param2 in kernel_parameters:
                cur_params = param1.copy()
                cur_params.update(param2)

                cur_model = NuSVR(**cur_params)
                cur_model.fit(x, y)
                df['prediction'] = cur_model.predict(x)

                spec = scikit_converter.convert(cur_model, input_names, 'target')

                if is_macos() and macos_version() >= (10, 13):
                    metrics = evaluate_regressor(spec, df)
                    self.assertAlmostEquals(metrics['max_error'], 0)

                if not allow_slow:
                    break

            if not allow_slow:
                break
Example #20
    def __init__(self, task_type="linearsvc"):
        self.task_type = task_type
        assert self.task_type in {
            "linearsvc", "linearsvr", "nusvc", "nusvr", "oneclasssvm", "svc",
            "svr", "l1_min_c"
        }

        if self.task_type == "linearsvc":  # 线性支持向量分类
            self.model = LinearSVC(penalty='l2',
                                   loss='squared_hinge',
                                   dual=True,
                                   tol=1e-4,
                                   C=1.0,
                                   multi_class='ovr',
                                   fit_intercept=True,
                                   intercept_scaling=1,
                                   class_weight=None,
                                   verbose=0,
                                   random_state=None,
                                   max_iter=1000)

        elif self.task_type == "linearsvr":  # 线性支持向量回归
            self.model = LinearSVR(epsilon=0.0,
                                   tol=1e-4,
                                   C=1.0,
                                   loss='epsilon_insensitive',
                                   fit_intercept=True,
                                   intercept_scaling=1.,
                                   dual=True,
                                   verbose=0,
                                   random_state=None,
                                   max_iter=1000)

        elif self.task_type == "nusvc":  # Nu 支持向量分类
            self.model = NuSVC(nu=0.5,
                               kernel='rbf',
                               degree=3,
                               gamma='scale',
                               coef0=0.0,
                               shrinking=True,
                               probability=False,
                               tol=1e-3,
                               cache_size=200,
                               class_weight=None,
                               verbose=False,
                               max_iter=-1,
                               decision_function_shape='ovr',
                               break_ties=False,
                               random_state=None)

        elif self.task_type == "nusvr":  # Nu支持向量回归
            self.model = NuSVR(nu=0.5,
                               C=1.0,
                               kernel='rbf',
                               degree=3,
                               gamma='scale',
                               coef0=0.0,
                               shrinking=True,
                               tol=1e-3,
                               cache_size=200,
                               verbose=False,
                               max_iter=-1)

        elif self.task_type == "oneclasssvm":  # 无监督异常值检测
            self.model = OneClassSVM(kernel='rbf',
                                     degree=3,
                                     gamma='scale',
                                     coef0=0.0,
                                     tol=1e-3,
                                     nu=0.5,
                                     shrinking=True,
                                     cache_size=200,
                                     verbose=False,
                                     max_iter=-1)

        elif self.task_type == "svc":  # c支持向量分类
            self.model = SVC(C=1.0,
                             kernel='rbf',
                             degree=3,
                             gamma='scale',
                             coef0=0.0,
                             shrinking=True,
                             probability=False,
                             tol=1e-3,
                             cache_size=200,
                             class_weight=None,
                             verbose=False,
                             max_iter=-1,
                             decision_function_shape='ovr',
                             break_ties=False,
                             random_state=None)

        else:  # epsilon support vector regression
            self.model = SVR(kernel='rbf',
                             degree=3,
                             gamma='scale',
                             coef0=0.0,
                             tol=1e-3,
                             C=1.0,
                             epsilon=0.1,
                             shrinking=True,
                             cache_size=200,
                             verbose=False,
                             max_iter=-1)
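
A hypothetical usage sketch (the wrapper class's name is not shown in this excerpt, so SVMModel below is an assumed name):

# svm_task = SVMModel(task_type="nusvr")   # hypothetical class name
# svm_task.model.fit(X_train, y_train)
# y_pred = svm_task.model.predict(X_test)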
Example #21

# Parameters
depth = 60
horizon = 7

# Form feature and target vectors
featureVectors, targetVectors = util.formFeatureAndTargetVectorsMultiHorizon(correctedSeries, depth, horizon)


outputFolderName = "Outputs/Outputs" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
os.mkdir(outputFolderName)
for i in range(horizon):
    # Train different models for different horizon
    # Train the model
    #model = Pipeline([('poly', PolynomialFeatures(degree=2)), ('linear', LinearRegression(fit_intercept=False))])
    #model = NuSVR(kernel='linear', nu=1.0)
    model = NuSVR(kernel="rbf", nu=1.0, tol=1e-10, gamma=1.0)
    #model = RidgeCV()
    model.fit(featureVectors, targetVectors[:, i])

    predictedTargetVectors = model.predict(featureVectors)

    # Plot the actual and predicted
    actual = targetVectors[:, i]
    predicted = predictedTargetVectors

    # Descale
    actual = util.scalingFunction.inverse_transform(actual)
    predicted = util.scalingFunction.inverse_transform(predicted)

    outplot = outputPlot.OutputPlot(outputFolderName + "/Prediction_horizon"+str(i+1)+".html", "Facebook Fans Change - Linear Regression", "Taylor Swift", "Time", "Output")
    outplot.setXSeries(np.arange(1, targetVectors.shape[0]))
Example #22
# Normalize the training data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
print(X_train_scaled)

# Apply the model

#from sklearn.isotonic import IsotonicRegression
#from sklearn.linear_model import ElasticNet
#from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import svm
from sklearn.svm import NuSVR

model = NuSVR()

model.fit(X_train_scaled, y_train.values.flatten())
y_pred = model.predict(X_train_scaled)
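# y_pred here is the in-sample prediction on the scaled training data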

#plt.figure(figsize=(6, 6))
#plt.scatter(y_train.values, y_pred)
#plt.xlim(0, 20)
#plt.ylim(0, 20)
#plt.xlabel('actual', fontsize=12)
#plt.ylabel('predicted', fontsize=12)
#plt.plot([(0, 0), (20, 20)], [(0, 0), (20, 20)])
#plt.show()

plt.figure(figsize=(16, 8))
Example #23
def regress(X_train, y_train):
    # comment out any classifier that should not be used
    classifiers = [
        (SGDRegressor(), "SGDRegressor", 1 * global_data_scale),
        (LinearRegression(), "LinearRegression", 1 * global_data_scale),
        (Ridge(), "Ridge", 1 * global_data_scale),
        (Lasso(), "Lasso", 1 * global_data_scale),
        (ElasticNet(), "ElasticNet", 1 * global_data_scale),
        (Lars(), "Lars", 1 * global_data_scale),
        (OrthogonalMatchingPursuit(), "OrthogonalMatchingPursuit", 1 * global_data_scale),
        (BayesianRidge(), "BayesianRidge", 1 * global_data_scale),
        (ARDRegression(), "ARDRegression", 1 * global_data_scale),
        ### NOTE the scoring might be different of PassiveAggressiveRegressor
        (PassiveAggressiveRegressor(), "PassiveAggressiveRegressor", 1 * global_data_scale),
        ### NOTE the scoring might be different of RANSACRegressor
        (RANSACRegressor(), "RANSACRegressor", 1 * global_data_scale),
        (TheilSenRegressor(), "TheilSenRegressor", 1 * global_data_scale),
        (HuberRegressor(), "HuberRegressor", 1 * global_data_scale),
        (DecisionTreeRegressor(), "DecisionTreeRegressor", 1 * global_data_scale),
        (GaussianProcessRegressor(), "GaussianProcessRegressor", 1 * global_data_scale),
        (MLPRegressor(), "MLPRegressor", 1 * global_data_scale),
        (KNeighborsRegressor(), "KNeighborsRegressor", 1 * global_data_scale),
        (RadiusNeighborsRegressor(), "RadiusNeighborsRegressor", 1 * global_data_scale),
        (SVR(), "SVR", 1 * global_data_scale),
        (NuSVR(), "NuSVR", 1 * global_data_scale),
        (LinearSVR(), "LinearSVR", 1 * global_data_scale),
        (KernelRidge(), "KernalRidge", 1 * global_data_scale),
        (IsotonicRegression(), "IsotonicRegression", 1 * global_data_scale)
    ]

    # set the list of the values that should be used in grid search
    params_dict = {
        "SGDRegressor": {
            "penalty": ["l2", "l1"],
            "alpha": [.001, .0001, .00001],
            "l1_ratio": [.15, .2, .25],
            "fit_intercept": [True, False],
            "max_iter": [1000],
            "shuffle": [True, False],
            "epsilon": [.05, .1, .2],
            "learning_rate": ["constant", "optimal", "invscaling", "adaptive"],
            "eta0": [.005, .01, .02],
            "power_t": [.2, .25, .3]
        },
        "LinearRegression": {
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "Ridge": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "tol": [.01, .001, .0001],
            "solver": ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
        },
        "Lasso": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "positive": [True, False],
            "precompute": [True, False]
        },
        "ElasticNet": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "precompute": [True, False],
            "positive": [True, False],
            "selection": ["cyclic", "random"]
        },
        "Lars": {
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "precompute": [True, False],
            "n_nonzero_coefs": [np.inf]
        },
        "OrthogonalMatchingPursuit": {
            "n_nonzero_coefs": [np.inf, None],
            "precompute": [True, False],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "BayesianRidge": {
            "tol": [.01, .001, .0001],
            "alpha_1": [1e-5, 1e-6, 1e-7],
            "alpha_2": [1e-5, 1e-6, 1e-7],
            "lambda_1": [1e-5, 1e-6, 1e-7],
            "lambda_2": [1e-5, 1e-6, 1e-7],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "ARDRegression": {
            "tol": [.01, .001, .0001],
            "alpha_1": [1e-5, 1e-6, 1e-7],
            "alpha_2": [1e-5, 1e-6, 1e-7],
            "lambda_1": [1e-5, 1e-6, 1e-7],
            "lambda_2": [1e-5, 1e-6, 1e-7],
            "threshold_lambda": [1000, 10000, 100000],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "PassiveAggressiveRegressor": {
            "C": [.8, 1., 1.2 ],
            "tol": [1e-2, 1e-3, 1e-4],
            "n_iter_no_change": [3, 5, 8],
            "shuffle": [True, False],
            "average": [True, False]
        },
        "RANSACRegressor": {
            "base_estimator": [LinearRegression()]
        },
        "TheilSenRegressor": {
            "max_subpopulation": [1e3, 1e4, 1e5],
            "tol": [1e-2, 1e-3, 1e-4]
        },
        "HuberRegressor": {
            "epsilon": [1.1, 1.35,  1.5],
            "alpha": [1e-3, 1e-4, 1e-5],
            "warm_start": [True, False],
            "fit_intercept": [True, False],
            "": [1e-4, 1e-5, 1e-6]
        },
        "DecisionTreeRegressor": {
            "criterion": ["mse", "friedman_mse", "mae"],
            "splitter": ["best", "random"],
            "min_samples_split": [2, 3],
            "min_samples_leaf": [1, 2],
            "min_weight_fraction_leaf": [.0],
            "max_features": ["auto", "sqrt", "log2"],
            "min_impurity_split": [1e-6, 1e-7, 1e-8]
        },
        "GaussianProcessRegressor": {
            "alpha": [1e-8, 1e-10, 1e-12],
            "optimizer": ["fmin_l_bfgs_b"],
            "normalize_y": [True, False]
        },
        "MLPRegressor": {
            "hidden_layer_sizes": [(100,)],
            "activation": ["identity", "logistic", "tanh", "relu"],
            "solver": ["lbfgs", "sgd", "adam"],
            "alpha": [1e-3, 1e-4, 1e-5],
            # "learning_rate": ["constant", "invscaling", "adaptive"],
            # "learning_rate_init": [1e-2, 1e-3, 1e-4],
            # "power_t": [.3, .5, .8],
            # "shuffle": [True, False],
            # "tol": [1e-3, 1e-4, 1e-5],
            # "momentum": [.8, .9, .99],
            # "beta_1": [.8, .9, .99],
            # "beta_2": [.999],
            # "epsilon": [1e-7, 1e-8, 1e-9],
            # "n_iter_no_change": [10],
            # "max_fun": [15000]
        },
        "KNeighborsRegressor": {
            "n_neighbors": [20, 10, 5, 3],
            "weights": ["uniform", "distance"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "leaf_size": [20, 30, 40],
            "p": [1, 2]
        },
        "RadiusNeighborsRegressor": {
            "radius": [.8, 1, 1.2],
            "n_neighbors": [20, 10, 5, 3],
            "weights": ["uniform", "distance"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "leaf_size": [20, 30, 40],
            "p": [1, 2]
        },
        "SVR": {
            "kernel": ["poly", "rbf", "sigmoid"],
            "degree": [2, 3, 5],
            "gamma": ["scale", "auto"],
            "coef0": [.0],
            "tol": [1e-2, 1e-3, 1e-4],
            "C": [.8, .1, 1.2],
            "epsilon": [.08, .1, .12],
            "shrinking": [True, False],
            "max_iter": [-1]
        },
        "NuSVR": {
            "nu": [.2, .5, .8],
            "C": [.8, .1, 1.2],
            "kernel": ["poly", "rbf", "sigmoid"],
            "degree": [2, 3, 5],
            "gamma": ["scale", "auto"],
            "coef0": [.0],
            "shrinking": [True, False],
            "tol": [1e-2, 1e-3, 1e-4],
            "max_iter": [-1]
        },
        "LinearSVR": {
            "epsilon": [.0],
            "tol": [1e-3, 1e-4, 1e-5],
            "C": [.8, .1, 1.2],
            "fit_intercept": [True, False],
            "dual": [True, False],
            "intercept_scaling": [.8, 1., 1.2]
        },
        "KernelRidge": {
            "coef0": [.8, 1, 1.2],
            "degree": [2, 3, 5],
        },
        "IsotonicRegression": {
            "increasing": [True, False],
        }
    }

    for model, params, frac in classifiers:
        full = pd.DataFrame(X_train).join(pd.DataFrame(y_train))
        loan_data = full.sample(frac=frac, random_state=random_state)
        X = loan_data.drop("loan_status", axis=1)
        y = loan_data["loan_status"]
        grid = GridSearchCV(model, params_dict[params], verbose=verbose, cv=folds, n_jobs=workers)
        grid.fit(X, y)
        yield grid, params
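
# Consumption sketch (hypothetical; the generator above yields a fitted
# GridSearchCV together with its params_dict key):
# for grid, name in grids:
#     print(name, grid.best_params_, grid.best_score_)
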
def run_kernel(input_dir, verbose=False):
    if verbose:
        print(os.listdir(input_dir))

    train = pd.read_csv(
        input_dir / 'train.csv',
        dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

    if verbose:
        print(train.head())

        pd.options.display.precision = 15

        print(train.head())

    # Create a training file with simple derived features

    rows = 150_000
    segments = int(np.floor(train.shape[0] / rows))

    X_train = pd.DataFrame(index=range(segments), dtype=np.float64,
                           columns=['ave', 'std', 'max', 'min'])
    y_train = pd.DataFrame(index=range(segments), dtype=np.float64,
                           columns=['time_to_failure'])

    for segment in tqdm(range(segments)):
        seg = train.iloc[segment * rows:segment * rows + rows]
        x = seg['acoustic_data'].values
        y = seg['time_to_failure'].values[-1]

        y_train.loc[segment, 'time_to_failure'] = y

        X_train.loc[segment, 'ave'] = x.mean()
        X_train.loc[segment, 'std'] = x.std()
        X_train.loc[segment, 'max'] = x.max()
        X_train.loc[segment, 'min'] = x.min()

    if verbose:
        print(X_train.head())

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)

    svm = NuSVR()
    svm.fit(X_train_scaled, y_train.values.flatten())
    y_pred = svm.predict(X_train_scaled)

    if verbose:
        plt.figure(figsize=(6, 6))
        plt.scatter(y_train.values.flatten(), y_pred)
        plt.xlim(0, 20)
        plt.ylim(0, 20)
        plt.xlabel('actual', fontsize=12)
        plt.ylabel('predicted', fontsize=12)
        plt.plot([0, 20], [0, 20])
        plt.show()

    score = mean_absolute_error(y_train.values.flatten(), y_pred)

    if verbose:
        print(f'Score: {score:0.3f}')

    submission = pd.read_csv(
        input_dir / 'sample_submission.csv', index_col='seg_id')

    X_test = pd.DataFrame(columns=X_train.columns,
                          dtype=np.float64, index=submission.index)

    for seg_id in X_test.index:
        seg = pd.read_csv(input_dir / ('test/' + seg_id + '.csv'))

        x = seg['acoustic_data'].values

        X_test.loc[seg_id, 'ave'] = x.mean()
        X_test.loc[seg_id, 'std'] = x.std()
        X_test.loc[seg_id, 'max'] = x.max()
        X_test.loc[seg_id, 'min'] = x.min()

    X_test_scaled = scaler.transform(X_test)
    submission['time_to_failure'] = svm.predict(X_test_scaled)
    submission.to_csv('submission.csv')
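
# Usage sketch (assumes a Kaggle-style layout with train.csv,
# sample_submission.csv and a test/ directory under input_dir):
# from pathlib import Path
# run_kernel(Path('../input'), verbose=True)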
Пример #25
0
    model = SVR(C=100.0, gamma=0.1, cache_size=500)
    # Fit the training set
    model.fit(train_X, train_Y.values.ravel())
    # Print the model coefficients
    # print model.intercept_
    # print model.dual_coef_
    # Predict the test set
    test_Y_pred = model.predict(test_X)
    print "Test set MSE:", mean_squared_error(test_Y, test_Y_pred)
    print "Test set RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
    print "Total time:", time() - t, "seconds"

    print "\n********** Testing the NuSVR class **********"
    t = time()
    model = GridSearchCV(NuSVR(cache_size=1000),
                         param_grid={
                             "C": np.logspace(-3, 3, 7),
                             "nu": np.linspace(0.1, 1, 10),
                             "gamma": np.logspace(-3, 3, 7)
                         },
                         cv=5)
    model.fit(train_X, train_Y.values.ravel())
    print("最好的参数是:%s, 此时的得分是:%0.2f" % (model.best_params_, model.best_score_))

    model = NuSVR(C=100.0, nu=0.3, gamma=0.1, cache_size=500)
    # Fit the training set
    model.fit(train_X, train_Y.values.ravel())
    # Print the model coefficients
    # print model.intercept_
    # print model.dual_coef_
Пример #26
0
		medv_ids = DataFrame(kneighbors[1] + 1, columns = ["neighbor(" + str(x + 1) + ")" for x in range(regressor.n_neighbors)])
		medv = pandas.concat((medv, medv_ids), axis = 1)
	store_csv(medv, name)

if "Housing" in datasets:
	build_housing(AdaBoostRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), n_estimators = 17, random_state = 13), "AdaBoostHousing")
	build_housing(BayesianRidge(), "BayesianRidgeHousing")
	build_housing(GBDTLMRegressor(GradientBoostingRegressor(n_estimators = 31, random_state = 13), LinearRegression()), "GBDTLMHousing")
	build_housing(GBDTLMRegressor(XGBRFRegressor(n_estimators = 17, max_depth = 5, random_state = 13), SGDRegressor(penalty = "elasticnet", random_state = 13)), "XGBRFLMHousing")
	build_housing(HistGradientBoostingRegressor(max_iter = 31, random_state = 13), "HistGradientBoostingHousing")
	build_housing(KNeighborsRegressor(), "KNNHousing", with_kneighbors = True)
	build_housing(MLPRegressor(activation = "tanh", hidden_layer_sizes = (26,), solver = "lbfgs", tol = 0.001, max_iter = 1000, random_state = 13), "MLPHousing")
	build_housing(SGDRegressor(random_state = 13), "SGDHousing")
	build_housing(SVR(gamma = "auto"), "SVRHousing")
	build_housing(LinearSVR(random_state = 13), "LinearSVRHousing")
	build_housing(NuSVR(gamma = "auto"), "NuSVRHousing")
	build_housing(VotingRegressor([("dt", DecisionTreeRegressor(random_state = 13)), ("lr", LinearRegression())]), "VotingEnsembleHousing")

visit_X, visit_y = load_visit("Visit")

def build_visit(regressor, name):
	mapper = DataFrameMapper(
		[(["edlevel"], [CategoricalDomain(), OneHotEncoder()])] +
		[([bin_column], [CategoricalDomain(), OneHotEncoder()]) for bin_column in ["outwork", "female", "married", "kids", "self"]] +
		[(["age"], ContinuousDomain())] +
		[(["hhninc", "educ"], ContinuousDomain())]
	)
	pipeline = PMMLPipeline([
		("mapper", mapper),
		("regressor", regressor)
	])
Пример #27
0
    def objective(self, trial):
        if 'xgb' in str.lower(self.model_type):
            params = {
                'learning_rate':
                trial.suggest_uniform('learning_rate', 0.0001, 0.5),
                'max_depth':
                trial.suggest_int('max_depth', 1, 150),
                'min_child_weight':
                trial.suggest_int('min_child_weight', 1, 10),
                'colsample_bytree':
                trial.suggest_uniform('colsample_bytree', 0.4, 1.0),
                'colsample_bynode':
                trial.suggest_uniform('colsample_bynode', 0.4, 1.0),
                'subsample':
                trial.suggest_uniform('subsample', 0.4, 1.0),
                'gamma':
                trial.suggest_uniform('gamma', 0.01, 10),
                'reg_alpha':
                trial.suggest_uniform('reg_alpha', 0, 1.0)
            }
            model = xgb.XGBRegressor(objective="reg:squarederror",
                                     random_state=42)
        elif 'rf' in str.lower(self.model_type):
            params = {
                'max_depth':
                trial.suggest_int('max_depth', 1, 150),
                'max_features':
                trial.suggest_categorical(
                    'max_features',
                    ['auto', 'sqrt', 'log2', None, 0.8, 0.6, 0.4]),
                'min_samples_leaf':
                trial.suggest_int('min_samples_leaf', 1, 250),
                'min_samples_split':
                trial.suggest_int('min_samples_split', 2, 250),
            }
            model = RandomForestRegressor(n_estimators=500, random_state=42)
        elif str.lower(self.model_type) == 'svm':
            params = {
                'C':
                trial.suggest_uniform('C', 1e-4, 1e3),
                'kernel':
                trial.suggest_categorical(
                    'kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
                'gamma':
                trial.suggest_uniform('gamma', 1e-2, 10)
            }
            model = SVR(max_iter=1000000)
        elif str.lower(self.model_type) == 'nusvm':
            params = {
                'nu': trial.suggest_uniform('nu', 0.01, 0.99),
                'C': trial.suggest_uniform('C', 1e-4, 1e5),
                'gamma': trial.suggest_uniform('gamma', 1e-2, 10)
            }
            model = NuSVR(max_iter=1000000)
        elif 'mlp' in str.lower(self.model_type):
            n_layers = trial.suggest_int('n_layers', 1, 2)
            layers = []
            for i in range(n_layers):
                layers.append(
                    trial.suggest_int('n_units_l{}'.format(i), 3, 800))
            params = {
                'alpha': trial.suggest_loguniform('alpha', 1e-5, 1e-1),
            }

            model = MLPRegressor(hidden_layer_sizes=layers,
                                 max_iter=1000,
                                 early_stopping=True)

        return self.fit_model(model, params)
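
# Usage sketch (hypothetical: assumes fit_model cross-validates `model` with
# `params` and returns a score for Optuna to optimise):
# import optuna
# study = optuna.create_study(direction='minimize')
# study.optimize(tuner.objective, n_trials=50)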
Пример #28
0
class NuSvrClass:
    """
    Name      : NuSVR
    Attribute : None
    Method    : predict, predict_by_cv, save_model
    """
    def __init__(self):
        # Algorithm name
        self._name = 'nusvr'

        # Base path
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir))

        # Suppress warning messages
        warnings.filterwarnings('ignore')

        # Load the raw data
        data = pd.read_csv(self._f_path +
                           "/regression/resource/regression_sample.csv",
                           sep=",",
                           encoding="utf-8")

        # Split into training and test ranges
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)

        # Build the training data
        self._x_train, self._y_train = self.preprocessing(data[self._x])
        # Build the test data
        self._x_test, self._y_test = self.preprocessing(data[self._y])

        # Declare the model
        self._model = NuSVR(nu=0.5, cache_size=100)

        # Train the model
        self._model.fit(self._x_train, self._y_train)

    # Data preprocessing
    def preprocessing(self, data):
        # Features
        x = []
        # Labels
        y = []
        # Base interval (7 days)
        base_interval = 7
        # Temperatures
        temps = list(data["temperature"])

        for i in range(len(temps)):
            if i < base_interval:
                continue
            y.append(temps[i])

            xa = []

            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y
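    # Worked example: with base_interval = 7, temps[0:7] form the first
    # feature row and temps[7] is its label, i.e. each day's temperature is
    # predicted from the preceding seven days.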

    # Plain prediction
    def predict(self, save_img=False, show_chart=False):
        # Predict
        y_pred = self._model.predict(self._x_test)

        # Score
        score = r2_score(self._y_test, y_pred)

        # Report the fit
        if hasattr(self._model, 'coef_') and hasattr(self._model,
                                                     'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')

        print(f'Score = {score}')

        # Save the chart image if requested
        if save_img:
            self.save_chart_image(y_pred, show_chart)

        # Predictions & score
        return [list(y_pred), score]

    # CV prediction (Cross Validation)
    def predict_by_cv(self):
        # Implement cross validation for the regression model as the project requires
        return False

    # GridSearchCV prediction
    def predict_by_gs(self):
        pass

    # Save or refresh the model
    def save_model(self, renew=False):
        # Save the model
        if not renew:
            # First save
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # Archive the existing model, then replace it
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(
                    self._f_path + f'/model/{self._name}_rg.pkl',
                    self._f_path +
                    f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')

    # Save the regression chart
    def save_chart_image(self, data, show_chart):
        # Figure size
        plt.figure(figsize=(15, 10), dpi=100)

        # Actual values
        plt.plot(self._y_test, c='r')

        # Predicted values
        plt.plot(data, c='b')

        # Save as an image
        plt.savefig('./chart_images/tenki-kion-lr.png')

        # Show the chart (optional)
        if show_chart:
            plt.show()

    def __del__(self):
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
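
# Usage sketch (hypothetical): training happens inside __init__, so
# instantiating the class is enough before calling predict.
# model = NuSvrClass()
# predictions, score = model.predict(save_img=False, show_chart=False)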
Пример #29
0
 def __init__(self,
              features=None,
              fit_target=None,
              **kwargs):
     super().__init__(features=features, fit_target=fit_target)
     self.impl = NuSVR(**kwargs)
Пример #30
0
    def init_RM(self, **kwargs):
        """
        Initialisation of the Regression Model.
        Any parameter is passed to the Model.
        self.N_out RM can be needed if not ANN type. 
        They are stored in self.RMs list.
        """
        self.RMs = []
        self.train_params = {}
        if self.RM_type in ('SK_ANN', 'SK_ANN_Dis'):
            self.RMs = [MLPRegressor(random_state=self.random_seed, **kwargs)]
            self._multi_predic = True
        elif self.RM_type == 'SK_SVM':
            for i in range(self.N_out):
                self.RMs.append(SVR(**kwargs))
            self._multi_predic = False
        elif self.RM_type == 'SK_NuSVM':
            for i in range(self.N_out):
                self.RMs.append(NuSVR(**kwargs))
            self._multi_predic = False
        elif self.RM_type == 'SK_BR':
            for i in range(self.N_out):
                self.RMs.append(BayesianRidge(**kwargs))
            self._multi_predic = False
        elif self.RM_type == 'SK_AB':
            for i in range(self.N_out):
                self.RMs.append(
                    AdaBoostRegressor(random_state=self.random_seed, **kwargs))
            self._multi_predic = False
        elif self.RM_type in ("K_ANN", "K_ANN_Dis"):
            if not TF_OK:
                raise ValueError(
                    'Tensorflow not installed, Keras RM_type not available')

            def get_kwargs(kw, default):
                if kw in kwargs:
                    return kwargs[kw]
                else:
                    return default

            activation = get_kwargs('activation', 'relu')
            kernel_initializer = get_kwargs(
                'kernel_initializer',
                initializers.glorot_uniform(seed=self.random_seed))
            if self.random_seed is None:
                bias_initializer = 'zeros'
            else:
                cst = np.random.rand()
                bias_initializer = initializers.Constant(0.1 + 0.05 + cst)
            optimizer = get_kwargs('optimizer', get_kwargs('solver', 'adam'))
            epochs = get_kwargs('epochs', 100)
            batch_size = get_kwargs('batch_size', None)
            validation_split = get_kwargs('validation_split', 0.0)
            hidden_layer_sizes = get_kwargs('hidden_layer_sizes', (10, 10))
            random_state = get_kwargs('random_state', self.random_seed)
            dropout = get_kwargs('dropout', None)
            L1 = get_kwargs('L1', 0.)
            L2 = get_kwargs('L2', 0.)
            tf.compat.v1.random.set_random_seed(random_state)
            model = Sequential()
            model.add(
                Dense(hidden_layer_sizes[0],
                      input_dim=self.N_in,
                      kernel_initializer=kernel_initializer,
                      bias_initializer=bias_initializer,
                      activation=activation,
                      kernel_regularizer=regularizers.l1_l2(l1=L1, l2=L2)))
            if dropout is not None:
                if type(dropout) in (type(()), type([])):
                    d1 = dropout[0]
                else:
                    d1 = dropout
                if d1 != 0.0:
                    model.add(Dropout(d1, seed=random_state))
            for i_hl, hidden_layer_size in enumerate(hidden_layer_sizes[1:]):
                model.add(
                    Dense(hidden_layer_size,
                          activation=activation,
                          kernel_initializer=kernel_initializer,
                          bias_initializer=bias_initializer,
                          kernel_regularizer=regularizers.l1_l2(l1=L1, l2=L2)))
                if dropout is not None:
                    if type(dropout) in (type(()), type([])):
                        di = dropout[i_hl + 1]
                    else:
                        di = dropout
                    if di != 0.0:
                        model.add(Dropout(di, seed=random_state))
            if self.RM_type == 'K_ANN':
                model.add(
                    Dense(self.N_out,
                          activation='linear',
                          kernel_initializer=kernel_initializer,
                          bias_initializer=bias_initializer,
                          kernel_regularizer=regularizers.l1_l2(l1=L1, l2=L2)))
                metrics = get_kwargs('metrics', ['mse', 'mae'])
                model.compile(loss='mse', optimizer=optimizer, metrics=metrics)
                self.RMs = [model]
                self.train_params = {
                    'epochs': epochs,
                    'batch_size': batch_size,
                    'verbose': False,
                    'validation_split': validation_split
                }
                self._multi_predic = True
            elif self.RM_type == 'K_ANN_Dis':
                model.add(
                    Dense(self.N_out,
                          activation='softmax',
                          kernel_initializer=kernel_initializer,
                          bias_initializer=bias_initializer,
                          kernel_regularizer=regularizers.l1_l2(l1=L1, l2=L2)))
                metrics = get_kwargs('metrics', ['accuracy'])
                model.compile(loss='categorical_crossentropy',
                              optimizer=optimizer,
                              metrics=metrics)
            if self.verbose:
                model.summary()
            self.RMs = [model]
            self.train_params = {
                'epochs': epochs,
                'batch_size': batch_size,
                'verbose': False,
                'validation_split': validation_split
            }
            self._multi_predic = True
        elif self.RM_type == 'KSK_ANN':

            def get_kwargs(kw, default):
                if kw in kwargs:
                    return kwargs[kw]
                else:
                    return default

            activation = get_kwargs('activation', 'relu')
            kernel_initializer = get_kwargs(
                'kernel_initializer',
                initializers.glorot_uniform(seed=self.random_seed))
            if self.random_seed is None:
                bias_initializer = 'zeros'
            else:
                bias_initializer = initializers.Constant(0.1)
            optimizer = get_kwargs('optimizer', get_kwargs('solver', 'adam'))
            epochs = get_kwargs('epochs', 1)
            batch_size = get_kwargs('batch_size', None)
            validation_split = get_kwargs('validation_split', 0.0)
            hidden_layer_sizes = get_kwargs('hidden_layer_sizes', (10, 10))
            random_state = get_kwargs('random_state', self.random_seed)
            tf.compat.v1.random.set_random_seed(random_state)

            def create_model(hidden_layer_sizes, N_in, activation,
                             random_state, N_out):
                model = Sequential()
                model.add(
                    Dense(hidden_layer_sizes[0],
                          input_dim=N_in,
                          activation=activation))
                for hidden_layer_size in hidden_layer_sizes[1:]:
                    model.add(Dense(
                        hidden_layer_size,
                        activation=activation,
                    ))
                model.add(Dense(N_out, activation='linear'))
                metrics = ['mse', 'mae']
                model.compile(loss='mse', optimizer=optimizer, metrics=metrics)
                return model

            model = KerasRegressor(create_model,
                                   hidden_layer_sizes=hidden_layer_sizes,
                                   N_in=self.N_in,
                                   activation=activation,
                                   random_state=random_state,
                                   N_out=self.N_out)
            #if self.verbose:
            #    model.summary()
            self.RMs = [model]
            self.train_params = {
                'epochs': epochs,
                'batch_size': batch_size,
                'verbose': False,
                'validation_split': validation_split
            }
            self._multi_predic = True  # TBC ***
        else:
            raise ValueError('Unknown Regression method {}'.format(
                self.RM_type))
        if self.verbose:
            print('Regression Model {}'.format(self.RM_type))
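
# Design note: scikit-learn's SVR, NuSVR, BayesianRidge and AdaBoostRegressor
# are single-output regressors, hence one fitted model per output in self.RMs
# (_multi_predic = False), whereas MLPRegressor and the Keras networks predict
# all N_out targets at once (_multi_predic = True).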
Пример #31
0
def _gerarPlotFit(list_index_real, list_y_real, list_index_previsto,
                  list_y_previsto, list_index_real_original,
                  x_predict_original, list_index_previsto_original,
                  list_y_previsto_original, list_y_real_original, isFit,
                  df_norm):
    global cach_fit

    # Plotting the FIT
    if (isFit):
        x_fit_real = [x + 1 for x in np.arange(len(list_index_real))]
        y_fit_real = list_y_real

        x_fit_previsto = np.asarray(
            [x + 1 for x in np.arange(len(list_index_previsto))],
            dtype=np.int32)
        y_fit_previsto = list_y_previsto

        x_fit_previsto_original = np.asarray(
            [x + 1 for x in np.arange(len(list_index_previsto_original))],
            dtype=np.int32)
        y_fit_previsto_original = list_y_previsto_original

        x_fit_real_original = np.asarray(
            [x + 1 for x in np.arange(len(list_index_real_original))],
            dtype=np.int32)
        y_fit_real_original = list_y_real_original

        list_x = np.arange(len(df_norm.index))
        parcela_x = (0 if len(x_fit_real) == 1 else ceil(
            len(x_fit_real) * 0.4))
        #print(parcela_x)
        coefs_linear_reais = np.polyfit(
            x_fit_real,
            y_fit_real,
            1,
        )
        coefs_linear_previsto = np.polyfit(x_fit_previsto, y_fit_previsto, 1)
        coefs_linear_previsto_parcela = np.polyfit(
            x_fit_previsto[parcela_x:len(x_fit_previsto)],
            y_fit_previsto[parcela_x:len(x_fit_previsto)], 1)
        coefs_linear_previsto_peso = np.polyfit(x_fit_previsto,
                                                y_fit_previsto,
                                                1,
                                                w=np.sqrt(
                                                    x_fit_previsto[::-1]))

        if (x_predict_original.sum() == 0 and len(cach_fit) != 0):
            ffit_reais = cach_fit[0]
            ffit_peso = cach_fit[1]
            ffit = cach_fit[2]
            fit_reta_previsto = cach_fit[3]
            fit_svr = cach_fit[4]
            fit_reta_previsto_parcela = cach_fit[5]
            fit_svr_ply = cach_fit[6]
            list_x = cach_fit[7]
        else:
            ffit_reais = np.poly1d(coefs_linear_reais)
            ffit_peso = np.poly1d(coefs_linear_previsto_peso)
            ffit = np.poly1d(coefs_linear_previsto)
            fit_reta_previsto_parcela = np.poly1d(
                coefs_linear_previsto_parcela)
            # FIT with the reduced line equation [y = ax + b]
            fit_reta_previsto = [
                ((y_fit_real_original[-1] - y_fit_real_original[0]) /
                 (x_fit_real_original[-1] - x_fit_real_original[0])) *
                (x - x_fit_real_original[0]) + y_fit_real_original[0]
                for x in list_x
            ]

            svr_nu = NuSVR(kernel='linear', C=1, gamma='scale', nu=0.9)
            svr_nu_poly = NuSVR(kernel='rbf', C=1, gamma='scale', nu=0.9)
            svr_nu.fit((x_fit_previsto_original.reshape(-1, 1)),
                       y_fit_previsto_original)
            svr_nu_poly.fit((x_fit_previsto_original.reshape(-1, 1)),
                            y_fit_previsto_original)
            fit_svr = svr_nu.predict(list_x.reshape(-1, 1))
            fit_svr_ply = svr_nu_poly.predict(list_x.reshape(-1, 1))
            cach_fit = (ffit_reais, ffit_peso, ffit, fit_reta_previsto,
                        fit_svr, fit_reta_previsto_parcela, fit_svr_ply,
                        list_x)

    # legend_fit_real,= plt.plot(df_norm.index, ffit_reais(list_x), color="orange",  linestyle='--', label="FIT [pontos reais]")
    # legend_fit_previsto, = plt.plot(df_norm.index, ffit_peso(list_x), color="red",  linestyle='--', label= "FIT [pontos reais + último ponto previsto] PESO (SQRT)")
    # legend_fit_previsto_sem_peso, = plt.plot(df_norm.index, ffit(list_x), color="g",  linestyle='--', label= "FIT [pontos reais + último ponto previsto] Sem peso")
    # legend_fit_previsto_reta, = plt.plot(df_norm.index,fit_reta_previsto, color="chocolate",  linestyle='--', label= "FIT Equacao da Reta")
    # legend_fit_previsto_sem_peso_parcela, = plt.plot(df_norm.index, fit_reta_previsto_parcela(list_x), color="slategray",  linestyle='--', label= "FIT [pontos reais + último ponto previsto - parcela] Sem peso")

        legend_fit_previsto_svr, = plt.plot(df_norm.index,
                                            fit_svr,
                                            color="mediumvioletred",
                                            linestyle='--',
                                            label="FIT SVR [Linear]")
        #legend_fit_previsto_svr_poly, = plt.plot(df_norm.index,fit_svr_ply, color="red",  linestyle='--', label= "FIT SVR [Poly]")

        # list_legend_fit = [legend_fit_previsto, legend_fit_previsto_sem_peso, legend_fit_real, legend_fit_previsto_reta,legend_fit_previsto_svr,legend_fit_previsto_sem_peso_parcela]
        list_legend_fit = [legend_fit_previsto_svr]
        return list_legend_fit
Пример #32
0
        init_sect_beg = timer()

        # Save maternal feature vectors & composite maternal / fetal feature vectors:
        maternal_feature_vectors[n_svrs, :] = cwt_wdw.flatten()
        maternal_fetal_feature_vectors[n_svrs, :] = np.concatenate(
            (cwt_wdw.flatten(), cwt_wdw_fetal.flatten()), axis=None)

        # Linear support vector regression: maternal -> abdominal
        #
        nusv_res = NuSVR(nu=0.95,
                         C=10.0,
                         kernel='linear',
                         degree=3,
                         gamma='scale',
                         coef0=0.0,
                         shrinking=True,
                         tol=0.001,
                         cache_size=200,
                         verbose=False,
                         max_iter=10000)
        z_rbf = nusv_res.fit(cwt_wdw, fetal_lead_wdw).predict(cwt_wdw)
        # z_rbf = nusv_res.fit(cwt_wdw, mat_lead_wdw).predict(cwt_wdw)

        # Store regression coef's & offset:
        nusv_lin_coef = np.float32(nusv_res.coef_)
        nusv_intercept = np.float32(nusv_res.intercept_)

        linear_regression_coefs[n_svrs, :] = nusv_lin_coef
        linear_regression_intercepts[n_svrs] = nusv_intercept
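
        # Note: NuSVR exposes coef_ and intercept_ only when kernel='linear',
        # which is why a linear kernel is used for this regression step.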
Пример #33
0
def main():
    #data cleaning and features engineering section
    data = pd.read_csv('input.csv')
    data = fix_data_encoding(data)
    data = name_mapping(data)
    data = get_duration(data)
    data = get_court_city_type(data)
    data = fix_leading_zeros(data)
    data = add_judge_age(data)
    data = encode_receipt_procedure(data)
    data = add_money_amount_indicator(data)
    data = create_person_business_indicators(data)
    data = encode_case_matter(data)
    data = create_court_indicators(data)
    data = add_loadiness_of_courts(data)
    data = add_not_subject_to_duty_not_zero(data)
    data = add_lives_abroad_over_persons_and_companies_involved(data)
    data = add_date_groups(data)
    data = get_total_persons_and_companies_started(data)
    data = remove_outliers(data)
    data = add_single_person_or_company_started(data)
    data = add_single_person_or_company_answered(data)

    public_data = pd.read_csv('public_data.csv')
    print("Public data columns: ", list(public_data))
    data = add_public_data(data, public_data)
    print("After adding data: ", list(data))

    data = add_court_productivity(data)

    data.pop('start_date')
    data.pop('end_date')
    data.pop('court_name')
    data.pop('case_id')
    data.pop('court_id')
    data.pop('date_of_birth')

    # Depends if start_date will be available in final data
    data.pop('start_date_year')

    data.to_csv("out.csv")

    train, test = train_test_split(data, test_size=0.2, random_state=1)

    #store the name of the variable we want to predict. Separately store the names of all other variables
    target = 'duration_m'
    all_columns_except_target = train.columns.difference([target])

    #    #-----------------------------------------------------------------------------------------------------
    #    #model calibration section
    #    #tree amount calibration
    #    for tree_amount in range(10,60,10):
    #        model = RandomForestRegressor(n_estimators=tree_amount)
    #        score_from_cross_validation = get_score_from_cross_validation(model, train, target, valid_split_size=3)
    #        print('number of trees=', tree_amount)
    #        print("score from cross-validation:", score_from_cross_validation)
    #
    #    #max_features calibration
    #    for max_features in ['auto','sqrt','log2']:
    #        model = RandomForestRegressor(max_features=max_features)
    #        score_from_cross_validation = get_score_from_cross_validation(model, train, target, valid_split_size=5)
    #        print('max_features_type=', max_features)
    #        print("score from cross-validation:", score_from_cross_validation)
    #
    #    #min_samples_leaf calibration
    #    for min_samples_leaf in range(1,5,1):
    #        model = RandomForestRegressor(n_estimators = 60, min_samples_leaf=min_samples_leaf)
    #        score_from_cross_validation = get_score_from_cross_validation(model, train, target, valid_split_size=3)
    #        print('min_samples_leaf=', min_samples_leaf)
    #        print("score from cross-validation:", score_from_cross_validation)

    #default settings vs manually calibrated settings
    print('RandomForestRegressor from scikit-learn')
    model = RandomForestRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    model = RandomForestRegressor(n_estimators=60, min_samples_leaf=2)
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('manually calibrated model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = RandomForestRegressor()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)
    #
    #    model = RandomForestRegressor(n_estimators = 60, min_samples_leaf=2)
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('manually calibrated model:')
    #    print("score for test data:", score_on_test)

    #-----------------------------------------------------------------------------------------------------
    #trying different models/algorithms
    #default settings for GradientBoostingRegressor ~ a bit better than RandomForestRegressor
    print('GradientBoostingRegressor from scikit-learn')
    model = GradientBoostingRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = GradientBoostingRegressor()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)

    #default settings for ADAboost - sucks!
    print('AdaBoostRegressor from scikit-learn')
    model = AdaBoostRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = AdaBoostRegressor()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)

    #default settings for ExtraTreesRegressor - sucks
    print('ExtraTreesRegressor from scikit-learn')
    model = ExtraTreesRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = ExtraTreesRegressor()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)

    #default settings for BaggingRegressor ~ almost like RandomForestRegressor
    print('BaggingRegressor from scikit-learn')
    model = BaggingRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = BaggingRegressor()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)

    #default settings for XGBModel ~ 1% better than GradientBoostingRegressor
    print('XGBModel for scikit-learn')
    model = XGBModel()
    model.fit(train[all_columns_except_target], train[target])
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target,
        valid_split_size=5)  #calculating R^2 score manually
    print('default model:')
    print("score for test data:", score_from_cross_validation)

    #default settings for SVR - very bad!!!!!!!
    print('SVR from scikit-learn')
    model = SVR()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = SVR()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)

    #default settings for NuSVR - very bad as well!!!
    print('NuSVR from scikit-learn')
    model = NuSVR()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    #    model = NuSVR()
    #    model.fit(train[all_columns_except_target], train[target])
    #    score_on_test = model.score(test[all_columns_except_target], test[target])
    #    print('default model:')
    #    print("score for test data:", score_on_test)

    #default settings for LinearRegression - not too bad for such model
    print('LinearRegression from scikit-learn')
    model = LinearRegression()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)
Пример #34
0
    random_permutation_index = np.random.permutation(x.shape[0])
    x = x[random_permutation_index]
    y = y[random_permutation_index]

    train_x = x[:-100]
    train_y = y[:-100]
    test_x = x[-100:]
    test_y = y[-100:]
    model = svr_model.fit(train_x, train_y)
    y_train_result = model.predict(train_x)
    print('training RMSE = {}'.format(get_RMSE(train_y, y_train_result)))
    y_predict_result = model.predict(test_x)
    print('test RMSE = {}'.format(get_RMSE(test_y, y_predict_result)))
    print()
    return test_y, y_predict_result


svr_model = SVR(C=1)
with open('prediction1.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    y_truth, y_predict = run_SVR('input_2007_w5.csv', svr_model)
    for i in range(len(y_truth)):
        writer.writerow([y_truth[i], y_predict[i]])

svr_model = NuSVR(C=100)
with open('prediction2.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    y_truth, y_predict = run_SVR('input_2007.csv', svr_model)
    for i in range(len(y_truth)):
        writer.writerow([y_truth[i], y_predict[i]])
Пример #35
0
 def train(self, X, y, hypers):
     self.regressor = NuSVR(kernel='rbf', C=hypers[0],
                            gamma=hypers[1]).fit(X, y)
Пример #36
0
def main(X, Y, Params, print_info=False, is_regression=True, Y_other=None):

    parameters = Params['Algorithm'][1]
    is_cv_run = False
    starttime = time.time()

    if print_info:
        print('Fitting model \'%s\' for %s' %
              (Params['Algorithm'][0],
               'regression' if is_regression else 'classification'))

    if Params['Algorithm'][0] == 'BayesianRidge':
        if not is_regression:
            model = BayesianRidge(n_iter=300,
                                  tol=0.001,
                                  compute_score=False,
                                  fit_intercept=True,
                                  normalize=False,
                                  copy_X=True,
                                  verbose=False,
                                  **parameters)
            #parameters = {'alpha_1': [1e-6,1e-5,1e-4],'alpha_2': [1e-6,1e-5,1e-4], 'lambda_1': [1e-6,1e-5,1e-4], 'lambda_2': [1e-6,1e-5,1e-4]}
        else:
            model = BayesianRidge(n_iter=300,
                                  tol=0.001,
                                  compute_score=False,
                                  fit_intercept=True,
                                  normalize=False,
                                  copy_X=True,
                                  verbose=False,
                                  **parameters)
    elif Params['Algorithm'][0] == 'StringKernel':
        if not is_regression:
            raise (Exception('not implemented'))
        else:
            # we create an instance of SVM and fit out data.
            #
            # model = KernelRidge(alpha=parameters['alpha'], kernel='precomputed')
            model = SVR(kernel='precomputed',
                        gamma='auto',
                        coef0=0.0,
                        shrinking=True,
                        tol=0.001,
                        cache_size=400,
                        verbose=False,
                        max_iter=-1)
            param_grid = {
                'C': np.logspace(np.log10(0.0001), np.log10(500), 25)
            }

            model = NuSVR(
                kernel='precomputed'
            )  #cache_size=400, coef0=0.0, gamma='auto', max_iter=-1, shrinking=True, tol=0.001, verbose=False,**parameters)
            param_grid = {'nu': (0.50, )}

            model = GridSearchCV(model,
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=7,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            is_cv_run = True

    elif Params['Algorithm'][0] == 'XGBoost':
        # max_depth = 3, learning_rate = 0.1, n_estimators = 100, silent = True, objective = 'reg:linear',
        # booster = 'gbtree', n_jobs = 1, nthread = None, gamma = 0, min_child_weight = 1,
        # max_delta_step = 0, subsample = 1, colsample_bytree = 1, colsample_bylevel = 1, reg_alpha = 0,
        # reg_lambda = 1, scale_pos_weight = 1, base_score = 0.5, random_state = 0, seed = None,
        # missing = None
        if not is_regression:
            model = xgboost.XGBClassifier(
                missing=None,
                silent=True,
                learning_rate=0.10,
                objective='rank:pairwise',
                booster='gbtree',
                n_jobs=1,
                max_delta_step=0,
                colsample_bylevel=1,
                scale_pos_weight=1,
                base_score=0.5,
                random_state=666,
                colsample_bytree=0.75,  # default 1
                subsample=0.75,
                gamma=0,
                reg_alpha=0.01,  # default 0
                min_child_weight=6,
                **parameters)
        else:
            # model=xgboost.XGBRegressor(missing=None, silent=True,
            #                            learning_rate=0.10,
            #                            objective='reg:linear',#'rank:pairwise' booster='gbtree'
            #                            n_jobs=1,
            #                            booster='gbtree',
            #                            max_delta_step=0,
            #                            colsample_bylevel=1,
            #                            scale_pos_weight=1,
            #                            base_score=0.5,
            #                            random_state=666,
            #                            colsample_bytree=0.75, # default 1
            #                            subsample=0.75,
            #                            gamma=0,
            #                            reg_alpha=0.01, # default 0
            #                            reg_lambda=1.0,
            #                            min_child_weight=6,
            #                            **parameters)

            model = xgboost.XGBRegressor(
                missing=None,
                silent=True,
                learning_rate=0.10,
                objective='reg:linear',  #'rank:pairwise' booster='gbtree'
                n_jobs=1,
                booster='gbtree',
                random_state=666,
                **parameters)

            param_grid = {
                'colsample_bytree': (0.75, 1.0),
                'subsample': (0.75, 1.0),
                'min_child_weight': (3, 6, 9),
                'reg_lambda': (0.80, 1.0, 1.20),
                'reg_alpha': (0.001, 0.01)
            }
            model = GridSearchCV(model,
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=7,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            is_cv_run = True

    elif Params['Algorithm'][0] == "Keras_ElasticNet":

        #use_keras_CPU()

        if not is_regression:
            raise (Exception('ElasticNet is only for regression!'))
        else:
            param_grid = {
                'l1_ratio': (Params['Algorithm'][1]['l1_ratio'], ),
                'alpha': np.logspace(-3, 1, 15)
            }

            model = GridSearchCV(KerasENet(),
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=5,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            # first_output = Dense(1,activation='sigmoid')(first_output)
            is_cv_run = True

    elif Params['Algorithm'][0] == "Ridge":
        if not is_regression:
            raise (Exception('Ridge is only for regression!'))
        else:
            model = RidgeCV(alphas=np.logspace(-1, np.log10(700),
                                               parameters['n_alphas']),
                            fit_intercept=True,
                            normalize=False,
                            scoring=None,
                            cv=8,
                            gcv_mode=None,
                            store_cv_values=False)

    elif Params['Algorithm'][0] == "ElasticNet":
        tol = 0.0001
        selection = 'cyclic'
        n_alphas = 90
        max_iter = 1300
        if X.shape[1] > 4000:
            tol = 0.001
            selection = 'random'
            n_alphas = 60
            max_iter = 1000
        if not is_regression:
            raise (Exception('ElasticNet is only for regression!'))
        else:
            if Params['is_multitarget']:
                model = MultiTaskElasticNetCV(eps=0.001,
                                              alphas=None,
                                              fit_intercept=True,
                                              normalize=False,
                                              max_iter=max_iter,
                                              tol=tol,
                                              cv=7,
                                              copy_X=True,
                                              verbose=0,
                                              n_alphas=n_alphas,
                                              n_jobs=1,
                                              random_state=666,
                                              selection=selection,
                                              **parameters)
            else:
                model = ElasticNetCV(eps=0.001,
                                     alphas=None,
                                     fit_intercept=True,
                                     normalize=False,
                                     max_iter=max_iter,
                                     tol=tol,
                                     cv=7,
                                     copy_X=True,
                                     verbose=0,
                                     n_alphas=n_alphas,
                                     n_jobs=1,
                                     random_state=666,
                                     selection=selection,
                                     **parameters)

    elif Params['Algorithm'][0] == "RandomForest":

        if not is_regression:
            raise (Exception('not set up (lazy)'))
        else:
            model = RandomForestRegressor(criterion='mse',
                                          min_samples_leaf=1,
                                          min_weight_fraction_leaf=0.0,
                                          max_leaf_nodes=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          bootstrap=True,
                                          oob_score=False,
                                          n_jobs=1,
                                          random_state=None,
                                          verbose=0,
                                          warm_start=False,
                                          **parameters)
            param_grid = {
                'max_features': ('auto', 'sqrt'),
                'min_samples_split': (
                    2,
                    4,
                ),
            }
            model = GridSearchCV(model,
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=7,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            is_cv_run = True

    elif Params['Algorithm'][0] == 'SVM':
        # 0.001, 0.005, 0.01, 0.05, 0.1, 0.5,1.0,1.5,2.0,3.0,4.0,5.0,10.0
        if not is_regression:
            model = SVC(cache_size=400,
                        coef0=0.0,
                        gamma='auto',
                        max_iter=-1,
                        shrinking=True,
                        tol=0.001,
                        verbose=False,
                        **parameters)
            #parameters = {'reg__C':[0.5],'reg__epsilon':[0.1]}
        else:
            model = SVR(cache_size=400,
                        coef0=0.0,
                        gamma='auto',
                        max_iter=-1,
                        shrinking=True,
                        tol=0.001,
                        verbose=False,
                        **parameters)
            param_grid = {'C': np.logspace(np.log10(0.0005), np.log10(10), 30)}
            #param_grid = {'nu':(0.1,0.3,0.5,0.7,0.9)}
            model = GridSearchCV(model,
                                 param_grid,
                                 n_jobs=1,
                                 iid=True,
                                 refit=True,
                                 cv=8,
                                 verbose=0,
                                 scoring=neg_mean_squared_error_scorer)
            is_cv_run = True

    elif Params['Algorithm'][0] == 'GradientBoosting':
        if not is_regression:
            model = GradientBoostingClassifier(random_state=1, **parameters)
            #parameters = {'reg__n_estimators': [140], 'reg__max_depth': [6],'learning_rate':[0.01,0.03,0.1],'min_samples_leaf':[2,3,4]}
        else:
            model = GradientBoostingRegressor(random_state=1, **parameters)
            #parameters = {'reg__n_estimators': [140], 'reg__max_depth': [6]}
    elif Params['Algorithm'][0] == 'MLP':
        #parameters['hidden_layer_sizes']=[parameters['hidden_layer_sizes']]
        #model = MLPRegressorCV(hidden_layer_sizes=parameters['hidden_layer_sizes'])
        model = MLPRegressor(
            activation="relu",
            solver="lbfgs",
            learning_rate="constant",
            learning_rate_init=0.0011,
            max_iter=450,
            random_state=None,
            tol=0.00013,
            epsilon=1e-08,
            hidden_layer_sizes=parameters['hidden_layer_sizes'])

        param_grid = {'alpha': np.logspace(0, np.log10(350), 20)}
        model = GridSearchCV(model,
                             param_grid,
                             n_jobs=1,
                             iid=True,
                             refit=True,
                             cv=7,
                             verbose=0,
                             scoring=neg_mean_squared_error_scorer)
        is_cv_run = True
        #model = MLPRegressor(activation="relu", solver ="lbfgs",learning_rate ="constant",
        #             learning_rate_init = 0.001, power_t = 0.5, max_iter = 500, shuffle = True, random_state = None,
        #             tol = 0.0001, verbose = False, warm_start = False, momentum = 0.9, epsilon = 1e-08,**parameters)
    elif Params['Algorithm'][0] == 'MLP_KERAS':

        from keras.models import Sequential
        from keras import regularizers
        from keras.layers import Dense, Dropout
        from keras.callbacks import EarlyStopping
        from sklearn.preprocessing import LabelEncoder
        from keras.utils import np_utils
        import tensorflow as tf
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        session = tf.Session(config=config)

        early_stopping = EarlyStopping(monitor='val_loss', patience=5)

        model = Sequential()
        model.add(
            Dense(
                parameters['layers_and_nodes'][0],
                activation='tanh',
                input_shape=(X.shape[1], ),
                kernel_initializer='glorot_uniform',
                kernel_regularizer=regularizers.l2(
                    parameters['l2_regularization']),
            ))
        model.add(Dropout(parameters['dropout'], noise_shape=None, seed=1))
        for layer in range(1, len(parameters['layers_and_nodes'])):
            model.add(
                Dense(parameters['layers_and_nodes'][layer],
                      activation='relu',
                      input_shape=(parameters['layers_and_nodes'][layer -
                                                                  1], ),
                      kernel_initializer='glorot_normal',
                      kernel_regularizer=regularizers.l2(
                          parameters['l2_regularization'])))
            model.add(Dropout(parameters['dropout'], noise_shape=None, seed=1))

        if not is_regression:
            model.add(
                Dense(1,
                      activation='softmax',
                      input_shape=(parameters['layers_and_nodes'][-1], )))
            model.compile(loss='categorical_crossentropy',
                          optimizer='rmsprop',
                          metrics=['f1'])
            encoder = LabelEncoder()
            encoder.fit(Y)
            encoded_Y = encoder.transform(Y)
            # convert integers to dummy variables (i.e. one hot encoded)
            Y = np_utils.to_categorical(encoded_Y)
        else:
            model.add(
                Dense(1,
                      activation='linear',
                      input_shape=(parameters['layers_and_nodes'][-1], )))
            model.compile(loss='mean_squared_error',
                          optimizer='adam',
                          metrics=['mse'])

        model.fit(X,
                  Y,
                  batch_size=X.shape[0],
                  epochs=100,
                  validation_split=0,
                  verbose=0)  #,callbacks=[early_stopping])

        return model

    else:

        raise (Exception('unknown model'))
    #decomposer = LatentDirichletAllocation(n_topics=10, max_iter=10,learning_method='online',learning_offset=50.,random_state=1)
    #decomposer = TruncatedSVD(n_components=100,random_state=666)
    """
    X = data.iloc[:]['text'].values
    y = data.iloc[:]['mylabel'].values.astype(str)
    
    dat = vect.fit_transform(X)
    dat = tfidf.fit_transform(dat)
    dat = decomposer.fit_transform(dat)  
    
    for a in numpy.unique(y):
        plt.scatter(dat[y==a,0],dat[y==a,1])
    """
    """
    START LOOP
    """

    #t0 = time()
    # if get_set_count(parameters)>1:
    #     grid_search = GridSearchCV(model, parameters, n_jobs=6,verbose=1,cv=10,refit=True)
    #     grid_search.fit(X=X,y=Y)
    #     best_parameters = grid_search.best_estimator_.get_params()
    #     print('--> best parameters: %s' % best_parameters)
    #     return grid_search
    # else:

    if 1:
        start_time = time.time()
        print('... training model (X.shape=%s)' % str(X.shape), end='')

    warnings.filterwarnings("ignore")

    if Y_other is not None and Params['is_multitarget']:
        Y = np.expand_dims(Y, axis=1)
        model.fit(X=X, y=np.concatenate((Y, Y_other), axis=1))
    else:
        Y = Y.flatten()
        model.fit(X=X, y=Y)

    if is_cv_run:
        print(' [best gridsearch params: %s] ' % model.best_params_, end='')

    if 1:
        end_time = time.time()
        print(' ... done (%1.1f min)' % ((end_time - start_time) / 60.0))

    #elapsedtime = (time.time() - starttime) / 60.0
    #print('fit done (took %f minutes)' % elapsedtime)

    return model
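
# Usage sketch (hypothetical, matching the structure consumed above):
# Params = {'Algorithm': ('SVM', {'epsilon': 0.1}), 'is_multitarget': False}
# model = main(X, Y, Params, print_info=True, is_regression=True)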
Пример #37
0
# need to convert to np.array() to extract last values
y_train = np.array(y_train)
y_test = np.array(y_test)

y_train = y_train[:, y_train.shape[1] - 1]
y_test = y_test[:, y_test.shape[1] - 1]

# make candles
s = data.index[data['mid_close'] == y_test[0]].tolist()[1]
px_test = {
    'bid': data.iloc[s:, 1].values,
    'ask': data.iloc[s:, 5].values,
    'mid': data.iloc[s:, 9].values
}

clf = NuSVR()
fitted_clf = clf.fit(X_train, y_train)


def plot_strategy(strategy, default):
    '''
    fn: compare 2 strategies

    Params:
    -------
    strategy: list, accumulated returns from predicting strategy
    default: list, accumulated returns from buy & hold
    '''
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111)
    # (minimal completion; the original body is cut off here)
    ax.plot(strategy, label='strategy')
    ax.plot(default, label='buy & hold')
    ax.set_xlabel('period')
    ax.set_ylabel('accumulated return')
    ax.legend()
    plt.show()
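

# A hedged usage sketch: turning the fitted model's predictions into the two
# accumulated-return series plot_strategy expects. The sign-of-prediction
# trading rule and the use of px_test['mid'] for per-period returns are
# assumptions, not part of the original snippet.
preds = fitted_clf.predict(X_test)
positions = np.sign(np.diff(preds))  # long on predicted rise, short otherwise
returns = np.diff(px_test['mid']) / px_test['mid'][:-1]
n = min(len(positions), len(returns))
plot_strategy(np.cumsum(positions[:n] * returns[:n]),  # model strategy
              np.cumsum(returns[:n]))                  # buy & hold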
Example #38
0
 def test_convert_nusvr_default(self):
     model, X = self._fit_binary_classification(NuSVR())
     model_onnx = convert_sklearn(
         model, "SVR", [("input", FloatTensorType([None, X.shape[1]]))])
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(X, model, model_onnx, basename="SklearnRegNuSVR2")
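
The converted model can also be scored outside scikit-learn. A minimal sketch, assuming onnxruntime is installed; the toy data and tolerance are illustrative, not part of the test above:

import numpy as np
import onnxruntime as rt
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from sklearn.svm import NuSVR

# Fit a small NuSVR and convert it, mirroring the test above.
X = np.random.rand(50, 4).astype(np.float32)
y = np.random.rand(50).astype(np.float32)
model = NuSVR().fit(X, y)
onx = convert_sklearn(model, "SVR",
                      [("input", FloatTensorType([None, X.shape[1]]))])

# Score with onnxruntime and compare against scikit-learn's predictions.
sess = rt.InferenceSession(onx.SerializeToString(),
                           providers=["CPUExecutionProvider"])
onnx_preds = sess.run(None, {"input": X})[0]
print(np.allclose(onnx_preds.ravel(), model.predict(X), atol=1e-4))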
Example #39
0
            # "Passive Aggressive Regressor ": PassiveAggressiveRegressor(max_iter=100000, tol=0.5), 
            # "random forest regressor": RandomForestRegressor(n_estimators=10), 
            # "gradient boosting regressor": GradientBoostingRegressor(min_samples_leaf=3),
            # "k nearest neighbiours regressor": KNeighborsRegressor(),
            # "RANSAC regressor": RANSACRegressor(),
            "SGD regressor": SGDRegressor(max_iter=100000, tol=0.5),
            # "kernel ridge": KernelRidge(),
            # "ada boost regressor": AdaBoostRegressor(),
            # "bagging regressor": BaggingRegressor(),
            # "extra trees regressor": ExtraTreesRegressor(n_estimators=10),
            # "dummy regressor": DummyRegressor(),
            # "PLSR regressor": PLSRegression(),
            # "radius neighbours regressor": RadiusNeighborsRegressor(radius=5),
            # "neural_network.MLPRegressor 500": MLPRegressor(hidden_layer_sizes=(50)),
            # "svm.SVR": SVR(gamma="scale"),
            "svm.NuSVR epsilon=": NuSVR(nu=0.7, gamma="scale")
            # "svm.LinearSVR epsilom=": LinearSVR(max_iter=10000)
            # "decision tree regressor": DecisionTreeRegressor(),
            # "extra tree regressor": ExtraTreeRegressor()
        }

# models = {
#             "1":MLPRegressor(hidden_layer_sizes=(64,2), solver="adam"),
#             "2":MLPRegressor(hidden_layer_sizes=(64,2), solver="lbfgs"),
#         }

cp(t, "initialising models")

results = []

rand = [0,0]
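
The snippet initialises `results` but stops before the loop that fills it. A minimal evaluation-loop sketch, assuming an X_train/X_test/y_train/y_test split is already in scope (those names are assumptions):

from sklearn.metrics import mean_absolute_error

# Hypothetical loop; the split variable names are assumptions.
for name, reg in models.items():
    reg.fit(X_train, y_train)
    mae = mean_absolute_error(y_test, reg.predict(X_test))
    results.append((name, mae))
    print("%s: MAE=%.4f" % (name, mae))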
Example #40
0
def regress_NuSVR(X_train, X_test, y_train, y_test, C1, nu1):
    nusvr = NuSVR(nu=nu1, C=C1, kernel='rbf', gamma=0.0001, tol=0.001)
    regr_nusvr = prep_process(nusvr, X_train, X_test, y_train, y_test)
    return (regr_nusvr[0], regr_nusvr[1])


def parameter_choosing_svr(estimator, params, X_train, Y_train, tem_list):
    grid_search = GridSearchCV(estimator, param_grid=params, cv=5)
    grid_search.fit(X_train, Y_train)
    for k, v in grid_search.best_params_.items():
        tem_list.append(v)
        print(k, '=', v)
    # note: the returned order follows the iteration order of best_params_
    return (tem_list[0], tem_list[1])


estimator = NuSVR(kernel='rbf', gamma=0.0001, tol=0.001)
Cs = np.arange(5, 30, 5)
Nus = np.arange(0.2, 0.9, 0.1)
params = {'nu': Nus, 'C': Cs}

# Read data files
fname_pars = '/Ginzburg_Landau_equation/File4_CSV/Beta_Parameter_values.csv'
par_values = pd.read_csv(fname_pars)

for j in range(10, 50):
    fname_features = '/Ginzburg_Landau_equation/File4_CSV/TDA_Beta_features_m_%d.csv' % (
        j)
    print('file number', j)

    # Read data files
    tda_features = pd.read_csv(fname_features)
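    # The loop body is cut off after reading the features. A hypothetical
    # continuation (the CSV column layout and the train/test split are
    # assumptions, not part of the original script):
    from sklearn.model_selection import train_test_split
    X = tda_features.iloc[:, :-1].values
    y = tda_features.iloc[:, -1].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1)

    # Tune C and nu with the helper above, then fit/score the tuned NuSVR.
    # (Note: the tuple order depends on the iteration order of best_params_;
    # here 'C' is assumed to come before 'nu'.)
    best_c, best_nu = parameter_choosing_svr(estimator, params,
                                             X_train, y_train, [])
    scores = regress_NuSVR(X_train, X_test, y_train, y_test,
                           C1=best_c, nu1=best_nu)
    print('scores for file %d:' % j, scores)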
Example #41
0
print("\nMean absolute error: ", metrics.mean_absolute_error(y_test, preds))

models = [
    LinearRegression(),
    LassoCV(alphas=np.logspace(-6, 6, 13)),
    ElasticNetCV(alphas=np.logspace(-6, 6, 13)),
    SGDRegressor(),
    PassiveAggressiveRegressor(),
    Ridge(),
    RandomForestRegressor(max_depth=5),
    GradientBoostingRegressor(),
    AdaBoostRegressor(loss='exponential'),
    BaggingRegressor(),
    SVR(),
    NuSVR(),
    XGBRFRegressor(max_depth=5, objective="reg:squarederror"),
    XGBRegressor(max_depth=5, objective="reg:squarederror")
]


def show_score(x, y, estimator):
    """
    Returns MAE scores for specified models.
    Also returns r2 scores if applicable

    Arguments:
        x {[array/DataFrame]} -- [Array or matrix of features. Can also be dataframe]
        y {[array]} -- [Target values]
        estimator {[str]} -- [The estimator being used]
    """
Example #42
0
# Step 4 - Remove the outliers
correctedSeries = util.detectAndRemoveOutliers(rawSeries)

# Learning Process - Start

# Parameters
depth = 100

# Form feature and target vectors (an earlier call to
# formContinousFeatureAndTargetVectorsWithoutBias was immediately
# overwritten, so only this call is kept)
featureVectors, targetVectors = util.formFeatureAndTargetVectors(correctedSeries, depth)


# Train using a linear-kernel support vector regressor
#model = SVR(kernel="linear")
model = NuSVR(nu=1.0, kernel="linear")
model.fit(featureVectors, targetVectors[:, 0])
predictedTrainingOutputData = model.predict(featureVectors)

# Predicted and actual Series
actualSeries = pd.Series(data=targetVectors.flatten(), index=correctedSeries.index[-targetVectors.shape[0]:])
predictedSeries = pd.Series(data=predictedTrainingOutputData.flatten(), index=correctedSeries.index[-targetVectors.shape[0]:])

# Learning Process - End

# Step 5 - Descale the series
actualSeries = util.descaleSeries(actualSeries)
predictedSeries = util.descaleSeries(predictedSeries)
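
To quantify the in-sample fit, a short sketch comparing the descaled series; the RMSE metric is our choice here, not part of the original script:

import numpy as np

# Root-mean-squared error between the descaled actual and predicted series.
rmse = np.sqrt(np.mean((actualSeries.values - predictedSeries.values) ** 2))
print('training RMSE: %f' % rmse)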