class MIKernelSVR(MIKernelSVM):

    def __init__(self, **parameters):
        svr_params = {
            'kernel': 'precomputed',
            'max_iter': MAX_ITERS,
        }
        if 'C' in parameters:
            svr_params['C'] = parameters.pop('C')
        if 'nu' in parameters:
            svr_params['nu'] = parameters.pop('nu')
        self.estimator = NuSVR(**svr_params)

        # Get kernel name and pass remaining parameters to kernel
        mi_kernel_name = parameters.pop('kernel')
        self.mi_kernel = kernel.by_name(mi_kernel_name, **parameters)

    def fit(self, X, y):
        # materialize the map for repeated use (Python 3 map is a one-shot iterator)
        X = list(map(np.asarray, X))
        self.fit_data = X
        self.gram_matrix = self.mi_kernel(X, X)
        self.estimator.fit(self.gram_matrix, y)
        return self

    def predict(self, X=None):
        if X is None:
            gram_matrix = self.gram_matrix
        else:
            X = list(map(np.asarray, X))
            gram_matrix = self.mi_kernel(X, self.fit_data)
        return self.estimator.predict(gram_matrix)
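The class above relies on NuSVR's precomputed-kernel mode. A minimal self-contained sketch of that pattern, using an RBF Gram matrix as a stand-in for the multiple-instance kernel (all names below are illustrative, not from the original):

import numpy as np
from sklearn.svm import NuSVR
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
X_train, y_train = rng.randn(40, 5), rng.randn(40)
X_test = rng.randn(10, 5)

est = NuSVR(kernel='precomputed')
est.fit(rbf_kernel(X_train, X_train), y_train)    # Gram matrix: train vs. train
preds = est.predict(rbf_kernel(X_test, X_train))  # rows: test, columns: train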
def train(self, x, y, param_names, random_search=100,
          kernel_cache_size=2000, **kwargs):
    if self._debug:
        print("Before preprocessing: 1st sample:\n", x[0])
    start = time.time()
    scaled_x = self._set_and_preprocess(x=x, param_names=param_names)

    # Check that each input is between 0 and 1
    self._check_scaling(scaled_x=scaled_x)

    if self._debug:
        print("Shape of training data:", scaled_x.shape)
        print("Param names:", self._used_param_names)
        print("First training sample\n", scaled_x[0])
        print("Encode:", self._encode)

    # Do a random search (honour the `random_search` argument instead of
    # hard-coding 100 iterations)
    nu, c, gamma = self._random_search(random_iter=random_search, x=scaled_x,
                                       y=y, kernel_cache_size=kernel_cache_size)

    # Now train model
    try:
        nusvr = NuSVR(gamma=gamma, C=c, nu=nu, random_state=self._rng,
                      cache_size=kernel_cache_size)
        nusvr.fit(scaled_x, y)
        self._model = nusvr
    except Exception as e:
        print("Training failed", e)
        self._model = None
def fit(self, X, Y, W):
    clf = NuSVR(nu=self.nu,
                C=self.C,
                kernel=self.kernel,
                degree=self.degree,
                gamma=self.gamma,
                coef0=self.coef0,
                shrinking=self.shrinking,
                tol=self.tol,
                cache_size=self.cache_size,
                max_iter=self.max_iter)
    # sample weights (if given) are passed positionally as `sample_weight`
    if W is not None:
        return NuSVRClassifier(clf.fit(X, Y.reshape(-1), W.reshape(-1)))
    return NuSVRClassifier(clf.fit(X, Y.reshape(-1)))
def traindt(x, y):
    global clf
    # print("training surrogate")
    # Alternative surrogate models that were tried:
    # clft = DecisionTreeRegressor(max_depth=tree_max_depth, splitter='random')
    # clft = RandomForestRegressor()
    # clft = GradientBoostingRegressor(loss='lad', n_estimators=50,
    #                                  learning_rate=0.3, max_depth=2)
    clft = NuSVR(C=1e6)
    clf = clft.fit(x, y)
def _random_search(self, random_iter, x, y, kernel_cache_size):
    # Default values
    c = 1.0
    gamma = 0.0
    nu = 0.5
    best_score = -float('inf')  # Python 3: sys.maxint no longer exists
    if random_iter > 0:
        sys.stdout.write("Do a random search %d times" % random_iter)
        param_dist = {"C": numpy.power(2.0, range(-5, 16)),
                      "gamma": numpy.power(2.0, range(-15, 4)),
                      "nu": uniform(loc=0.0001, scale=1 - 0.0001)}
        param_list = [{"C": c, "gamma": gamma, "nu": nu}, ]
        param_list.extend(list(ParameterSampler(param_dist,
                                                n_iter=random_iter - 1,
                                                random_state=self._rng)))
        for idx, d in enumerate(param_list):
            nusvr = NuSVR(kernel='rbf',
                          gamma=d['gamma'],
                          C=d['C'],
                          nu=d['nu'],
                          random_state=self._rng,
                          cache_size=kernel_cache_size)
            train_x, test_x, train_y, test_y = \
                train_test_split(x, y, test_size=0.5, random_state=self._rng)
            self._check_scaling(scaled_x=train_x)
            nusvr.fit(train_x, train_y)
            sc = nusvr.score(test_x, test_y)
            # Tiny output
            m = "."
            if idx % 10 == 0:
                m = "#"
            if sc > best_score:
                m = "<"
                best_score = sc
                c = d['C']
                gamma = d['gamma']
                nu = d['nu']
            sys.stdout.write(m)
            sys.stdout.flush()
        sys.stdout.write("Using C: %f, nu: %f and Gamma: %f\n" % (c, nu, gamma))
    return nu, c, gamma
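A condensed, standalone sketch of the same random-search loop on synthetic data; the parameter grids mirror the snippet above:

import numpy as np
from sklearn.svm import NuSVR
from sklearn.model_selection import ParameterSampler, train_test_split

rng = np.random.RandomState(1)
x, y = rng.randn(100, 4), rng.randn(100)
param_dist = {"C": np.power(2.0, np.arange(-5, 16)),
              "gamma": np.power(2.0, np.arange(-15, 4)),
              "nu": np.linspace(0.0001, 0.9999, 50)}  # nu must lie in (0, 1]
best_score, best_params = -float('inf'), None
for d in ParameterSampler(param_dist, n_iter=20, random_state=rng):
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.5,
                                                        random_state=0)
    sc = NuSVR(kernel='rbf', **d).fit(train_x, train_y).score(test_x, test_y)
    if sc > best_score:
        best_score, best_params = sc, d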
# Kernel sub-matrices: rows index the evaluation set, columns the training set
Kx = K[testIdx][:, trainIdx]
Kv = K[valIdx][:, trainIdx]
Kt = K[trainIdx][:, trainIdx]
# n = len(trainIdx)
# nv = len(valIdx)
# nx = len(testIdx)

# Train support vector regression
# C = 10.^(-2:1:2);
C = [0.1]
for c in C:
    print("C = %f" % c)
    tic = time.time()
    svr = NuSVR(C=c, kernel='precomputed')
    svr.fit(Kt, trainLabels)
    toc = time.time()
    print("train cost %f s" % (toc - tic))

    trainScores = svr.predict(Kt)
    mseTrain = np.mean((trainLabels - trainScores) ** 2)
    valScores = svr.predict(Kv)
    mseVal = np.mean((valLabels - valScores) ** 2)
    testScores = svr.predict(Kx)
    mseTest = np.mean((testLabels - testScores) ** 2)
    print('Train MSE: %g' % mseTrain)
    print('Val MSE: %g' % mseVal)
    print('Test MSE: %g' % mseTest)

# use all samples to train
svr = NuSVR(C=c, kernel='precomputed')
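The double indexing `K[testIdx][:, trainIdx]` above selects the block of kernel values between two index sets; `np.ix_` does the same in one step. A tiny sketch:

import numpy as np

K = np.arange(36.0).reshape(6, 6)       # a toy 6x6 Gram matrix
trainIdx, valIdx = [0, 1, 2, 3], [4, 5]
Kt = K[np.ix_(trainIdx, trainIdx)]      # train vs. train block, shape (4, 4)
Kv = K[np.ix_(valIdx, trainIdx)]        # val vs. train block, shape (2, 4)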
def run(seed):
    # create folders for scores, models and preds
    folder_models = './models/domain2_var1/scores/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)
    folder_preds = './predicts/domain2_var1/scores/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)

    print('Loading data...')
    # load biases
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')

    # load classifier output and add extra site2 ids
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')

    # load competition data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')
    ids_df = ids_df.append(extra_site)  # pd.concat([...]) in pandas >= 2.0
    print('Detected Site2 ids count:', ids_df['Id'].nunique())

    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')

    pca_df = pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_df = pd.concat((pca_df, part), axis=1)

    # merge data
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])
    pca0_cols = [c for c in pca_cols if 'k0' in c]

    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')
    del loading_df, fnc_df, agg_df, im_df, pca_df
    gc.collect()

    # split train and test
    df.loc[df['Id'].isin(labels_df['Id']), 'is_test'] = 0
    df.loc[~df['Id'].isin(labels_df['Id']), 'is_test'] = 1
    train = df.query('is_test==0')
    del train['is_test']
    test = df.query('is_test==1')
    del test['is_test']
    y = train['domain2_var1'].copy().reset_index(drop=True)
    d21_index = list(train['domain2_var1'].dropna().index)

    # apply biases
    for c in ic_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += ic_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += ic_bias_site[c]
    for c in fnc_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += fnc_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += fnc_bias_site[c]
    for c in pca_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += pca_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += pca_bias_site[c]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # I. Create fnc score
    print('Creating FNC score...')
    # prepare datasets for fnc score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, fnc_cols)
    # define models
    names = ['ENet', 'BRidge']
    names = [name + '_fnc_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
        BayesianRidge()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)
    # save oof, pred, models
    np.save(folder_preds + 'fnc_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'fnc_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # II. Create agg score
    print('Creating AGG score...')
    # prepare datasets for agg score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, agg_cols)
    # define models
    names = ['RGF', 'ENet', 'Huber']
    names = [name + '_agg_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000, reg_depth=5,
                     min_samples_leaf=100, normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
        HuberRegressor(epsilon=2.5, alpha=1)
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'agg_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'agg_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # III. Create pca score
    print('Creating PCA score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, pca_cols)
    # define models
    names = ['ENet', 'BRidge', 'OMP']
    names = [name + '_pca_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'pca_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'pca_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # IV. Create im score
    print('Creating IM score...')
    # prepare datasets for im score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, im_cols)
    # define models
    names = ['ENet', 'BRidge', 'OMP']
    names = [name + '_im_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'im_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'im_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # V. Create dl score
    print('Creating DL score...')
    # prepare datasets for dl score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, dl_cols)
    # define models
    names = ['ENet', 'BRidge', 'OMP']
    names = [name + '_dl_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'dl_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'dl_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)

    # VI. Training and predicting procedure
    print('Training has started...')
    # add scores
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train.loc[d21_index, prefix + '_score'] = np.load(
            folder_preds + '{}_score_seed{}.npy'.format(prefix, seed))
        test.loc[:, prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # create different datasets
    # linear
    linear_cols = sorted(
        list(set(ic_cols + fnc_cols + pca0_cols) - set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)
    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca0_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_factor=0.2,
                                                  scale_cols=pca0_cols,
                                                  sc=StandardScaler())
    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)

    # learning process on the different datasets
    names = ['GP', 'SVM1', 'SVM2', 'Lasso', 'BgR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        GaussianProcessRegressor(DotProduct(), random_state=0),
        NuSVR(C=3, kernel='rbf'),
        NuSVR(C=3, kernel='rbf'),
        Lasso(alpha=0.1, random_state=0),
        BaggingRegressor(Ridge(alpha=1),
                         n_estimators=100,
                         max_samples=0.2,
                         max_features=0.2,
                         random_state=0)
    ]
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2,
                        names, is_blend=True)

    # rewrite folders for models and preds
    folder_models = './models/domain2_var1/stack/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)
    folder_preds = './predicts/domain2_var1/stack/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)
    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)

    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)

    # stacking predictions
    print('Stacking predictions...')
    d21_prediction = pd.DataFrame()
    d21_prediction['Id'] = test['Id'].values
    d21_prediction['pred'] = preds
    d21_prediction.to_csv(folder_preds +
                          'domain2_var1_stack_seed{}.csv'.format(seed),
                          index=False)
    print('domain2_var1 seed pred is saved as',
          folder_preds + 'domain2_var1_stack_seed{}.csv'.format(seed))
def train_svr_cpu(X, Y, X_eval, c, kernel='linear', nu=0.5):
    # renamed from `svc`: this is a regressor, not a classifier
    svr = NuSVR(kernel=kernel, C=c, max_iter=100000, nu=nu, gamma='auto')
    svr.fit(X, Y)
    y_prob = svr.predict(X_eval)
    return y_prob
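A minimal usage sketch for train_svr_cpu with synthetic arrays (shapes are illustrative):

import numpy as np

rng = np.random.RandomState(0)
X, Y = rng.randn(50, 3), rng.randn(50)
X_eval = rng.randn(5, 3)
y_eval = train_svr_cpu(X, Y, X_eval, c=1.0, kernel='rbf', nu=0.5)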
def regress_NuSVR(X_train, X_test, y_train, y_test, C1, nu1):
    nusvr = NuSVR(nu=nu1, C=C1, kernel='rbf', gamma=0.0001, tol=0.001)
    regr_nusvr = prep_process(nusvr, X_train, X_test, y_train, y_test)
    return (regr_nusvr[0], regr_nusvr[1])
def fit(self, xtrain, ytrain, info, learn_hyper=True):
    # prepare training data
    xtrain_data = self.prepare_data(info)
    y_train = np.array(ytrain)

    # learn hyperparameters of the extrapolator by cross validation
    if self.best_hyper is None or learn_hyper:
        # specify model hyperparameters
        if self.model_name == 'svr':
            C = loguniform(1e-5, 10, self.n_hypers)
            nu = np.random.uniform(0, 1, self.n_hypers)
            gamma = loguniform(1e-5, 10, self.n_hypers)
            hyper = np.vstack([C, nu, gamma]).T
        elif self.model_name == 'blr':
            alpha_1 = np.random.uniform(1e-7, 1e-5, self.n_hypers)
            alpha_2 = np.random.uniform(1e-7, 1e-5, self.n_hypers)
            lambda_1 = np.random.uniform(1e-7, 1e-5, self.n_hypers)
            lambda_2 = np.random.uniform(1e-7, 1e-5, self.n_hypers)
            hyper = np.vstack([alpha_1, alpha_2, lambda_1, lambda_2]).T
        elif self.model_name == 'rf':
            n_trees = np.random.randint(10, 800, self.n_hypers)
            frac_feature = np.random.uniform(0.1, 0.5, self.n_hypers)
            hyper = np.vstack([n_trees, frac_feature]).T

        print(f'start CV on {self.model_name}')
        mean_score_list = []
        t_start = time.time()
        for i in range(self.n_hypers):
            # define model
            if self.model_name == 'svr':
                model = NuSVR(C=hyper[i, 0], nu=hyper[i, 1],
                              gamma=hyper[i, 2], kernel='rbf')
            elif self.model_name == 'blr':
                model = BayesianRidge(alpha_1=hyper[i, 0],
                                      alpha_2=hyper[i, 1],
                                      lambda_1=hyper[i, 2],
                                      lambda_2=hyper[i, 3])
            elif self.model_name == 'rf':
                model = RandomForestRegressor(n_estimators=int(hyper[i, 0]),
                                              max_features=hyper[i, 1])
            # perform cross validation to learn the best hyper value
            scores = cross_val_score(model, xtrain_data, y_train, cv=3)
            mean_scores = np.mean(scores)
            mean_score_list.append(mean_scores)
        t_end = time.time()

        best_hyper_idx = np.argmax(mean_score_list)
        best_hyper = hyper[best_hyper_idx]
        max_score = np.max(mean_score_list)
        time_taken = t_end - t_start
        print(f'{self.model_name}: '
              f'best_hyper={best_hyper}, score={max_score}, time={time_taken}')
        self.best_hyper = best_hyper

    # fit the extrapolator with the best hyperparameters to the training data
    if self.model_name == 'svr':
        best_model = NuSVR(C=self.best_hyper[0], nu=self.best_hyper[1],
                           gamma=self.best_hyper[2], kernel='rbf')
    elif self.model_name == 'blr':
        best_model = BayesianRidge(alpha_1=self.best_hyper[0],
                                   alpha_2=self.best_hyper[1],
                                   lambda_1=self.best_hyper[2],
                                   lambda_2=self.best_hyper[3])
    elif self.model_name == 'rf':
        best_model = RandomForestRegressor(
            n_estimators=int(self.best_hyper[0]),
            max_features=self.best_hyper[1])
    best_model.fit(xtrain_data, y_train)
    self.best_model = best_model
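The snippet above calls a `loguniform` helper that is not shown. A sketch of one common definition, assuming it draws `size` samples whose logarithms are uniform on [log(low), log(high)]:

import numpy as np

def loguniform(low, high, size):
    # sample log-uniformly between `low` and `high`
    return np.exp(np.random.uniform(np.log(low), np.log(high), size))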
def runTcheby():
    global param, approx_pareto_front, archiveOK, NO_FILE_TO_WRITE

    ############################################################################
    # PARAMETER

    # clf = SVR(C=1.0, epsilon=0.1, kernel="rbf")
    clf = NuSVR()
    clf2 = -1
    two_models_bool = False

    isReals = True
    start_fct, nb_functions = param[0:2]
    nb_iterations, neighboring_size = param[2:4]
    init_decisions, problem_size = param[4:6]
    max_decisions_maj, delta_neighbourhood = param[6:8]
    CR, search_space = param[8:10]
    F, distrib_index_n = param[10:12]
    pm, operator_fct = param[12:14]
    nb_samples, training_neighborhood_size = param[14:16]
    strategy, file_to_write = param[16:18]
    filter_strat, free_eval = param[18:20]
    param_print_every, file_to_writeR2 = param[20:22]
    filenameDIR, filenameSCORE = param[22:24]

    nb_objectives = len(start_fct)

    # get the offspring operator functions separately
    crossover_fct, mutation_fct, repair_fct = operator_fct

    best_decisions = copy.deepcopy(init_decisions)

    sampling_param = [
        crossover_fct, mutation_fct, repair_fct, best_decisions, F,
        problem_size, CR, search_space, distrib_index_n, pm,
    ]

    ############################################################################
    # INITIALISATION

    qual_tools.resetGlobalVariables(filenameDIR, filenameSCORE,
                                    nb_iterations, nb_functions)
    eval_to.resetEval()

    # get the direction weights for both starting functions
    directions = dec.getDirections(nb_functions, nb_objectives)

    # init the neighboring constant
    nt.initNeighboringTab(nb_functions, neighboring_size, directions,
                          nb_objectives)

    # give global visibility to best_decisions to get the result at the end
    approx_pareto_front = best_decisions

    # initial best decision scores
    best_decisions_scores = [
        eval_to.free_eval(start_fct, best_decisions[i], problem_size)
        for i in range(nb_functions)
    ]

    pop_size = nb_functions

    # current optimal scores for both axes
    z_opt_scores = gt.getMinTabOf(best_decisions_scores)
    eval_to.initZstar(z_opt_scores)

    # get the first training part of the items we will learn on
    model_directions = train_to.getDirectionsTrainingMatrix(directions)

    # whether the data shall be written to a file
    writeOK = False
    if file_to_write != NO_FILE_TO_WRITE:
        writeOK = True
    writeR2OK = False
    if file_to_writeR2 != NO_FILE_TO_WRITE:
        writeR2OK = True

    ############################################################################
    # MAIN ALGORITHM

    if writeOK:
        iot.printObjectives(file_to_write, eval_to.getNbEvals(), 0,
                            best_decisions_scores, problem_size,
                            nb_objectives)

    # ID tab to allow a random course through the directions in the main loop
    id_directions = [i for i in range(nb_functions)]

    # iterations loop
    for itera in range(nb_iterations):
        if not free_eval:
            # update model
            training_inputs, training_outputs, training_set_size, \
                training_scores = train_to.getTrainingSet(
                    model_directions, best_decisions, best_decisions_scores,
                    eval_to.getZstar_with_decal(), strategy, nb_functions,
                    training_neighborhood_size,
                )
            clf.fit(training_inputs, training_outputs)

        """
        if writeR2OK and not free_eval:
            training_inputs_tcheby = eval_to.getManyTcheby(
                training_inputs, training_scores,
                eval_to.getZstar_with_decal(), training_set_size)

            random_index = numpy.arange(0, training_set_size)
            numpy.random.shuffle(random_index)

            n_folds = 10
            folds_sizes = ((training_set_size // n_folds)
                           * numpy.ones(n_folds, dtype=numpy.int))
            folds_sizes[:training_set_size % n_folds] += 1

            training_inputs_array = numpy.array(training_inputs)
            training_tcheby_array = numpy.array(training_inputs_tcheby)

            R2_cv = []
            MSE_cv = []
            MAE_cv = []
            MDAE_cv = []

            clfCV = NuSVR()

            current = 0
            for fold_size in folds_sizes:
                start, stop = current, current + fold_size
                mask = numpy.ones(training_set_size, dtype=bool)
                mask[start:stop] = 0
                current = stop

                clfCV.fit(training_inputs_array[random_index[mask]],
                          training_tcheby_array[random_index[mask]])

                test_fold_tcheby = training_tcheby_array[random_index[start:stop]]
                test_fold_predict = clfCV.predict(
                    training_inputs_array[random_index[start:stop]])

                R2_cv.append(r2_score(test_fold_tcheby, test_fold_predict))
                MSE_cv.append(mean_squared_error(test_fold_tcheby,
                                                 test_fold_predict))
                MAE_cv.append(mean_absolute_error(test_fold_tcheby,
                                                  test_fold_predict))
                MDAE_cv.append(median_absolute_error(test_fold_tcheby,
                                                     test_fold_predict))

            R2 = clf.score(training_inputs, training_outputs)
            MSE_cv_mean = numpy.mean(MSE_cv)
            RMSE_cv_mean = math.sqrt(MSE_cv_mean)
            MAE_cv_mean = numpy.mean(MAE_cv)
            MDAE_cv_mean = numpy.mean(MDAE_cv)
            R2_cv_mean = numpy.mean(R2_cv)

            iot.printR2(file_to_writeR2, eval_to.getNbEvals(), itera, R2,
                        R2_cv_mean, MSE_cv_mean, MAE_cv_mean, MDAE_cv_mean,
                        RMSE_cv_mean, problem_size, print_every=1)
        """

        # random course through the directions
        random.shuffle(id_directions)

        # functions loop
        for f in id_directions:
            # get the indices of all neighbors of a function within a certain
            # distance of f, including f itself
            f_neighbors, current_neighbourhing_size = \
                nt.getNeighborsOf(f, delta_neighbourhood)

            # get a list of offspring from the neighbors
            list_offspring = samp_to.extended_sampling(f, f_neighbors,
                                                       sampling_param,
                                                       nb_samples)

            # apply a filter on the offspring list and select the best one
            filter_param = [
                itera, f, clf, clf2, two_models_bool, f_neighbors,
                list_offspring, model_directions, start_fct, problem_size,
                eval_to.getZstar_with_decal(), best_decisions_scores,
                best_decisions, nb_objectives,
            ]
            best_candidate = filt_to.model_based_filtring(filter_strat,
                                                          free_eval,
                                                          filter_param)

            # evaluation of the newly made solution
            mix_scores = eval_to.eval(start_fct, best_candidate, problem_size)

            # update of the z_star point
            has_changed = eval_to.min_update_Z_star(mix_scores, nb_objectives)

            # retrain the model with the new z_star
            if has_changed and not free_eval:
                train_to.updateTrainingZstar(eval_to.getZstar_with_decal())
                training_outputs = train_to.retrainSet(
                    training_inputs, training_scores,
                    eval_to.getZstar_with_decal(), training_set_size,
                    nb_objectives)
                clf.fit(training_inputs, training_outputs)

            # boolean that is True if the offspring has been added to the archive
            added_to_S = False

            # count how many best decisions have been changed by the new offspring
            cmpt_best_maj = 0

            # random course through the neighbors list
            random.shuffle(f_neighbors)

            # course through the neighbors list
            for j in f_neighbors:
                # stop if the maximal number of replacements is already reached
                if cmpt_best_maj >= max_decisions_maj:
                    break

                # compute g_tcheby
                # wj = (directions[0][j], directions[1][j])
                wj = [directions[obj][j] for obj in range(0, nb_objectives)]
                g_mix = eval_to.g_tcheby(wj, mix_scores,
                                         eval_to.getZstar_with_decal())
                g_best = eval_to.g_tcheby(wj, best_decisions_scores[j],
                                          eval_to.getZstar_with_decal())

                # if the g_tcheby of the new solution is less distant from the
                # z_optimal solution than the current best solution of
                # function j, replace it
                if g_mix < g_best:
                    cmpt_best_maj += 1
                    best_decisions[j] = best_candidate
                    best_decisions_scores[j] = mix_scores

                    # if we manage the archive and the solution has not been
                    # added already
                    if archiveOK and not added_to_S:
                        arch_to.archivePut(best_candidate, mix_scores)
                        added_to_S = True

        # print("Update", itera, "done.")

        # if we manage an archive, maintain it
        if archiveOK:
            arch_to.maintain_archive()

        # if we write the result to a file
        if writeOK:
            iot.printObjectives(
                file_to_write, eval_to.getNbEvals(), itera + 1,
                best_decisions_scores, problem_size, nb_objectives,
                print_every=param_print_every,
            )
            continue

        # graphic update
        # yield arch_to.getArchiveScore(), best_decisions_scores, itera + 1,
        #       eval_to.getNbEvals(), eval_to.getZstar_with_decal(),
        #       pop_size, isReals

    if not free_eval and writeR2OK:
        qual_tools.computeQualityEvaluation()
        qual_tools.generateDiffPredFreeFile()
    return
print('LinearSVR precision train: {}'.format(lsvr_score_train))
lsvr_score_test = lsvr.score(smr_test.feature_matrix, smr_test.labels)
print('LinearSVR precision test: {}'.format(lsvr_score_test))
print('')

nusvc = NuSVC()
print('NuSVC config:')
print(nusvc.get_params())
nusvc.fit(smr_train.feature_matrix, smr_train.labels)
nusvc_score_train = nusvc.score(smr_train.feature_matrix, smr_train.labels)
print('NuSVC precision train: {}'.format(nusvc_score_train))
nusvc_score_test = nusvc.score(smr_test.feature_matrix, smr_test.labels)
print('NuSVC precision test: {}'.format(nusvc_score_test))
print('')

nusvr = NuSVR()
print('NuSVR config:')
print(nusvr.get_params())
nusvr.fit(smr_train.feature_matrix, smr_train.labels)
# score the NuSVR model itself (the original mistakenly called svc.score here)
nusvr_score_train = nusvr.score(smr_train.feature_matrix, smr_train.labels)
print('NuSVR precision train: {}'.format(nusvr_score_train))
nusvr_score_test = nusvr.score(smr_test.feature_matrix, smr_test.labels)
print('NuSVR precision test: {}'.format(nusvr_score_test))
print('')

dtc = DecisionTreeClassifier()
print('DecisionTreeClassifier config:')
print(dtc.get_params())
dtc.fit(smr_train.feature_matrix, smr_train.labels)
dtc_score_train = dtc.score(smr_train.feature_matrix, smr_train.labels)
def nusvrtrain(x, y, pre_x):
    x, pre_x = datscater(x, pre_x)
    clf = NuSVR(C=5.0).fit(x, y)
    pred = clf.predict(pre_x)
    return pred
import numpy as np
import pickle
from build_database import flux_obj
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from matplotlib import pyplot as plt

with open('database_lat.pkl', 'rb') as file:
    db = pickle.load(file)

print(db.keys())

S = NuSVR(kernel='rbf')

X = []
Y = []

for k in db.keys():
    # k = db.keys()[5]
    # print(np.array(k))
    t = np.linspace(0, db[k].RES_FINT, db[k].NUM_T)
    # X = np.atleast_2d(t).T
    # Y = np.power(10, db[k].N)
    inp = np.vstack([np.outer(np.array([k[0], k[3]]),
                              np.ones(int(db[k].NUM_T))), t]).T
    X.extend(inp)
    Y.extend(np.power(10, db[k].N))
    # Y.extend(db[k].N)
def func_model(X_train, y_train):
    '''
    Build one model from the training data.
    Input: X_train, y_train
    Output: the fitted regressor selected by the global `model` flag
    '''
    global model
    if model == 'XG':
        reg = XGBRegressor()
    elif model == 'RD':
        reg = RidgeCV(alphas=(0.1, 1.0, 10.0), fit_intercept=True,
                      normalize=False, scoring=None, cv=5, gcv_mode=None,
                      store_cv_values=False)
    elif model == 'LS':
        reg = LassoCV(max_iter=10 ** 8)
    elif model == 'LLS':
        reg = LassoLarsCV()
    elif model == 'ADA':
        reg = AdaBoostRegressor()
    elif model == 'EN':
        reg = ElasticNetCV()
    elif model == 'DT':
        reg = DecisionTreeRegressor(criterion="mse", splitter="best",
                                    max_depth=None, min_samples_split=2,
                                    min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0,
                                    max_features=None, random_state=None,
                                    max_leaf_nodes=None,
                                    min_impurity_decrease=0.0,
                                    min_impurity_split=None)
    elif model == 'SVR':
        reg = SVR()
    elif model == 'KN':
        reg = KNeighborsRegressor(n_neighbors=5, weights="uniform",
                                  algorithm="auto", leaf_size=30, p=2,
                                  metric="minkowski", metric_params=None)
    elif model == 'BG':
        reg = BaggingRegressor(base_estimator=LassoCV(max_iter=10 ** 8),
                               n_estimators=10, max_samples=1.0,
                               max_features=1.0, bootstrap=True,
                               bootstrap_features=False, oob_score=False,
                               warm_start=True, random_state=None, verbose=0)
    elif model == 'GB':
        reg = GradientBoostingRegressor(loss="ls", learning_rate=0.1,
                                        n_estimators=100, subsample=1.0,
                                        criterion="friedman_mse",
                                        min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.0,
                                        max_depth=3,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None, init=None,
                                        random_state=None, max_features=None,
                                        alpha=0.9, verbose=0,
                                        max_leaf_nodes=None, warm_start=False,
                                        validation_fraction=0.1,
                                        n_iter_no_change=None, tol=0.0001)
    elif model == 'ET':
        reg = ExtraTreesRegressor()
    elif model == 'RF':
        reg = RandomForestRegressor()
    elif model == 'ST':
        estimators = [
            ('ADA', AdaBoostRegressor()),
            ('LS', LassoCV(max_iter=10 ** 8)),
            ('LLS', LassoLarsCV()),
            ('RD', RidgeCV()),
            ('XG', XGBRegressor()),
            ('KN', KNeighborsRegressor())
        ]
        reg = StackingRegressor(estimators=estimators)
    elif model == 'NSVR':
        reg = NuSVR()
    elif model == 'ST2':
        estimators = [RidgeCV(), AdaBoostRegressor(),
                      LassoCV(max_iter=10 ** 8), LassoLarsCV(),
                      XGBRegressor(), KNeighborsRegressor(), ElasticNetCV()]
        reg = StackingCVRegressor(regressors=estimators,
                                  meta_regressor=LassoCV(max_iter=10 ** 8))
    elif model == 'LR':
        reg = LinearRegression()
    elif model == 'NN':
        reg = MLPRegressor(learning_rate='adaptive', max_iter=1000)
    reg.fit(X_train, y_train)
    return reg
df = test_regressor(ARDRegression(compute_score=True, copy_X=True), df)
# test_regressor(LogisticRegressionCV(cv=5)) - it's used for classification
df = test_regressor(SGDRegressor(), df)
df = test_regressor(PassiveAggressiveRegressor(), df)
df = test_regressor(RANSACRegressor(), df)
df = test_regressor(TheilSenRegressor(copy_X=True), df)
df = test_regressor(HuberRegressor(), df)
df = test_regressor(AdaBoostRegressor(n_estimators=1000), df)
df = test_regressor(BaggingRegressor(n_estimators=1000), df)
df = test_regressor(ExtraTreesRegressor(n_estimators=1000), df)
df = test_regressor(GradientBoostingRegressor(n_estimators=1000), df)
df = test_regressor(RandomForestRegressor(n_estimators=1000), df)
df = test_regressor(GaussianProcessRegressor(), df)
# df = test_regressor(IsotonicRegression(), df) - has errors
df = test_regressor(LinearSVR(), df)
df = test_regressor(NuSVR(), df)
df = test_regressor(SVR(), df)
df = test_regressor(XGBRegressor(n_estimators=1000), df)
df = test_regressor(lgb.LGBMRegressor(n_estimators=1000), df)
df = test_regressor(CatBoostRegressor(n_estimators=1000), df)
df = test_regressor(DecisionTreeRegressor(max_depth=3), df)
df = test_regressor(KNeighborsRegressor(), df)
# df = test_regressor(RadiusNeighborsRegressor(), df) - also has errors
df = test_regressor(DummyRegressor(), df)
df = test_regressor(
    StackingRegressor(regressors=[
        GradientBoostingRegressor(n_estimators=1000),
        HuberRegressor(),
        RidgeCV(cv=5),
# In[ ]:

xgb_params = {'eta': 0.03,
              'max_depth': 9,
              'subsample': 0.85,
              'objective': 'reg:linear',
              'eval_metric': 'mae',
              'silent': True,
              'nthread': 4}
oof_xgb, prediction_xgb = train_model(X=X_train_scaled, X_test=X_test_scaled,
                                      params=xgb_params, model_type='xgb')

# In[ ]:

model = NuSVR(gamma='scale', nu=0.9, C=10.0, tol=0.01)
oof_svr, prediction_svr = train_model(X=X_train_scaled, X_test=X_test_scaled,
                                      params=None, model_type='sklearn',
                                      model=model)

# In[ ]:

model = NuSVR(gamma='scale', nu=0.7, tol=0.01, C=1.0)
oof_svr1, prediction_svr1 = train_model(X=X_train_scaled, X_test=X_test_scaled,
                                        params=None, model_type='sklearn',
                                        model=model)

# In[ ]:

params = {'loss_function': 'MAE'}
oof_cat, prediction_cat = train_model(X=X_train_scaled, X_test=X_test_scaled,
                                      params=params, model_type='cat')

# In[ ]:
class NuSVRScikitTest(unittest.TestCase):
    """
    Unit test class for testing the scikit-learn converter.
    """

    @classmethod
    def setUpClass(self):
        """
        Set up the unit test by loading the dataset and training a model.
        """
        if not HAS_SKLEARN:
            return

        self.scikit_model = NuSVR(kernel='linear')
        self.data = load_boston()
        self.scikit_model.fit(self.data['data'], self.data['target'])

    def test_conversion_bad_inputs(self):
        # Error on converting an untrained model
        with self.assertRaises(TypeError):
            model = NuSVR()
            spec = scikit_converter.convert(model, 'data', 'out')

        # Check the expected class during conversion
        with self.assertRaises(TypeError):
            model = OneHotEncoder()
            spec = scikit_converter.convert(model, 'data', 'out')

    @pytest.mark.slow
    def test_evaluation_stress_test(self):
        self._test_evaluation(allow_slow=True)

    def test_evaluation(self):
        self._test_evaluation(allow_slow=False)

    def _test_evaluation(self, allow_slow):
        """
        Test that the same predictions are made.
        """
        # Generate some smallish random data (some kernels take too long on
        # anything bigger)
        x, y = [], []
        for _ in range(50):
            cur_x1, cur_x2 = random.gauss(2, 3), random.gauss(-1, 2)
            x.append([cur_x1, cur_x2])
            y.append(1 + 2 * cur_x1 + 3 * cur_x2)

        input_names = ['x1', 'x2']
        df = pd.DataFrame(x, columns=input_names)

        # Parameters to test
        kernel_parameters = [{}, {'kernel': 'rbf', 'gamma': 1.2},
                             {'kernel': 'linear'},
                             {'kernel': 'poly'},
                             {'kernel': 'poly', 'degree': 2},
                             {'kernel': 'poly', 'gamma': 0.75},
                             {'kernel': 'poly', 'degree': 0, 'gamma': 0.9,
                              'coef0': 2},
                             {'kernel': 'sigmoid'},
                             {'kernel': 'sigmoid', 'gamma': 1.3},
                             {'kernel': 'sigmoid', 'coef0': 0.8},
                             {'kernel': 'sigmoid', 'coef0': 0.8, 'gamma': 0.5}]
        non_kernel_parameters = [{}, {'C': 1}, {'C': 1.5, 'shrinking': True},
                                 {'C': 0.5, 'shrinking': False, 'nu': 0.9}]

        # Test
        for param1 in non_kernel_parameters:
            for param2 in kernel_parameters:
                cur_params = param1.copy()
                cur_params.update(param2)

                cur_model = NuSVR(**cur_params)
                cur_model.fit(x, y)
                df['prediction'] = cur_model.predict(x)

                spec = scikit_converter.convert(cur_model, input_names,
                                                'target')

                if is_macos() and macos_version() >= (10, 13):
                    metrics = evaluate_regressor(spec, df)
                    self.assertAlmostEquals(metrics['max_error'], 0)

                if not allow_slow:
                    break

            if not allow_slow:
                break
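For context, a hedged sketch of the conversion path this test exercises, assuming coremltools with its scikit-learn converter (`coremltools.converters.sklearn.convert`) is installed; names and shapes are illustrative:

import coremltools
from sklearn.svm import NuSVR
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=50, n_features=2, random_state=0)
model = NuSVR(kernel='linear').fit(X, y)
# convert the fitted model, naming the two inputs and the output feature
spec = coremltools.converters.sklearn.convert(model, ['x1', 'x2'], 'target')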
def __init__(self, task_type="linearsvc"):
    self.task_type = task_type
    assert self.task_type in {
        "linearsvc", "linearsvr", "nusvc", "nusvr", "oneclasssvm", "svc",
        "svr", "l1_min_c"
    }
    if self.task_type == "linearsvc":
        # linear support vector classification
        self.model = LinearSVC(penalty='l2', loss='squared_hinge', dual=True,
                               tol=1e-4, C=1.0, multi_class='ovr',
                               fit_intercept=True, intercept_scaling=1,
                               class_weight=None, verbose=0,
                               random_state=None, max_iter=1000)
    elif self.task_type == "linearsvr":
        # linear support vector regression
        self.model = LinearSVR(epsilon=0.0, tol=1e-4, C=1.0,
                               loss='epsilon_insensitive', fit_intercept=True,
                               intercept_scaling=1., dual=True, verbose=0,
                               random_state=None, max_iter=1000)
    elif self.task_type == "nusvc":
        # Nu support vector classification
        self.model = NuSVC(nu=0.5, kernel='rbf', degree=3, gamma='scale',
                           coef0=0.0, shrinking=True, probability=False,
                           tol=1e-3, cache_size=200, class_weight=None,
                           verbose=False, max_iter=-1,
                           decision_function_shape='ovr', break_ties=False,
                           random_state=None)
    elif self.task_type == "nusvr":
        # Nu support vector regression
        self.model = NuSVR(nu=0.5, C=1.0, kernel='rbf', degree=3,
                           gamma='scale', coef0=0.0, shrinking=True,
                           tol=1e-3, cache_size=200, verbose=False,
                           max_iter=-1)
    elif self.task_type == "oneclasssvm":
        # unsupervised outlier detection
        self.model = OneClassSVM(kernel='rbf', degree=3, gamma='scale',
                                 coef0=0.0, tol=1e-3, nu=0.5, shrinking=True,
                                 cache_size=200, verbose=False, max_iter=-1)
    elif self.task_type == "svc":
        # C support vector classification
        self.model = SVC(C=1.0, kernel='rbf', degree=3, gamma='scale',
                         coef0=0.0, shrinking=True, probability=False,
                         tol=1e-3, cache_size=200, class_weight=None,
                         verbose=False, max_iter=-1,
                         decision_function_shape='ovr', break_ties=False,
                         random_state=None)
    else:
        # epsilon support vector regression
        self.model = SVR(kernel='rbf', degree=3, gamma='scale', coef0=0.0,
                         tol=1e-3, C=1.0, epsilon=0.1, shrinking=True,
                         cache_size=200, verbose=False, max_iter=-1)
# Parameters
depth = 60
horizon = 7

# Form feature and target vectors
featureVectors, targetVectors = util.formFeatureAndTargetVectorsMultiHorizon(
    correctedSeries, depth, horizon)

outputFolderName = ("Outputs/Outputs"
                    + datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
os.mkdir(outputFolderName)

for i in range(horizon):
    # Train a separate model for each horizon
    # model = Pipeline([('poly', PolynomialFeatures(degree=2)),
    #                   ('linear', LinearRegression(fit_intercept=False))])
    # model = NuSVR(kernel='linear', nu=1.0)
    model = NuSVR(kernel="rbf", nu=1.0, tol=1e-10, gamma=1.0)
    # model = RidgeCV()
    model.fit(featureVectors, targetVectors[:, i])
    predictedTargetVectors = model.predict(featureVectors)

    # Plot the actual and predicted values
    actual = targetVectors[:, i]
    predicted = predictedTargetVectors

    # Descale
    actual = util.scalingFunction.inverse_transform(actual)
    predicted = util.scalingFunction.inverse_transform(predicted)

    outplot = outputPlot.OutputPlot(
        outputFolderName + "/Prediction_horizon" + str(i + 1) + ".html",
        "Facebook Fans Change - Linear Regression", "Taylor Swift",
        "Time", "Output")
    outplot.setXSeries(np.arange(1, targetVectors.shape[0]))
# normalize train data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
print(X_train_scaled)

# In[6]:

# apply model
# from sklearn.isotonic import IsotonicRegression
# from sklearn.linear_model import ElasticNet
# from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn import svm
from sklearn.svm import NuSVR

model = NuSVR()
model.fit(X_train_scaled, y_train.values.flatten())
y_pred = model.predict(X_train_scaled)

# In[7]:

# plt.figure(figsize=(6, 6))
# plt.scatter(y_train.values, y_pred)
# plt.xlim(0, 20)
# plt.ylim(0, 20)
# plt.xlabel('actual', fontsize=12)
# plt.ylabel('predicted', fontsize=12)
# plt.plot([(0, 0), (20, 20)], [(0, 0), (20, 20)])
# plt.show()

plt.figure(figsize=(16, 8))
def regress(X_train, y_train):
    # comment out any regressor that should not be used
    classifiers = [
        (SGDRegressor(), "SGDRegressor", 1 * global_data_scale),
        (LinearRegression(), "LinearRegression", 1 * global_data_scale),
        (Ridge(), "Ridge", 1 * global_data_scale),
        (Lasso(), "Lasso", 1 * global_data_scale),
        (ElasticNet(), "ElasticNet", 1 * global_data_scale),
        (Lars(), "Lars", 1 * global_data_scale),
        (OrthogonalMatchingPursuit(), "OrthogonalMatchingPursuit",
         1 * global_data_scale),
        (BayesianRidge(), "BayesianRidge", 1 * global_data_scale),
        (ARDRegression(), "ARDRegression", 1 * global_data_scale),
        # NOTE the scoring might be different for PassiveAggressiveRegressor
        (PassiveAggressiveRegressor(), "PassiveAggressiveRegressor",
         1 * global_data_scale),
        # NOTE the scoring might be different for RANSACRegressor
        (RANSACRegressor(), "RANSACRegressor", 1 * global_data_scale),
        (TheilSenRegressor(), "TheilSenRegressor", 1 * global_data_scale),
        (HuberRegressor(), "HuberRegressor", 1 * global_data_scale),
        (DecisionTreeRegressor(), "DecisionTreeRegressor",
         1 * global_data_scale),
        (GaussianProcessRegressor(), "GaussianProcessRegressor",
         1 * global_data_scale),
        (MLPRegressor(), "MLPRegressor", 1 * global_data_scale),
        (KNeighborsRegressor(), "KNeighborsRegressor",
         1 * global_data_scale),
        (RadiusNeighborsRegressor(), "RadiusNeighborsRegressor",
         1 * global_data_scale),
        (SVR(), "SVR", 1 * global_data_scale),
        (NuSVR(), "NuSVR", 1 * global_data_scale),
        (LinearSVR(), "LinearSVR", 1 * global_data_scale),
        # name fixed from "KernalRidge" so the params_dict lookup below works
        (KernelRidge(), "KernelRidge", 1 * global_data_scale),
        (IsotonicRegression(), "IsotonicRegression", 1 * global_data_scale)
    ]

    # set the lists of values that should be used in the grid search
    params_dict = {
        "SGDRegressor": {
            "penalty": ["l2", "l1"],
            "alpha": [.001, .0001, .00001],
            "l1_ratio": [.15, .2, .25],
            "fit_intercept": [True, False],
            "max_iter": [1000],
            "shuffle": [True, False],
            "epsilon": [.05, .1, .2],
            "learning_rate": ["constant", "optimal", "invscaling",
                              "adaptive"],
            "eta0": [.005, .01, .02],
            "power_t": [.2, .25, .3]
        },
        "LinearRegression": {
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "Ridge": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "tol": [.01, .001, .0001],
            "solver": ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
        },
        "Lasso": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "positive": [True, False],
            "precompute": [True, False]
        },
        "ElasticNet": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "precompute": [True, False],
            "positive": [True, False],
            "selection": ["cyclic", "random"]
        },
        "Lars": {
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "precompute": [True, False],
            "n_nonzero_coefs": [np.inf]
        },
        "OrthogonalMatchingPursuit": {
            "n_nonzero_coefs": [np.inf, None],
            "precompute": [True, False],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "BayesianRidge": {
            "tol": [.01, .001, .0001],
            "alpha_1": [1e-5, 1e-6, 1e-7],
            "alpha_2": [1e-5, 1e-6, 1e-7],
            "lambda_1": [1e-5, 1e-6, 1e-7],
            "lambda_2": [1e-5, 1e-6, 1e-7],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "ARDRegression": {
            "tol": [.01, .001, .0001],
            "alpha_1": [1e-5, 1e-6, 1e-7],
            "alpha_2": [1e-5, 1e-6, 1e-7],
            "lambda_1": [1e-5, 1e-6, 1e-7],
            "lambda_2": [1e-5, 1e-6, 1e-7],
            "threshold_lambda": [1000, 10000, 100000],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "PassiveAggressiveRegressor": {
            "C": [.8, 1., 1.2],
            "tol": [1e-2, 1e-3, 1e-4],
            "n_iter_no_change": [3, 5, 8],
            "shuffle": [True, False],
            "average": [True, False]
        },
        "RANSACRegressor": {
            "base_estimator": [LinearRegression()]
        },
        "TheilSenRegressor": {
            "max_subpopulation": [1e3, 1e4, 1e5],
            "tol": [1e-2, 1e-3, 1e-4]
        },
        "HuberRegressor": {
            "epsilon": [1.1, 1.35, 1.5],
            "alpha": [1e-3, 1e-4, 1e-5],
            "warm_start": [True, False],
            "fit_intercept": [True, False],
            # the original key here was empty; "tol" is assumed from the values
            "tol": [1e-4, 1e-5, 1e-6]
        },
        "DecisionTreeRegressor": {
            "criterion": ["mse", "friedman_mse", "mae"],
            "splitter": ["best", "random"],
            "min_samples_split": [2, 3],
            "min_samples_leaf": [1, 2],
            "min_weight_fraction_leaf": [.0],
            "max_features": ["auto", "sqrt", "log2"],
            "min_impurity_split": [1e-6, 1e-7, 1e-8]
        },
        "GaussianProcessRegressor": {
            "alpha": [1e-8, 1e-10, 1e-12],
            "optimizer": ["fmin_l_bfgs_b"],
            "normalize_y": [True, False]
        },
        "MLPRegressor": {
            "hidden_layer_sizes": [(100,)],
            "activation": ["identity", "logistic", "tanh", "relu"],
            "solver": ["lbfgs", "sgd", "adam"],
            "alpha": [1e-3, 1e-4, 1e-5],
            # "learning_rate": ["constant", "invscaling", "adaptive"],
            # "learning_rate_init": [1e-2, 1e-3, 1e-4],
            # "power_t": [.3, .5, .8],
            # "shuffle": [True, False],
            # "tol": [1e-3, 1e-4, 1e-5],
            # "momentum": [.8, .9, .99],
            # "beta_1": [.8, .9, .99],
            # "beta_2": [.999],
            # "epsilon": [1e-7, 1e-8, 1e-9],
            # "n_iter_no_change": [10],
            # "max_fun": [15000]
        },
        "KNeighborsRegressor": {
            "n_neighbors": [20, 10, 5, 3],
            "weights": ["uniform", "distance"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "leaf_size": [20, 30, 40],
            "p": [1, 2]
        },
        "RadiusNeighborsRegressor": {
            "radius": [.8, 1, 1.2],
            "n_neighbors": [20, 10, 5, 3],
            "weights": ["uniform", "distance"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "leaf_size": [20, 30, 40],
            "p": [1, 2]
        },
        "SVR": {
            "kernel": ["poly", "rbf", "sigmoid"],
            "degree": [2, 3, 5],
            "gamma": ["scale", "auto"],
            "coef0": [.0],
            "tol": [1e-2, 1e-3, 1e-4],
            "C": [.8, .1, 1.2],
            "epsilon": [.08, .1, .12],
            "shrinking": [True, False],
            "max_iter": [-1]
        },
        "NuSVR": {
            "nu": [.2, .5, .8],
            "C": [.8, .1, 1.2],
            "kernel": ["poly", "rbf", "sigmoid"],
            "degree": [2, 3, 5],
            "gamma": ["scale", "auto"],
            "coef0": [.0],
            "shrinking": [True, False],
            "tol": [1e-2, 1e-3, 1e-4],
            "max_iter": [-1]
        },
        "LinearSVR": {
            "epsilon": [.0],
            "tol": [1e-3, 1e-4, 1e-5],
            "C": [.8, .1, 1.2],
            "fit_intercept": [True, False],
            "dual": [True, False],
            "intercept_scaling": [.8, 1., 1.2]
        },
        "KernelRidge": {
            "coef0": [.8, 1, 1.2],
            "degree": [2, 3, 5],
        },
        "IsotonicRegression": {
            "increasing": [True, False],
        }
    }

    for model, params, frac in classifiers:
        full = pd.DataFrame(X_train).join(pd.DataFrame(y_train))
        loan_data = full.sample(frac=frac, random_state=random_state)
        X = loan_data.drop("loan_status", axis=1)
        y = loan_data["loan_status"]

        grid = GridSearchCV(model, params_dict[params], verbose=verbose,
                            cv=folds, n_jobs=workers)
        grid.fit(X, y)
        yield grid, params
def run_kernel(input_dir, verbose=False):
    if verbose:
        print(os.listdir(input_dir))

    train = pd.read_csv(
        input_dir / 'train.csv',
        dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})
    if verbose:
        print(train.head())

    pd.options.display.precision = 15
    print(train.head())

    # Create a training file with simple derived features
    rows = 150_000
    segments = int(np.floor(train.shape[0] / rows))

    X_train = pd.DataFrame(index=range(segments), dtype=np.float64,
                           columns=['ave', 'std', 'max', 'min'])
    y_train = pd.DataFrame(index=range(segments), dtype=np.float64,
                           columns=['time_to_failure'])

    for segment in tqdm(range(segments)):
        seg = train.iloc[segment * rows:segment * rows + rows]
        x = seg['acoustic_data'].values
        y = seg['time_to_failure'].values[-1]
        y_train.loc[segment, 'time_to_failure'] = y
        X_train.loc[segment, 'ave'] = x.mean()
        X_train.loc[segment, 'std'] = x.std()
        X_train.loc[segment, 'max'] = x.max()
        X_train.loc[segment, 'min'] = x.min()

    if verbose:
        print(X_train.head())

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)

    svm = NuSVR()
    svm.fit(X_train_scaled, y_train.values.flatten())
    y_pred = svm.predict(X_train_scaled)

    if verbose:
        plt.figure(figsize=(6, 6))
        plt.scatter(y_train.values.flatten(), y_pred)
        plt.xlim(0, 20)
        plt.ylim(0, 20)
        plt.xlabel('actual', fontsize=12)
        plt.ylabel('predicted', fontsize=12)
        plt.plot([(0, 0), (20, 20)], [(0, 0), (20, 20)])
        plt.show()

    score = mean_absolute_error(y_train.values.flatten(), y_pred)
    if verbose:
        print(f'Score: {score:0.3f}')

    submission = pd.read_csv(
        input_dir / 'sample_submission.csv', index_col='seg_id')
    X_test = pd.DataFrame(columns=X_train.columns, dtype=np.float64,
                          index=submission.index)

    for seg_id in X_test.index:
        seg = pd.read_csv(input_dir / ('test/' + seg_id + '.csv'))
        x = seg['acoustic_data'].values
        X_test.loc[seg_id, 'ave'] = x.mean()
        X_test.loc[seg_id, 'std'] = x.std()
        X_test.loc[seg_id, 'max'] = x.max()
        X_test.loc[seg_id, 'min'] = x.min()

    X_test_scaled = scaler.transform(X_test)
    submission['time_to_failure'] = svm.predict(X_test_scaled)
    submission.to_csv('submission.csv')
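Note that the MAE above is computed on the training segments themselves; a hedged sketch of an out-of-sample estimate with the same features:

from sklearn.model_selection import cross_val_score
from sklearn.svm import NuSVR

cv_mae = -cross_val_score(NuSVR(), X_train_scaled, y_train.values.flatten(),
                          scoring='neg_mean_absolute_error', cv=5)
print('CV MAE: {:0.3f}'.format(cv_mae.mean()))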
model = SVR(C=100.0, gamma=0.1, cache_size=500)
# fit the training set
model.fit(train_X, train_Y.values.ravel())
# print the model coefficients
# print(model.intercept_)
# print(model.dual_coef_)
# predict the test set
test_Y_pred = model.predict(test_X)
print("Test set MSE:", mean_squared_error(test_Y, test_Y_pred))
print("Test set RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred)))
print("Total time:", time() - t, "seconds")

print("\n********** Testing the NuSVR class **********")
t = time()
model = GridSearchCV(NuSVR(cache_size=1000),
                     param_grid={
                         "C": np.logspace(-3, 3, 7),
                         "nu": np.linspace(0.1, 1, 10),
                         "gamma": np.logspace(-3, 3, 7)
                     },
                     cv=5)
model.fit(train_X, train_Y.values.ravel())
print("Best parameters: %s, score: %0.2f"
      % (model.best_params_, model.best_score_))
model = NuSVR(C=100.0, nu=0.3, gamma=0.1, cache_size=500)
# fit the training set
model.fit(train_X, train_Y.values.ravel())
# print the model coefficients
# print(model.intercept_)
# print(model.dual_coef_)
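Instead of re-instantiating NuSVR with hand-copied parameters after the search, the refit best estimator can be reused directly (a sketch; GridSearchCV refits on the full training set when refit=True, the default):

grid = GridSearchCV(NuSVR(cache_size=1000),
                    param_grid={"C": np.logspace(-3, 3, 7),
                                "nu": np.linspace(0.1, 1, 10),
                                "gamma": np.logspace(-3, 3, 7)},
                    cv=5)
grid.fit(train_X, train_Y.values.ravel())
test_Y_pred = grid.best_estimator_.predict(test_X)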
medv_ids = DataFrame(kneighbors[1] + 1,
                     columns=["neighbor(" + str(x + 1) + ")"
                              for x in range(regressor.n_neighbors)])
medv = pandas.concat((medv, medv_ids), axis=1)
store_csv(medv, name)

if "Housing" in datasets:
    build_housing(
        AdaBoostRegressor(DecisionTreeRegressor(min_samples_leaf=5,
                                                random_state=13),
                          n_estimators=17, random_state=13),
        "AdaBoostHousing")
    build_housing(BayesianRidge(), "BayesianRidgeHousing")
    build_housing(
        GBDTLMRegressor(GradientBoostingRegressor(n_estimators=31,
                                                  random_state=13),
                        LinearRegression()),
        "GBDTLMHousing")
    build_housing(
        GBDTLMRegressor(XGBRFRegressor(n_estimators=17, max_depth=5,
                                       random_state=13),
                        SGDRegressor(penalty="elasticnet", random_state=13)),
        "XGBRFLMHousing")
    build_housing(HistGradientBoostingRegressor(max_iter=31, random_state=13),
                  "HistGradientBoostingHousing")
    build_housing(KNeighborsRegressor(), "KNNHousing", with_kneighbors=True)
    build_housing(
        MLPRegressor(activation="tanh", hidden_layer_sizes=(26,),
                     solver="lbfgs", tol=0.001, max_iter=1000,
                     random_state=13),
        "MLPHousing")
    build_housing(SGDRegressor(random_state=13), "SGDHousing")
    build_housing(SVR(gamma="auto"), "SVRHousing")
    build_housing(LinearSVR(random_state=13), "LinearSVRHousing")
    build_housing(NuSVR(gamma="auto"), "NuSVRHousing")
    build_housing(
        VotingRegressor([("dt", DecisionTreeRegressor(random_state=13)),
                         ("lr", LinearRegression())]),
        "VotingEnsembleHousing")

visit_X, visit_y = load_visit("Visit")

def build_visit(regressor, name):
    mapper = DataFrameMapper(
        [(["edlevel"], [CategoricalDomain(), OneHotEncoder()])] +
        [([bin_column], [CategoricalDomain(), OneHotEncoder()])
         for bin_column in ["outwork", "female", "married", "kids", "self"]] +
        [(["age"], ContinuousDomain())] +
        [(["hhninc", "educ"], ContinuousDomain())]
    )
    pipeline = PMMLPipeline([
        ("mapper", mapper),
        ("regressor", regressor)
    ])
def objective(self, trial):
    if 'xgb' in str.lower(self.model_type):
        params = {
            'learning_rate': trial.suggest_uniform('learning_rate',
                                                   0.0001, 0.5),
            'max_depth': trial.suggest_int('max_depth', 1, 150),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree',
                                                      0.4, 1.0),
            'colsample_bynode': trial.suggest_uniform('colsample_bynode',
                                                      0.4, 1.0),
            'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
            'gamma': trial.suggest_uniform('gamma', 0.01, 10),
            'reg_alpha': trial.suggest_uniform('reg_alpha', 0, 1.0)
        }
        model = xgb.XGBRegressor(objective="reg:squarederror",
                                 random_state=42)
    elif 'rf' in str.lower(self.model_type):
        params = {
            'max_depth': trial.suggest_int('max_depth', 1, 150),
            'max_features': trial.suggest_categorical(
                'max_features', ['auto', 'sqrt', 'log2', None, 0.8, 0.6, 0.4]),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 250),
            'min_samples_split': trial.suggest_int('min_samples_split',
                                                   2, 250),
        }
        model = RandomForestRegressor(n_estimators=500, random_state=42)
    elif str.lower(self.model_type) == 'svm':
        params = {
            'C': trial.suggest_uniform('C', 1e-4, 1e3),
            'kernel': trial.suggest_categorical(
                'kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
            'gamma': trial.suggest_uniform('gamma', 1e-2, 10)
        }
        model = SVR(max_iter=1000000)
    elif str.lower(self.model_type) == 'nusvm':
        params = {
            'nu': trial.suggest_uniform('nu', 0.01, 0.99),
            'C': trial.suggest_uniform('C', 1e-4, 1e5),
            'gamma': trial.suggest_uniform('gamma', 1e-2, 10)
        }
        model = NuSVR(max_iter=1000000)
    elif 'mlp' in str.lower(self.model_type):
        n_layers = trial.suggest_int('n_layers', 1, 2)
        layers = []
        for i in range(n_layers):
            layers.append(
                trial.suggest_int('n_units_l{}'.format(i), 3, 800))
        params = {
            'alpha': trial.suggest_loguniform('alpha', 1e-5, 1e-1),
        }
        model = MLPRegressor(hidden_layer_sizes=layers, max_iter=1000,
                             early_stopping=True)
    return self.fit_model(model, params)
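A hedged usage sketch: an objective like the one above is typically driven by an Optuna study; `tuner` below stands in for the hypothetical object that owns `objective` and `fit_model`:

import optuna

study = optuna.create_study(direction='minimize')
study.optimize(tuner.objective, n_trials=50)  # `tuner` is hypothetical
print(study.best_params)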
class NuSvrClass:
    """
    Name : NuSVR
    Attribute : None
    Method : predict, predict_by_cv, save_model
    """

    def __init__(self):
        # algorithm name
        self._name = 'nusvr'
        # base path
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir))
        # suppress warning messages
        warnings.filterwarnings('ignore')
        # load the raw data
        data = pd.read_csv(self._f_path +
                           "/regression/resource/regression_sample.csv",
                           sep=",", encoding="utf-8")
        # masks splitting training and test data
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)
        # training data
        self._x_train, self._y_train = self.preprocessing(data[self._x])
        # test data
        self._x_test, self._y_test = self.preprocessing(data[self._y])
        # declare the model
        self._model = NuSVR(nu=0.5, cache_size=100)
        # train the model
        self._model.fit(self._x_train, self._y_train)

    # data preprocessing
    def preprocessing(self, data):
        # features
        x = []
        # labels
        y = []
        # window length (7 days)
        base_interval = 7
        # temperatures
        temps = list(data["temperature"])
        for i in range(len(temps)):
            if i < base_interval:
                continue
            y.append(temps[i])
            xa = []
            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y

    # plain prediction
    def predict(self, save_img=False, show_chart=False):
        # predict
        y_pred = self._model.predict(self._x_test)
        # score
        score = r2_score(self._y_test, y_pred)
        # report
        if hasattr(self._model, 'coef_') and hasattr(self._model,
                                                     'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')
        print(f'Score = {score}')
        # optionally save the chart image
        if save_img:
            self.save_chart_image(y_pred, show_chart)
        # predictions & score
        return [list(y_pred), score]

    # CV prediction (cross validation)
    def predict_by_cv(self):
        # for regression, implement cross validation to suit the actual project
        return False

    # GridSearchCV prediction
    def predict_by_gs(self):
        pass

    # save or refresh the model
    def save_model(self, renew=False):
        if not renew:
            # first save
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # replace the existing model
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(
                    self._f_path + f'/model/{self._name}_rg.pkl',
                    self._f_path +
                    f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')

    # save the regression chart
    def save_chart_image(self, data, show_chart):
        # figure size
        plt.figure(figsize=(15, 10), dpi=100)
        # labels
        plt.plot(self._y_test, c='r')
        # predictions
        plt.plot(data, c='b')
        # save as an image
        plt.savefig('./chart_images/tenki-kion-lr.png')
        # show the chart (optional)
        if show_chart:
            plt.show()

    def __del__(self):
        del (self._x_train, self._x_test, self._y_train, self._y_test,
             self._x, self._y, self._model)
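A short usage sketch for NuSvrClass, assuming the sample CSV exists at the expected path:

reg = NuSvrClass()
y_pred, score = reg.predict(save_img=False, show_chart=False)
reg.save_model(renew=False)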
def __init__(self, features=None, fit_target=None, **kwargs):
    super().__init__(features=features, fit_target=fit_target)
    self.impl = NuSVR(**kwargs)
def init_RM(self, **kwargs):
    """
    Initialisation of the Regression Model. All keyword parameters are
    passed on to the underlying model. If the model is not an ANN type,
    self.N_out separate RMs may be needed; they are stored in the
    self.RMs list.
    """
    self.RMs = []
    self.train_params = {}
    if self.RM_type in ('SK_ANN', 'SK_ANN_Dis'):
        self.RMs = [MLPRegressor(random_state=self.random_seed, **kwargs)]
        self._multi_predic = True
    elif self.RM_type == 'SK_SVM':
        for i in range(self.N_out):
            self.RMs.append(SVR(**kwargs))
        self._multi_predic = False
    elif self.RM_type == 'SK_NuSVM':
        for i in range(self.N_out):
            self.RMs.append(NuSVR(**kwargs))
        self._multi_predic = False
    elif self.RM_type == 'SK_BR':
        for i in range(self.N_out):
            self.RMs.append(BayesianRidge(**kwargs))
        self._multi_predic = False
    elif self.RM_type == 'SK_AB':
        for i in range(self.N_out):
            self.RMs.append(
                AdaBoostRegressor(random_state=self.random_seed, **kwargs))
        self._multi_predic = False
    elif self.RM_type in ("K_ANN", "K_ANN_Dis"):
        if not TF_OK:
            raise ValueError(
                'Tensorflow not installed, Keras RM_type not available')

        def get_kwargs(kw, default):
            if kw in kwargs:
                return kwargs[kw]
            else:
                return default

        activation = get_kwargs('activation', 'relu')
        kernel_initializer = get_kwargs(
            'kernel_initializer',
            initializers.glorot_uniform(seed=self.random_seed))
        if self.random_seed is None:
            bias_initializer = 'zeros'
        else:
            cst = np.random.rand()
            bias_initializer = initializers.Constant(0.1 + 0.05 + cst)
        optimizer = get_kwargs('optimizer', get_kwargs('solver', 'adam'))
        epochs = get_kwargs('epochs', 100)
        batch_size = get_kwargs('batch_size', None)
        validation_split = get_kwargs('validation_split', 0.0)
        hidden_layer_sizes = get_kwargs('hidden_layer_sizes', (10, 10))
        random_state = get_kwargs('random_state', self.random_seed)
        dropout = get_kwargs('dropout', None)
        L1 = get_kwargs('L1', 0.)
        L2 = get_kwargs('L2', 0.)
        tf.compat.v1.random.set_random_seed(random_state)
        model = Sequential()
        model.add(
            Dense(hidden_layer_sizes[0],
                  input_dim=self.N_in,
                  kernel_initializer=kernel_initializer,
                  bias_initializer=bias_initializer,
                  activation=activation,
                  kernel_regularizer=regularizers.l1_l2(l1=L1, l2=L2)))
        if dropout is not None:
            # dropout may be a scalar rate or one rate per layer
            if isinstance(dropout, (tuple, list)):
                d1 = dropout[0]
            else:
                d1 = dropout
            if d1 != 0.0:
                model.add(Dropout(d1, seed=random_state))
        for i_hl, hidden_layer_size in enumerate(hidden_layer_sizes[1:]):
            model.add(
                Dense(hidden_layer_size,
                      activation=activation,
                      kernel_initializer=kernel_initializer,
                      bias_initializer=bias_initializer,
                      kernel_regularizer=regularizers.l1_l2(l1=L1, l2=L2)))
            if dropout is not None:
                if isinstance(dropout, (tuple, list)):
                    di = dropout[i_hl + 1]
                else:
                    di = dropout
                if di != 0.0:
                    model.add(Dropout(di, seed=random_state))
        if self.RM_type == 'K_ANN':
            model.add(
                Dense(self.N_out,
                      activation='linear',
                      kernel_initializer=kernel_initializer,
                      bias_initializer=bias_initializer,
                      kernel_regularizer=regularizers.l1_l2(l1=L1, l2=L2)))
            metrics = get_kwargs('metrics', ['mse', 'mae'])
            model.compile(loss='mse', optimizer=optimizer, metrics=metrics)
            self.RMs = [model]
            self.train_params = {
                'epochs': epochs,
                'batch_size': batch_size,
                'verbose': False,
                'validation_split': validation_split
            }
            self._multi_predic = True
        elif self.RM_type == 'K_ANN_Dis':
            model.add(
                Dense(self.N_out,
                      activation='softmax',
                      kernel_initializer=kernel_initializer,
                      bias_initializer=bias_initializer,
                      kernel_regularizer=regularizers.l1_l2(l1=L1, l2=L2)))
            metrics = get_kwargs('metrics', ['accuracy'])
            model.compile(loss='categorical_crossentropy',
                          optimizer=optimizer,
                          metrics=metrics)
            if self.verbose:
                model.summary()
            self.RMs = [model]
            self.train_params = {
                'epochs': epochs,
                'batch_size': batch_size,
                'verbose': False,
                'validation_split': validation_split
            }
            self._multi_predic = True
    elif self.RM_type == 'KSK_ANN':

        def get_kwargs(kw, default):
            if kw in kwargs:
                return kwargs[kw]
            else:
                return default

        activation = get_kwargs('activation', 'relu')
        kernel_initializer = get_kwargs(
            'kernel_initializer',
            initializers.glorot_uniform(seed=self.random_seed))
        if self.random_seed is None:
            bias_initializer = 'zeros'
        else:
            bias_initializer = initializers.Constant(0.1)
        optimizer = get_kwargs('optimizer', get_kwargs('solver', 'adam'))
        epochs = get_kwargs('epochs', 1)
        batch_size = get_kwargs('batch_size', None)
        validation_split = get_kwargs('validation_split', 0.0)
        hidden_layer_sizes = get_kwargs('hidden_layer_sizes', (10, 10))
        random_state = get_kwargs('random_state', self.random_seed)
        tf.compat.v1.random.set_random_seed(random_state)

        def create_model(hidden_layer_sizes, N_in, activation, random_state,
                         N_out):
            model = Sequential()
            model.add(
                Dense(hidden_layer_sizes[0],
                      input_dim=N_in,
                      activation=activation))
            for hidden_layer_size in hidden_layer_sizes[1:]:
                model.add(Dense(hidden_layer_size, activation=activation))
            model.add(Dense(N_out, activation='linear'))
            metrics = ['mse', 'mae']
            model.compile(loss='mse', optimizer=optimizer, metrics=metrics)
            return model

        model = KerasRegressor(create_model,
                               hidden_layer_sizes=hidden_layer_sizes,
                               N_in=self.N_in,
                               activation=activation,
                               random_state=random_state,
                               N_out=self.N_out)
        # if self.verbose:
        #     model.summary()
        self.RMs = [model]
        self.train_params = {
            'epochs': epochs,
            'batch_size': batch_size,
            'verbose': False,
            'validation_split': validation_split
        }
        self._multi_predic = True  # TBC ***
    else:
        raise ValueError('Unknown Regression method {}'.format(self.RM_type))
    if self.verbose:
        print('Regression Model {}'.format(self.RM_type))
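# A minimal sketch (not from the source above) of how the per-output RM list
# built by init_RM behaves when _multi_predic is False: one NuSVR per output
# column, each fit and queried independently. X_train, Y_train and N_out are
# hypothetical stand-ins.
import numpy as np
from sklearn.svm import NuSVR

N_out = 3
X_train = np.random.rand(200, 5)
Y_train = np.random.rand(200, N_out)

RMs = [NuSVR(nu=0.5, C=1.0) for _ in range(N_out)]
for i, rm in enumerate(RMs):
    rm.fit(X_train, Y_train[:, i])  # each RM learns a single output column
pred = np.column_stack([rm.predict(X_train) for rm in RMs])  # shape (200, N_out)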
def _gerarPlotFit(list_index_real, list_y_real, list_index_previsto,
                  list_y_previsto, list_index_real_original,
                  x_predict_original, list_index_previsto_original,
                  list_y_previsto_original, list_y_real_original, isFit,
                  df_norm):
    global cach_fit
    # Plot the FIT curves
    if isFit:
        x_fit_real = [x + 1 for x in np.arange(len(list_index_real))]
        y_fit_real = list_y_real
        x_fit_previsto = np.asarray(
            [x + 1 for x in np.arange(len(list_index_previsto))],
            dtype=np.int32)
        y_fit_previsto = list_y_previsto
        x_fit_previsto_original = np.asarray(
            [x + 1 for x in np.arange(len(list_index_previsto_original))],
            dtype=np.int32)
        y_fit_previsto_original = list_y_previsto_original
        x_fit_real_original = np.asarray(
            [x + 1 for x in np.arange(len(list_index_real_original))],
            dtype=np.int32)
        y_fit_real_original = list_y_real_original
        list_x = np.arange(len(df_norm.index))
        parcela_x = (0 if len(x_fit_real) == 1 else
                     ceil(len(x_fit_real) * 0.4))
        # print(parcela_x)
        coefs_linear_reais = np.polyfit(x_fit_real, y_fit_real, 1)
        coefs_linear_previsto = np.polyfit(x_fit_previsto, y_fit_previsto, 1)
        coefs_linear_previsto_parcela = np.polyfit(
            x_fit_previsto[parcela_x:len(x_fit_previsto)],
            y_fit_previsto[parcela_x:len(x_fit_previsto)], 1)
        coefs_linear_previsto_peso = np.polyfit(
            x_fit_previsto, y_fit_previsto, 1,
            w=np.sqrt(x_fit_previsto[::-1]))
        if x_predict_original.sum() == 0 and len(cach_fit) != 0:
            ffit_reais = cach_fit[0]
            ffit_peso = cach_fit[1]
            ffit = cach_fit[2]
            fit_reta_previsto = cach_fit[3]
            fit_svr = cach_fit[4]
            fit_reta_previsto_parcela = cach_fit[5]
            fit_svr_ply = cach_fit[6]
            list_x = cach_fit[7]
        else:
            ffit_reais = np.poly1d(coefs_linear_reais)
            ffit_peso = np.poly1d(coefs_linear_previsto_peso)
            ffit = np.poly1d(coefs_linear_previsto)
            fit_reta_previsto_parcela = np.poly1d(
                coefs_linear_previsto_parcela)
            # FIT using the reduced line equation: y = a*(x - x0) + y0
            fit_reta_previsto = [
                ((y_fit_real_original[-1] - y_fit_real_original[0]) /
                 (x_fit_real_original[-1] - x_fit_real_original[0])) *
                (x - x_fit_real_original[0]) + y_fit_real_original[0]
                for x in list_x
            ]
            svr_nu = NuSVR(kernel='linear', C=1, gamma='scale', nu=0.9)
            svr_nu_poly = NuSVR(kernel='rbf', C=1, gamma='scale', nu=0.9)
            svr_nu.fit(x_fit_previsto_original.reshape(-1, 1),
                       y_fit_previsto_original)
            svr_nu_poly.fit(x_fit_previsto_original.reshape(-1, 1),
                            y_fit_previsto_original)
            fit_svr = svr_nu.predict(list_x.reshape(-1, 1))
            fit_svr_ply = svr_nu_poly.predict(list_x.reshape(-1, 1))
            cach_fit = (ffit_reais, ffit_peso, ffit, fit_reta_previsto,
                        fit_svr, fit_reta_previsto_parcela, fit_svr_ply,
                        list_x)
        # legend_fit_real, = plt.plot(df_norm.index, ffit_reais(list_x), color="orange", linestyle='--', label="FIT [real points]")
        # legend_fit_previsto, = plt.plot(df_norm.index, ffit_peso(list_x), color="red", linestyle='--', label="FIT [real points + last predicted point] WEIGHTED (SQRT)")
        # legend_fit_previsto_sem_peso, = plt.plot(df_norm.index, ffit(list_x), color="g", linestyle='--', label="FIT [real points + last predicted point] unweighted")
        # legend_fit_previsto_reta, = plt.plot(df_norm.index, fit_reta_previsto, color="chocolate", linestyle='--', label="FIT line equation")
        # legend_fit_previsto_sem_peso_parcela, = plt.plot(df_norm.index, fit_reta_previsto_parcela(list_x), color="slategray", linestyle='--', label="FIT [real points + last predicted point - partial] unweighted")
        legend_fit_previsto_svr, = plt.plot(df_norm.index, fit_svr,
                                            color="mediumvioletred",
                                            linestyle='--',
                                            label="FIT SVR [Linear]")
        # legend_fit_previsto_svr_poly, = plt.plot(df_norm.index, fit_svr_ply, color="red", linestyle='--', label="FIT SVR [Poly]")
        # list_legend_fit = [legend_fit_previsto, legend_fit_previsto_sem_peso, legend_fit_real, legend_fit_previsto_reta, legend_fit_previsto_svr, legend_fit_previsto_sem_peso_parcela]
        list_legend_fit = [legend_fit_previsto_svr]
        return list_legend_fit
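# Condensed, hedged sketch of the SVR trend fit performed above: fit a linear
# NuSVR on (index, value) pairs and evaluate it over the whole index range.
# The series y here is synthetic.
import numpy as np
from sklearn.svm import NuSVR

y = np.cumsum(np.random.randn(50))  # hypothetical normalised series
x = np.arange(1, len(y) + 1, dtype=float)
svr_nu = NuSVR(kernel='linear', C=1, gamma='scale', nu=0.9)
svr_nu.fit(x.reshape(-1, 1), y)
trend = svr_nu.predict(np.arange(len(y), dtype=float).reshape(-1, 1))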
init_sect_beg = timer()
# Save maternal feature vectors & composite maternal / fetal feature vectors:
maternal_feature_vectors[n_svrs, :] = cwt_wdw.flatten()
maternal_fetal_feature_vectors[n_svrs, :] = np.concatenate(
    (cwt_wdw.flatten(), cwt_wdw_fetal.flatten()), axis=None)
# Linear support vector regression: maternal -> abdominal
nusv_res = NuSVR(nu=0.95, C=10.0, kernel='linear', degree=3, gamma='scale',
                 coef0=0.0, shrinking=True, tol=0.001, cache_size=200,
                 verbose=False, max_iter=10000)
z_rbf = nusv_res.fit(cwt_wdw, fetal_lead_wdw).predict(cwt_wdw)
# z_rbf = nusv_res.fit(cwt_wdw, mat_lead_wdw).predict(cwt_wdw)
# Store regression coef's & offset:
nusv_lin_coef = np.float32(nusv_res.coef_)
nusv_intercept = np.float32(nusv_res.intercept_)
linear_regression_coefs[n_svrs, :] = nusv_lin_coef
linear_regression_intercepts[n_svrs] = nusv_intercept
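# Why coef_ and intercept_ are all that needs storing for a linear kernel
# (sketch with hypothetical arrays): the prediction is just X @ coef_.T +
# intercept_, so each window's regression can be replayed later without the
# fitted object.
import numpy as np
from sklearn.svm import NuSVR

X = np.random.rand(64, 8)
y = np.random.rand(64)
m = NuSVR(kernel='linear').fit(X, y)
manual = X @ m.coef_.T + m.intercept_  # shape (64, 1)
assert np.allclose(manual.ravel(), m.predict(X), atol=1e-6)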
def main():
    # data cleaning and feature engineering section
    data = pd.read_csv('input.csv')
    data = fix_data_encoding(data)
    data = name_mapping(data)
    data = get_duration(data)
    data = get_court_city_type(data)
    data = fix_leading_zeros(data)
    data = add_judge_age(data)
    data = encode_receipt_procedure(data)
    data = add_money_amount_indicator(data)
    data = create_person_business_indicators(data)
    data = encode_case_matter(data)
    data = create_court_indicators(data)
    data = add_loadiness_of_courts(data)
    data = add_not_subject_to_duty_not_zero(data)
    data = add_lives_abroad_over_persons_and_companies_involved(data)
    data = add_date_groups(data)
    data = get_total_persons_and_companies_started(data)
    data = remove_outliers(data)
    data = add_single_person_or_company_started(data)
    data = add_single_person_or_company_answered(data)

    public_data = pd.read_csv('public_data.csv')
    print("Public data columns: ", list(public_data))
    data = add_public_data(data, public_data)
    print("After adding data: ", list(data))
    data = add_court_productivity(data)

    data.pop('start_date')
    data.pop('end_date')
    data.pop('court_name')
    data.pop('case_id')
    data.pop('court_id')
    data.pop('date_of_birth')
    # Depends if start_date will be available in final data
    data.pop('start_date_year')
    data.to_csv("out.csv")

    train, test = train_test_split(data, test_size=0.2, random_state=1)

    # store the name of the variable we want to predict; separately store
    # the names of all other variables
    target = 'duration_m'
    all_columns_except_target = train.columns.difference([target])

    # #-------------------------------------------------------------------
    # # model calibration section
    # # tree amount calibration
    # for tree_amount in range(10, 60, 10):
    #     model = RandomForestRegressor(n_estimators=tree_amount)
    #     score_from_cross_validation = get_score_from_cross_validation(model, train, target, valid_split_size=3)
    #     print('number of trees=', tree_amount)
    #     print("score from cross-validation:", score_from_cross_validation)
    #
    # # max_features calibration
    # for max_features in ['auto', 'sqrt', 'log2']:
    #     model = RandomForestRegressor(max_features=max_features)
    #     score_from_cross_validation = get_score_from_cross_validation(model, train, target, valid_split_size=5)
    #     print('max_features_type=', max_features)
    #     print("score from cross-validation:", score_from_cross_validation)
    #
    # # min_samples_leaf calibration
    # for min_samples_leaf in range(1, 5, 1):
    #     model = RandomForestRegressor(n_estimators=60, min_samples_leaf=min_samples_leaf)
    #     score_from_cross_validation = get_score_from_cross_validation(model, train, target, valid_split_size=3)
    #     print('min_samples_leaf=', min_samples_leaf)
    #     print("score from cross-validation:", score_from_cross_validation)

    # default settings vs manually calibrated settings
    print('RandomForestRegressor from scikit-learn')
    model = RandomForestRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    model = RandomForestRegressor(n_estimators=60, min_samples_leaf=2)
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('manually calibrated model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    # model = RandomForestRegressor()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)
    #
    # model = RandomForestRegressor(n_estimators=60, min_samples_leaf=2)
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('manually calibrated model:')
    # print("score for test data:", score_on_test)
    #---------------------------------------------------------------------

    # trying different models/algorithms
    # default settings for GradientBoostingRegressor ~ a bit better than RandomForestRegressor
    print('GradientBoostingRegressor from scikit-learn')
    model = GradientBoostingRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    # model = GradientBoostingRegressor()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)

    # default settings for AdaBoostRegressor - performs poorly
    print('AdaBoostRegressor from scikit-learn')
    model = AdaBoostRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    # model = AdaBoostRegressor()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)

    # default settings for ExtraTreesRegressor - performs poorly
    print('ExtraTreesRegressor from scikit-learn')
    model = ExtraTreesRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    # model = ExtraTreesRegressor()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)

    # default settings for BaggingRegressor ~ almost like RandomForestRegressor
    print('BaggingRegressor from scikit-learn')
    model = BaggingRegressor()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    # model = BaggingRegressor()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)

    # default settings for XGBModel ~ 1% better than GradientBoostingRegressor
    print('XGBModel from xgboost')
    model = XGBModel()
    model.fit(train[all_columns_except_target], train[target])
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)  # calculating R^2 score manually
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    # default settings for SVR - very poor
    print('SVR from scikit-learn')
    model = SVR()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    # model = SVR()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)

    # default settings for NuSVR - very poor as well
    print('NuSVR from scikit-learn')
    model = NuSVR()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)

    # model = NuSVR()
    # model.fit(train[all_columns_except_target], train[target])
    # score_on_test = model.score(test[all_columns_except_target], test[target])
    # print('default model:')
    # print("score for test data:", score_on_test)

    # default settings for LinearRegression - not too bad for such a simple model
    print('LinearRegression from scikit-learn')
    model = LinearRegression()
    score_from_cross_validation = get_score_from_cross_validation(
        model, train, target, valid_split_size=5)
    print('default model:')
    print("score from cross-validation on train data:",
          score_from_cross_validation)
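# The weak SVR/NuSVR scores above are typical for unscaled features, since
# RBF-kernel SVMs are sensitive to feature magnitudes. A sketch of the usual
# remedy, reusing the helper and variables from this section:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR

model = make_pipeline(StandardScaler(), NuSVR())
score_from_cross_validation = get_score_from_cross_validation(
    model, train, target, valid_split_size=5)
print('scaled NuSVR pipeline:')
print("score from cross-validation on train data:", score_from_cross_validation)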
random_permutation_index = np.random.permutation(x.shape[0])
x = x[random_permutation_index]
y = y[random_permutation_index]
train_x = x[:-100]
train_y = y[:-100]
test_x = x[-100:]
test_y = y[-100:]
model = svr_model.fit(train_x, train_y)
y_train_result = model.predict(train_x)
print('training RMSE = {}'.format(get_RMSE(train_y, y_train_result)))
y_predict_result = model.predict(test_x)
print('test RMSE = {}'.format(get_RMSE(test_y, y_predict_result)))
print()
return test_y, y_predict_result


svr_model = SVR(C=1)
with open('prediction1.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    y_truth, y_predict = run_SVR('input_2007_w5.csv', svr_model)
    for i in range(len(y_truth)):
        writer.writerow([y_truth[i], y_predict[i]])

svr_model = NuSVR(C=100)
with open('prediction2.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    y_truth, y_predict = run_SVR('input_2007.csv', svr_model)
    for i in range(len(y_truth)):
        writer.writerow([y_truth[i], y_predict[i]])
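# get_RMSE is referenced above but not shown in this excerpt; a plausible
# implementation (an assumption, not the original helper) would be:
import numpy as np

def get_RMSE(y_true, y_pred):
    # root-mean-square error between true and predicted values
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))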
def train(self, X, y, hypers): self.regressor = NuSVR(kernel='rbf', C=hypers[0], gamma=hypers[1]).fit(X, y)
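# Hypothetical standalone equivalent of the train() wrapper above, with the
# (C, gamma) pair unpacked from hypers and synthetic data:
import numpy as np
from sklearn.svm import NuSVR

X, y = np.random.rand(100, 4), np.random.rand(100)
hypers = (10.0, 0.1)
regressor = NuSVR(kernel='rbf', C=hypers[0], gamma=hypers[1]).fit(X, y)
y_hat = regressor.predict(X)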
def main(X, Y, Params, print_info=False, is_regression=True, Y_other=None): parameters = Params['Algorithm'][1] is_cv_run = False starttime = time.time() if print_info: print('Fitting model \'%s\' for %s' % (Params['Algorithm'][0], 'regression' if is_regression else 'classification')) if Params['Algorithm'][0] == 'BayesianRidge': if not is_regression: model = BayesianRidge(n_iter=300, tol=0.001, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, verbose=False, **parameters) #parameters = {'alpha_1': [1e-6,1e-5,1e-4],'alpha_2': [1e-6,1e-5,1e-4], 'lambda_1': [1e-6,1e-5,1e-4], 'lambda_2': [1e-6,1e-5,1e-4]} else: model = BayesianRidge(n_iter=300, tol=0.001, compute_score=False, fit_intercept=True, normalize=False, copy_X=True, verbose=False, **parameters) elif Params['Algorithm'][0] == 'StringKernel': if not is_regression: raise (Exception('not implemented')) else: # we create an instance of SVM and fit out data. # # model = KernelRidge(alpha=parameters['alpha'], kernel='precomputed') model = SVR(kernel='precomputed', gamma='auto', coef0=0.0, shrinking=True, tol=0.001, cache_size=400, verbose=False, max_iter=-1) param_grid = { 'C': np.logspace(np.log10(0.0001), np.log10(500), 25) } model = NuSVR( kernel='precomputed' ) #cache_size=400, coef0=0.0, gamma='auto', max_iter=-1, shrinking=True, tol=0.001, verbose=False,**parameters) param_grid = {'nu': (0.50, )} model = GridSearchCV(model, param_grid, n_jobs=1, iid=True, refit=True, cv=7, verbose=0, scoring=neg_mean_squared_error_scorer) is_cv_run = True elif Params['Algorithm'][0] == 'XGBoost': # max_depth = 3, learning_rate = 0.1, n_estimators = 100, silent = True, objective = 'reg:linear', # booster = 'gbtree', n_jobs = 1, nthread = None, gamma = 0, min_child_weight = 1, # max_delta_step = 0, subsample = 1, colsample_bytree = 1, colsample_bylevel = 1, reg_alpha = 0, # reg_lambda = 1, scale_pos_weight = 1, base_score = 0.5, random_state = 0, seed = None, # missing = None if not is_regression: model = xgboost.XGBClassifier( missing=None, silent=True, learning_rate=0.10, objective='rank:pairwise', booster='gbtree', n_jobs=1, max_delta_step=0, colsample_bylevel=1, scale_pos_weight=1, base_score=0.5, random_state=666, colsample_bytree=0.75, # default 1 subsample=0.75, gamma=0, reg_alpha=0.01, # default 0 min_child_weight=6, **parameters) else: # model=xgboost.XGBRegressor(missing=None, silent=True, # learning_rate=0.10, # objective='reg:linear',#'rank:pairwise' booster='gbtree' # n_jobs=1, # booster='gbtree', # max_delta_step=0, # colsample_bylevel=1, # scale_pos_weight=1, # base_score=0.5, # random_state=666, # colsample_bytree=0.75, # default 1 # subsample=0.75, # gamma=0, # reg_alpha=0.01, # default 0 # reg_lambda=1.0, # min_child_weight=6, # **parameters) model = xgboost.XGBRegressor( missing=None, silent=True, learning_rate=0.10, objective='reg:linear', #'rank:pairwise' booster='gbtree' n_jobs=1, booster='gbtree', random_state=666, **parameters) param_grid = { 'colsample_bytree': (0.75, 1.0), 'subsample': (0.75, 1.0), 'min_child_weight': (3, 6, 9), 'reg_lambda': (0.80, 1.0, 1.20), 'reg_alpha': (0.001, 0.01) } model = GridSearchCV(model, param_grid, n_jobs=1, iid=True, refit=True, cv=7, verbose=0, scoring=neg_mean_squared_error_scorer) is_cv_run = True elif Params['Algorithm'][0] == "Keras_ElasticNet": #use_keras_CPU() if not is_regression: raise (Exception('ElasticNet is only for regression!')) else: param_grid = { 'l1_ratio': (Params['Algorithm'][1]['l1_ratio'], ), 'alpha': np.logspace(-3, 1, 15) } model = 
GridSearchCV(KerasENet(), param_grid, n_jobs=1, iid=True, refit=True, cv=5, verbose=0, scoring=neg_mean_squared_error_scorer) # first_output = Dense(1,activation='sigmoid')(first_output) is_cv_run = True elif Params['Algorithm'][0] == "Ridge": if not is_regression: raise (Exception('Ridge is only for regression!')) else: model = RidgeCV(alphas=np.logspace(-1, np.log10(700), parameters['n_alphas']), fit_intercept=True, normalize=False, scoring=None, cv=8, gcv_mode=None, store_cv_values=False) elif Params['Algorithm'][0] == "ElasticNet": tol = 0.0001 selection = 'cyclic' n_alphas = 90 max_iter = 1300 if X.shape[1] > 4000: tol = 0.001 selection = 'random' n_alphas = 60 max_iter = 1000 if not is_regression: raise (Exception('ElasticNet is only for regression!')) else: if Params['is_multitarget']: model = MultiTaskElasticNetCV(eps=0.001, alphas=None, fit_intercept=True, normalize=False, max_iter=max_iter, tol=tol, cv=7, copy_X=True, verbose=0, n_alphas=n_alphas, n_jobs=1, random_state=666, selection=selection, **parameters) else: model = ElasticNetCV(eps=0.001, alphas=None, fit_intercept=True, normalize=False, max_iter=max_iter, tol=tol, cv=7, copy_X=True, verbose=0, n_alphas=n_alphas, n_jobs=1, random_state=666, selection=selection, **parameters) elif Params['Algorithm'][0] == "RandomForest": if not is_regression: raise (Exception('not set up (lazy)')) else: model = RandomForestRegressor(criterion='mse', min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, **parameters) param_grid = { 'max_features': ('auto', 'sqrt'), 'min_samples_split': ( 2, 4, ), } model = GridSearchCV(model, param_grid, n_jobs=1, iid=True, refit=True, cv=7, verbose=0, scoring=neg_mean_squared_error_scorer) is_cv_run = True elif Params['Algorithm'][0] == 'SVM': # 0.001, 0.005, 0.01, 0.05, 0.1, 0.5,1.0,1.5,2.0,3.0,4.0,5.0,10.0 if not is_regression: model = SVC(cache_size=400, coef0=0.0, gamma='auto', max_iter=-1, shrinking=True, tol=0.001, verbose=False, **parameters) #parameters = {'reg__C':[0.5],'reg__epsilon':[0.1]} else: model = SVR(cache_size=400, coef0=0.0, gamma='auto', max_iter=-1, shrinking=True, tol=0.001, verbose=False, **parameters) param_grid = {'C': np.logspace(np.log10(0.0005), np.log10(10), 30)} #param_grid = {'nu':(0.1,0.3,0.5,0.7,0.9)} model = GridSearchCV(model, param_grid, n_jobs=1, iid=True, refit=True, cv=8, verbose=0, scoring=neg_mean_squared_error_scorer) is_cv_run = True elif Params['Algorithm'][0] == 'GradientBoosting': if not is_regression: model = GradientBoostingClassifier(random_state=1, **parameters) #parameters = {'reg__n_estimators': [140], 'reg__max_depth': [6],'learning_rate':[0.01,0.03,0.1],'min_samples_leaf':[2,3,4]} else: model = GradientBoostingRegressor(random_state=1, **parameters) #parameters = {'reg__n_estimators': [140], 'reg__max_depth': [6]} elif Params['Algorithm'][0] == 'MLP': #parameters['hidden_layer_sizes']=[parameters['hidden_layer_sizes']] #model = MLPRegressorCV(hidden_layer_sizes=parameters['hidden_layer_sizes']) model = MLPRegressor( activation="relu", solver="lbfgs", learning_rate="constant", learning_rate_init=0.0011, max_iter=450, random_state=None, tol=0.00013, epsilon=1e-08, hidden_layer_sizes=parameters['hidden_layer_sizes']) param_grid = {'alpha': np.logspace(0, np.log10(350), 20)} model = GridSearchCV(model, param_grid, n_jobs=1, iid=True, refit=True, cv=7, verbose=0, 
        scoring=neg_mean_squared_error_scorer)
    is_cv_run = True
    # model = MLPRegressor(activation="relu", solver="lbfgs", learning_rate="constant",
    #                      learning_rate_init=0.001, power_t=0.5, max_iter=500, shuffle=True, random_state=None,
    #                      tol=0.0001, verbose=False, warm_start=False, momentum=0.9, epsilon=1e-08, **parameters)
elif Params['Algorithm'][0] == 'MLP_KERAS':
    from keras.models import Sequential
    from keras import regularizers
    from keras.layers import Dense, Dropout
    from keras.callbacks import EarlyStopping
    from sklearn.preprocessing import LabelEncoder
    from keras.utils import np_utils
    import tensorflow as tf
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    session = tf.Session(config=config)
    early_stopping = EarlyStopping(monitor='val_loss', patience=5)
    model = Sequential()
    model.add(
        Dense(parameters['layers_and_nodes'][0],
              activation='tanh',
              input_shape=(X.shape[1], ),
              kernel_initializer='glorot_uniform',
              kernel_regularizer=regularizers.l2(
                  parameters['l2_regularization'])))
    model.add(Dropout(parameters['dropout'], noise_shape=None, seed=1))
    for layer in range(1, len(parameters['layers_and_nodes'])):
        model.add(
            Dense(parameters['layers_and_nodes'][layer],
                  activation='relu',
                  input_shape=(parameters['layers_and_nodes'][layer - 1], ),
                  kernel_initializer='glorot_normal',
                  kernel_regularizer=regularizers.l2(
                      parameters['l2_regularization'])))
        model.add(Dropout(parameters['dropout'], noise_shape=None, seed=1))
    if not is_regression:
        # encode labels first so the softmax head can match the class count
        encoder = LabelEncoder()
        encoder.fit(Y)
        encoded_Y = encoder.transform(Y)
        # convert integers to dummy variables (i.e. one hot encoded)
        Y = np_utils.to_categorical(encoded_Y)
        model.add(
            Dense(Y.shape[1],
                  activation='softmax',
                  input_shape=(parameters['layers_and_nodes'][-1], )))
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['accuracy'])
    else:
        model.add(
            Dense(1,
                  activation='linear',
                  input_shape=(parameters['layers_and_nodes'][-1], )))
        model.compile(loss='mean_squared_error',
                      optimizer='adam',
                      metrics=['mse'])
    model.fit(X, Y, batch_size=X.shape[0], epochs=100, validation_split=0,
              verbose=0)  # ,callbacks=[early_stopping])
    return model
else:
    raise Exception('unknown model')

# decomposer = LatentDirichletAllocation(n_topics=10, max_iter=10, learning_method='online', learning_offset=50., random_state=1)
# decomposer = TruncatedSVD(n_components=100, random_state=666)
"""
X = data.iloc[:]['text'].values
y = data.iloc[:]['mylabel'].values.astype(str)
dat = vect.fit_transform(X)
dat = tfidf.fit_transform(dat)
dat = decomposer.fit_transform(dat)
for a in numpy.unique(y):
    plt.scatter(dat[y == a, 0], dat[y == a, 1])
"""
""" START LOOP """
# t0 = time()
# if get_set_count(parameters) > 1:
#     grid_search = GridSearchCV(model, parameters, n_jobs=6, verbose=1, cv=10, refit=True)
#     grid_search.fit(X=X, y=Y)
#     best_parameters = grid_search.best_estimator_.get_params()
#     print('--> best parameters: %s' % best_parameters)
#     return grid_search
# else:
if 1:
    start_time = time.time()
    print('... training model (X.shape=%s)' % str(X.shape), end='')
warnings.filterwarnings("ignore")
if Y_other is not None and Params['is_multitarget']:
    Y = np.expand_dims(Y, axis=1)
    model.fit(X=X, y=np.concatenate((Y, Y_other), axis=1))
else:
    Y = Y.flatten()
    model.fit(X=X, y=Y)
if is_cv_run:
    print(' [best gridsearch params: %s] ' % model.best_params_, end='')
if 1:
    end_time = time.time()
    print(' ... done (%1.1f min)' % ((end_time - start_time) / 60.0))
# elapsedtime = (time.time() - starttime) / 60.0
# print('fit done (took %f minutes)' % elapsedtime)
return model
# need to convert to np.array() to extract last values
y_train = np.array(y_train)
y_test = np.array(y_test)
y_train = y_train[:, y_train.shape[1] - 1]
y_test = y_test[:, y_test.shape[1] - 1]

# make candles
s = data.index[data['mid_close'] == y_test[0]].tolist()[1]
px_test = {
    'bid': data.iloc[s:, 1].values,
    'ask': data.iloc[s:, 5].values,
    'mid': data.iloc[s:, 9].values
}

clf = NuSVR()
fitted_clf = clf.fit(X_train, y_train)


def plot_strategy(strategy, default):
    '''
    fn: compare two strategies

    Params:
    -------
    strategy: list, accumulated returns from the predicting strategy
    default: list, accumulated returns from buy & hold
    '''
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111)
def test_convert_nusvr_default(self): model, X = self._fit_binary_classification(NuSVR()) model_onnx = convert_sklearn( model, "SVR", [("input", FloatTensorType([None, X.shape[1]]))]) self.assertIsNotNone(model_onnx) dump_data_and_model(X, model, model_onnx, basename="SklearnRegNuSVR2")
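# A hedged sketch of exercising the converted model with onnxruntime and
# checking it against the sklearn predictions (model and X as in the test
# above; tolerance chosen loosely for float32):
import numpy as np
import onnxruntime as rt

sess = rt.InferenceSession(model_onnx.SerializeToString(),
                           providers=["CPUExecutionProvider"])
onnx_pred = sess.run(None, {"input": X.astype(np.float32)})[0]
assert np.allclose(onnx_pred.ravel(), model.predict(X), atol=1e-4)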
# "Passive Aggressive Regressor ": PassiveAggressiveRegressor(max_iter=100000, tol=0.5), # "random forest regressor": RandomForestRegressor(n_estimators=10), # "gradient boosting regressor": GradientBoostingRegressor(min_samples_leaf=3), # "k nearest neighbiours regressor": KNeighborsRegressor(), # "RANSAC regressor": RANSACRegressor(), "SGD regressor": SGDRegressor(max_iter=100000, tol=0.5), # "kernel ridge": KernelRidge(), # "ada boost regressor": AdaBoostRegressor(), # "bagging regressor": BaggingRegressor(), # "extra trees regressor": ExtraTreesRegressor(n_estimators=10), # "dummy regressor": DummyRegressor(), # "PLSR regressor": PLSRegression(), # "radius neighbours regressor": RadiusNeighborsRegressor(radius=5), # "neural_network.MLPRegressor 500": MLPRegressor(hidden_layer_sizes=(50)), # "svm.SVR": SVR(gamma="scale"), "svm.NuSVR epsilon=": NuSVR(nu=0.7, gamma="scale") # "svm.LinearSVR epsilom=": LinearSVR(max_iter=10000) # "decision tree regressor": DecisionTreeRegressor(), # "extra tree regressor": ExtraTreeRegressor() } # models = { # "1":MLPRegressor(hidden_layer_sizes=(64,2), solver="adam"), # "2":MLPRegressor(hidden_layer_sizes=(64,2), solver="lbfgs"), # } cp(t, "initialising models") results = [] rand = [0,0]
def regress_NuSVR(X_train, X_test, y_train, y_test, C1, nu1): nusvr = NuSVR(nu=nu1, C=C1, kernel='rbf', gamma=0.0001, tol=0.001) regr_nusvr = prep_process(nusvr, X_train, X_test, y_train, y_test) return (regr_nusvr[0], regr_nusvr[1]) def parameter_choosing_svr(estimator, params, X_train, Y_train, tem_list): grid_search = GridSearchCV(estimator, param_grid=params, cv=5) grid_search.fit(X_train, Y_train) for k, v in grid_search.best_params_.items(): tem_list.append(v) print('', v) return (tem_list[0], tem_list[1]) estimator = NuSVR(kernel='rbf', gamma=0.0001, tol=0.001) Cs = np.arange(5, 30, 5) Nus = np.arange(0.2, 0.9, 0.1) params = {'nu': Nus, 'C': Cs} # Read data files fname_pars = '/Ginzburg_Landau_equation/File4_CSV/Beta_Parameter_values.csv' par_values = pd.read_csv(fname_pars) for j in range(10, 50): fname_features = '/Ginzburg_Landau_equation/File4_CSV/TDA_Beta_features_m_%d.csv' % ( j) print('file number', j) # Read data files tda_features = pd.read_csv(fname_features)
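# Hypothetical call tying the pieces above together for one feature file
# (X_train / Y_train stand in for the TDA features and beta values; note that
# sklearn's ParameterGrid sorts keys, so 'C' precedes 'nu' in best_params_):
C_best, nu_best = parameter_choosing_svr(estimator, params, X_train, Y_train, [])
nusvr_best = NuSVR(kernel='rbf', gamma=0.0001, tol=0.001, C=C_best, nu=nu_best)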
print("\nMean absolute error: ", metrics.mean_absolute_error(y_test, preds)) models = [ LinearRegression(), LassoCV(alphas=np.logspace(-6, 6, 13)), ElasticNetCV(alphas=np.logspace(-6, 6, 13)), SGDRegressor(), PassiveAggressiveRegressor(), Ridge(), PassiveAggressiveRegressor(), RandomForestRegressor(max_depth=5), GradientBoostingRegressor(), AdaBoostRegressor(loss='exponential'), BaggingRegressor(), SVR(), NuSVR(), XGBRFRegressor(max_depth=5, objective="reg:squarederror"), XGBRegressor(max_depth=5, objective="reg:squarederror") ] def show_score(x, y, estimator): """ Returns MAE scores for specified models. Also returns r2 scores if applicable Arguments: x {[array/DataFrame]} -- [Array or matrix of features. Can also be dataframe] y {[array]} -- [Target values] estimator {[str]} -- [The estimator being used] """
# Step 4 - Remove the outliers
correctedSeries = util.detectAndRemoveOutliers(rawSeries)

# Learning Process - Start
# Parameters
depth = 100

# Form feature and target vectors
# featureVectors, targetVectors = util.formContinousFeatureAndTargetVectorsWithoutBias(correctedSeries, depth)
featureVectors, targetVectors = util.formFeatureAndTargetVectors(correctedSeries, depth)

# Train using linear regression
# model = SVR(kernel="linear")
model = NuSVR(nu=1.0, kernel="linear")
model.fit(featureVectors, targetVectors[:, 0])
predictedTrainingOutputData = model.predict(featureVectors)

# Predicted and actual Series
actualSeries = pd.Series(data=targetVectors.flatten(),
                         index=correctedSeries.index[-targetVectors.shape[0]:])
predictedSeries = pd.Series(data=predictedTrainingOutputData.flatten(),
                            index=correctedSeries.index[-targetVectors.shape[0]:])
# Learning Process - End

# Step 5 - Descale the series
actualSeries = util.descaleSeries(actualSeries)
predictedSeries = util.descaleSeries(predictedSeries)
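# util.formFeatureAndTargetVectors is not shown here; a plausible
# sliding-window construction (an assumption, not the original helper):
import numpy as np

def form_feature_and_target_vectors(series, depth):
    values = np.asarray(series)
    # depth lagged values as features, the next value as the target
    X = np.array([values[i:i + depth] for i in range(len(values) - depth)])
    y = values[depth:].reshape(-1, 1)
    return X, y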