def labels_io():
    from shogun import RegressionLabels, CSVFile

    lab = RegressionLabels()
    f = CSVFile("../data/label_train_regression.dat", "r")
    f.set_delimiter(" ")
    lab.load(f)
    #print(lab.get_labels())
    return lab

def evaluation_meansquarederror(ground_truth, predicted):
    from shogun import RegressionLabels
    from shogun import MeanSquaredError

    ground_truth_labels = RegressionLabels(ground_truth)
    predicted_labels = RegressionLabels(predicted)

    evaluator = MeanSquaredError()
    mse = evaluator.evaluate(predicted_labels, ground_truth_labels)

    return mse

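# A minimal usage sketch for the MSE helper above; the toy arrays are
# illustrative, not from the original example (assumes 1-D float64 numpy
# arrays, which is what RegressionLabels takes elsewhere in this file).
import numpy as np

_truth = np.array([1.0, 2.0, 3.0])
_preds = np.array([1.1, 1.9, 3.2])
print(evaluation_meansquarederror(_truth, _preds))
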
def regression_chaidtree(num_train=500, num_test=50, x_range=15, noise_var=0.2, ft=feattypes):
    try:
        from shogun import RealFeatures, RegressionLabels, CHAIDTree, PT_REGRESSION
        from numpy import array, random
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    random.seed(1)

    # form training dataset : y=x with noise
    X_train = random.rand(1, num_train) * x_range
    Y_train = X_train + random.randn(num_train) * noise_var

    # form test dataset
    X_test = array([[float(i) / num_test * x_range for i in range(num_test)]])

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(X_train)
    feats_test = RealFeatures(X_test)
    train_labels = RegressionLabels(Y_train[0])

    # CHAID Tree formation (use the ft argument, not the module-level default)
    c = CHAIDTree(2, ft, 50)
    c.set_labels(train_labels)
    c.train(feats_train)

    # Regress on test data
    output = c.apply_regression(feats_test).get_labels()

    return c, output

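# Hedged driver for the CHAID example above. The module-level `feattypes`
# default is defined outside this excerpt; `chaid_ft` below is a hypothetical
# stand-in, and the encoding 2 = continuous (vs. 0 nominal / 1 ordinal) is an
# assumption about CHAIDTree's feature-type convention.
from numpy import array, int32

chaid_ft = array([2], dtype=int32)
chaid, chaid_out = regression_chaidtree(ft=chaid_ft)
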
def RunLASSOShogun():
    # Note: `self` and `options` are captured from the enclosing benchmark
    # harness scope; the same holds for the other Run*Shogun helpers below.
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the responses
    # file.
    try:
        Log.Info("Loading dataset", self.verbose)
        if len(self.dataset) >= 2:
            testSet = np.genfromtxt(self.dataset[1], delimiter=',')

        # Get all the parameters.
        lambda1 = None
        if "lambda1" in options:
            lambda1 = float(options.pop("lambda1"))

        if len(options) > 0:
            Log.Fatal("Unknown parameters: " + str(options))
            raise Exception("unknown parameters")

        # Use the last row of the training set as the responses.
        X, y = SplitTrainData(self.dataset)

        with totalTimer:
            model = LeastAngleRegression(lasso=True)
            if lambda1:
                model.set_max_l1_norm(lambda1)
            model.set_labels(RegressionLabels(y))
            model.train(RealFeatures(X.T))
    except Exception as e:
        return -1

    return totalTimer.ElapsedTime()

def RunLinearRidgeRegressionShogun():
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the responses
    # file.
    Log.Info("Loading dataset", self.verbose)
    if len(self.dataset) >= 2:
        testSet = np.genfromtxt(self.dataset[1], delimiter=',')

    # Use the last row of the training set as the responses.
    X, y = SplitTrainData(self.dataset)

    if "alpha" in options:
        tau = float(options.pop("alpha"))
    else:
        Log.Fatal("Required parameter 'alpha' not specified!")
        raise Exception("missing parameter")

    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    try:
        with totalTimer:
            # Perform linear ridge regression.
            model = LRR(tau, RealFeatures(X.T), RegressionLabels(y))
            model.train()

            if len(self.dataset) >= 2:
                model.apply_regression(RealFeatures(testSet.T))
    except Exception as e:
        return [-1]

    return [totalTimer.ElapsedTime(), model]

def RunLARSShogun():
    totalTimer = Timer()

    # Load input dataset.
    try:
        Log.Info("Loading dataset", self.verbose)
        inputData = np.genfromtxt(self.dataset[0], delimiter=',')
        responsesData = np.genfromtxt(self.dataset[1], delimiter=',')
        inputFeat = RealFeatures(inputData.T)
        responsesFeat = RegressionLabels(responsesData)

        # Get all the parameters.
        lambda1 = None
        if "lambda1" in options:
            lambda1 = float(options.pop("lambda1"))

        if len(options) > 0:
            Log.Fatal("Unknown parameters: " + str(options))
            raise Exception("unknown parameters")

        with totalTimer:
            # Perform LARS.
            model = LeastAngleRegression(False)
            if lambda1:
                model.set_max_l1_norm(lambda1)
            model.set_labels(responsesFeat)
            model.train(inputFeat)
            model.get_w_for_var(model.get_path_size() - 1)
    except Exception as e:
        return -1

    return totalTimer.ElapsedTime()

def transfer_multitask_leastsquares_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    from shogun import RegressionLabels, RealFeatures, Task, TaskGroup
    try:
        from shogun import MultitaskLeastSquaresRegression
    except ImportError:
        print("MultitaskLeastSquaresRegression not available")
        exit(0)

    # use the function arguments, not the module-level defaults
    features = RealFeatures(fm_train)
    labels = RegressionLabels(label_train)

    # split the vectors into two tasks of equal size
    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors // 2)
    task_two = Task(n_vectors // 2, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlsr = MultitaskLeastSquaresRegression(0.1, features, labels, task_group)
    mtlsr.set_regularization(1)  # use regularization ratio
    mtlsr.set_tolerance(1e-2)    # use 1e-2 tolerance
    mtlsr.train()
    mtlsr.set_current_task(0)
    out = mtlsr.apply_regression().get_labels()
    return out

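# A minimal, hedged driver for the multitask example above; the synthetic
# matrices stand in for the module-level `traindat`/`label_traindat` defaults,
# which are defined outside this excerpt. Shogun expects features as a
# (dims, num_vectors) matrix.
import numpy as np

_feats = np.random.rand(3, 20)   # 3 dimensions, 20 vectors
_labs = np.random.rand(20)
mt_out = transfer_multitask_leastsquares_regression(_feats, _feats, _labs)
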
def regression_svrlight(fm_train=traindat, fm_test=testdat, label_train=label_traindat,
                        width=1.2, C=1, epsilon=1e-5, tube_epsilon=1e-2, num_threads=3):
    from shogun import RegressionLabels, RealFeatures
    from shogun import GaussianKernel
    try:
        from shogun import SVRLight
    except ImportError:
        print('No support for SVRLight available.')
        return

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)

    kernel = GaussianKernel(feats_train, feats_train, width)
    labels = RegressionLabels(label_train)

    svr = SVRLight(C, epsilon, kernel, labels)
    svr.set_tube_epsilon(tube_epsilon)
    svr.parallel.set_num_threads(num_threads)
    svr.train()

    # re-initialize the kernel on (train, test) to predict on the test set
    kernel.init(feats_train, feats_test)
    out = svr.apply().get_labels()

    return out, kernel

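# Hedged usage sketch for the SVRLight example above, on synthetic noisy-sine
# data; the real example loads `traindat`/`testdat` from data files not shown
# in this excerpt.
import numpy as np

_Xtr = np.random.rand(1, 100) * 10
_ytr = np.sin(_Xtr[0])
_Xte = np.random.rand(1, 20) * 10
svr_result = regression_svrlight(_Xtr, _Xte, _ytr)
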
def get_labels(raw=False, type='binary'):
    data = concatenate(array((-ones(NUM_EXAMPLES, dtype=double),
                              ones(NUM_EXAMPLES, dtype=double))))
    if raw:
        return data
    else:
        if type == 'binary':
            return BinaryLabels(data)
        if type == 'regression':
            return RegressionLabels(data)
        return None

def __init__(self, method_param, run_param): self.info = "SHOGUN_LARS (" + str(method_param) + ")" # Assemble run model parameter. self.data = load_dataset(method_param["datasets"], ["csv"]) self.data_split = split_dataset(self.data[0]) self.train_features = RealFeatures(self.data_split[0].T) self.train_labels = RegressionLabels(self.data_split[1]) if len(self.data) >= 2: self.test_features = RealFeatures(self.data[1].T) self.lasso = True if "lasso" in method_param: self.lasso = bool(method_param["lasso"]) self.solver = "auto" if "solver" in method_param: self.solver = str(method_param["solver"])
def __init__(self, method_param, run_param): self.info = "SHOGUN_LINEARREGRESSION (" + str(method_param) + ")" # Assemble run model parameter. self.data = load_dataset(method_param["datasets"], ["csv"]) self.data_split = split_dataset(self.data[0]) self.train_feat = RealFeatures(self.data_split[0].T) self.train_labels = RegressionLabels(self.data_split[1]) if len(self.data) >= 2: self.test_feat = RealFeatures(self.data[1].T) self.bias = 0 if "bias" in method_param: self.bias = float(method_param["bias"]) self.solver = "auto" if "solver" in method_param: self.solver = str(method_param["solver"])
def __init__(self, method_param, run_param): self.info = "SHOGUN_DTR (" + str(method_param) + ")" # Assemble run model parameter. self.data = load_dataset(method_param["datasets"], ["csv"]) self.data_split = split_dataset(self.data[0]) self.train_feat = RealFeatures(self.data_split[0].T) self.train_labels = RegressionLabels(self.data_split[1]) if len(self.data) >= 2: self.test_feat = RealFeatures(self.data[1].T) # Flag for Cross Validation Pruning self.cv_prune = False if "pruning" in method_param: self.cv_prune = bool(method_param["pruning"]) self.num_folds = 2 if "k" in method_param: # Making sure that the value is of the right type self.num_folds = int(method_param["k"])
def evaluation_cross_validation_regression(train_fname=traindat, label_fname=label_traindat, width=0.8, tau=1e-6):
    from shogun import CrossValidation, CrossValidationResult
    from shogun import MeanSquaredError, CrossValidationSplitting
    from shogun import RegressionLabels, RealFeatures
    from shogun import GaussianKernel, KernelRidgeRegression, CSVFile

    # training data
    features = RealFeatures(CSVFile(train_fname))
    labels = RegressionLabels(CSVFile(label_fname))

    # kernel and predictor
    kernel = GaussianKernel()
    predictor = KernelRidgeRegression(tau, kernel, labels)

    # splitting strategy for 5-fold cross-validation (for classification it is
    # better to use "StratifiedCrossValidationSplitting"; here the standard
    # splitting is used)
    splitting_strategy = CrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = MeanSquaredError()

    # cross-validation instance
    cross_validation = CrossValidation(predictor, features, labels,
                                       splitting_strategy, evaluation_criterium)

    # (optional) repeat x-val 10 times
    cross_validation.set_num_runs(10)

    # (optional) tell the machine to precompute the kernel matrix; this speeds
    # things up but may not work for all machines
    predictor.data_lock(labels, features)

    # perform cross-validation
    result = cross_validation.evaluate()

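# Hedged driver for the cross-validation example above, reusing the label file
# path that appears earlier in this file; the feature file name is an
# assumption about the example data layout.
evaluation_cross_validation_regression('../data/fm_train_real.dat',
                                       '../data/label_train_regression.dat')
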
def RunSVRShogun():
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)

    # Use the last row of the training set as the responses.
    X, y = SplitTrainData(self.dataset)

    # Get all the parameters.
    self.C = 1.0
    self.epsilon = 1.0
    self.width = 0.1
    if "c" in options:
        self.C = float(options.pop("c"))
    if "epsilon" in options:
        self.epsilon = float(options.pop("epsilon"))
    if "gamma" in options:
        self.width = np.true_divide(1, float(options.pop("gamma")))

    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    data = RealFeatures(X.T)
    labels_train = RegressionLabels(y)
    self.kernel = GaussianKernel(data, data, self.width)

    try:
        with totalTimer:
            # Perform SVR.
            model = LibSVR(self.C, self.epsilon, self.kernel, labels_train)
            model.train()
    except Exception as e:
        return -1

    return totalTimer.ElapsedTime()

def regression_randomforest(num_train=500, num_test=50, x_range=15, noise_var=0.2, ft=feattypes):
    try:
        from shogun import RealFeatures, RegressionLabels, RandomForest, MeanRule, PT_REGRESSION
        from numpy import array, random
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    random.seed(1)

    # form training dataset : y=x with noise
    X_train = random.rand(1, num_train) * x_range
    Y_train = X_train + random.randn(num_train) * noise_var

    # form test dataset
    X_test = array([[float(i) / num_test * x_range for i in range(num_test)]])

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(X_train)
    feats_test = RealFeatures(X_test)
    train_labels = RegressionLabels(Y_train[0])

    # Random Forest formation
    rand_forest = RandomForest(feats_train, train_labels, 20, 1)
    rand_forest.set_feature_types(ft)
    rand_forest.set_machine_problem_type(PT_REGRESSION)
    rand_forest.set_combination_rule(MeanRule())
    rand_forest.train()

    # Regress test data
    output = rand_forest.apply_regression(feats_test).get_labels()

    return rand_forest, output

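# Hedged driver for the random-forest example above; `rf_ft` is a hypothetical
# stand-in for the module-level `feattypes` default defined outside this
# excerpt. The boolean vector (False = continuous, True = nominal) is an
# assumption carried over from similar Shogun examples.
from numpy import array

rf_ft = array([False])
forest, forest_out = regression_randomforest(ft=rf_ft)
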
def stochasticgbmachine(train=traindat, train_labels=label_traindat, ft=feat_types):
    try:
        from shogun import RealFeatures, RegressionLabels, CSVFile, CARTree, StochasticGBMachine, SquaredLoss
    except ImportError:
        print("Could not import Shogun modules")
        return

    # wrap features and labels into Shogun objects
    feats = RealFeatures(CSVFile(train))
    labels = RegressionLabels(CSVFile(train_labels))

    # divide into training (90%) and test dataset (10%)
    p = np.random.permutation(labels.get_num_labels())
    num = labels.get_num_labels() * 0.9

    # base weak learner: a depth-1 CART tree (a stump)
    cart = CARTree()
    cart.set_feature_types(ft)
    cart.set_max_depth(1)

    loss = SquaredLoss()
    s = StochasticGBMachine(cart, loss, 500, 0.01, 0.6)

    # train
    feats.add_subset(np.int32(p[0:int(num)]))
    labels.add_subset(np.int32(p[0:int(num)]))
    s.set_labels(labels)
    s.train(feats)
    feats.remove_subset()
    labels.remove_subset()

    # apply
    feats.add_subset(np.int32(p[int(num):len(p)]))
    labels.add_subset(np.int32(p[int(num):len(p)]))
    output = s.apply_regression(feats)
    feats.remove_subset()
    labels.remove_subset()

    return s, output

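# Hedged driver for the gradient-boosting example above; the CSV paths and the
# all-continuous feature-type vector (False = not nominal), including the
# feature count of 13, are assumptions, since the module-level
# `traindat`/`label_traindat`/`feat_types` defaults are defined outside this
# excerpt.
import numpy as np

gbm_ft = np.array([False] * 13)   # e.g. 13 continuous features
gbm, gbm_out = stochasticgbmachine('../data/fm_train_real.dat',
                                   '../data/label_train_regression.dat', gbm_ft)
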
def modelselection_grid_search_libsvr(fm_train=traindat, fm_test=testdat, label_train=label_traindat,
                                      width=2.1, C=1, epsilon=1e-5, tube_epsilon=1e-2):
    from shogun import CrossValidation, CrossValidationResult
    from shogun import MeanSquaredError
    from shogun import CrossValidationSplitting
    from shogun import RegressionLabels
    from shogun import RealFeatures
    from shogun import GaussianKernel
    from shogun import LibSVR
    from shogun import GridSearchModelSelection
    from shogun import ModelSelectionParameters, R_EXP
    from shogun import ParameterCombination

    # training data (use the function arguments, not the module-level defaults)
    features_train = RealFeatures(fm_train)
    labels = RegressionLabels(label_train)

    # kernel
    kernel = GaussianKernel(features_train, features_train, width)

    # print all parameters available for model selection
    # (don't worry if yours is not included; write to the mailing list)
    #kernel.print_modsel_params()

    # predictor
    predictor = LibSVR(C, tube_epsilon, kernel, labels)
    predictor.set_epsilon(epsilon)

    # splitting strategy for 5-fold cross-validation (for classification it is
    # better to use "StratifiedCrossValidationSplitting"; here the standard
    # "CrossValidationSplitting" is used)
    splitting_strategy = CrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = MeanSquaredError()

    # cross-validation instance
    cross_validation = CrossValidation(predictor, features_train, labels,
                                       splitting_strategy, evaluation_criterium)

    # (optional) repeat x-val (set larger to get better estimates)
    cross_validation.set_num_runs(2)

    # print all parameters available for model selection
    #predictor.print_modsel_params()

    # build parameter tree to select C1 and C2
    param_tree_root = ModelSelectionParameters()
    c1 = ModelSelectionParameters("C1")
    param_tree_root.append_child(c1)
    c1.build_values(-1.0, 0.0, R_EXP)

    c2 = ModelSelectionParameters("C2")
    param_tree_root.append_child(c2)
    c2.build_values(-1.0, 0.0, R_EXP)

    # model selection instance
    model_selection = GridSearchModelSelection(cross_validation, param_tree_root)

    # perform model selection with selected methods
    #print("parameter tree:")
    #param_tree_root.print_tree()

    # print the current parameter combination; if there is no parameter,
    # nothing is printed
    print_state = False

    # lock data beforehand, since model selection will not change the kernel
    # matrix (use with care); this avoids recomputing the kernel matrix in
    # every iteration of the model search
    predictor.data_lock(labels, features_train)
    best_parameters = model_selection.select_model(print_state)

    # print best parameters
    #print("best parameters:")
    #best_parameters.print_tree()

    # apply them and evaluate
    best_parameters.apply_to_machine(predictor)
    result = cross_validation.evaluate()

def modelselection_grid_search_krr(fm_train=traindat, fm_test=testdat, label_train=label_traindat,
                                   width=2.1, C=1, epsilon=1e-5, tube_epsilon=1e-2):
    from shogun import CrossValidation, CrossValidationResult
    from shogun import MeanSquaredError
    from shogun import CrossValidationSplitting
    from shogun import RegressionLabels
    from shogun import RealFeatures
    from shogun import KernelRidgeRegression
    from shogun import GridSearchModelSelection
    from shogun import ModelSelectionParameters

    # training data (use the function arguments, not the module-level defaults)
    features_train = RealFeatures(fm_train)
    features_test = RealFeatures(fm_test)

    # labels
    labels = RegressionLabels(label_train)

    # predictor; the initial tau does not matter, since it is selected below
    predictor = KernelRidgeRegression()

    # splitting strategy for 5-fold cross-validation (for classification it is
    # better to use "StratifiedCrossValidationSplitting"; here the standard
    # "CrossValidationSplitting" is used)
    splitting_strategy = CrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = MeanSquaredError()

    # cross-validation instance
    cross_validation = CrossValidation(predictor, features_train, labels,
                                       splitting_strategy, evaluation_criterium)

    # (optional) repeat x-val (set larger to get better estimates)
    cross_validation.set_num_runs(2)

    # print all parameters available for model selection
    #predictor.print_modsel_params()

    # build parameter tree to select the regularization parameter
    # (create_param_tree is a helper defined elsewhere in the original example)
    param_tree_root = create_param_tree()

    # model selection instance
    model_selection = GridSearchModelSelection(cross_validation, param_tree_root)

    # perform model selection with selected methods
    #print("parameter tree:")
    #param_tree_root.print_tree()

    # print the current parameter combination; if there is no parameter,
    # nothing is printed
    print_state = False
    best_parameters = model_selection.select_model(print_state)

    # print best parameters
    #print("best parameters:")
    #best_parameters.print_tree()

    # apply them and evaluate
    best_parameters.apply_to_machine(predictor)
    result = cross_validation.evaluate()

X = Xall[0:ntrain, :]
y = yall[0:ntrain]
Xtest = Xall[ntrain:, :]
ytest = yall[ntrain:]

# preprocess data: center and normalize each feature, center the responses
for i in range(p):
    X[:, i] -= np.mean(X[:, i])
    X[:, i] /= np.linalg.norm(X[:, i])
y -= np.mean(y)

# train LASSO (bind the instance to a new name so the class is not shadowed)
lars = LeastAngleRegression()
lars.set_labels(RegressionLabels(y))
lars.train(RealFeatures(X.T))

# train ordinary least-squares regression
if use_ridge:
    lsr = LinearRidgeRegression(0.01, RealFeatures(X.T), RegressionLabels(y))
    lsr.train()
else:
    lsr = LeastSquaresRegression()
    lsr.set_labels(RegressionLabels(y))
    lsr.train(RealFeatures(X.T))

# gather LASSO path
path = np.zeros((p, lars.get_path_size()))
for i in range(path.shape[1]):
    path[:, i] = lars.get_w(i)

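# A hedged plotting sketch for the regularization path gathered above; assumes
# matplotlib is available. Each row of `path` traces one coefficient across
# the LARS/LASSO iterations.
import matplotlib.pyplot as plt

for j in range(p):
    plt.plot(range(path.shape[1]), path[j, :])
plt.xlabel('LARS iteration')
plt.ylabel('coefficient value')
plt.title('LASSO path')
plt.show()
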
def __init__(self, method_param, run_param): self.info = "SHOGUN_SVR (" + str(method_param) + ")" # Assemble run model parameter. self.data = load_dataset(method_param["datasets"], ["csv"]) self.data_split = split_dataset(self.data[0]) self.train_feat = RealFeatures(self.data_split[0].T) self.train_labels = RegressionLabels(self.data_split[1]) if len(self.data) >= 2: self.test_feat = RealFeatures(self.data[1].T) self.solver_type = "epsilon" if "libsvr-solver" in method_param: self.solver_type = str(method_param["libsvr-solver"]) self.C = 1.0 if "C" in method_param: self.C = float(method_param["C"]) self.svr_param = 1.0 if "svr-paramter" in method_param: self.svr_param = float(method_param["svr-parameter"]) self.kernel = "Gaussian" if "kernel" in method_param: self.kernel = str(method_param["kernel"]) self.degree = 3 if "degree" in method_param: self.degree = int(method_param["degree"]) self.gamma = 2.0 if "gamma" in method_param: self.gamma = float(method_param["gamma"]) self.distance = "Euclidean" if "distance" in method_param: self.distance = str(method_param["distance"]) self.cache_size = 10 if "cache-size" in method_param: cache_size = int(method_param["cache-size"]) self.coef0 = 1.0 if "coef0" in method_param: self.coef0 = float(method_param["coef0"]) self.order = 2.0 if "order" in method_param: self.order = float(method_param["order"]) self.width = 2.0 if "width" in method_param: self.width = float(method_param["width"]) self.sigma = 1.5 if "sigma" in method_param: self.sigma = float(method_param["sigma"]) self.const = 2.0 if "constant" in method_param: const = float(method_param["constant"])
def regression_gaussian_process_modelselection(n=100, n_test=100,
                                               x_range=5, x_range_test=10, noise_var=0.4):
    from shogun import RealFeatures, RegressionLabels
    from shogun import GaussianKernel
    from shogun import GradientModelSelection, ModelSelectionParameters
    from shogun import GaussianLikelihood, ZeroMean, \
        ExactInferenceMethod, GaussianProcessRegression, GradientCriterion, \
        GradientEvaluation
    # numpy/pylab symbols come from the original example's star imports;
    # spelled out here so the function is self-contained
    from numpy import random, array, sin, sqrt
    from pylab import figure, subplot, title, plot, fill_between, legend, show

    # easy regression data: one-dimensional noisy sine wave
    X_train = random.rand(1, n) * x_range
    X_test = array([[float(i) / n_test * x_range_test for i in range(n_test)]])
    y_test = sin(X_test)
    y_train = sin(X_train) + random.randn(n) * noise_var

    # shogun representation
    labels = RegressionLabels(y_train[0])
    feats_train = RealFeatures(X_train)
    feats_test = RealFeatures(X_test)

    # GP specification
    kernel = GaussianKernel(10, 0.05)
    mean = ZeroMean()
    likelihood = GaussianLikelihood(0.8)
    inf = ExactInferenceMethod(kernel, feats_train, mean, labels, likelihood)
    inf.set_scale(2.5)
    gp = GaussianProcessRegression(inf)

    means = gp.get_mean_vector(feats_test)
    variances = gp.get_variance_vector(feats_test)

    # plot results
    figure()
    subplot(2, 1, 1)
    title('Initial parameter values')
    plot(X_train[0], y_train[0], 'bx')  # training observations
    plot(X_test[0], y_test[0], 'g-')    # ground truth of test
    plot(X_test[0], means, 'r-')        # mean predictions on test
    fill_between(X_test[0], means - 1.96 * sqrt(variances),
                 means + 1.96 * sqrt(variances), color='grey')
    legend(["training", "ground truth", "mean predictions"])

    # evaluate our inference method for its derivatives
    grad = GradientEvaluation(gp, feats_train, labels, GradientCriterion(), False)
    grad.set_function(inf)

    # handles all of the above structures in memory
    grad_search = GradientModelSelection(grad)

    # search for the best parameters; outputs all results and information
    best_combination = grad_search.select_model(True)

    best_combination.apply_to_machine(gp)

    means = gp.get_mean_vector(feats_test)
    variances = gp.get_variance_vector(feats_test)

    # plot results
    subplot(2, 1, 2)
    title('Parameter values selected by gradient search')
    plot(X_train[0], y_train[0], 'bx')  # training observations
    plot(X_test[0], y_test[0], 'g-')    # ground truth of test
    plot(X_test[0], means, 'r-')        # mean predictions on test
    fill_between(X_test[0], means - 1.96 * sqrt(variances),
                 means + 1.96 * sqrt(variances), color='grey')
    legend(["training", "ground truth", "mean predictions"])

    show()