def _train(self, data): """Train the classifier using `data` (`Dataset`). """ targets = data.sa[self.get_space()].value[:, np.newaxis] enet_kwargs = {} if self.__max_steps is not None: enet_kwargs['max.steps'] = self.__max_steps try: self.__trained_model = trained_model = \ r.enet(data.samples, targets, self.__lm, normalize=self.__normalize, intercept=self.__intercept, trace=self.__trace, **enet_kwargs) except RRuntimeError as e: raise FailedToTrainError("Failed to predict on %s using %s. Exceptions was: %s" \ % (data, self, e)) # find the step with the lowest Cp (risk) # it is often the last step if you set a max_steps # must first convert dictionary to array # Cp_vals = np.asarray([trained_model['Cp'][str(x)] # for x in range(len(trained_model['Cp']))]) # self.__lowest_Cp_step = Cp_vals.argmin() # set the weights to the last step beta_pure = np.asanyarray(Rrx2(trained_model, 'beta.pure')) self.__beta_pure_shape = beta_pure.shape self.__weights = np.zeros(data.nfeatures, dtype=beta_pure.dtype) ind = np.asanyarray(Rrx2(trained_model, 'allset')) - 1 self.__weights[ind] = beta_pure[-1, :]
def _train(self, dataset): """Train the classifier using `data` (`Dataset`). """ # process targets based on the model family targets = dataset.sa[self.get_space()].value if self.params.family == 'gaussian': # do nothing, just save the targets as a list #targets = targets.tolist() self._utargets = None elif self.params.family == 'multinomial': # turn lables into list of range values starting at 1 #targets = _label2indlist(dataset.targets, # dataset.uniquetargets) targets_unique = dataset.sa[self.get_space()].unique targets = _label2oneofm(targets, targets_unique) # save some properties of the data/classification self._utargets = targets_unique.copy() # process the pmax if self.params.pmax is None: # set it to the num features pmax = dataset.nfeatures else: # use the value pmax = self.params.pmax try: self.__trained_model = trained_model = \ r.glmnet(dataset.samples, targets, family=self.params.family, alpha=self.params.alpha, nlambda=self.params.nlambda, standardize=self.params.standardize, thresh=self.params.thresh, pmax=pmax, maxit=self.params.maxit, type=self.params.model_type) except RRuntimeError as e: raise FailedToTrainError("Failed to train %s on %s. Got '%s' during call r.glmnet()." \ % (self, dataset, e)) self.__last_lambda = last_lambda = \ np.asanyarray(Rrx2(trained_model, 'lambda'))[-1] # set the weights to the last step weights = r.coef(trained_model, s=last_lambda) if self.params.family == 'multinomial': self.__weights = np.hstack([np.array(r['as.matrix'](weight))[1:] for weight in weights]) elif self.params.family == 'gaussian': self.__weights = np.array(r['as.matrix'](weights))[1:, 0] else: raise NotImplementedError("Somehow managed to get here with family %s." % \ (self.params.family,))
def _train(self, dataset): """Train the skl learner using `dataset` (`Dataset`). """ targets_sa = dataset.sa[self.get_space()] targets = targets_sa.value if not 'regression' in self.__tags__: targets = self._attrmap.to_numeric(targets) try: self._R_model = r[self._learner]( dataset.samples, targets, **self._kwargs) except RRuntimeError as e: raise FailedToTrainError("Failed to train %s on %s. Got '%s' during call to fit()." \ % (self, dataset, e))
def _train(self, data): """Train the classifier using `data` (`Dataset`). """ targets = data.sa[self.get_space()].value[:, np.newaxis] # some non-Python friendly R-lars arguments lars_kwargs = {'use.Gram': self.__use_Gram} if self.__max_steps is not None: lars_kwargs['max.steps'] = self.__max_steps trained_model = r.lars(data.samples, targets, type=self.__type, normalize=self.__normalize, intercept=self.__intercept, trace=self.__trace, **lars_kwargs) #import pydb #pydb.debugger() # find the step with the lowest Cp (risk) # it is often the last step if you set a max_steps # must first convert dictionary to array Cp_vals = None try: Cp_vals = np.asanyarray(Rrx2(trained_model, 'Cp')) except TypeError as e: raise FailedToTrainError("Failed to train %s on %s. Got '%s' while trying to access " \ "trained model %s" % (self, data, e, trained_model)) if Cp_vals is None: # if there were no any -- just choose 0th lowest_Cp_step = 0 elif np.isnan(Cp_vals[0]): # sometimes may come back nan, so just pick the last one lowest_Cp_step = len(Cp_vals) - 1 else: # determine the lowest lowest_Cp_step = Cp_vals.argmin() self.__lowest_Cp_step = lowest_Cp_step # set the weights to the lowest Cp step self.__weights = np.asanyarray(Rrx2(trained_model, 'beta'))[lowest_Cp_step] self.__trained_model = trained_model # bind to an instance
def _train(self, dataset): """Train the skl learner using `dataset` (`Dataset`). """ targets_sa = dataset.sa[self.get_space()] targets = targets_sa.value # Some sanity checking so some classifiers such as LDA do not # puke meaningless exceptions if 'lda' in self.__tags__: if not dataset.nsamples > len(targets_sa.unique): raise DegenerateInputError( "LDA requires # of samples exceeding # of classes") # we better map into numeric labels if it is not a regression if not 'regression' in self.__tags__: targets = self._attrmap.to_numeric(targets) try: # train underlying learner self._skl_learner.fit(dataset.samples, targets) except (ValueError, np.linalg.LinAlgError) as e: raise FailedToTrainError("Failed to train %s on %s. Got '%s' during call to fit()." \ % (self, dataset, e))
def _train(self, dataset): """Train SVM """ super(SVM, self)._train(dataset) targets_sa_name = self.get_space() # name of targets sa targets_sa = dataset.sa[targets_sa_name] # actual targets sa # libsvm needs doubles src = _data2ls(dataset) # libsvm cannot handle literal labels labels = self._attrmap.to_numeric(targets_sa.value).tolist() svmprob = _svm.SVMProblem(labels, src) # Translate few params TRANSLATEDICT = {'epsilon': 'eps', 'tube_epsilon': 'p'} args = [] for paramname, param in list(self.params.items()) \ + list(self.kernel_params.items()): if paramname in TRANSLATEDICT: argname = TRANSLATEDICT[paramname] elif paramname in _svm.SVMParameter.default_parameters: argname = paramname else: if __debug__: debug( "SVM_", "Skipping parameter %s since it is not known " "to libsvm" % paramname) continue args.append((argname, param.value)) # ??? All those parameters should be fetched if present from # **kwargs and create appropriate parameters within .params or # .kernel_params libsvm_param = _svm.SVMParameter( kernel_type=self.params.kernel.as_raw_ls(), # Just an integer ID svm_type=self._svm_type, **dict(args)) """Store SVM parameters in libSVM compatible format.""" if 'C' in self.params: # svm_type in [_svm.svmc.C_SVC]: Cs = self._get_cvec(dataset) if len(Cs) > 1: C0 = abs(Cs[0]) scale = 1.0 / (C0) #*np.sqrt(C0)) # so we got 1 C per label uls = self._attrmap.to_numeric(targets_sa.unique) if len(Cs) != len(uls): raise ValueError( "SVM was parameterized with %d Cs but there are %d " "labels in the dataset" % (len(Cs), len(targets_sa.unique))) weight = [c * scale for c in Cs] # All 3 need to be set to take an effect libsvm_param._set_parameter('weight', weight) libsvm_param._set_parameter('nr_weight', len(weight)) libsvm_param._set_parameter('weight_label', uls) libsvm_param._set_parameter('C', Cs[0]) try: self.__model = _svm.SVMModel(svmprob, libsvm_param) except Exception as e: raise FailedToTrainError(str(e))
def _train(self, dataset): """Train SVM """ super(SVM, self)._train(dataset) # XXX watchout # self.untrain() newkernel, newsvm = False, False # local bindings for faster lookup params = self.params retrainable = self.params.retrainable targets_sa_name = self.get_space() # name of targets sa targets_sa = dataset.sa[targets_sa_name] # actual targets sa if retrainable: _changedData = self._changedData # LABELS ul = None self.__traindataset = dataset # OK -- we have to map labels since # binary ones expect -1/+1 # Multiclass expect labels starting with 0, otherwise they puke # when ran from ipython... yikes if __debug__: debug("SG_", "Creating labels instance") if self.__is_regression__: labels_ = np.asarray(targets_sa.value, dtype='double') else: ul = targets_sa.unique # ul.sort() if len(ul) == 2: # assure that we have -1/+1 _labels_dict = {ul[0]: -1.0, ul[1]: +1.0} elif len(ul) < 2: raise FailedToTrainError( "We do not have 1-class SVM brought into SG yet") else: # can't use plain enumerate since we need them swapped _labels_dict = dict([(u, i) for i, u in enumerate(ul)]) # Create SG-customized attrmap to assure -1 / +1 if necessary self._attrmap = AttributeMap(_labels_dict, mapnumeric=True) if __debug__: debug("SG__", "Mapping labels using dict %s" % _labels_dict) labels_ = self._attrmap.to_numeric(targets_sa.value).astype(float) labels = shogun.Features.Labels(labels_) _setdebug(labels, 'Labels') # KERNEL # XXX cruel fix for now... whole retraining business needs to # be rethought if retrainable: _changedData['kernel_params'] = _changedData.get( 'kernel_params', False) # TODO: big RF to move non-kernel classifiers away if 'kernel-based' in self.__tags__ and (not retrainable or _changedData['traindata'] or _changedData['kernel_params']): # If needed compute or just collect arguments for SVM and for # the kernel if retrainable and __debug__: if _changedData['traindata']: debug( "SG", "Re-Creating kernel since training data has changed") if _changedData['kernel_params']: debug( "SG", "Re-Creating kernel since params %s has changed" % _changedData['kernel_params']) k = self.params.kernel k.compute(dataset) self.__kernel = kernel = k.as_raw_sg() newkernel = True self.kernel_params.reset() # mark them as not-changed #_setdebug(kernel, 'Kernels') #self.__condition_kernel(kernel) if retrainable: if __debug__: debug("SG_", "Resetting test kernel for retrainable SVM") self.__kernel_test = None # TODO -- handle _changedData['params'] correctly, ie without recreating # whole SVM Cs = None if not retrainable or self.__svm is None or _changedData['params']: # SVM if 'C' in self.params: Cs = self._get_cvec(dataset) # XXX do not jump over the head and leave it up to the user # ie do not rescale automagically by the number of samples #if len(Cs) == 2 and not ('regression' in self.__tags__) and len(ul) == 2: # # we were given two Cs # if np.max(C) < 0 and np.min(C) < 0: # # and both are requested to be 'scaled' TODO : # # provide proper 'features' to the parameters, # # so we could specify explicitely if to scale # # them by the number of samples here # nl = [np.sum(labels_ == _labels_dict[l]) for l in ul] # ratio = np.sqrt(float(nl[1]) / nl[0]) # #ratio = (float(nl[1]) / nl[0]) # Cs[0] *= ratio # Cs[1] /= ratio # if __debug__: # debug("SG_", "Rescaled Cs to %s to accomodate the " # "difference in number of training samples" % # Cs) # Choose appropriate implementation svm_impl_class = self.__get_implementation(ul) if __debug__: debug("SG", "Creating SVM instance of %s" % repr(svm_impl_class)) if 
self._svm_impl in ['libsvr', 'svrlight']: # for regressions constructor a bit different self.__svm = svm_impl_class(Cs[0], self.params.tube_epsilon, self.__kernel, labels) # we need to set epsilon explicitly self.__svm.set_epsilon(self.params.epsilon) elif self._svm_impl in ['krr']: self.__svm = svm_impl_class(self.params.tau, self.__kernel, labels) elif 'kernel-based' in self.__tags__: self.__svm = svm_impl_class(Cs[0], self.__kernel, labels) self.__svm.set_epsilon(self.params.epsilon) else: traindata_sg = _tosg(dataset.samples) self.__svm = svm_impl_class(Cs[0], traindata_sg, labels) self.__svm.set_epsilon(self.params.epsilon) # To stay compatible with versions across API changes in sg 1.0.0 self.__svm_apply = externals.versions['shogun'] >= '1' \ and self.__svm.apply \ or self.__svm.classify # the last one for old API # Set shrinking if 'shrinking' in params: shrinking = params.shrinking if __debug__: debug("SG_", "Setting shrinking to %s" % shrinking) self.__svm.set_shrinking_enabled(shrinking) if Cs is not None and len(Cs) == 2: if __debug__: debug( "SG_", "Since multiple Cs are provided: %s, assign them" % Cs) self.__svm.set_C(Cs[0], Cs[1]) self.params.reset() # mark them as not-changed newsvm = True _setdebug(self.__svm, 'SVM') # Set optimization parameters if 'tube_epsilon' in self.params and \ hasattr(self.__svm, 'set_tube_epsilon'): self.__svm.set_tube_epsilon(self.params.tube_epsilon) self.__svm.parallel.set_num_threads(self.params.num_threads) else: if __debug__: debug("SG_", "SVM instance is not re-created") if _changedData['targets']: # labels were changed if __debug__: debug("SG__", "Assigning new labels") self.__svm.set_labels(labels) if newkernel: # kernel was replaced if __debug__: debug("SG__", "Assigning new kernel") self.__svm.set_kernel(self.__kernel) assert (_changedData['params'] is False ) # we should never get here if retrainable: # we must assign it only if it is retrainable self.ca.retrained = not newsvm or not newkernel # Train if __debug__ and 'SG' in debug.active: if not self.__is_regression__: lstr = " with labels %s" % targets_sa.unique else: lstr = "" debug( "SG", "%sTraining %s on data%s" % (("", "Re-")[retrainable and self.ca.retrained], self, lstr)) self.__svm.train() if __debug__: debug("SG_", "Done training SG_SVM %s" % self) # Report on training if (__debug__ and 'SG__' in debug.active) or \ self.ca.is_enabled('training_stats'): if __debug__: debug("SG_", "Assessing predictions on training data") trained_targets = self.__svm_apply().get_labels() else: trained_targets = None if __debug__ and "SG__" in debug.active: debug( "SG__", "Original labels: %s, Trained labels: %s" % (targets_sa.value, trained_targets)) # Assign training confusion right away here since we are ready # to do so. # XXX TODO use some other conditional attribute like 'trained_targets' and # use it within base Classifier._posttrain to assign predictions # instead of duplicating code here # XXX For now it can be done only for regressions since labels need to # be remapped and that becomes even worse if we use regression # as a classifier so mapping happens upstairs if self.__is_regression__ and self.ca.is_enabled('training_stats'): self.ca.training_stats = self.__summary_class__( targets=targets_sa.value, predictions=trained_targets)
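
# Standalone illustration (plain numpy, no shogun) of the -1/+1 label mapping
# that the binary branch of the method above constructs via _labels_dict and
# AttributeMap; shogun's binary SVMs expect exactly these two values.
import numpy as np

targets = np.array(['rest', 'task', 'task', 'rest'])
ul = np.unique(targets)                       # -> ['rest', 'task']
labels_dict = {ul[0]: -1.0, ul[1]: +1.0}
labels_ = np.array([labels_dict[t] for t in targets])
# -> [-1.,  1.,  1., -1.]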
def _train(self, data): """Train the classifier using `data` (`Dataset`). """ # Set up the environment for fitting the data X = data.samples.T d = self._attrmap.to_numeric(data.sa[self.get_space()].value) if set(d) != set([0, 1]): raise ValueError("Regressors for logistic regression should be [0,1]. Got %s" \ %(set(d),)) if self.__reduced != 0: # Data have reduced rank from scipy.linalg import svd # Compensate for reduced rank: # Select only the n largest eigenvectors U, S, V = svd(X.T) if S[0] == 0: raise FailedToTrainError( "Data provided to PLR seems to be degenerate -- " "0-th singular value is 0") S /= S[0] V = np.matrix(V[:, :np.max(np.where(S > self.__reduced)) + 1]) # Map Data to the subspace spanned by the eigenvectors X = (X.T * V).T nfeatures, npatterns = X.shape # Weighting vector w = np.matrix(np.zeros((nfeatures + 1, 1), 'd')) # Error for convergence criterion dw = np.matrix(np.ones((nfeatures + 1, 1), 'd')) # Patterns of interest in the columns X = np.matrix( \ np.concatenate((X, np.ones((1, npatterns), 'd')), 0) \ ) p = np.matrix(np.zeros((1, npatterns), 'd')) # Matrix implementation of penalty term Lambda = self.__lm * np.identity(nfeatures + 1, 'd') Lambda[nfeatures, nfeatures] = 0 # Gradient g = np.matrix(np.zeros((nfeatures + 1, 1), 'd')) # Fisher information matrix H = np.matrix(np.identity(nfeatures + 1, 'd')) # Optimize k = 0 while np.sum(np.ravel(dw.A**2)) > self.__criterion: p[:, :] = self.__f(w.T * X) g[:, :] = X * (d - p).T - Lambda * w H[:, :] = X * np.diag(p.A1 * (1 - p.A1)) * X.T + Lambda dw[:, :] = H.I * g w += dw k += 1 if k > self.__maxiter: raise ConvergenceError("More than %d Iterations without convergence" % \ (self.__maxiter)) if __debug__: debug("PLR", \ "PLR converged after %d steps. Error: %g" % \ (k, np.sum(np.ravel(dw.A ** 2)))) if self.__reduced: # We have computed in rank reduced space -> # Project to original space self.w = V * w[:-1] self.bias = w[-1] else: self.w = w[:-1] self.bias = w[-1]