def _call(self, dataset):
    # OPT: local bindings
    clfclf = self.clf.clf
    analyzer = self.__analyzer
    if analyzer is None:
        analyzer = clfclf.get_sensitivity_analyzer(
            **(self._slave_kwargs))
        if analyzer is None:
            raise ValueError, \
                  "Wasn't able to figure basic analyzer for clf %s" % \
                  `clfclf`
        if __debug__:
            debug("SA", "Selected analyzer %s for clf %s" % \
                  (analyzer, clfclf))
        # bind to the instance finally
        self.__analyzer = analyzer

    # TODO "remove" unnecessary things below on each call...

    # assign corresponding classifier
    analyzer.clf = clfclf
    # if clf was trained already - don't train again
    if clfclf.trained:
        analyzer._force_training = False

    result = analyzer._call(dataset)
    self.ca.clf_sensitivities = result
    return result
def __del__(self):
    if __debug__:
        debug('CLF_', 'Destroying libsvm._SVMCParameter %s' % str(self))
    free_int_array(svmc.svm_parameter_weight_label_get(self.param))
    free_double_array(svmc.svm_parameter_weight_get(self.param))
    svmc.delete_svm_parameter(self.param)
def _call(self, dataset):
    sensitivities = []
    for ind, analyzer in enumerate(self.__analyzers):
        if __debug__:
            debug("SA", "Computing sensitivity for SA#%d:%s" %
                  (ind, analyzer))
        sensitivity = analyzer(dataset)
        sensitivities.append(sensitivity)

    if __debug__:
        debug("SA",
              "Returning %d sensitivities from %s" %
              (len(sensitivities), self.__class__.__name__))

    sa_attr = self._sa_attr
    if isinstance(sensitivities[0], AttrDataset):
        smerged = None
        for i, s in enumerate(sensitivities):
            s.sa[sa_attr] = np.repeat(i, len(s))
            if smerged is None:
                smerged = s
            else:
                smerged.append(s)
        sensitivities = smerged
    else:
        sensitivities = \
            Dataset(sensitivities,
                    sa={sa_attr: np.arange(len(sensitivities))})

    self.ca.sensitivities = sensitivities
    return sensitivities
def label_voxel(self, c, levels=None):
    if self.__referenceLevel is None:
        warning("You did not provide what level to use "
                "for reference. Assigning 0th level -- '%s'"
                % (self._levels[0],))
        self.set_reference_level(0)
        # return self.__referenceAtlas.label_voxel(c, levels)

    c = self._check_range(c)

    # obtain coordinates of the closest voxel
    cref = self._data[self.__referenceLevel.indexes, c[2], c[1], c[0]]
    dist = norm((cref - c) * self.voxdim)
    if __debug__:
        debug('ATL__', "Closest referenced point for %r is "
              "%r at distance %3.2f" % (c, cref, dist))
    if (self.distance - dist) >= 1e-3:  # neglect everything smaller
        result = self.__referenceAtlas.label_voxel(cref, levels)
        result['voxel_referenced'] = c
        result['distance'] = dist
    else:
        result = self.__referenceAtlas.label_voxel(c, levels)
        if __debug__:
            debug('ATL__', "Closest referenced point is "
                  "further than desired distance %.2f" % self.distance)
        result['voxel_referenced'] = None
        result['distance'] = 0

    return result
def _train(self, dataset):
    """Train SVM
    """
    targets_sa_name = self.params.targets_attr    # name of targets sa
    targets_sa = dataset.sa[targets_sa_name]      # actual targets sa

    # libsvm needs doubles
    src = _data2ls(dataset)
    # libsvm cannot handle literal labels
    labels = self._attrmap.to_numeric(targets_sa.value).tolist()

    svmprob = _svm.SVMProblem(labels, src)

    # Translate few params
    TRANSLATEDICT = {'epsilon': 'eps',
                     'tube_epsilon': 'p'}
    args = []
    for paramname, param in self.params.items() \
            + self.kernel_params.items():
        if paramname in TRANSLATEDICT:
            argname = TRANSLATEDICT[paramname]
        elif paramname in _svm.SVMParameter.default_parameters:
            argname = paramname
        else:
            if __debug__:
                debug("SVM_",
                      "Skipping parameter %s since it is not known "
                      "to libsvm" % paramname)
            continue
        args.append((argname, param.value))

    # ??? All those parameters should be fetched if present from
    # **kwargs and create appropriate parameters within .params or
    # .kernel_params
    libsvm_param = _svm.SVMParameter(
        kernel_type=self.params.kernel.as_raw_ls(),  # Just an integer ID
        svm_type=self._svm_type,
        **dict(args))
    """Store SVM parameters in libSVM compatible format."""

    if self.params.has_key('C'):  # svm_type in [_svm.svmc.C_SVC]:
        Cs = self._get_cvec(dataset)
        if len(Cs) > 1:
            C0 = abs(Cs[0])
            scale = 1.0 / (C0)  # *np.sqrt(C0))
            # so we got 1 C per label
            uls = self._attrmap.to_numeric(targets_sa.unique)
            if len(Cs) != len(uls):
                raise ValueError, "SVM was parameterized with %d Cs but " \
                      "there are %d labels in the dataset" % \
                      (len(Cs), len(targets_sa.unique))
            weight = [c * scale for c in Cs]
            # All 3 need to be set to take an effect
            libsvm_param._set_parameter('weight', weight)
            libsvm_param._set_parameter('nr_weight', len(weight))
            libsvm_param._set_parameter('weight_label', uls)
        libsvm_param._set_parameter('C', Cs[0])

    self.__model = _svm.SVMModel(svmprob, libsvm_param)
def _untrain(self):
    if __debug__:
        debug("FS_", "Untraining combined FS: %s" % self)
    for fs in self.__selectors:
        fs.untrain()
    # ask base class to do its untrain
    super(CombinedFeatureSelection, self)._untrain()
def _suppress_scipy_warnings():
    # Infiltrate warnings if necessary
    numpy_ver = versions['numpy']
    scipy_ver = versions['scipy']
    # There are way too many deprecation warnings spit out onto the
    # user. Lets assume that they should be fixed by scipy 0.7.0 time
    if scipy_ver >= "0.6.0" and scipy_ver < "0.7.0" \
            and numpy_ver > "1.1.0":
        import warnings
        if not __debug__ or (__debug__ and not 'PY' in debug.active):
            if __debug__:
                debug('EXT', "Setting up filters for numpy DeprecationWarnings")
            filter_lines = [
                ('NumpyTest will be removed in the next release.*',
                 DeprecationWarning),
                ('PyArray_FromDims: use PyArray_SimpleNew.',
                 DeprecationWarning),
                ('PyArray_FromDimsAndDataAndDescr: use PyArray_NewFromDescr.',
                 DeprecationWarning),
                # Trick re.match, since in warnings absent re.DOTALL in re.compile
                ('[\na-z \t0-9]*The original semantics of histogram is scheduled to be.*'
                 '[\na-z \t0-9]*', Warning),
                ]
            for f, w in filter_lines:
                warnings.filterwarnings('ignore', f, w)
def forward(self, data):
    """Map data from input to output space.

    Parameters
    ----------
    data : Dataset-like, (at least 2D)-array-like
      Typically this is a `Dataset`, but it might also be a plain data
      array, or even something completely different(TM) that is supported
      by a subclass' implementation. If such an object is Dataset-like it
      is handled by a dedicated method that also transforms dataset
      attributes if necessary. If an array-like is passed, it has to be
      at least two-dimensional, with the first axis separating samples
      or observations. For single samples `forward1()` might be more
      appropriate.
    """
    if is_datasetlike(data):
        if __debug__:
            debug('MAP', "Forward-map %s-shaped dataset through '%s'."
                  % (data.shape, self))
        return self._forward_dataset(data)
    else:
        if hasattr(data, 'ndim') and data.ndim < 2:
            raise ValueError(
                'Mapper.forward() only supports mapping of data with '
                'at least two dimensions, where the first axis '
                'separates samples/observations. Consider using '
                'Mapper.forward1() instead.')
        if __debug__:
            debug('MAP', "Forward-map data through '%s'." % (self))
        return self._forward_data(data)
def _SLcholesky_autoreg(C, nsteps=None, **kwargs):
    """Simple wrapper around cholesky to incrementally regularize the
    matrix until successful computation.

    For `nsteps` we boost diagonal 10-fold each time from the 'epsilon'
    of the respective dtype. If None -- would proceed until reaching 1.
    """
    if nsteps is None:
        nsteps = -int(np.floor(np.log10(np.finfo(float).eps)))
    result = None
    for step in xrange(nsteps):
        epsilon_value = (10**step) * np.finfo(C.dtype).eps
        epsilon = epsilon_value * np.eye(C.shape[0])
        try:
            result = SLcholesky(C + epsilon, lower=True)
        except SLAError, e:
            warning("Cholesky decomposition led to failure: %s. "
                    "As requested, performing auto-regularization but "
                    "for better control you might prefer to regularize "
                    "yourself by providing lm parameter to GPR" % e)
            if step < nsteps - 1:
                if __debug__:
                    debug("GPR", "Failed to obtain cholesky on "
                          "auto-regularization step %d value %g. Got %s."
                          " Boosting lambda more to reg. C."
                          % (step, epsilon_value, e))
                continue
            else:
                raise
        else:
            # factorization succeeded -- no further regularization needed
            break
    return result
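# A minimal standalone sketch (not part of the original sources) of the
# auto-regularization idea used by _SLcholesky_autoreg above: keep boosting
# a tiny diagonal "jitter" by factors of 10 until the Cholesky factorization
# succeeds. It relies only on numpy; the function and variable names below
# are illustrative assumptions.
import numpy as np

def cholesky_with_jitter(C, nsteps=16):
    """Lower-triangular Cholesky factor of C, adding 10**step * eps * I
    to the diagonal until the factorization succeeds."""
    eps = np.finfo(C.dtype).eps
    for step in range(nsteps):
        jitter = (10 ** step) * eps * np.eye(C.shape[0])
        try:
            return np.linalg.cholesky(C + jitter)
        except np.linalg.LinAlgError:
            if step == nsteps - 1:
                raise  # give up after the last regularization step

# usage sketch: a rank-deficient (hence not positive definite) covariance
# X = np.random.rand(100, 3)
# C = np.dot(X, X.T)              # 100x100 but only rank 3
# L = cholesky_with_jitter(C)     # succeeds once enough jitter is added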
def _set(self, val, init=False):
    different_value = self._value != val
    isarray = isinstance(different_value, np.ndarray)
    if self._ro and not init:
        raise RuntimeError, \
              "Attempt to set read-only parameter %s to %s" \
              % (self.name, val)
    if (isarray and np.any(different_value)) or \
       ((not isarray) and different_value):
        if __debug__:
            debug("COL",
                  "Parameter: setting %s to %s " % (str(self), val))
        if not isarray:
            if hasattr(self, 'min') and val < self.min:
                raise ValueError, \
                      "Minimal value for parameter %s is %s. Got %s" % \
                      (self.name, self.min, val)
            if hasattr(self, 'max') and val > self.max:
                raise ValueError, \
                      "Maximal value for parameter %s is %s. Got %s" % \
                      (self.name, self.max, val)
            if hasattr(self, 'choices') and (not val in self.choices):
                raise ValueError, \
                      "Valid choices for parameter %s are %s. Got %s" % \
                      (self.name, self.choices, val)
        self._value = val
        # Set 'isset' only if not called from initialization routine
        self._isset = not init  # True
    elif __debug__:
        debug("COL",
              "Parameter: not setting %s since value is the same"
              % (str(self)))
def __init__(self, name=None, enabled=True, doc="State variable"):
    CollectableAttribute.__init__(self, name, doc)
    self._isenabled = enabled
    self._defaultenabled = enabled
    if __debug__:
        debug("STV",
              "Initialized new state variable %s " % name + `self`)
def __new__(cls, *args, **kwargs):
    if len(args) > 0:
        if len(kwargs) > 0:
            raise ValueError, \
                  "Do not mix positional and keyword arguments. " \
                  "Use a single positional argument -- filename, " \
                  "or any number of keyword arguments, without having " \
                  "filename specified"
        if len(args) == 1 and isinstance(args[0], basestring):
            filename = args[0]
            args = args[1:]
            if __debug__:
                debug('IOH', 'Undigging hamster from %s' % filename)
            # compressed or not -- that is the question
            if filename.endswith('.gz'):
                f = gzip.open(filename)
            else:
                f = open(filename)
            result = cPickle.load(f)
            if not isinstance(result, Hamster):
                warning("Loaded other than Hamster class from %s" % filename)
            return result
        else:
            raise ValueError, "Hamster accepts only a single positional " \
                  "argument and it must be a filename. Got %d " \
                  "arguments" % (len(args),)
    else:
        return object.__new__(cls)
def __init__(self, name=None, hasunique=True, doc="Attribute with unique"):
    CollectableAttribute.__init__(self, name, doc)
    self._hasunique = hasunique
    self._resetUnique()
    if __debug__:
        debug("UATTR",
              "Initialized new AttributeWithUnique %s " % name + `self`)
def _call(self, dataset):
    sensitivities = []
    for ind, analyzer in enumerate(self.__analyzers):
        if __debug__:
            debug("SA", "Computing sensitivity for SA#%d:%s" %
                  (ind, analyzer))
        sensitivity = analyzer(dataset)
        sensitivities.append(sensitivity)

    if __debug__:
        debug("SA",
              "Returning combined using %s sensitivity across %d items" %
              (self.__combiner, len(sensitivities)))

    # TODO Simplify if we go Dataset-only
    if len(sensitivities) == 1:
        sensitivities = np.asanyarray(sensitivities[0])
    else:
        if isinstance(sensitivities[0], AttrDataset):
            smerged = None
            for i, s in enumerate(sensitivities):
                s.sa['splits'] = np.repeat(i, len(s))
                if smerged is None:
                    smerged = s
                else:
                    smerged.append(s)
            sensitivities = smerged
        else:
            sensitivities = \
                Dataset(sensitivities,
                        sa={'splits': np.arange(len(sensitivities))})

    self.ca.sensitivities = sensitivities
    return sensitivities
def dump(self, filename, compresslevel='auto'):
    """Bury the hamster into the file

    Parameters
    ----------
    filename : str
      Name of the target file. When writing to a compressed file the
      filename gets a '.gz' extension if not already specified. This
      is necessary as the constructor uses the extension to decide
      whether it loads from a compressed or uncompressed file.
    compresslevel : 'auto' or int
      Compression level setting passed to gzip. When set to 'auto',
      if filename ends with '.gz' `compresslevel` is set to 5, 0
      otherwise. However, when `compresslevel` is set to 0 gzip is
      bypassed completely and everything is written to an uncompressed
      file.
    """
    if compresslevel == 'auto':
        compresslevel = (0, 5)[int(filename.endswith('.gz'))]
    if compresslevel > 0 and not filename.endswith('.gz'):
        filename += '.gz'
    if __debug__:
        debug('IOH', 'Burying hamster into %s' % filename)
    if compresslevel == 0:
        f = open(filename, 'w')
    else:
        f = gzip.open(filename, 'w', compresslevel)
    cPickle.dump(self, f)
    f.close()
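# A hedged usage sketch (an assumption, not from the original sources) of
# the dump/load pair shown above: dump() pickles the instance, gzip-compressing
# it when the filename carries a '.gz' extension, and calling the class with a
# single filename argument (handled by __new__ above) unpickles it again. The
# keyword-payload constructor and the import path are assumptions and are
# therefore left as comments only.
#
#   h = Hamster(subject='s01', accuracy=0.83)   # arbitrary keyword payload
#   h.dump('results.gz')                        # gzip-compressed pickle
#   h2 = Hamster('results.gz')                  # load it back
#   assert h2.accuracy == 0.83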
def flowbreak(self):
    """Just a marker for the break of the flow
    """
    if __debug__ and not self in debug.handlers:
        debug("REP", "Adding flowbreak")
    self._story.append(self.__flowbreak)
def _SLcholesky_autoreg(C, nsteps=None, **kwargs):
    """Simple wrapper around cholesky to incrementally regularize the
    matrix until successful computation.

    For `nsteps` we boost diagonal 10-fold each time from the 'epsilon'
    of the respective dtype. If None -- would proceed until reaching 1.
    """
    if nsteps is None:
        nsteps = -int(np.floor(np.log10(np.finfo(float).eps)))
    result = None
    for step in xrange(nsteps):
        epsilon_value = (10**step) * np.finfo(C.dtype).eps
        epsilon = epsilon_value * np.eye(C.shape[0])
        try:
            result = SLcholesky(C + epsilon, lower=True)
        except SLAError, e:
            warning("Cholesky decomposition led to failure: %s. "
                    "As requested, performing auto-regularization but "
                    "for better control you might prefer to regularize "
                    "yourself by providing lm parameter to GPR" % e)
            if step < nsteps - 1:
                if __debug__:
                    debug("GPR", "Failed to obtain cholesky on "
                          "auto-regularization step %d value %g. Got %s."
                          " Boosting lambda more to reg. C."
                          % (step, epsilon_value, e))
                continue
            else:
                raise
        else:
            # factorization succeeded -- no further regularization needed
            break
    return result
def __init__(self, index=None, *args, **kwargs):
    """
    Parameters
    ----------
    value : arbitrary (see derived implementations)
      The actual value of this attribute.
    **kwargs
      Passed to `Collectable`
    """
    if index is None:
        IndexedCollectable._instance_index += 1
        index = IndexedCollectable._instance_index
    else:
        # TODO: there can be collision between custom provided indexes
        #       and the ones automagically assigned.
        #       Check might be due
        pass
    self._instance_index = index

    self._isset = False
    self.reset()

    Collectable.__init__(self, *args, **kwargs)

    if __debug__ and 'COL' in debug.active:
        debug("COL",
              "Initialized new IndexedCollectable #%d:%s %r"
              % (index, self.name, self))
def _recon_customobj_customrecon(hdf, memo):
    """Reconstruct a custom object from HDF using a custom reconstructor"""
    # we found something that has some special idea about how it wants
    # to be reconstructed
    mod_name = hdf.attrs['module']
    recon_name = hdf.attrs['recon']
    if mod_name == '__builtin__':
        raise NotImplementedError(
            "Built-in reconstructors are not supported (yet). "
            "Got: '%s'" % recon_name)

    if __debug__:
        debug('HDF5', "Load from custom reconstructor '%s.%s' [%s]"
              % (mod_name, recon_name, hdf.name))
    # turn names into definitions
    mod = __import__(mod_name, fromlist=[recon_name])
    recon = mod.__dict__[recon_name]

    if 'rcargs' in hdf:
        recon_args_hdf = hdf['rcargs']
        if __debug__:
            debug('HDF5', "Load reconstructor args in [%s]"
                  % recon_args_hdf.name)
        recon_args = _hdf_tupleitems_to_obj(recon_args_hdf, memo)
    else:
        recon_args = ()

    # reconstruct
    obj = recon(*recon_args)
    # TODO Handle potentially available state settings
    return obj
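# A minimal standalone sketch (an illustration, not from the original
# sources) of the "turn names into definitions" step used above: given a
# module name and a reconstructor name stored as strings, import the module
# and call the reconstructor with the stored arguments. The example
# module/function names in the usage line are assumptions.
def reconstruct_from_names(mod_name, recon_name, recon_args=()):
    # __import__ with fromlist returns the named (sub)module itself
    # rather than the top-level package
    mod = __import__(mod_name, fromlist=[recon_name])
    recon = getattr(mod, recon_name)
    return recon(*recon_args)

# usage sketch: rebuild a datetime.date from its stored constituents
# d = reconstruct_from_names('datetime', 'date', (2010, 1, 1))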
def _train(self, dataset): """Select the most important features Parameters ---------- dataset : Dataset used to compute sensitivity maps """ # optionally train the analyzer first if self.__train_analyzer: self.__sensitivity_analyzer.train(dataset) sensitivity = self.__sensitivity_analyzer(dataset) """Compute the sensitivity map.""" self.ca.sensitivity = sensitivity # Select features to preserve selected_ids = self.__feature_selector(sensitivity) if __debug__: debug("FS_", "Sensitivity: %s Selected ids: %s" % (sensitivity, selected_ids)) # XXX not sure if it really has to be sorted selected_ids.sort() # announce desired features to the underlying slice mapper self._safe_assign_slicearg(selected_ids) # and perform its own training super(SensitivityBasedFeatureSelection, self)._train(dataset)
def __reverse_single_level(self, wp):
    # local bindings
    level_paths = self.__level_paths

    # define wavelet packet to use
    WP = pywt.WaveletPacket(
        data=None, wavelet=self._wavelet,
        mode=self._mode, maxlevel=self.__level)

    # prepare storage
    signal_shape = wp.shape[:1] + self._inshape[1:]
    signal = np.zeros(signal_shape)
    Ntime_points = self._intimepoints
    for indexes in _get_indexes(signal_shape, self._dim):
        if __debug__:
            debug('MAP_', " %s" % (indexes,), lf=False, cr=True)

        for path, level_data in zip(level_paths, wp[indexes]):
            WP[path] = level_data

        signal[indexes] = WP.reconstruct(True)[:Ntime_points]

    return signal
def _prepredict(self, dataset):
    """Functionality prior to prediction
    """
    if not ('notrain2predict' in self.__tags__):
        # check if classifier was trained if that is needed
        if not self.trained:
            raise ValueError, \
                  "Classifier %s wasn't yet trained, therefore can't " \
                  "predict" % self
        nfeatures = dataset.nfeatures  # data.shape[1]
        # check if number of features is the same as in the data
        # it was trained on
        if nfeatures != self.__trainednfeatures:
            raise ValueError, \
                  "Classifier %s was trained on data with %d features, " % \
                  (self, self.__trainednfeatures) + \
                  "thus can't predict for %d features" % nfeatures

    if self.params.retrainable:
        if not self.__changedData_isset:
            self.__reset_changed_data()
            _changedData = self._changedData
            data = np.asanyarray(dataset.samples)
            _changedData['testdata'] = \
                self.__was_data_changed('testdata', data)
            if __debug__:
                debug('CLF_', "prepredict: Obtained _changedData is %s"
                      % (_changedData))
def _train(self, samples):
    """Determine the projection matrix onto the SVD components from
    a 2D samples x feature data matrix.
    """
    X = np.asmatrix(samples)
    X = self._demean_data(X)

    # singular value decomposition
    U, SV, Vh = np.linalg.svd(X, full_matrices=0)

    # store the final matrix with the new basis vectors to project the
    # features onto the SVD components. And store its .H right away to
    # avoid computing it in forward()
    self._proj = Vh.H

    # also store singular values of all components
    self._sv = SV

    if __debug__:
        debug("MAP", "SVD was done on %s and obtained %d SVs " %
              (samples, len(SV)) + " (%d non-0, max=%f)" %
              (len(SV.nonzero()), SV[0]))
        # .norm might be somewhat expensive to compute
        if "MAP_" in debug.active:
            debug("MAP_", "Mixing matrix has %s shape and norm=%f" %
                  (self._proj.shape, np.linalg.norm(self._proj)))
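# Standalone sketch (not from the original sources) of the SVD projection
# set up by _train above: demean a samples x features matrix, take the SVD,
# and keep the right singular vectors as the projection matrix so that
# forward mapping is a single dot product. Plain ndarrays are used instead
# of np.matrix; the names are illustrative.
import numpy as np

def svd_projection(samples):
    """Return (projection matrix, singular values) for demeaned samples."""
    X = np.asarray(samples, dtype=float)
    X = X - X.mean(axis=0)                      # demean each feature
    U, SV, Vh = np.linalg.svd(X, full_matrices=False)
    proj = Vh.conj().T                          # analogue of Vh.H above
    return proj, SV

# usage sketch
# X = np.random.rand(20, 5)
# proj, sv = svd_projection(X)
# mapped = np.dot(X - X.mean(axis=0), proj)    # forward-map onto SVD components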
def _filter2slice(self, bf):
    if self.__noslicing:
        # we are not allowed to help :-(
        return bf
    # the filter should be a boolean array
    if not len(bf):
        raise ValueError("'%s' received an empty filter. This is a "
                         "bug." % self.__class__.__name__)
    # get indices of non-zero filter elements
    idx = bf.nonzero()[0]
    idx_start = idx[0]
    idx_end = idx[-1] + 1
    idx_step = None
    if len(idx) > 1:
        # we need to figure out if there is a regular step-size
        # between elements
        stepsizes = np.unique(idx[1:] - idx[:-1])
        if len(stepsizes) > 1:
            # multiple step-sizes -> slicing is not possible -> return
            # original filter
            return bf
        else:
            idx_step = stepsizes[0]

    sl = slice(idx_start, idx_end, idx_step)
    if __debug__:
        debug("SPL", "Splitting by basic slicing is possible and permitted "
              "(%s)." % sl)
    return sl
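# Standalone sketch (not from the original sources) of the filter-to-slice
# conversion performed above: if the True entries of a boolean mask are
# evenly spaced, an equivalent slice object can replace fancy indexing.
# The helper name is an assumption.
import numpy as np

def mask2slice(mask):
    """Return a slice equivalent to boolean `mask`, or the mask itself
    if its True elements are not evenly spaced."""
    idx = np.asarray(mask).nonzero()[0]
    if not len(idx):
        raise ValueError("empty filter")
    step = None
    if len(idx) > 1:
        steps = np.unique(np.diff(idx))
        if len(steps) > 1:
            return mask                 # irregular spacing -> keep the mask
        step = int(steps[0])
    return slice(int(idx[0]), int(idx[-1]) + 1, step)

# usage sketch
# m = np.zeros(10, dtype=bool); m[2:8:2] = True
# mask2slice(m)                         # -> slice(2, 7, 2)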
def _call(self, dataset):
    # OPT: local bindings
    clfclf = self.clf.clf
    analyzer = self.__analyzer
    if analyzer is None:
        analyzer = clfclf.get_sensitivity_analyzer(
            **(self._slave_kwargs))
        if analyzer is None:
            raise ValueError, \
                  "Wasn't able to figure basic analyzer for clf %s" % \
                  `clfclf`
        if __debug__:
            debug("SA", "Selected analyzer %s for clf %s" % \
                  (analyzer, clfclf))
        # bind to the instance finally
        self.__analyzer = analyzer

    # TODO "remove" unnecessary things below on each call...

    # assign corresponding classifier
    analyzer.clf = clfclf
    # if clf was trained already - don't train again
    if clfclf.trained:
        analyzer._force_train = False

    result = analyzer._call(dataset)
    self.ca.clf_sensitivities = result
    return result
def __was_data_changed(self, key, entry, update=True):
    """Check if given entry was changed from what known prior.

    If so -- store only the ones needed for retrainable beastie
    """
    idhash_ = idhash(entry)
    __idhashes = self.__idhashes

    changed = __idhashes[key] != idhash_
    if __debug__ and 'CHECK_RETRAIN' in debug.active:
        __trained = self.__trained
        changed2 = entry != __trained[key]
        if isinstance(changed2, np.ndarray):
            changed2 = changed2.any()
        if changed != changed2 and not changed:
            raise RuntimeError, \
                  'idhash found to be weak for %s. Though hashid %s!=%s %s, ' \
                  'estimates %s!=%s %s' % \
                  (key, idhash_, __idhashes[key], changed,
                   entry, __trained[key], changed2)
        if update:
            __trained[key] = entry

    if __debug__ and changed:
        debug('CLF_', "Changed %s from %s to %s.%s"
              % (key, __idhashes[key], idhash_,
                 ('', 'updated')[int(update)]))
    if update:
        __idhashes[key] = idhash_

    return changed
def _set(self, val):
    if __debug__ and __mvpadebug__:
        # Since this is called quite often, don't convert
        # values to strings here, rely on passing them
        # within msgargs
        debug("COL", "Setting %(self)s to %(val)s ",
              msgargs={"self": self, "val": val})
    self._value = val
def _untrain(self):
    if __debug__:
        debug("FS_", "Untraining Iterative FS: %s" % self)
    self._fmeasure.untrain()
    self._pmeasure.untrain()
    # ask base class to do its untrain
    super(IterativeFeatureSelection, self)._untrain()
def _call(self, dataset, testdataset=None, **kwargs):
    """Invocation of the feature selection
    """
    wdataset = dataset
    wtestdataset = testdataset

    self.ca.selected_ids = None

    self.ca.nfeatures = []
    """Number of features at each step (before running selection)"""

    for fs in self.__feature_selections:

        # enable selected_ids state if it was requested from this class
        fs.ca.change_temporarily(enable_ca=["selected_ids"], other=self)
        if self.ca.is_enabled("nfeatures"):
            self.ca.nfeatures.append(wdataset.nfeatures)

        if __debug__:
            debug('FSPL', 'Invoking %s on (%s, %s)'
                  % (fs, wdataset, wtestdataset))

        wdataset, wtestdataset = fs(wdataset, wtestdataset, **kwargs)

        if self.ca.is_enabled("selected_ids"):
            if self.ca.selected_ids == None:
                self.ca.selected_ids = fs.ca.selected_ids
            else:
                self.ca.selected_ids = \
                    self.ca.selected_ids[fs.ca.selected_ids]

        fs.ca.reset_changed_temporarily()

    return (wdataset, wtestdataset)
def _prepredict(self, dataset):
    """Functionality prior to prediction
    """
    if not ('notrain2predict' in self.__tags__):
        # check if classifier was trained if that is needed
        if not self.trained:
            raise ValueError, \
                  "Classifier %s wasn't yet trained, therefore can't " \
                  "predict" % self
        nfeatures = dataset.nfeatures  # data.shape[1]
        # check if number of features is the same as in the data
        # it was trained on
        if nfeatures != self.__trainednfeatures:
            raise ValueError, \
                  "Classifier %s was trained on data with %d features, " % \
                  (self, self.__trainednfeatures) + \
                  "thus can't predict for %d features" % nfeatures

    if self.params.retrainable:
        if not self.__changedData_isset:
            self.__reset_changed_data()
            _changedData = self._changedData
            data = np.asanyarray(dataset.samples)
            _changedData['testdata'] = \
                self.__was_data_changed('testdata', data)
            if __debug__:
                debug('CLF_', "prepredict: Obtained _changedData is %s"
                      % (_changedData))
def _call(self, dataset):
    analyzers = []
    # create analyzers
    for clf in self.clf.clfs:
        if self.__analyzer is None:
            analyzer = clf.get_sensitivity_analyzer(**(self._slave_kwargs))
            if analyzer is None:
                raise ValueError, \
                      "Wasn't able to figure basic analyzer for clf %r" % \
                      (clf,)
            if __debug__:
                debug("SA", "Selected analyzer %r for clf %r" % \
                      (analyzer, clf))
        else:
            # XXX shallow copy should be enough...
            analyzer = copy.copy(self.__analyzer)

        # assign corresponding classifier
        analyzer.clf = clf
        # if clf was trained already - don't train again
        if clf.trained:
            analyzer._force_training = False
        analyzers.append(analyzer)

    self.__combined_analyzer.analyzers = analyzers

    # XXX not sure if we don't want to call directly ._call(dataset) to avoid
    # double application of transformers/combiners, after all we are just
    # 'proxying' here to combined_analyzer...
    # YOH: decided -- lets call ._call
    return self.__combined_analyzer._call(dataset)
def __init__(self, clf, labels=None, confusion_state="training_stats",
             **kwargs):
    """Initialization.

    Parameters
    ----------
    clf : Classifier
      Either trained or untrained classifier
    confusion_state
      Id of the conditional attribute which stores `ConfusionMatrix`
    labels : list
      if provided, should be a set of labels to add on top of the
      ones present in testdata
    """
    ClassifierError.__init__(self, clf, labels, **kwargs)

    self.__confusion_state = confusion_state
    """What state to extract from"""

    if not clf.ca.has_key(confusion_state):
        raise ValueError, \
              "Conditional attribute %s is not defined for classifier %r" % \
              (confusion_state, clf)
    if not clf.ca.is_enabled(confusion_state):
        if __debug__:
            debug('CERR', "Forcing state %s to be enabled for %r" %
                  (confusion_state, clf))
        clf.ca.enable(confusion_state)
def _train(self, samples):
    """Determine the projection matrix onto the SVD components from
    a 2D samples x feature data matrix.
    """
    X = np.asmatrix(samples)
    X = self._demean_data(X)

    # singular value decomposition
    U, SV, Vh = np.linalg.svd(X, full_matrices=0)

    # store the final matrix with the new basis vectors to project the
    # features onto the SVD components. And store its .H right away to
    # avoid computing it in forward()
    self._proj = Vh.H

    # also store singular values of all components
    self._sv = SV

    if __debug__:
        debug("MAP", "SVD was done on %s and obtained %d SVs " %
              (samples, len(SV)) + " (%d non-0, max=%f)" %
              (len(SV.nonzero()), SV[0]))
        # .norm might be somewhat expensive to compute
        if "MAP_" in debug.active:
            debug("MAP_", "Mixing matrix has %s shape and norm=%f" %
                  (self._proj.shape, np.linalg.norm(self._proj)))
def __reverseSingleLevel(self, wp):
    # local bindings
    level_paths = self.__level_paths

    # define wavelet packet to use
    WP = pywt.WaveletPacket(
        data=None, wavelet=self._wavelet,
        mode=self._mode, maxlevel=self.__level)

    # prepare storage
    signal_shape = wp.shape[:1] + self.getInSize()
    signal = N.zeros(signal_shape)
    Ntime_points = self._intimepoints
    for indexes in _getIndexes(signal_shape, self._dim):
        if __debug__:
            debug('MAP_', " %s" % (indexes,), lf=False, cr=True)

        for path, level_data in zip(level_paths, wp[indexes]):
            WP[path] = level_data

        signal[indexes] = WP.reconstruct(True)[:Ntime_points]

    return signal
def __was_data_changed(self, key, entry, update=True):
    """Check if given entry was changed from what known prior.

    If so -- store only the ones needed for retrainable beastie
    """
    idhash_ = idhash(entry)
    __idhashes = self.__idhashes

    changed = __idhashes[key] != idhash_
    if __debug__ and 'CHECK_RETRAIN' in debug.active:
        __trained = self.__trained
        changed2 = entry != __trained[key]
        if isinstance(changed2, np.ndarray):
            changed2 = changed2.any()
        if changed != changed2 and not changed:
            raise RuntimeError, \
                  'idhash found to be weak for %s. Though hashid %s!=%s %s, ' \
                  'estimates %s!=%s %s' % \
                  (key, idhash_, __idhashes[key], changed,
                   entry, __trained[key], changed2)
        if update:
            __trained[key] = entry

    if __debug__ and changed:
        debug('CLF_', "Changed %s from %s to %s.%s"
              % (key, __idhashes[key], idhash_,
                 ('', 'updated')[int(update)]))
    if update:
        __idhashes[key] = idhash_

    return changed
def _forward_dataset(self, dataset):
    """Forward-map a dataset.

    This is a private method that can be reimplemented in derived
    classes. The default implementation forward-maps the dataset samples
    and returns a new dataset that is a shallow copy of the input with
    the mapped samples.

    Parameters
    ----------
    dataset : Dataset-like
    """
    if __debug__:
        debug('MAP_', "Forward-map %s-shaped samples in dataset with '%s'."
              % (dataset.samples.shape, self))
    msamples = self._forward_data(dataset.samples)
    if __debug__:
        debug('MAP_', "Make shallow copy of to-be-forward-mapped dataset "
              "and assigned forward-mapped samples ({sf}a_filters: "
              "%s, %s, %s)." % (self._sa_filter, self._fa_filter,
                                self._a_filter))
    mds = dataset.copy(deep=False,
                       sa=self._sa_filter,
                       fa=self._fa_filter,
                       a=self._a_filter)
    mds.samples = msamples
    return mds
def labelVoxel(self, c, levels=None):
    if self.__referenceLevel is None:
        warning("You did not provide what level to use "
                "for reference. Assigning 0th level -- '%s'"
                % (self._levels_dict[0],))
        self.setReferenceLevel(0)
        # return self.__referenceAtlas.labelVoxel(c, levels)

    c = self._checkRange(c)

    # obtain coordinates of the closest voxel
    cref = self._data[self.__referenceLevel.indexes, c[2], c[1], c[0]]
    dist = norm((cref - c) * self.voxdim)
    if __debug__:
        debug('ATL__', "Closest referenced point for %s is "
              "%s at distance %3.2f" % (`c`, `cref`, dist))
    if (self.distance - dist) >= 1e-3:  # neglect everything smaller
        result = self.__referenceAtlas.labelVoxel(cref, levels)
        result['voxel_referenced'] = c
        result['distance'] = dist
    else:
        result = self.__referenceAtlas.labelVoxel(c, levels)
        if __debug__:
            debug('ATL__', "Closest referenced point is "
                  "further than desired distance %.2f" % self.distance)
        result['voxel_referenced'] = None
        result['distance'] = 0

    return result
def __check_scipy():
    """Check if scipy is present and if it is -- store its version
    """
    import warnings
    exists('numpy', raiseException=True)
    # Don't allow any crappy warning to sneak in
    warnings.simplefilter('ignore', DeprecationWarning)
    try:
        import scipy as sp
    except:
        warnings.simplefilter('default', DeprecationWarning)
        raise
    warnings.simplefilter('default', DeprecationWarning)

    # Infiltrate warnings if necessary
    numpy_ver = versions['numpy']
    scipy_ver = versions['scipy'] = SmartVersion(sp.__version__)
    # There are way too many deprecation warnings spit out onto the
    # user. Lets assume that they should be fixed by scipy 0.7.0 time
    if scipy_ver >= "0.6.0" and scipy_ver < "0.7.0" \
            and numpy_ver > "1.1.0":
        import warnings
        if not __debug__ or (__debug__ and not 'PY' in debug.active):
            debug('EXT', "Setting up filters for numpy DeprecationWarnings")
            filter_lines = [
                ('NumpyTest will be removed in the next release.*',
                 DeprecationWarning),
                ('PyArray_FromDims: use PyArray_SimpleNew.',
                 DeprecationWarning),
                ('PyArray_FromDimsAndDataAndDescr: use PyArray_NewFromDescr.',
                 DeprecationWarning),
                # Trick re.match, since in warnings absent re.DOTALL in re.compile
                ('[\na-z \t0-9]*The original semantics of histogram is scheduled to be.*'
                 '[\na-z \t0-9]*', Warning),
                ]
            for f, w in filter_lines:
                warnings.filterwarnings('ignore', f, w)
def _postcall(self, dataset, result):
    """Some postprocessing on the result
    """
    self.raw_result = result
    if not self.__transformer is None:
        if __debug__:
            debug("SA_", "Applying transformer %s" % self.__transformer)
        result = self.__transformer(result)

    # estimate the NULL distribution when functor is given
    if not self.__null_dist is None:
        if __debug__:
            debug("SA_", "Estimating NULL distribution using %s"
                  % self.__null_dist)

        # we need a matching datameasure instance, but we have to disable
        # the estimation of the null distribution in that child to prevent
        # infinite looping.
        measure = copy.copy(self)
        measure.__null_dist = None
        self.__null_dist.fit(measure, dataset)

        if self.states.isEnabled('null_t'):
            # get probability under NULL hyp, but also request
            # whether it belongs to the right tail
            null_prob, null_right_tail = \
                self.__null_dist.p(result, return_tails=True)
            self.null_prob = null_prob

            externals.exists('scipy', raiseException=True)
            from scipy.stats import norm

            # TODO: following logic should appear in NullDist,
            #       not here
            tail = self.null_dist.tail
            if tail == 'left':
                acdf = N.abs(null_prob)
            elif tail == 'right':
                acdf = 1.0 - N.abs(null_prob)
            elif tail in ['any', 'both']:
                acdf = 1.0 - N.clip(N.abs(null_prob), 0, 0.5)
            else:
                raise RuntimeError, 'Unhandled tail %s' % tail

            # We need to clip to avoid non-informative inf's ;-)
            # that happens due to lack of precision in mantissa
            # which is 11 bits in double. We could clip values
            # around 0 at as low as 1e-100 (correspond to z~=21),
            # but for consistency lets clip at 1e-16 which leads
            # to distinguishable value around p=1 and max z=8.2.
            # Should be sufficient range of z-values ;-)
            clip = 1e-16
            null_t = norm.ppf(N.clip(acdf, clip, 1.0 - clip))
            null_t[~null_right_tail] *= -1.0  # revert sign for negatives
            self.null_t = null_t  # store
        else:
            # get probability of result under NULL hypothesis if available
            # and don't request tail information
            self.null_prob = self.__null_dist.p(result)

    return result
def _call(self, dataset, testdataset=None, **kwargs):
    """Invocation of the feature selection
    """
    wdataset = dataset
    wtestdataset = testdataset

    self.ca.selected_ids = None

    self.ca.nfeatures = []
    """Number of features at each step (before running selection)"""

    for fs in self.__feature_selections:

        # enable selected_ids state if it was requested from this class
        fs.ca.change_temporarily(enable_ca=["selected_ids"], other=self)
        if self.ca.is_enabled("nfeatures"):
            self.ca.nfeatures.append(wdataset.nfeatures)

        if __debug__:
            debug('FSPL', 'Invoking %s on (%s, %s)'
                  % (fs, wdataset, wtestdataset))

        wdataset, wtestdataset = fs(wdataset, wtestdataset, **kwargs)

        if self.ca.is_enabled("selected_ids"):
            if self.ca.selected_ids == None:
                self.ca.selected_ids = fs.ca.selected_ids
            else:
                self.ca.selected_ids = \
                    self.ca.selected_ids[fs.ca.selected_ids]

        fs.ca.reset_changed_temporarily()

    return (wdataset, wtestdataset)
def _call(self, dataset):
    # local bindings
    analyzer = self.__analyzer
    insplit_index = self.__insplit_index

    sensitivities = []
    self.splits = splits = []
    store_splits = self.states.isEnabled("splits")

    for ind, split in enumerate(self.__splitter(dataset)):
        ds = split[insplit_index]
        if __debug__ and "SA" in debug.active:
            debug("SA", "Computing sensitivity for split %d on "
                  "dataset %s using %s" % (ind, ds, analyzer))
        sensitivity = analyzer(ds)
        sensitivities.append(sensitivity)
        if store_splits:
            splits.append(split)

    self.sensitivities = sensitivities
    if __debug__:
        debug("SA",
              "Returning sensitivities combined using %s across %d items "
              "generated by splitter %s" %
              (self.__combiner, len(sensitivities), self.__splitter))

    if self.__combiner is not None:
        sensitivities = self.__combiner(sensitivities)
    else:
        # assure that we have an ndarray on output
        sensitivities = N.asarray(sensitivities)

    return sensitivities
def _call(self, dataset):
    analyzers = []
    # create analyzers
    for clf in self.clf.clfs:
        if self.__analyzer is None:
            analyzer = clf.get_sensitivity_analyzer(**(self._slave_kwargs))
            if analyzer is None:
                raise ValueError, \
                      "Wasn't able to figure basic analyzer for clf %r" % \
                      (clf,)
            if __debug__:
                debug("SA", "Selected analyzer %r for clf %r" % \
                      (analyzer, clf))
        else:
            # XXX shallow copy should be enough...
            analyzer = copy.copy(self.__analyzer)

        # assign corresponding classifier
        analyzer.clf = clf
        # if clf was trained already - don't train again
        if clf.trained:
            analyzer._force_train = False
        analyzers.append(analyzer)

    self.__combined_analyzer.analyzers = analyzers

    # XXX not sure if we don't want to call directly ._call(dataset) to avoid
    # double application of transformers/combiners, after all we are just
    # 'proxying' here to combined_analyzer...
    # YOH: decided -- lets call ._call
    return self.__combined_analyzer._call(dataset)
def train(self, dataset):
    """Train classifier on a dataset

    Shouldn't be overridden in subclasses unless explicitly needed
    to do so
    """
    if dataset.nfeatures == 0 or dataset.nsamples == 0:
        raise DegenerateInputError, \
              "Cannot train classifier on degenerate data %s" % dataset
    if __debug__:
        debug("CLF", "Training classifier %(clf)s on dataset %(dataset)s",
              msgargs={'clf': self, 'dataset': dataset})

    self._pretrain(dataset)

    # remember the time when started training
    t0 = time.time()

    if dataset.nfeatures > 0:
        result = self._train(dataset)
    else:
        warning("Trying to train on dataset with no features present")
        if __debug__:
            debug("CLF",
                  "No features present for training, no actual training "
                  "is called")
        result = None

    self.ca.training_time = time.time() - t0
    self._posttrain(dataset)
    return result
def _train(self, dataset):
    """Train SVM
    """
    targets_sa_name = self.params.targets_attr    # name of targets sa
    targets_sa = dataset.sa[targets_sa_name]      # actual targets sa

    # libsvm needs doubles
    src = _data2ls(dataset)
    # libsvm cannot handle literal labels
    labels = self._attrmap.to_numeric(targets_sa.value).tolist()

    svmprob = _svm.SVMProblem(labels, src)

    # Translate few params
    TRANSLATEDICT = {'epsilon': 'eps',
                     'tube_epsilon': 'p'}
    args = []
    for paramname, param in self.params.items() \
            + self.kernel_params.items():
        if paramname in TRANSLATEDICT:
            argname = TRANSLATEDICT[paramname]
        elif paramname in _svm.SVMParameter.default_parameters:
            argname = paramname
        else:
            if __debug__:
                debug("SVM_",
                      "Skipping parameter %s since it is not known "
                      "to libsvm" % paramname)
            continue
        args.append((argname, param.value))

    # ??? All those parameters should be fetched if present from
    # **kwargs and create appropriate parameters within .params or
    # .kernel_params
    libsvm_param = _svm.SVMParameter(
        kernel_type=self.params.kernel.as_raw_ls(),  # Just an integer ID
        svm_type=self._svm_type,
        **dict(args))
    """Store SVM parameters in libSVM compatible format."""

    if self.params.has_key('C'):  # svm_type in [_svm.svmc.C_SVC]:
        Cs = self._get_cvec(dataset)
        if len(Cs) > 1:
            C0 = abs(Cs[0])
            scale = 1.0 / (C0)  # *np.sqrt(C0))
            # so we got 1 C per label
            uls = self._attrmap.to_numeric(targets_sa.unique)
            if len(Cs) != len(uls):
                raise ValueError, "SVM was parameterized with %d Cs but " \
                      "there are %d labels in the dataset" % \
                      (len(Cs), len(targets_sa.unique))
            weight = [c * scale for c in Cs]
            # All 3 need to be set to take an effect
            libsvm_param._set_parameter('weight', weight)
            libsvm_param._set_parameter('nr_weight', len(weight))
            libsvm_param._set_parameter('weight_label', uls)
        libsvm_param._set_parameter('C', Cs[0])

    self.__model = _svm.SVMModel(svmprob, libsvm_param)
def seed(random_seed):
    if __debug__:
        debug('SG', "Seeding shogun's RNG with %s" % random_seed)
    try:
        # reuse the same seed for shogun
        shogun.Library.Math_init_random(random_seed)
    except Exception, e:
        warning('Shogun cannot be seeded due to %s' % (e,))
def _seqitems_to_hdf(obj, hdf, memo, noid=False, **kwargs):
    """Store a sequence as HDF item list"""
    hdf.attrs.create('length', len(obj))
    items = hdf.create_group('items')
    for i, item in enumerate(obj):
        if __debug__:
            debug('HDF5', "Item %i" % i)
        obj2hdf(items, item, name=str(i), memo=memo, noid=noid, **kwargs)
def xml(self, line, style=None):
    """Add an XML string to the report
    """
    if __debug__ and not self in debug.handlers:
        debug("REP", "Adding xml '%s'" % line.strip())
    if style is None:
        style = self.style
    self._story.append(Paragraph(line, style=style))
def _verbose_callback(option, optstr, value, parser):
    """Callback for -v|--verbose cmdline option
    """
    if __debug__:
        debug("CMDLINE", "Setting verbose.level to %s" % str(value))
    verbose.level = value
    optstr = optstr  # pylint shut up
    setattr(parser.values, option.dest, value)
def untrain(self):
    """Untrain libsvm's SVM: forget the model
    """
    if __debug__ and "SVM" in debug.active:
        debug("SVM", "Untraining %s and destroying libsvm model" % self)
    super(SVM, self).untrain()
    del self.__model
    self.__model = None
def _wm_forward(self, data):
    if __debug__:
        debug('MAP', "Converting signal using DWP")
    if self.__level is None:
        return self.__forward_multiple_levels(data)
    else:
        return self.__forward_single_level(data)
def __init__(self, descr=None, **kwargs):
    """Initialize ClassWithCollections object

    Parameters
    ----------
    descr : str
      Description of the instance
    """
    # Note: __params_set was initialized in __new__
    if not self.__params_set:
        self.__descr = descr
        """Set humane description for the object"""

        # To avoid double initialization in case of multiple inheritance
        self.__params_set = True

        collections = self._collections
        # Assign attributes values if they are given among
        # **kwargs
        for arg, argument in kwargs.items():
            isset = False
            for collection in collections.itervalues():
                if collection._is_initializable(arg):
                    collection._initialize(arg, argument)
                    isset = True
                    break
            if isset:
                _ = kwargs.pop(arg)
            else:
                known_params = reduce(
                    lambda x, y: x + y,
                    [x.keys() for x in collections.itervalues()], [])
                raise TypeError, \
                      "Unexpected keyword argument %s=%s for %s." \
                      % (arg, argument, self) \
                      + " Valid parameters are %s" % known_params

        ## Initialize other base classes
        ##  commented out since it seems to be of no use for now
        #if init_classes is not None:
        #    # return back stateful arguments since they might be
        #    # processed by underlying classes
        #    kwargs.update(kwargs_stateful)
        #    for cls in init_classes:
        #        cls.__init__(self, **kwargs)
        #else:
        #    if len(kwargs)>0:
        #        known_params = reduce(lambda x, y: x + y, \
        #                            [x.keys() for x in collections],
        #                            [])
        #        raise TypeError, \
        #              "Unknown parameters %s for %s." % (kwargs.keys(),
        #                                                 self) \
        #              + " Valid parameters are %s" % known_params

    if __debug__:
        debug("COL", "ClassWithCollections.__init__ was done "
              "for %s#%s with descr=%s"
              % (self.__class__.__name__, id(self), descr))
def _set_enabled(self, value=False):
    if self.__enabled == value:
        # Do nothing since it is already in proper state
        return
    if __debug__:
        debug("STV", "%s %s" %
              ({True: 'Enabling', False: 'Disabling'}[value], self))
    self.__enabled = value
def reset_changed_temporarily(self):
    """Reset to previously stored set of enabled ca"""
    if __debug__:
        debug("ST", "Resetting to previous set of enabled ca")
    if len(self.enabled) > 0:
        self.enabled = self.__storedTemporarily.pop()
    else:
        raise ValueError("Trying to restore not-stored list of enabled "
                         "ca")
def text(self, line, **kwargs):
    """Add a text string to the report
    """
    if __debug__ and not self in debug.handlers:
        debug("REP_", "Adding text '%s'" % line.strip())
    # we need to convert some of the characters to make it
    # legal XML
    line = escape_xml(line)
    self.xml(line, **kwargs)