def test_uniquemerge2literal():
    assert_equal(u2l(range(3)), ['0+1+2'])
    assert_equal(u2l(np.arange(6).reshape(2, 3)), ['[0 1 2]+[3 4 5]'])
    assert_array_equal(u2l([[2, 3, 4]]), [[2, 3, 4]])
    assert_array_equal(u2l([[2, 3, 4], [2, 3, 4]]), [[2, 3, 4]])
    assert_equal(u2l([2, 2, 2]), [2])
    assert_array_equal(u2l(['L1', 'L1']), ['L1'])
    # we should not lose our precious "tuples"
    assert_equal(u2l(asobjarray([('1', '0'), ('1', '0')])),
                 asobjarray([('1', '0')]))

def _hdf_list_to_objarray(hdf, memo):
    if 'shape' not in hdf.attrs:
        if __debug__:
            debug('HDF5',
                  "Encountered objarray stored without shape (due to a bug "
                  "in post 2.1 release). Some nested structures etc. might "
                  "not be loaded correctly")
        # yoh: we have possibly a problematic case due to my fix earlier
        # resolve to old logic: nested referencing might not work :-/
        obj = _hdf_list_to_obj(hdf, memo)
        # need to handle special case of arrays of objects
        if np.isscalar(obj):
            obj = np.array(obj, dtype=np.object)
        else:
            obj = asobjarray(obj)
    else:
        shape = tuple(hdf.attrs['shape'])
        # reserve space first
        if len(shape):
            obj = np.empty(np.prod(shape), dtype=object)
        else:
            # scalar
            obj = np.array(None, dtype=object)
        # now load the items from the list, noting existence of this
        # container
        obj_items = _hdf_list_to_obj(hdf, memo, target_container=obj)
        # assign to the object array
        for i, v in enumerate(obj_items):
            obj[i] = v
        if len(shape) and shape != obj.shape:
            obj = obj.reshape(shape)
    return obj

def test_asobjarray(self):
    for i in ([1, 2, 3], ['a', 2, '3'], ('asd')):
        i_con = asobjarray(i)
        self.assertTrue(i_con.dtype is np.dtype('object'))
        self.assertEqual(len(i), len(i_con))
        self.assertTrue(np.all(i == i_con))

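# For context: a minimal sketch of what asobjarray() is assumed to do in
# these examples (a hypothetical stand-in, not the actual mvpa2
# implementation).  The point of filling an empty dtype=object array is
# that tuples survive as single elements instead of being unpacked into
# rows by a plain np.array() call:
import numpy as np

def asobjarray_sketch(x):
    out = np.empty(len(x), dtype=object)
    for i, v in enumerate(x):
        out[i] = v
    return out

# asobjarray_sketch([('1', '0'), ('1', '0')])[0] -> ('1', '0'), still a tuple
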
def _call(self, dataset=None):
    """Extract weights from PLR classifier.

    PLR always has weights available, so nothing has to be computed here.
    """
    clf = self.clf
    attrmap = clf._attrmap
    if attrmap:
        # labels (values of the corresponding space) which were used
        # for mapping.  Here we rely on the fact that they are sorted
        # originally (just an arange())
        labels_num = list(attrmap.values())
        labels = attrmap.to_literal(asobjarray([tuple(sorted(labels_num))]),
                                    recurse=True)
    else:
        labels = [(0, 1)]  # we just had our good old numeric ones
    ds = Dataset(clf.w.T,
                 sa={clf.get_space(): labels,
                     'biases': [clf.bias]})
    return ds

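# Aside (an assumption from reading the code above): wrapping the sorted
# label tuple in asobjarray([...]) keeps it a single dtype=object element,
# presumably so that attrmap.to_literal(..., recurse=True) can map the
# values inside the tuple rather than iterating over an unpacked 2-D array.
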
def _call(self, dataset):
    # For a binary decision between two labels, for all pairwise
    # combinations of labels in the dataset, compute weights per feature
    # as the difference between the means given each label, divided by
    # the variance.
    clf = self.clf
    # get means of all attributes given class label
    means = clf.means
    # number of features
    nfeat = clf.means.shape[1]
    # all pairwise combinations of labels
    pairs = list(itertools.combinations(range(len(clf.ulabels)), 2))
    weights = np.zeros([len(pairs), nfeat])
    # do not compute sensitivity for features with variance 0, as this
    # would entail a division by zero
    nonzero_vars = clf.variances != 0
    assert clf.params.common_variance
    nonzero_vars0 = nonzero_vars[0, :]
    for idx, pair in enumerate(pairs):
        # two-class sensitivity for (L0, L1) assumes that L1 is the
        # "positive one"
        weights[idx, nonzero_vars0] = \
            (means[pair[1], nonzero_vars0] -
             means[pair[0], nonzero_vars0]) / \
            clf.variances[pair[0], nonzero_vars0]
    # put everything into a Dataset
    ds = Dataset(weights,
                 sa={clf.get_space():
                     asobjarray([(clf.ulabels[p1], clf.ulabels[p2])
                                 for p1, p2 in pairs])})
    return ds

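# A minimal numeric sketch of the weight formula above, assuming two
# classes and common variance: per feature f,
#   w_f = (mean(L1)_f - mean(L0)_f) / var_f
import numpy as np
means = np.array([[0., 1.],    # class 0 means per feature
                  [2., 1.]])   # class 1 means per feature
variances = np.array([[1., 4.],
                      [1., 4.]])
w = (means[1] - means[0]) / variances[0]  # -> array([2., 0.])
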
def _call(self, dataset):
    # XXX Hm... it might make sense to unify access function
    # naming across our swig libsvm wrapper and sg access
    # functions for svm
    clf = self.clf
    sgsvm = clf.svm
    sens_labels = None
    if isinstance(sgsvm, shogun.Classifier.MultiClassSVM):
        sens, biases = [], []
        nsvms = sgsvm.get_num_svms()
        clabels = sorted(clf._attrmap.values())
        nclabels = len(clabels)
        sens_labels = []
        isvm = 0  # index for svm among known
        for i in xrange(nclabels):
            for j in xrange(i + 1, nclabels):
                sgsvmi = sgsvm.get_svm(isvm)
                labels_tuple = (clabels[i], clabels[j])
                # Since we gave the labels in incremental order,
                # we should always be right -- but it does not
                # hurt to check if the set of labels is the same
                if __debug__ and _shogun_exposes_slavesvm_labels:
                    if not sgsvmi.get_labels():
                        # We need to call classify() so labels get assigned
                        # to the multiclass SVM
                        sgsvm.classify()
                    assert(set([sgsvmi.get_label(int(x))
                                for x in sgsvmi.get_support_vectors()])
                           == set(labels_tuple))
                sens1, bias = self.__sg_helper(sgsvmi)
                sens.append(sens1)
                biases.append(bias)
                sens_labels += [labels_tuple[::-1]]  # ??? positive first
                isvm += 1
        assert(len(sens) == nsvms)  # we should have covered all
    else:
        sens1, bias = self.__sg_helper(sgsvm)
        biases = np.atleast_1d(bias)
        sens = np.atleast_2d(sens1)
        if not clf.__is_regression__:
            assert(set(clf._attrmap.values()) == set([-1.0, 1.0]))
            assert(sens.shape[0] == 1)
            sens_labels = [(-1.0, 1.0)]
    ds = Dataset(np.atleast_2d(sens))
    if sens_labels is not None:
        if isinstance(sens_labels[0], tuple):
            # Need to have them in an array of dtype object
            sens_labels = asobjarray(sens_labels)
        if len(clf._attrmap):
            sens_labels = clf._attrmap.to_literal(sens_labels, recurse=True)
        ds.sa[clf.get_space()] = sens_labels
    ds.sa['biases'] = biases
    return ds

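# Ordering sketch (as assumed by the loop above): with clabels == [0, 1, 2]
# the slave SVMs are visited as the pairs (0, 1), (0, 2), (1, 2), and each
# stored label tuple is reversed to (1, 0), (2, 0), (2, 1) so that the
# "positive" class comes first.
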
def test_asobjarray(self):
    for i in ([1, 2, 3], ['a', 2, '3'], ('asd')):
        i_con = asobjarray(i)
        self.assertTrue(i_con.dtype is np.dtype('object'))
        self.assertEqual(len(i), len(i_con))
        # Note: in Python 3 the ['a', 2, '3'] list is converted to
        # an array with elements 'a', '2' and '3' (i.e. the string
        # representation of the second element), and thus
        # np.all(i == i_con) fails.  Instead, each element is tested
        # for equality separately here.
        # XXX is this an issue?
        self.assertTrue(all(i[j] == i_con[j] for j in range(len(i))))

def _call(self, dataset):
    sens = super(RegressionAsClassifierSensitivityAnalyzer,
                 self)._call(dataset)
    # We can have only a single sensitivity out of regression
    assert sens.shape[0] == 1
    clf = self.clf
    targets_attr = clf.get_space()
    if targets_attr not in sens.sa:
        # We just assign a tuple of all labels sorted
        labels = tuple(sorted(clf._trained_attrmap.values()))
        if len(clf._trained_attrmap):
            labels = clf._trained_attrmap.to_literal(labels, recurse=True)
        sens.sa[targets_attr] = asobjarray([labels])
    return sens

def _uniquemerge2literal(attrs):
    """Compress a sequence into its unique elements (with string merge).

    Whenever there is more than one unique element in `attrs`, these are
    converted to strings and joined with a '+' character in between.

    Parameters
    ----------
    attrs : sequence, arbitrary

    Returns
    -------
    Non-sequence arguments are passed through as is; otherwise a sequence
    of unique items is returned.  None is returned in case of an empty
    sequence.
    """
    try:
        if isinstance(attrs[0], basestring):
            # do not try to disassemble sequences of strings
            raise TypeError
        uvalues = set(map(tuple, attrs))
        # if we were provided an array of object type, most likely because
        # we had tuples or other objects, we must produce an object array too
        if isinstance(attrs, np.ndarray) and attrs.dtype == 'O':
            unq = asobjarray(list(uvalues))
        else:
            unq = list(map(np.array, uvalues))
    except TypeError:
        # either no 2d-iterable...
        try:
            unq = np.unique(attrs)
        except TypeError:
            # or no iterable at all -- return the original
            return attrs
    lunq = len(unq)
    if lunq > 1:
        return ['+'.join([str(l) for l in unq])]
    elif lunq:
        return unq
    else:
        return None

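# Quick behavior sketch, grounded in the tests and docstring above:
#   _uniquemerge2literal([2, 2, 2])     -> [2]        (single unique item)
#   _uniquemerge2literal(range(3))      -> ['0+1+2']  (merged literal)
#   _uniquemerge2literal(['L0', 'L1'])  -> ['L0+L1']  (string merge)
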
def _call(self, dataset, callables=[]):
    # local bindings
    clf = self.clf
    model = clf.model

    # Labels for sensitivities to be returned
    sens_labels = None

    if clf.__is_regression__:
        nr_class = None
        svm_labels = None  # shouldn't bother to provide "targets" for regressions
    else:
        nr_class = model.nr_class
        svm_labels = model.labels

    # No need to warn: by default we do not do anything evil and we
    # provide labels -- so it is up to the user to decide whether they
    # want to do something silly
    #if nr_class != 2:
    #    warning("You are estimating sensitivity for SVM %s trained on %d" %
    #            (str(clf), nr_class) +
    #            " classes. Make sure that it is what you intended to do" )

    svcoef = np.matrix(model.get_sv_coef())
    svs = np.matrix(model.get_sv())
    rhos = np.asarray(model.get_rho())

    if self.params.split_weights:
        if nr_class != 2:
            raise NotImplementedError, \
                  "Cannot compute per-class weights for" \
                  " non-binary classification task"
        # libsvm might have a different idea on the ordering of labels,
        # so we would need to map them back explicitly
        ds_labels = list(dataset.sa[clf.get_space()].unique)  # labels in the dataset
        senses = [None for i in ds_labels]
        # first label is given positive value
        for i, (c, l) in enumerate([(svcoef > 0, lambda x: x),
                                    (svcoef < 0, lambda x: x * -1)]):
            # convert to array, and just take the meaningful dimension
            c_ = c.A[0]
            # NOTE svm_labels are numerical; ds_labels are literal
            senses[ds_labels.index(
                clf._attrmap.to_literal(svm_labels[i]))] = \
                (l(svcoef[:, c_] * svs[c_, :])).A[0]
        weights = np.array(senses)
        sens_labels = svm_labels
    else:
        # XXX yoh: .mean() effectively averages across "sensitivities"
        # of all paired classifiers (I think). See more info on this
        # topic in svm.py on how sv_coefs are stored
        #
        # First multiply SV coefficients with the actual SVs to get
        # weighted impact of SVs on decision, then for each feature
        # take mean across SVs to get a single weight value
        # per feature
        if nr_class is None or nr_class <= 2:
            # as simple as this
            weights = (svcoef * svs).A
            # and only in case of classification
            if nr_class:
                # ??? First label seems to correspond to positive
                sens_labels = [tuple(svm_labels[::-1])]
        else:
            # we need to compose correctly per each pair of classifiers.
            # See docstring of get_sv_coef for more details on the
            # internal structure of the bloody storage

            # total # of pairs
            npairs = nr_class * (nr_class - 1) / 2
            # # of SVs in each class
            NSVs_perclass = model.get_n_sv()
            # indices where each class starts in each row of SVs
            # name is after a similar variable in libsvm internals
            nz_start = np.cumsum([0] + NSVs_perclass[:-1])
            nz_end = nz_start + NSVs_perclass
            # reserve storage
            weights = np.zeros((npairs, svs.shape[1]))
            ipair = 0  # index of the pair
            """
            // classifier (i,j): coefficients with
            // i are in sv_coef[j-1][nz_start[i]...],
            // j are in sv_coef[i][nz_start[j]...]
            """
            sens_labels = []
            for i in xrange(nr_class):
                for j in xrange(i + 1, nr_class):
                    weights[ipair, :] = np.asarray(
                        svcoef[j - 1, nz_start[i]:nz_end[i]]
                        * svs[nz_start[i]:nz_end[i]]
                        + svcoef[i, nz_start[j]:nz_end[j]]
                        * svs[nz_start[j]:nz_end[j]])
                    # ??? First label corresponds to positive;
                    # that is why [j], [i]
                    sens_labels += [(svm_labels[j], svm_labels[i])]
                    ipair += 1  # go to the next pair
            assert(ipair == npairs)

    if __debug__ and 'SVM' in debug.active:
        if nr_class:
            nsvs = model.get_n_sv()
        else:
            nsvs = model.get_total_n_sv()
        if clf.__is_regression__:
            svm_type = clf._svm_impl  # type of regression
        else:
            svm_type = '%d-class SVM(%s)' % (nr_class, clf._svm_impl)
        debug('SVM',
              "Extracting weights for %s: #SVs=%s, " %
              (svm_type, nsvs) +
              " SVcoefshape=%s SVs.shape=%s Rhos=%s." %
              (svcoef.shape, svs.shape, rhos) +
              " Result: min=%f max=%f" % (np.min(weights), np.max(weights)))

    ds_kwargs = {}
    if nr_class:
        # for classification only -- and we should have prepared the labels
        assert(sens_labels is not None)
        if len(clf._attrmap):
            if isinstance(sens_labels[0], tuple):
                sens_labels = asobjarray(sens_labels)
            sens_labels = clf._attrmap.to_literal(sens_labels, recurse=True)
        # NOTE: `weights` is already and always 2D
        ds_kwargs = dict(sa={clf.get_space(): sens_labels})

    weights_ds = Dataset(weights, **ds_kwargs)
    weights_ds.sa['biases'] = rhos
    return weights_ds

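# A minimal numeric sketch of the pairwise bookkeeping above, assuming
# 3 classes with 2, 3 and 1 support vectors respectively:
import numpy as np
NSVs_perclass = [2, 3, 1]
nr_class = len(NSVs_perclass)
npairs = nr_class * (nr_class - 1) // 2          # -> 3 pairs
nz_start = np.cumsum([0] + NSVs_perclass[:-1])   # -> array([0, 2, 5])
nz_end = nz_start + NSVs_perclass                # -> array([2, 5, 6])
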
def hdf2obj(hdf, memo=None):
    """Convert an HDF5 group definition into an object instance.

    Obviously, this function assumes the conventions implemented in the
    `obj2hdf()` function. Those conventions will eventually be documented
    in the module docstring, whenever they are sufficiently stable.

    Parameters
    ----------
    hdf : HDF5 group instance
      HDF5 group instance. This could also be an HDF5 file instance.
    memo : dict
      Dictionary tracking reconstructed objects to prevent recursion
      (analogous to deepcopy).

    Notes
    -----
    Although this function uses a way to reconstruct object instances
    that is similar to unpickling, it should be *relatively* safe to open
    HDF files from untrusted sources. Only basic datatypes are stored in
    HDF files, and there is no foreign code that is executed during
    reconstruction. For that reason, any type that shall be reconstructed
    needs to be importable (importing is done by fully-qualified module
    names).

    Returns
    -------
    object instance
    """
    if memo is None:
        # init object tracker
        memo = {}
    # note, older file formats did not store objrefs
    if 'objref' in hdf.attrs:
        objref = hdf.attrs['objref']
    else:
        objref = None

    # if this HDF group has an objref that points to an already
    # reconstructed object, simply return this object again
    if objref is not None and objref in memo:
        obj = memo[objref]
        if __debug__:
            debug('HDF5', "Use tracked object %s (%i)" % (type(obj), objref))
        return obj

    #
    # Actual data
    #
    if isinstance(hdf, h5py.Dataset):
        if __debug__:
            debug('HDF5', "Load from HDF5 dataset [%s]" % hdf.name)
        if 'is_scalar' in hdf.attrs:
            # extract the scalar from the 0D array
            obj = hdf[()]
            # and coerce it back into the native Python type if necessary
            if issubclass(type(obj), np.generic):
                obj = np.asscalar(obj)
        elif 'is_numpy_scalar' in hdf.attrs:
            # extract the scalar from the 0D array as is
            obj = hdf[()]
        else:
            # read array-dataset into an array
            obj = np.empty(hdf.shape, hdf.dtype)
            if obj.size:
                hdf.read_direct(obj)
    else:
        # check if we have a class instance definition here
        if not ('class' in hdf.attrs or 'recon' in hdf.attrs):
            raise LookupError(
                "Found hdf group without class instance "
                "information (group: %s). Cannot convert it into an "
                "object (content: '%s', attributes: '%s')."
                % (hdf.name, hdf.keys(), hdf.attrs.keys()))

        mod_name = hdf.attrs['module']

        if 'recon' in hdf.attrs:
            # Custom objects: custom reconstructor
            obj = _recon_customobj_customrecon(hdf, memo)
        elif mod_name != '__builtin__':
            # Custom objects: default reconstructor
            cls_name = hdf.attrs['class']
            if cls_name in ('function', 'type', 'builtin_function_or_method'):
                # Functions and types
                obj = _recon_functype(hdf)
            else:
                # Other custom objects
                obj = _recon_customobj_defaultrecon(hdf, memo)
        else:
            # Built-in objects
            cls_name = hdf.attrs['class']
            if __debug__:
                debug('HDF5',
                      "Reconstructing built-in object '%s'." % cls_name)
            # built-in type (there should be only 'list', 'dict' and 'None'
            # that would not be in a Dataset)
            if cls_name == 'NoneType':
                obj = None
            elif cls_name == 'tuple':
                obj = _hdf_tupleitems_to_obj(hdf, memo)
            elif cls_name == 'list':
                obj = _hdf_list_to_obj(hdf, memo)
            elif cls_name == 'dict':
                obj = _hdf_dict_to_obj(hdf, memo)
            elif cls_name == 'type':
                obj = eval(hdf.attrs['name'])
            elif cls_name == 'function':
                raise RuntimeError("Unhandled reconstruction of built-in "
                                   "function (at '%s')." % hdf.name)
            else:
                raise RuntimeError(
                    "Found hdf group with a builtin type "
                    "that is not handled by the parser (group: %s). This "
                    "is a conceptual bug in the parser. Please report."
                    % hdf.name)
    #
    # Final post-processing
    #
    if 'is_objarray' in hdf.attrs:
        # need to handle special case of arrays of objects
        if np.isscalar(obj):
            obj = np.array(obj, dtype=np.object)
        else:
            obj = asobjarray(obj)
        if 'shape' in hdf.attrs:
            shape = tuple(hdf.attrs['shape'])
            if shape != obj.shape:
                obj = obj.reshape(shape)

    # track if desired
    if objref:
        memo[objref] = obj

    if __debug__:
        debug('HDF5', "Done loading %s [%s]" % (type(obj), hdf.name))

    return obj

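# Hedged usage sketch: hdf2obj() is normally reached through a loading
# helper, but opening a file written by the matching obj2hdf() routine and
# passing the root group should be equivalent (file name is hypothetical):
#
#   import h5py
#   with h5py.File('stored.hdf5', 'r') as f:
#       obj = hdf2obj(f)
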