def get_inv_eff(self, signal_data=None, gen_data=None):
    this_hash = hash_obj(
        [self.true_binning.hash, self.output_str, 'inv_eff'],
        full_hash=self.full_hash
    )
    # signal_data and gen_data must either both be None or both be
    # provided (a set of two equal booleans has length 1)
    assert len(set([signal_data is None, gen_data is None])) == 1
    if signal_data is None and gen_data is None:
        if self.inv_eff_hash == this_hash:
            logging.trace('Loading inv eff from mem cache')
            return self._inv_eff
        if this_hash in self.disk_cache:
            logging.debug('Loading inv eff histogram from disk cache.')
            inv_eff = self.disk_cache[this_hash]
        else:
            raise ValueError(
                'inverse efficiency histogram with correct hash not found '
                'in disk_cache'
            )
    else:
        this_hash = hash_obj([this_hash, self.fit_hash],
                             full_hash=self.full_hash)
        if self.inv_eff_hash == this_hash:
            logging.trace('Loading inv eff from mem cache')
            return self._inv_eff
        inv_eff = self._get_inv_eff(
            signal_data, gen_data, self.true_binning, self.output_str
        )
        if self.disk_cache is not None:
            if this_hash not in self.disk_cache:
                logging.debug('Caching inv eff histogram to disk.')
                self.disk_cache[this_hash] = inv_eff
    self.inv_eff_hash = this_hash
    self._inv_eff = inv_eff
    return inv_eff
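
# The function above follows a two-tier caching pattern: check a memory
# cache keyed on a hash of the inputs, fall back to a disk cache, and
# only recompute on a full miss. Below is a minimal, self-contained
# sketch of that pattern using only the standard library; `hash_obj`
# here is a hypothetical stand-in for pisa's utility of the same name,
# and `_compute` stands in for the expensive histogramming step.
import hashlib
import pickle

def hash_obj(obj):
    # Hypothetical stand-in: hash the pickled object
    return hashlib.sha256(pickle.dumps(obj)).hexdigest()

class CachedResult:
    """Two-tier (memory, then disk) cache keyed on an input hash."""

    def __init__(self, disk_cache=None):
        self.disk_cache = {} if disk_cache is None else disk_cache
        self._result = None
        self._result_hash = None

    def get(self, inputs):
        this_hash = hash_obj(inputs)
        if this_hash == self._result_hash:
            return self._result                 # memory-cache hit
        if this_hash in self.disk_cache:
            result = self.disk_cache[this_hash] # disk-cache hit
        else:
            result = self._compute(inputs)      # full miss: recompute
            self.disk_cache[this_hash] = result
        self._result, self._result_hash = result, this_hash
        return result

    def _compute(self, inputs):
        return sum(inputs)  # placeholder for the expensive computation

cache = CachedResult()
assert cache.get([1, 2, 3]) == 6
assert cache.get([1, 2, 3]) == 6  # second call served from memory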
def create_response(self, reco_norm_data=None, true_norm_data=None,
                    data=None):
    """Create the response object from the signal data."""
    unfold_bg = self.params['unfold_bg'].value
    unfold_eff = self.params['unfold_eff'].value
    unfold_unweighted = self.params['unfold_unweighted'].value
    this_hash = hash_obj(
        [self.reco_binning.hash, self.true_binning.hash, unfold_bg,
         unfold_eff, unfold_unweighted, self.output_str, 'response'],
        full_hash=self.full_hash
    )
    # All three inputs must be provided together or not at all
    assert len(set([reco_norm_data is None, true_norm_data is None,
                    data is None])) == 1
    if reco_norm_data is None and true_norm_data is None and data is None:
        if self.response_hash == this_hash:
            logging.trace('Loading response from mem cache')
            return self._response
        else:
            try:
                del self._response
            except AttributeError:
                pass
        if this_hash in self.disk_cache:
            logging.debug('Loading response from disk cache.')
            response = self.disk_cache[this_hash]
        else:
            raise ValueError(
                'response object with correct hash not found in disk_cache'
            )
    else:
        this_hash = hash_obj(
            [this_hash, self.fit_hash] + list(self.params.values),
            full_hash=self.full_hash
        )
        if self.response_hash == this_hash:
            logging.debug('Loading response from mem cache')
            return self._response
        else:
            try:
                del self._response
                del self.t_th1d
            except AttributeError:
                pass
        # The truth histogram also gets returned if the response matrix
        # is created
        response, self.t_th1d = self._create_response(
            reco_norm_data, true_norm_data, data, self.reco_binning,
            self.true_binning
        )
        if self.disk_cache is not None:
            if this_hash not in self.disk_cache:
                logging.debug('Caching response object to disk.')
                self.disk_cache[this_hash] = response
    self.response_hash = this_hash
    self._response = response
    return response
def hash(self):
    """Combines source_code_hash and params.hash for checking/tagging
    provenance of persisted (on-disk) objects."""
    objects_to_hash = [self.source_code_hash, self.params.hash]
    for attr in sorted(self._attrs_to_hash):
        objects_to_hash.append(
            hash_obj(getattr(self, attr), full_hash=self.full_hash)
        )
    return hash_obj(objects_to_hash, full_hash=self.full_hash)
def calculate_fit_coeffs(self):
    """Calculate the fit coefficients for each systematic, flavint, and
    bin for a polynomial."""
    this_hash = hash_obj(
        [self.fit_binning.hash, self.weight_hash]
        + [self.params[name].value for name in self.fit_params],
        full_hash=self.full_hash
    )
    if self.fitcoeffs_hash == this_hash:
        return self._fit_coeffs

    if self.neutrinos:
        nu_params = self.nu_params
    else:
        nu_params = None
    if self.muons:
        mu_params = self.mu_params
    else:
        mu_params = None

    if self.params['cache_fit'].value:
        this_cache_hash = hash_obj(
            [self._data.metadata['name'], self._data.metadata['sample'],
             self._data.metadata['cuts'], self.fit_binning.hash]
            + [self.params[name].value for name in self.fit_params],
            full_hash=self.full_hash
        )
        if self.fitcoeffs_cache_hash == this_cache_hash:
            fit_coeffs = deepcopy(self._cached_fc)
        elif this_cache_hash in self.disk_cache:
            logging.info('Loading fit coefficients from cache.')
            self._cached_fc = self.disk_cache[this_cache_hash]
            fit_coeffs = deepcopy(self._cached_fc)
            self.fitcoeffs_cache_hash = this_cache_hash
        else:
            fit_coeffs = self._calculate_fit_coeffs(
                self._data,
                ParamSet(p for p in self.params
                         if p.name in self.fit_params),
                self.fit_binning, nu_params, mu_params
            )
    else:
        fit_coeffs = self._calculate_fit_coeffs(
            self._data,
            ParamSet(p for p in self.params if p.name in self.fit_params),
            self.fit_binning, nu_params, mu_params
        )

    if self.params['cache_fit'].value:
        if this_cache_hash not in self.disk_cache:
            logging.info('Caching fit coefficient values to disk.')
            self.disk_cache[this_cache_hash] = fit_coeffs

    self.fitcoeffs_hash = this_hash
    self._fit_coeffs = fit_coeffs
    return fit_coeffs
def _derive_nominal_transforms_hash(self):
    """Derive a hash to uniquely identify the nominal transform. This
    should be unique across processes and invocations because the
    nominal transforms can be non-volatile (cached to disk) and must
    still be valid given their hash value upon loading from disk in the
    future.

    This implementation uses the nominal parameter values' hash
    combined with the source code hash to generate the final nominal
    transforms hash.

    Notes
    -----
    The hashing scheme implemented here might be sufficiently unique
    for many cases, but override this method in services according to
    the following guidelines:

    * Stages that use a nominal transform should override this method
      if the hash is more accurately computed differently from here.

    * Stages that use transforms but do not use nominal transforms can
      override this method with a simpler version that simply returns
      None to save computation time (if this method is found to be a
      significant performance hit). (This method is called each time an
      output is computed if `self.use_transforms == True`.)

    * Stages that use no transforms (i.e., `self.use_transforms ==
      False`) will not call any built-in methods related to transforms,
      so overriding this method is irrelevant to such stages.

    If this method *is* overridden (and not just to return None), since
    the nominal transform may be stored to a disk cache, make sure that
    `self.source_code_hash` is included in the objects used to compute
    the final hash value. Even if all parameters are the same, a
    nominal transform stored to disk is ***invalid if the source code
    changes***, and `_derive_nominal_transforms_hash` must reflect
    this.
    """
    id_objects = []
    id_objects.append(self.params.nominal_values_hash)
    for attr in sorted(self._attrs_to_hash):
        val = getattr(self, attr)
        if hasattr(val, 'hash'):
            attr_hash = val.hash
        elif self.full_hash:
            norm_val = normQuant(val)
            attr_hash = hash_obj(norm_val, full_hash=self.full_hash)
        else:
            attr_hash = hash_obj(val, full_hash=self.full_hash)
        id_objects.append(attr_hash)
    id_objects.append(self.source_code_hash)

    # If any hashes are missing (i.e., None), invalidate the entire hash
    if any([(h is None) for h in id_objects]):
        nominal_transforms_hash = None
    else:
        nominal_transforms_hash = hash_obj(id_objects,
                                           full_hash=self.full_hash)

    return nominal_transforms_hash
def _compute_outputs(self, inputs=None):
    # The following is just so that we only produce new maps when params
    # change, but produce the same maps with the same param values
    # (for a more realistic test of caching).
    seed = hash_obj(self.params.values, hash_to='int') % (2**32 - 1)
    np.random.seed(seed)

    # Convert a parameter that the user can specify in any (compatible)
    # units to the units used for computation
    height = self.params['test'].to('meter').magnitude

    output_maps = []
    for output_name in self.output_names:
        # Generate the fake per-bin "fluxes", modified by the parameter
        hist = np.random.random(self.output_binning.shape) * height

        # Put the "fluxes" into a Map object, give it the output_name
        m = Map(name=output_name, hist=hist, binning=self.output_binning)

        # Optionally turn on errors here, that will be propagated
        # through the rest of the pipeline (slows things down, but
        # essential in some cases)
        #m.set_poisson_errors()
        output_maps.append(m)

    # Combine the output maps into a single MapSet object to return.
    # The MapSet contains the various things that are necessary to make
    # caching work and also provides a nice interface for the user to
    # all of the contained maps
    return MapSet(maps=output_maps, name='flux maps')
def load_gen_data(self):
    logging.debug('Loading generator level sample')
    unfold_pipeline_cfg = self.params['unfold_pipeline_cfg'].value
    if isinstance(unfold_pipeline_cfg, str):
        pipeline_cfg = from_file(unfold_pipeline_cfg)
        pipeline_hash = pipeline_cfg
        sa_cfg = from_file(
            pipeline_cfg.get('stage.data', 'param.data_sample_config')
        )
        template_maker = Pipeline(pipeline_cfg)
    elif isinstance(unfold_pipeline_cfg, Pipeline):
        pipeline_hash = unfold_pipeline_cfg.state_hash
        sa_cfg = from_file(
            unfold_pipeline_cfg.params['data_sample_config'].value
        )
        template_maker = unfold_pipeline_cfg
    gen_cfg = from_file(sa_cfg.get('neutrinos|gen_lvl', 'gen_cfg_file'))
    this_hash = hash_obj([gen_cfg, pipeline_hash, self.output_str],
                         full_hash=self.full_hash)
    if self.gen_data_hash == this_hash:
        return self._gen_data

    full_gen_data = template_maker.get_outputs()
    if not isinstance(full_gen_data, Data):
        raise AssertionError(
            'Output of pipeline is not a Data object; instead is type '
            '{0}'.format(type(full_gen_data))
        )
    trans_data = full_gen_data.transform_groups(self.output_str)
    gen_data = trans_data[self.output_str]

    self._gen_data = gen_data
    self.gen_data_hash = this_hash
    return gen_data
def split_data(self):
    this_hash = hash_obj(
        [self.fit_hash, self.output_str, self._data.contains_muons,
         self._data.contains_noise],
        full_hash=self.full_hash
    )
    if self.split_data_hash == this_hash:
        return self._signal_data, self._bg_data, self._all_data

    if self.params['real_data'].value:
        return self._data, None, self._data

    trans_data = self._data.transform_groups(self.output_str)
    for fig in trans_data:
        trans_data[fig].pop('sample_weight')
    bg_str = [fig for fig in trans_data if fig != self.output_str]
    if trans_data.contains_muons:
        trans_data['muons'].pop('sample_weight')
        bg_str.append('muons')
    if trans_data.contains_noise:
        trans_data['noise'].pop('sample_weight')
        bg_str.append('noise')

    signal_data = trans_data[self.output_str]
    bg_data = [trans_data[bg] for bg in bg_str]
    bg_data = reduce(Data._merge, bg_data)
    all_data = Data._merge(deepcopy(bg_data), signal_data)

    self._signal_data = signal_data
    self._bg_data = bg_data
    self._all_data = all_data
    self.split_data_hash = this_hash
    return signal_data, bg_data, all_data
def hash(self): """int : Hash for entire set of transforms""" hashes = self.hashes if len(hashes) > 0: if all([(h is not None and h == hashes[0]) for h in hashes]): return hashes[0] if all([(h is not None) for h in hashes]): return hash_obj(hashes) return None
def source_code_hash(self):
    """Hash for the source code of this object's class.

    Not meant to be perfect, but should suffice for tracking the
    provenance of objects stored to disk that were produced by a Stage.
    """
    if self._source_code_hash is None:
        self._source_code_hash = hash_obj(
            inspect.getsource(self.__class__)
        )
    return self._source_code_hash
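
# A self-contained sketch of the idea behind source_code_hash above,
# using only the standard library (hashlib stands in for pisa's
# hash_obj): hash a class's source so that disk-cached artifacts are
# invalidated whenever the producing code changes. `Widget` is a
# hypothetical example class; note inspect.getsource requires the class
# to be defined in a file rather than an interactive session.
import hashlib
import inspect

class Widget:
    def transform(self, x):
        return 2 * x

src = inspect.getsource(Widget)
print(hashlib.sha256(src.encode('utf8')).hexdigest()[:16])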
def load_pid_energy_param(self, source):
    """Load the PID energy-dependent parameterisation from file or
    dictionary.

    Parameters
    ----------
    source : string or mapping
        Resource location of the file, or the parameterisation
        dictionary itself

    """
    this_hash = hash_obj(source)
    if (self._pid_energy_param_hash is not None
            and this_hash == self._pid_energy_param_hash):
        return

    # Invalidate the hash and clear the entry, so we aren't left in an
    # inconsistent state if any of the below fails
    self._pid_energy_param_hash = None
    self.pid_energy_param_dict = None

    # Call the external function for basic loading and conversion
    pid_energy_param_dict = load_pid_energy_param(source)

    # Perform validation
    for flavintgroup, subdict in pid_energy_param_dict.items():
        if set(subdict.keys()) != set(self.signatures):
            raise ValueError(
                'Expected PID specs for %s, but the energy PID'
                ' parameterization for %s specifies %s instead.'
                % (self.signatures, flavintgroup, subdict.keys())
            )

    # Transform groups are implicitly defined by the keys
    implicit_transform_groups = pid_energy_param_dict.keys()

    # Make sure these match the transform groups specified for the stage
    if set(implicit_transform_groups) != set(self.transform_groups):
        raise ValueError(
            'Transform groups (%s) defined implicitly by `source` "%s" do'
            ' not match those defined as the stage\'s configured'
            ' `transform_groups` (%s).'
            % (implicit_transform_groups, source, self.transform_groups)
        )

    # Verify that each input name--which specifies a flavint or
    # flavintgroup--is wholly encapsulated by one of the transform
    # flavintgroups
    for name in self.input_names:
        if not any(name in group for group in implicit_transform_groups):
            raise ValueError(
                'Input "%s" either not present in or spans multiple'
                ' transform groups (transform_groups = %s)'
                % (name, implicit_transform_groups)
            )

    self.pid_energy_param_dict = pid_energy_param_dict
    self._pid_energy_param_hash = this_hash
def load_xsec_splines(self):
    """Load the cross-section splines from the ROOT file."""
    xsec_file = self.params['xsec_file'].value
    this_hash = hash_obj(xsec_file, full_hash=self.full_hash)
    if this_hash == self.xsec_hash:
        self.xsec.reset()
        return
    logging.info('Extracting cross-section spline from file: %s',
                 xsec_file)
    self.xsec = self.get_combined_xsec(xsec_file, ver='v2.10.0')
    self.xsec_hash = this_hash
def _derive_transforms_hash(self, nominal_transforms_hash=None):
    """Compute a hash that uniquely identifies the transforms that will
    be produced from the current configuration. Note that this hash
    needs only to be valid for this run (i.e., it is a volatile hash).

    This implementation returns a hash from the current parameters'
    values.
    """
    id_objects = []
    h = self.params.values_hash
    logging.trace("self.params.values_hash = %s" % h)
    id_objects.append(h)

    # Grab any provided nominal transforms hash, or derive it again
    if nominal_transforms_hash is None:
        nominal_transforms_hash = self._derive_nominal_transforms_hash()
    # If a valid hash was obtained, include it
    if nominal_transforms_hash is not None:
        id_objects.append(nominal_transforms_hash)

    for attr in sorted(self._attrs_to_hash):
        val = getattr(self, attr)
        if hasattr(val, "hash"):
            attr_hash = val.hash
        elif self.full_hash:
            norm_val = normQuant(val)
            attr_hash = hash_obj(norm_val, full_hash=self.full_hash)
        else:
            attr_hash = hash_obj(val, full_hash=self.full_hash)
        id_objects.append(attr_hash)

    # If any hashes are missing (i.e., None), invalidate the entire hash
    if any([(h is None) for h in id_objects]):
        transforms_hash = None
    else:
        transforms_hash = hash_obj(id_objects, full_hash=self.full_hash)

    return transforms_hash, nominal_transforms_hash
def get_bg_hist(self, bg_data=None):
    """Histogram the bg data unless using real data, in which case load
    the bg hist from disk cache."""
    this_hash = hash_obj(
        [self.reco_binning.hash, self.output_str, 'bg_hist'],
        full_hash=self.full_hash
    )
    if bg_data is None:
        if self.bg_hist_hash == this_hash:
            logging.trace('Loading bg hist from mem cache')
            return self._bg_hist
        if this_hash in self.disk_cache:
            logging.debug('Loading bg hist from disk cache.')
            bg_hist = self.disk_cache[this_hash]
        else:
            raise ValueError(
                'bg hist object with correct hash not found in disk_cache'
            )
    else:
        this_hash = hash_obj([this_hash, self.fit_hash],
                             full_hash=self.full_hash)
        if self.bg_hist_hash == this_hash:
            logging.trace('Loading bg hist from mem cache')
            return self._bg_hist
        bg_hist = self._histogram(
            events=bg_data,
            binning=self.reco_binning,
            weights=bg_data['pisa_weight'],
            errors=True,
            name='background',
            tex=r'\rm{background}'
        )
        if self.disk_cache is not None:
            if this_hash not in self.disk_cache:
                logging.debug('Caching bg hist to disk.')
                self.disk_cache[this_hash] = bg_hist

    self.bg_hist_hash = this_hash
    self._bg_hist = bg_hist
    return bg_hist
def _compute_outputs(self, inputs=None): """Apply basic cuts and compute histograms for output channels.""" logging.debug('Entering events_to_data._compute_outputs') #Hashing #TODO What should I hash?? hash_property = [ self.events_file, self.params['dataset'].value, self.output_names ] this_hash = hash_obj(hash_property, full_hash=self.full_hash) #if this_hash == self.sample_hash: #TODO Fix this and replace... # return #TODO Check there are no inputs #Fill an events instance from a file events = Events(self.events_file) #TODO Handle nominal, etc, etc datasets? #Extract the neutrino data from the 'Events' instance nu_data = [] flav_fidg = FlavIntDataGroup(flavint_groups=events.flavints) for flavint in events.present_flavints: flav_fidg[flavint] = { var: events[flavint][var] for var in events[flavint].keys() } nu_data.append(flav_fidg) #Create the data instance, including the metadata #Note that there is no muon or noise data in the 'Events' data = Data(reduce(add, nu_data), metadata=deepcopy(events.metadata)) #Make cuts if self.params['keep_criteria'].value is not None: self._data.applyCut(self.params['keep_criteria'].value ) #TODO Shivesh says this needs testing self._data.update_hash() #Update hashes self.sample_hash = this_hash data.metadata['sample_hash'] = this_hash data.update_hash() return data
def cut_events(self, keep_criteria):
    """Apply a cut to `self.events`, keeping only events that pass
    `keep_criteria`.

    Parameters
    ----------
    keep_criteria : string
        See pisa.core.Events.applyCut for more info on specifying this.

    """
    if isinstance(keep_criteria, Param):
        keep_criteria = keep_criteria.value

    if keep_criteria is not None:
        events = self.events.applyCut(keep_criteria=keep_criteria)
        events_hash = hash_obj(events, full_hash=self.full_hash)

        self.events = events
        self._events_hash = events_hash
def load_events(self, events):
    """Load events from path given by `events`. Stored as `self.events`.

    Parameters
    ----------
    events : string or Events object
        If string, load events from that location. If Events object,
        deepcopy to obtain `self.events`

    """
    if isinstance(events, Param):
        events = events.value
    elif isinstance(events, basestring):
        events = find_resource(events)

    this_hash = hash_obj(events, full_hash=self.full_hash)
    if self._events_hash is not None and this_hash == self._events_hash:
        return

    logging.debug('Extracting events from Events obj or file: %s',
                  events)
    events_obj = Events(events)
    events_hash = this_hash

    self.events = events_obj
    self._events_hash = events_hash
def apply(self, inputs):
    """Apply each transform to `inputs`; return computed outputs.

    Parameters
    ----------
    inputs : sequence of objects

    Returns
    -------
    outputs : container with computed outputs (no sideband objects)

    """
    output_names = []
    outputs = []

    # If any outputs have the same name, add them together to form a
    # single output for that name
    for xform in self:
        output = xform.apply(inputs)
        name = output.name
        try:
            idx = output_names.index(name)
            outputs[idx] = outputs[idx] + output
            outputs[idx].name = name
        except ValueError:
            outputs.append(output)
            output_names.append(name)

    # Automatically attach a sensible hash (this may be overwritten, but
    # the below should be a reasonable hash in most cases)
    if inputs.hash is None or self.hash is None:
        hash_ = None
    else:
        hash_ = hash_obj((inputs.hash, self.hash))

    # TODO: what to set for map set's name, tex, etc.?
    return MapSet(maps=outputs, hash=hash_)
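
# Sketch of the hash-propagation rule at the end of apply above: the
# output container gets a hash only when both the inputs and the
# transform set are themselves hashable; a single None marks the result
# as unidentifiable (and hence uncacheable). The built-in hash stands
# in for hash_obj here.
def propagate_hash(inputs_hash, transforms_hash):
    if inputs_hash is None or transforms_hash is None:
        return None
    return hash((inputs_hash, transforms_hash))

assert propagate_hash(123, None) is None
assert propagate_hash(123, 456) == hash((123, 456))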
def _compute_transforms(self): """Compute new oscillation transforms.""" # The seed is created from parameter values to produce different sets # of transforms for different sets of parameters seed = hash_obj(self.params.values, hash_to='int') % (2**32 - 1) np.random.seed(seed) # Read parameters in in the units used for computation, e.g. theta23 = self.params.theta23.m_as('rad') transforms = [] for out_idx, output_name in enumerate(self.output_names): if out_idx < 3: # neutrinos (-> input names are neutrinos) input_names = self.input_names[0:2] else: # anti-neutrinos (-> input names are anti-neutrinos) input_names = self.input_names[2:4] # generate the "oscillation probabilities" xform = self.create_dummy_osc_probs() # create object of type `BinnedTensorTransform` and attach # to list of transforms with correct set of input names for the # output name in question transforms.append( BinnedTensorTransform( input_names=input_names, output_name=output_name, # we have already made sure that input and output binnings # are identical input_binning=self.input_binning, output_binning=self.output_binning, xform_array=xform)) return TransformSet(transforms=transforms)
def hash(self): """int : Hash of the state of the pipeline. This hashes together a hash of the Pipeline class's source code and a hash of the state of each contained stage.""" return hash_obj([self.source_code_hash] + [stage.hash for stage in self])
def _derive_outputs_hash(self):
    """Derive a hash value that uniquely identifies the outputs that
    will be generated based upon the current state of the stage.

    This implementation hashes together:
    * Input and output binning objects' hash values (if either input or
      output binning is not None)
    * Current params' values hash
    * Hashes from any input objects with names in `self.input_names`

    If any of the above objects is specified but returns None for its
    hash value, the entire output hash is invalidated, and None is
    returned.
    """
    id_objects = []

    # If stage uses inputs, grab hash from the inputs container object
    if self.outputs_cache is not None and len(self.input_names) > 0:
        inhash = self.inputs.hash
        logging.trace("inputs.hash = %s" % inhash)
        id_objects.append(inhash)

    # If stage uses transforms, get hash from the transforms
    transforms_hash = None
    if self.use_transforms:
        transforms_hash, nominal_transforms_hash = (
            self._derive_transforms_hash()
        )
        id_objects.append(transforms_hash)
        logging.trace("derived transforms hash = %s" % id_objects[-1])

    # Otherwise, generate sub-hash on binning and param values here
    else:
        transforms_hash, nominal_transforms_hash = None, None

        if self.outputs_cache is not None:
            id_subobjects = []
            # Include all parameter values
            id_subobjects.append(self.params.values_hash)

            # Include additional attributes of this object
            for attr in sorted(self._attrs_to_hash):
                val = getattr(self, attr)
                if hasattr(val, "hash"):
                    attr_hash = val.hash
                elif self.full_hash:
                    norm_val = normQuant(val)
                    attr_hash = hash_obj(norm_val,
                                         full_hash=self.full_hash)
                else:
                    attr_hash = hash_obj(val, full_hash=self.full_hash)
                id_subobjects.append(attr_hash)

            # Generate the "sub-hash"
            if any([(h is None) for h in id_subobjects]):
                sub_hash = None
            else:
                sub_hash = hash_obj(id_subobjects,
                                    full_hash=self.full_hash)
            id_objects.append(sub_hash)

    # If any hashes are missing (i.e., None), invalidate the entire hash
    if (self.outputs_cache is None
            or any([(h is None) for h in id_objects])):
        outputs_hash = None
    else:
        outputs_hash = hash_obj(id_objects, full_hash=self.full_hash)

    return outputs_hash, transforms_hash, nominal_transforms_hash
def reweight(self):
    """Main reweighting function."""
    this_hash = hash_obj([self.weight_hash, self.params.values_hash],
                         full_hash=self.full_hash)
    if this_hash == self.fit_hash:
        return

    fit_coeffs = self.calculate_fit_coeffs()

    sample_config = from_file(
        self.params['discr_sys_sample_config'].value)
    degree = int(self.params['poly_degree'].value)
    force_through_nominal = self.params['force_through_nominal'].value

    if force_through_nominal:
        def fit_func(vals, *poly_coeffs):
            return np.polynomial.polynomial.polyval(
                vals, [1.] + list(poly_coeffs)
            )
    else:
        def fit_func(vals, *poly_coeffs):
            return np.polynomial.polynomial.polyval(
                vals, list(poly_coeffs)
            )
        # Add a free param for the constant term
        degree += 1

    def parse(string):
        return string.replace(' ', '').split(',')

    if self.neutrinos:
        sys_list = parse(sample_config.get('neutrinos', 'sys_list'))

        for fig in self._data.keys():
            self._data[fig]['fit_weight'] = \
                deepcopy(self._data[fig]['weight_weight'])

        for sys in sys_list:
            nominal = sample_config.get('neutrinos|' + sys, 'nominal')
            for fig in self._data.keys():
                fit_map = unp.nominal_values(fit_coeffs[sys][fig].hist)

                if self.params['smoothing'].value == 'gauss':
                    # TODO(shivesh): new MapSet functions?
                    for d in range(degree):
                        fit_map[..., d] = gaussian_filter(
                            fit_map[..., d], sigma=1
                        )

                shape = self.fit_binning.shape
                transform = np.ones(shape)
                sys_offset = (self.params['nu_' + sys].value.m
                              - float(nominal))
                for idx in np.ndindex(shape):
                    transform[idx] *= fit_func(sys_offset, *fit_map[idx])

                hist_idxs = self._data.digitize(
                    kinds=fig, binning=self.fit_binning,
                )

                # Discrete systematics reweighting
                # TODO(shivesh): speed this up
                for idx, wght in enumerate(np.nditer(
                        self._data[fig]['fit_weight'],
                        op_flags=['readwrite'])):
                    idx_slice = tuple(hist_idxs[idx])
                    if shape[0] == 0 or shape[1] == 0 or \
                            idx_slice[0] > shape[0] or \
                            idx_slice[1] > shape[1]:
                        # Outside binning range
                        wght *= 0
                    else:
                        wght *= transform[
                            tuple([x - 1 for x in idx_slice])]

        for fig in self._data.keys():
            self._data[fig]['pisa_weight'] = \
                deepcopy(self._data[fig]['fit_weight'])

    if self.muons:
        sys_list = parse(sample_config.get('muons', 'sys_list'))

        self._data['muons']['fit_weight'] = \
            deepcopy(self._data['muons']['weight_weight'])

        for sys in sys_list:
            fit_map = unp.nominal_values(fit_coeffs[sys]['muons'].hist)

            if self.params['smoothing'].value == 'gauss':
                # TODO(shivesh): new MapSet functions?
                for d in range(degree):
                    fit_map[..., d] = gaussian_filter(
                        fit_map[..., d], sigma=1
                    )

            shape = self.fit_binning.shape
            transform = np.ones(shape)
            for idx in np.ndindex(shape):
                transform[idx] *= fit_func(
                    self.params['mu_' + sys].value, *fit_map[idx]
                )

            hist_idxs = self._data.digitize(
                kinds='muons', binning=self.fit_binning,
            )

            # Discrete systematics reweighting (np.nditer with
            # op_flags=['readwrite'] so the in-place multiply actually
            # writes back to the array, as in the neutrino loop above)
            for idx, wght in enumerate(np.nditer(
                    self._data['muons']['fit_weight'],
                    op_flags=['readwrite'])):
                idx_slice = tuple(hist_idxs[idx])
                if shape[0] == 0 or shape[1] == 0 or \
                        idx_slice[0] > shape[0] or \
                        idx_slice[1] > shape[1]:
                    # Outside binning range
                    wght *= 0
                else:
                    wght *= transform[tuple([x - 1 for x in idx_slice])]

        self._data['muons']['pisa_weight'] = \
            deepcopy(self._data['muons']['fit_weight'])

    self.fit_hash = this_hash
    self._data.metadata['fit_hash'] = self.fit_hash
    self._data.update_hash()
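
# Sketch of the two fit-function variants defined in reweight above,
# using numpy's polyval (coefficients ordered from degree 0 upward):
# forcing the curve through the nominal point pins the constant term to
# 1, so only the higher-order coefficients remain free; otherwise the
# constant term is itself a free coefficient.
import numpy as np

def fit_func_pinned(vals, *poly_coeffs):
    return np.polynomial.polynomial.polyval(
        vals, [1.] + list(poly_coeffs))

def fit_func_free(vals, *poly_coeffs):
    return np.polynomial.polynomial.polyval(vals, list(poly_coeffs))

assert fit_func_pinned(0.0, 0.3, -0.1) == 1.0  # zero offset -> nominal
print(fit_func_free(0.5, 1.02, 0.3))           # 1.02 + 0.3 * 0.5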
def store_recursively(fhandle, node, path=None, attrs=None, node_hashes=None): """Function for interatively doing the work""" path = [] if path is None else path node_hashes = OrderedDict() if node_hashes is None else node_hashes full_path = '/' + '/'.join(path) if attrs is not None: if isinstance(attrs, OrderedDict): sorted_attr_keys = attrs.keys() else: sorted_attr_keys = sorted(attrs.keys()) if isinstance(node, Mapping): logging.trace(' creating Group "%s"', full_path) try: dset = fhandle.create_group(full_path) if attrs is not None: for key in sorted_attr_keys: dset.attrs[key] = attrs[key] except ValueError: pass for key in sorted(node.keys()): if isinstance(key, basestring): key_str = key else: key_str = str(key) logging.warn( 'Making string from key "%s", %s for use as' ' name in HDF5 file', key_str, type(key)) val = node[key] new_path = path + [key_str] store_recursively(fhandle=fhandle, node=val, path=new_path, node_hashes=node_hashes) else: # Check for existing node node_hash = hash_obj(node) if node_hash in node_hashes: logging.trace(' creating hardlink for Dataset: "%s" -> "%s"', full_path, node_hashes[node_hash]) # Hardlink the matching existing dataset fhandle[full_path] = fhandle[node_hashes[node_hash]] return # For now, convert None to np.nan since h5py appears to not handle # None if node is None: node = np.nan logging.warn( ' encountered `None` at node "%s"; converting to' ' np.nan', full_path) # "Scalar datasets don't support chunk/filter options". Shuffling # is a good idea otherwise since subsequent compression will # generally benefit; shuffling requires chunking. Compression is # not done here since it is slow, but can be done by # post-processing the generated file(s). if np.isscalar(node): shuffle = False chunks = None else: shuffle = True chunks = True # Store the node_hash for linking to later if this is more than # a scalar datatype. Assumed that "None" has node_hashes[node_hash] = full_path if isinstance(node, basestring): # TODO: Treat strings as follows? Would this break # compatibility with pytables/Pandas? What are benefits? # Leaving the following two lines out for now... #dtype = h5py.special_dtype(vlen=str) #fh.create_dataset(k,data=v,dtype=dtype) # ... Instead: creating length-1 array out of string; this # seems to be compatible with both h5py and pytables node = np.array(node) logging.trace(' creating dataset at node "%s", hash %s', full_path, node_hash) try: dset = fhandle.create_dataset(name=full_path, data=node, chunks=chunks, compression=None, shuffle=shuffle, fletcher32=False) except TypeError: try: shuffle = False chunks = None dset = fhandle.create_dataset(name=full_path, data=node, chunks=chunks, compression=None, shuffle=shuffle, fletcher32=False) except: logging.error(' full_path: %s', full_path) logging.error(' chunks : %s', str(chunks)) logging.error(' shuffle : %s', str(shuffle)) logging.error(' node : %s', str(node)) raise if attrs is not None: for key in sorted_attr_keys: dset.attrs[key] = attrs[key]
def _compute_nominal_transforms(self):
    """Compute parameterised effective area transforms"""
    energy_param_source = self.params.aeff_energy_paramfile.value
    coszen_param_source = self.params.aeff_coszen_paramfile.value

    energy_param_hash = hash_obj(energy_param_source)
    coszen_param_hash = hash_obj(coszen_param_source)

    load_energy = False
    load_coszen = False
    if (self._param_hashes['energy'] is None
            or energy_param_hash != self._param_hashes['energy']):
        load_energy = True
    if (self.has_cz
            and (self._param_hashes['coszen'] is None
                 or coszen_param_hash != self._param_hashes['coszen'])):
        load_coszen = True

    if energy_param_source is None:
        raise ValueError(
            'non-None energy parameterization'
            ' params.aeff_energy_paramfile must be provided'
        )
    if not self.has_cz and coszen_param_source is not None:
        raise ValueError(
            'true_coszen dimension was not found in the binning but a'
            ' coszen parameterisation file has been provided by'
            ' `params.aeff_coszen_paramfile`.'
        )

    if not (load_energy or load_coszen):
        return

    dims = ['energy', 'coszen']
    loads = [load_energy, load_coszen]
    sources = [energy_param_source, coszen_param_source]
    hashes = [energy_param_hash, coszen_param_hash]

    for dim, load, source, hash_ in zip(dims, loads, sources, hashes):
        if not load:
            continue
        self._param_hashes[dim] = None
        self.aeff_params[dim] = None
        params = load_aeff_param(source)

        # Transform groups are implicitly defined by the contents of the
        # aeff paramfile's keys
        implicit_transform_groups = params.keys()

        # Make sure these match transform groups specified for the stage
        if set(implicit_transform_groups) != set(self.transform_groups):
            raise ValueError(
                'Transform groups (%s) defined implicitly by'
                ' %s aeff parameterizations "%s" do not match those'
                ' defined as the stage\'s `transform_groups` (%s).'
                % (implicit_transform_groups, dim, source,
                   self.transform_groups)
            )

        self.aeff_params[dim] = params
        self._param_hashes[dim] = hash_

    nominal_transforms = []
    for xform_flavints in self.transform_groups:
        logging.debug('Working on %s effective areas xform',
                      xform_flavints)

        energy_param_func = self.aeff_params['energy'][xform_flavints]
        coszen_param_func = None
        if self.aeff_params['coszen'] is not None:
            coszen_param_func = self.aeff_params['coszen'][xform_flavints]

        # Now calculate the 1D aeff along energy
        aeff_vs_e = energy_param_func(self.ecen)

        # NOTE/TODO: Below is taken from the PISA 2 implementation of
        # this. Almost certainly comes from the fact that the highest
        # knot there was 79.5 GeV with the upper energy bin edge being
        # 80 GeV. There's probably something better that could be done
        # here...

        # Correct for final energy bin, since interpolation does not
        # extend to JUST right outside the final bin
        if aeff_vs_e[-1] == 0:
            aeff_vs_e[-1] = aeff_vs_e[-2]

        if self.has_cz:
            aeff_vs_e = self.input_binning.broadcast(
                aeff_vs_e, from_dim='true_energy', to_dims='true_coszen'
            )

            if coszen_param_func is not None:
                aeff_vs_cz = coszen_param_func(self.czcen)
                # Normalize
                aeff_vs_cz *= len(aeff_vs_cz) / np.sum(aeff_vs_cz)
            else:
                aeff_vs_cz = np.ones(shape=len(self.czcen))

            cz_broadcasted = self.input_binning.broadcast(
                aeff_vs_cz, from_dim='true_coszen', to_dims='true_energy'
            )
            aeff_transform = aeff_vs_e * cz_broadcasted
        else:
            aeff_transform = aeff_vs_e

        nominal_transforms.extend(
            populate_transforms(
                service=self,
                xform_flavints=xform_flavints,
                xform_array=aeff_transform
            )
        )

    return TransformSet(transforms=nominal_transforms)
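
# Sketch of building a 2D effective-area transform from 1D energy and
# coszen parameterisations, as in the nominal-transform computation
# above: evaluate each 1D curve at bin centers, normalize the coszen
# shape to unit mean, and combine via an outer product (a stand-in for
# the binning broadcast used above). The parameterisation functions and
# binnings here are invented for illustration.
import numpy as np

e_cen = np.logspace(0, 2, 10)          # energy bin centers [GeV]
cz_cen = np.linspace(-0.95, 0.95, 8)   # coszen bin centers

aeff_vs_e = 1e-4 * e_cen**0.7          # hypothetical 1D energy curve
aeff_vs_cz = 1.0 + 0.1 * cz_cen        # hypothetical coszen shape
aeff_vs_cz *= len(aeff_vs_cz) / np.sum(aeff_vs_cz)  # unit-mean normalization

aeff_transform = np.outer(aeff_vs_e, aeff_vs_cz)    # shape (10, 8)
assert np.allclose(aeff_transform.sum(axis=1) / len(cz_cen), aeff_vs_e)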
def _compute_transforms(self): """ Generate reconstruction "smearing kernels" by reading in a set of parameterisation functions from a json file. This should have the same dimensionality as the input binning i.e. if you have energy and coszenith input binning then the kernels provided should have both energy and coszenith resolution functions. Any superposition of distributions from scipy.stats is supported. """ res_scale_ref = self.params.res_scale_ref.value.strip().lower() assert res_scale_ref in ['zero'] # TODO: , 'mean', 'median'] reco_param_source = self.params.reco_paramfile.value if reco_param_source is None: raise ValueError( 'non-None reco parameterization params.reco_paramfile' ' must be provided') reco_param_hash = hash_obj(reco_param_source) if (self._reco_param_hash is None or reco_param_hash != self._reco_param_hash): reco_param = load_reco_param(reco_param_source) # Transform groups are implicitly defined by the contents of the # reco paramfile's keys implicit_transform_groups = reco_param.keys() # Make sure these match transform groups specified for the stage if set(implicit_transform_groups) != set(self.transform_groups): raise ValueError( 'Transform groups (%s) defined implicitly by' ' %s reco parameterizations do not match those' ' defined as the stage\'s `transform_groups` (%s).' % (implicit_transform_groups, reco_param_source, self.transform_groups)) self.param_dict = reco_param self._reco_param_hash = reco_param_hash self.eval_dict = self.evaluate_reco_param() self.reco_scales_and_biases_applicable() # everything seems to be fine, so rescale and shift distributions eval_dict = self.scale_and_shift_reco_dists() # Computational units must be the following for compatibility with # events file comp_units = dict(true_energy='GeV', true_coszen=None, true_azimuth='rad', reco_energy='GeV', reco_coszen=None, reco_azimuth='rad', pid=None) # Select only the units in the input/output binning for conversion # (can't pass more than what's actually there) in_units = { dim: unit for dim, unit in comp_units.items() if dim in self.input_binning } out_units = { dim: unit for dim, unit in comp_units.items() if dim in self.output_binning } # These binnings will be in the computational units defined above input_binning = self.input_binning.to(**in_units) output_binning = self.output_binning.to(**out_units) en_centers_in = self.input_binning[ 'true_energy'].weighted_centers.magnitude en_edges_in = self.input_binning['true_energy'].bin_edges.magnitude cz_centers_in = self.input_binning[ 'true_coszen'].weighted_centers.magnitude cz_edges_in = self.input_binning['true_coszen'].bin_edges.magnitude en_edges_out = self.output_binning['reco_energy'].bin_edges.magnitude cz_edges_out = self.output_binning['reco_coszen'].bin_edges.magnitude n_e_in = len(en_centers_in) n_cz_in = len(cz_centers_in) n_e_out = len(en_edges_out) - 1 n_cz_out = len(cz_edges_out) - 1 if self.coszen_flipback: cz_edges_out, flipback_mask, keep = \ self.extend_binning_for_coszen(ext_low=-3., ext_high=+3.) xforms = [] for xform_flavints in self.transform_groups: logging.debug("Working on %s reco kernel..." 
% xform_flavints) this_params = eval_dict[xform_flavints] reco_kernel = np.zeros((n_e_in, n_cz_in, n_e_out, n_cz_out)) for (i, j) in itertools.product(range(n_e_in), range(n_cz_in)): e_kern_cdf = self.make_cdf(bin_edges=en_edges_out, enval=en_centers_in[i], enindex=i, czval=None, czindex=j, dist_params=this_params['energy']) cz_kern_cdf = self.make_cdf(bin_edges=cz_edges_out, enval=en_centers_in[i], enindex=i, czval=cz_centers_in[j], czindex=j, dist_params=this_params['coszen']) if self.coszen_flipback: cz_kern_cdf = perform_coszen_flipback( cz_kern_cdf, flipback_mask, keep) reco_kernel[i, j] = np.outer(e_kern_cdf, cz_kern_cdf) # Sanity check of reco kernels - intolerable negative values? logging.trace(" Ensuring reco kernel sanity...") kern_neg_invalid = reco_kernel < -EQUALITY_PREC if np.any(kern_neg_invalid): raise ValueError("Detected intolerable negative entries in" " reco kernel! Min.: %.15e" % np.min(reco_kernel)) # Set values numerically compatible with zero to zero np.where((np.abs(reco_kernel) < EQUALITY_PREC), reco_kernel, 0) sum_over_axes = tuple(range(-len(self.output_binning), 0)) totals = np.sum(reco_kernel, axis=sum_over_axes) totals_large = totals > (1 + EQUALITY_PREC) if np.any(totals_large): raise ValueError("Detected overflow in reco kernel! Max.:" " %0.15e" % (np.max(totals))) if self.input_binning.basenames[0] == "coszen": # The reconstruction kernel has been set up with energy as its # first dimension, so swap axes if it is applied to an input # binning where 'coszen' is the first logging.trace(" Swapping kernel dimensions since 'coszen' has" " been requested as the first.") reco_kernel = np.swapaxes(reco_kernel, 0, 1) reco_kernel = np.swapaxes(reco_kernel, 2, 3) if self.sum_grouped_flavints: xform_input_names = [] for input_name in self.input_names: if set(NuFlavIntGroup(input_name)).isdisjoint( xform_flavints): continue xform_input_names.append(input_name) for output_name in self.output_names: if output_name not in xform_flavints: continue xform = BinnedTensorTransform( input_names=xform_input_names, output_name=output_name, input_binning=self.input_binning, output_binning=self.output_binning, xform_array=reco_kernel, sum_inputs=self.sum_grouped_flavints) xforms.append(xform) # If *not* combining grouped flavints: # Copy the transform for each input flavor, regardless if the # transform is computed from a combination of flavors. else: for input_name in self.input_names: if set(NuFlavIntGroup(input_name)).isdisjoint( xform_flavints): continue for output_name in self.output_names: if (output_name not in NuFlavIntGroup(input_name) or output_name not in xform_flavints): continue logging.trace(' input: %s, output: %s, xform: %s', input_name, output_name, xform_flavints) xform = BinnedTensorTransform( input_names=input_name, output_name=output_name, input_binning=self.input_binning, output_binning=self.output_binning, xform_array=reco_kernel, sum_inputs=self.sum_grouped_flavints) xforms.append(xform) return TransformSet(transforms=xforms)
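
# Sketch of one (energy, coszen) entry of the reco kernel built above:
# the probability mass a true-value event deposits in each output bin
# is the CDF difference of the resolution function across that bin's
# edges, and the 2D entry is the outer product of the energy and coszen
# masses. scipy.stats.norm is one example of the scipy.stats
# distributions the docstring says are supported; the edges, locations,
# and scales below are invented for illustration.
import numpy as np
from scipy.stats import norm

en_edges_out = np.linspace(0., 20., 11)
cz_edges_out = np.linspace(-1., 1., 9)

e_kern_cdf = np.diff(norm.cdf(en_edges_out, loc=10., scale=2.))
cz_kern_cdf = np.diff(norm.cdf(cz_edges_out, loc=0.2, scale=0.1))

kernel_entry = np.outer(e_kern_cdf, cz_kern_cdf)  # shape (10, 8)
assert kernel_entry.sum() <= 1.0 + 1e-9           # no overflow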
def hash(self):
    return hash_obj([self.source_code_hash] + [p.hash for p in self])
def __hash__(self):
    return hash_obj([(sec, self.items(sec))
                     for sec in sorted(self.sections())])
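
# Self-contained sketch of the __hash__ above using the standard
# library's configparser: the hashable state is the sorted sections,
# each paired with its (key, value) items, so two parsers with the same
# content hash identically regardless of insertion order. (Items are
# sorted here for stability; the method above relies on the parser's
# own ordering.)
from configparser import ConfigParser

def config_state(cfg):
    return tuple(
        (sec, tuple(sorted(cfg.items(sec))))
        for sec in sorted(cfg.sections())
    )

cfg_a = ConfigParser()
cfg_a.read_string('[general]\nname = test\nevent_type = neutrinos\n')
cfg_b = ConfigParser()
cfg_b.read_string('[general]\nevent_type = neutrinos\nname = test\n')
assert hash(config_state(cfg_a)) == hash(config_state(cfg_b))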
def store_recursively(fhandle, node, path=None, attrs=None, node_hashes=None): """Function for iteratively doing the work""" path = [] if path is None else path full_path = '/' + '/'.join(path) node_hashes = OrderedDict() if node_hashes is None else node_hashes if attrs is None: sorted_attr_keys = [] else: if isinstance(attrs, OrderedDict): sorted_attr_keys = attrs.keys() else: sorted_attr_keys = sorted(attrs.keys()) if isinstance(node, Mapping): logging.trace(' creating Group "%s"', full_path) try: dset = fhandle.create_group(full_path) for key in sorted_attr_keys: dset.attrs[key] = attrs[key] except ValueError: pass for key in sorted(node.keys()): if isinstance(key, str): key_str = key else: key_str = str(key) logging.warning( 'Making string from key "%s", %s for use as' ' name in HDF5 file', key_str, type(key) ) val = node[key] new_path = path + [key_str] store_recursively(fhandle=fhandle, node=val, path=new_path, node_hashes=node_hashes) else: # Check for existing node node_hash = hash_obj(node) if node_hash in node_hashes: logging.trace(' creating hardlink for Dataset: "%s" -> "%s"', full_path, node_hashes[node_hash]) # Hardlink the matching existing dataset fhandle[full_path] = fhandle[node_hashes[node_hash]] return # For now, convert None to np.nan since h5py appears to not handle # None if node is None: node = np.nan logging.warning( ' encountered `None` at node "%s"; converting to' ' np.nan', full_path ) # "Scalar datasets don't support chunk/filter options". Shuffling # is a good idea otherwise since subsequent compression will # generally benefit; shuffling requires chunking. Compression is # not done here since it is slow, but can be done by # post-processing the generated file(s). if np.isscalar(node): shuffle = False chunks = None else: shuffle = True chunks = True # Store the node_hash for linking to later if this is more than # a scalar datatype. Assumed that "None" has node_hashes[node_hash] = full_path # -- Handle special types -- # # See h5py docs at # # https://docs.h5py.org/en/stable/strings.html#how-to-store-text-strings # # where using `bytes` objects (i.e., in numpy, np.string_) is # deemed the most compatible way to encode objects, but apparently # we don't have pytables compatibility right now. # # For boolean support, see # # https://docs.h5py.org/en/stable/faq.html#faq # TODO: make written hdf5 files compatible with pytables # see docs at https://www.pytables.org/usersguide/datatypes.html if isinstance(node, string_types): node = np.string_(node) elif isinstance(node, bool): # includes np.bool node = np.bool_(node) # same as np.bool8 elif isinstance(node, np.ndarray): if issubclass(node.dtype.type, string_types): node = node.astype(np.string_) elif node.dtype.type in (bool, np.bool): node = node.astype(np.bool_) logging.trace(' creating dataset at path "%s", hash %s', full_path, node_hash) try: dset = fhandle.create_dataset( name=full_path, data=node, chunks=chunks, compression=None, shuffle=shuffle, fletcher32=False ) except TypeError: try: shuffle = False chunks = None dset = fhandle.create_dataset( name=full_path, data=node, chunks=chunks, compression=None, shuffle=shuffle, fletcher32=False ) except Exception: logging.error(' full_path: "%s"', full_path) logging.error(' chunks : %s', str(chunks)) logging.error(' shuffle : %s', str(shuffle)) logging.error(' node : "%s"', str(node)) raise for key in sorted_attr_keys: dset.attrs[key] = attrs[key]
def load_sample_events(self):
    """Load the event sample given the configuration file and output
    groups. Hash this object using both the configuration file and the
    output types."""
    hash_property = [self.config, self.neutrinos, self.muons, self.noise,
                     self.params['dataset'].value]
    this_hash = hash_obj(hash_property, full_hash=self.full_hash)
    if this_hash == self.sample_hash:
        return

    name = self.config.get('general', 'name')
    event_types = split(self.config.get('general', 'event_type'))

    logging.info(
        "Event types in data sample '%s': %s"
        % (name, [str(e) for e in event_types])
    )

    events = []
    if self.neutrinos:
        if 'neutrinos' not in event_types:
            raise AssertionError(
                '`neutrinos` field not found in configuration file.'
            )
        dataset = self.params['dataset'].value.lower()
        if 'neutrinos' not in dataset:
            dataset = 'nominal'
        nu_data = self.load_neutrino_events(
            config=self.config, dataset=dataset
        )
        events.append(nu_data)

    if self.muons:
        if 'muons' not in event_types:
            raise AssertionError(
                '`muons` field not found in configuration file.'
            )
        dataset = self.params['dataset'].value
        if 'muons' not in dataset:
            dataset = 'nominal'
        muon_events = self.load_muon_events(
            config=self.config, dataset=dataset
        )
        events.append(muon_events)

    if self.noise:
        if 'noise' not in event_types:
            raise AssertionError(
                '`noise` field not found in configuration file.'
            )
        dataset = self.params['dataset'].value
        if 'noise' not in dataset:
            dataset = 'nominal'
        noise_events = self.load_noise_events(
            config=self.config, dataset=dataset
        )
        events.append(noise_events)

    self._data = reduce(add, events)

    # If requested, fix the truth variable names
    if self.fix_truth_variable_names:
        for event_key in self._data.metadata['flavints_joined']:
            for var in self.truth_variables:
                if var in self._data[event_key]:
                    new_var = self.truth_variable_prefix + var
                    self._data[event_key][new_var] = \
                        self._data[event_key].pop(var)

    self.sample_hash = this_hash
    self._data.metadata['sample_hash'] = this_hash
    self._data.update_hash()
def update_hash(self):
    """Update the cached hash value"""
    self._hash = hash_obj(normQuant(self.metadata))
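
# Sketch of why metadata is normalized before hashing (normQuant's role
# above): rounding floats to a fixed number of significant figures
# keeps numerically equivalent metadata hashing identically.
# `norm_quant` below is a hypothetical, much-simplified stand-in for
# pisa's normQuant, and hashlib stands in for hash_obj.
import hashlib
import pickle

def norm_quant(obj, sigfigs=12):
    if isinstance(obj, float):
        return float('%.{:d}e'.format(sigfigs) % obj)
    if isinstance(obj, dict):
        return {k: norm_quant(v, sigfigs)
                for k, v in sorted(obj.items())}
    if isinstance(obj, (list, tuple)):
        return tuple(norm_quant(v, sigfigs) for v in obj)
    return obj

def digest(metadata):
    return hashlib.sha256(
        pickle.dumps(norm_quant(metadata))
    ).hexdigest()

a = {'livetime': 2.5, 'cuts': ['analysis']}
b = {'livetime': 2.5 + 1e-13, 'cuts': ['analysis']}  # tiny float noise
assert digest(a) == digest(b)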