Example #1
    def _get_reco_kernels(self, reco_vbwkde_evts_file=None, evts_dict=None,
                          reco_vbwkde_make_plots=False, **kwargs):
        """Given a reco events resource (resource file name or dictionary),
        retrieve data from it then serialize and hash the data. If the object
        attribute kernels were computed from the same source data, simply
        return those. Otherwise, compute the kernels anew and return them.

        Arguments
        ---------
        NOTE: One--and only one--of reco_vbwkde_evts_file and evts_dict must
        be specified.

        reco_vbwkde_evts_file : str (or dict)
            Name or path to file containing event reco info. See doc for
            __init__ method for details about contents. If a dict is passed
            in, it is used directly as evts_dict (see below).

        evts_dict : dict
            Dictionary containing event reco info. Allows user to pass in a
            non-string-object to avoid re-loading a file to check whether the
            contents have changed each time. See doc for __init__ method for
            details about the dictionary's format.

        reco_vbwkde_make_plots : bool
            Whether to produce diagnostic plots while constructing the kernels.
        """
        if not isinstance(reco_vbwkde_make_plots, bool):
            raise ValueError("Option reco_vbwkde_make_plots must be of bool type")

        for reco_scale in ['e_reco_scale', 'cz_reco_scale']:
            if reco_scale in kwargs and kwargs[reco_scale] != 1:
                raise ValueError('%s = %.2f, must be 1.0 for RecoServiceVBWKDE!'
                                 % (reco_scale, kwargs[reco_scale]))

        REMOVE_SIM_DOWNGOING = True

        if (reco_vbwkde_evts_file is not None) and (evts_dict is not None):
            raise TypeError(
                'One--and only one--of {reco_vbwkde_evts_file|evts_dict} ' +
                'may be specified'
            )

        if isinstance(reco_vbwkde_evts_file, dict):
            evts_dict = reco_vbwkde_evts_file
            reco_vbwkde_evts_file = None

        if isinstance(reco_vbwkde_evts_file, str):
            logging.info('Constructing VBWKDEs from event true & reco ' +
                         'info in file: %s' % reco_vbwkde_evts_file)
            fpath = find_resource(reco_vbwkde_evts_file)
            eventsdict = hdf.from_hdf(fpath)
            new_hash = utils.hash_file(fpath)
        elif isinstance(evts_dict, dict):
            eventsdict = evts_dict
            new_hash = utils.hash_obj(eventsdict)
        else:
            raise TypeError('A {reco_vbwkde_evts_file|evts_dict} must be '
                            'provided, where the former must be a str '
                            'and the latter must be a dict.')

        if (self.kernels is not None) and (new_hash == self.reco_events_hash):
            return self.kernels

        self.kernels = self.all_kernels_from_events(
            eventsdict=eventsdict, remove_sim_downgoing=REMOVE_SIM_DOWNGOING,
            make_plots=reco_vbwkde_make_plots
        )
        self.reco_events_hash = new_hash

        return self.kernels
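
The caching pattern above (serialize the source data, hash it, compare against
the stored hash, and only recompute on a miss) can be sketched standalone. The
following is a minimal illustration, with hashlib/json standing in for
utils.hash_obj and a placeholder in place of the real kernel computation; all
names here are illustrative, not part of PISA:

import hashlib
import json

class CachedKernels(object):
    def __init__(self):
        self.kernels = None
        self.reco_events_hash = None

    def get_kernels(self, evts_dict):
        # Serialize and hash the source data, as in the method above
        new_hash = hashlib.sha256(
            json.dumps(evts_dict, sort_keys=True).encode()
        ).hexdigest()
        # Same source data as last time: return the cached kernels
        if self.kernels is not None and new_hash == self.reco_events_hash:
            return self.kernels
        # Otherwise compute anew (placeholder computation here)
        self.kernels = {'built_from': new_hash}
        self.reco_events_hash = new_hash
        return self.kernels

svc = CachedKernels()
k1 = svc.get_kernels({'nue': {'cc': [1.0, 2.0]}})
k2 = svc.get_kernels({'nue': {'cc': [1.0, 2.0]}})
assert k1 is k2  # the second call returned the cached object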
Example #2
    def all_kernels_from_events(self, eventsdict, remove_sim_downgoing,
                                make_plots=False):
        """Given a reco events dictionary, retrieve reco/true information from
        it, group MC data by flavor & interaction type, and return VBWKDE-based
        PISA reco kernels for all flavors/types. If identical source data has
        already been processed, a deep copy of the previously computed kernels
        is reused instead of recomputing them.

        Arguments
        ---------
        eventsdict : dict
            Dictionary containing event reco info. See docstr for __init__ for
            details.

        remove_sim_downgoing : bool
            Whether to remove MC-true downgoing events prior to computing
            resolutions.

        make_plots : bool
            Whether to produce diagnostic plots for each flavor/interaction
            kernel set.

        """
        all_flavs = ['nue', 'nue_bar', 'numu', 'numu_bar', 'nutau',
                     'nutau_bar']
        all_ints = ['cc', 'nc']
        flav_ints = itertools.product(all_flavs, all_ints)

        kernels = {f:{} for f in all_flavs}
        kernels['ebins'] = self.ebins
        kernels['czbins'] = self.czbins
        computed_datahashes = {}
        for flav, int_type in flav_ints:
            logging.info("Working on %s/%s kernels" % (flav, int_type))
            e_true = eventsdict[flav][int_type]['true_energy']
            e_reco = eventsdict[flav][int_type]['reco_energy']
            cz_true = eventsdict[flav][int_type]['true_coszen']
            cz_reco = eventsdict[flav][int_type]['reco_coszen']

            if remove_sim_downgoing:
                logging.info("Removing simulated downgoing " +
                              "events in KDE construction.")
                keep_inds = np.where(cz_true < 0.0)
                e_true = e_true[keep_inds]
                e_reco = e_reco[keep_inds]
                cz_true = cz_true[keep_inds]
                cz_reco = cz_reco[keep_inds]

            datahash = utils.hash_obj((e_true.tolist(), e_reco.tolist(),
                                       cz_true.tolist(), cz_reco.tolist()))
            if datahash in computed_datahashes:
                ref_flav, ref_int_type = computed_datahashes[datahash]
                logging.info("   > Found duplicate source data; " +
                              "copying kernels already computed for " +
                              "%s/%s to %s/%s."
                              % (ref_flav, ref_int_type, flav, int_type))
                kernels[flav][int_type] = copy.deepcopy(
                    kernels[ref_flav][ref_int_type]
                )
                continue

            kernels[flav][int_type] = self.single_kernel_set(
                e_true=e_true, cz_true=cz_true, e_reco=e_reco, cz_reco=cz_reco,
                flav=flav, int_type=int_type, make_plots=make_plots,
                out_dir=None
            )
            computed_datahashes[datahash] = (flav, int_type)

        return kernels
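
The duplicate-detection idea above (hash each group's arrays, reuse results
for identical inputs) can be shown in a self-contained sketch. Here hashlib
stands in for utils.hash_obj, a trivial ratio stands in for the VBWKDE fit,
and the data and names are purely illustrative:

import copy
import hashlib

def hash_arrays(*arrays):
    # Stand-in for utils.hash_obj: digest the serialized array contents
    h = hashlib.sha256()
    for a in arrays:
        h.update(repr(list(a)).encode())
    return h.hexdigest()

groups = {
    'numu/cc': ([1.0, 2.0], [1.1, 2.2]),
    'numu_bar/cc': ([1.0, 2.0], [1.1, 2.2]),  # identical to numu/cc
}
results, computed = {}, {}
for name in sorted(groups):
    e_true, e_reco = groups[name]
    key = hash_arrays(e_true, e_reco)
    if key in computed:
        # Duplicate source data: deep-copy the already-computed result
        results[name] = copy.deepcopy(results[computed[key]])
        continue
    results[name] = {'kernel': sum(e_reco) / sum(e_true)}  # placeholder fit
    computed[key] = name

assert results['numu/cc'] == results['numu_bar/cc']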
Example #3
File: hdf.py Project: gkrueckl/pisa
def store_recursively(fhandle, node, path=None, node_hashes=None):
    if path is None:
        path = []
    if node_hashes is None:
        node_hashes = {}
    full_path = '/' + '/'.join(path)
    if isinstance(node, dict):
        logging.trace("  creating Group `%s`" % full_path)
        try:
            fhandle.create_group(full_path)
        except ValueError:
            pass
        for key in sorted(node.keys()):
            key_str = str(key)
            if not isinstance(key, str):
                logging.warn('Stringifying key `' + key_str +
                             '` for use as name in HDF5 file')
            val = node[key]
            new_path = path + [key_str]
            store_recursively(fhandle=fhandle, node=val, path=new_path,
                              node_hashes=node_hashes)
    else:
        # Check for existing node
        node_hash = utils.hash_obj(node)
        if node_hash in node_hashes:
            logging.trace("  creating hardlink for Dataset: `%s` -> `%s`" %
                          (full_path, node_hashes[node_hash]))
            # Hardlink the matching existing dataset
            fhandle[full_path] = fhandle[node_hashes[node_hash]]
            return
        # For now, convert None to np.nan since h5py appears to not handle None
        if node is None:
            node = np.nan
            logging.warn("  encountered `None` at node `%s`; converting to"
                         " np.nan" % full_path)
        # "Scalar datasets don't support chunk/filter options". Shuffling
        # is a good idea otherwise since subsequent compression will
        # generally benefit; shuffling requires chunking. Compression is
        # not done here since it is slow.
        if np.isscalar(node):
            shuffle = False
            chunks = None
        else:
            shuffle = True
            chunks = True
            # Store the node_hash for later hardlinking if this is a
            # non-scalar datatype (scalars, including the np.nan substituted
            # for None above, are always written directly).
            node_hashes[node_hash] = full_path
        # TODO: Treat strings as follows? Would this break compatibility
        # with pytables/Pandas? What are benefits? Leaving out for now.
        # if isinstance(node, basestr):
        #     dtype = h5py.special_dtype(vlen=str)
        #     fh.create_dataset(k,data=v,dtype=dtype)
        logging.trace("  creating dataset at node `%s`" % full_path)
        try:
            fhandle.create_dataset(name=full_path, data=node,
                                   chunks=chunks, compression=None,
                                   shuffle=shuffle, fletcher32=False)
        except TypeError:
            # Fall back to a contiguous (unchunked, unshuffled) dataset,
            # e.g. for data where chunk/filter options are invalid.
            try:
                shuffle = False
                chunks = None
                fhandle.create_dataset(name=full_path, data=node,
                                       chunks=chunks, compression=None,
                                       shuffle=shuffle, fletcher32=False)
            except Exception:
                logging.error('  full_path: ' + full_path)
                logging.error('  chunks   : ' + str(chunks))
                logging.error('  shuffle  : ' + str(shuffle))
                logging.error('  node     : ' + str(node))
                raise
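
A minimal usage sketch (assuming h5py is installed and that the function's
module context, pisa's utils and logging helpers, is importable as in the
snippet above; file path and data are illustrative). Duplicate array content
is stored once and hardlinked on its second occurrence:

import h5py
import numpy as np

energies = np.linspace(1.0, 80.0, 40)
data = {
    'nue': {'cc': {'true_energy': energies}},
    # Identical array content: store_recursively hashes it and writes an
    # HDF5 hardlink rather than a second copy of the dataset.
    'nue_bar': {'cc': {'true_energy': energies.copy()}},
}

with h5py.File('/tmp/events.hdf5', 'w') as fhandle:
    store_recursively(fhandle=fhandle, node=data)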