Example #1
def extract_cols(datagrp, sel, slc, columns):
    """
    :param datagrp: something like an HDF5 data group
    :param sel: dictionary column name -> value specifying a selection
    :param slc: a slice object specifying the rows considered
    :param columns: the full list of column names
    :returns: a dictionary col -> array of values
    """
    first = columns[0]
    nrows = len(datagrp[first])
    if slc.start is None and slc.stop is None:  # split in slices
        slcs = general.gen_slices(0, nrows, MAX_ROWS)
    else:
        slcs = [slc]
    acc = general.AccumDict(accum=[])  # col -> arrays
    for slc in slcs:
        if sel:
            ok = slice(None)
            dic = {col: datagrp[col][slc] for col in sel}
            for col in sel:
                if isinstance(ok, slice):  # first selection
                    ok = is_ok(dic[col], sel[col])
                else:  # other selections
                    ok &= is_ok(dic[col], sel[col])
            for col in columns:
                acc[col].append(datagrp[col][slc][ok])
        else:  # avoid making unneeded copies
            for col in columns:
                acc[col].append(datagrp[col][slc])
    return {k: numpy.concatenate(decode_lol(vs)) for k, vs in acc.items()}
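
Every example in this listing chunks its work through general.gen_slices (also imported bare as gen_slices below). Its implementation is not part of the excerpt; the following is a minimal sketch of the contract these callers rely on, namely consecutive slice objects covering [start, stop) in chunks of at most blocksize, and not the actual openquake.baselib.general code:

def gen_slices(start, stop, blocksize):
    """Yield consecutive slices covering [start, stop), each spanning
    at most `blocksize` indices."""
    assert blocksize > 0, blocksize
    while start < stop:
        yield slice(start, min(start + blocksize, stop))
        start += blocksize

For instance, list(gen_slices(0, 5, 2)) gives [slice(0, 2, None), slice(2, 4, None), slice(4, 5, None)].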
Example #2
    def get_poes(self, mean_std, cmaker, ctx):
        """
        Calculate and return probabilities of exceedance (PoEs) of one or more
        intensity measure levels (IMLs) of one intensity measure type (IMT)
        for one or more pairs "site -- rupture".

        :param mean_std:
            An array of shape (2, M, N) with the means and standard
            deviations for the sites and intensity measure types
        :param cmaker:
            A ContextMaker instance
        :param ctx:
            the context object used to compute mean_std
        :returns:
            array of PoEs of shape (N, L)
        :raises ValueError:
            If the truncation level is neither ``None`` nor a non-negative
            float, or if the ``imts`` dictionary contains wrong or
            unsupported IMTs (see :attr:`DEFINED_FOR_INTENSITY_MEASURE_TYPES`).
        """
        loglevels = cmaker.loglevels
        truncation_level = cmaker.truncation_level
        N = mean_std.shape[2]  # 2, M, N
        L = loglevels.size
        maxsize = int(numpy.ceil(ONE_MB / L / 8))
        arr = numpy.zeros((N, L))
        if truncation_level is not None and truncation_level < 0:
            raise ValueError('truncation level must be zero, positive number '
                             'or None')
        if hasattr(self, 'weights_signs'):
            outs = []
            weights, signs = zip(*self.weights_signs)
            for s in signs:
                ms = numpy.array(mean_std)  # make a copy
                for m in range(len(loglevels)):
                    ms[0, m] += s * ctx.adjustment
                outs.append(_get_poes(ms, loglevels, truncation_level))
            arr[:] = numpy.average(outs, weights=weights, axis=0)
        elif hasattr(self, "mixture_model"):
            for f, w in zip(self.mixture_model["factors"],
                            self.mixture_model["weights"]):
                mean_stdi = numpy.array(mean_std)  # a copy
                mean_stdi[1] *= f  # multiply stddev by factor
                arr[:] += w * _get_poes(mean_stdi, loglevels, truncation_level)
        else:  # regular case
            # split large arrays in slices < 1 MB to fit inside the CPU cache
            for sl in gen_slices(0, N, maxsize):
                arr[sl] = _get_poes(mean_std[:, :, sl], loglevels,
                                    truncation_level)
        imtweight = getattr(self, 'weight', None)  # ImtWeight or None
        for imt in loglevels:
            if imtweight and imtweight.dic.get(imt) == 0:
                # set by the engine when parsing the gsim logictree
                # when 0 ignore the contribution: see _build_trts_branches
                arr[:, loglevels(imt)] = 0
        return arr
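
A side note on the maxsize formula above: arr is an (N, L) array of float64, so a block of maxsize rows occupies maxsize * L * 8 bytes, which the formula keeps at roughly ONE_MB (assumed to be 1024 ** 2, as the name suggests). A quick numeric check, with L = 45 picked arbitrarily:

import numpy

ONE_MB = 1024 ** 2  # assumption: one mebibyte
L = 45              # example number of intensity measure levels
maxsize = int(numpy.ceil(ONE_MB / L / 8))
print(maxsize, maxsize * L * 8)  # 2913 rows, 1048680 bytes (~1 MB)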
Example #3
def split_df(df, cond=True, maxsize=1000):
    """
    :param df: a large dataframe
    :param cond: boolean condition for splitting
    :param maxsize: maximum number of rows per yielded dataframe
    :yields: dataframes with at most maxsize rows each
    """
    n = len(df)
    if n <= maxsize or not cond:
        yield df
    else:
        for slc in gen_slices(0, n, maxsize):
            yield df[slc]
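
A usage sketch for split_df, assuming pandas and the gen_slices helper sketched under Example #1:

import pandas

df = pandas.DataFrame({'x': range(2500)})
print([len(chunk) for chunk in split_df(df, maxsize=1000)])
# [1000, 1000, 500]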
Example #4
    def __iter__(self):
        if len(self.mags) <= BLOCKSIZE:  # already split
            yield self
            return
        # split in blocks of BLOCKSIZE ruptures each
        for i, slc in enumerate(gen_slices(0, len(self.mags), BLOCKSIZE)):
            src = self.__class__(
                '%s:%d' % (self.source_id, i),
                self.name,
                self.tectonic_region_type,
                self.rupture_idxs[slc],
                self.pmfs[slc],
                self.mags[slc],
                self.rakes[slc])
            src.set_sections(self.sections)
            src.num_ruptures = src.count_ruptures()
            yield src
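
The pattern in __iter__, yield self when already small enough, otherwise yield freshly built instances of the same class over each slice, can be isolated in a toy class (everything below is illustrative and not part of the engine):

class Blocks:
    BLOCKSIZE = 2

    def __init__(self, items, name='blk'):
        self.items = items
        self.name = name

    def __iter__(self):
        if len(self.items) <= self.BLOCKSIZE:  # already small enough
            yield self
            return
        for i in range(0, len(self.items), self.BLOCKSIZE):
            yield self.__class__(self.items[i:i + self.BLOCKSIZE],
                                 '%s:%d' % (self.name, i // self.BLOCKSIZE))

print([b.name for b in Blocks([1, 2, 3, 4, 5])])
# ['blk:0', 'blk:1', 'blk:2']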
Example #5
    def full_disaggregation(self):
        """
        Run the disaggregation phase.
        """
        oq = self.oqparam
        tl = oq.truncation_level
        src_filter = self.src_filter()
        if hasattr(self, 'csm'):
            for sg in self.csm.src_groups:
                if sg.atomic:
                    raise NotImplementedError(
                        'Atomic groups are not supported yet')
            if not self.csm.get_sources():
                raise RuntimeError('All sources were filtered away!')

        csm_info = self.datastore['csm_info']
        self.poes_disagg = oq.poes_disagg or (None, )
        self.imts = list(oq.imtls)

        self.ws = [rlz.weight for rlz in self.rlzs_assoc.realizations]
        self.pgetter = getters.PmapGetter(self.datastore, self.ws,
                                          self.sitecol.sids)

        # build array rlzs (N, Z)
        if oq.rlz_index is None:
            Z = oq.num_rlzs_disagg
            rlzs = numpy.zeros((self.N, Z), int)
            if self.R > 1:
                for sid in self.sitecol.sids:
                    curves = numpy.array(
                        [pc.array for pc in self.pgetter.get_pcurves(sid)])
                    mean = getters.build_stat_curve(curves, oq.imtls,
                                                    stats.mean_curve, self.ws)
                    rlzs[sid] = util.closest_to_ref(curves, mean.array)[:Z]
                self.datastore['best_rlzs'] = rlzs
        else:
            Z = len(oq.rlz_index)
            rlzs = numpy.zeros((self.N, Z), int)
            for z in range(Z):
                rlzs[:, z] = oq.rlz_index[z]
        assert Z <= self.R, (Z, self.R)
        self.Z = Z
        self.rlzs = rlzs

        if oq.iml_disagg:
            # no hazard curves are needed
            self.poe_id = {None: 0}
            curves = [[None for z in range(Z)] for s in range(self.N)]
            self.ok_sites = set(self.sitecol.sids)
        else:
            self.poe_id = {poe: i for i, poe in enumerate(oq.poes_disagg)}
            curves = [
                self.get_curve(sid, rlzs[sid]) for sid in self.sitecol.sids
            ]
            self.ok_sites = set(self.check_poes_disagg(curves, rlzs))
        self.iml4 = _iml4(rlzs, oq.iml_disagg, oq.imtls, self.poes_disagg,
                          curves)
        if oq.disagg_by_src:
            self.build_disagg_by_src(rlzs)

        # build trt_edges
        trts = tuple(csm_info.trts)
        trt_num = {trt: i for i, trt in enumerate(trts)}
        self.trts = trts

        # build mag_edges
        min_mag = csm_info.min_mag
        max_mag = csm_info.max_mag
        mag_edges = oq.mag_bin_width * numpy.arange(
            int(numpy.floor(min_mag / oq.mag_bin_width)),
            int(numpy.ceil(max_mag / oq.mag_bin_width) + 1))

        # build dist_edges
        maxdist = max(oq.maximum_distance(trt) for trt in trts)
        dist_edges = oq.distance_bin_width * numpy.arange(
            0, int(numpy.ceil(maxdist / oq.distance_bin_width) + 1))

        # build eps_edges
        eps_edges = numpy.linspace(-tl, tl, oq.num_epsilon_bins + 1)

        # build lon_edges, lat_edges per sid
        bbs = src_filter.get_bounding_boxes(mag=max_mag)
        lon_edges, lat_edges = {}, {}  # by sid
        for sid, bb in zip(self.sitecol.sids, bbs):
            lon_edges[sid], lat_edges[sid] = disagg.lon_lat_bins(
                bb, oq.coordinate_bin_width)
        self.bin_edges = mag_edges, dist_edges, lon_edges, lat_edges, eps_edges
        self.save_bin_edges()

        self.imldict = {}  # sid, rlz, poe, imt -> iml
        for s in self.sitecol.sids:
            for z, rlz in enumerate(rlzs[s]):
                logging.info('Site #%d, disaggregating for rlz=#%d', s, rlz)
                for p, poe in enumerate(self.poes_disagg):
                    for m, imt in enumerate(oq.imtls):
                        self.imldict[s, rlz, poe, imt] = self.iml4[s, m, p, z]

        # submit disagg tasks
        gid = self.datastore['rup/grp_id'][()]
        indices_by_grp = get_indices(gid)  # grp_id -> [(start, stop),...]
        blocksize = len(gid) // (oq.concurrent_tasks or 1) + 1
        # NB: removing the blocksize causes slow disaggregation tasks
        allargs = []
        dstore = (self.datastore.parent
                  if self.datastore.parent else self.datastore)
        for grp_id, trt in csm_info.trt_by_grp.items():
            trti = trt_num[trt]
            rlzs_by_gsim = self.rlzs_assoc.get_rlzs_by_gsim(grp_id)
            cmaker = ContextMaker(
                trt, rlzs_by_gsim, {
                    'truncation_level': oq.truncation_level,
                    'maximum_distance': src_filter.integration_distance,
                    'filter_distance': oq.filter_distance,
                    'imtls': oq.imtls
                })
            for start, stop in indices_by_grp[grp_id]:
                for slc in gen_slices(start, stop, blocksize):
                    allargs.append((dstore, slc, self.sitecol, oq, cmaker,
                                    self.iml4, trti, self.bin_edges))
        results = parallel.Starmap(compute_disagg,
                                   allargs,
                                   h5=self.datastore.hdf5).reduce(
                                       self.agg_result, AccumDict(accum={}))
        return results  # sid -> trti -> 8D array
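
Both versions of full_disaggregation (this one and the variant in Example #6) build the magnitude bin edges with the same floor/ceil arithmetic. Plugging in made-up numbers (0.5, 4.2 and 7.1 below are chosen purely for illustration) shows what it produces:

import numpy

mag_bin_width = 0.5
min_mag, max_mag = 4.2, 7.1
mag_edges = mag_bin_width * numpy.arange(
    int(numpy.floor(min_mag / mag_bin_width)),
    int(numpy.ceil(max_mag / mag_bin_width) + 1))
print(mag_edges)  # [4.  4.5 5.  5.5 6.  6.5 7.  7.5]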
Example #6
    def full_disaggregation(self):
        """
        Run the disaggregation phase.
        """
        oq = self.oqparam
        tl = oq.truncation_level
        src_filter = self.src_filter()
        if hasattr(self, 'csm'):
            for sg in self.csm.src_groups:
                if sg.atomic:
                    raise NotImplementedError(
                        'Atomic groups are not supported yet')
            if not self.csm.get_sources():
                raise RuntimeError('All sources were filtered away!')

        csm_info = self.datastore['csm_info']
        self.poes_disagg = oq.poes_disagg or (None, )
        self.imts = list(oq.imtls)
        if oq.rlz_index is None:
            try:
                rlzs = self.datastore['best_rlz'][()]
            except KeyError:
                rlzs = numpy.zeros(self.N, int)
        else:
            rlzs = [oq.rlz_index] * self.N

        if oq.iml_disagg:
            self.poe_id = {None: 0}
            curves = [None] * len(self.sitecol)  # no hazard curves are needed
            self.ok_sites = set(self.sitecol.sids)
        else:
            self.poe_id = {poe: i for i, poe in enumerate(oq.poes_disagg)}
            curves = [self.get_curve(sid, rlzs) for sid in self.sitecol.sids]
            self.ok_sites = set(self.check_poes_disagg(curves, rlzs))
        self.iml2s = _iml2s(rlzs, oq.iml_disagg, oq.imtls, self.poes_disagg,
                            curves)
        if oq.disagg_by_src:
            self.build_disagg_by_src()

        # build trt_edges
        trts = tuple(csm_info.trts)
        trt_num = {trt: i for i, trt in enumerate(trts)}
        self.trts = trts

        # build mag_edges
        min_mag = csm_info.min_mag
        max_mag = csm_info.max_mag
        mag_edges = oq.mag_bin_width * numpy.arange(
            int(numpy.floor(min_mag / oq.mag_bin_width)),
            int(numpy.ceil(max_mag / oq.mag_bin_width) + 1))

        # build dist_edges
        maxdist = max(oq.maximum_distance(trt, max_mag) for trt in trts)
        dist_edges = oq.distance_bin_width * numpy.arange(
            0, int(numpy.ceil(maxdist / oq.distance_bin_width) + 1))

        # build eps_edges
        eps_edges = numpy.linspace(-tl, tl, oq.num_epsilon_bins + 1)

        # build lon_edges, lat_edges per sid
        bbs = src_filter.get_bounding_boxes(mag=max_mag)
        lon_edges, lat_edges = {}, {}  # by sid
        for sid, bb in zip(self.sitecol.sids, bbs):
            lon_edges[sid], lat_edges[sid] = disagg.lon_lat_bins(
                bb, oq.coordinate_bin_width)
        self.bin_edges = mag_edges, dist_edges, lon_edges, lat_edges, eps_edges
        self.save_bin_edges()

        self.imldict = {}  # sid, rlzi, poe, imt -> iml
        for s in self.sitecol.sids:
            iml2 = self.iml2s[s]
            r = rlzs[s]
            logging.info('Site #%d, disaggregating for rlz=#%d', s, r)
            for p, poe in enumerate(self.poes_disagg):
                for m, imt in enumerate(oq.imtls):
                    self.imldict[s, r, poe, imt] = iml2[m, p]

        # submit disagg tasks
        gid = self.datastore['rup/grp_id'][()]
        indices_by_grp = get_indices(gid)  # grp_id -> [(start, stop),...]
        blocksize = len(gid) // (oq.concurrent_tasks or 1) + 1
        allargs = []
        for grp_id, trt in csm_info.trt_by_grp.items():
            trti = trt_num[trt]
            rlzs_by_gsim = self.rlzs_assoc.get_rlzs_by_gsim(grp_id)
            cmaker = ContextMaker(
                trt, rlzs_by_gsim, {
                    'truncation_level': oq.truncation_level,
                    'maximum_distance': src_filter.integration_distance,
                    'filter_distance': oq.filter_distance,
                    'imtls': oq.imtls
                })
            for start, stop in indices_by_grp[grp_id]:
                for slc in gen_slices(start, stop, blocksize):
                    allargs.append((self.datastore, slc, cmaker, self.iml2s,
                                    trti, self.bin_edges))
        results = parallel.Starmap(compute_disagg,
                                   allargs,
                                   h5=self.datastore.hdf5).reduce(
                                       self.agg_result, AccumDict(accum={}))
        return results  # sid -> trti -> 7D array
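
Several of these examples accumulate partial results in general.AccumDict(accum=...). A minimal stand-in for the behavior used here, a dict whose missing keys are initialized with a fresh copy of accum, could look like this (the real openquake.baselib.general.AccumDict does more, e.g. arithmetic between dicts):

import copy

class AccumDict(dict):
    def __init__(self, accum=None):
        self.accum = accum

    def __missing__(self, key):
        # missing keys get a shallow copy of the prototype value
        self[key] = copy.copy(self.accum)
        return self[key]

acc = AccumDict(accum=[])
acc['PGA'].append(0.1)
acc['PGA'].append(0.2)
print(acc)  # {'PGA': [0.1, 0.2]}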