Example #1
    def _generate_mask(self, *ds_slice):
        """
        Generate multiplicative inclusion mask from exclusion layers.

        Parameters
        ----------
        ds_slice : int | slice | list | ndarray
            What to extract from ds; each arg corresponds to a sequential axis.
            For example, (slice(0, 64), slice(0, 64)) will extract a 64x64
            exclusions mask.

        Returns
        -------
        mask : ndarray
            Multiplicative inclusion mask with all layers multiplied together
            ("and" operation) such that 1 is included, 0 is excluded,
            0.5 is half.
        """
        mask = None
        if len(ds_slice) == 1 and isinstance(ds_slice[0], tuple):
            ds_slice = ds_slice[0]

        if self._min_area is not None:
            ds_slice, sub_slice = self._increase_mask_slice(ds_slice, n=1)

        if self.layers:
            force_include = []
            for layer in self.layers:
                if layer.force_include:
                    force_include.append(layer)
                else:
                    logger.debug('Computing exclusions {}'.format(layer))
                    log_mem(logger, log_level='DEBUG')
                    layer_slice = (layer.layer, ) + ds_slice
                    layer_mask = layer[self.excl_h5[layer_slice]]
                    if mask is None:
                        mask = layer_mask
                    else:
                        mask = np.minimum(mask, layer_mask, dtype='float32')

            if force_include:
                logger.debug('Computing forced inclusions')
                log_mem(logger, log_level='DEBUG')
                mask = self._force_include(mask, force_include, ds_slice)

            if self._min_area is not None:
                mask = self._area_filter(mask,
                                         min_area=self._min_area,
                                         kernel=self._kernel)
                mask = mask[sub_slice]
        else:
            if self._min_area is not None:
                ds_slice = sub_slice

            mask = self._generate_ones_mask(ds_slice)

        return mask
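
The layer-combination step can be reproduced in isolation. The sketch below uses only NumPy with two made-up layer arrays to show the element-wise-minimum ("and") behavior described in the docstring; it is not the reV API.

import numpy as np

# Each layer is a multiplicative inclusion mask in [0, 1]; the combined mask
# is the element-wise minimum across layers, so any layer can only exclude.
layer_a = np.array([[1.0, 0.5], [1.0, 0.0]], dtype='float32')
layer_b = np.array([[1.0, 1.0], [0.5, 1.0]], dtype='float32')

mask = None
for layer_mask in (layer_a, layer_b):
    mask = layer_mask if mask is None else np.minimum(mask, layer_mask)

print(mask)
# [[1.  0.5]
#  [0.5 0. ]]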
Example #2
    def _collect(self):
        """Simple & robust serial collection optimized for low memory usage."""
        with Outputs(self._h5_file, mode='a') as f_out:
            for fp in self._source_files:
                with Outputs(fp, mode='r') as f_source:

                    x = self._get_source_gid_chunks(f_source)
                    all_source_gids, source_gid_chunks = x

                    for source_gids in source_gid_chunks:
                        self._collect_chunk(all_source_gids, source_gids,
                                            f_out, f_source, fp)

                log_mem(logger, log_level='DEBUG')
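
A hypothetical, h5py-only sketch of the same chunked-collection idea follows; the collect helper, file paths, dataset name, and chunk size are illustrative (the output dataset is assumed to be pre-allocated), not the reV collection API.

import h5py

# Copy a dataset from several source files into one pre-allocated output
# dataset a slice at a time, so only one chunk is resident in memory.
def collect(out_fpath, source_fpaths, dset, chunk_size=10_000):
    offset = 0
    with h5py.File(out_fpath, 'a') as f_out:
        for fp in source_fpaths:
            with h5py.File(fp, 'r') as f_src:
                n = f_src[dset].shape[0]
                for start in range(0, n, chunk_size):
                    stop = min(start + chunk_size, n)
                    f_out[dset][offset + start:offset + stop] = \
                        f_src[dset][start:stop]
            offset += n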
Example #3
    def run(self, **kwargs):
        """
        Run ParallelSmartJobs

        Parameters
        ----------
        kwargs : dict
            Keyword arguments to be passed to obj.run(). Makes it easier to
            have obj.run() as a @staticmethod.
        """

        logger.info('Executing parallel run on a local cluster with '
                    '{0} workers over {1} total iterations.'.format(
                        self.n_workers, 1 + len(self.execution_iter)))
        log_mem()

        # initialize a client based on the input cluster.
        with SpawnProcessPool(max_workers=self.n_workers) as executor:
            futures = []

            # iterate through split executions, submitting each to worker
            for i, exec_slice in enumerate(self.execution_iter):
                logger.debug(
                    'Kicking off serial worker #{0} for: {1}. '.format(
                        i, exec_slice))

                # submit executions and append to futures list
                futures.append(
                    executor.submit(self.obj.run, exec_slice, **kwargs))

                # Take a pause after one complete set of workers
                if (i + 1) % self.n_workers == 0:
                    futures = self.gather_and_flush(i, futures)

            # All futures complete
            self.gather_and_flush('END', futures, force_flush=True)
            logger.debug('Smart parallel job complete. Returning execution '
                         'control to higher level processes.')
            log_mem()
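
The batched submit-then-gather pattern can be shown with the standard-library executor standing in for SpawnProcessPool; the _work task and run_batched helper below are illustrative, not part of the rex/reV API.

from concurrent.futures import ProcessPoolExecutor

def _work(x):
    """Stand-in task; the real code calls self.obj.run(exec_slice, **kwargs)."""
    return x * x

def run_batched(items, n_workers=2):
    # After every n_workers submissions, gather the pending futures (where
    # results could be flushed to disk) before queueing more work, which
    # keeps the number of outstanding futures, and memory use, bounded.
    results = []
    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        futures = []
        for i, item in enumerate(items):
            futures.append(executor.submit(_work, item))
            if (i + 1) % n_workers == 0:
                results.extend(f.result() for f in futures)
                futures = []
        results.extend(f.result() for f in futures)  # gather the remainder
    return results

if __name__ == '__main__':
    print(run_batched(range(5)))  # [0, 1, 4, 9, 16]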
Example #4
def execute_single(fun, input_obj, worker=0, **kwargs):
    """Execute a serial compute on a single core.

    Parameters
    ----------
    fun : function
        Function to execute.
    input_obj : object
        Object passed as first argument to fun. Typically a project control
        object that can be the result of iteration in the parallel execution
        framework.
    worker : int
        Worker number (for debugging purposes).
    **kwargs : dict
        Keyword arguments passed to fun.

    Returns
    -------
    out : object
        Output of fun(input_obj, **kwargs).
    """

    logger.debug(
        'Running single serial execution on worker #{} for: {}'.format(
            worker, input_obj))
    out = fun(input_obj, **kwargs)
    log_mem()

    return out
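
A hypothetical usage of execute_single, assuming the surrounding module's logger and log_mem are already configured; the _scale_points callable and its scale keyword are made up for illustration.

def _scale_points(points, scale=1.0):
    # Any callable whose first argument is the iterated object works here.
    return [p * scale for p in points]

out = execute_single(_scale_points, [1, 2, 3], worker=0, scale=2.0)
# out == [2.0, 4.0, 6.0]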
Example #5
    def run_serial(excl_fpath, h5_fpath, tm_dset, *agg_dset,
                   agg_method='mean', excl_dict=None,
                   area_filter_kernel='queen', min_area=None,
                   check_excl_layers=False, resolution=64, excl_area=0.0081,
                   gids=None, gen_index=None):
        """
        Standalone method to aggregate - can be parallelized.

        Parameters
        ----------
        excl_fpath : str
            Filepath to exclusions h5 with techmap dataset.
        h5_fpath : str
            Filepath to .h5 file to aggregate
        tm_dset : str
            Dataset name in the techmap file containing the
            exclusions-to-resource mapping data.
        agg_dset : str
            Dataset to aggregate; multiple datasets can be supplied
        agg_method : str
            Aggregation method, either mean or sum/aggregate
        excl_dict : dict | None
            Dictionary of exclusion LayerMask arguments {layer: {kwarg: value}}
        area_filter_kernel : str
            Contiguous area filter method to use on final exclusions mask
        min_area : float | None
            Minimum required contiguous area filter in sq-km
        check_excl_layers : bool
            Run a pre-flight check on each exclusion layer to ensure they
            contain un-excluded values
        resolution : int | None
            SC resolution, must be input in combination with gid. Preferred
            option is to use the row/col slices to define the SC point instead.
        excl_area : float
            Area of an exclusion cell (square km).
        gids : list | None
            List of gids to get summary for (can use to subset if running in
            parallel), or None for all gids in the SC extent.
        gen_index : np.ndarray
            Array of generation gids with array index equal to resource gid.
            Array value is -1 if the resource index was not used in the
            generation run.

        Returns
        -------
        agg_out : dict
            Aggregated values for each aggregation dataset
        """
        with SupplyCurveExtent(excl_fpath, resolution=resolution) as sc:
            exclusion_shape = sc.exclusions.shape
            if gids is None:
                gids = sc.valid_sc_points(tm_dset)

        # pre-extract handlers so they are not repeatedly initialized
        file_kwargs = {'excl_dict': excl_dict,
                       'area_filter_kernel': area_filter_kernel,
                       'min_area': min_area,
                       'check_excl_layers': check_excl_layers}
        dsets = agg_dset + ('meta', )
        agg_out = {ds: [] for ds in dsets}
        with AggFileHandler(excl_fpath, h5_fpath, **file_kwargs) as fh:
            n_finished = 0
            for gid in gids:
                try:
                    gid_out = AggregationSupplyCurvePoint.run(
                        gid,
                        fh.exclusions,
                        fh.h5,
                        tm_dset,
                        *agg_dset,
                        agg_method=agg_method,
                        excl_dict=excl_dict,
                        resolution=resolution,
                        excl_area=excl_area,
                        exclusion_shape=exclusion_shape,
                        close=False,
                        gen_index=gen_index)

                except EmptySupplyCurvePointError:
                    logger.debug('SC gid {} is fully excluded or does not '
                                 'have any valid source data!'.format(gid))
                except Exception:
                    logger.exception('SC gid {} failed!'.format(gid))
                    raise
                else:
                    n_finished += 1
                    logger.debug('Serial aggregation: '
                                 '{} out of {} points complete'
                                 .format(n_finished, len(gids)))
                    log_mem(logger)
                    for k, v in gid_out.items():
                        agg_out[k].append(v)

        return agg_out
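
The try/except/else accumulation can be sketched on its own; every name below (_run_point, EmptyPointError, cf_mean) is a stand-in, not the reV API.

# One list per requested dataset plus 'meta'; empty or fully excluded points
# are skipped, and successful results are appended dataset by dataset.
class EmptyPointError(Exception):
    """Stand-in for EmptySupplyCurvePointError."""

def _run_point(gid):
    if gid == 2:
        raise EmptyPointError('fully excluded')
    return {'cf_mean': float(gid), 'meta': {'gid': gid}}

agg_dset = ('cf_mean',)
agg_out = {ds: [] for ds in agg_dset + ('meta',)}
for gid in range(4):
    try:
        gid_out = _run_point(gid)
    except EmptyPointError:
        continue  # skip fully excluded points, as in the example above
    for k, v in gid_out.items():
        agg_out[k].append(v)

# agg_out == {'cf_mean': [0.0, 1.0, 3.0],
#             'meta': [{'gid': 0}, {'gid': 1}, {'gid': 3}]}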
Example #6
    def compute_statistics(self,
                           dataset,
                           sites=None,
                           diurnal=False,
                           month=False,
                           combinations=False,
                           max_workers=None,
                           chunks_per_worker=5,
                           lat_lon_only=True):
        """
        Compute statistics

        Parameters
        ----------
        dataset : str
            Dataset to extract stats for
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        diurnal : bool, optional
            Extract diurnal stats, by default False
        month : bool, optional
            Extract monthly stats, by default False
        combinations : bool, optional
            Extract all combinations of temporal stats, by default False
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        chunks_per_worker : int, optional
            Number of chunks to extract on each worker, by default 5
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True

        Returns
        -------
        res_stats : pandas.DataFrame
            DataFrame of desired statistics at desired time intervals
        """
        if max_workers is None:
            max_workers = os.cpu_count()

        slices = self._get_slices(dataset,
                                  sites,
                                  chunks_per_slice=chunks_per_worker)
        if len(slices) == 1:
            max_workers = 1

        if max_workers > 1:
            msg = ('Extracting {} for {} in parallel using {} workers'.format(
                list(self.statistics), dataset, max_workers))
            logger.info(msg)

            loggers = [__name__, 'rex']
            with SpawnProcessPool(max_workers=max_workers,
                                  loggers=loggers) as exe:
                futures = []
                for sites_slice in slices:
                    future = exe.submit(self._extract_stats,
                                        self.res_h5,
                                        self.statistics,
                                        dataset,
                                        res_cls=self.res_cls,
                                        hsds=self._hsds,
                                        time_index=self.time_index,
                                        sites_slice=sites_slice,
                                        diurnal=diurnal,
                                        month=month,
                                        combinations=combinations)
                    futures.append(future)

                res_stats = []
                for i, future in enumerate(as_completed(futures)):
                    res_stats.append(future.result())
                    logger.debug('Completed {} out of {} workers'.format(
                        (i + 1), len(futures)))
        else:
            msg = ('Extracting {} for {} in serial'.format(
                self.statistics.keys(), dataset))
            logger.info(msg)
            res_stats = []
            for i, sites_slice in enumerate(slices):
                res_stats.append(
                    self._extract_stats(self.res_h5,
                                        self.statistics,
                                        dataset,
                                        res_cls=self.res_cls,
                                        hsds=self._hsds,
                                        time_index=self.time_index,
                                        sites_slice=sites_slice,
                                        diurnal=diurnal,
                                        month=month,
                                        combinations=combinations))
                logger.debug('Completed {} out of {} sets of sites'.format(
                    (i + 1), len(slices)))

        gc.collect()
        log_mem(logger)
        res_stats = pd.concat(res_stats)

        if lat_lon_only:
            meta = self.lat_lon
        else:
            meta = self.meta

        res_stats = meta.join(res_stats.sort_index(), how='inner')

        return res_stats
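
A self-contained sketch of the parallel/serial switch and the final concatenation follows, with a placeholder per-chunk statistic instead of the rex _extract_stats call; the chunking and names are illustrative only.

import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed

def _chunk_mean(values, chunk_id):
    """Placeholder statistic for one chunk of sites."""
    return pd.DataFrame({'mean': [sum(values) / len(values)]},
                        index=[chunk_id])

def chunk_means(chunks, max_workers=None):
    # Serial path for a single chunk or max_workers=1; otherwise gather
    # partial frames with as_completed and restore order with sort_index.
    if max_workers == 1 or len(chunks) == 1:
        parts = [_chunk_mean(c, i) for i, c in enumerate(chunks)]
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as exe:
            futures = [exe.submit(_chunk_mean, c, i)
                       for i, c in enumerate(chunks)]
            parts = [f.result() for f in as_completed(futures)]
    return pd.concat(parts).sort_index()

if __name__ == '__main__':
    print(chunk_means([[1, 2, 3], [4, 5, 6]], max_workers=2))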
Example #7
File: joint_pd.py Project: NREL/rex
    def compute(self,
                dset1,
                dset2,
                bins1,
                bins2,
                sites=None,
                max_workers=None,
                chunks_per_worker=5):
        """
        Compute joint probability distribution between given datasets using
        given bins for all sites.

        Parameters
        ----------
        dset1 : str
            Dataset 1 to generate joint probability distribution for
        dset2 : str
            Dataset 2 to generate joint probability distribution for
        bins1 : tuple
            (start, stop, step) for dataset 1 bins. The stop value is
            inclusive, so (0, 6, 2) would yield three bins with edges (0, 2, 4,
            6). If the stop value is not perfectly divisible by the step, the
            last bin will overshoot the stop value.
        bins2 : tuple
            (start, stop, step) for dataset 2 bins. The stop value is
            inclusive, so (0, 6, 2) would yield three bins with edges (0, 2, 4,
            6). If the stop value is not perfectly divisible by the step, the
            last bin will overshoot the stop value.
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        chunks_per_worker : int, optional
            Number of chunks to extract on each worker, by default 5

        Returns
        -------
        jpd : pandas.DataFrame
            DataFrame of joint probability distribution between given datasets
            with given bins
        """
        if max_workers is None:
            max_workers = os.cpu_count()

        slices = self._get_slices(dset1,
                                  dset2,
                                  sites,
                                  chunks_per_slice=chunks_per_worker)
        if len(slices) == 1:
            max_workers = 1

        jpd = {}
        if max_workers > 1:
            msg = ('Computing the joint probability distribution between {} '
                   'and {} in parallel using {} workers'.format(
                       dset1, dset2, max_workers))
            logger.info(msg)

            loggers = [__name__, 'rex']
            with SpawnProcessPool(max_workers=max_workers,
                                  loggers=loggers) as exe:
                futures = []
                for sites_slice in slices:
                    future = exe.submit(self.compute_joint_pd,
                                        self.res_h5,
                                        dset1,
                                        dset2,
                                        bins1,
                                        bins2,
                                        res_cls=self.res_cls,
                                        hsds=self._hsds,
                                        sites_slice=sites_slice)
                    futures.append(future)

                for i, future in enumerate(as_completed(futures)):
                    jpd.update(future.result())
                    logger.debug('Completed {} out of {} workers'.format(
                        (i + 1), len(futures)))

        else:
            msg = ('Computing the joint probability distribution between {} '
                   'and {} in serial.'.format(dset1, dset2))
            logger.info(msg)
            for i, sites_slice in enumerate(slices):
                jpd.update(
                    self.compute_joint_pd(self.res_h5,
                                          dset1,
                                          dset2,
                                          bins1,
                                          bins2,
                                          res_cls=self.res_cls,
                                          hsds=self._hsds,
                                          sites_slice=sites_slice))
                logger.debug('Completed {} out of {} sets of sites'.format(
                    (i + 1), len(slices)))

        gc.collect()
        log_mem(logger)
        bins1 = self._make_bins(*bins1)
        bins2 = self._make_bins(*bins2)
        index = np.meshgrid(bins1[:-1], bins2[:-1], indexing='ij')
        index = np.array(index).T.reshape(-1, 2).astype(np.int16)
        index = pd.MultiIndex.from_arrays(index.T, names=(dset1, dset2))
        jpd = pd.DataFrame({k: v.flatten(order='F')
                            for k, v in jpd.items()},
                           index=index).sort_index(axis=1)

        return jpd
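
The bin-edge and joint-index construction at the end of compute() can be run on its own. In the sketch below, make_bins is a plausible stand-in for _make_bins based on the docstring's inclusive-stop description, and the dataset names are placeholders.

import numpy as np
import pandas as pd

def make_bins(start, stop, step):
    # Inclusive stop: (0, 6, 2) -> edges [0, 2, 4, 6]; a stop that is not
    # divisible by the step makes the last bin overshoot, as documented.
    return np.arange(start, stop + step, step)

bins1 = make_bins(0, 6, 2)   # array([0, 2, 4, 6])
bins2 = make_bins(0, 4, 2)   # array([0, 2, 4])

# Joint index over the lower edge of every (bins1, bins2) combination.
index = np.meshgrid(bins1[:-1], bins2[:-1], indexing='ij')
index = np.array(index).T.reshape(-1, 2).astype(np.int16)
index = pd.MultiIndex.from_arrays(index.T, names=('dset1', 'dset2'))
print(len(index))  # 6 rows: one per pair of lower bin edges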