Example #1
    def get_data(self):
        """
        Returns
        -------
        vel_data : np.ndarray
            2D array (time x sites) of the velocity dataset (e.g. windspeed in
            m/s)
        color_data : np.ndarray
            2D array (time x sites) of the color_dset (e.g. windspeed in m/s)
        dir : np.ndarray
            2D array (time x sites) of direction data in radians measured
            counter-clockwise from east such that dx = delta*np.cos(dir) and
            dy = delta*np.sin(dir). Note that if self.dir_option == "from",
            the output direction has already been converted to the "to"
            convention.
        """

        logger.info('Reading data...')

        with Resource(self.fp_dir) as res:
            dir_data = res[self.dir_dset, self.time_slice, :]

        self.color_data = np.ones_like(dir_data)
        if self.color_dset is not None:
            with Resource(self.fp_color) as res:
                self.color_data = res[self.color_dset, self.time_slice, :]

        self.vel_data = np.ones_like(dir_data)
        if self.vel_dset is not None:
            with Resource(self.fp_vel) as res:
                self.vel_data = res[self.vel_dset, self.time_slice, :]

        # rotate from meteorological degrees into math convention
        # (counter-clockwise from east, pointing "to")
        if self.dir_option.lower() == 'from':
            dir_data = 270 - dir_data
        elif self.dir_option.lower() == 'to':
            dir_data = 90 - dir_data

        self.dir_data = np.radians(dir_data)

        logger.debug('Velocity data stats from dataset "{}": {} {} {}'.format(
            self.vel_dset, self.vel_data.min(), self.vel_data.mean(),
            self.vel_data.max()))
        logger.debug('Color data stats from dataset "{}": {} {} {}'.format(
            self.color_dset, self.color_data.min(), self.color_data.mean(),
            self.color_data.max()))
        logger.info('Data read complete')

        return self.vel_data, self.color_data, self.dir_data
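
To make the direction convention concrete, here is a minimal standalone sketch
of the "from" -> "to" conversion performed above: a meteorological direction of
270 degrees (wind blowing from the west) maps to 0 radians, i.e. an arrow
pointing east.

import numpy as np

# Meteorological convention: 270 degrees means wind blowing FROM the west.
met_dir = np.array([270.0])

# dir_option == 'from': rotate into math convention (counter-clockwise from
# east, pointing in the direction the flow is going TO).
math_dir = np.radians(270 - met_dir)  # -> 0 rad

dx = np.cos(math_dir)  # 1.0 -> arrow points east
dy = np.sin(math_dir)  # 0.0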
Example #2
    def _compute_ds_summary(h5_file, ds_name, group=None):
        """
        Compute summary statistics for given dataset (assumed to be a vector)

        Parameters
        ----------
        h5_file : str
            Path to .h5 file to summarize data from
        ds_name : str
            Dataset name of interest
        group : str, optional
            Group within h5_file to summarize datasets for, by default None

        Returns
        -------
        ds_summary : pandas.DataFrame
            Summary statistics for dataset
        """
        with Resource(h5_file, group=group) as f:
            ds_data = f[ds_name, :]

        ds_summary = pd.DataFrame(ds_data, columns=[ds_name])
        ds_summary = ds_summary.describe().drop(['count'])
        ds_summary.at['sum'] = ds_data.sum()

        return ds_summary
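
The pandas pattern above is easy to check in isolation. A minimal sketch with
synthetic data; the column name 'windspeed' is just a stand-in for ds_name,
and .loc is used here to append the row, which is equivalent to the .at call
above:

import numpy as np
import pandas as pd

ds_data = np.random.rand(8760)  # stand-in for f[ds_name, :]

ds_summary = pd.DataFrame(ds_data, columns=['windspeed'])
ds_summary = ds_summary.describe().drop(['count'])  # mean, std, min, ..., max
ds_summary.loc['sum'] = ds_data.sum()  # append a 'sum' row
print(ds_summary)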
Example #3
    def _compute_sites_summary(h5_file, ds_name, sites=None, group=None):
        """
        Compute summary stats for given sites of given dataset

        Parameters
        ----------
        h5_file : str
            Path to .h5 file to summarize data from
        ds_name : str
            Dataset name of interest
        sites : list | slice, optional
            sites of interest, by default None
        group : str, optional
            Group within h5_file to summarize datasets for, by default None

        Returns
        -------
        sites_summary : pandas.DataFrame
            Summary stats for given sites / dataset
        """
        if sites is None:
            sites = slice(None)

        with Resource(h5_file, group=group) as f:
            sites_meta = f['meta', sites]
            sites_data = f[ds_name, :, sites]

        sites_summary = pd.DataFrame(sites_data, columns=sites_meta.index)
        sites_summary = sites_summary.describe().T.drop(columns=['count'])
        sites_summary['sum'] = sites_data.sum(axis=0)

        return sites_summary
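
The per-site variant is the same idea transposed: describe() summarizes each
column, so with a (time x sites) array the transpose yields one row of
statistics per site. A minimal standalone sketch with synthetic data:

import numpy as np
import pandas as pd

sites_data = np.random.rand(8760, 5)  # stand-in for f[ds_name, :, sites]

sites_summary = pd.DataFrame(sites_data, columns=range(5))
sites_summary = sites_summary.describe().T.drop(columns=['count'])
sites_summary['sum'] = sites_data.sum(axis=0)  # per-site temporal sum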
Example #4
    def summarize_means(self, out_path=None):
        """
        Add means datasets to meta data

        Parameters
        ----------
        out_path : str, optional
            Path to .csv file to save updated meta data to, by default None

        Returns
        -------
        meta : pandas.DataFrame
            Meta data with means datasets added
        """
        with Resource(self.h5_file, group=self._group) as f:
            meta = f.meta
            if 'gid' not in meta:
                if meta.index.name != 'gid':
                    meta.index.name = 'gid'

                meta = meta.reset_index()

            for ds_name in f.datasets:
                shape, dtype, _ = f.get_dset_properties(ds_name)
                if len(shape) == 1 and np.issubdtype(dtype, np.number):
                    meta[ds_name] = f[ds_name]

        if out_path is not None:
            meta.to_csv(out_path, index=False)

        return meta
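
The gid handling at the top of summarize_means() is worth isolating: if meta
lacks an explicit 'gid' column, the positional index is promoted to one. A
minimal sketch with a hypothetical two-site meta table (the cf_mean values are
made up for illustration):

import pandas as pd

meta = pd.DataFrame({'latitude': [41.0, 41.1], 'longitude': [-71.5, -71.6]})

if 'gid' not in meta:  # 'in' on a DataFrame tests column names
    if meta.index.name != 'gid':
        meta.index.name = 'gid'
    meta = meta.reset_index()  # columns are now: gid, latitude, longitude

meta['cf_mean'] = [0.21, 0.24]  # a 1D per-site dataset appended as a column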
Example #5
    def get_meta(self):
        """
        Returns
        -------
        meta : pd.DataFrame
            Meta data from fp_res.
        """

        with Resource(self.fp_res) as res:
            self.meta = res.meta

        return self.meta
Example #6
    def run(cls,
            h5_file,
            out_dir,
            group=None,
            dsets=None,
            process_size=None,
            max_workers=None):
        """
        Summarize all datasets in h5_file and dump to out_dir

        Parameters
        ----------
        h5_file : str
            Path to .h5 file to summarize data from
        out_dir : str
            Directory to dump summary .csv files to
        group : str, optional
            Group within h5_file to summarize datasets for, by default None
        dsets : str | list, optional
            Datasets to summarize, by default None
        process_size : int, optional
            Number of sites to process at a time, by default None
        max_workers : int, optional
            Number of workers to use when summarizing 2D datasets,
            by default None
        """
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        if dsets is None:
            with Resource(h5_file, group=group) as f:
                dsets = [
                    dset for dset in f.datasets
                    if dset not in ['meta', 'time_index']
                ]
        elif isinstance(dsets, str):
            dsets = [dsets]

        summary = cls(h5_file)
        for ds_name in dsets:
            out_path = os.path.join(out_dir, "{}_summary.csv".format(ds_name))
            summary.summarize_dset(ds_name,
                                   process_size=process_size,
                                   max_workers=max_workers,
                                   out_path=out_path)

        out_path = os.path.basename(h5_file).replace('.h5', '_summary.csv')
        out_path = os.path.join(out_dir, out_path)
        summary.summarize_means(out_path=out_path)
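
A hypothetical driver for the classmethod above. The snippet only shows cls,
so the class name SummaryStats and the file paths here are assumptions for
illustration:

SummaryStats.run('/data/wtk_2012.h5', './summaries',
                 dsets=['windspeed_100m'],
                 process_size=1000,
                 max_workers=4)
# expected outputs, per the code above:
#   ./summaries/windspeed_100m_summary.csv  (per-site stats)
#   ./summaries/wtk_2012_summary.csv        (means added to meta)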
Example #7
def test_regions(counties):
    """
    Test ProjectPoints.regions class method
    """
    res_file = os.path.join(TESTDATADIR, 'nsrdb/', 'ri_100_nsrdb_2012.h5')
    sam_files = os.path.join(TESTDATADIR, 'SAM/naris_pv_1axis_inv13_cs.json')

    with Resource(res_file) as f:
        meta = f.meta

    baseline = meta.loc[meta['county'].isin(counties)].index.values.tolist()

    regions = {c: 'county' for c in counties}
    pp = ProjectPoints.regions(regions, res_file, sam_files)

    assert sorted(baseline) == pp.sites
Example #8
def test_duplicate_coords():
    """
    Test ProjectPoints.lat_lon_coords duplicate coords error
    """
    res_file = os.path.join(TESTDATADIR, 'nsrdb/', 'ri_100_nsrdb_2012.h5')
    sam_files = os.path.join(TESTDATADIR, 'SAM/naris_pv_1axis_inv13_cs.json')

    with Resource(res_file) as f:
        meta = f.meta

    duplicates = meta.loc[[2, 3, 3, 4], ['latitude', 'longitude']].values

    with pytest.raises(RuntimeError):
        ProjectPoints.lat_lon_coords(duplicates, res_file, sam_files)

    regions = {'Kent': 'county', 'Rhode Island': 'state'}
    with pytest.raises(RuntimeError):
        ProjectPoints.regions(regions, res_file, sam_files)
Example #9
def test_coords(sites):
    """
    Test ProjectPoints.lat_lon_coords class method
    """
    res_file = os.path.join(TESTDATADIR, 'nsrdb/', 'ri_100_nsrdb_2012.h5')
    sam_files = os.path.join(TESTDATADIR, 'SAM/naris_pv_1axis_inv13_cs.json')

    with Resource(res_file) as f:
        meta = f.meta

    gids = np.random.choice(meta.index.values, sites, replace=False).tolist()
    if not isinstance(gids, list):
        gids = [gids]

    lat_lons = meta.loc[gids, ['latitude', 'longitude']].values
    pp = ProjectPoints.lat_lon_coords(lat_lons, res_file, sam_files)

    assert sorted(gids) == pp.sites
Example #10
    def get_time_index(self):
        """
        Returns
        -------
        time_slice : slice
            Row slice object that can be used to slice data from fp_res.
        time_index : pd.DatetimeIndex
            DatetimeIndex for the time frame of interest
        """

        with Resource(self.fp_res) as res:
            time_index = res.time_index.tz_localize(None)

        mask = (time_index >= self.date0) & (time_index < self.date1)
        ilocs = np.where(mask)[0]
        self.time_slice = slice(ilocs[0], ilocs[-1] + 1)  # +1 keeps last step
        self.time_index = time_index[self.time_slice]

        return self.time_slice, self.time_index
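
The mask-to-slice conversion is a standalone pandas/numpy pattern. A minimal
sketch, assuming an hourly index and a one-month window (the +1 keeps the last
masked timestep, matching the fix above):

import numpy as np
import pandas as pd

time_index = pd.date_range('2012-01-01', periods=8760, freq='h')
date0, date1 = pd.Timestamp('2012-06-01'), pd.Timestamp('2012-07-01')

mask = (time_index >= date0) & (time_index < date1)
ilocs = np.where(mask)[0]
time_slice = slice(ilocs[0], ilocs[-1] + 1)  # contiguous run -> simple slice
assert len(time_index[time_slice]) == mask.sum()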
Example #11
    def summarize_dset(self, ds_name, process_size=None, max_workers=None,
                       out_path=None):
        """
        Compute dataset summary. If the dataset is 2D, compute temporal
        statistics for each site.

        Parameters
        ----------
        ds_name : str
            Dataset name of interest
        process_size : int, optional
            Number of sites to process at a time, by default None
        max_workers : int, optional
            Number of workers to use in parallel, if 1 run in serial,
            if None use all available cores, by default None
        out_path : str, optional
            File path to save summary to, by default None

        Returns
        -------
        summary : pandas.DataFrame
            Summary statistics for the dataset
        """
        with Resource(self.h5_file, group=self._group) as f:
            ds_shape, _, ds_chunks = f.get_dset_properties(ds_name)

        if len(ds_shape) > 1:
            sites = np.arange(ds_shape[1])
            if max_workers != 1:
                if process_size is None and ds_chunks is not None:
                    process_size = ds_chunks[1]
                if process_size is None:
                    process_size = ds_shape[-1]

                sites = np.array_split(
                    sites, int(np.ceil(len(sites) / process_size)))
                loggers = [__name__]
                with SpawnProcessPool(max_workers=max_workers,
                                      loggers=loggers) as ex:
                    futures = []
                    for site_slice in sites:
                        futures.append(ex.submit(
                            self._compute_sites_summary,
                            self.h5_file, ds_name, sites=site_slice,
                            group=self._group))

                    summary = [future.result() for future in futures]

                summary = pd.concat(summary)
            else:
                if process_size is None:
                    summary = self._compute_sites_summary(self.h5_file,
                                                          ds_name,
                                                          sites=sites,
                                                          group=self._group)
                else:
                    sites = np.array_split(
                        sites, int(np.ceil(len(sites) / process_size)))

                    summary = []
                    for site_slice in sites:
                        summary.append(self._compute_sites_summary(
                            self.h5_file, ds_name,
                            sites=site_slice,
                            group=self._group))

                    summary = pd.concat(summary)

            summary.index.name = 'gid'

        else:
            summary = self._compute_ds_summary(self.h5_file, ds_name,
                                               group=self._group)

        if out_path is not None:
            summary.to_csv(out_path)

        return summary
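
The chunking logic is the crux of the parallel path above: the site axis is
split into roughly process_size-sized chunks, each submitted to a worker. A
small standalone illustration of the arithmetic:

import numpy as np

n_sites, process_size = 1000, 300
sites = np.arange(n_sites)
chunks = np.array_split(sites, int(np.ceil(n_sites / process_size)))
# -> 4 chunks of 250 sites each; array_split balances the remainder
#    instead of leaving a short trailing chunk of 100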
Example #12
def test_cli(runner):
    """Test multi year collection cli with pass through of some datasets."""

    with tempfile.TemporaryDirectory() as temp:
        config = {
            "directories": {
                "log_directory": temp,
                "output_directory": temp
            },
            "execution_control": {
                "option": "local"
            },
            "groups": {
                "none": {
                    "dsets": ["cf_mean"],
                    "pass_through_dsets": ['pass_through_1', 'pass_through_2'],
                    "source_dir": temp,
                    "source_prefix": "gen_ri_pv"
                }
            },
            "name": "my_test",
            "log_level": "INFO"
        }

        my_out = os.path.join(temp, "{}.h5".format(config['name']))
        temp_h5_files = [
            os.path.join(temp, os.path.basename(fp)) for fp in H5_FILES
        ]
        for fp, fp_temp in zip(H5_FILES, temp_h5_files):
            shutil.copy(fp, fp_temp)

        pass_through_dsets = config['groups']['none']['pass_through_dsets']
        for fp in temp_h5_files:
            for i, dset in enumerate(pass_through_dsets):
                with h5py.File(fp, 'a') as f:
                    shape = f['meta'].shape
                    arr = np.arange(shape[0]) * (i + 1)
                    f.create_dataset(dset, shape, data=arr)

        fp_config = os.path.join(temp, 'config.json')
        with open(fp_config, 'w') as f:
            json.dump(config, f)

        result = runner.invoke(main, ['from-config', '-c', fp_config])
        msg = ('Failed with error {}'.format(
            ''.join(traceback.format_exception(*result.exc_info))))
        assert result.exit_code == 0, msg

        with Resource(my_out) as res:
            assert 'cf_mean-2012' in res.dsets
            assert 'cf_mean-2013' in res.dsets
            assert 'cf_mean-means' in res.dsets
            assert 'cf_mean-stdev' in res.dsets
            assert 'pass_through_1' in res.dsets
            assert 'pass_through_2' in res.dsets
            assert 'pass_through_1-means' not in res.dsets
            assert 'pass_through_2-means' not in res.dsets
            assert np.allclose(res['pass_through_1'],
                               1 * np.arange(len(res.meta)))
            assert np.allclose(res['pass_through_2'],
                               2 * np.arange(len(res.meta)))