def get_data(self):
    """
    Returns
    -------
    vel_data : np.ndarray
        2D array (time x sites) of the velocity dataset
        (e.g. windspeed in m/s)
    color_data : np.ndarray
        2D array (time x sites) of the color_dset
        (e.g. windspeed in m/s)
    dir_data : np.ndarray
        2D array (time x sites) of direction data in radians measured
        counter-clockwise from east such that dx = delta * np.cos(dir)
        and dy = delta * np.sin(dir). This also implies that if
        self.dir_option == "from", the direction has been converted
        to "to".
    """
    logger.info('Reading data...')

    with Resource(self.fp_dir) as res:
        dir_data = res[self.dir_dset, self.time_slice, :]

    self.color_data = np.ones_like(dir_data)
    if self.color_dset is not None:
        with Resource(self.fp_color) as res:
            self.color_data = res[self.color_dset, self.time_slice, :]

    self.vel_data = np.ones_like(dir_data)
    if self.vel_dset is not None:
        with Resource(self.fp_vel) as res:
            self.vel_data = res[self.vel_dset, self.time_slice, :]

    # Convert compass degrees to degrees counter-clockwise from east
    if self.dir_option.lower() == 'from':
        dir_data = 270 - dir_data
    elif self.dir_option.lower() == 'to':
        dir_data = 90 - dir_data

    self.dir_data = dir_data * (np.pi / 180)

    logger.debug('Velocity data stats from dataset "{}": {} {} {}'
                 .format(self.vel_dset, self.vel_data.min(),
                         self.vel_data.mean(), self.vel_data.max()))
    logger.debug('Color data stats from dataset "{}": {} {} {}'
                 .format(self.color_dset, self.color_data.min(),
                         self.color_data.mean(), self.color_data.max()))
    logger.info('Data read complete')

    return self.vel_data, self.color_data, self.dir_data
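# Hedged usage sketch (assumption, not from the source): turning the
# outputs of get_data() into arrow components using the docstring
# convention that dir_data is radians counter-clockwise from east.
# "viewer" is a hypothetical instance of the class defining get_data().
import numpy as np

vel, color, dir_rad = viewer.get_data()
dx = vel * np.cos(dir_rad)  # eastward component
dy = vel * np.sin(dir_rad)  # northward component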
# staticmethod restored (assumption): summarize_dset submits this helper
# with all arguments passed explicitly
@staticmethod
def _compute_ds_summary(h5_file, ds_name, group=None):
    """
    Compute summary statistics for the given dataset (assumed to be a
    vector)

    Parameters
    ----------
    h5_file : str
        Path to .h5 file to summarize data from
    ds_name : str
        Dataset name of interest
    group : str, optional
        Group within h5_file to summarize datasets for, by default None

    Returns
    -------
    ds_summary : pandas.DataFrame
        Summary statistics for dataset
    """
    with Resource(h5_file, group=group) as f:
        ds_data = f[ds_name, :]

    ds_summary = pd.DataFrame(ds_data, columns=[ds_name])
    ds_summary = ds_summary.describe().drop(['count'])
    # .loc, not .at: DataFrame.at requires both a row and a column label
    ds_summary.loc['sum'] = ds_data.sum()

    return ds_summary
# staticmethod restored (assumption): called via ex.submit in
# summarize_dset with all arguments passed explicitly
@staticmethod
def _compute_sites_summary(h5_file, ds_name, sites=None, group=None):
    """
    Compute summary stats for given sites of given dataset

    Parameters
    ----------
    h5_file : str
        Path to .h5 file to summarize data from
    ds_name : str
        Dataset name of interest
    sites : list | slice, optional
        Sites of interest, by default None
    group : str, optional
        Group within h5_file to summarize datasets for, by default None

    Returns
    -------
    sites_summary : pandas.DataFrame
        Summary stats for given sites / dataset
    """
    if sites is None:
        sites = slice(None)

    with Resource(h5_file, group=group) as f:
        sites_meta = f['meta', sites]
        sites_data = f[ds_name, :, sites]

    sites_summary = pd.DataFrame(sites_data, columns=sites_meta.index)
    sites_summary = sites_summary.describe().T.drop(columns=['count'])
    sites_summary['sum'] = sites_data.sum(axis=0)

    return sites_summary
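# Hedged usage sketch of the two helpers above. For a 2D dataset,
# _compute_sites_summary returns one row per site with columns
# [mean, std, min, 25%, 50%, 75%, max, sum]; for a 1D dataset,
# _compute_ds_summary returns those statistics as rows of a single
# column. The file path and dataset names here are hypothetical.
sites_stats = _compute_sites_summary('/path/to/resource.h5',
                                     'windspeed_100m', sites=[0, 1, 2])
scalar_stats = _compute_ds_summary('/path/to/resource.h5', 'capacity')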
def summarize_means(self, out_path=None):
    """
    Add means datasets to meta data

    Parameters
    ----------
    out_path : str, optional
        Path to .csv file to save updated meta data to, by default None

    Returns
    -------
    meta : pandas.DataFrame
        Meta data with means datasets added
    """
    with Resource(self.h5_file, group=self._group) as f:
        meta = f.meta
        if 'gid' not in meta:
            if meta.index.name != 'gid':
                meta.index.name = 'gid'

            meta = meta.reset_index()

        for ds_name in f.datasets:
            shape, dtype, _ = f.get_dset_properties(ds_name)
            if len(shape) == 1 and np.issubdtype(dtype, np.number):
                meta[ds_name] = f[ds_name]

    if out_path is not None:
        meta.to_csv(out_path, index=False)

    return meta
def get_meta(self):
    """
    Returns
    -------
    meta : pd.DataFrame
        Meta data from fp_res.
    """
    with Resource(self.fp_res) as res:
        self.meta = res.meta

    return self.meta
@classmethod
def run(cls, h5_file, out_dir, group=None, dsets=None,
        process_size=None, max_workers=None):
    """
    Summarize all datasets in h5_file and dump to out_dir

    Parameters
    ----------
    h5_file : str
        Path to .h5 file to summarize data from
    out_dir : str
        Directory to dump summary .csv files to
    group : str, optional
        Group within h5_file to summarize datasets for, by default None
    dsets : str | list, optional
        Datasets to summarize, by default None
    process_size : int, optional
        Number of sites to process at a time, by default None
    max_workers : int, optional
        Number of workers to use when summarizing 2D datasets,
        by default None
    """
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    if dsets is None:
        with Resource(h5_file, group=group) as f:
            dsets = [dset for dset in f.datasets
                     if dset not in ['meta', 'time_index']]
    elif isinstance(dsets, str):
        dsets = [dsets]

    # cls is assumed to accept group, which it stores as self._group;
    # forwarding it keeps grouped files pointed at the right datasets
    summary = cls(h5_file, group=group)
    for ds_name in dsets:
        out_path = os.path.join(out_dir, "{}_summary.csv".format(ds_name))
        summary.summarize_dset(ds_name, process_size=process_size,
                               max_workers=max_workers, out_path=out_path)

    out_path = os.path.basename(h5_file).replace('.h5', '_summary.csv')
    out_path = os.path.join(out_dir, out_path)
    summary.summarize_means(out_path=out_path)
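# Hedged usage sketch: run() orchestrates the whole summary, writing one
# "<dset>_summary.csv" per dataset plus a "<file>_summary.csv" of means.
# "SummarizeH5" is an assumed name for the class defining run(); swap in
# the actual class name.
SummarizeH5.run('/path/to/resource.h5', '/path/to/summaries',
                process_size=1000, max_workers=4)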
def test_regions(counties):
    """
    Test ProjectPoints.regions class method
    """
    res_file = os.path.join(TESTDATADIR, 'nsrdb/', 'ri_100_nsrdb_2012.h5')
    sam_files = os.path.join(TESTDATADIR, 'SAM/naris_pv_1axis_inv13_cs.json')

    with Resource(res_file) as f:
        meta = f.meta

    baseline = meta.loc[meta['county'].isin(counties)].index.values.tolist()

    regions = {c: 'county' for c in counties}
    pp = ProjectPoints.regions(regions, res_file, sam_files)
    assert sorted(baseline) == pp.sites
def test_duplicate_coords():
    """
    Test ProjectPoints.lat_lon_coords duplicate coords error
    """
    res_file = os.path.join(TESTDATADIR, 'nsrdb/', 'ri_100_nsrdb_2012.h5')
    sam_files = os.path.join(TESTDATADIR, 'SAM/naris_pv_1axis_inv13_cs.json')

    with Resource(res_file) as f:
        meta = f.meta

    duplicates = meta.loc[[2, 3, 3, 4], ['latitude', 'longitude']].values
    with pytest.raises(RuntimeError):
        ProjectPoints.lat_lon_coords(duplicates, res_file, sam_files)

    regions = {'Kent': 'county', 'Rhode Island': 'state'}
    with pytest.raises(RuntimeError):
        ProjectPoints.regions(regions, res_file, sam_files)
def test_coords(sites):
    """
    Test ProjectPoints.lat_lon_coords class method
    """
    res_file = os.path.join(TESTDATADIR, 'nsrdb/', 'ri_100_nsrdb_2012.h5')
    sam_files = os.path.join(TESTDATADIR, 'SAM/naris_pv_1axis_inv13_cs.json')

    with Resource(res_file) as f:
        meta = f.meta

    gids = np.random.choice(meta.index.values, sites,
                            replace=False).tolist()
    if not isinstance(gids, list):
        gids = [gids]

    lat_lons = meta.loc[gids, ['latitude', 'longitude']].values
    pp = ProjectPoints.lat_lon_coords(lat_lons, res_file, sam_files)
    assert sorted(gids) == pp.sites
def get_time_index(self):
    """
    Returns
    -------
    time_slice : slice
        Row slice object that can be used to slice data from fp_res
    time_index : pd.DatetimeIndex
        DatetimeIndex for the time frame of interest
    """
    with Resource(self.fp_res) as res:
        time_index = res.time_index.tz_localize(None)

    mask = (time_index >= self.date0) & (time_index < self.date1)
    ilocs = np.where(mask)[0]
    # + 1 so the slice includes the last timestep selected by the mask
    self.time_slice = slice(ilocs[0], ilocs[-1] + 1)
    self.time_index = time_index[self.time_slice]

    return self.time_slice, self.time_index
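# Minimal sketch of the mask -> slice conversion above, assuming an
# hourly time index and a one-day window; the + 1 keeps the last masked
# timestep inside the slice.
import numpy as np
import pandas as pd

time_index = pd.date_range('2012-01-01', periods=8760, freq='H')
mask = (time_index >= '2012-06-01') & (time_index < '2012-06-02')
ilocs = np.where(mask)[0]
time_slice = slice(ilocs[0], ilocs[-1] + 1)
assert len(time_index[time_slice]) == 24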
def summarize_dset(self, ds_name, process_size=None, max_workers=None,
                   out_path=None):
    """
    Compute a summary of the given dataset. If the dataset is 2D,
    compute temporal statistics for each site.

    Parameters
    ----------
    ds_name : str
        Dataset name of interest
    process_size : int, optional
        Number of sites to process at a time, by default None
    max_workers : int, optional
        Number of workers to use in parallel, if 1 run in serial,
        if None use all available cores, by default None
    out_path : str, optional
        File path to save summary to, by default None

    Returns
    -------
    summary : pandas.DataFrame
        Summary statistics for dataset
    """
    with Resource(self.h5_file, group=self._group) as f:
        ds_shape, _, ds_chunks = f.get_dset_properties(ds_name)

    if len(ds_shape) > 1:
        sites = np.arange(ds_shape[1])
        if max_workers != 1:
            if process_size is None and ds_chunks is not None:
                process_size = ds_chunks[1]

            if process_size is None:
                process_size = ds_shape[-1]

            sites = np.array_split(
                sites, int(np.ceil(len(sites) / process_size)))
            loggers = [__name__]
            with SpawnProcessPool(max_workers=max_workers,
                                  loggers=loggers) as ex:
                futures = []
                for site_slice in sites:
                    futures.append(ex.submit(
                        self._compute_sites_summary,
                        self.h5_file, ds_name,
                        sites=site_slice,
                        group=self._group))

                summary = [future.result() for future in futures]

            summary = pd.concat(summary)
        else:
            if process_size is None:
                summary = self._compute_sites_summary(
                    self.h5_file, ds_name, sites=sites,
                    group=self._group)
            else:
                sites = np.array_split(
                    sites, int(np.ceil(len(sites) / process_size)))

                summary = []
                for site_slice in sites:
                    summary.append(self._compute_sites_summary(
                        self.h5_file, ds_name,
                        sites=site_slice,
                        group=self._group))

                summary = pd.concat(summary)

        summary.index.name = 'gid'
    else:
        summary = self._compute_ds_summary(self.h5_file, ds_name,
                                           group=self._group)

    if out_path is not None:
        summary.to_csv(out_path)

    return summary
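# Hedged usage sketch: process_size defaults to the HDF5 chunk size
# along the site axis, so each worker reads whole chunks; override it
# to trade memory for fewer worker round-trips. "s" is a hypothetical
# instance of the summary class.
summary = s.summarize_dset('windspeed_100m', max_workers=4,
                           out_path='windspeed_100m_summary.csv')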
def test_cli(runner):
    """Test multi-year collection cli with pass through of some
    datasets."""
    with tempfile.TemporaryDirectory() as temp:
        config = {"directories": {"log_directory": temp,
                                  "output_directory": temp},
                  "execution_control": {"option": "local"},
                  "groups": {"none": {"dsets": ["cf_mean"],
                                      "pass_through_dsets": [
                                          'pass_through_1',
                                          'pass_through_2'],
                                      "source_dir": temp,
                                      "source_prefix": "gen_ri_pv"}},
                  "name": "my_test",
                  "log_level": "INFO"}

        my_out = os.path.join(temp, "{}.h5".format(config['name']))
        temp_h5_files = [os.path.join(temp, os.path.basename(fp))
                         for fp in H5_FILES]
        for fp, fp_temp in zip(H5_FILES, temp_h5_files):
            shutil.copy(fp, fp_temp)

        pass_through_dsets = config['groups']['none']['pass_through_dsets']
        for fp in temp_h5_files:
            for i, dset in enumerate(pass_through_dsets):
                with h5py.File(fp, 'a') as f:
                    shape = f['meta'].shape
                    arr = np.arange(shape[0]) * (i + 1)
                    f.create_dataset(dset, shape, data=arr)

        fp_config = os.path.join(temp, 'config.json')
        with open(fp_config, 'w') as f:
            json.dump(config, f)

        result = runner.invoke(main, ['from-config', '-c', fp_config])
        msg = ('Failed with error {}'
               .format(traceback.print_exception(*result.exc_info)))
        assert result.exit_code == 0, msg

        with Resource(my_out) as res:
            assert 'cf_mean-2012' in res.dsets
            assert 'cf_mean-2013' in res.dsets
            assert 'cf_mean-means' in res.dsets
            assert 'cf_mean-stdev' in res.dsets
            assert 'pass_through_1' in res.dsets
            assert 'pass_through_2' in res.dsets
            assert 'pass_through_1-means' not in res.dsets
            assert 'pass_through_2-means' not in res.dsets
            assert np.allclose(res['pass_through_1'],
                               1 * np.arange(len(res.meta)))
            assert np.allclose(res['pass_through_2'],
                               2 * np.arange(len(res.meta)))