def read(self, path):
    """Read metadata and dataframe from HDF5 store."""
    with pd.HDFStore(path) as h5store:
        dataframe = h5store.get('dataframe')
        setattr_on_dataframe(
            dataframe, 'metadata',
            h5store.get_storer('dataframe').attrs.metadata)

    return dataframe
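# A minimal sketch (an assumption, not the library's implementation) of what
# the ``setattr_on_dataframe`` helper used above likely does: attach an
# arbitrary attribute to a DataFrame while suppressing the pandas UserWarning
# about setting non-column attributes.
import warnings

def setattr_on_dataframe(df, name, value):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        setattr(df, name, value)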
def _run_tool(self):
    """Compute a percent-exceeded curve from the dataset's time series and save it as a new dataset."""
    dataset = self.dataset
    input_ts = load_plugins('io', 'timeseries-hdf5')['timeseries-hdf5']
    orig_metadata = get_metadata(dataset)[dataset]
    parameter = orig_metadata['parameter']
    if orig_metadata['file_path'] is None:
        raise IOError('No data file available for this dataset')

    df = input_ts.read(orig_metadata['file_path'])

    # apply transformation
    # run filter
    # new_df = self._run(df, options)
    metadata = df.metadata
    if 'file_path' in metadata:
        del metadata['file_path']

    # rank values, then express each rank as the percent of time that value is exceeded
    df.sort_values([parameter], ascending=False, na_position='last', inplace=True)
    df['Rank'] = df[parameter].rank(method='min', ascending=False)
    df.dropna(inplace=True)
    df['Percent Exceeded'] = (df['Rank'] / (df[parameter].count() + 1)) * 100
    df.index = df['Percent Exceeded']
    setattr_on_dataframe(df, 'metadata', metadata)
    new_df = df

    # setup new dataset
    new_metadata = {
        'parameter': new_df.metadata.get('parameter'),
        'datatype': orig_metadata['datatype'],
        'options': self.set_options,
        'file_format': orig_metadata['file_format'],
        'unit': new_df.metadata.get('unit'),
    }

    new_dset, file_path, catalog_entry = self._create_new_dataset(
        old_dataset=dataset,
        ext='.h5',
        dataset_metadata=new_metadata,
    )

    # save dataframe
    output = load_plugins('io', 'xy-hdf5')['xy-hdf5']
    output.write(file_path, new_df, new_metadata)

    return {'datasets': new_dset, 'catalog_entries': catalog_entry}
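# Standalone illustration (not part of the tool) of the rank / percent-exceeded
# arithmetic applied above, using a toy frame; the 'streamflow' column name is
# hypothetical.
import pandas as pd

flows = pd.DataFrame({'streamflow': [10.0, 40.0, 20.0, None, 30.0]})
flows = flows.sort_values('streamflow', ascending=False, na_position='last')
flows['Rank'] = flows['streamflow'].rank(method='min', ascending=False)
flows = flows.dropna()
flows['Percent Exceeded'] = flows['Rank'] / (flows['streamflow'].count() + 1) * 100
# -> 40.0 is exceeded 20% of the time, 30.0 -> 40%, 20.0 -> 60%, 10.0 -> 80%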
def _run(self, df):
    """Resample the time series to the requested period with the requested aggregation method."""
    metadata = df.metadata
    if 'file_path' in metadata:
        del metadata['file_path']

    param = metadata['parameter']
    period = self.period
    method = self.method

    # pad with None so parameters without ':'-separated period/method still unpack
    orig_param, orig_period, orig_method = (param.split(':') + [None, None])[:3]
    new_df = getattr(df.resample(periods[period], kind='period'), method)()
    new_param = '%s:%s:%s' % (orig_param, period, method)
    # inplace must be set to True to make changes
    new_df.rename(columns={param: new_param}, inplace=True)
    metadata.update({'parameter': new_param})
    setattr_on_dataframe(new_df, 'metadata', metadata)
    return new_df
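# Assumption: ``periods`` is a module-level mapping from the tool's period
# names to pandas offset aliases, roughly along these lines (the exact keys
# and aliases in the real module may differ).
periods = {
    'daily': 'D',
    'weekly': 'W',
    'monthly': 'M',
    'annual': 'A',
}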
def _run(self, df):
    """Drop values more than ``sigma`` standard deviations from the median."""
    metadata = df.metadata
    if 'file_path' in metadata:
        del metadata['file_path']

    parameter = metadata['parameter']
    sigma = self.sigma
    if sigma is None:
        sigma = 3

    # remove anything 'sigma' standard deviations from median
    vmin = df[parameter].median() - float(sigma) * df[parameter].std()
    vmax = df[parameter].median() + float(sigma) * df[parameter].std()
    df = df[df[parameter] > vmin]
    df = df[df[parameter] < vmax]
    setattr_on_dataframe(df, 'metadata', metadata)

    # if despike:
    #     kw = dict(n1=2, n2=20, block=6)
    #     df = despike(df, **kw)
    # new_df = df.resample(periods[period], how=method, kind='period')

    return df
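# Standalone illustration (not part of the filter) of the median +/- sigma*std
# band used above, on toy data: the 1000.0 spike falls outside the band
# (median 10.0, 3 * std ~ 939) and is dropped.
import pandas as pd

s = pd.Series([10.0] * 9 + [1000.0])
sigma = 3
vmin = s.median() - sigma * s.std()
vmax = s.median() + sigma * s.std()
kept = s[(s > vmin) & (s < vmax)]  # the 1000.0 value is filtered out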
def _run(self, df):
    """Convert the data column from the dataset's original units to ``self.to_units``."""
    if self.to_units is None:
        raise ValueError('to_units cannot be None')

    metadata = df.metadata
    if 'file_path' in metadata:
        del metadata['file_path']

    reg = unit_registry()
    from_units = metadata['unit']
    if '/' in from_units and '/' not in self.to_units:
        # keep the original time denominator (e.g. '/s') when the target
        # units only specify the numerator
        default_time = from_units[from_units.find('/'):]
        to_units = self.to_units + default_time
    else:
        to_units = self.to_units

    conversion = reg.convert(1, src=from_units, dst=to_units)
    df[df.columns[1]] = df[df.columns[1]] * conversion
    metadata.update({'unit': to_units})
    setattr_on_dataframe(df, 'metadata', metadata)
    return df
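# Assumption: ``unit_registry`` returns a (possibly customised) pint
# UnitRegistry; the conversion-factor lookup above then behaves like this.
import pint

reg = pint.UnitRegistry()
factor = reg.convert(1, src='ft**3/s', dst='m**3/s')  # ~0.0283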