def _run_tool(self):
    dataset = self.dataset

    input_ts = load_plugins('io', 'timeseries-hdf5')['timeseries-hdf5']
    orig_metadata = get_metadata(dataset)[dataset]
    parameter = orig_metadata['parameter']
    if orig_metadata['file_path'] is None:
        raise IOError('No data file available for this dataset')

    df = input_ts.read(orig_metadata['file_path'])

    # apply transformation: rank values and compute percent exceedance in place
    metadata = df.metadata
    if 'file_path' in metadata:
        del metadata['file_path']
    df.sort_values([parameter], ascending=False, na_position='last', inplace=True)
    # ties share the lowest rank of their group (method='min')
    df['Rank'] = df[parameter].rank(method='min', ascending=False)
    df.dropna(inplace=True)
    # Weibull plotting position: percent of time a value is equaled or exceeded
    df['Percent Exceeded'] = (df['Rank'] / (df[parameter].count() + 1)) * 100
    df.index = df['Percent Exceeded']
    # re-attach the metadata attribute, which pandas operations do not preserve
    setattr_on_dataframe(df, 'metadata', metadata)
    new_df = df

    # setup new dataset
    new_metadata = {
        'parameter': new_df.metadata.get('parameter'),
        'datatype': orig_metadata['datatype'],
        'options': self.set_options,
        'file_format': orig_metadata['file_format'],
        'unit': new_df.metadata.get('unit'),
    }
    new_dset, file_path, catalog_entry = self._create_new_dataset(
        old_dataset=dataset,
        ext='.h5',
        dataset_metadata=new_metadata,
    )

    # save dataframe
    output = load_plugins('io', 'xy-hdf5')['xy-hdf5']
    output.write(file_path, new_df, new_metadata)

    return {'datasets': new_dset, 'catalog_entries': catalog_entry}
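# A minimal, standalone sketch of the exceedance-probability calculation done
# in _run_tool above, on toy data (the column name 'streamflow' is
# illustrative, not part of the plugin API). Rank / (n + 1) * 100 is the
# Weibull plotting position used above.
import pandas as pd

toy = pd.DataFrame({'streamflow': [12.0, 3.5, 7.2, 7.2, 1.1]})
toy = toy.sort_values(['streamflow'], ascending=False, na_position='last')
toy['Rank'] = toy['streamflow'].rank(method='min', ascending=False)
toy['Percent Exceeded'] = toy['Rank'] / (toy['streamflow'].count() + 1) * 100
print(toy)
#    streamflow  Rank  Percent Exceeded
# 0        12.0   1.0         16.666667
# 2         7.2   2.0         33.333333
# 3         7.2   2.0         33.333333
# 1         3.5   4.0         66.666667
# 4         1.1   5.0         83.333333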
def _run_tool(self):
    dataset = self.dataset

    io = load_plugins('io', 'timeseries-hdf5')['timeseries-hdf5']
    orig_metadata = get_metadata(dataset)[dataset]
    if orig_metadata['file_path'] is None:
        raise IOError('No data file available for this dataset')
    df = io.read(orig_metadata['file_path'])

    # run filter
    new_df = self._run(df)

    # setup new dataset
    new_metadata = {
        'parameter': new_df.metadata.get('parameter'),
        'unit': new_df.metadata.get('unit'),
        'datatype': orig_metadata['datatype'],
        'file_format': orig_metadata['file_format'],
    }
    new_dset, file_path, catalog_entry = self._create_new_dataset(
        old_dataset=dataset,
        ext='.h5',
        dataset_metadata=new_metadata,
    )

    # save dataframe
    io.write(file_path, new_df, new_metadata)

    return {'datasets': new_dset}
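# Hedged sketch: _run_tool above is a template method, so concrete filters
# only override _run. The names below (SmoothFilter, FilterBase) and the
# rolling-mean choice are hypothetical, not part of the actual plugin API;
# df is assumed to carry the 'metadata' attribute attached by the
# timeseries-hdf5 reader.
class SmoothFilter(FilterBase):
    def _run(self, df):
        parameter = df.metadata['parameter']
        # smooth the parameter column in place so the metadata attribute
        # attached to df is preserved
        df[parameter] = df[parameter].rolling(window=3, min_periods=1).mean()
        return df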
def download(self, catalog_id, file_path, dataset, **kwargs):
    p = param.ParamOverrides(self, kwargs)
    self.parameter = p.parameter
    self.end = pd.to_datetime(p.end)
    self.start = pd.to_datetime(p.start)
    self._catalog_id = catalog_id

    if dataset is None:
        dataset = 'station-' + catalog_id

    try:
        url = self.url
        logger.info('downloading data from %s', url)
        data = pd.read_csv(url)
        if data.empty:
            raise ValueError('No Data Available')

        # column headers look like 'name (unit)'; split them into clean
        # column names plus a name -> unit lookup table
        rename = {x: x.split()[0] for x in data.columns.tolist()}
        units = {x.split()[0]: x.split()[-1].strip('()').lower()
                 for x in data.columns.tolist()}
        data.rename(columns=rename, inplace=True)
        data = data.set_index('time')
        data.index = pd.to_datetime(data.index)
        data.rename(columns={self.parameter_code: self.parameter}, inplace=True)

        file_path = os.path.join(file_path, self.BASE_PATH, self.service_name,
                                 dataset, '{0}.h5'.format(dataset))
        metadata = {
            'file_path': file_path,
            'file_format': 'timeseries-hdf5',
            'datatype': 'timeseries',
            'parameter': p.parameter,
            'unit': units[self.parameter_code],
            'service_id': 'svc://noaa:{}/{}'.format(self.service_name, catalog_id)
        }

        # save data to disk
        io = load_plugins('io', 'timeseries-hdf5')['timeseries-hdf5']
        io.write(file_path, data, metadata)
        del metadata['service_id']
        return metadata
    except HTTPError as error:
        if error.code == 500:
            raise ValueError('No Data Available')
        elif error.code == 400:
            raise ValueError('Bad Request')
        else:
            raise
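# Quick illustration of the header parsing above; the header string is
# hypothetical, but the NOAA CSV columns follow this 'name (unit)' shape:
col = 'water_level (m)'
name = col.split()[0]                         # 'water_level'
unit = col.split()[-1].strip('()').lower()    # 'm'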
def download(self, catalog_id, file_path, dataset, **kwargs):
    p = param.ParamOverrides(self, kwargs)
    self.parameter = p.parameter
    self.end = pd.to_datetime(p.end)
    self.start = pd.to_datetime(p.start)
    self._catalog_entry = catalog_id

    if dataset is None:
        dataset = 'station-' + catalog_id

    file_path = os.path.join(file_path, BASE_PATH, self.service_name,
                             dataset, '{0}.h5'.format(dataset))
    metadata = {
        'file_path': file_path,
        'file_format': 'timeseries-hdf5',
        'datatype': DataType.TIMESERIES,
        'parameter': self.parameter,
        'unit': self._unit_map[self.parameter],
        'service_id': 'svc://ncdc:{}/{}'.format(self.service_name, catalog_id)
    }

    # save data to disk
    io = load_plugins('io', 'timeseries-hdf5')['timeseries-hdf5']
    io.write(file_path, self.data, metadata)
    del metadata['service_id']
    return metadata
def download(self, catalog_id, file_path, dataset, **kwargs):
    p = param.ParamOverrides(self, kwargs)
    parameter = p.parameter
    start = p.start
    end = p.end
    period = p.period

    if dataset is None:
        dataset = 'station-' + catalog_id

    # an explicit date range takes precedence over a relative period
    if start and end:
        period = None

    pmap = self.parameter_map(invert=True)
    parameter_code, statistic_code = (pmap[parameter].split(':') + [None])[:2]

    data = nwis.get_site_data(catalog_id,
                              parameter_code=parameter_code,
                              statistic_code=statistic_code,
                              start=start, end=end, period=period,
                              service=self.service_name)

    # the dict contains only one key since only one parameter/statistic was
    # downloaded; this would need to change if multiple parameter/stat
    # combinations were downloaded together
    if not data:
        raise ValueError('No Data Available')
    data = list(data.values())[0]

    # convert to dataframe and clean up bad data
    df = pd.DataFrame(data['values'])
    if df.empty:
        raise ValueError('No Data Available')
    df = df.set_index('datetime')
    df.value = df.value.astype(float)

    # daily statistics (00001=max, 00002=min, 00003=mean) use a daily
    # period index; instantaneous values keep full timestamps
    if statistic_code in ['00001', '00002', '00003']:
        df.index = pd.to_datetime(df.index).to_period('D')
    else:
        df.index = pd.to_datetime(df.index)  # this is in UTC

    # flag the NWIS no-data sentinel as missing (assumes numpy is imported
    # as np at module level)
    df[df.values == -999999] = np.nan
    df.rename(columns={'value': parameter}, inplace=True)

    file_path = os.path.join(file_path, BASE_PATH, self.service_name,
                             dataset, '{0}.h5'.format(dataset))
    del data['values']
    metadata = {
        'name': dataset,
        'metadata': data,
        'file_path': file_path,
        'file_format': 'timeseries-hdf5',
        'datatype': 'timeseries',
        'parameter': parameter,
        'unit': data['variable']['units']['code'],
        'service_id': 'svc://usgs-nwis:{}/{}'.format(self.service_name, catalog_id)
    }

    # save data to disk
    io = load_plugins('io', 'timeseries-hdf5')['timeseries-hdf5']
    io.write(file_path, df, metadata)
    del metadata['service_id']
    return metadata
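# The split idiom above accepts parameter keys with or without a statistic
# suffix; a quick illustration (the codes are common USGS examples, not
# pulled from this service's parameter_map):
for key in ['00060', '00060:00003']:
    parameter_code, statistic_code = (key.split(':') + [None])[:2]
    print(parameter_code, statistic_code)
# -> 00060 None
# -> 00060 00003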