def test_shared_dimensions(self):
    adcp_fn = 'deployment0000_RS03AXBS-LJ03A-10-ADCPTE301-streamed-adcp_velocity_beam.nc'
    adcp_sk = StreamKey('RS03AXBS', 'LJ03A', '10-ADCPTE301', 'streamed', 'adcp_velocity_beam')
    adcp_ds = xr.open_dataset(os.path.join(DATA_DIR, adcp_fn), decode_times=False)

    # grab the stream from preload
    stream = Stream.query.filter(Stream.name == 'adcp_velocity_beam').first()
    params = [p.name for p in stream.parameters if not p.is_function]

    # transform into row data suitable for to_xray_dataset
    rows = []
    for i in adcp_ds.obs.values:
        row = []
        for col in params:
            data = adcp_ds[col].values[i]
            if isinstance(data, np.ndarray) and data.shape:
                if 'velocity_beam' in col:
                    data[np.isnan(data)] = -32768
                    data = data.astype('int64')
                data = msgpack.packb(list(data))
            row.append(data)
        rows.append(row)

    # create the dataset
    ds = to_xray_dataset(params, rows, adcp_sk, None)

    # verify only two dimensions exist: bin and obs
    self.assertEqual(set(ds.dims), {'bin', 'obs'})
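# Small illustration of the array-cell encoding used in the test above:
# multi-valued parameters are NaN-filled, cast to int64, and msgpack-encoded
# before being handed to to_xray_dataset. The sample values are hypothetical.
import msgpack
import numpy as np

_cell = np.array([1.5, np.nan, 2.5])
_cell[np.isnan(_cell)] = -32768                      # same fill value as the test
_packed = msgpack.packb([int(v) for v in _cell.astype('int64')])
assert msgpack.unpackb(_packed) == [1, -32768, 2]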
def fetch_nth_data(stream_key, time_range, num_points=1000, location_metadata=None, request_id=None):
    """
    Given a time range, generate evenly spaced times over the specified interval.
    Fetch a single result from either side of each point in time.
    :param stream_key: stream to query
    :param time_range: time range to cover
    :param num_points: number of evenly spaced sample points
    :return: xarray dataset of the sampled, deduplicated rows
    """
    cols = SessionManager.get_query_columns(stream_key.stream.name)

    if location_metadata is None:
        location_metadata, _, _ = get_location_metadata(stream_key, time_range)

    estimated_rate = location_metadata.particle_rate()
    estimated_particles = int(estimated_rate * time_range.secs())
    data_ratio = estimated_particles / num_points
    log.info("CASS: Estimated total number of points to be %d based on calculated mean rate of %f particles/s",
             estimated_particles, estimated_rate)
    # Fetch it all if it's going to be close to the same size
    if data_ratio < engine.app.config['UI_FULL_RETURN_RATIO']:
        log.info("CASS: Estimated points (%d) / the requested number (%d) is less than ratio %f. Returning all points.",
                 estimated_particles, num_points, engine.app.config['UI_FULL_RETURN_RATIO'])
        _, results = fetch_all_data(stream_key, time_range, location_metadata)
    # We have a small number of bins with data, so we can read them all
    elif estimated_particles < engine.app.config['UI_FULL_SAMPLE_LIMIT'] \
            and data_ratio < engine.app.config['UI_FULL_SAMPLE_RATIO']:
        log.info("CASS: Reading all (%d) bins and then sampling.", len(location_metadata.bin_list))
        _, results = sample_full_bins(stream_key, time_range, num_points, location_metadata.bin_list, cols)
    # We have more bins than requested points, so just grab the first particle from each bin
    elif len(location_metadata.bin_list) > num_points:
        log.info("CASS: More bins (%d) than requested points (%d). Selecting first particle from %d bins.",
                 len(location_metadata.bin_list), num_points, num_points)
        _, results = sample_n_bins(stream_key, time_range, num_points, location_metadata.bin_list, cols)
    else:
        log.info("CASS: Sampling %d points across %d bins.", num_points, len(location_metadata.bin_list))
        _, results = sample_n_points(stream_key, time_range, num_points, location_metadata.bin_list,
                                     location_metadata.bin_information, cols)

    # dedup rows before returning, preserving the original order
    size = len(results)
    to_return = []
    uuids = set()
    uuid_index = cols.index('id')
    for row in results:
        my_uuid = row[uuid_index]
        if my_uuid in uuids:
            continue
        uuids.add(my_uuid)
        to_return.append(row)
    log.info("Removed %d duplicates from data", size - len(to_return))

    log.info("Returning %s rows from %s fetch", len(to_return), stream_key.as_refdes())
    return to_xray_dataset(cols, to_return, stream_key, request_id)
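# A standalone sketch of the order-preserving dedup pass used at the end of
# fetch_nth_data above; the column layout and rows below are hypothetical.
def _dedup_rows(cols, results):
    uuid_index = cols.index('id')
    seen = set()
    deduped = []
    for row in results:
        if row[uuid_index] in seen:
            continue
        seen.add(row[uuid_index])
        deduped.append(row)
    return deduped

# the second ('a', ...) row is dropped; order is otherwise preserved
assert _dedup_rows(['id', 'time'], [('a', 1), ('b', 2), ('a', 3)]) == [('a', 1), ('b', 2)]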
def get_full_cass_dataset(stream_key, time_range, location_metadata=None, request_id=None, keep_exclusions=False):
    cols, rows = fetch_all_data(stream_key, time_range, location_metadata)
    return to_xray_dataset(cols, rows, stream_key, request_id, keep_exclusions=keep_exclusions)
def get_cass_lookback_dataset(stream_key, start_time, data_bin, deployments, request_id):
    # try to fetch the first n times to ensure we get a deployment value in there
    cols, rows = fetch_with_func(query_n_before, stream_key,
                                 [(data_bin, start_time, engine.app.config['LOOKBACK_QUERY_LIMIT'])])
    needed = set(deployments)
    dep_idx = cols.index('deployment')
    ret_rows = []
    for r in rows:
        if r[dep_idx] in needed:
            ret_rows.append(r)
            needed.remove(r[dep_idx])
    return to_xray_dataset(cols, ret_rows, stream_key, request_id)
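# Minimal sketch of the selection above: keep only the first row seen for each
# deployment that is still needed. Rows and column layout are hypothetical.
def _first_row_per_deployment(cols, rows, deployments):
    needed = set(deployments)
    dep_idx = cols.index('deployment')
    kept = []
    for r in rows:
        if r[dep_idx] in needed:
            kept.append(r)
            needed.remove(r[dep_idx])
    return kept

# deployment 3 appears twice; only its first row survives
assert _first_row_per_deployment(['deployment'], [(3,), (3,), (4,)], [3, 4]) == [(3,), (4,)]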
def get_cass_lookforward_dataset(stream_key, end_time, data_bin, deployment_stop_time, request_id):
    # try to fetch the first n times after the request end time
    cols, rows = fetch_with_func(query_n_after, stream_key,
                                 [(data_bin, end_time, engine.app.config['LOOKBACK_QUERY_LIMIT'])])
    # Only return data gathered before the end of the last deployment
    # within the time range of this request
    time_idx = cols.index('time')
    ret_rows = []
    for r in rows:
        if r[time_idx] < deployment_stop_time:
            ret_rows.append(r)
    return to_xray_dataset(cols, ret_rows, stream_key, request_id)
def offload_bin(stream, data_bin, san_dir_string, request_id):
    # get the data and drop duplicates
    cols, data = fetch_bin(stream, data_bin)
    dataset = to_xray_dataset(cols, data, stream, request_id, san=True)
    nc_directory = san_dir_string.format(data_bin)
    if not os.path.exists(nc_directory):
        os.makedirs(nc_directory)
    for deployment, deployment_ds in dataset.groupby('deployment'):
        # get a file name and create the deployment directory if needed
        nc_file_name = get_nc_filename(stream, nc_directory, deployment)
        log.info('Offloading %s deployment %d to %s - There are %d particles',
                 str(stream), deployment, nc_file_name, len(deployment_ds['index']))
        # create netCDF file
        deployment_ds.to_netcdf(path=nc_file_name)
    return True, ''
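# Small illustration of the groupby('deployment') split above, on a
# hypothetical in-memory dataset: each distinct deployment value yields its
# own sub-dataset, which offload_bin writes out as a separate netCDF file.
import numpy as np
import xarray as xr

_demo = xr.Dataset({'deployment': ('obs', np.array([1, 1, 2])),
                    'value': ('obs', np.array([10.0, 11.0, 12.0]))})
for _deployment, _sub in _demo.groupby('deployment'):
    print(_deployment, _sub.sizes['obs'])  # prints: 1 2, then 2 1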
def test_no_int64(self):
    echo_fn = 'echo_sounding.nc'
    echo_sk = StreamKey('RS01SLBS', 'LJ01A', '05-HPIESA101', 'streamed', 'echo_sounding')
    echo_ds = xr.open_dataset(os.path.join(DATA_DIR, echo_fn), decode_times=False)

    # turn the dataset back into a dataframe, then into rows
    echo_df = echo_ds.to_dataframe()
    cols = echo_df.columns
    rows = list(echo_df.itertuples(index=False))
    ds = to_xray_dataset(cols, rows, echo_sk, None)

    # first, verify there were 64-bit vars in the original dataset
    found = self.find_int64_vars(echo_ds)
    self.assertNotEqual(found, set())

    # second, verify there are no 64-bit vars in the output dataset
    found = self.find_int64_vars(ds)
    self.assertEqual(found, set())
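# find_int64_vars is referenced by the tests above but not shown in this
# section. A plausible sketch of such a helper (an assumption, not the
# project's actual implementation): collect the names of variables whose
# dtype is a 64-bit integer.
import numpy as np

def find_int64_vars_sketch(ds):
    return {name for name, var in ds.data_vars.items()
            if var.dtype in (np.dtype('int64'), np.dtype('uint64'))}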