Example #1
    def test_shared_dimensions(self):
        adcp_fn = 'deployment0000_RS03AXBS-LJ03A-10-ADCPTE301-streamed-adcp_velocity_beam.nc'
        adcp_sk = StreamKey('RS03AXBS', 'LJ03A', '10-ADCPTE301', 'streamed',
                            'adcp_velocity_beam')
        adcp_ds = xr.open_dataset(os.path.join(DATA_DIR, adcp_fn),
                                  decode_times=False)

        # grab the stream from preload
        stream = Stream.query.filter(
            Stream.name == 'adcp_velocity_beam').first()
        params = [p.name for p in stream.parameters if not p.is_function]

        # transform into row data suitable for to_xray_dataset
        rows = []

        for i in adcp_ds.obs.values:
            row = []
            for col in params:
                data = adcp_ds[col].values[i]
                if isinstance(data, np.ndarray) and data.shape:
                    if 'velocity_beam' in col:
                        data[np.isnan(data)] = -32768
                        data = data.astype('int64')
                    data = msgpack.packb(list(data))
                row.append(data)
            rows.append(row)

        # create the dataset
        ds = to_xray_dataset(params, rows, adcp_sk, None)
        # verify only two dimensions exist: bin and obs
        self.assertEqual(set(ds.dims), {'bin', 'obs'})
Example #3
def fetch_nth_data(stream_key, time_range, num_points=1000, location_metadata=None, request_id=None):
    """
    Given a time range, generate evenly spaced times over the specified interval. Fetch a single
    result from either side of each point in time.
    :param stream_key:
    :param time_range:
    :param num_points:
    :return:
    """
    cols = SessionManager.get_query_columns(stream_key.stream.name)

    if location_metadata is None:
        location_metadata, _, _ = get_location_metadata(stream_key, time_range)

    estimated_rate = location_metadata.particle_rate()
    estimated_particles = int(estimated_rate * time_range.secs())
    data_ratio = estimated_particles / num_points
    log.info("CASS: Estimated total number of points to be %d based on calculated mean rate of %f particles/s",
             estimated_particles, estimated_rate)
    # Fetch everything if the estimated size is close to the requested size
    if data_ratio < engine.app.config['UI_FULL_RETURN_RATIO']:
        log.info(
                "CASS: Estimated points (%d) / the requested  number (%d) is less than ratio %f.  Returning all points.",
                estimated_particles, num_points, engine.app.config['UI_FULL_RETURN_RATIO'])
        _, results = fetch_all_data(stream_key, time_range, location_metadata)
    # We have a small number of bins with data, so we can read them all
    elif estimated_particles < engine.app.config['UI_FULL_SAMPLE_LIMIT'] \
            and data_ratio < engine.app.config['UI_FULL_SAMPLE_RATIO']:
        log.info("CASS: Reading all (%d) bins and then sampling.", len(location_metadata.bin_list))
        _, results = sample_full_bins(stream_key, time_range, num_points, location_metadata.bin_list, cols)
    # We have a lot of bins so just grab the first from each of the bins
    elif len(location_metadata.bin_list) > num_points:
        log.info("CASS: More bins (%d) than requested points (%d). Selecting first particle from %d bins.",
                 len(location_metadata.bin_list), num_points, num_points)
        _, results = sample_n_bins(stream_key, time_range, num_points, location_metadata.bin_list, cols)
    else:
        log.info("CASS: Sampling %d points across %d bins.", num_points, len(location_metadata.bin_list))
        _, results = sample_n_points(stream_key, time_range, num_points, location_metadata.bin_list,
                                     location_metadata.bin_information, cols)

    # deduplicate rows before returning
    size = len(results)
    to_return = []
    uuids = set()
    uuid_index = cols.index('id')
    for row in results:
        my_uuid = row[uuid_index]
        if my_uuid in uuids:
            continue
        uuids.add(my_uuid)
        to_return.append(row)
    log.info("Removed %d duplicates from data", size - len(to_return))
    log.info("Returning %s rows from %s fetch", len(to_return), stream_key.as_refdes())
    return to_xray_dataset(cols, to_return, stream_key, request_id)
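The helper that actually generates the evenly spaced times mentioned in the docstring is not part of this excerpt; the snippet below is only an illustrative sketch of how such sample times could be produced, assuming plain start/stop timestamps in seconds.

# Illustrative sketch, not taken from the source code above.
import numpy as np

def example_sample_times(start, stop, num_points):
    # num_points timestamps spaced evenly across [start, stop], endpoints included
    return np.linspace(start, stop, num_points)

# example_sample_times(0.0, 3600.0, 5) -> array([0., 900., 1800., 2700., 3600.])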
Example #5
def get_full_cass_dataset(stream_key,
                          time_range,
                          location_metadata=None,
                          request_id=None,
                          keep_exclusions=False):
    cols, rows = fetch_all_data(stream_key, time_range, location_metadata)
    return to_xray_dataset(cols,
                           rows,
                           stream_key,
                           request_id,
                           keep_exclusions=keep_exclusions)
Example #6
def get_cass_lookback_dataset(stream_key, start_time, data_bin, deployments, request_id):
    # fetch the first n rows before start_time to ensure we get a row for each needed deployment
    cols, rows = fetch_with_func(query_n_before, stream_key,
                                 [(data_bin, start_time, engine.app.config['LOOKBACK_QUERY_LIMIT'])])
    needed = set(deployments)
    dep_idx = cols.index('deployment')
    ret_rows = []
    for r in rows:
        if r[dep_idx] in needed:
            ret_rows.append(r)
            needed.remove(r[dep_idx])
    return to_xray_dataset(cols, ret_rows, stream_key, request_id)
Example #8
def get_cass_lookforward_dataset(stream_key, end_time, data_bin,
                                 deployment_stop_time, request_id):
    # try to fetch the first n times after the request end time
    cols, rows = fetch_with_func(
        query_n_after, stream_key,
        [(data_bin, end_time, engine.app.config['LOOKBACK_QUERY_LIMIT'])])
    # Only return data gathered before the end of the last deployment
    # within the time range of this request
    time_idx = cols.index('time')
    ret_rows = []
    for r in rows:
        if r[time_idx] < deployment_stop_time:
            ret_rows.append(r)
    return to_xray_dataset(cols, ret_rows, stream_key, request_id)
Example #9
def offload_bin(stream, data_bin, san_dir_string, request_id):
    # get the data and drop duplicates
    cols, data = fetch_bin(stream, data_bin)
    dataset = to_xray_dataset(cols, data, stream, request_id, san=True)
    nc_directory = san_dir_string.format(data_bin)
    if not os.path.exists(nc_directory):
        os.makedirs(nc_directory)
    for deployment, deployment_ds in dataset.groupby('deployment'):
        # get a file name and create deployment directory if needed
        nc_file_name = get_nc_filename(stream, nc_directory, deployment)
        log.info('Offloading %s deployment %d to %s - There are %d particles', str(stream),
                 deployment, nc_file_name, len(deployment_ds['index']))
        # create netCDF file
        deployment_ds.to_netcdf(path=nc_file_name)
    return True, ''
Example #11
    def test_no_int64(self):
        echo_fn = 'echo_sounding.nc'
        echo_sk = StreamKey('RS01SLBS', 'LJ01A', '05-HPIESA101', 'streamed', 'echo_sounding')
        echo_ds = xr.open_dataset(os.path.join(DATA_DIR, echo_fn), decode_times=False)

        # turn the dataset back into a dataframe, then into rows
        echo_df = echo_ds.to_dataframe()
        cols = echo_df.columns
        rows = list(echo_df.itertuples(index=False))

        ds = to_xray_dataset(cols, rows, echo_sk, None)

        # first, verify there were 64-bit vars in the original dataset
        found = self.find_int64_vars(echo_ds)
        self.assertNotEqual(found, set())

        # second, verify there are no 64-bit vars in the output dataset
        found = self.find_int64_vars(ds)
        self.assertEqual(found, set())
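The find_int64_vars helper referenced by this test is not included in the listing; the sketch below is a minimal, assumed version of such a helper (in the test it is invoked as a method on the test class) that returns the names of variables stored as 64-bit integers.

# Assumed sketch of the helper used above; the real implementation is not shown in this listing.
import numpy as np

def find_int64_vars(ds):
    # collect the names of dataset variables whose dtype is int64
    return {name for name, var in ds.data_vars.items() if var.dtype == np.int64}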
Example #13
def get_full_cass_dataset(stream_key, time_range, location_metadata=None, request_id=None):
    cols, rows = fetch_all_data(stream_key, time_range, location_metadata)
    return to_xray_dataset(cols, rows, stream_key, request_id)