def test_get_array(request, nctuple, polygon):
    """Smoke-test get_array: fast, returns a masked array, plausible values."""
    nc, fname, var = nctuple
    t0 = time()
    x = get_array(nc, fname, 0, polygon, var)
    t = time() - t0
    print(t)
    # Wall-clock budget; NOTE(review): timing asserts are inherently flaky.
    assert t < .030
    # isinstance is the idiomatic type check; type(x) == MaskedArray would
    # reject MaskedArray subclasses that masked operations may return.
    assert isinstance(x, MaskedArray)
    assert hasattr(x, 'mask')
    # Either real data (positive mean) or a fully-masked (empty) selection.
    assert np.mean(x) > 0 or np.all(x.mask)
def test_get_array(request, nctuple, polygon):
    """Smoke-test get_array: fast, returns a masked array, plausible values."""
    nc, fname, var = nctuple
    t0 = time()
    x = get_array(nc, fname, 0, polygon, var)
    t = time() - t0
    print(t)
    # Wall-clock budget; NOTE(review): timing asserts are inherently flaky.
    assert t < 0.1
    # isinstance is the idiomatic type check; type(x) == MaskedArray would
    # reject MaskedArray subclasses that masked operations may return.
    assert isinstance(x, MaskedArray)
    assert hasattr(x, "mask")
    # Either real data (positive mean) or a fully-masked (empty) selection.
    assert np.mean(x) > 0 or np.all(x.mask)
def get_spatially_averaged_data(data_file, time_idx, is_thredds):
    """
    From the NetCDF data file pointed at by `data_file`, get the spatial
    average over the area specified by `area` of the data for variable
    `variable` at time index `time_idx`.

    NOTE(review): `area` and `variable` are not parameters; they are
    resolved from the enclosing scope — confirm where they are bound.

    :param data_file (modelmeta.DataFile): source data file
    :param time_idx (int): index of time of interest
    :param is_thredds (bool): whether data target is on thredds server
    :return: float
    """
    # Flags arriving via a query string are strings; normalize to bool.
    if isinstance(is_thredds, str):
        is_thredds = strtobool(is_thredds)

    # THREDDS targets are addressed by prefixing the stored filename with
    # the server root taken from the environment.
    data_filename = (
        os.getenv("THREDDS_URL_ROOT") + data_file.filename
        if is_thredds
        else data_file.filename
    )

    with open_nc(data_filename) as nc:
        masked = get_array(nc, data_filename, time_idx, area, variable)
        return np.mean(masked).item()
def getdata(file_, time_idx):
    """Spatially average data at `time_idx` in `file_` and return a float.

    NOTE(review): `area` and `variable` are resolved from the enclosing
    scope, not passed as parameters — confirm where they are bound.

    :param file_ (modelmeta.DataFile): source data file
    :param time_idx (int): index of time of interest
    :return: float
    """
    with open_nc(file_.filename) as nc:
        a = get_array(nc, file_.filename, time_idx, area, variable)
        # np.asscalar was deprecated in NumPy 1.16 and removed in 1.23;
        # ndarray.item() is the supported equivalent.
        return np.mean(a).item()
def stats(
    sesh,
    id_,
    time,
    area,
    variable,
    is_thredds=False,
):
    """Request and calculate summary statistics averaged across a region

    For performing regional analysis, one typically wants to summarize
    statistical information across a region. This call allows one to
    request either a single timestep (or an average across all
    timesteps), and averaged across all cells within the given region.

    The stats call may only be called for a single data file and single
    variable per invocation.

    Args:
        sesh (sqlalchemy.orm.session.Session): A database Session object
        id_ (str): Unique id which is a key to the data file requested
        time (int): Timestep index (0-based) representing the time of year;
            0-11 for monthly, 0-3 for seasonal, 0 for annual datasets.
        area (str): WKT polygon of selected area
        variable (str): Short name of the variable to be returned
        is_thredds (bool): If set to `True` the filepath will be searched
            for on THREDDS server. This flag is not needed when running
            the backend as a server as the files are accessed over the
            web.

    Returns:
        dict: Empty dictionary if model_id is not found in the database.

        Otherwise, returns a single dict with the key of the file's
        unique_id and the value consisting of a nested dictionary with the
        following attributes: 'mean', 'stdev', 'min', 'max', 'median',
        'ncells', 'units', 'time'.

        For example ::

            {'file0':
                {
                    'mean': 303.97227647569446,
                    'stdev': 8.428096450998078,
                    'min': 288.71807861328125,
                    'max': 318.9695739746094,
                    'median': 301.61065673828125,
                    'ncells': 72,
                    'units': 'K',
                    'time': datetime.datetime(1985, 6, 30, 12, 0, 0),
                    'modtime': datetime.datetime(2010, 1, 1, 17, 30, 4)
                }
            }

        There are two semi-error cases which should be mentioned, when the
        filesystem is out of sync with the database.

        1. The file pointed to by `id_` does not exist in the filesystem
        2. The requested variable does not exist in the given file

        In these cases, the numerical values will all be NaN, and the
        results dict will be missing the 'units' and 'time' keys.

    Raises:
        Exception: If `time` parameter cannot be converted to an integer

    """
    # Validate arguments. `time` may arrive as a string from the query
    # string. Treat None/empty as "average across all timesteps"; a
    # simple truthiness test would wrongly discard the valid index 0.
    if time is None or time == "":
        time = None
    else:
        try:
            time = int(time)
        except ValueError:
            raise Exception(
                'time parameter "{}" not convertable to an integer.'.format(
                    time))

    if isinstance(is_thredds, str):
        is_thredds = strtobool(is_thredds)

    try:
        df = sesh.query(DataFile).filter(DataFile.unique_id == id_).one()
        resource = df.filename if not is_thredds else apply_thredds_root(
            df.filename)
    except NoResultFound:
        return {}

    try:
        with open_nc(resource) as nc:
            array = get_array(nc, resource, time, area, variable)
            units = get_units_from_netcdf_file(nc, variable)
    except Exception as e:
        # Filesystem/database mismatch (missing file or variable):
        # fall back to the all-NaN stats payload rather than erroring.
        log.error(e)
        return {id_: na_array_stats}

    stats = array_stats(array)

    query = sesh.query(Time.timestep).filter(Time.time_set_id == df.timeset.id)
    if time is not None:
        # Query.filter returns a NEW Query; the previous code discarded
        # the result, so the time restriction was silently never applied.
        query = query.filter(Time.time_idx == time)
    timevals = [t for t, in query.all()]
    timeval = mean_datetime(timevals)

    stats.update({"units": units, "time": timeval, "modtime": df.index_time})
    return {id_: stats}
def stats(sesh, id_, time, area, variable):
    '''Request and calculate summary statistics averaged across a region

    For performing regional analysis, one typically wants to summarize
    statistical information across a region. This call allows one to
    request either a single timestep (or an average across all
    timesteps), and averaged across all cells within the given region.

    The stats call may only be called for a single data file and single
    variable per invocation.

    Args:
        sesh (sqlalchemy.orm.session.Session): A database Session object
        id_ (str): Unique id which is a key to the data file requested
        time (int): Timestep integer (1-17) representing the time of year
        area (str): WKT polygon of selected area
        variable (str): Short name of the variable to be returned

    Returns:
        dict: Empty dictionary if model_id is not found in the database.

        Otherwise, returns a single dict with the key of the file's
        unique_id and the value consisting of a nested dictionary with the
        following attributes: 'mean', 'stdev', 'min', 'max', 'median',
        'ncells', 'units', 'time'.

        For example ::

            {'file0':
                {
                    'mean': 303.97227647569446,
                    'stdev': 8.428096450998078,
                    'min': 288.71807861328125,
                    'max': 318.9695739746094,
                    'median': 301.61065673828125,
                    'ncells': 72,
                    'units': 'K',
                    'time': '1985-06-30T12:00:00Z'
                }
            }

        There are two semi-error cases which should be mentioned, when the
        filesystem is out of sync with the database.

        1. The file pointed to by `id_` does not exist in the filesystem
        2. The requested variable does not exist in the given file

        In these cases, the numerical values will all be NaN, and the
        results dict will be missing the 'units' and 'time' keys.

    Raises:
        None?

    '''
    try:
        df = sesh.query(DataFile).filter(DataFile.unique_id == id_).one()
        fname = df.filename
    except NoResultFound:
        return {}

    try:
        with open_nc(fname) as nc:
            array = get_array(nc, fname, time, area, variable)
            units = get_units_from_netcdf_file(nc, variable)
    except Exception as e:
        # Filesystem/database mismatch (missing file or variable):
        # fall back to the all-NaN stats payload rather than erroring.
        log.error(e)
        return {id_: na_array_stats}

    stats = array_stats(array)

    query = sesh.query(Time.timestep).filter(Time.time_set_id == df.timeset.id)
    # `if time:` would wrongly skip the restriction for a valid index of 0;
    # also, Query.filter returns a NEW Query — the previous code discarded
    # it, so the time restriction was silently never applied.
    if time is not None:
        query = query.filter(Time.time_idx == time)
    timevals = [ t for t, in query.all() ]
    timeval = mean_datetime(timevals)

    stats.update({
        'units': units,
        'time': timeval.strftime('%Y-%m-%dT%H:%M:%SZ')
    })
    return {id_: stats}
def timeseries(sesh, id_, area, variable):
    """Delegate for performing data lookups within a single file

    Opens the data file specified by the id_ parameter and returns the
    data values at each timestep in the file.

    Args:
        sesh (sqlalchemy.orm.session.Session): A database Session object
        id_ (str): Unique id which is a key to the data file requested
        area (str): WKT polygon of selected area
        variable (str): Short name of the variable to be returned

    Returns:
        dict: Empty dictionary if id_ is not found in the database.

        Otherwise returns a single dict with keys `id`, `units` and
        `data`. The value for `data` is another dictionary where keys
        correspond to the time values (formatted as
        '%Y-%m-%dT%H:%M:%SZ') and values correspond to the data values
        themselves. The value for `id` is the unique_id for the file and
        the value for `units` is the unit string of the data values.

        For example::

            {
                'id': 'tmax_monClim_PRISM_historical_run1_198101-201012',
                'units': 'degC',
                'data':
                {
                    '1985-01-15T00:00:00Z': 1.5,
                    '1985-02-15T00:00:00Z': 2.5,
                    '1985-03-15T00:00:00Z': 5.5,
                    '1985-04-15T00:00:00Z': 10.2,
                    ...
                    '1985-12-15T00:00:00Z': 2.5,
                }
            }

    Raises:
        None?

    """
    try:
        file_ = sesh.query(DataFile).filter(DataFile.unique_id == id_).one()
    except NoResultFound:
        return {}

    # Get all time indexes for this file, ordered by index so the output
    # dict iterates chronologically.
    ti = [(time.timestep, time.time_idx) for time in file_.timeset.times]
    ti.sort(key=lambda x: x[1])

    with open_nc(file_.filename) as nc:
        data = OrderedDict(
            [
                (
                    timeval.strftime("%Y-%m-%dT%H:%M:%SZ"),
                    # np.asscalar was deprecated in NumPy 1.16 and removed
                    # in 1.23; ndarray.item() is the supported equivalent.
                    np.mean(
                        get_array(nc, file_.filename, idx, area, variable)
                    ).item(),
                )
                for timeval, idx in ti
            ]
        )
        units = get_units_from_netcdf_file(nc, variable)
    return {"id": id_, "data": data, "units": units}
def timeseries(sesh, id_, area, variable):
    """Delegate for performing data lookups within a single file

    Opens the data file specified by the id_ parameter and returns the
    data values at each timestep in the file.

    Args:
        sesh (sqlalchemy.orm.session.Session): A database Session object
        id_ (str): Unique id which is a key to the data file requested
        area (str): WKT polygon of selected area
        variable (str): Short name of the variable to be returned

    Returns:
        dict: Empty dictionary if id_ is not found in the database.

        Otherwise returns a single dict with keys `id`, `units`, `data`
        and `modtime`. The value for `data` is another dictionary where
        keys correspond to the time values (formatted as
        '%Y-%m-%dT%H:%M:%SZ') and values correspond to the data values
        themselves. The value for `id` is the unique_id for the file,
        the value for `units` is the unit string of the data values, and
        the value for `modtime` is the file's index time.

        For example::

            {
                'id': 'tmax_monClim_PRISM_historical_run1_198101-201012',
                'units': 'degC',
                'data':
                {
                    '1985-01-15T00:00:00Z': 1.5,
                    '1985-02-15T00:00:00Z': 2.5,
                    '1985-03-15T00:00:00Z': 5.5,
                    '1985-04-15T00:00:00Z': 10.2,
                    ...
                    '1985-12-15T00:00:00Z': 2.5,
                },
                'modtime': '2010-01-01T17:30:04Z'
            }

    Raises:
        None?

    """
    try:
        data_file = (
            sesh.query(DataFile).filter(DataFile.unique_id == id_).one()
        )
    except NoResultFound:
        return {}

    # Pair each timestep value with its index, ordered by index so the
    # output dict iterates chronologically.
    time_pairs = sorted(
        ((t.timestep, t.time_idx) for t in data_file.timeset.times),
        key=lambda pair: pair[1],
    )

    with open_nc(data_file.filename) as nc:
        data = OrderedDict()
        for timeval, idx in time_pairs:
            key = timeval.strftime("%Y-%m-%dT%H:%M:%SZ")
            cell_mean = np.mean(
                get_array(nc, data_file.filename, idx, area, variable)
            )
            data[key] = cell_mean.item()
        units = get_units_from_netcdf_file(nc, variable)

    return {
        "id": id_,
        "data": data,
        "units": units,
        "modtime": data_file.index_time,
    }