    def get_start_time(self):
        """Read the first datetime from the database table.

        Raises:
            nc_exc.NoDataException
        """
        vname = "datetime"
        try:
            with self.conn as conn:
                with conn.cursor() as cur:
                    # datetimes in database are returned to python as timezone naive.
                    cur.execute(
                        "SELECT {} FROM {} FETCH FIRST 1 ROW ONLY;".format(
                            vname, self.table))
                    start_time = cur.fetchone()
                    if not start_time:
                        _logger.warning("%s: read %s: no data", conn, vname)
                        raise nc_exc.NoDataException("read {}".format(vname))
                    return pytz.utc.localize(start_time[0])
        except psycopg2.Error as exc:
            _logger.warning("%s: read %s: %s", conn, vname, exc)
            RAFDatabase.close_connection(conn)
            raise nc_exc.NoDataException("read {}: {}".format(vname, exc))
    def read_times(self,
                   start_time=pytz.utc.localize(datetime.min),
                   end_time=pytz.utc.localize(datetime.max)):
        """Read datetimes from the table within a range.

        Raises:
            nc_exc.NoDataException
        """
        start_time = start_time.replace(tzinfo=None)
        end_time = end_time.replace(tzinfo=None)

        # _logger.debug("read_times, table=%s", self.table)

        vname = "datetime"
        try:
            with self.conn as conn:
                with conn.cursor() as cur:
                    # datetimes in database are returned to python as timezone naive.
                    cur.execute(
                        "SELECT {} FROM {} WHERE {} >= %s AND {} < %s;".format(
                            vname, self.table, vname, vname),
                        (start_time, end_time))
                    return [pytz.utc.localize(x[0]).timestamp() for x in cur]
        except psycopg2.Error as exc:
            RAFDatabase.close_connection(conn)
            raise nc_exc.NoDataException("read {}: {}".format(vname, exc))
    def get_variables(self):
        """Fetch pertinent fields from the 'variable_list' table in the RAF
        database, such as the list of variable names, their units, and
        missing values.

        Raises:
            nc_exc.NoDataException
        """
        try:
            with self.conn as conn:
                with conn.cursor() as cur:
                    cur.execute(
                        "SELECT name, units, long_name, ndims, dims, "
                        "missing_value from variable_list;")
                    variables = {}
                    for var in cur:
                        dimnames = ["time"]
                        # make a bold assumption that a second dimension
                        # is a particle-probe bin number
                        if var[3] > 1:
                            dimnames.append("bin")
                        variables[var[0]] = {
                            "units": var[1],
                            "long_name": var[2],
                            "dimnames": dimnames,
                            "shape": var[4]
                        }
                    return variables
        except psycopg2.Error as exc:
            # psycopg.connections are thread safe
            RAFDatabase.close_connection(conn)
            raise nc_exc.NoDataException("No variables found: {}".format(exc))
    def __init__(self, database="real-time-GV",
                 user="******",
                 host="eol-rt-data.fl-ext.ucar.edu",
                 port=5432,
                 password=None,
                 table="raf_lrt"):
        """Construct an instance of RAF database connection.

        Args:
            database, user, host, port, password: Usual parameters
                needed to create a PostgreSQL connection.
            table: name of table in the database which contains the
                time-series data to be read.

        Raises:
            nc_exc.NoDataException
        """
        try:
            self.conn = RAFDatabase.get_connection(
                database=database,
                user=user,
                host=host,
                port=port,
                password=password)
            self.database = database
            self.user = user
            self.host = host
            self.port = port
            self.password = password
            self.table = table
        except psycopg2.Error as exc:
            raise nc_exc.NoDataException(
                "Database not available: {}".format(exc))
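    # A hedged usage sketch (not part of the original module): one way a caller
    # might exercise this class, assuming the RAFDatabase methods shown above.
    # The credentials are placeholders; the redacted user name is intentionally
    # not filled in.
    #
    #     import datetime
    #     db = RAFDatabase(database="real-time-GV", table="raf_lrt")
    #     first = db.get_start_time()          # tz-aware UTC datetime
    #     variables = db.get_variables()       # dict keyed by variable name
    #     data = db.read_time_series(
    #         list(variables)[:2], start_time=first,
    #         end_time=first + datetime.timedelta(hours=1))
    #     # data[''] holds the 'time', 'data', 'vmap' and 'dim2' entries.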
    def post(self, request, *args, project_name, dataset_name, **kwargs):
        """Respond to a POST request where the user has sent back a form.

        Using the requested parameters in the form, such as start and end
        times and a list of variables, the dataset can be read, and the
        contents sent back to the user.
        """

        _logger.debug("DatasetView.post(project=%s, dataset=%s)",
                      project_name, dataset_name)

        try:
            client_state = get_client_from_session(
                request.session, project_name, dataset_name)
        except Http404 as exc:
            _logger.warning("post: %s", exc)
            messages.warning(request, exc)
            return redirect('ncharts:dataset', project_name=project_name,
                            dataset_name=dataset_name)

        dset = get_dataset(client_state)

        # dataset name and project name from POST should agree with
        # those in the cached dataset.
        if dset.name != dataset_name or dset.project.name != project_name:
            _logger.error(
                "post, old session, project=%s, dataset=%s, "
                "url project=%s, dataset=%s",
                dset.project.name, dset.name, project_name, dataset_name)
            messages.warning(request, "session is for a different dataset")
            return redirect('ncharts:dataset', project_name=project_name,
                            dataset_name=dataset_name)

        proj = nc_models.Project.objects.get(name=project_name)

        projs = nc_models.Project.objects.all()
        plats = nc_models.Platform.objects.all()

        dsets = proj.dataset_set.all()

        # vars = [ v.name for v in dset.variables.all() ]

        # page-backward or page-forward in time
        # TODO: implement a javascript button that manipulates the
        # html field directly
        if 'submit' in request.POST and request.POST['submit'][0:4] == 'page':
            post = request.POST.copy()
            try:
                # will throw ValidationError if timezone doesn't exist
                timezone = nc_models.TimeZone.objects.get(
                    tz=request.POST['timezone']).tz

                start_time = timezone.localize(
                    datetime.datetime.strptime(
                        request.POST['start_time'], "%Y-%m-%d %H:%M"))

                delt = nc_forms.get_time_length(
                    request.POST['time_length_0'],
                    request.POST['time_length_units'])

                if request.POST['submit'] == 'page-backward':
                    start_time = start_time - delt
                elif request.POST['submit'] == 'page-forward':
                    start_time = start_time + delt

                post['start_time'] = start_time.replace(tzinfo=None)
            except (KeyError, ValueError, dj_exc.ValidationError):
                # Will result in invalid form below
                post['start_time'] = None

            post['track_real_time'] = False
            form = nc_forms.DataSelectionForm(
                post, dataset=dset, request=request)
        else:
            form = nc_forms.DataSelectionForm(
                request.POST, dataset=dset, request=request)

        # Have to set the choices for variables and soundings
        # before the form is validated.
        soundings = []
        sounding_choices = []
        dsetvars = {}
        dsetstns = []

        try:
            dsetstns = dset.get_station_names()
            form.set_station_choices(dsetstns)

            dsetvars = dset.get_variables()
            form.set_variable_choices(dsetvars)
            form.set_yvariable_choices(dsetvars)

            if dset.dset_type == "sounding":
                # all soundings in the dataset
                soundings = dset.get_series_tuples(
                    series_name_fmt=SOUNDING_NAME_FMT)

                sounding_choices = dset.get_series_names(
                    series_name_fmt=SOUNDING_NAME_FMT)

                sounding_choices = [(s, s) for s in sounding_choices]
                form.fields['soundings'].choices = sounding_choices

        except (nc_exc.NoDataException) as exc:
            _logger.warning("%s, %s: get_variables: %s",
                            project_name, dset, exc)
            form.no_data("No variables found in {}: {}".format(dset, exc))

        if not form.is_valid():
            _logger.warning('User form is not valid!: %s', repr(form.errors))

            if dset.dset_type == "sounding":
                sounding_choices = []
                start_time = form.get_cleaned_start_time()
                tdelta = form.get_cleaned_time_length()
                if start_time and tdelta:
                    sounding_choices = dset.get_series_names(
                        series_name_fmt=SOUNDING_NAME_FMT,
                        start_time=start_time,
                        end_time=start_time + tdelta)
                    sounding_choices = [(s, s) for s in sounding_choices]

            if form.clean_method_altered_data:
                post = request.POST.copy()
                post['start_time'] = form.cleaned_data['start_time']
                post['track_real_time'] = form.cleaned_data['track_real_time']
                form = nc_forms.DataSelectionForm(post, dataset=dset)
                form.set_station_choices(dsetstns)
                form.set_variable_choices(dsetvars)
                form.set_yvariable_choices(dsetvars)

            if dset.dset_type == "sounding":
                form.fields['soundings'].choices = sounding_choices

            return render(
                request, self.template_name,
                {
                    'version': _version,
                    'form': form,
                    'dataset': dset,
                    'datasets': dsets,
                    'variables': dsetvars,
                    'soundings': mark_safe(json.dumps(soundings)),
                    'projects': projs,
                    'platforms': plats
                })

        # Save the client state from the form
        sel_stns = form.cleaned_data['stations']
        sel_vars = form.cleaned_data['variables']
        sel_soundings = form.cleaned_data['soundings']

        yvar = form.cleaned_data['yvariable']

        tdelta = form.get_cleaned_time_length()
        start_time = form.get_cleaned_start_time()
        end_time = start_time + tdelta

        client_state.stations = json.dumps(sel_stns)
        client_state.variables = json.dumps(sel_vars)
        client_state.start_time = start_time
        client_state.timezone = form.cleaned_data['timezone']
        client_state.time_length = tdelta.total_seconds()
        client_state.track_real_time = form.cleaned_data['track_real_time']
        client_state.soundings = json.dumps(sel_soundings)
        client_state.yvariable = yvar
        client_state.save()

        # Re-create form if any values have been altered
        if form.clean_method_altered_data:
            post = request.POST.copy()
            post['start_time'] = form.cleaned_data['start_time']
            post['track_real_time'] = form.cleaned_data['track_real_time']
            form = nc_forms.DataSelectionForm(post, dataset=dset)
            form.set_station_choices(dsetstns)
            form.set_variable_choices(dsetvars)
            form.set_yvariable_choices(dsetvars)

        if dset.dset_type == "sounding":
            # set sounding choices for selected time period
            # soundings between the start and end time
            sounding_choices = dset.get_series_names(
                series_name_fmt=SOUNDING_NAME_FMT,
                start_time=client_state.start_time,
                end_time=client_state.start_time + \
                    datetime.timedelta(seconds=client_state.time_length))
            sounding_choices = [(s, s) for s in sounding_choices]
            form.fields['soundings'].choices = sounding_choices

        if yvar != "":
            if yvar not in dsetvars.keys():
                exc = nc_exc.NoDataException(
                    "variable {} not found in {}".format(yvar, dset))
                _logger.warning(repr(exc))
                form.no_data(repr(exc))
                return render(
                    request, self.template_name,
                    {
                        'version': _version,
                        'form': form,
                        'dataset': dset,
                        'datasets': dsets,
                        'variables': dsetvars,
                        'soundings': mark_safe(json.dumps(soundings)),
                        'projects': projs,
                        'platforms': plats
                    })

            if yvar not in sel_vars:
                sel_vars.append(yvar)

        series_name_fmt = None
        if dset.dset_type == "sounding":
            series_name_fmt = SOUNDING_NAME_FMT
        else:
            sel_soundings = None

        # If variables exist in the dataset, get their
        # attributes there, otherwise from the actual dataset.
        if dset.variables.all():
            variables = {
                k: {
                    'units': dset.variables.get(name=k).units,
                    'long_name': dset.variables.get(name=k).long_name
                } for k in sel_vars}
        else:
            variables = {k: dsetvars[k] for k in sel_vars}

        stndims = {"station": [int(stn) for stn in sel_stns]}

        try:
            if isinstance(dset, nc_models.FileDataset):
                ncdset = dset.get_netcdf_dataset()
                indata = ncdset.read_time_series(
                    sel_vars, start_time=start_time, end_time=end_time,
                    selectdim=stndims, series=sel_soundings,
                    series_name_fmt=series_name_fmt)
            else:
                dbcon = dset.get_connection()
                indata = dbcon.read_time_series(
                    sel_vars, start_time=start_time, end_time=end_time)
        except nc_exc.TooMuchDataException as exc:
            _logger.warning("%s, %s: %s", project_name, dataset_name, exc)
            form.too_much_data(repr(exc))
            return render(
                request, self.template_name,
                {
                    'version': _version,
                    'form': form,
                    'dataset': dset,
                    'datasets': dsets,
                    'variables': dsetvars,
                    'soundings': mark_safe(json.dumps(soundings)),
                    'projects': projs,
                    'platforms': plats
                })
        except (OSError, nc_exc.NoDataException) as exc:
            _logger.warning("%s, %s: %s", project_name, dataset_name, exc)
            form.no_data(repr(exc))
            return render(
                request, self.template_name,
                {
                    'version': _version,
                    'form': form,
                    'dataset': dset,
                    'datasets': dsets,
                    'variables': dsetvars,
                    'soundings': mark_safe(json.dumps(soundings)),
                    'projects': projs,
                    'platforms': plats
                })

        time0 = {}
        vsizes = {}
        for series_name in indata:
            ser_data = indata[series_name]
            vsizes[series_name] = {}

            if series_name == "":
                for vname in sel_vars:
                    vsizes[series_name][vname] = 0
                    try:
                        # works for any shape, as long as time is the
                        # first dimension
                        vindex = ser_data['vmap'][vname]
                        vsizes[series_name][vname] = \
                            ser_data['data'][vindex].size
                        lastok = np.where(
                            ~np.isnan(ser_data['data'][vindex]))[0][-1]
                        time_last_ok = ser_data['time'][lastok]
                    except IndexError:
                        # all data is nan
                        time_last_ok = (start_time - \
                            datetime.timedelta(seconds=0.001)).timestamp()
                    except KeyError:
                        # variable not in vmap
                        continue

                    try:
                        time_last = ser_data['time'][-1]
                    except IndexError:
                        # no data
                        time_last = time_last_ok

                    client_state.save_data_times(vname, time_last_ok, time_last)

            # A simple compression, subtract first time from all times,
            # reducing the number of characters sent.
            time0[series_name] = 0
            if ser_data['time']:
                time0[series_name] = ser_data['time'][0]

            # subtract off time0
            ser_data['time'] = [x - time0[series_name] for \
                x in ser_data['time']]

        json_time0 = mark_safe(json.dumps(time0))

        json_time = mark_safe(
            json.dumps({sn: indata[sn]['time'] for sn in indata}))

        json_data = mark_safe(
            json.dumps({sn: indata[sn]['data'] for sn in indata},
                       cls=NChartsJSONEncoder))

        json_vmap = mark_safe(
            json.dumps({sn: indata[sn]['vmap'] for sn in indata},
                       cls=NChartsJSONEncoder).replace("'", r"\u0027"))

        json_dim2 = mark_safe(
            json.dumps({sn: indata[sn]['dim2'] for sn in indata},
                       cls=NChartsJSONEncoder).replace("'", r"\u0027"))

        # indata may not have stnnames element
        json_stns = mark_safe(
            json.dumps(
                {
                    sn: (indata[sn]['stnnames'] if 'stnnames' in indata[sn]
                         else {})
                    for sn in indata
                },
                cls=NChartsJSONEncoder).replace("'", r"\u0027"))

        def type_by_dims(dimnames):
            """Crude function to return a plot type, given a dimension.
            """
            if len(dimnames) == 1:
                return 'time-series'
            elif len(dimnames) == 2:
                if dimnames[1] == "station":
                    return 'time-series'
                return 'heatmap'
            else:
                return 'none'

        plot_types = set()
        if len(indata) == 1 and '' in indata:
            # one series, named ''
            for vname, var in variables.items():
                ptype = "time-series"
                if vname in indata['']['vmap']:
                    vindex = indata['']['vmap'][vname]
                    vdimnames = dsetvars[vname]["dimnames"]
                    # print("vname=", vname, ",shape=", str(indata['']['data'][vindex].shape))
                    # print("vname=", vname, ",nbytes=", str(indata['']['data'][vindex].nbytes))
                    # print("vname=", vname, ",ndim=", str(indata['']['data'][vindex].ndim))
                    # print("dsetvars[", vname, "]['dimnames']=", str(dsetvars[vname]["dimnames"]))
                    ptype = type_by_dims(vdimnames)
                var['plot_type'] = ptype
                plot_types.add(ptype)
        else:
            plot_types.add("sounding-profile")

        # Create plot groups dictionary, for each group, the variables in
        # the group, their units, long_names, plot_type.
        # Use OrderedDict so the plots come out in this order
        plot_groups = collections.OrderedDict()

        # loop over plot_types
        grpid = 0
        for ptype in plot_types:
            # _logger.debug("ptype=%s", ptype)

            # For a heatmap, one plot per variable.
            if ptype == 'heatmap':
                for vname in sorted(variables):     # returns sorted keys
                    var = variables[vname]
                    if vsizes[''][vname] > 0 and var['plot_type'] == ptype:
                        plot_groups['g{}'.format(grpid)] = {
                            'series': "",
                            'variables': mark_safe(
                                json.dumps([vname]).replace("'", r"\u0027")),
                            'units': mark_safe(
                                json.dumps([var['units']]).replace(
                                    "'", r"\u0027")),
                            'long_names': mark_safe(
                                json.dumps([var['long_name']]).replace(
                                    "'", r"\u0027")),
                            'plot_type': mark_safe(ptype),
                        }
                        grpid += 1
            elif ptype == 'sounding-profile':
                # one profile plot per series name
                for series_name in sorted(indata.keys()):
                    vnames = sorted([v for v in variables])
                    units = [variables[v]['units'] for v in vnames]
                    long_names = [(variables[v]['long_name'] \
                        if 'long_name' in variables[v] else v)
                                  for v in vnames]
                    plot_groups['g{}'.format(grpid)] = {
                        'series': series_name,
                        'variables': mark_safe(
                            json.dumps(vnames).replace("'", r"\u0027")),
                        'units': mark_safe(
                            json.dumps(units).replace("'", r"\u0027")),
                        'long_names': mark_safe(
                            json.dumps(long_names).replace("'", r"\u0027")),
                        'plot_type': mark_safe(ptype),
                    }
                    grpid += 1
            else:
                # unique units, in alphabetical order by the name of the
                # first variable which uses it. In this way the plots
                # are in alphabetical order on the page by the first
                # plotted variable.
                uunits = []
                # sorted(dict) becomes a list of sorted keys
                for vname in sorted(variables):
                    units = ''
                    if 'units' in variables[vname]:
                        units = variables[vname]['units']
                    else:
                        variables[vname]['units'] = units
                    if units not in uunits:
                        uunits.append(units)

                # unique units
                for units in uunits:
                    uvars = sorted([vname for vname, var in variables.items() \
                        if vsizes[''][vname] > 0 and
                        var['plot_type'] == ptype and
                        var['units'] == units])

                    # uvars is a sorted list of variables with these units
                    # and this plot type. Might be empty if the variable
                    # is of a different plot type
                    if uvars:
                        plot_groups['g{}'.format(grpid)] = {
                            'series': "",
                            'variables': mark_safe(
                                json.dumps(uvars).replace("'", r"\u0027")),
                            'units': mark_safe(json.dumps(
                                [(variables[v]['units']
                                  if 'units' in variables[v] else '') \
                                 for v in uvars]).replace("'", r"\u0027")),
                            'long_names': mark_safe(json.dumps(
                                [(variables[v]['long_name'] \
                                  if 'long_name' in variables[v] else '') \
                                 for v in uvars]).replace("'", r"\u0027")),
                            'plot_type': mark_safe(ptype),
                        }
                        grpid += 1

        # log the request
        if len(variables) > 2:
            logvars = sorted(variables)[:2] + ['...']
        else:
            logvars = variables

        _request_logger.info(
            "%s, %s, %s, #recs=%d, real_time=%s, vars=%s, #vars=%d, "
            "stns=%s, snding=%s, plot=%s, fromaddr=%s",
            dset.project.name, dset.name, start_time,
            len(ser_data['time']), client_state.track_real_time,
            ' '.join(logvars), len(variables),
            ' '.join(["%s" % s for s in sel_stns]),
            ' '.join(["%s" % s for s in soundings]),
            ' '.join(plot_types),
            request.META['REMOTE_ADDR'])

        return render(
            request, self.template_name,
            {
                'version': _version,
                'form': form,
                'dataset': dset,
                'datasets': dsets,
                'variables': dsetvars,
                'plot_groups': plot_groups,
                'time0': json_time0,
                'time': json_time,
                'data': json_data,
                'vmap': json_vmap,
                'dim2': json_dim2,
                'stations': json_stns,
                'time_length': client_state.time_length,
                'soundings': mark_safe(json.dumps(soundings)),
                'yvariable': yvar.replace("'", r"\u0027"),
                'projects': projs,
                'platforms': plats
            })
    def read_time_series(self,
                         variables=(),
                         start_time=pytz.utc.localize(datetime.min),
                         end_time=pytz.utc.localize(datetime.max),
                         size_limit=1000 * 1000 * 1000):
        """Read times and variables from the table within a time period.

        For each variable, its missing_value will be read from the
        variable_list table. Values read from the time series table which
        match the missing_value will be set to float('nan').

        Args:
            variables: list or tuple of variable names to read.
            start_time: starting datetime of data to be read.
            end_time: ending datetime of data to be read.
            size_limit: attempt to screen outrageous requests.

        Returns:
            A one element dict, compatible with that returned by
            netcdf.read_time_series(), containing for a series_name of '':
            {
                'time' : list of UTC timestamps,
                'data': list of numpy.ndarray containing the data for
                    each variable,
                'vmap': dict by variable name, containing the index into
                    the series data for the variable,
                'dim2': dict by variable name, of values for second
                    dimension of the data, such as height.
            }

        Raises:
            nc_exc.NoDataException
        """

        total_size = 0

        start_time = start_time.replace(tzinfo=None)
        end_time = end_time.replace(tzinfo=None)

        vtime = self.read_times(start_time=start_time, end_time=end_time)
        # _logger.debug("read_times, len=%d", len(vtime))

        total_size += sys.getsizeof(vtime)
        if total_size > size_limit:
            raise nc_exc.TooMuchDataException(
                "too many time values requested, size={0} MB".\
                format(total_size / (1000 * 1000)))

        vdata = []
        vmap = {}
        vdim2 = {}

        try:
            with self.conn as conn:
                with conn.cursor() as cur:
                    for vname in variables:
                        operation = "read variable_list"
                        # _logger.debug("vname=%s", vname)
                        cur.execute(
                            "SELECT dims, missing_value from variable_list "
                            "where name=%s;",
                            (vname,))
                        vinfo = cur.fetchall()
                        # _logger.debug("vinfo=%s", vinfo)
                        dims = vinfo[0][0]
                        dims[0] = len(vtime)
                        missval = vinfo[0][1]

                        if len(dims) > 1:
                            # In initial CSET data, dims for CUHSAS_RWOOU
                            # in variable_list was [1,99]
                            # Seems that the 99 should have been 100,
                            # which is what is returned by this:
                            operation = "read dimension of {}".format(vname)
                            cur.execute(
                                "SELECT array_upper({},1) FROM {} "
                                "FETCH FIRST 1 ROW ONLY;".format(
                                    vname, self.table))
                            dimsx = cur.fetchall()[0]
                            dims[1] = dimsx[0]
                            # _logger.debug("vname=%s, dims=%s, dimsx=%s", vname, dims, dimsx)

                        operation = "read {}".format(vname)
                        cur.execute(
                            "SELECT {} FROM {} WHERE datetime >= %s "
                            "AND datetime < %s;".format(vname, self.table),
                            (start_time, end_time))

                        cdata = np.ma.masked_values(
                            np.ndarray(
                                shape=dims,
                                buffer=np.array(
                                    [v for v in cur], dtype=float)),
                            value=missval)

                        if isinstance(cdata, np.ma.core.MaskedArray):
                            # _logger.debug("is MaskedArray")
                            cdata = cdata.filled(fill_value=float('nan'))

                        total_size += sys.getsizeof(cdata)
                        if total_size > size_limit:
                            raise nc_exc.TooMuchDataException(
                                "too many values requested, size={0} MB".\
                                format(total_size / (1000 * 1000)))

                        vindex = len(vdata)
                        vdata.append(cdata)
                        vmap[vname] = vindex
                        if len(dims) > 1:
                            vdim2[vname] = {
                                "data": [i for i in range(dims[1])],
                                "name": "bin",
                                "units": ""
                            }

            return {
                '': {
                    'time': vtime,
                    'data': vdata,
                    'vmap': vmap,
                    'dim2': vdim2,
                }
            }
        except psycopg2.Error as exc:
            RAFDatabase.close_connection(conn)
            raise nc_exc.NoDataException((operation + ": {}").format(exc))
    def read_time_series(
            self,
            variables=(),
            start_time=pytz.utc.localize(datetime.min),
            end_time=pytz.utc.localize(datetime.max),
            selectdim=None,
            size_limit=1000 * 1000 * 1000,
            series=None,
            series_name_fmt=None):
        """Read a list of time-series variables from this fileset.

        Args:
            variables: A list of strs containing time series variable
                names to be read.
            start_time: A datetime, which is timezone aware, of the start
                time of the series to read.
            end_time: A datetime, timezone aware, end time of series to
                read.
            selectdim: A dict containing for each dimension name of type
                string, the indices of the dimension to read. For example:
                {"station": [3, 4, 5]} to read indices 3, 4 and 5 (indexed
                from 0) of the station dimension for variables which have
                that dimension.
            size_limit: Limit on the total size in bytes to read, used to
                screen huge requests.
            series: A list of series to be read by name. For soundings a
                series name is something like "Aug23_0000Z", as created by
                series_name_fmt from the time associated with a file. In
                this way the data read can be split into named series. If
                series_name_fmt is None, the series name should be a list
                of one empty string, [''], and all data are concatenated
                together in time order.
            series_name_fmt: a datetime.strftime format to create a series
                name for the data found in each file, based on the time
                associated with the file. If series_name_fmt is None, all
                data is put in a dictionary element named ''.

        Returns:
            A dict containing, by series name:
                'time' : list of UTC timestamps,
                'data': list of numpy.ndarray containing the data for
                    each variable,
                'vmap': dict by variable name, containing the index into
                    the series data for the variable,
                'dim2': dict by variable name, of values for second
                    dimension of the data, such as height,
                'stnnames': dict by variable name, of the list of the
                    station names for the variable that were read, as
                    selected by selectdim. A list of length 1 containing
                    an empty string indicates the variable does not have
                    a station dimension.

        Raises:
            OSError
            nc_exc.NoDataException

        The 'data' element in the returned dict is a list of numpy arrays,
        and not a dict by variable name. The 'vmap' element provides the
        mapping from a variable name to an index into 'data'. The data
        object is typically JSON-ified and sent to a browser. If it were a
        dict, the variable names may contain characters which cause
        headaches with JSON and javascript in django templates. For
        example, the JSON-ified string is typically passed to javascript
        in a django template by surrounding it with single quotes:
            var data = jQuery.parseJSON('{{ data }}');
        A single quote within the data JSON string causes grief, and we
        want to support single quotes in variable names. The only
        workaround I know of is to convert the single quotes within the
        string to '\u0027'. This is, of course, a time-consuming step we
        want to avoid when JSON-ifying a large chunk of data. It is less
        time-consuming to replace the quotes in the smaller vmap. The
        series names will not contain single quotes.
        """

        debug = False

        dsinfo = self.get_dataset_info()
        if not dsinfo['time_name']:
            self.scan_files()
            dsinfo = self.get_dataset_info()

        dsinfo_vars = dsinfo['variables']

        if not selectdim:
            selectdim = {}

        vshapes = self.resolve_variable_shapes(variables, selectdim)

        res_data = {}

        total_size = 0
        ntimes = 0

        files = self.get_files(start_time, end_time)

        if debug:
            _logger.debug(
                "len(files)=%d, series_name_fmt=%s",
                len(files), series_name_fmt)

        if series_name_fmt:
            file_tuples = [(f.time.strftime(series_name_fmt), f.path) \
                for f in files]
        else:
            file_tuples = [("", f.path) for f in files]

        for (series_name, ncpath) in file_tuples:

            if series and not series_name in series:
                continue

            if debug:
                _logger.debug("series=%s", str(series))
                _logger.debug("series_name=%s, ncpath=%s",
                              series_name, ncpath)

            # the files might be in the process of being moved, deleted, etc
            fileok = False
            exc = None
            for itry in range(0, 3):
                try:
                    ncfile = netCDF4.Dataset(ncpath)
                    fileok = True
                    break
                except (OSError, RuntimeError) as excx:
                    exc = excx
                    time.sleep(itry)

            if not fileok:
                _logger.error("%s: %s", ncpath, exc)
                continue

            if not series_name in res_data:
                res_data[series_name] = {
                    'time': [],
                    'data': [],
                    'vmap': {},
                    'dim2': {},
                    'stnnames': {},
                }

            otime = res_data[series_name]['time']
            odata = res_data[series_name]['data']
            ovmap = res_data[series_name]['vmap']
            odim2 = res_data[series_name]['dim2']
            ostns = res_data[series_name]['stnnames']

            try:
                size1 = sys.getsizeof(otime)

                # times are appended to otime
                time_slice = self.read_times(
                    ncfile, ncpath, start_time, end_time, otime,
                    size_limit - total_size)

                # time_slice.start is None if nothing to read
                if time_slice.start is None or \
                        time_slice.stop <= time_slice.start:
                    continue

                total_size += sys.getsizeof(otime) - size1

                for exp_vname in variables:

                    # skip if variable is not a time series or
                    # doesn't have a selected dimension
                    if not exp_vname in dsinfo_vars or \
                            not exp_vname in vshapes:
                        continue

                    # selected shape of this variable
                    vshape = vshapes[exp_vname]
                    vsize = reduce_(operator.mul, vshape, 1) * \
                        dsinfo_vars[exp_vname]["dtype"].itemsize

                    if not vsize:
                        continue

                    if total_size + vsize > size_limit:
                        raise nc_exc.TooMuchDataException(
                            "too much data requested, will exceed {} mbytes".
                            format(size_limit / (1000 * 1000)))

                    dim2 = {}
                    stnnames = []
                    vdata = self.read_time_series_data(
                        ncfile, ncpath, exp_vname, time_slice, vshape,
                        selectdim, dim2, stnnames)

                    if vdata is None:
                        continue

                    # dim2 will be empty if variable is not found in file
                    if dim2 and not exp_vname in odim2:
                        odim2[exp_vname] = dim2

                    # stnnames will be empty if variable is not found in file
                    if stnnames and not exp_vname in ostns:
                        ostns[exp_vname] = stnnames

                    if not exp_vname in ovmap:
                        size1 = 0
                        vindex = len(odata)
                        odata.append(vdata)
                        ovmap[exp_vname] = vindex
                    else:
                        vindex = ovmap[exp_vname]
                        if debug:
                            _logger.debug(
                                "odata[%s].shape=%s, vdata.shape=%s",
                                exp_vname, odata[vindex].shape, vdata.shape)
                        size1 = sys.getsizeof(odata[vindex])
                        time_index = dsinfo_vars[exp_vname]["time_index"]
                        odata[vindex] = np.append(
                            odata[vindex], vdata, axis=time_index)

                    total_size += sys.getsizeof(odata[vindex]) - size1

            finally:
                ncfile.close()

            ntimes += len(otime)

        if ntimes == 0:
            exc = nc_exc.NoDataException(
                "No data between {} and {}".format(
                    start_time.isoformat(), end_time.isoformat()))
            # _logger.warning("%s: %s", str(self), repr(exc))
            raise exc

        if debug:
            for series_name in res_data:
                for exp_vname in res_data[series_name]['vmap']:
                    var_index = res_data[series_name]['vmap'][exp_vname]
                    _logger.debug(
                        "res_data[%s]['data'][%d].shape=%s, exp_vname=%s",
                        series_name, var_index,
                        repr(res_data[series_name]['data'][var_index].shape),
                        exp_vname)

            _logger.debug("total_size=%d", total_size)

        return res_data
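    # A hedged usage sketch (not part of the original source): one way the
    # dict returned above might be consumed for a non-sounding dataset, where
    # everything lives under the series name ''. The variable name "T.2m" is
    # a placeholder.
    #
    #     res = ncdset.read_time_series(["T.2m"], start_time=t1, end_time=t2)
    #     ser = res['']
    #     temp = ser['data'][ser['vmap']["T.2m"]]       # numpy array, time first
    #     stations = ser['stnnames'].get("T.2m", [""])  # [""] means no station dim
    #     times = ser['time']                           # UTC timestamps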
    def scan_files(self, time_names=('time', 'Time', 'time_offset')):
        """Scan the set of files, accumulating information about the
        dataset in a dict, with the following keys:

            file_mod_times: dictionary of file modification times by file
                name, of each file just before it was last scanned.
            base_time: name of base_time variable
            time_dim_name: name of time dimension
            time_name: name of time variable
            nstations: size of the station dimension
            station_dim: name of the station dimension
            station_names: names of the stations
            sites: dictionary of site long names, keyed by site short
                name, extracted from the exported names of those variables
                not associated with a numbered station
            variables: dictionary of information for each variable.

        If an element in file_mod_times exists for a file, that file is
        not scanned if its current modification time has not been updated.

        The names of the variables in the dataset are converted to an
        exported form. If a variable has a 'short_name' attribute, it is
        used for the variable name, otherwise the exported name is set to
        the NetCDF variable name.

        Note, we don't read every file. May want to have
        MAX_NUM_FILES_TO_PRESCAN be an attribute of the dataset.

        Args:
            time_names: List of allowed names for the time variable.

        Raises:
            OSError
            nc_exc.NoDataException
        """

        dsinfo = self.get_dataset_info()

        # Note: dsinfo_vars is a reference. Modifications to it
        # are also modifications to dsinfo.
        dsinfo_vars = dsinfo['variables']

        sitedict = dsinfo['sites']

        files = self.get_files(
            start_time=self.start_time, end_time=self.end_time)

        # typically get_files() also returns the file before start_time.
        # We may want that in reading a period of data, but not
        # in assembling the variables for the dataset.
        filepaths = [f.path for f in files
                     if f.time >= self.start_time and f.time < self.end_time]

        skip = 1
        if len(filepaths) > NetCDFDataset.MAX_NUM_FILES_TO_PRESCAN:
            skip = len(filepaths) / NetCDFDataset.MAX_NUM_FILES_TO_PRESCAN

        # Read at most MAX_NUM_FILES_TO_PRESCAN, including the latest file.
        # Files are scanned in a backwards sequence.
        pindex = len(filepaths) - 1

        n_files_read = 0

        while pindex >= 0:
            ncpath = filepaths[int(pindex)]
            pindex -= skip

            # The files might be in the process of being moved, deleted,
            # etc, so if we get an exception in this open, try a few more
            # times.

            # Testing indicates that with a truncated file (artificially
            # truncated with dd), the underlying C code will cause a crash
            # of python from an assert() rather than raising an exception
            # that could be caught.

            # If the netcdf library is compiled with -DNDEBUG, then the
            # open and parse of the truncated header succeeds, but still
            # no exception.

            # If the file is artificially corrupted by removing an
            # initial portion of the file:
            #   dd if=test.nc of=bad.nc bs=1014 count=100 skip=1
            # then an exception is raised (this was with -DNDEBUG):
            #   RuntimeError bad.nc: NetCDF: Unknown file format

            # To make this robust, it would be good to run a king's
            # taster process on each file first to reduce the possibility
            # of a server death. The king's taster would not use NDEBUG,
            # but perhaps the python server would. Complicated.
            fileok = False
            skip_file = False
            exc = None
            siteset = set()
            site_sn = []
            site_ln = []
            for itry in range(0, 3):
                try:
                    curr_mod_time = get_file_modtime(ncpath)
                    if ncpath in dsinfo['file_mod_times']:
                        prev_mod_time = dsinfo['file_mod_times'][ncpath]
                        if curr_mod_time <= prev_mod_time:
                            skip_file = True
                            fileok = True
                            break

                    dsinfo['file_mod_times'][ncpath] = curr_mod_time

                    # _logger.debug("ncpath=%s", ncpath)
                    ncfile = netCDF4.Dataset(ncpath)
                    fileok = True
                    break
                except (OSError, RuntimeError) as excx:
                    exc = excx
                    time.sleep(itry)

            if not fileok:
                _logger.error("%s: %s", ncpath, exc)
                continue

            n_files_read += 1

            if skip_file:
                continue

            try:
                if not dsinfo['base_time'] and \
                        'base_time' in ncfile.variables:
                    dsinfo['base_time'] = 'base_time'

                tdim = None
                # look for a time dimension
                for tname in ['time', 'Time']:
                    if tname in ncfile.dimensions:
                        tdim = ncfile.dimensions[tname]
                        break
                if not tdim:
                    continue

                # check for tdim.is_unlimited?
                if not dsinfo['time_dim_name']:
                    dsinfo['time_dim_name'] = tdim.name

                if STATION_DIMENSION_NAME in ncfile.dimensions:
                    if dsinfo['nstations'] is None:
                        dsinfo['nstations'] = len(
                            ncfile.dimensions[STATION_DIMENSION_NAME])
                        dsinfo['station_dim'] = STATION_DIMENSION_NAME

                        if STATION_DIMENSION_NAME in ncfile.variables:
                            dsinfo['station_names'] = []
                            var = ncfile.variables[STATION_DIMENSION_NAME]
                            if var.datatype == np.dtype('S1'):
                                snms = [str(netCDF4.chartostring(v))
                                        for v in var]
                                dsinfo['station_names'].extend(snms)

                    elif not dsinfo['nstations'] == \
                            len(ncfile.dimensions[STATION_DIMENSION_NAME]):
                        _logger.warning(
                            "%s: station dimension (%d) is "
                            "different than that of other files (%d)",
                            ncpath,
                            len(ncfile.dimensions[STATION_DIMENSION_NAME]),
                            dsinfo['nstations'])

                # look for a time variable
                if not dsinfo['time_name']:
                    for tname in time_names:
                        if tname in ncfile.variables:
                            if tdim.name in \
                                    ncfile.variables[tname].dimensions:
                                dsinfo['time_name'] = tname
                                break

                if not dsinfo['time_name'] or \
                        not dsinfo['time_name'] in ncfile.variables:
                    # time variable not yet found or not in this file
                    continue

                if not tdim.name in \
                        ncfile.variables[dsinfo['time_name']].dimensions:
                    # time variable in this file doesn't have a time dimension
                    continue

                # pylint: disable=no-member
                for (nc_vname, var) in ncfile.variables.items():

                    if nc_vname == "site_long_name" and \
                            var.datatype == np.dtype('S1'):
                        site_ln = [str(netCDF4.chartostring(v)) for v in var]

                    if nc_vname == "sites" and \
                            var.datatype == np.dtype('S1'):
                        site_sn = [str(netCDF4.chartostring(v)) for v in var]

                    # looking for time series variables
                    if not dsinfo['time_dim_name'] in var.dimensions:
                        continue

                    # time variable
                    if nc_vname == dsinfo['time_name']:
                        continue

                    # exported variable name
                    if hasattr(var, 'short_name'):
                        exp_vname = getattr(var, 'short_name')
                    else:
                        exp_vname = nc_vname

                    # var.dimensions is a tuple of dimension names
                    time_index = var.dimensions.index(dsinfo['time_dim_name'])

                    # Check if we have found this variable in an earlier file
                    if not exp_vname in dsinfo_vars:

                        # New variable
                        varinfo = {}
                        varinfo['netcdf_name'] = nc_vname
                        varinfo['shape'] = var.shape
                        varinfo['dimnames'] = var.dimensions
                        varinfo['dtype'] = var.dtype
                        varinfo['time_index'] = time_index

                        # Grab certain attributes
                        for att in ['units', 'long_name']:
                            if hasattr(var, att):
                                varinfo[att] = getattr(var, att)

                        # Set default units to ''
                        if 'units' not in varinfo:
                            varinfo['units'] = ''

                        # For non-station variables, parse the name to
                        # determine the possible site
                        if not dsinfo['station_dim'] or \
                                not dsinfo['station_dim'] in var.dimensions:
                            site = get_isfs_site(exp_vname)
                            if site:
                                varinfo['site'] = site
                                siteset.add(site)
                                # dsinfo['sites'].add(site)
                        else:
                            dsinfo['has_station_variables'] = True

                        dsinfo_vars[exp_vname] = varinfo
                        continue

                    varinfo = dsinfo_vars[exp_vname]

                    # variable has been found in an earlier ncfile,
                    # check for consistency across files
                    if varinfo['shape'][1:] != var.shape[1:]:
                        # the above check works even if either shape
                        # has length 1
                        if len(varinfo['shape']) != len(var.shape):
                            # changing number of dimensions, punt
                            _logger.error(
                                "%s: %s: number of "
                                "dimensions is not consistent: %d and %d. "
                                "Skipping this variable.",
                                ncpath, nc_vname, len(var.shape),
                                len(varinfo['shape']))
                            del dsinfo_vars[exp_vname]
                            continue
                        # here we know that shapes have same length and
                        # they must have len > 1. Allow final dimension
                        # to change.
                        ndim = len(var.shape)
                        if (varinfo['shape'][1:(ndim-1)] !=
                                var.shape[1:(ndim-1)]):
                            _logger.error(
                                "%s: %s: incompatible shapes: "
                                "%s and %s. Skipping this variable.",
                                ncpath, nc_vname, repr(var.shape),
                                repr(varinfo['shape']))
                            del dsinfo_vars[exp_vname]
                            continue
                        # set shape to max shape (leaving the problem
                        # for later...)
                        varinfo['shape'] = tuple(
                            [max(i, j) for (i, j) in zip(
                                varinfo['shape'], var.shape)])

                    if varinfo['dtype'] != var.dtype:
                        _logger.error(
                            "%s: %s: type=%s is different than "
                            "in other files",
                            ncpath, nc_vname, repr(var.dtype))

                    if varinfo['time_index'] != time_index:
                        _logger.error(
                            "%s: %s: time_index=%d is different than "
                            "in other files. Skipping this variable.",
                            ncpath, nc_vname, time_index)
                        del dsinfo_vars[exp_vname]

                    for att in ['units', 'long_name']:
                        if hasattr(var, att) and att in varinfo:
                            if getattr(var, att) != varinfo[att]:
                                _logger.info(
                                    "%s: %s: %s=%s is different than "
                                    "previous value=%s",
                                    ncpath, nc_vname, att,
                                    getattr(var, att), varinfo[att])
                                varinfo[att] = getattr(var, att)

            finally:
                for site in siteset:
                    try:
                        i = site_sn.index(site)
                        if i < len(site_ln):
                            sitedict[site] = site_ln[i]
                    except ValueError:
                        pass
                    if site not in sitedict:
                        sitedict[site] = ''

                ncfile.close()

        if not n_files_read:
            msg = self.path + ": No files found"
            raise nc_exc.NoDataException(msg)

        # Remove the station names if no variables have a station dimension
        if not dsinfo['has_station_variables']:
            dsinfo['station_names'] = []
        else:
            # create station names if a "station" variable is not found
            # in NetCDF files. Names are S1, S2, etc for dimension index 0, 1
            if dsinfo['nstations'] and not dsinfo['station_names']:
                dsinfo['station_names'].extend(
                    ['S{}'.format(i+1) for i in range(dsinfo['nstations'])])

        # cache dsinfo
        self.save_dataset_info(dsinfo)