Example #1
    def get_start_time(self):
        """Read first datatime from the database table.

        Raises:
            nc_exc.NoDataException
        """

        vname = "datetime"

        try:
            with self.conn as conn:
                with conn.cursor() as cur:
                    # datetimes in the database are returned to Python
                    # as timezone naive. Order by the time column:
                    # without an ORDER BY, the first row returned is
                    # not guaranteed to be the earliest.
                    cur.execute(
                        "SELECT {} FROM {} ORDER BY {} "
                        "FETCH FIRST 1 ROW ONLY;".format(
                            vname, self.table, vname))
                    start_time = cur.fetchone()
                    if not start_time:
                        _logger.warning("%s: read %s: no data", conn, vname)
                        raise nc_exc.NoDataException("read {}".format(vname))
                    return pytz.utc.localize(start_time[0])
        except psycopg2.Error as exc:
            # refer to self.conn here: the local name conn is unbound
            # if the connection's __enter__ itself raised
            _logger.warning("%s: read %s: %s", self.conn, vname, exc)
            RAFDatabase.close_connection(self.conn)
            raise nc_exc.NoDataException("read {}: {}".format(vname, exc))
Example #2
    def read_times(self,
                   start_time=pytz.utc.localize(datetime.min),
                   end_time=pytz.utc.localize(datetime.max)):
        """Read datetimes from the table within a range.

        Raises:
            nc_exc.NoDataException
        """

        # the start and end times are assumed to be UTC; the table
        # stores timezone-naive UTC datetimes
        start_time = start_time.replace(tzinfo=None)
        end_time = end_time.replace(tzinfo=None)

        # _logger.debug("read_times, table=%s", self.table)

        vname = "datetime"

        try:
            with self.conn as conn:
                with conn.cursor() as cur:
                    # datetimes in the database are returned to Python
                    # as timezone naive.
                    cur.execute(
                        "SELECT {} FROM {} WHERE {} >= %s AND {} < %s;".format(
                            vname, self.table, vname, vname),
                        (start_time, end_time))
                    return [pytz.utc.localize(x[0]).timestamp() for x in cur]
        except psycopg2.Error as exc:
            # self.conn, not conn: conn is unbound if __enter__ raised
            RAFDatabase.close_connection(self.conn)
            raise nc_exc.NoDataException("read {}: {}".format(vname, exc))
Example #3
    def get_variables(self):
        """Fetch pertinent fields from the 'variable_list' table in
        the RAF database, such as the list of variable names, their units, and
        missing values.

        Raises:
            nc_exc.NoDataException
        """

        try:
            with self.conn as conn:
                with conn.cursor() as cur:
                    cur.execute("\
    SELECT name, units, long_name, ndims, dims, missing_value from variable_list;"
                                )
                    variables = {}
                    for (name, units, long_name, ndims, dims,
                         _missing) in cur:
                        dimnames = ["time"]
                        # make a bold assumption that a second dimension
                        # is a particle-probe bin number
                        if ndims > 1:
                            dimnames.append("bin")
                        variables[name] = {
                            "units": units,
                            "long_name": long_name,
                            "dimnames": dimnames,
                            "shape": dims
                        }

                    return variables
        except psycopg2.Error as exc:
            # psycopg2 connections are thread safe;
            # self.conn, not conn: conn is unbound if __enter__ raised
            RAFDatabase.close_connection(self.conn)
            raise nc_exc.NoDataException("No variables found: {}".format(exc))
Example #4
    def __init__(self,
                 database="real-time-GV",
                 user="******",
                 host="eol-rt-data.fl-ext.ucar.edu",
                 port=5432,
                 password=None,
                 table="raf_lrt"):
        """Construct an instance of RAF database connection.

        Args:
            database, user, host, port, password: The usual parameters
                needed to create a PostgreSQL connection.
            table: Name of the table in the database which contains
                the time-series data to be read.

        Raises:
            nc_exc.NoDataException
        """

        try:
            self.conn = RAFDatabase.get_connection(database=database,
                                                   user=user,
                                                   host=host,
                                                   port=port,
                                                   password=password)
            self.database = database
            self.user = user
            self.host = host
            self.port = port
            self.password = password
            self.table = table
        except psycopg2.Error as exc:
            raise nc_exc.NoDataException(
                "Database not available: {}".format(exc))
Example #5
    def post(self, request, *args, project_name, dataset_name, **kwargs):
        """Respond to a POST request where the user has sent back a form.

        Using the requested parameters in the form, such as start and end times
        and a list of variables, the dataset can be read, and the contents
        sent back to the user.
        """

        _logger.debug("DatasetView.post(project=%s, dataset=%s)", project_name,
                      dataset_name)
        try:
            client_state = get_client_from_session(request.session,
                                                   project_name, dataset_name)
        except Http404 as exc:
            _logger.warning("post: %s", exc)
            messages.warning(request, exc)
            return redirect('ncharts:dataset',
                            project_name=project_name,
                            dataset_name=dataset_name)

        dset = get_dataset(client_state)

        # dataset name and project name from POST should agree with
        # those in the cached dataset.
        if dset.name != dataset_name or dset.project.name != project_name:
            _logger.error(
                "post, old session, project=%s, dataset=%s, "
                "url project=%s, dataset=%s", dset.project.name, dset.name,
                project_name, dataset_name)
            messages.warning(request, "session is for a different dataset")
            return redirect('ncharts:dataset',
                            project_name=project_name,
                            dataset_name=dataset_name)

        proj = nc_models.Project.objects.get(name=project_name)
        projs = nc_models.Project.objects.all()
        plats = nc_models.Platform.objects.all()
        dsets = proj.dataset_set.all()

        # vars = [ v.name for v in dset.variables.all() ]

        # page-backward or page-forward in time
        # TODO: implement a javascript button that manipulates the
        # html field directly

        if 'submit' in request.POST and \
                request.POST['submit'].startswith('page'):

            post = request.POST.copy()
            try:
                # raises ValidationError if the timezone doesn't exist
                timezone = nc_models.TimeZone.objects.get(
                    tz=request.POST['timezone']).tz

                start_time = timezone.localize(
                    datetime.datetime.strptime(request.POST['start_time'],
                                               "%Y-%m-%d %H:%M"))

                delt = nc_forms.get_time_length(
                    request.POST['time_length_0'],
                    request.POST['time_length_units'])

                if request.POST['submit'] == 'page-backward':
                    start_time = start_time - delt
                elif request.POST['submit'] == 'page-forward':
                    start_time = start_time + delt

                post['start_time'] = start_time.replace(tzinfo=None)

            except (KeyError, ValueError, dj_exc.ValidationError):
                # Will result in invalid form below
                post['start_time'] = None

            post['track_real_time'] = False

            form = nc_forms.DataSelectionForm(post,
                                              dataset=dset,
                                              request=request)
        else:
            form = nc_forms.DataSelectionForm(request.POST,
                                              dataset=dset,
                                              request=request)

        # Have to set the choices for variables and soundings
        # before the form is validated.
        soundings = []
        sounding_choices = []
        dsetvars = {}
        dsetstns = []
        try:
            dsetstns = dset.get_station_names()
            form.set_station_choices(dsetstns)

            dsetvars = dset.get_variables()
            form.set_variable_choices(dsetvars)
            form.set_yvariable_choices(dsetvars)

            if dset.dset_type == "sounding":
                # all soundings in the dataset
                soundings = dset.get_series_tuples(
                    series_name_fmt=SOUNDING_NAME_FMT)

                sounding_choices = dset.get_series_names(
                    series_name_fmt=SOUNDING_NAME_FMT)

                sounding_choices = [(s, s) for s in sounding_choices]
                form.fields['soundings'].choices = sounding_choices

        except nc_exc.NoDataException as exc:
            _logger.warning("%s, %s: get_variables: %s", project_name, dset,
                            exc)
            form.no_data("No variables found in {}: {}".format(dset, exc))

        if not form.is_valid():
            _logger.warning('User form is not valid!: %s', repr(form.errors))

            if dset.dset_type == "sounding":
                sounding_choices = []
                start_time = form.get_cleaned_start_time()
                tdelta = form.get_cleaned_time_length()

                if start_time and tdelta:
                    sounding_choices = dset.get_series_names(
                        series_name_fmt=SOUNDING_NAME_FMT,
                        start_time=start_time,
                        end_time=start_time + tdelta)
                    sounding_choices = [(s, s) for s in sounding_choices]

            if form.clean_method_altered_data:
                post = request.POST.copy()
                post['start_time'] = form.cleaned_data['start_time']
                post['track_real_time'] = form.cleaned_data['track_real_time']
                form = nc_forms.DataSelectionForm(post, dataset=dset)

            form.set_station_choices(dsetstns)
            form.set_variable_choices(dsetvars)
            form.set_yvariable_choices(dsetvars)
            if dset.dset_type == "sounding":
                form.fields['soundings'].choices = sounding_choices
            return render(
                request, self.template_name, {
                    'version': _version,
                    'form': form,
                    'dataset': dset,
                    'datasets': dsets,
                    'variables': dsetvars,
                    'soundings': mark_safe(json.dumps(soundings)),
                    'projects': projs,
                    'platforms': plats
                })

        # Save the client state from the form
        sel_stns = form.cleaned_data['stations']
        sel_vars = form.cleaned_data['variables']
        sel_soundings = form.cleaned_data['soundings']

        yvar = form.cleaned_data['yvariable']

        tdelta = form.get_cleaned_time_length()
        start_time = form.get_cleaned_start_time()

        end_time = start_time + tdelta
        client_state.stations = json.dumps(sel_stns)
        client_state.variables = json.dumps(sel_vars)
        client_state.start_time = start_time
        client_state.timezone = form.cleaned_data['timezone']
        client_state.time_length = tdelta.total_seconds()
        client_state.track_real_time = form.cleaned_data['track_real_time']
        client_state.soundings = json.dumps(sel_soundings)
        client_state.yvariable = yvar
        client_state.save()

        # Re-create form if any values have been altered
        if form.clean_method_altered_data:
            post = request.POST.copy()
            post['start_time'] = form.cleaned_data['start_time']
            post['track_real_time'] = form.cleaned_data['track_real_time']
            form = nc_forms.DataSelectionForm(post, dataset=dset)

        form.set_station_choices(dsetstns)
        form.set_variable_choices(dsetvars)
        form.set_yvariable_choices(dsetvars)

        if dset.dset_type == "sounding":
            # set sounding choices for selected time period
            # soundings between the start and end time
            sounding_choices = dset.get_series_names(
                series_name_fmt=SOUNDING_NAME_FMT,
                start_time=client_state.start_time,
                end_time=client_state.start_time + \
                    datetime.timedelta(seconds=client_state.time_length))

            sounding_choices = [(s, s) for s in sounding_choices]
            form.fields['soundings'].choices = sounding_choices

        if yvar != "":
            if yvar not in dsetvars:
                exc = nc_exc.NoDataException(
                    "variable {} not found in {}".format(yvar, dset))
                _logger.warning(repr(exc))
                form.no_data(repr(exc))
                return render(
                    request, self.template_name, {
                        'version': _version,
                        'form': form,
                        'dataset': dset,
                        'datasets': dsets,
                        'variables': dsetvars,
                        'soundings': mark_safe(json.dumps(soundings)),
                        'projects': projs,
                        'platforms': plats
                    })

            if yvar not in sel_vars:
                sel_vars.append(yvar)

        series_name_fmt = None
        if dset.dset_type == "sounding":
            series_name_fmt = SOUNDING_NAME_FMT
        else:
            sel_soundings = None

        # If the variables exist in the dataset model, get their
        # attributes there, otherwise from the actual dataset.
        if dset.variables.all():
            variables = {
                k: {
                    'units': dset.variables.get(name=k).units,
                    'long_name': dset.variables.get(name=k).long_name
                }
                for k in sel_vars
            }
        else:
            variables = {k: dsetvars[k] for k in sel_vars}

        stndims = {"station": [int(stn) for stn in sel_stns]}

        try:
            if isinstance(dset, nc_models.FileDataset):
                ncdset = dset.get_netcdf_dataset()
                indata = ncdset.read_time_series(
                    sel_vars,
                    start_time=start_time,
                    end_time=end_time,
                    selectdim=stndims,
                    series=sel_soundings,
                    series_name_fmt=series_name_fmt)
            else:
                dbcon = dset.get_connection()
                indata = dbcon.read_time_series(sel_vars,
                                                start_time=start_time,
                                                end_time=end_time)

        except nc_exc.TooMuchDataException as exc:
            _logger.warning("%s, %s: %s", project_name, dataset_name, exc)
            form.too_much_data(repr(exc))
            return render(
                request, self.template_name, {
                    'version': _version,
                    'form': form,
                    'dataset': dset,
                    'datasets': dsets,
                    'variables': dsetvars,
                    'soundings': mark_safe(json.dumps(soundings)),
                    'projects': projs,
                    'platforms': plats
                })

        except (OSError, nc_exc.NoDataException) as exc:
            _logger.warning("%s, %s: %s", project_name, dataset_name, exc)
            form.no_data(repr(exc))
            return render(
                request, self.template_name, {
                    'version': _version,
                    'form': form,
                    'dataset': dset,
                    'datasets': dsets,
                    'variables': dsetvars,
                    'soundings': mark_safe(json.dumps(soundings)),
                    'projects': projs,
                    'platforms': plats
                })

        time0 = {}
        vsizes = {}

        for series_name in indata:
            ser_data = indata[series_name]
            vsizes[series_name] = {}
            if series_name == "":
                for vname in sel_vars:
                    vsizes[series_name][vname] = 0
                    try:
                        # works for any shape, as long as time is the
                        # first dimension
                        vindex = ser_data['vmap'][vname]
                        vsizes[series_name][vname] = ser_data['data'][
                            vindex].size
                        lastok = np.where(
                            ~np.isnan(ser_data['data'][vindex]))[0][-1]
                        time_last_ok = ser_data['time'][lastok]
                    except IndexError:  # all data is nan
                        time_last_ok = (start_time - \
                            datetime.timedelta(seconds=0.001)).timestamp()
                    except KeyError:  # variable not in vmap
                        continue

                    try:
                        time_last = ser_data['time'][-1]
                    except IndexError:  # no data
                        time_last = time_last_ok

                    client_state.save_data_times(vname, time_last_ok,
                                                 time_last)

            # A simple compression, subtract first time from all times,
            # reducing the number of characters sent.
            time0[series_name] = 0
            if ser_data['time']:
                time0[series_name] = ser_data['time'][0]

            # subtract off time0
            ser_data['time'] = [x - time0[series_name] for \
                    x in ser_data['time']]

        json_time0 = mark_safe(json.dumps(time0))
        json_time = mark_safe(
            json.dumps({sn: indata[sn]['time']
                        for sn in indata}))
        json_data = mark_safe(
            json.dumps({sn: indata[sn]['data']
                        for sn in indata},
                       cls=NChartsJSONEncoder))
        json_vmap = mark_safe(
            json.dumps({sn: indata[sn]['vmap']
                        for sn in indata},
                       cls=NChartsJSONEncoder).replace("'", r"\u0027"))
        json_dim2 = mark_safe(
            json.dumps({sn: indata[sn]['dim2']
                        for sn in indata},
                       cls=NChartsJSONEncoder).replace("'", r"\u0027"))

        # indata may not have stnnames element
        json_stns = mark_safe(
            json.dumps(
                {
                    sn: (indata[sn]['stnnames']
                         if 'stnnames' in indata[sn] else {})
                    for sn in indata
                },
                cls=NChartsJSONEncoder).replace("'", r"\u0027"))

        def type_by_dims(dimnames):
            """Crude function to return a plot type, given a variable's
            dimension names.
            """
            if len(dimnames) == 1:
                return 'time-series'
            elif len(dimnames) == 2:
                if dimnames[1] == "station":
                    return 'time-series'
                return 'heatmap'
            else:
                return 'none'

        plot_types = set()
        if len(indata) == 1 and '' in indata:
            # one series, named ''
            for vname, var in variables.items():
                ptype = "time-series"
                if vname in indata['']['vmap']:
                    vindex = indata['']['vmap'][vname]
                    vdimnames = dsetvars[vname]["dimnames"]
                    # print("vname=",vname,",shape=",str(indata['']['data'][vindex].shape))
                    # print("vname=",vname,",nbytes=",str(indata['']['data'][vindex].nbytes))
                    # print("vname=",vname,",ndim=",str(indata['']['data'][vindex].ndim))
                    # print("dsetvars[",vname,"]['dimnames']=",str(dsetvars[vname]["dimnames"]))
                    ptype = type_by_dims(vdimnames)
                var['plot_type'] = ptype
                plot_types.add(ptype)
        else:
            plot_types.add("sounding-profile")

        # Create plot groups dictionary, for each
        # group, the variables in the group, their units, long_names, plot_type
        # Use OrderedDict so the plots come out in this order
        plot_groups = collections.OrderedDict()

        # loop over plot_types
        grpid = 0
        for ptype in plot_types:
            # _logger.debug("ptype=%s", ptype)

            # For a heatmap, one plot per variable.
            if ptype == 'heatmap':
                for vname in sorted(variables):  # returns sorted keys
                    var = variables[vname]
                    if vsizes[''][vname] > 0 and var['plot_type'] == ptype:
                        plot_groups['g{}'.format(grpid)] = {
                            'series':
                            "",
                            'variables':
                            mark_safe(
                                json.dumps([vname]).replace("'", r"\u0027")),
                            'units':
                            mark_safe(
                                json.dumps([var['units']
                                            ]).replace("'", r"\u0027")),
                            'long_names':
                            mark_safe(
                                json.dumps([var['long_name']
                                            ]).replace("'", r"\u0027")),
                            'plot_type':
                            mark_safe(ptype),
                        }
                        grpid += 1
            elif ptype == 'sounding-profile':
                # one profile plot per series name
                for series_name in sorted(indata.keys()):
                    vnames = sorted(variables)
                    units = [variables[v]['units'] for v in vnames]
                    long_names = [(variables[v]['long_name'] \
                        if 'long_name' in variables[v] else v) for v in vnames]
                    plot_groups['g{}'.format(grpid)] = {
                        'series':
                        series_name,
                        'variables':
                        mark_safe(json.dumps(vnames).replace("'", r"\u0027")),
                        'units':
                        mark_safe(json.dumps(units).replace("'", r"\u0027")),
                        'long_names':
                        mark_safe(
                            json.dumps(long_names).replace("'", r"\u0027")),
                        'plot_type':
                        mark_safe(ptype),
                    }
                    grpid += 1
            else:
                # unique units, in alphabetical order by the name of the
                # first variable which uses it. In this way the plots
                # are in alphabetical order on the page by the first plotted variable.
                uunits = []
                # sorted(dict) becomes a list of sorted keys
                for vname in sorted(variables):
                    units = ''
                    if 'units' in variables[vname]:
                        units = variables[vname]['units']
                    else:
                        variables[vname]['units'] = units
                    if units not in uunits:
                        uunits.append(units)

                # unique units
                for units in uunits:
                    uvars = sorted(
                        [vname for vname, var in variables.items()
                         if vsizes[''][vname] > 0 and
                         var['plot_type'] == ptype and
                         var['units'] == units])
                    # uvars is a sorted list of variables with units and this plot type.
                    # Might be empty if the variable is of a different plot type
                    if uvars:
                        plot_groups['g{}'.format(grpid)] = {
                            'series': "",
                            'variables': mark_safe(
                                json.dumps(uvars).replace("'", r"\u0027")),
                            'units': mark_safe(json.dumps(
                                [(variables[v]['units'] if 'units' in variables[v] else '') \
                                    for v in uvars]).replace("'", r"\u0027")),
                            'long_names': mark_safe(json.dumps(
                                [(variables[v]['long_name'] \
                                    if 'long_name' in variables[v] else '') \
                                    for v in uvars]).replace("'", r"\u0027")),
                            'plot_type': mark_safe(ptype),
                        }
                        grpid += 1

        # log the request

        if len(variables) > 2:
            logvars = sorted(variables)[:2] + ['...']
        else:
            logvars = variables
        # count records over all series, rather than relying on
        # ser_data leaking out of the loop above
        nrecs = sum(len(indata[sn]['time']) for sn in indata)
        _request_logger.info(
            "%s, %s, %s, #recs=%d, real_time=%s, vars=%s, #vars=%d, "
            "stns=%s, snding=%s, plot=%s, fromaddr=%s",
            dset.project.name, dset.name, start_time, nrecs,
            client_state.track_real_time, ' '.join(logvars), len(variables),
            ' '.join([str(s) for s in sel_stns]),
            ' '.join([str(s) for s in soundings]),
            ' '.join(plot_types), request.META['REMOTE_ADDR'])

        return render(
            request, self.template_name, {
                'version': _version,
                'form': form,
                'dataset': dset,
                'datasets': dsets,
                'variables': dsetvars,
                'plot_groups': plot_groups,
                'time0': json_time0,
                'time': json_time,
                'data': json_data,
                'vmap': json_vmap,
                'dim2': json_dim2,
                'stations': json_stns,
                'time_length': client_state.time_length,
                'soundings': mark_safe(json.dumps(soundings)),
                'yvariable': yvar.replace("'", r"\u0027"),
                'projects': projs,
                'platforms': plats
            })
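The time0 "compression" used above (send the first timestamp once, then only
offsets) can be checked in isolation; a round-trip sketch with made-up
timestamps:

# Stand-alone sketch of the time0 compression in the view above.
times = [1438387200.0, 1438387201.0, 1438387202.5]   # made-up data

time0 = times[0] if times else 0
offsets = [t - time0 for t in times]   # shorter strings when JSON-ified

# the client reverses it by adding time0 back to each offset
assert [time0 + off for off in offsets] == times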
Example #6
    def read_time_series(self,
                         variables=(),
                         start_time=pytz.utc.localize(datetime.min),
                         end_time=pytz.utc.localize(datetime.max),
                         size_limit=1000 * 1000 * 1000):
        """Read times and variables from the table within a time period.

        For each variable, its missing_value will be read from the
        variable_list table. Values read from the time series table
        which match the missing_value will be set to float('nan').

        Args:
            variables: list or tuple of variable names to read.
            start_time: starting datetime of data to be read.
            end_time: ending datetime of data to be read.
            size_limit: attempt to screen outrageous requests.

        Returns:
            A one-element dict, compatible with that returned by
            netcdf.read_time_series(), containing for a series_name of '':
            {
                'time' : list of UTC timestamps,
                'data': lists of numpy.ndarray containing
                    the data for each variable,
                'vmap': dict by variable name,
                    containing the index into the series data for the variable,
                'dim2': dict by variable name, of values for second
                    dimension of the data, such as height.
            }
        Raises:
            nc_exc.NoDataException
        """

        total_size = 0

        # the start and end times are assumed to be UTC; the table
        # stores timezone-naive UTC datetimes
        start_time = start_time.replace(tzinfo=None)
        end_time = end_time.replace(tzinfo=None)

        vtime = self.read_times(start_time=start_time, end_time=end_time)
        # _logger.debug("read_times, len=%d", len(vtime))

        total_size += sys.getsizeof(vtime)
        if total_size > size_limit:
            raise nc_exc.TooMuchDataException(
                "too many time values requested, size={0} MB".\
                format(total_size/(1000 * 1000)))

        vdata = []
        vmap = {}
        vdim2 = {}

        # set operation before the first query, so the except clause
        # below can report a failure in making the connection itself
        operation = "connect"
        try:
            with self.conn as conn:
                with conn.cursor() as cur:
                    for vname in variables:

                        operation = "read variable_list"
                        # _logger.debug("vname=%s",vname)
                        cur.execute(
                            "SELECT dims, missing_value from variable_list where name=%s;",
                            (vname, ))
                        vinfo = cur.fetchall()
                        # _logger.debug("vinfo=%s",vinfo)
                        if not vinfo:
                            raise nc_exc.NoDataException(
                                "read {}: not in variable_list".format(vname))
                        dims = vinfo[0][0]
                        dims[0] = len(vtime)
                        missval = vinfo[0][1]

                        if len(dims) > 1:
                            # In initial CSET data, dims for CUHSAS_RWOOU
                            # in variable_list was [1,99]
                            # Seems that the 99 should have been 100,
                            # which is what is returned by this:
                            operation = "read dimension of {}".format(vname)
                            cur.execute("\
    SELECT array_upper({},1) FROM {} FETCH FIRST 1 ROW ONLY;\
    ".format(vname, self.table))

                            dimsx = cur.fetchall()[0]
                            dims[1] = dimsx[0]
                            # _logger.debug("vname=%s, dims=%s, dimsx=%s", vname, dims, dimsx)

                        operation = "read {}".format(vname)
                        cur.execute(
                            "SELECT {} FROM {} WHERE datetime >= %s "
                            "AND datetime < %s;".format(vname, self.table),
                            (start_time, end_time))

                        cdata = np.ma.masked_values(
                            np.ndarray(
                                shape=dims,
                                buffer=np.array([v for v in cur],
                                                dtype=float)),
                            value=missval)

                        if isinstance(cdata, np.ma.core.MaskedArray):
                            # _logger.debug("is MaskedArray")
                            cdata = cdata.filled(fill_value=float('nan'))

                        total_size += sys.getsizeof(cdata)
                        if total_size > size_limit:
                            raise nc_exc.TooMuchDataException(
                                "too many values requested, size={0} MB".\
                                format(total_size/(1000 * 1000)))
                        vindex = len(vdata)
                        vdata.append(cdata)
                        vmap[vname] = vindex
                        if len(dims) > 1:
                            vdim2[vname] = {
                                "data": [i for i in range(dims[1])],
                                "name": "bin",
                                "units": ""
                            }

                    return {
                        '': {
                            'time': vtime,
                            'data': vdata,
                            'vmap': vmap,
                            'dim2': vdim2,
                        }
                    }

        except psycopg2.Error as exc:
            # self.conn, not conn: conn is unbound if __enter__ raised
            RAFDatabase.close_connection(self.conn)
            raise nc_exc.NoDataException((operation + ": {}").format(exc))
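A sketch of reading from the database class and indexing the result through
vmap, assuming db from Example #4 and a made-up variable name; database
reads always return a single series named ''.

import datetime
import pytz

# db is an assumed RAFDatabase instance; "ATX" is a made-up name
start = pytz.utc.localize(datetime.datetime(2015, 7, 1))
result = db.read_time_series(
    ["ATX"],
    start_time=start,
    end_time=start + datetime.timedelta(hours=1))
series = result['']                           # the single '' series
atx = series['data'][series['vmap']['ATX']]   # numpy.ndarray
print(len(series['time']), "times, data shape", atx.shape)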
Example #7
    def read_time_series(
            self,
            variables=(),
            start_time=pytz.utc.localize(datetime.min),
            end_time=pytz.utc.localize(datetime.max),
            selectdim=None,
            size_limit=1000 * 1000 * 1000,
            series=None,
            series_name_fmt=None):
        """ Read a list of time-series variables from this fileset.

        Args:
            variables: A list of strs containing time series variable
                names to be read.
            start_time: A datetime, which is timezone aware, of the start
                time of the series to read.
            end_time: A datetime, timezone aware, end time of series to read.
            selectdim: A dict containing for each dimension name of type
                string, the indices of the dimension to read.
                For example: {"station":[3,4,5]} to read indices 3,4 and 5
                (indexed from 0) of the station dimension for variables
                which have that dimension.
            size_limit: Limit on the total size in bytes to read, used to
                screen huge requests.
            series: A list of series to be read by name. For soundings
                a series name is something like "Aug23_0000Z", as
                created by series_name_fmt from the time associated with
                a file.  In this way the data read can be split into
                named series.  If series_name_fmt is None, series
                should be a list of one empty string, [''],
                and all data are concatenated together in time order.
            series_name_fmt: A datetime.strftime format to create a
                series name for the data found in each file, based
                on the time associated with the file.
                If series_name_fmt is None, all data is put in a dictionary
                element named ''.

        Returns:
            A dict containing, by series name:
                'time' : list of UTC timestamps,
                'data': list of numpy.ndarray containing the data for
                    each variable,
                'vmap': dict by variable name,
                    containing the index into the series data for the variable,
                'dim2': dict by variable name, of values for second dimension
                    of the data, such as height,
                'stnnames': dict by variable name, of the list of the
                    station names for the variable that were read,
                    as selected by selectdim. A list of length 1 containing
                    an empty string indicates the variable does not have
                    a station dimension.

        Raises:
            OSError
            nc_exc.NoDataException

        The 'data' element in the returned dict is a list of numpy arrays,
        and not a dict by variable name. The 'vmap' element provides the
        mapping from a variable name to an index into 'data'. The data object
        is typically JSON-ified and sent to a browser. If it were a dict,
        the variable names may contain characters which cause headaches with
        JSON and javascript in django templates. For example, the JSON-ified
        string is typically passed to javascript in a django template by
        surrounding it with single quotes:
            var data = jQuery.parseJSON('{{ data }}');
        A single quote within the data JSON string causes grief, and we want
        to support single quotes in variable names. The only workaround I
        know of is to convert the single quotes within the string to '\u0027'.
        This is, of course, a time-consuming step we want to avoid when
        JSON-ifying a large chunk of data.  It is less time-consuming to
        replace the quotes in the smaller vmap.

        The series names will not contain single quotes.

        """

        debug = False

        dsinfo = self.get_dataset_info()

        if not dsinfo['time_name']:
            self.scan_files()
            dsinfo = self.get_dataset_info()

        dsinfo_vars = dsinfo['variables']

        if not selectdim:
            selectdim = {}

        vshapes = self.resolve_variable_shapes(variables, selectdim)

        res_data = {}

        total_size = 0
        ntimes = 0

        files = self.get_files(start_time, end_time)
        if debug:
            _logger.debug(
                "len(files)=%d, series_name_fmt=%s",
                len(files), series_name_fmt)

        if series_name_fmt:
            file_tuples = [(f.time.strftime(series_name_fmt), f.path) \
                for f in files]
        else:
            file_tuples = [("", f.path) for f in files]

        for (series_name, ncpath) in file_tuples:

            if series and not series_name in series:
                continue

            if debug:
                _logger.debug("series=%s", str(series))
                _logger.debug("series_name=%s ,ncpath=%s", series_name, ncpath)

            # the files might be in the process of being moved, deleted, etc
            fileok = False
            exc = None
            for itry in range(0, 3):
                try:
                    ncfile = netCDF4.Dataset(ncpath)
                    fileok = True
                    break
                except (OSError, RuntimeError) as excx:
                    exc = excx
                    time.sleep(itry)

            if not fileok:
                _logger.error("%s: %s", ncpath, exc)
                continue

            if not series_name in res_data:
                res_data[series_name] = {
                    'time': [],
                    'data': [],
                    'vmap': {},
                    'dim2': {},
                    'stnnames': {},
                }

            otime = res_data[series_name]['time']
            odata = res_data[series_name]['data']
            ovmap = res_data[series_name]['vmap']
            odim2 = res_data[series_name]['dim2']
            ostns = res_data[series_name]['stnnames']

            try:
                size1 = sys.getsizeof(otime)

                # times are appended to otime
                time_slice = self.read_times(
                    ncfile, ncpath, start_time, end_time, otime,
                    size_limit - total_size)

                # time_slice.start is None if nothing to read
                if time_slice.start is None or \
                    time_slice.stop <= time_slice.start:
                    continue

                total_size += sys.getsizeof(otime) - size1

                for exp_vname in variables:

                    # skip if variable is not a time series or
                    # doesn't have a selected dimension
                    if not exp_vname in dsinfo_vars or not exp_vname in vshapes:
                        continue

                    # selected shape of this variable
                    vshape = vshapes[exp_vname]
                    vsize = reduce_(
                        operator.mul, vshape, 1) * \
                        dsinfo_vars[exp_vname]["dtype"].itemsize

                    if not vsize:
                        continue

                    if total_size + vsize > size_limit:
                        raise nc_exc.TooMuchDataException(
                            "too much data requested, will exceed {} mbytes".
                            format(size_limit/(1000 * 1000)))

                    dim2 = {}
                    stnnames = []
                    vdata = self.read_time_series_data(
                        ncfile, ncpath, exp_vname, time_slice, vshape,
                        selectdim, dim2, stnnames)

                    if vdata is None:
                        continue

                    # dim2 will be empty if variable is not found in file
                    if dim2 and not exp_vname in odim2:
                        odim2[exp_vname] = dim2

                    # stnnames will be empty if variable is not found in file
                    if stnnames and not exp_vname in ostns:
                        ostns[exp_vname] = stnnames

                    if not exp_vname in ovmap:
                        size1 = 0
                        vindex = len(odata)
                        odata.append(vdata)
                        ovmap[exp_vname] = vindex
                    else:
                        # look up vindex before using it in the debug
                        # message; it previously still held the index of
                        # the last variable appended
                        vindex = ovmap[exp_vname]
                        if debug:
                            _logger.debug(
                                "odata[%s].shape=%s, vdata.shape=%s",
                                exp_vname, odata[vindex].shape, vdata.shape)

                        size1 = sys.getsizeof(odata[vindex])

                        time_index = dsinfo_vars[exp_vname]["time_index"]
                        odata[vindex] = np.append(
                            odata[vindex], vdata, axis=time_index)

                    total_size += sys.getsizeof(odata[vindex]) - size1

            finally:
                ncfile.close()

            ntimes += len(otime)

        if ntimes == 0:
            exc = nc_exc.NoDataException(
                "No data between {} and {}".
                format(
                    start_time.isoformat(),
                    end_time.isoformat()))
            # _logger.warning("%s: %s", str(self), repr(exc))
            raise exc

        if debug:
            for series_name in res_data:
                for exp_vname in res_data[series_name]['vmap']:
                    var_index = res_data[series_name]['vmap'][exp_vname]
                    _logger.debug(
                        "res_data[%s]['data'][%d].shape=%s, exp_vname=%s",
                        series_name, var_index,
                        repr(res_data[series_name]['data'][var_index].shape),
                        exp_vname)
            _logger.debug(
                "total_size=%d", total_size)

        return res_data
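The docstring's point about single quotes in JSON handed to a Django
template can be demonstrated directly; this sketch shows that the \u0027
replacement removes the raw quote while keeping the JSON decodable:

import json

vmap = {"T'air": 0}                    # made-up name containing a quote
raw = json.dumps(vmap)                 # contains a raw ' that would end
                                       # the '{{ data }}' string early
safe = raw.replace("'", r"\u0027")
assert "'" not in safe                 # no raw quote left
assert json.loads(safe) == vmap        # \u0027 still decodes to the quote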
Example #8
    def scan_files(
            self,
            time_names=('time', 'Time', 'time_offset')):
        """ Scan the set of files, accumulating information about the dataset in a dict,
        with the following keys:
            file_mod_times: dictionary, by file name, of each file's
                modification time just before it was last scanned.
            base_time: name of base_time variable
            time_dim_name: name of time dimension
            time_name: name of time variable
            nstations: size of the station dimension
            station_dim: name of the station dimension
            station_names: names of the stations
            sites: dictionary of site long names, keyed by the site short
                names extracted from the exported names
                of those variables not associated with a numbered station
            variables: dictionary of information for each variable.

        If an element in file_mod_times exists for a file, that file is
        rescanned only if its modification time has changed.

        The names of the variables in the dataset are converted to an exported
        form. If a variable has a 'short_name' attribute, it is used for the
        variable name, otherwise the exported name is set to the NetCDF variable
        name.

        Note that we don't read every file.  We may want to make
        MAX_NUM_FILES_TO_PRESCAN an attribute of the dataset.

        Args:
            time_names: List of allowed names for time variable.

        Raises:
            OSError
            nc_exc.NoDataException
        """

        dsinfo = self.get_dataset_info()

        # Note: dsinfo_vars is a reference. Modifications to it
        # are also modifications to dsinfo.
        dsinfo_vars = dsinfo['variables']

        sitedict = dsinfo['sites']

        files = self.get_files(
            start_time=self.start_time,
            end_time=self.end_time)

        # get_files() typically also returns the file before start_time.
        # We may want that when reading a period of data, but not
        # when assembling the variables for the dataset.
        filepaths = [f.path for f in files if f.time >= self.start_time and f.time < self.end_time]

        skip = 1
        if len(filepaths) > NetCDFDataset.MAX_NUM_FILES_TO_PRESCAN:
            skip = len(filepaths) / NetCDFDataset.MAX_NUM_FILES_TO_PRESCAN

        # Read at most MAX_NUM_FILES_TO_PRESCAN, including latest file.
        # Files are scanned in a backwards sequence
        pindex = len(filepaths) - 1

        n_files_read = 0

        while pindex >= 0:
            ncpath = filepaths[int(pindex)]
            pindex -= skip

            # The files might be in the process of being moved, deleted, etc,
            # so if we get an exception in this open, try a few more times.

            # Testing indicates that with a truncated file (artificially
            # truncated with dd), the underlying C code will cause a crash
            # of python from an assert() rather than raising an exception
            # that could be caught.

            # If the netcdf library is compiled with -DNDEBUG, then the
            # the open and parse of the truncated header succeeds, but
            # still no exception.

            # If the file is artificially corrupted by removing an
            # initial portion of the file:
            #   dd if=test.nc of=bad.nc bs=1014 count=100 skip=1
            # then an exception is raised (this was with -DNDEBUG):
            # RuntimeError bad.nc: NetCDF: Unknown file format

            # To make this robust, it would be good to run a king's
            # taster process on each file first to reduce the possibility
            # of a server death. The king's taster would not use NDEBUG,
            # but perhaps the python server would.  Complicated.

            fileok = False
            skip_file = False
            exc = None

            siteset = set()
            site_sn = []
            site_ln = []

            for itry in range(0, 3):
                try:
                    curr_mod_time = get_file_modtime(ncpath)
                    if ncpath in dsinfo['file_mod_times']:
                        prev_mod_time = dsinfo['file_mod_times'][ncpath]
                        if curr_mod_time <= prev_mod_time:
                            skip_file = True
                            fileok = True
                            break
                    dsinfo['file_mod_times'][ncpath] = curr_mod_time
                    # _logger.debug("ncpath=%s",ncpath)
                    ncfile = netCDF4.Dataset(ncpath)
                    fileok = True
                    break
                except (OSError, RuntimeError) as excx:
                    exc = excx
                    time.sleep(itry)

            if not fileok:
                _logger.error("%s: %s", ncpath, exc)
                continue

            n_files_read += 1

            if skip_file:
                continue

            try:
                if not dsinfo['base_time'] and 'base_time' in ncfile.variables:
                    dsinfo['base_time'] = 'base_time'

                tdim = None
                # look for a time dimension
                for tname in ['time', 'Time']:
                    if tname in ncfile.dimensions:
                        tdim = ncfile.dimensions[tname]
                        break
                if not tdim:
                    continue

                # check for tdim.is_unlimited?
                if not dsinfo['time_dim_name']:
                    dsinfo['time_dim_name'] = tdim.name

                if STATION_DIMENSION_NAME in ncfile.dimensions:
                    if dsinfo['nstations'] is None:
                        dsinfo['nstations'] = len(ncfile.dimensions[STATION_DIMENSION_NAME])
                        dsinfo['station_dim'] = STATION_DIMENSION_NAME
                        if STATION_DIMENSION_NAME in ncfile.variables:
                            dsinfo['station_names'] = []
                            var = ncfile.variables[STATION_DIMENSION_NAME]
                            if var.datatype == np.dtype('S1'):

                                snms = [str(netCDF4.chartostring(v)) for v in var]
                                dsinfo['station_names'].extend(snms)
                    elif dsinfo['nstations'] != \
                            len(ncfile.dimensions[STATION_DIMENSION_NAME]):
                        _logger.warning(
                            "%s: station dimension (%d) is "
                            "different than that of other files (%d)",
                            ncpath,
                            len(ncfile.dimensions[STATION_DIMENSION_NAME]),
                            dsinfo['nstations'])


                # look for a time variable
                if not dsinfo['time_name']:
                    for tname in time_names:
                        if tname in ncfile.variables:
                            if tdim.name in ncfile.variables[tname].dimensions:
                                dsinfo['time_name'] = tname
                                break

                if not dsinfo['time_name'] or \
                    not dsinfo['time_name'] in ncfile.variables:
                    # time variable not yet found or not in this file
                    continue

                if not tdim.name in ncfile.variables[dsinfo['time_name']].dimensions:
                    # time variable in this file doesn't have a time dimension
                    continue

                # pylint: disable=no-member
                for (nc_vname, var) in ncfile.variables.items():

                    if nc_vname == "site_long_name" and \
                        var.datatype == np.dtype('S1'):
                        site_ln = [str(netCDF4.chartostring(v)) for v in var]

                    if nc_vname == "sites" and \
                        var.datatype == np.dtype('S1'):
                        site_sn = [str(netCDF4.chartostring(v)) for v in var]

                    # looking for time series variables
                    if not dsinfo['time_dim_name'] in var.dimensions:
                        continue

                    # time variable
                    if nc_vname == dsinfo['time_name']:
                        continue

                    # exported variable name
                    if hasattr(var, 'short_name'):
                        exp_vname = getattr(var, 'short_name')
                    else:
                        exp_vname = nc_vname

                    # var.dimensions is a tuple of dimension names
                    time_index = var.dimensions.index(dsinfo['time_dim_name'])

                    # Check if we have found this variable in an earlier file
                    if not exp_vname in dsinfo_vars:

                        # New variable
                        varinfo = {}
                        varinfo['netcdf_name'] = nc_vname
                        varinfo['shape'] = var.shape
                        varinfo['dimnames'] = var.dimensions
                        varinfo['dtype'] = var.dtype
                        varinfo['time_index'] = time_index

                        # Grab certain attributes
                        for att in ['units', 'long_name']:
                            if hasattr(var, att):
                                varinfo[att] = getattr(var, att)

                        # Set default units to ''
                        if 'units' not in varinfo:
                            varinfo['units'] = ''

                        # For non-station variables, parse the name to
                        # determine the possible site
                        if not dsinfo['station_dim'] or \
                            not dsinfo['station_dim'] in var.dimensions:
                            site = get_isfs_site(exp_vname)
                            if site:
                                varinfo['site'] = site
                                siteset.add(site)
                                # dsinfo['sites'].add(site)
                        else:
                            dsinfo['has_station_variables'] = True

                        dsinfo_vars[exp_vname] = varinfo
                        continue

                    varinfo = dsinfo_vars[exp_vname]

                    # variable has been found in an earlier ncfile;
                    # check for consistency across files
                    if varinfo['shape'][1:] != var.shape[1:]:
                        # the above check works even if either shape
                        # has length 1
                        if len(varinfo['shape']) != \
                                len(var.shape):
                            # changing number of dimensions, punt
                            _logger.error(
                                "%s: %s: number of "
                                "dimensions is not consistent: %d and %d. "
                                "Skipping this variable.",
                                ncpath, nc_vname, len(var.shape),
                                len(varinfo['shape']))
                            del dsinfo_vars[exp_vname]
                            continue
                        # here we know that shapes have same length and
                        # they must have len > 1. Allow final dimension
                        # to change.
                        ndim = len(var.shape)
                        if (varinfo['shape'][1:(ndim-1)] !=
                                var.shape[1:(ndim-1)]):
                            _logger.error(
                                "%s: %s: incompatible shapes: "
                                "%s and %s. Skipping this variable.",
                                ncpath, nc_vname, repr(var.shape),
                                repr(varinfo['shape']))
                            del dsinfo_vars[exp_vname]
                            continue
                        # set shape to max shape (leaving the problem
                        # for later...)
                        varinfo['shape'] = tuple(
                            [max(i, j) for (i, j) in zip(
                                varinfo['shape'], var.shape)])

                    if varinfo['dtype'] != var.dtype:
                        _logger.error(
                            "%s: %s: type=%s is different than "
                            "in other files",
                            ncpath, nc_vname, repr(var.dtype))

                    if varinfo['time_index'] != time_index:
                        _logger.error(
                            "%s: %s: time_index=%d is different than "
                            "in other files. Skipping this variable.",
                            ncpath, nc_vname, time_index)
                        del dsinfo_vars[exp_vname]
                        continue

                    for att in ['units', 'long_name']:
                        if hasattr(var, att) and att in varinfo:
                            if getattr(var, att) != varinfo[att]:
                                _logger.info(
                                    "%s: %s: %s=%s is different than previous value=%s",
                                    ncpath, nc_vname, att, getattr(var, att),
                                    varinfo[att])
                                varinfo[att] = getattr(var, att)

            finally:
                for site in siteset:
                    try:
                        i = site_sn.index(site)
                        if i < len(site_ln):
                            sitedict[site] = site_ln[i]
                    except ValueError:
                        pass

                    if site not in sitedict:
                        sitedict[site] = ''
                ncfile.close()

        if not n_files_read:
            msg = self.path + ": No files found"
            raise nc_exc.NoDataException(msg)

        # Remove the station names if no variables have a station dimension
        if not dsinfo['has_station_variables']:
            dsinfo['station_names'] = []
        else:
            # create station names if a "station" variable is not found
            # in NetCDF files. Names are S1, S2, etc for dimension index 0, 1
            if dsinfo['nstations'] and not dsinfo['station_names']:
                dsinfo['station_names'].extend(\
                    ['S{}'.format(i+1) for i in range(dsinfo['nstations'])])

        # cache dsinfo
        self.save_dataset_info(dsinfo)
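The backward, fractional-skip sampling above can be puzzling at first read;
a stand-alone sketch with a small MAX_NUM_FILES_TO_PRESCAN shows how it
scans at most that many files, newest first, spread across the whole set:

# Sketch of the file-sampling loop in scan_files(), with made-up paths.
MAX_NUM_FILES_TO_PRESCAN = 4
filepaths = ["f{}.nc".format(i) for i in range(10)]

skip = 1
if len(filepaths) > MAX_NUM_FILES_TO_PRESCAN:
    skip = len(filepaths) / MAX_NUM_FILES_TO_PRESCAN   # 2.5, a float

pindex = len(filepaths) - 1
sampled = []
while pindex >= 0:
    sampled.append(filepaths[int(pindex)])
    pindex -= skip

print(sampled)   # ['f9.nc', 'f6.nc', 'f4.nc', 'f1.nc']: newest first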