Example #1
File: fstd.py Project: neishm/pygeode-rpn
def encode_time_axis (varlist):
  from pygeode.timeaxis import StandardTime
  from pygeode.timeutils import reltime
  from pygeode.formats import fstd_core
  import numpy as np
  for i,var in enumerate(varlist):
    if not var.hasaxis(StandardTime): continue
    time = var.getaxis(StandardTime)
    seconds = reltime (time, startdate=dict(year=1980,month=1,day=1), units='seconds')
    seconds = np.asarray(seconds,dtype=int)
    values = fstd_core.date2stamp(seconds)
    # Dateo is an axis class defined elsewhere in fstd.py (not shown in this excerpt)
    taxis = Dateo(values=values)
    varlist[i] = var.replace_axes(time=taxis)
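
A minimal driving sketch (hedged: assumes pygeode and the pygeode-rpn fstd_core extension are importable and that encode_time_axis is in scope; the variable and dates are made up):

import numpy as np
from pygeode.timeaxis import StandardTime
from pygeode.var import Var

taxis = StandardTime(values=np.arange(4), units='days',
                     startdate=dict(year=2000, month=1, day=1))
var = Var([taxis], values=np.zeros(4), name='TT')
varlist = [var]
encode_time_axis(varlist)  # replaces each StandardTime axis with RPN date stamps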
Example #2
File: cfmeta.py Project: admg26/pygeode
def encode_cf (dataset):
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis, NonCoordinateAxis, Station
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode.axis import NamedAxis, DummyAxis
  from pygeode.var import Var
  from pygeode.timeutils import reltime
  from copy import copy
  dataset = asdataset(dataset)
  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts.copy()
  del dataset

  # Fix the variable names
  for i,v in enumerate(list(varlist)):
    oldname = v.name
    newname = fix_name(oldname)
    if newname != oldname:
      from warnings import warn
      warn ("renaming '%s' to '%s'"%(oldname,newname))
      varlist[i] = v.rename(newname)

  # Fix the axis names
  #TODO

  # Fix the variable metadata
  #TODO

  # Fix the global metadata
  # Specify the conventions we're (supposedly) using
  global_atts['Conventions'] = "CF-1.0"

  for v in varlist: assert v.name not in axisdict, "'%s' refers to both a variable and an axis"%v.name

  # Metadata based on axis classes
  for name,a in list(axisdict.items()):
    atts = a.atts.copy()
    plotatts = a.plotatts.copy() # passed on to Axis constructor
    
    if isinstance(a,Lat):
      atts['standard_name'] = 'latitude'
      atts['units'] = 'degrees_north'
    if isinstance(a,Lon):
      atts['standard_name'] = 'longitude'
      atts['units'] = 'degrees_east'
    if isinstance(a,Pres):
      atts['standard_name'] = 'air_pressure'
      atts['units'] = 'hPa'
      atts['positive'] = 'down'
    if isinstance(a,Hybrid):
      #TODO: formula_terms (how do we specify LNSP instead of P0?????)
      atts['standard_name'] = 'atmosphere_hybrid_sigma_pressure_coordinate'
    if isinstance(a,Time):
      atts['standard_name'] = 'time'
      #TODO: change the unit depending on the time resolution?
      start = a.startdate
      atts['units'] = '%s since %04i-%02i-%02i %02i:%02i:%02i'% (a.units,
        start.get('year',0), start.get('month',1), start.get('day',1),
        start.get('hour',0), start.get('minute',0), start.get('second',0)
      )
    if isinstance(a,StandardTime): atts['calendar'] = 'standard'
    if isinstance(a,ModelTime365): atts['calendar'] = '365_day'
    if isinstance(a,ModelTime360): atts['calendar'] = '360_day'
    if isinstance(a,Yearless): atts['calendar'] = 'none'

    if isinstance(a,XAxis): atts['axis'] = 'X'
    if isinstance(a,YAxis): atts['axis'] = 'Y'
    if isinstance(a,ZAxis): atts['axis'] = 'Z'
    if isinstance(a,TAxis): atts['axis'] = 'T'

    # Change the time axis to be relative to a start date
    #TODO: check 'units' attribute of the time axis, use that in the 'units' of the netcdf metadata
    if isinstance(a, Time):
      #TODO: cast into an integer array if possible
      axisdict[name] = NamedAxis(values=reltime(a), name=name, atts=atts, plotatts=plotatts)
      continue

    # Encode non-coordinate axes, including station (timeseries) data.
    # Loosely follow http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#_orthogonal_multidimensional_array_representation_of_time_series
    # Move station lat/lon/name data into separate variables.
    if isinstance(a, NonCoordinateAxis):

      # Keep track of extra variables created from auxarray data.
      extra_vars = []

      # Detect certain arrays that should be treated as "coordinates".
      coordinates = []

      # Encode station latitude.
      if 'lat' in a.auxarrays:
        lat = a.auxasvar('lat')
        lat.atts = dict(standard_name="latitude", long_name=a.name+" latitude", units="degrees_north")
        extra_vars.append(lat)
        coordinates.append('lat')
      # Encode station longitude.
      if 'lon' in a.auxarrays:
        lon = a.auxasvar('lon')
        lon.atts = dict(standard_name="longitude", long_name=a.name+" longitude", units="degrees_east")
        extra_vars.append(lon)
        coordinates.append('lon')

      coordinates = " ".join(coordinates)

      # Encode other auxarrays as generic "ancillary" arrays.
      ancillary_variables = []
      for auxname in list(a.auxarrays.keys()):
        if auxname in coordinates: continue  # Handled above
        var = a.auxasvar(auxname)
        if var.dtype.name.startswith('str'):
          var = encode_string_var(var)
        # Some extra CF encoding for the station name, to use it as the unique identifier.
        if auxname == 'station':
          var.atts = dict(cf_role = "timeseries_id")
        extra_vars.append(var)
        ancillary_variables.append(auxname)

      ancillary_variables = " ".join(ancillary_variables)

      # Attach these coordinates to all variables that use this axis.
      #TODO: cleaner way of adding this information without having to do a shallow copy.
      for i,var in enumerate(varlist):
        if var.hasaxis(a):
          var = copy(var)
          var.atts = copy(var.atts)
          if len(coordinates) > 0:
            var.atts['coordinates'] = coordinates
          if len(ancillary_variables) > 0:
            var.atts['ancillary_variables'] = ancillary_variables
          varlist[i] = var

      # Add these coordinates / ancillary variables to the output.
      varlist.extend(extra_vars)

      # The values in the axis itself are meaningless, so mark them as such
      axisdict[name] = DummyAxis(len(a),name=name)

      # Special case: Station (timeseries) data.
      if isinstance(a, Station):
        global_atts['featureType'] = "timeSeries"
      # Nothing more to do for this axis type
      continue

    # Encode custom axes from add-ons
    for n,c in list(custom_axes.items()):
      if isinstance(a,c):
        atts['standard_name'] = n

    # Add associated arrays as new variables
    auxarrays = a.auxarrays
    for aux,values in auxarrays.items():
      auxname = name+'_'+aux
      assert not any(v.name == auxname for v in varlist), "already have a variable named %s"%auxname
      varlist.append( Var([a], values=values, name=auxname) )
    if len(auxarrays) > 0:
      atts['ancillary_variables'] = ' '.join(name+'_'+aux for aux in auxarrays.keys())

    # Create new, generic axes with the desired attributes
    # (Replaces the existing entry in the dictionary)
    axisdict[name] = NamedAxis(values=a.values, name=name, atts=atts, plotatts=plotatts)

  # Apply these new axes to the variables
  for i,oldvar in enumerate(list(varlist)):
    name = oldvar.name
    try:
      #TODO: use Var.replace_axes instead?
      varlist[i] = var_newaxes(oldvar, [axisdict.get(a.name,a) for a in oldvar.axes], atts=oldvar.atts, plotatts=oldvar.plotatts)
    except KeyError:
      print('??', a.name, axisdict)
      raise

  dataset = Dataset(varlist, atts=global_atts)
  return dataset
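
A minimal sketch of calling encode_cf (hedged: assumes pygeode is installed and that the call runs inside cfmeta.py, where module-level helpers such as fix_name, custom_axes and var_newaxes are defined; the axis values are made up):

import numpy as np
from pygeode.axis import Lat, Lon
from pygeode.var import Var
from pygeode.dataset import Dataset

lat = Lat(np.arange(-90, 91, 45))         # 5 latitudes
lon = Lon(np.arange(0, 360, 90))          # 4 longitudes
temp = Var([lat, lon], values=np.zeros((5, 4)), name='temp')
out = encode_cf(Dataset([temp]))
print(out.atts['Conventions'])            # 'CF-1.0'
print(out.axisdict['lat'].atts['units'])  # 'degrees_north'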
Example #3
File: timeaxis.py Project: aerler/pygeode
 def reltime (self, startdate=None, units=None):
   from pygeode import timeutils
   from warnings import warn
   warn ("Deprecated.  Use timeutils module.")
   return timeutils.reltime(self, startdate, units)
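
The wrapper simply forwards to the module-level function, so new code can call timeutils directly and avoid the DeprecationWarning; a small sketch (taxis stands for any existing pygeode time axis):

from pygeode import timeutils
offsets = timeutils.reltime(taxis, startdate=dict(year=1980, month=1, day=1),
                            units='seconds')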
Example #4
File: timeaxis.py Project: admg26/pygeode
 def reltime (self, startdate=None, units=None):
   from pygeode import timeutils
   from warnings import warn
   warn ("Deprecated.  Use timeutils module.")
   return timeutils.reltime(self, startdate, units)
Example #5
File: multifile.py Project: aerler/pygeode
def open_multi (files, format=None, opener=None, pattern=None, file2date=None, **kwargs):
# {{{
  ''' Returns a :class:`Dataset` containing variables merged across many files.

  Parameters
  ==========
  files : string, list, or tuple
    Either a single filename or a list of filenames. Wildcards are supported, :func:`glob.iglob` is
    used to expand these into an explicit list of files.

  format : string, optional
    String specifying the format of the files to open. If none is given, the
    format is automatically detected from the first filename (see
    :func:`autodetectformat`).

  opener : function, optional
    Function to open individual files. If none is provided, uses the
    format-specific version of :func:`open`. The datasets returned by this
    function are then concatenated and returned. See Notes.

  pattern : string, optional
    A regex pattern to extract date stamps from the filename; used by the
    default ``file2date``. Named groups must be <year>, <month>, <day>, <hour>,
    or <minute>. Abbreviations are available: $Y matches a four-digit year; $m,
    $d, $H, and $M match a two-digit month, day, hour, and minute, respectively.

  file2date : function, optional
    Function which returns a date dictionary given a filename. By default this is produced
    by applying the regex pattern ``pattern`` to the filename.

  sorted : boolean, optional
    If True, the filenames are sorted (by alpha) prior to opening each file, and
    the axes on the returned dataset are sorted by calling :meth:`Dataset.sorted`.

  **kwargs : keyword arguments
    These are passed on to the function ``opener``.

  Returns
  =======
  dataset
    A dataset containing the variables concatenated across all specified files.
    The variable data itself is not loaded into memory. 

  Notes
  =====
  This is intended to provide access to large datasets whose files are
  separated by timestep.  To avoid opening every file individually, the time
  axis is constructed by opening only the first and the last file in the list
  provided. These serve as a template of which variables and times are stored
  in each file - it is assumed that the number of timesteps (and their offsets)
  is the same across the whole dataset. The time axis is then constructed from
  the filenames themselves, using the function ``file2date`` to generate a date
  from each filename. As a result only two files need to be opened, which makes
  this a very efficient way to work with very large datasets.

  However, no explicit check is made of the integrity of the files - if there
  are corrupt or missing data within individual files, this will not become
  clear until that data is actually accessed. Such a check can be done with
  :func:`check_multi`, which attempts to access all of the data and reports
  any problems encountered; this can take a long time, but is a useful
  safeguard (and is more likely to provide helpful error messages).

  The function ``opener`` must take a single positional argument - the filename
  of the file to open - and keyword arguments that are passed through from this
  function. It must return a :class:`Dataset` object with the loaded variables.
  By default the standard :func:`open` is used, but providing a custom opener
  can be useful for any reshaping of the variables that must be done prior to
  concatenating the whole dataset. 

  See Also
  ========
  open
  openall
  '''

  from pygeode.timeaxis import Time, StandardTime
  from pygeode.timeutils import reltime, delta
  from pygeode.dataset import Dataset
  from pygeode.tools import common_dict
  from pygeode.formats import open, autodetectformat
  import numpy as np

  files = expand_file_list(files)
  nfiles = len(files)
  assert nfiles > 0

  if opener is None: 
    if format is None: format = autodetectformat(files[0])

    if not hasattr(format, 'open'): 
      try:
        format = __import__("pygeode.formats.%s" % format, fromlist=["pygeode.formats"])
      except ImportError:
        raise ValueError('Unrecognized format module %s.' % format)

    opener = format.open

  # Apply keyword arguments
  if len(kwargs) > 0:
    old_opener = opener
    opener = lambda f: old_opener (f, **kwargs)


  # Degenerate case: only one file was given
  if nfiles == 1: return opener(files[0])


  # We'll need a function to translate filenames to dates
  # (if we don't have one, use the supplied pattern to make one)
  if file2date is None:
    import re
    assert pattern is not None, "I don't know how to get the dates from the filenames"
    regex = pattern
    regex = regex.replace('$Y', '(?P<year>[0-9]{4})')
    regex = regex.replace('$m', '(?P<month>[0-9]{2})')
    regex = regex.replace('$d', '(?P<day>[0-9]{2})')
    regex = regex.replace('$H', '(?P<hour>[0-9]{2})')
    regex = regex.replace('$M', '(?P<minute>[0-9]{2})')
    regex = re.compile(regex)
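    # Illustration with a hypothetical pattern: 'flow_$Y$m$d\.nc' expands to
    # 'flow_(?P<year>[0-9]{4})(?P<month>[0-9]{2})(?P<day>[0-9]{2})\.nc'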
    def file2date (f):
      d = regex.search(f)
      assert d is not None, "can't use the pattern on the filenames?"
      d = d.groupdict()
      d = dict([k,int(v)] for k,v in d.items())
      # Apply default values (i.e. for minutes, seconds if they're not in the file format?)
      d = dict({'hour':0, 'minute':0,'second':0}, **d)
      return d


  # Get the starting date of each file
  dates = [file2date(f) for f in files]
  dates = dict((k,[d[k] for d in dates]) for k in dates[0].keys())

  # Open a file to get a time axis
  file = opener(files[0])
  T = None
  for v in file.vars:
    if v.hasaxis(Time):
      T = type(v.getaxis(Time))
      break
  if T is None: T = StandardTime
#  T = [v.getaxis(Time) for v in file.vars if v.hasaxis(Time)]
#  T = type(T[0]) if len(T) > 0 else StandardTime
  del file

  # Generate a lower-resolution time axis (the start of *each* file)
  faxis = T(units='days',**dates)

  # Re-sort the files, if they weren't in order
  S = faxis.argsort()
  faxis = faxis.slice[S]
  files = [files[s] for s in S]
  # Re-init the faxis to force the proper start date
  faxis = type(faxis)(units=faxis.units, **faxis.auxarrays)

  # Open the first and last file, so we know what the variables & timesteps are
  first = opener(files[0])
  last  = opener(files[-1])
  names = [v.name for v in first.vars]
  for n in names: assert n in last, "inconsistent vars"
  # Get global attributes
  global_atts = common_dict (first.atts, last.atts)

  #---
  timedict = {None:faxis}
  for v1 in first:
    if not v1.hasaxis(Time): continue
    t1 = v1.getaxis(Time)
    if t1.name in timedict: continue  # already handled this one
    t2 = last[v1.name].getaxis(Time)
    # Construct a full time axis from these pieces

    # One timestep per file? (check for an offset for the var time compared
    #  to the file time)
    if max(len(t1),len(t2)) == 1:
      offset = reltime(t1, startdate=faxis.startdate, units=faxis.units)[0]
      taxis = faxis.withnewvalues(faxis.values + offset)
    # At least one of first/last files has multiple timesteps?
    else:
      assert t1.units == t2.units
      dt = max(delta(t1),delta(t2))
      assert dt > 0
      val1 = t1.values[0]
      val2 = reltime(t2, startdate=t1.startdate)[-1]
      nt = (val2-val1)/dt + 1
      assert round(nt) == nt
      nt = int(round(nt))
      assert nt > 0
      taxis = t1.withnewvalues(np.arange(nt)*dt + val1)
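      # e.g. if the first file starts at day 0, the last file's final step is
      # at day 9, and dt = 1, this reconstructs nt = 10 uniformly spaced steps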

    timedict[t1.name] = taxis

  #---

  # Create the multifile version of the vars
  vars = [Multifile_Var(v1, opener, files, faxis, timedict) for v1 in first]


  return Dataset(vars,atts=global_atts)
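
A hedged usage sketch (the path and filename pattern are hypothetical):

from pygeode.formats.multifile import open_multi
# Merge daily files into one lazily-loaded dataset; data is read only on access.
ds = open_multi('/data/run1/flow_*.nc', pattern=r'flow_$Y$m$d\.nc')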
Example #6
File: multifile.py Project: aerler/pygeode
  def getview (self, view, pbar):
# {{{
    from pygeode.timeaxis import Time
    from pygeode.timeutils import reltime
    import numpy as np
    from warnings import warn

    out = np.empty(view.shape, self.dtype)
    out[()] = float('nan')

    # Get the times
    itime = view.index(Time)
    times = view.subaxis(Time)
    handled_times = np.zeros([len(times)], dtype='bool')

#    print times

    # Map these times to values along the 'file' axis
    x = reltime(times, startdate=self.faxis.startdate, units=self.faxis.units)
    file_indices = np.searchsorted(self.faxis.values, x, side='right') - 1 # -1 because we want the file that has a date *before* the specified timestep

    diff = np.diff(file_indices)

    # Where a new file needs to be loaded
    newfile_pos = list(np.where(diff != 0)[0] + 1)
    newfile_pos = [0] + newfile_pos

    # Loop over each file, git 'er done
    for i,p in enumerate(newfile_pos):
      file_index = file_indices[p]
      try:
        file = self.opener(self.files[file_index])
      except Exception as e:
        raise Exception("Multifile: error encountered with file '%s': %s"%(self.files[file_index], str(e)))
      if self.name not in file:
        raise Exception("Multifile: var '%s' was expected to be in file '%s', but it's not there!"%(self.name, self.files[file_index]))
      var = file[self.name] # abandon all hope, ye who use non-unique variable names
      # How does this var map to the overall time axis?
      if var.hasaxis(Time):
        timechunk = var.getaxis(Time)
      else:
        timechunk = self.faxis.slice[file_index]
        # Remove any vestigial internal time axis
        if var.hasaxis('time'):
          assert len(var.getaxis('time')) == 1, "unresolved time axis.  this should have been caught at init time!"
          var = var.squeeze()
      bigmap, smallmap = times.common_map(timechunk)
      # Check for any funky problems with the map
#      assert len(bigmap) > 0, "?? %s <-> %s"%(times,timechunk)
      if len(bigmap) == 0:
        raise Exception("Multifile: Can't find an entire chunk of data for variable '%s'.  Perhaps a file is missing?"%self.name)

      slices = [slice(None)] * self.naxes
      slices[itime] = bigmap
      newview = view.replace_axis(Time, times, bigmap)
      try:
        data = newview.get(var, pbar=pbar.part(i,len(newfile_pos)))
      except Exception as e:
        raise Exception("Multifile: problem fetching variable '%s' from file '%s': %s"%(self.name, self.files[file_index], str(e)))
      # Stick this data into the output
      out[tuple(slices)] = data  # index with a tuple; list indexing is an error in modern numpy
      handled_times[bigmap] = True

    if not np.all(handled_times):
      raise Exception("Multifile: Can't find some data for variable '%s'.  Perhaps a file is missing?"%self.name)
    return out
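
The searchsorted call above maps each requested timestep to the file whose start date falls at or before it; a standalone illustration with made-up values:

import numpy as np
file_starts = np.array([0., 10., 20.])  # faxis.values: start date of each file
t = np.array([0., 5., 10., 19., 25.])   # requested timesteps
print(np.searchsorted(file_starts, t, side='right') - 1)  # [0 0 1 1 2]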
Example #7
File: cfmeta.py Project: aerler/pygeode
def encode_cf (dataset):
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode.axis import NamedAxis
  from pygeode.var import Var
  from pygeode.timeutils import reltime
  dataset = asdataset(dataset)
  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts
  del dataset

  # Fix the variable names
  for i,v in enumerate(list(varlist)):
    oldname = v.name
    newname = fix_name(oldname)
    if newname != oldname:
      from warnings import warn
      warn ("renaming '%s' to '%s'"%(oldname,newname))
      varlist[i] = v.rename(newname)

  # Fix the axis names
  #TODO

  # Fix the variable metadata
  #TODO

  # Fix the global metadata
  #TODO

  for v in varlist: assert v.name not in axisdict, "'%s' refers to both a variable and an axis"%v.name

  # Metadata based on axis classes
  for name,a in axisdict.items():
    atts = a.atts.copy()
    plotatts = a.plotatts.copy() # passed on to Axis constructor
    
    if isinstance(a,Lat):
      atts['standard_name'] = 'latitude'
      atts['units'] = 'degrees_north'
    if isinstance(a,Lon):
      atts['standard_name'] = 'longitude'
      atts['units'] = 'degrees_east'
    if isinstance(a,Pres):
      atts['standard_name'] = 'air_pressure'
      atts['units'] = 'hPa'
      atts['positive'] = 'down'
    if isinstance(a,Hybrid):
      #TODO: formula_terms (how do we specify LNSP instead of P0?????)
      atts['standard_name'] = 'atmosphere_hybrid_sigma_pressure_coordinate'
    if isinstance(a,Time):
      atts['standard_name'] = 'time'
      #TODO: change the unit depending on the time resolution?
      start = a.startdate
      atts['units'] = '%s since %04i-%02i-%02i %02i:%02i:%02i'% (a.units,
        start.get('year',0), start.get('month',1), start.get('day',1),
        start.get('hour',0), start.get('minute',0), start.get('second',0)
      )
    if isinstance(a,StandardTime): atts['calendar'] = 'standard'
    if isinstance(a,ModelTime365): atts['calendar'] = '365_day'
    if isinstance(a,ModelTime360): atts['calendar'] = '360_day'
    if isinstance(a,Yearless): atts['calendar'] = 'none'

    if isinstance(a,XAxis): atts['axis'] = 'X'
    if isinstance(a,YAxis): atts['axis'] = 'Y'
    if isinstance(a,ZAxis): atts['axis'] = 'Z'
    if isinstance(a,TAxis): atts['axis'] = 'T'

    # Change the time axis to be relative to a start date
    #TODO: check 'units' attribute of the time axis, use that in the 'units' of the netcdf metadata
    if isinstance(a, Time):
      #TODO: cast into an integer array if possible
      axisdict[name] = NamedAxis(values=reltime(a), name=name, atts=atts, plotatts=plotatts)
      continue

    # Add associated arrays as new variables
    auxarrays = a.auxarrays
    for aux,values in auxarrays.items():
      auxname = name+'_'+aux
      assert not any(v.name == auxname for v in varlist), "already have a variable named %s"%auxname
      varlist.append( Var([a], values=values, name=auxname) )
    if len(auxarrays) > 0:
      atts['ancillary_variables'] = ' '.join(name+'_'+aux for aux in auxarrays.keys())

    # Create new, generic axes with the desired attributes
    # (Replaces the existing entry in the dictionary)
    axisdict[name] = NamedAxis(values=a.values, name=name, atts=atts, plotatts=plotatts)

  # Apply these new axes to the variables
  for i,oldvar in enumerate(list(varlist)):
    name = oldvar.name
    try:
      #TODO: use Var.replace_axes instead?
      varlist[i] = var_newaxes(oldvar, [axisdict[a.name] for a in oldvar.axes], atts=oldvar.atts, plotatts=oldvar.plotatts)
    except KeyError:
      print('??', a.name, axisdict)
      raise

  dataset = Dataset(varlist, atts=global_atts)
  return dataset
Example #8
File: cfmeta.py Project: neishm/pygeode
def encode_cf (dataset):
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis, NonCoordinateAxis, Station
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode.axis import NamedAxis, DummyAxis
  from pygeode.var import Var
  from pygeode.timeutils import reltime
  from copy import copy
  dataset = asdataset(dataset)
  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts.copy()
  del dataset

  # Fix the variable names
  for i,v in enumerate(list(varlist)):
    oldname = v.name
    newname = fix_name(oldname)
    if newname != oldname:
      from warnings import warn
      warn ("renaming '%s' to '%s'"%(oldname,newname))
      varlist[i] = v.rename(newname)

  # Fix the axis names
  #TODO

  # Fix the variable metadata
  #TODO

  # Fix the global metadata
  # Specify the conventions we're (supposedly) using
  global_atts['Conventions'] = "CF-1.0"

  for v in varlist: assert v.name not in axisdict, "'%s' refers to both a variable and an axis"%v.name

  # Metadata based on axis classes
  for name,a in list(axisdict.items()):
    atts = a.atts.copy()
    plotatts = a.plotatts.copy() # passed on to Axis constructor
    
    if isinstance(a,Lat):
      atts['standard_name'] = 'latitude'
      atts['units'] = 'degrees_north'
    if isinstance(a,Lon):
      atts['standard_name'] = 'longitude'
      atts['units'] = 'degrees_east'
    if isinstance(a,Pres):
      atts['standard_name'] = 'air_pressure'
      atts['units'] = 'hPa'
      atts['positive'] = 'down'
    if isinstance(a,Hybrid):
      #TODO: formula_terms (how do we specify LNSP instead of P0?????)
      atts['standard_name'] = 'atmosphere_hybrid_sigma_pressure_coordinate'
    if isinstance(a,Time):
      atts['standard_name'] = 'time'
      #TODO: change the unit depending on the time resolution?
      start = a.startdate
      atts['units'] = '%s since %04i-%02i-%02i %02i:%02i:%02i'% (a.units,
        start.get('year',0), start.get('month',1), start.get('day',1),
        start.get('hour',0), start.get('minute',0), start.get('second',0)
      )
    if isinstance(a,StandardTime): atts['calendar'] = 'standard'
    if isinstance(a,ModelTime365): atts['calendar'] = '365_day'
    if isinstance(a,ModelTime360): atts['calendar'] = '360_day'
    if isinstance(a,Yearless): atts['calendar'] = 'none'

    if isinstance(a,XAxis): atts['axis'] = 'X'
    if isinstance(a,YAxis): atts['axis'] = 'Y'
    if isinstance(a,ZAxis): atts['axis'] = 'Z'
    if isinstance(a,TAxis): atts['axis'] = 'T'

    # Change the time axis to be relative to a start date
    #TODO: check 'units' attribute of the time axis, use that in the 'units' of the netcdf metadata
    if isinstance(a, Time):
      #TODO: cast into an integer array if possible
      axisdict[name] = NamedAxis(values=reltime(a), name=name, atts=atts, plotatts=plotatts)
      continue

    # Encode non-coordinate axes, including station (timeseries) data.
    # Loosely follow http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#_orthogonal_multidimensional_array_representation_of_time_series
    # Move station lat/lon/name data into separate variables.
    if isinstance(a, NonCoordinateAxis):

      # Keep track of extra variables created from auxarray data.
      extra_vars = []

      # Detect certain arrays that should be treated as "coordinates".
      coordinates = []

      # Encode station latitude.
      if 'lat' in a.auxarrays:
        lat = a.auxasvar('lat')
        lat.atts = dict(standard_name="latitude", long_name=a.name+" latitude", units="degrees_north")
        extra_vars.append(lat)
        coordinates.append('lat')
      # Encode station longitude.
      if 'lon' in a.auxarrays:
        lon = a.auxasvar('lon')
        lon.atts = dict(standard_name="longitude", long_name=a.name+" longitude", units="degrees_east")
        extra_vars.append(lon)
        coordinates.append('lon')

      coordinates = " ".join(coordinates)

      # Encode other auxarrays as generic "ancillary" arrays.
      ancillary_variables = []
      for auxname in list(a.auxarrays.keys()):
        if auxname in coordinates: continue  # Handled above
        var = a.auxasvar(auxname)
        if var.dtype.name.startswith('str'):
          var = encode_string_var(var)
        # Some extra CF encoding for the station name, to use it as the unique identifier.
        if auxname == 'station':
          var.atts = dict(cf_role = "timeseries_id")
        extra_vars.append(var)
        ancillary_variables.append(auxname)

      ancillary_variables = " ".join(ancillary_variables)

      # Attach these coordinates to all variables that use this axis.
      #TODO: cleaner way of adding this information without having to do a shallow copy.
      for i,var in enumerate(varlist):
        if var.hasaxis(a):
          var = copy(var)
          var.atts = copy(var.atts)
          if len(coordinates) > 0:
            var.atts['coordinates'] = coordinates
          if len(ancillary_variables) > 0:
            var.atts['ancillary_variables'] = ancillary_variables
          varlist[i] = var

      # Add these coordinates / ancillary variables to the output.
      varlist.extend(extra_vars)

      # The values in the axis itself are meaningless, so mark them as such
      axisdict[name] = DummyAxis(len(a),name=name)

      # Special case: Station (timeseries) data.
      if isinstance(a, Station):
        global_atts['featureType'] = "timeSeries"
      # Nothing more to do for this axis type
      continue

    # Encode custom axes from add-ons
    for n,c in list(custom_axes.items()):
      if isinstance(a,c):
        atts['standard_name'] = n

    # Add associated arrays as new variables
    auxarrays = a.auxarrays
    for aux,values in auxarrays.items():
      auxname = name+'_'+aux
      assert not any(v.name == auxname for v in varlist), "already have a variable named %s"%auxname
      varlist.append( Var([a], values=values, name=auxname) )
    if len(auxarrays) > 0:
      atts['ancillary_variables'] = ' '.join(name+'_'+aux for aux in auxarrays.keys())

    # Create new, generic axes with the desired attributes
    # (Replaces the existing entry in the dictionary)
    axisdict[name] = NamedAxis(values=a.values, name=name, atts=atts, plotatts=plotatts)

  # Apply these new axes to the variables
  for i,oldvar in enumerate(list(varlist)):
    name = oldvar.name
    try:
      #TODO: use Var.replace_axes instead?
      varlist[i] = var_newaxes(oldvar, [axisdict.get(a.name,a) for a in oldvar.axes], atts=oldvar.atts, plotatts=oldvar.plotatts)
    except KeyError:
      print('??', a.name, axisdict)
      raise

  dataset = Dataset(varlist, atts=global_atts)
  return dataset
Example #9
def open_multi (files, format=None, opener=None, pattern=None, file2date=None, **kwargs):
# {{{
  ''' Returns a :class:`Dataset` containing variables merged across many files.

  Parameters
  ==========
  files : string, list, or tuple
    Either a single filename or a list of filenames. Wildcards are supported, :func:`glob.iglob` is
    used to expand these into an explicit list of files.

  format : string, optional
    String specifying the format of the files to open. If none is given, the
    format is automatically detected from the first filename (see
    :func:`autodetectformat`).

  opener : function, optional
    Function to open individual files. If none is provided, uses the
    format-specific version of :func:`open`. The datasets returned by this
    function are then concatenated and returned. See Notes.

  pattern : string, optional
    A regex pattern to extract date stamps from the filename; used by the
    default ``file2date``. Named groups must be <year>, <month>, <day>, <hour>,
    or <minute>. Abbreviations are available: $Y matches a four-digit year; $m,
    $d, $H, and $M match a two-digit month, day, hour, and minute, respectively.

  file2date : function, optional
    Function which returns a date dictionary given a filename. By default this is produced
    by applying the regex pattern ``pattern`` to the filename.

  sorted : boolean, optional
    If True, the filenames are sorted (by alpha) prior to opening each file, and
    the axes on the returned dataset are sorted by calling :meth:`Dataset.sorted`.

  **kwargs : keyword arguments
    These are passed on to the function ``opener``.

  Returns
  =======
  dataset
    A dataset containing the variables concatenated across all specified files.
    The variable data itself is not loaded into memory. 

  Notes
  =====
  This is intended to provide access to large datasets whose files are
  separated by timestep.  To avoid opening every file individually, the time
  axis is constructed by opening only the first and the last file in the list
  provided. These serve as a template of which variables and times are stored
  in each file - it is assumed that the number of timesteps (and their offsets)
  is the same across the whole dataset. The time axis is then constructed from
  the filenames themselves, using the function ``file2date`` to generate a date
  from each filename. As a result only two files need to be opened, which makes
  this a very efficient way to work with very large datasets.

  However, no explicit check is made of the integrity of the files - if there
  are corrupt or missing data within individual files, this will not become
  clear until that data is actually accessed. Such a check can be done with
  :func:`check_multi`, which attempts to access all of the data and reports
  any problems encountered; this can take a long time, but is a useful
  safeguard (and is more likely to provide helpful error messages).

  The function ``opener`` must take a single positional argument - the filename
  of the file to open - and keyword arguments that are passed through from this
  function. It must return a :class:`Dataset` object with the loaded variables.
  By default the standard :func:`open` is used, but providing a custom opener
  can be useful for any reshaping of the variables that must be done prior to
  concatenating the whole dataset. 

  See Also
  ========
  open
  openall
  '''

  from pygeode.timeaxis import Time, StandardTime
  from pygeode.timeutils import reltime, delta
  from pygeode.dataset import Dataset
  from pygeode.tools import common_dict
  from pygeode.formats import open, autodetectformat
  import numpy as np

  files = expand_file_list(files)
  nfiles = len(files)
  assert nfiles > 0

  if opener is None: 
    if format is None: format = autodetectformat(files[0])

    if not hasattr(format, 'open'): 
      try:
        format = __import__("pygeode.formats.%s" % format, fromlist=["pygeode.formats"])
      except ImportError:
        raise ValueError('Unrecognized format module %s.' % format)

    opener = format.open

  # Apply keyword arguments
  if len(kwargs) > 0:
    old_opener = opener
    opener = lambda f: old_opener (f, **kwargs)


  # Degenerate case: only one file was given
  if nfiles == 1: return opener(files[0])


  # We'll need a function to translate filenames to dates
  # (if we don't have one, use the supplied pattern to make one)
  if file2date is None:
    import re
    assert pattern is not None, "I don't know how to get the dates from the filenames"
    regex = pattern
    regex = regex.replace('$Y', '(?P<year>[0-9]{4})')
    regex = regex.replace('$m', '(?P<month>[0-9]{2})')
    regex = regex.replace('$d', '(?P<day>[0-9]{2})')
    regex = regex.replace('$H', '(?P<hour>[0-9]{2})')
    regex = regex.replace('$M', '(?P<minute>[0-9]{2})')
    regex = re.compile(regex)
    def file2date (f):
      d = regex.search(f)
      assert d is not None, "can't use the pattern on the filenames?"
      d = d.groupdict()
      d = dict([k,int(v)] for k,v in d.items() if v is not None)
      # Apply default values (i.e. for minutes, seconds if they're not in the file format?)
      d = dict({'hour':0, 'minute':0,'second':0}, **d)
      return d


  # Get the starting date of each file
  dates = [file2date(f) for f in files]
  dates = dict((k,[d[k] for d in dates]) for k in list(dates[0].keys()))

  # Open a file to get a time axis
  file = opener(files[0])
  T = None
  for v in file.vars:
    if v.hasaxis(Time):
      T = type(v.getaxis(Time))
      break
  if T is None: T = StandardTime
#  T = [v.getaxis(Time) for v in file.vars if v.hasaxis(Time)]
#  T = type(T[0]) if len(T) > 0 else StandardTime
  del file

  # Generate a lower-resolution time axis (the start of *each* file)
  faxis = T(units='days',**dates)

  # Re-sort the files, if they weren't in order
  S = faxis.argsort()
  faxis = faxis.slice[S]
  files = [files[s] for s in S]
  # Re-init the faxis to force the proper start date
  faxis = type(faxis)(units=faxis.units, **faxis.auxarrays)

  # Open the first and last file, so we know what the variables & timesteps are
  first = opener(files[0])
  last  = opener(files[-1])
  names = [v.name for v in first.vars]
  for n in names: assert n in last, "inconsistent vars"
  # Get global attributes
  global_atts = common_dict (first.atts, last.atts)

  #---
  timedict = {None:faxis}
  for v1 in first:
    if not v1.hasaxis(Time): continue
    t1 = v1.getaxis(Time)
    if t1.name in timedict: continue  # already handled this one
    t2 = last[v1.name].getaxis(Time)
    # Construct a full time axis from these pieces

    # One timestep per file? (check for an offset for the var time compared
    #  to the file time)
    if max(len(t1),len(t2)) == 1:
      offset = reltime(t1, startdate=faxis.startdate, units=faxis.units)[0]
      taxis = faxis.withnewvalues(faxis.values + offset)
    # At least one of first/last files has multiple timesteps?
    else:
      assert t1.units == t2.units
      dt = max(delta(t1),delta(t2))
      assert dt > 0
      val1 = t1.values[0]
      val2 = reltime(t2, startdate=t1.startdate)[-1]
      nt = (val2-val1)/dt + 1
      assert round(nt) == nt
      nt = int(round(nt))
      assert nt > 0
      taxis = t1.withnewvalues(np.arange(nt)*dt + val1)

    timedict[t1.name] = taxis

  #---

  # Create the multifile version of the vars
  vars = [Multifile_Var(v1, opener, files, faxis, timedict) for v1 in first]


  return Dataset(vars,atts=global_atts)
Example #10
def check_multi (*args, **kwargs):
  ''' Validates the files for completeness and consistency with the assumptions
      made by pygeode.formats.multifile.open_multi.
  '''
  from pygeode.timeutils import reltime
  import numpy as np
  # First, query open_multi to find out what we *expect* to see in all the files
  full_dataset = open_multi (*args, **kwargs)
  # Dig into this object, to find the list of files and the file opener.
  # (this will break if open_multi or Multifile_Var are ever changed!)
  sample_var = full_dataset.vars[0]
  assert isinstance(sample_var,Multifile_Var)
  files = sample_var.files
  faxis = sample_var.faxis
  opener = sample_var.opener
  full_taxis = sample_var.getaxis('time')
  del sample_var
  # Helper method - associate a time axis value with a particular file.
  def find_file (t):
    i = np.searchsorted(faxis.values, t, side='right') - 1
    if i == -1:  return '(some missing file?)'
    return files[i]
  # Similar to above, but return all files that should cover all given timesteps.
  def find_files (t_array):
    return sorted(set(map(find_file,t_array)))
  # Loop over each file, and check the contents.
  all_ok = True
  all_expected_times = set(full_taxis.values)

  # Check for uniformity in the data, and report any potential holes.
  dt = np.diff(full_taxis.values)
  expected_dt = min(dt[dt > 0])
  gaps = full_taxis.values[np.where(dt > expected_dt)]
  if len(gaps) > 0:
    print("ERROR: detected gaps on or after file(s):")
    for filename in find_files(gaps):
      print(filename)
    print("There may be missing files near those files.")
    all_ok = False

  covered_times = set()
  for i,filename in enumerate(files):
    print("Scanning "+filename)
    try:
      current_file = opener(filename)
    except Exception as e:
      print("  ERROR: Can't even open the file.  Reason: %s"%str(e))
      all_ok = False
      continue
    for var in current_file:
      if var.name not in full_dataset:
        print("  ERROR: unexpected variable '%s'"%var.name)
        all_ok = False
        continue
    for var in full_dataset:
      if var.name not in current_file:
        print("  ERROR: missing variable '%s'"%var.name)
        all_ok = False
        continue
      try:
        source_data = current_file[var.name].get().flatten()
      except Exception as e:
        print("  ERROR: unable to read source variable '%s'.  Reason: %s"%(var.name, str(e)))
        all_ok = False
        continue
      try:
        file_taxis = current_file[var.name].getaxis('time')
        times = reltime(file_taxis, startdate=full_taxis.startdate, units=full_taxis.units)
        multifile_data = var(l_time=list(times)).get().flatten()
      except Exception as e:
        print("  ERROR: unable to read multifile variable '%s'.  Reason: %s"%(var.name, str(e)))
        all_ok = False
        continue
      if len(source_data) != len(multifile_data):
        print("  ERROR: size mismatch for variable '%s'"%var.name)
        all_ok = False
        continue
      source_mask = ~np.isfinite(source_data)
      multifile_mask = ~np.isfinite(multifile_data)
      if not np.all(source_mask == multifile_mask):
        print("  ERROR: different missing value masks found in multifile vs. direct access for '%s'"%var.name)
        all_ok = False
        continue
      source_data = np.ma.masked_array(source_data, mask=source_mask)
      multifile_data = np.ma.masked_array(multifile_data, mask=multifile_mask)
      if not np.all(source_data == multifile_data):
        print("  ERROR: get different data from multifile vs. direct access for '%s'"%var.name)
        all_ok = False
        continue

    covered_times.update(times)
    if i < len(files)-1 and np.any(times >= faxis[i+1]):
      print("  ERROR: found timesteps beyond the expected range of this file.")
      all_ok = False
    if np.any(times < faxis[i]):
      print("  ERROR: found timestep(s) earlier than the expected start of this file.")
      all_ok = False

  missing_times = all_expected_times - covered_times
  if len(missing_times) > 0:
    print("ERROR: did not get full time coverage.  Missing some timesteps for file(s):")
    for filename in find_files(missing_times):
      print(filename)
    all_ok = False
  extra_times = covered_times - all_expected_times
  if len(extra_times) > 0:
    print("ERROR: found extra (unexpected) timesteps in the following file(s):")
    for filename in find_files(extra_times):
      print(filename)
    all_ok = False

  if all_ok:
    print("Scan completed without any errors.")
  else:
    print("One or more errors occurred while scanning the files.")