Example #1
 def __call__(self,
              lparallel=False,
              NP=None,
              inner_list=None,
              outer_list=None,
              callback=None,
              **kwargs):
     ''' this method is called instead of a class or instance method; it applies the arguments 
     'kwargs' to each ensemble member; it also supports argument expansion with inner and 
     outer product (prior to application to ensemble) and parallelization using multiprocessing '''
     # expand kwargs to ensemble list
     kwargs_list = expandArgumentList(inner_list=inner_list,
                                      outer_list=outer_list,
                                      **kwargs)
     if len(kwargs_list) == 1:
         kwargs_list = kwargs_list * len(self.klass.members)
     elif len(kwargs_list) != len(self.klass.members):
         raise ArgumentError(
             'Length of expanded argument list does not match ensemble size! {} ~= {}'
             .format(len(kwargs_list), len(self.klass.members)))
     # loop over ensemble members and execute function
     if lparallel:
         # parallelize method execution using multiprocessing
         pool = multiprocessing.Pool(processes=NP)  # initialize worker pool
         if callback is not None and not callable(callback):
             raise TypeError(callback)
         # N.B.: the callback function is passed a result from the apply_method function,
         #       which returns a tuple of the form (member, exit_code)
         # define work loads (function and its arguments) and start tasks
         results = [
             pool.apply_async(apply_method, (member, self.attr),
                              kwargs,
                              callback=callback)
             for member, kwargs in zip(self.klass.members, kwargs_list)
         ]
         # N.B.: Beware Pickling!!!
         pool.close()
         pool.join()  # wait to finish
         # retrieve and assemble results
         results = [result.get() for result in results]
         # divide members and results (apply_method returns both, in case members were modified)
         self.klass.members = [result[0] for result in results]
         results = [result[1] for result in results]
     else:
         # get instance methods
         methods = [
             getattr(member, self.attr) for member in self.klass.members
         ]
         # just apply sequentially
         results = [
             method(**kwargs)
             for method, kwargs in zip(methods, kwargs_list)
         ]
     if len(results) != len(self.klass.members):
         raise ArgumentError(
             'Length of results list does not match ensemble size! {} ~= {}'
             .format(len(results), len(self.klass.members)))
     return tuple(results)
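A minimal, self-contained sketch of the same expand-and-apply pattern (hypothetical Member class and apply_method helper, not the ones referenced above): each member is paired with its own kwargs dict, submitted via apply_async, and member and result are returned together so that modified members can be reassigned.

import multiprocessing

class Member(object):
    ''' hypothetical ensemble member with a single method '''
    def __init__(self, value): self.value = value
    def scale(self, factor=1): return self.value * factor

def apply_method(member, attr, **kwargs):
    ''' call the named method on the member and return (member, result) '''
    return member, getattr(member, attr)(**kwargs)

if __name__ == '__main__':
    members = [Member(v) for v in (1, 2, 3)]
    kwargs_list = [dict(factor=f) for f in (10, 20, 30)]  # one kwargs dict per member
    pool = multiprocessing.Pool(processes=2)
    tasks = [pool.apply_async(apply_method, (m, 'scale'), kw)
             for m, kw in zip(members, kwargs_list)]
    pool.close(); pool.join()  # wait for workers to finish
    results = [task.get() for task in tasks]
    members = [r[0] for r in results]  # possibly modified members
    values = [r[1] for r in results]   # method return values
    print(values)  # [10, 40, 90]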
Example #2
def timeAxis(start_date=None,
             end_date=None,
             sampling=None,
             date_range=None,
             time_axis=None,
             llastIncl=True,
             ntime=None,
             varatts=None):
    ''' figure out type and dimensions of time axis '''
    # check time input
    if date_range: start_date, end_date, sampling = date_range
    if start_date and end_date and sampling:
        start_year, start_month, start_day = convertDate(start_date)
        start_datetime = np.datetime64(
            dt.datetime(year=start_year, month=start_month, day=start_day),
            sampling)
        end_year, end_month, end_day = convertDate(end_date)
        end_datetime = np.datetime64(
            dt.datetime(year=end_year, month=end_month, day=end_day), sampling)
        if llastIncl: end_datetime += np.timedelta64(1, sampling)
        date_range = np.arange(start_datetime,
                               end_datetime,
                               dtype='datetime64[{}]'.format(sampling))
        assert date_range[0] == start_datetime, date_range[0]
        if ntime:
            if ntime > len(date_range):
                raise ArgumentError(date_range)
            else:
                # trim
                date_range = date_range[0:ntime]
        else:
            ntime = len(date_range)
    elif time_axis == 'datetime':
        raise ArgumentError('Insufficient time axis information!')
    # construct time axis
    atts = varatts['time']
    if time_axis.lower() == 'simple':
        time = Axis(atts=atts, coord=np.arange(1, ntime + 1))
    elif time_axis.lower() == 'datetime':
        if sampling.lower() in ('y', '1y'): units = 'year'
        elif sampling.lower() in ('m', '1m'): units = 'month'
        elif sampling.lower() in ('d', '1d'): units = 'day'
        elif sampling.lower() in ('h', '1h'): units = 'hour'
        else: units = sampling
        long_name = '{}s since {}'.format(units.title(), str(date_range[0]))  # hope this makes sense...
        atts.update(long_name=long_name, units=units)
        time = Axis(atts=atts, coord=date_range)
    else:
        raise ArgumentError(time_axis)
    # return time axis
    return time
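For reference, a small standalone sketch of the datetime64 arithmetic used above to build the time coordinate (monthly sampling, with the end date included as in llastIncl):

import datetime as dt
import numpy as np

sampling = 'M'  # numpy datetime64 sampling code for months
start_datetime = np.datetime64(dt.datetime(year=2000, month=1, day=1), sampling)
end_datetime = np.datetime64(dt.datetime(year=2000, month=12, day=1), sampling)
end_datetime += np.timedelta64(1, sampling)  # include the last month
date_range = np.arange(start_datetime, end_datetime, dtype='datetime64[{}]'.format(sampling))
print(len(date_range), date_range[0], date_range[-1])  # 12 2000-01 2000-12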
Example #3
def detrend(var,
            ax=None,
            lcopy=True,
            ldetrend=True,
            ltrend=False,
            degree=1,
            rcond=None,
            w=None,
            lsmooth=False,
            lresidual=False,
            window_len=11,
            window='hanning'):
    ''' subtract a linear trend from a time-series array (operation is in-place) '''
    # check input
    if not isinstance(var, np.ndarray):
        raise NotImplementedError  # too many checks
    if lcopy: var = var.copy()  # make copy - not in-place!
    # fit over entire array (usually not what we want...)
    if ax is None and ldetrend:
        ax = np.arange(var.size)  # make dummy axis, if necessary
    if var.ndim != 1:
        shape = var.shape
        var = var.ravel()  # flatten array, if necessary
    else:
        shape = None
    # apply optional detrending
    if ldetrend or ltrend:
        # fit linear trend
        trend = np.polyfit(ax,
                           var,
                           deg=degree,
                           rcond=rcond,
                           w=w,
                           full=False,
                           cov=False)
        # evaluate and subtract linear trend
        if ldetrend and ltrend:
            raise ArgumentError(
                "Can either return trend/polyfit or residuals, not both.")
        elif ldetrend and not ltrend:
            var -= np.polyval(trend, ax)  # residuals
        elif ltrend and not ldetrend:
            var = np.polyval(trend, ax)  # evaluated trend (not residuals)
    # apply optional smoothing
    if lsmooth and lresidual:
        raise ArgumentError(
            "Can either return smoothed array or residuals, not both.")
    elif lsmooth:
        var = smooth(var, window_len=window_len, window=window)
    elif lresidual:
        var -= smooth(var, window_len=window_len, window=window)
    # return detrended and/or smoothed time-series
    if shape is not None: var = var.reshape(shape)
    return var
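A brief standalone illustration of the detrending step (NumPy only, synthetic data):

import numpy as np

ax = np.arange(100, dtype=float)
var = 0.5 * ax + np.random.randn(100)    # linear trend plus noise
trend = np.polyfit(ax, var, deg=1)       # fitted coefficients (slope, intercept)
residuals = var - np.polyval(trend, ax)  # what ldetrend=True computes
fitted = np.polyval(trend, ax)           # what ltrend=True returns instead
print(round(trend[0], 2), round(residuals.mean(), 2))  # slope ~0.5, mean residual ~0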
Example #4
 def getTimeseriesData(self, units='kg/s', lcheck=True, lexpand=True, lfill=True, period=None, lflatten=True):
   ''' extract time series data and time coordinates from a WSC monthly CSV file '''
   if self.monthly_file:
     # use numpy's CSV functionality
     # get timeseries data
     data = np.genfromtxt(self.monthly_file, dtype=np.float32, delimiter=',', skip_header=1, filling_values=np.nan,  
                          usecols=np.arange(4,28,2), usemask=True, loose=True, invalid_raise=True)
     assert data.shape[1] == 12, data.shape
     # for some reason every value is followed by an extra comma...
     #data = np.ma.masked_less(data, 10) # remove some invalid values
     # N.B.: some values appear unrealistically small, however, these are removed in the check-
      #       section below (it appears they consistently fail the check test)
     if units.lower() == 'kg/s': data *= 1000. # m^3 == 1000 kg (water)
     elif units.lower() == 'm^3/s': pass # original units
     else: raise ArgumentError("Unknown units: {}".format(units))
     # get time coordinates and verification flag
     check = np.genfromtxt(self.monthly_file, dtype=np.int, delimiter=',', skip_header=1, filling_values=-9999,  
                          usecols=np.arange(1,4,1), usemask=True, loose=True, invalid_raise=True)
     assert check.shape[0] == data.shape[0], check.shape
     assert np.all(check >= 0), np.sum(check < 0)
     time = check[:,2].astype(np.int) # this is the year (time coordinate)
     # determine valid entries
     if lcheck:
       check = np.all(check[:,:2]==1, axis=1) # require all entries to be one
       # N.B.: I'm not sure what it means if values are not equal to one, but the flow values look 
       #       unrealistically small (see above); probably different units...
       data = data[check,:]; time = time[check]
       assert time.shape[0] == data.shape[0], check.shape
     # slice off values outside the period of interest
     if period:
       valid = np.logical_and(time >= period[0],time < period[1])
       time = time[valid]; data = data[valid]
     # fill in missing time periods/years
     if lfill:
       if period: time0 = period[0]; time1 = period[1]
       else: time0 = time[0]; time1 = time[-1]+1
       idx = np.asarray(time - time0, dtype=np.int32); tlen = time1 - time0 # start at 0; length is last value (+1)
       pad_time = np.arange(time0,time1) # form continuous sequence
       #assert np.all( pad_time[idx] == time ), idx # potentially expensive
       time = pad_time # new continuous time coordinate
       pad_data = np.ma.zeros((tlen,12), dtype=np.float32)*np.NaN # pre-allocate with NaN
       pad_data.mask = True # mask everywhere for now
       pad_data[idx,:] = data; #pad_data.mask[idx,:] = data.mask
       #assert np.all( pad_data.mask[idx,:] == data.mask ) # potentially expensive
       data = pad_data
     # now, expand time coordinate by adding month
     if lexpand:
       time = time.reshape((time.size,1))
       coord = np.repeat((time-1979)*12, 12, axis=1) + np.arange(0,12).reshape((1,12))
       assert coord.shape == data.shape, coord.shape
       #assert np.all( np.diff(coord.flatten()) == 1 ), coord  # potentially expensive
       time = coord
     if lflatten:
       time = time.flatten(); data = data.flatten()
     # return data array and coordinate vector
     return data, time
   else:
     raise IOError("No timeseries file defined or file not found for gage station '{}'.\n(folder: '{}')".format(self.name,self.folder))
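The gap-filling step (lfill) in isolation, sketched with a synthetic record of annual rows (np.ma.masked_all is used here instead of the zeros*NaN construct above; the effect is the same):

import numpy as np

time = np.array([1981, 1983, 1986])            # years with data (gaps in between)
data = np.ma.ones((3, 12), dtype=np.float32)   # one row of 12 monthly values per year
time0, time1 = time[0], time[-1] + 1
idx = (time - time0).astype(np.int32)          # row index of each available year
pad_time = np.arange(time0, time1)             # continuous year coordinate
pad_data = np.ma.masked_all((time1 - time0, 12), dtype=np.float32)  # all-masked template
pad_data[idx, :] = data                        # insert available years; gaps stay masked
print(pad_time)                   # [1981 1982 1983 1984 1985 1986]
print(pad_data.mask.all(axis=1))  # [False  True False  True  True False]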
Example #5
 def __init__(self, project=None, filetype='aux', folder=None, bc_method=None, **expargs):
   ''' take arguments that have been passed from caller and initialize parameters '''
   if bc_method:
     if not filetype: filetype = bc_method.lower()
     elif filetype != bc_method.lower():
         raise ArgumentError(filetype, bc_method)
   self.bc_method = bc_method
   self.filetype = filetype; self.folder_pattern = folder    
   self.export_arguments = expargs
Example #6
def binedges(bins=None, binedgs=None, limits=None, lcheckVar=True):
    ''' utility function to generate and validate bins and binedges from either one '''
    # check input
    if bins is None and binedgs is None: raise ArgumentError
    elif bins is not None and binedgs is not None:
        if len(bins) + 1 != len(binedgs): raise ArgumentError(len(bins))
    if bins is not None:
        if limits is not None: vmin, vmax = limits
        else: raise ArgumentError(bins)
        # expand bins (values refer to center of bins)
        if isinstance(bins, (int, np.integer)):
            if bins == 1: bins = np.asarray(((vmin + vmax) / 2., ))
            else: bins = np.linspace(vmin, vmax, bins)
        elif isinstance(bins, (tuple, list)) and 0 < len(bins) < 4:
            bins = np.linspace(*bins)
        elif not isinstance(bins, (list, np.ndarray)):
            raise TypeError(bins)
        if len(bins) == 1:
            tmpbinedgs = np.asarray((vmin, vmax))
        else:
            hbd = np.diff(bins) / 2.  # make sure this is a float!
            tmpbinedgs = np.hstack(
                (bins[0] - hbd[0], bins[1:] - hbd,
                 bins[-1] + hbd[-1]))  # assuming even spacing
        if binedgs is None: binedgs = tmpbinedgs  # computed from bins
        elif lcheckVar:
            assert isEqual(binedgs, np.asarray(tmpbinedgs,
                                               dtype=binedgs.dtype))
    if binedgs is not None:
        # expand bin edges
        if isinstance(binedgs, (tuple, list)):
            binedgs = np.asarray(binedgs)
        elif not isinstance(binedgs, np.ndarray):
            raise TypeError(binedgs)
        tmpbins = binedgs[1:] - (np.diff(binedgs) / 2.)  # make sure this is a float!
        if bins is None: bins = tmpbins  # compute from binedgs
        elif lcheckVar:
            assert isEqual(bins, np.asarray(tmpbins, dtype=bins.dtype))
    # return bins and binedges
    return bins, binedgs
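A standalone sketch of the center/edge conversion performed above (assumes evenly spaced bin centers):

import numpy as np

bins = np.linspace(0., 10., 6)                 # bin centers: 0, 2, 4, 6, 8, 10
hbd = np.diff(bins) / 2.                       # half distances between centers
binedgs = np.hstack((bins[0] - hbd[0], bins[1:] - hbd, bins[-1] + hbd[-1]))
print(binedgs)                                 # [-1.  1.  3.  5.  7.  9. 11.]
centers = binedgs[1:] - np.diff(binedgs) / 2.  # recover centers from edges
print(np.allclose(centers, bins))              # True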
Example #7
def loadMetadata(well,
                 filename='metadata.dbf',
                 wellname='W{WELL_ID:07d}-{WELL_NO:1d}',
                 llistWells=False,
                 folder=None,
                 conservation_authority=None):
    if not folder and conservation_authority:
        folder = ca_folder.format(conservation_authority)
    # clean up well name
    well_id, well_no = getWellName(well)
    well = wellname.format(WELL_ID=well_id, WELL_NO=well_no)
    # open database and get relevant entry
    #from simpledbf import Dbf5
    from dbfread import DBF
    filepath = filename if folder is None else os.path.join(folder, filename)
    table = DBF(filepath)
    meta = None
    for record in table:
        if llistWells: print((record['PGMN_WELL']))
        if record['PGMN_WELL'] == well:
            meta = record.copy()
    if meta is None:
        raise ArgumentError(well)
    # parse screen information
    screen_type, screen_depth = meta['SCREEN_HOL'].split(':')
    meta['Screen'] = screen_type.title()
    screen_hilo = []
    lunit = False
    for hilo in screen_depth.split('-'):
        if hilo[-1] == 'M':
            lunit = True
            screen_hilo.append(float(hilo[:-1]))
        else:
            screen_hilo.append(float(hilo))
    if not lunit: raise ValueError(screen_depth)
    assert len(screen_hilo) == 2, screen_hilo
    meta['screen_top'] = screen_hilo[0]
    meta['screen_bottom'] = screen_hilo[1]
    meta['screen_depth'] = (screen_hilo[0] + screen_hilo[1]) / 2.
    meta['zs'] = float(meta['ELVA_GROUN'])
    meta['z'] = meta['zs'] - meta['screen_depth']
    meta['z_t'] = meta['zs'] - meta['screen_top']
    meta['z_b'] = meta['zs'] - meta['screen_bottom']
    # return meta data dictionary
    return meta
Example #8
 def __init__(self, basin=None, river=None, name=None, folder=None, lcheck=False):
   ''' initialize gage station based on various input data '''
   if name is None: raise ArgumentError()
   if folder is None: folder = '{:s}/Basins/{:s}/'.format(root_folder,basin)
    if not os.path.isdir(folder): raise IOError(folder)
    if river is not None and river not in name: name = '{:s}_{:s}'.format(river,name)
   self.folder = folder # usually basin folder
   self.name = name # or prefix...
   self.basin_name = basin # has to be a long_name in order to construct the folder
   self.meta_file = '{:s}/{:s}'.format(folder,name + self.meta_ext)  
   if not os.path.isfile(self.meta_file): 
     if lcheck: raise IOError(self.meta_file)
     else: self.meta_file = None # clear if not available
   self.monthly_file = '{:s}/{:s}'.format(folder,name + self.monthly_ext)
   if not os.path.isfile(self.monthly_file): 
     if lcheck: raise IOError(self.monthly_file)
     else: self.monthly_file = None # clear if not available
Example #9
def loadHGS_StnEns(ensemble=None, station=None, varlist=None, varatts=None, name=None, title=None, 
                   period=None, run_period=15, folder=None, obs_period=None,  
                   ensemble_list=None, ensemble_args=None, observation_list=None, # ensemble and obs lists for project
                   loadHGS_StnTS=loadHGS_StnTS, loadWSC_StnTS=loadWSC_StnTS, # these can also be overloaded
                   prefix=None, WSC_station=None, basin=None, basin_list=None, **kwargs):
  ''' a wrapper for the regular HGS loader that can also load gage stations and assemble ensembles '''
  if observation_list is None: observation_list = ('obs','observations')
  if ensemble_list is None: ensemble_list = dict() # empty, i.e. no ensembles
  elif not isinstance(ensemble_list, dict): raise TypeError(ensemble_list)
  if ensemble is None: raise ArgumentError("Mandatory argument 'ensemble' is not defined!")
  # decide what to do, based on inputs
  if ensemble.lower() in observation_list:
      # translate parameters
      station = station if WSC_station is None else WSC_station
      period = period if obs_period is None else obs_period
      filetype = 'monthly'
      # load gage station with slightly altered parameters
      dataset = loadWSC_StnTS(station=station, name=name, title=title, basin=basin, basin_list=basin_list, 
                              varlist=varlist, varatts=varatts, period=period, filetype=filetype)
  elif ensemble.lower() in ensemble_list:
      if ensemble_args is None: ensemble_args = dict()
      # loop over list of experiments in ensemble
      ens = []
      for exp in ensemble_list[ensemble]:
          # load individual HGS simulation
          ds = loadHGS_StnTS(station=station, varlist=varlist, varatts=varatts, name=name, title=title, 
                             period=period, ENSEMBLE=exp, run_period=run_period, folder=folder, prefix=prefix, 
                             WSC_station=WSC_station, basin=basin, basin_list=basin_list, **kwargs)
          ens.append(ds)
      # construct ensemble by concatenating time-series
      ensemble_args.setdefault('name',ds.name.replace(exp,ensemble).replace(exp.title(),ensemble.title()))
      ensemble_args.setdefault('title',ds.title.replace(exp,ensemble).replace(exp.title(),ensemble.title())) 
      # N.B.: the ensemble name is constructed by replacing the experiment name in specific dataset names with the ensemble name
      ensemble_args.setdefault('axis','time')
      dataset = concatDatasets(ens, **ensemble_args)
  else:
      # load HGS simulation
      dataset = loadHGS_StnTS(station=station, varlist=varlist, varatts=varatts, name=name, title=title, period=period, 
                              ENSEMBLE=ensemble, run_period=run_period, folder=folder, prefix=prefix, 
                              WSC_station=WSC_station, basin=basin, basin_list=basin_list, **kwargs)
  return dataset
Example #10
def PCA(data,
        degree=None,
        lprewhiten=False,
        lpostwhiten=False,
        lEOF=False,
        lfeedback=False):
    ''' A function to perform principal component analysis and return the time-series of the leading EOF's. '''
    data = np.asarray(data)
    if not data.ndim == 2: raise ArgumentError(data.ndim)
    # pre-whiten features
    if lprewhiten:
        data -= data.mean(axis=0, keepdims=True)
        data /= data.std(axis=0, keepdims=True)
    # compute PCA
    R = np.cov(data.transpose())  # covariance matrix
    eig, eof = la.eigh(R)  # eigenvalues, eigenvectors (of symmetric matrix)
    ieig = np.argsort(eig, )[::-1]  # sort in descending order
    eig = eig[ieig]
    eof = eof[:, ieig]
    eig /= eig.sum()  # normalize by total variance
    # truncate EOF's
    if degree is not None:
        eig = eig[:degree]
        eof = eof[:, :degree]
    # generate report/feedback
    if lfeedback:
        string = "Variance explained by {:s} PCA's: {:s}; total variance explained: {:2.0f}%"
        eiglist = ', '.join('{:.0f}%'.format(e * 100.) for e in eig)
        dgrstr = 'all' if degree is None else "{:d} leading".format(degree)
        print(string.format(dgrstr, eiglist, eig.sum() * 100.))
    # project data onto (leading) EOF's
    pca = np.dot(data, eof)  # inverse order, because they are transposed
    # post-whiten features
    if lpostwhiten:
        pca -= pca.mean(axis=0, keepdims=True)
        pca /= pca.std(axis=0, keepdims=True)
    # return results
    if lEOF: return pca, eig, eof
    else: return pca, eig
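The core of the PCA above as a short self-contained example (random data; numpy.linalg is assumed here, the module alias la may differ in the original):

import numpy as np
import numpy.linalg as la

data = np.random.randn(200, 5)   # 200 samples, 5 features
R = np.cov(data.transpose())     # 5x5 covariance matrix
eig, eof = la.eigh(R)            # eigenvalues/-vectors of the symmetric matrix
ieig = np.argsort(eig)[::-1]     # sort in descending order
eig, eof = eig[ieig], eof[:, ieig]
eig /= eig.sum()                 # fraction of explained variance
pca = np.dot(data, eof)          # project data onto the EOFs
print(eig.round(2), pca.shape)   # e.g. [0.24 0.22 0.2  0.18 0.16] (200, 5)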
Example #11
def loadHydro(filename='discharge_out.mpiio', folder=None, nreal=None, ntime=None, dtype=np.float64):
    ''' function to load hydrographs/discharge from EnKF output '''
    if not nreal and not ntime: 
      raise ArgumentError("Please specify number of realizations 'nreal' or number of time steps 'ntime'.")
    filepath = os.path.join(folder,filename)
    if isinstance(dtype, str): dtype = getattr(np,dtype)
    # load data
    data = np.fromfile(filepath, dtype=dtype)    
    # reshape (need to know number or realizations or time steps)
    n = data.size
    if nreal:
        nt = int(n/nreal)
        nr = nreal
        if ntime  and nt != ntime:
            raise ValueError("Given number of time steps is not consistent with file size or data type ({} != {}).".format(ntime,nt))
    elif ntime:
        nt = ntime
        nr = int(n/ntime)
    if nt*nr != n: 
        raise ValueError("Given number of realizations ({}) and time steps ({}) do not divide the number of data points ({}).".format(nr,nt,n))
    data = data.reshape((nt,nr))
    # return timeseries
    return data
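A short round-trip sketch of the reshape logic, writing a small test file first (the file name is hypothetical):

import numpy as np

nt, nr = 5, 3                                  # time steps and realizations
np.arange(nt * nr, dtype=np.float64).tofile('discharge_test.mpiio')
data = np.fromfile('discharge_test.mpiio', dtype=np.float64)
nreal = 3                                      # known number of realizations
ntime = data.size // nreal                     # infer the number of time steps
assert ntime * nreal == data.size
data = data.reshape((ntime, nreal))            # one column per realization
print(data.shape)                              # (5, 3)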
Example #12
def defaultNamedtuple(typename, field_names, defaults=None):
    ''' wrapper for namedtuple that supports defaults; adapted from stackoverflow:
      https://stackoverflow.com/questions/11351032/named-tuple-and-optional-keyword-arguments '''
    T = col.namedtuple(typename, field_names)  # make named tuple
    T.__new__.__defaults__ = (None, ) * len(T._fields)  # set defaults to None
    # add custom defaults
    if defaults is not None:
        if isinstance(defaults, col.Mapping):
            prototype = T(**defaults)
        elif isinstance(defaults, col.Iterable):
            prototype = T(*defaults)
        else:
            raise ArgumentError(str(defaults))
        T.__new__.__defaults__ = tuple(prototype)
#     # add self-reference defaults
#     if ref_prefix:
#       l = len(ref_prefix)
#       for field,value in T._asdict().iteritems():
#         if isinstance(value,basestring) and value[:l] == ref_prefix:
#           T.__dict__[field] = T.__dict__[value[l:]]
#     # N.B.: this would have to go into the constructor in order to work...
    # return namedtuple with defaults
    return T
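Usage sketch of the defaults mechanism (the same __new__.__defaults__ trick, shown here with plain collections.namedtuple):

import collections as col

Point = col.namedtuple('Point', ('x', 'y', 'z'))
Point.__new__.__defaults__ = (None,) * len(Point._fields)    # all fields default to None
Point.__new__.__defaults__ = tuple(Point(x=0., y=0., z=0.))  # custom defaults via a prototype
print(Point())       # Point(x=0.0, y=0.0, z=0.0)
print(Point(x=1.5))  # Point(x=1.5, y=0.0, z=0.0)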
Example #13
def loadHGS_StnTS(station=None, varlist=None, varatts=None, folder=None, name=None, title=None,
                  start_date=None, end_date=None, run_period=15, period=None, lskipNaN=False, lcheckComplete=True,
                  basin=None, WSC_station=None, basin_list=None, filename=None, prefix=None, 
                  scalefactors=None, **kwargs):
  ''' Get a properly formatted HGS dataset with monthly time-series at station locations; as in
      the hgsrun module, the capitalized kwargs can be used to construct folders and/or names '''
  if folder is None or ( filename is None and station is None ): raise ArgumentError
  # try to find meta data for gage station from WSC
  HGS_station = station
  if basin is not None and basin_list is not None:
    station_name = station
    station = getGageStation(basin=basin, station=station if WSC_station is None else WSC_station, 
                             basin_list=basin_list) # only works with registered basins
    if station_name is None: station_name = station.name # backup, in case we don't have a HGS station name
    metadata = station.getMetaData() # load station meta data
    if metadata is None: raise GageStationError(name)
  else: 
    metadata = dict(); station = None; station_name =  None    
  # prepare name expansion arguments (all capitalized)
  expargs = dict(ROOT_FOLDER=root_folder, STATION=HGS_station, NAME=name, TITLE=title,
                 PREFIX=prefix, BASIN=basin, WSC_STATION=WSC_station)
  for key,value in metadata.items():
      if isinstance(value,basestring):
          expargs['WSC_'+key.upper()] = value # in particular, this includes WSC_ID
  if 'WSC_ID' in expargs: 
      if expargs['WSC_ID'][0] == '0': expargs['WSC_ID0'] = expargs['WSC_ID'][1:]
      else: raise DatasetError('Expected leading zero in WSC station ID: {}'.format(expargs['WSC_ID']))
  # exparg preset keys will get overwritten if capitalized versions are defined
  for key,value in kwargs.items():
    KEY = key.upper() # we only use capitalized keywords, and non-capitalized keywords are only used/converted
    if KEY == key or KEY not in kwargs: expargs[KEY] = value # if no capitalized version is defined
  # read folder and infer prefix, if necessary
  folder = folder.format(**expargs)
  if not os.path.exists(folder): raise IOError(folder)
  if expargs['PREFIX'] is None:
    with open('{}/{}'.format(folder,prefix_file), 'r') as pfx:
      expargs['PREFIX'] = prefix = ''.join(pfx.readlines()).strip()      
  # now assemble file name for station timeseries
  filename = filename.format(**expargs)
  filepath = '{}/{}'.format(folder,filename)
  if not os.path.exists(filepath): raise IOError(filepath)
  if station_name is None: 
      station_name = filename[filename.index('hydrograph.')+1:-4] if station is None else station
  # set meta data (and allow keyword expansion of name and title)
  metadata['problem'] = prefix
  metadata['station_name'] = metadata.get('long_name', station_name)
  if name is not None: name = name.format(**expargs) # name expansion with capitalized keyword arguments
  else: name = 'HGS_{:s}'.format(station_name)
  metadata['name'] = name; expargs['Name'] = name.title() # name in title format
  if title is None: title = '{{Name:s}} (HGS, {problem:s})'.format(**metadata)
  title = title.format(**expargs) # name expansion with capitalized keyword arguments
  metadata['long_name'] = metadata['title'] = title
  # now determine start data for date_parser
  if end_date is None: 
      if start_date and run_period: end_date = start_date + run_period 
      elif period: end_date = period[1]
      else: raise ArgumentError("Need to specify either 'start_date' & 'run_period' or 'period' to infer 'end_date'.")
  end_year,end_month,end_day = convertDate(end_date)
  if start_date is None: 
      if end_date and run_period: start_date = end_date - run_period 
      elif period: start_date = period[0]
      else: raise ArgumentError("Need to specify either 'end_date' & 'run_period' or 'period' to infer 'start_date'.")
  start_year,start_month,start_day = convertDate(start_date)
  if start_day != 1 or end_day != 1: 
    raise NotImplementedError('Currently only monthly data is supported.')
#   import functools
#   date_parser = functools.partial(date_parser, year=start_year, month=start_month, day=start_day)
#   # now load data using pandas ascii reader
#   data_frame = pd.read_table(filepath, sep='\s+', header=2, dtype=np.float64, index_col=['time'], 
#                              date_parser=date_parser, names=ascii_varlist)
#   # resample to monthly data
#   data_frame = data_frame.resample(resampling).agg(np.mean)
#       data = data_frame[flowvar].values
  # parse header
  if varlist is None: varlist = variable_list[:] # default list 
  with open(filepath, 'r') as f:
      line = f.readline(); lline = line.lower() # 1st line
      if not "hydrograph" in lline: raise GageStationError(line,filepath)
      # parse variables and determine columns
      line = f.readline(); lline = line.lower() # 2nd line
      if not "variables" in lline: raise GageStationError(line)
      variable_order = [v.strip('"').lower() for v in line[line.find('"'):].strip().split(',')]
  # figure out varlist and data columns
  if variable_order[0] == 'time': del variable_order[0] # only keep variables
  else: raise GageStationError(variable_order)
  variable_order = [hgs_variables[v] for v in variable_order] # replace HGS names with GeoPy names
  vardict = {v:i+1 for i,v in enumerate(variable_order)} # column mapping; +1 because time was removed
  variable_order = [v for v in variable_order if v in varlist or flow_to_flux[v] in varlist]
  usecols = tuple(vardict[v] for v in variable_order) # variable columns that need to loaded (except time, which is col 0)
  assert 0 not in usecols, usecols
  # load data as tab separated values
  data = np.genfromtxt(filepath, dtype=np.float64, delimiter=None, skip_header=3, usecols = (0,)+usecols)
  assert data.shape[1] == len(usecols)+1, data.shape
  if lskipNaN:
      data = data[np.isnan(data).sum(axis=1)==0,:]
  elif np.any( np.isnan(data) ):
      raise DataError("Missing values (NaN) encountered in hydrograph file; use 'lskipNaN' to ignore.\n('{:s}')".format(filepath))    
  time_series = data[:,0]; flow_data = data[:,1:]
  assert flow_data.shape == (len(time_series),len(usecols)), flow_data.shape
  # original time deltas in seconds
  time_diff = time_series.copy(); time_diff[1:] = np.diff(time_series) # time period between time steps
  assert np.all( time_diff > 0 ), filepath
  time_diff = time_diff.reshape((len(time_diff),1)) # reshape to make sure broadcasting works
  # integrate flow over time steps before resampling
  flow_data[1:,:] -= np.diff(flow_data, axis=0)/2. # get average flow between time steps
  flow_data *= time_diff # integrate flow in time interval by multiplying average flow with time period
  flow_data = np.cumsum(flow_data, axis=0) # integrate by summing up total flow per time interval
  # generate regular monthly time steps
  start_datetime = np.datetime64(dt.datetime(year=start_year, month=start_month, day=start_day), 'M')
  end_datetime = np.datetime64(dt.datetime(year=end_year, month=end_month, day=end_day), 'M')
  time_monthly = np.arange(start_datetime, end_datetime+np.timedelta64(1, 'M'), dtype='datetime64[M]')
  assert time_monthly[0] == start_datetime, time_monthly[0]
  assert time_monthly[-1] == end_datetime, time_monthly[-1] 
  # convert monthly time series to regular array of seconds since start date
  time_monthly = ( time_monthly.astype('datetime64[s]') - start_datetime.astype('datetime64[s]') ) / np.timedelta64(1,'s')
  assert time_monthly[0] == 0, time_monthly[0]
  # interpolate integrated flow to new time axis
  #flow_data = np.interp(time_monthly, xp=time_series[:,0], fp=flow_data[:,0],).reshape((len(time_monthly),1))
  time_series = np.concatenate(([0],time_series), axis=0) # integrated flow at time zero must be zero...
  flow_data = np.concatenate(([[0,]*len(usecols)],flow_data), axis=0) # ... this is probably better than interpolation
  # N.B.: we are adding zeros here so we don't have to extrapolate to the left; on the right we just fill in NaN's
  if ( time_monthly[-1] - time_series[-1] ) > 5*86400.:
      if lcheckComplete:
        raise DataError("Data record ends more than 5 days before end of period: {} days".format((time_monthly[-1]-time_series[-1])/86400.))
      else:
        warn("Data record ends more than 5 days before end of period: {} days".format((time_monthly[-1]-time_series[-1])/86400.))
  elif ( time_monthly[-1] - time_series[-1] ) > 3*86400. and lcheckComplete:
      warn("Data record ends more than 3 days before end of period: {} days".format((time_monthly[-1]-time_series[-1])/86400.))
  flow_interp = si.interp1d(x=time_series, y=flow_data, kind='linear', axis=0, copy=False, 
                            bounds_error=False, fill_value=np.NaN, assume_sorted=True) 
  flow_data = flow_interp(time_monthly) # evaluate with call
  # compute monthly flow rate from interpolated integrated flow
  flow_data = np.diff(flow_data, axis=0) / np.diff(time_monthly, axis=0).reshape((len(time_monthly)-1,1))
  flow_data *= 1000 # convert from m^3/s to kg/s
  # construct time axis
  start_time = 12*(start_year - 1979) + start_month -1
  end_time = 12*(end_year - 1979) + end_month -1
  time = Axis(name='time', units='month', atts=dict(long_name='Month since 1979-01'), 
              coord=np.arange(start_time, end_time)) # not including the last, e.g. 1979-01 to 1980-01 is 12 month
  assert len(time_monthly) == end_time-start_time+1
  assert flow_data.shape == (len(time),len(variable_order)), (flow_data.shape,len(time),len(variable_order))
  # construct dataset
  dataset = Dataset(atts=metadata)
  dataset.station = station # add gage station object, if available (else None)
  for i,flowvar in enumerate(variable_order):
      data = flow_data[:,i]
      fluxvar = flow_to_flux[flowvar]
      if flowvar in varlist:
        flowatts = variable_attributes[flowvar]
        # convert variables and put into dataset (monthly time series)
        if flowatts['units'] != 'kg/s': 
          raise VariableError("Hydrograph data is read as kg/s; flow variable does not match.\n{}".format(flowatts))
        dataset += Variable(data=data, axes=(time,), **flowatts)
      if fluxvar in varlist and 'shp_area' in metadata:
        # compute surface flux variable based on drainage area
        fluxatts = variable_attributes[fluxvar]
        if fluxatts['units'] != 'kg/m^2/s': raise VariableError(fluxatts)
        data = data / metadata['shp_area'] # need to make a copy
        dataset += Variable(data=data, axes=(time,), **fluxatts)
  # apply analysis period
  if period is not None:
      dataset = dataset(years=period)
  # adjust scalefactors, if necessary
  if scalefactors:
      if isinstance(scalefactors,dict):
          dataset = updateScalefactor(dataset, varlist=scalefactors, scalefactor=None)
      elif isNumber(scalefactors):
          scalelist = ('discharge','seepage','flow')
          dataset = updateScalefactor(dataset, varlist=scalelist, scalefactor=scalefactors)
      else: 
          raise TypeError(scalefactors) 
  # return completed dataset
  return dataset
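The resampling strategy in the middle of this loader (integrate flow over the irregular output times, interpolate the cumulative flow to regular times, then differentiate) can be sketched in isolation as follows (synthetic numbers; scipy.interpolate assumed available as si):

import numpy as np
import scipy.interpolate as si

time_series = np.array([0., 10., 25., 50., 90.])  # irregular output times (seconds)
flow = np.array([1., 2., 2., 3., 1.])             # instantaneous flow rates (m^3/s)
# trapezoidal integration: average flow between steps times step length, then cumulative sum
avg_flow = flow.copy(); avg_flow[1:] = (flow[1:] + flow[:-1]) / 2.
time_diff = time_series.copy(); time_diff[1:] = np.diff(time_series)
cum_flow = np.cumsum(avg_flow * time_diff)        # cumulative volume at each output time
# interpolate cumulative volume to a regular time axis and differentiate
time_regular = np.arange(0., 91., 30.)            # regular 30-second intervals ("months")
interp = si.interp1d(x=time_series, y=cum_flow, kind='linear', bounds_error=False, fill_value=np.nan)
rates = np.diff(interp(time_regular)) / np.diff(time_regular)  # average flow per interval
print(rates)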
Example #14
def loadGageStation(basin=None, station=None, varlist=None, varatts=None, mode='climatology', 
                    aggregation=None, filetype='monthly', folder=None, name=None, period=None,
                    basin_list=None, lcheck=True, lexpand=True, lfill=True, lflatten=True,
                    lkgs=True, scalefactors=None, title=None):
  ''' function to load hydrograph climatologies and timeseries for a given basin '''
  ## resolve input
  if mode == 'timeseries' and aggregation: 
    raise ArgumentError('Timeseries does not support aggregation.')
  # get GageStation instance
  station = getGageStation(basin=basin, station=station, name=name, folder=folder, 
                           river=None, basin_list=basin_list, lcheck=True)
  # variable attributes
  if varlist is None: varlist = variable_list
  elif not isinstance(varlist,(list,tuple)): raise TypeError  
  varlist = list(varlist) # make copy of varlist to avoid interference
  if varatts is None: 
    if aggregation is None: varatts = variable_attributes_kgs if lkgs else variable_attributes_mms
    else: varatts = agg_varatts_kgs if lkgs else agg_varatts_mms
  elif not isinstance(varatts,dict): raise TypeError
  
  ## read csv data
  # time series data and time coordinates
  lexpand = True; lfill = True
  if mode == 'climatology': lexpand = False; lfill = False; lflatten = False
  data, time = station.getTimeseriesData(units='kg/s' if lkgs else 'm^3/s', lcheck=True, lexpand=lexpand, 
                                         lfill=lfill, period=period, lflatten=lflatten)
  # station meta data
  metadata = station.getMetaData(lcheck=True)
  den = metadata['shp_area'] if lkgs else ( metadata['shp_area'] / 1000. )
  ## create dataset for station
  dataset = Dataset(name='WSC', title=title or metadata['Station Name'], varlist=[], atts=metadata,) 
  if mode.lower() in ('timeseries','time-series'): 
    time = time.flatten(); data = data.flatten() # just to make sure...
    # make time axis based on time coordinate from csv file
    timeAxis = Axis(name='time', units='month', coord=time, # time series centered at 1979-01
                    atts=dict(long_name='Month since 1979-01'))
    dataset += timeAxis
    # load mean discharge
    dataset += Variable(axes=[timeAxis], data=data, atts=varatts['discharge'])
    # load mean runoff
    doa = data / den 
    dataset += Variable(axes=[timeAxis], data=doa, atts=varatts['runoff'])
  elif mode == 'climatology': 
    # N.B.: this is primarily for backwards compatibility; it should not be used anymore...
    # make common time axis for climatology
    te = 12 # length of time axis: 12 month
    climAxis = Axis(name='time', units='month', length=12, coord=np.arange(1,te+1,1)) # monthly climatology
    dataset.addAxis(climAxis, copy=False)
    # extract variables (min/max/mean are separate variables)
    # N.B.: this is mainly for backwards compatibility
    doa = data / den
    if aggregation is None or aggregation.lower() == 'mean':
      # load mean discharge
      tmpdata = nf.nanmean(data, axis=0)
      tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['discharge'])
      dataset.addVariable(tmpvar, copy=False)
      # load mean runoff
      tmpdata = nf.nanmean(doa, axis=0)
      tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['runoff'])
      dataset.addVariable(tmpvar, copy=False)
    if aggregation is None or aggregation.lower() == 'std':
      # load  discharge standard deviation
      tmpdata = nf.nanstd(data, axis=0, ddof=1) # very few values means large uncertainty!
      tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['discstd'])
      dataset.addVariable(tmpvar, copy=False)
      # load  runoff standard deviation
      tmpdata = nf.nanstd(doa, axis=0, ddof=1)
      tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['roff_std'])
      dataset.addVariable(tmpvar, copy=False)
    if aggregation is None or aggregation.lower() == 'sem':
      # load  discharge standard deviation
      tmpdata = nf.nansem(data, axis=0, ddof=1) # very few values means large uncertainty!
      tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['discsem'])
      dataset.addVariable(tmpvar, copy=False)
      # load  runoff standard deviation
      tmpdata = nf.nansem(doa, axis=0, ddof=1)
      tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['roff_sem'])
      dataset.addVariable(tmpvar, copy=False)
    if aggregation is None or aggregation.lower() == 'max':
      # load maximum discharge
      tmpdata = nf.nanmax(data, axis=0)
      tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['discmax'])
      dataset.addVariable(tmpvar, copy=False)
      # load maximum runoff
      tmpdata = nf.nanmax(doa, axis=0)
      tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['roff_max'])
      dataset.addVariable(tmpvar, copy=False)
    if aggregation is None or aggregation.lower() == 'min':
      # load minimum discharge
      tmpdata = nf.nanmin(data, axis=0)
      tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['discmin'])
      dataset.addVariable(tmpvar, copy=False)
      # load minimum runoff
      tmpdata = nf.nanmin(doa, axis=0)
      tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['roff_min'])
      dataset.addVariable(tmpvar, copy=False)
  else: 
    raise NotImplementedError("Time axis mode '{}' is not supported.".format(mode))
  # adjust scalefactors, if necessary
  if scalefactors:
      if isinstance(scalefactors,dict):
          dataset = updateScalefactor(dataset, varlist=scalefactors, scalefactor=None)
      elif isNumber(scalefactors):
          scalelist = ('discharge','StdDisc','SEMDisc','MaxDisc','MinDisc',)
          dataset = updateScalefactor(dataset, varlist=scalelist, scalefactor=scalefactors)
      else: 
          raise TypeError(scalefactors) 
  # return station dataset
  return dataset   
Example #15
def rasterVariable(name=None,
                   units=None,
                   axes=None,
                   atts=None,
                   plot=None,
                   dtype=None,
                   projection=None,
                   griddef=None,
                   file_pattern=None,
                   lgzip=None,
                   lgdal=True,
                   lmask=True,
                   fillValue=None,
                   lskipMissing=True,
                   path_params=None,
                   offset=0,
                   scalefactor=1,
                   transform=None,
                   time_axis=None,
                   lfeedback=False,
                   **kwargs):
    ''' function to read multi-dimensional raster data and construct a GDAL-enabled Variable object '''

    # print status
    if lfeedback: print("Loading variable '{}': ".format(name), end='')  # no newline

    ## figure out axes arguments and load data
    # figure out axes (list/tuple of axes has to be ordered correctly!)
    axes_list = [ax.name for ax in axes[:-2]]
    # N.B.: the last two axes are the two horizontal map axes (x&y); they can be None and will be inferred from raster
    # N.B.: coordinate values can be overridden with keyword arguments, but length must be consistent
    # figure out coordinates for axes
    for ax in axes[:-2]:
        if ax.name in kwargs:
            # just make sure the dimensions match, but use keyword argument
            if not len(kwargs[ax.name]) == len(ax):
                raise AxisError(
                    "Length of Variable axis and raster file dimension have to be equal."
                )
        else:
            # use Axis coordinates and add to kwargs for readRasterArray call
            kwargs[ax.name] = tuple(ax.coord)
    # load raster data
    if lfeedback: print("'{}'".format(file_pattern))
    data, geotransform = readRasterArray(file_pattern,
                                         lgzip=lgzip,
                                         lgdal=lgdal,
                                         dtype=dtype,
                                         lmask=lmask,
                                         fillValue=fillValue,
                                         lgeotransform=True,
                                         axes=axes_list,
                                         lna=False,
                                         lskipMissing=lskipMissing,
                                         path_params=path_params,
                                         lfeedback=lfeedback,
                                         **kwargs)
    # shift and rescale
    if offset != 0: data += offset
    if scalefactor != 1: data *= scalefactor
    ## create Variable object and add GDAL
    # check map axes and generate if necessary
    xlon, ylat = getAxes(
        geotransform,
        xlen=data.shape[-1],
        ylen=data.shape[-2],
        projected=griddef.isProjected if griddef else bool(projection))
    axes = list(axes)
    if axes[-1] is None: axes[-1] = xlon
    elif len(axes[-1]) != len(xlon): raise AxisError(axes[-1])
    if axes[-2] is None: axes[-2] = ylat
    elif len(axes[-2]) != len(ylat): raise AxisError(axes[-2])
    # create regular Variable with data in memory
    var = Variable(name=name,
                   units=units,
                   axes=axes,
                   data=data,
                   dtype=dtype,
                   mask=None,
                   fillValue=fillValue,
                   atts=atts,
                   plot=plot)
    # apply transform (if any), now that we have axes etc.
    if transform is not None: var = transform(var=var, time_axis=time_axis)
    # add GDAL functionality
    if griddef is not None:
        # perform some consistency checks ...
        if projection is None:
            projection = griddef.projection
        elif projection != griddef.projection:
            raise ArgumentError(
                "Conflicting projection and GridDef!\n {} != {}".format(
                    projection, griddef.projection))
        if not np.isclose(geotransform, griddef.geotransform).all():
            raise ArgumentError(
                "Conflicting geotransform (from raster) and GridDef!\n {} != {}"
                .format(geotransform, griddef.geotransform))
        # ... and use provided geotransform (due to issues with numerical precision, this is usually better)
        geotransform = griddef.geotransform  # if we don't pass the geotransform explicitly, it will be recomputed from the axes
    # add GDAL functionality
    var = addGDALtoVar(var,
                       griddef=griddef,
                       projection=projection,
                       geotransform=geotransform,
                       gridfolder=None)

    # return final, GDAL-enabled variable
    return var
Example #16
def rasterDataset(name=None,
                  title=None,
                  vardefs=None,
                  axdefs=None,
                  atts=None,
                  projection=None,
                  griddef=None,
                  lgzip=None,
                  lgdal=True,
                  lmask=True,
                  fillValue=None,
                  lskipMissing=True,
                  lgeolocator=True,
                  file_pattern=None,
                  lfeedback=True,
                  **kwargs):
    ''' function to load a set of variables that are stored in raster format in a systematic directory tree into a Dataset
        Variables and Axes are defined as follows:
          vardefs[varname] = dict(name=string, units=string, axes=tuple of strings, atts=dict, plot=dict, dtype=np.dtype, fillValue=value)
          axdefs[axname]   = dict(name=string, units=string, atts=dict, coord=array or list) or None
        The path to raster files is constructed as variable_pattern+axes_pattern, where axes_pattern is defined through the axes, 
        (as in rasterVariable) and variable_pattern takes the special keyword VAR, which is the variable key in vardefs.
    '''

    ## prepare input data and axes
    if griddef:
        xlon, ylat = griddef.xlon, griddef.ylat
        if projection is None:
            projection = griddef.projection
        elif projection != griddef.projection:
            raise ArgumentError("Conflicting projection and GridDef!")
        geotransform = griddef.geotransform
        isProjected = griddef.isProjected
    else:
        xlon = ylat = geotransform = None
        isProjected = False if projection is None else True
    # construct axes dict
    axes = dict()
    for axname, axdef in axdefs.items():
        if axdef is None:
            axes[axname] = None  # map axes may be None; they are filled in later from the raster/griddef
        else:
            assert 'coord' in axdef, axdef
            assert ('name' in axdef and 'units' in axdef) or 'atts' in axdef, axdef
            ax = Axis(**axdef)
            axes[ax.name] = ax
    # check for map Axis
    if isProjected:
        if 'x' not in axes: axes['x'] = xlon
        if 'y' not in axes: axes['y'] = ylat
    else:
        if 'lon' not in axes: axes['lon'] = xlon
        if 'lat' not in axes: axes['lat'] = ylat

    ## load raster data into Variable objects
    varlist = []
    for varname, vardef in vardefs.items():
        # check definitions
        assert 'axes' in vardef and 'dtype' in vardef, vardef
        assert ('name' in vardef
                and 'units' in vardef) or 'atts' in vardef, vardef
        # determine relevant axes
        vardef = vardef.copy()
        axes_list = [
            None if ax is None else axes[ax] for ax in vardef.pop('axes')
        ]
        # define path parameters (with varname)
        path_params = vardef.pop('path_params', None)
        path_params = dict() if path_params is None else path_params.copy()
        if 'VAR' not in path_params:
            path_params['VAR'] = varname  # a special key
        # add kwargs and relevant axis indices
        relaxes = [ax.name for ax in axes_list
                   if ax is not None]  # relevant axes
        for key, value in kwargs.items():
            if key not in axes or key in relaxes:
                vardef[key] = value
        # create Variable object
        var = rasterVariable(projection=projection,
                             griddef=griddef,
                             file_pattern=file_pattern,
                             lgzip=lgzip,
                             lgdal=lgdal,
                             lmask=lmask,
                             lskipMissing=lskipMissing,
                             axes=axes_list,
                             path_params=path_params,
                             lfeedback=lfeedback,
                             **vardef)
        # vardef components: name, units, atts, plot, dtype, fillValue
        varlist.append(var)
        # check that map axes are correct
        for ax in var.xlon, var.ylat:
            if axes[ax.name] is None: axes[ax.name] = ax
            elif axes[ax.name] != ax:
                raise AxisError("{} axes are incompatible.".format(ax.name))
        if griddef is None: griddef = var.griddef
        elif griddef != var.griddef:
            raise AxisError("GridDefs are inconsistent.")
        if geotransform is None: geotransform = var.geotransform
        elif geotransform != var.geotransform:
            raise AxisError(
                "Conflicting geotransform (from Variable) and GridDef!\n {} != {}"
                .format(var.geotransform, geotransform))

    ## create Dataset
    # create dataset
    dataset = Dataset(name=name,
                      title=title,
                      varlist=varlist,
                      axes=axes,
                      atts=atts)
    # add GDAL functionality
    dataset = addGDALtoDataset(dataset,
                               griddef=griddef,
                               projection=projection,
                               geotransform=geotransform,
                               gridfolder=None,
                               lwrap360=None,
                               geolocator=lgeolocator,
                               lforce=False)
    # N.B.: for some reason we also need to pass the geotransform, otherwise it is recomputed internally and some consistency
    #       checks fail due to machine-precision differences

    # return GDAL-enabled Dataset
    return dataset
Example #17
def expandArgumentList(inner_list=None,
                       outer_list=None,
                       expand_list=None,
                       lproduct='outer',
                       **kwargs):
    ''' A function that generates a list of complete argument dict's, based on given kwargs and certain 
      expansion rules: kwargs listed in expand_list are expanded and distributed element-wise, 
      either as inner ('inner_list') or outer ('outer_list') product, while other kwargs are repeated 
      in every argument dict. 
      Arguments can be expanded simultaneously (in parallel) within an outer product by specifying
      them as a tuple within the outer product argument list ('outer_list'). '''
    if not (expand_list or inner_list or outer_list):
        arg_dicts = [kwargs]  # return immediately - nothing to do
    else:

        # handle legacy arguments
        if expand_list is not None:
            if inner_list is not None or outer_list is not None:
                raise ArgumentError("Can not mix input modes!")
            if lproduct.lower() == 'inner': inner_list = expand_list
            elif lproduct.lower() == 'outer': outer_list = expand_list
            else: raise ArgumentError(lproduct)
        outer_list = outer_list or []
        inner_list = inner_list or []

        # handle outer product expansion first
        if len(outer_list) > 0:
            kwtmp = {
                key: value
                for key, value in kwargs.items() if key not in inner_list
            }

            # detect variables for parallel expansion
            # N.B.: parallel outer expansion is handled by replacing the arguments in each parallel expansion group
            #       with a single (fake) argument that is a tuple of the original argument values; the tuple is then,
            #       after expansion, disassembled into its former constituent arguments
            par_dict = dict()
            for kw in outer_list:
                if isinstance(kw, (tuple, list)):
                    # retrieve parallel expansion group
                    par_args = [kwtmp.pop(name) for name in kw]
                    if not all(
                        [len(args) == len(par_args[0]) for args in par_args]):
                        raise ArgumentError(
                            "Lists for parallel expansion arguments have to be of same length!"
                        )
                    # introduce fake argument and save record
                    fake = 'TMP_' + '_'.join(kw) + '_{:d}'.format(len(kw))  # long name that is unlikely to interfere...
                    par_dict[fake] = kw  # store record of parallel expansion for reassembly later
                    kwtmp[fake] = zip(*par_args)  # transpose lists to get a list of tuples
                elif not isinstance(kw, basestring):
                    raise TypeError(kw)
            # replace entries in outer list
            if len(par_dict) > 0:
                outer_list = outer_list[:]  # copy list
                for fake, names in par_dict.items():
                    if names in outer_list:
                        outer_list[outer_list.index(names)] = fake
            assert all([isinstance(arg, basestring) for arg in outer_list])

            outer_list, outer_dict = _prepareList(outer_list, kwtmp)
            lstlen = 1
            for el in outer_list:
                lstlen *= len(outer_dict[el])
            # execute recursive function for outer product expansion
            list_dict = _loop_recursion(outer_list,
                                        **outer_dict)  # use copy of
            # N.B.: returns a dictionary where all kwargs have been expanded to lists of appropriate length
            assert all(key in outer_dict for key in list_dict.iterkeys())
            assert all(len(list_dict[el]) == lstlen
                       for el in outer_list)  # check length
            assert all(len(ld) == lstlen
                       for ld in list_dict.itervalues())  # check length

            # disassemble parallel expansion tuple and reassemble as individual arguments
            if len(par_dict) > 0:
                for fake, names in par_dict.iteritems():
                    assert fake in list_dict
                    par_args = zip(
                        *list_dict.pop(fake)
                    )  # transpose, to get an expanded tuple for each argument
                    assert len(par_args) == len(names)
                    for name, args in zip(names, par_args):
                        list_dict[name] = args

        # handle inner product expansion last
        if len(inner_list) > 0:
            kwtmp = kwargs.copy()
            if len(outer_list) > 0:
                kwtmp.update(list_dict)
                inner_list = outer_list + inner_list
            # N.B.: this replaces all outer expansion arguments with lists of appropriate length for inner expansion
            inner_list, inner_dict = _prepareList(inner_list, kwtmp)
            # inner product: essentially no expansion
            lst0 = inner_dict[inner_list[0]]
            lstlen = len(lst0)
            for el in inner_list:  # check length
                if len(inner_dict[el]) == 1:
                    inner_dict[el] = inner_dict[el] * lstlen  # broadcast singleton list
                elif len(inner_dict[el]) != lstlen:
                    raise TypeError(
                        'Lists have to be of same length to form inner product!'
                    )
            list_dict = inner_dict

        ## generate list of argument dicts
        arg_dicts = []
        for n in xrange(lstlen):
            # assemble arguments
            lstargs = {key: lst[n] for key, lst in list_dict.iteritems()}
            arg_dict = kwargs.copy()
            arg_dict.update(lstargs)
            arg_dicts.append(arg_dict)
    # return list of arguments
    return arg_dicts
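To illustrate what the two expansion modes produce, here is a tiny standalone comparison (itertools only, not the function above): the outer product combines every value of one argument with every value of the other, while the inner product pairs them element-wise.

import itertools

names = ['exp1', 'exp2']
colors = ['red', 'blue', 'green']
# outer product: 2 x 3 = 6 argument dicts
outer = [dict(name=n, color=c) for n, c in itertools.product(names, colors)]
# inner product: lists must have the same length and are zipped element-wise
inner = [dict(name=n, color=c) for n, c in zip(['exp1', 'exp2', 'exp3'], colors)]
print(len(outer), len(inner))  # 6 3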
Example #18
def addDistFit(ensemble=None,
               lfit=True,
               lflatten=None,
               lrescale=False,
               reference=None,
               target=None,
               lbootstrap=False,
               nbs=30,
               sample_axis=None,
               lglobalScale=False,
               lcrossval=False,
               ncv=0.2,
               dist=None,
               dist_args=None,
               load_list=None,
               lproduct='outer',
               **kwargs):
    ''' add distribution fits to ensemble; optionally also rescale; kwargs are necessary for correct list expansion '''

    # find appropriate sample axis
    if lflatten:
        if sample_axis is not None: raise ArgumentError(sample_axis)
    elif sample_axis is None:  # auto-detect
        for saxis in ('time', 'year'):
            if all([all(ens.hasAxis(saxis)) for ens in ensemble]):
                sample_axis = saxis
                break
        if sample_axis is None: raise AxisError("No sample axis detected")
    else:
        if isinstance(sample_axis, basestring):
            if not all([all(ens.hasAxis(sample_axis)) for ens in ensemble]):
                raise AxisError(sample_axis)
        elif isinstance(sample_axis, (tuple, list)):
            # check that axes are there
            for ax in sample_axis:
                if not all([all(ens.hasAxis(ax)) for ens in ensemble]):
                    raise AxisError(ax)
            # merge axes
            ensemble = [ens.mergeAxes(axes=sample_axis, new_axis='sample', asVar=True, linplace=False, \
                                    lcheckAxis=False) for ens in ensemble]
            sample_axis = 'sample'
        else:
            raise AxisError(sample_axis)
    # perform fit or return dummy
    if dist_args is None: dist_args = dict()
    if lfit:
        fitens = [
            ens.fitDist(lflatten=lflatten,
                        axis=sample_axis,
                        lcrossval=lcrossval,
                        ncv=ncv,
                        lignoreParams=True,
                        lbootstrap=lbootstrap,
                        nbs=nbs,
                        dist=dist,
                        **dist_args) for ens in ensemble
        ]
    else:
        fitens = [None] * len(ensemble)

    # rescale fitted distribution (according to certain rules)
    if lrescale:
        if not reference: raise ArgumentError(str(reference))
        # expand target list
        if isinstance(target, (list, tuple)):
            expand_list = load_list[:]
            if 'names' in expand_list:
                expand_list[expand_list.index('names')] = 'target'
            kwarg_list = expandArgumentList(target=target,
                                            expand_list=expand_list,
                                            lproduct=lproduct,
                                            **kwargs)
            targets = [kwarg['target'] for kwarg in kwarg_list]
        else:
            targets = [target] * len(fitens)
        if isinstance(reference, (list, tuple)):
            raise NotImplementedError  # don't expand reference list
        # use global reference, if necessary
        if isinstance(reference, basestring) and not all(reference in fit
                                                         for fit in fitens):
            i = 0
            while i < len(fitens) and reference not in fitens[i]:
                i += 1
            if i >= len(fitens):
                raise ArgumentError, "Reference {:s} not found in any dataset!".format(
                    reference)
            reference = fitens[i][reference]
        sclens = [
            rescaleDistributions(fit,
                                 reference=reference,
                                 target=tgt,
                                 lglobal=lglobalScale)
            for fit, tgt in zip(fitens, targets)
        ]
    else:
        sclens = [None] * len(ensemble)

    # return results
    return fitens, sclens
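
For context, the dist argument above is forwarded to each member's fitDist call, i.e. it names the distribution to fit. The standalone sketch below shows an analogous fit done directly with scipy (the scipy calls are real; the specific distribution name and the correspondence to fitDist internals are assumptions, and the data are synthetic):

# standalone illustration of the kind of fit requested via a dist name like 'genextreme'
import numpy as np
from scipy import stats

np.random.seed(0)
sample = stats.genextreme.rvs(c=-0.1, loc=20., scale=5., size=50)  # synthetic 'annual maxima'
shape, loc, scale = stats.genextreme.fit(sample)  # maximum-likelihood estimate of the GEV parameters
print("shape={:.2f}, loc={:.2f}, scale={:.2f}".format(shape, loc, scale))
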
Example #19
0
def performExport(dataset, mode, dataargs, expargs, bcargs, loverwrite=False, 
                  ldebug=False, lparallel=False, pidstr='', logger=None):
    ''' worker function to export ASCII rasters for a given dataset '''
    # input checking
    if not isinstance(dataset,basestring): raise TypeError
    if not isinstance(dataargs,dict): raise TypeError # all dataset arguments are kwargs 
    
    # logging
    if logger is None: # make new logger     
        logger = logging.getLogger() # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger,basestring): 
            logger = logging.getLogger(name=logger) # connect to existing one
        elif not isinstance(logger,logging.Logger): 
            raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))
  
    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs, lone=False)
    dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; domain = dataargs.domain
    
    # figure out bias correction parameters
    if bcargs:
        bcargs = bcargs.copy() # first copy, then modify...
        bc_method = bcargs.pop('method',None)
        if bc_method is None: raise ArgumentError("Need to specify bias-correction method to use bias correction!")
        bc_obs = bcargs.pop('obs_dataset',None)
        if bc_obs is None: raise ArgumentError("Need to specify observational dataset to use bias correction!")
        bc_reference = bcargs.pop('reference',None)
        if bc_reference is None: # infer from experiment name
            if dataset_name[-5:] in ('-2050','-2100'): bc_reference = dataset_name[:-5] # cut off the period indicator and hope for the best
            else: bc_reference = dataset_name 
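            # (hypothetical example: a dataset_name like 'max-ctrl-2050' would yield bc_reference 'max-ctrl')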
        bc_grid = bcargs.pop('grid',None)
        if bc_grid is None: bc_grid = dataargs.grid
        bc_domain = bcargs.pop('domain',None)
        if bc_domain is None: bc_domain = domain
        bc_varlist = bcargs.pop('varlist',None)
        bc_varmap = bcargs.pop('varmap',None)       
        bc_tag = bcargs.pop('tag',None) # an optional name extension/tag
        bc_pattern = bcargs.pop('file_pattern',None) # usually default in getPickleFile
        lgzip = bcargs.pop('lgzip',None) # if pickle is gzipped (None: auto-detect based on file name extension)
        # get name of pickle file (and folder)
        picklefolder = dataargs.avgfolder.replace(dataset_name,bc_reference)
        picklefile = getPickleFileName(method=bc_method, obs_name=bc_obs, gridstr=bc_grid, domain=bc_domain, 
                                       tag=bc_tag, pattern=bc_pattern)
        picklepath = '{:s}/{:s}'.format(picklefolder,picklefile)
        if lgzip:
            picklepath += '.gz' # add extension
            if not os.path.exists(picklepath): raise IOError(picklepath)
        elif lgzip is None:
            lgzip = False
            if not os.path.exists(picklepath):
                lgzip = True # assume gzipped file
                picklepath += '.gz' # try with extension...
                if not os.path.exists(picklepath): raise IOError(picklepath)
        elif not os.path.exists(picklepath): raise IOError(picklepath)
        pickleage = datetime.fromtimestamp(os.path.getmtime(picklepath))
        # determine age of pickle file and compare against source age
    else:
        bc_method = False
        pickleage = srcage
    
    # parse export options
    expargs = expargs.copy() # first copy, then modify...
    lm3 = expargs.pop('lm3') # convert kg/m^2/s to m^3/m^2/s (water flux)
    expformat = expargs.pop('format') # needed to get FileFormat object
    exp_list = expargs.pop('exp_list') # this is handled outside of the export
    compute_list = expargs.pop('compute_list', []) # variables to be (re-)computed; all others are loaded from source
    # initialize FileFormat class instance
    fileFormat = getFileFormat(expformat, bc_method=bc_method, **expargs)
    # get folder for target dataset and do some checks
    expname = '{:s}_d{:02d}'.format(dataset_name,domain) if domain else dataset_name
    expfolder = fileFormat.defineDataset(dataset=dataset, mode=mode, dataargs=dataargs, lwrite=True, ldebug=ldebug)
  
    # prepare destination for new dataset
    lskip = fileFormat.prepareDestination(srcage=max(srcage,pickleage), loverwrite=loverwrite)
  
    # depending on last modification time of file or overwrite setting, start computation, or skip
    if lskip:        
        # print message
        skipmsg = "\n{:s}   >>>   Skipping: Format '{:s}' for dataset '{:s}' already exists and is newer than source file.".format(pidstr,expformat,dataset_name)
        skipmsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,expfolder)
        logger.info(skipmsg)              
    else:
            
      ## actually load datasets
      source = loadfct() # load source data
      # check period
      if 'period' in source.atts and dataargs.periodstr != source.atts.period: # a NetCDF attribute
          raise DateError("Specified period is inconsistent with NetCDF records: '{:s}' != '{:s}'".format(periodstr,source.atts.period))
      
      # load BiasCorrection object from pickle
      if bc_method:      
          op = gzip.open if lgzip else open
          with op(picklepath, 'r') as filehandle:
              BC = pickle.load(filehandle) 
          # assemble logger entry
          bcmsgstr = "(performing bias-correction using {:s} from {:s} towards {:s})".format(BC.long_name,bc_reference,bc_obs)
      
      # print message
      if mode == 'climatology': opmsgstr = 'Exporting Climatology ({:s}) to {:s} Format'.format(periodstr, expformat)
      elif mode == 'time-series': opmsgstr = 'Exporting Time-series to {:s} Format'.format(expformat)
      elif mode[-5:] == '-mean': opmsgstr = 'Exporting {:s}-Mean ({:s}) to {:s} Format'.format(mode[:-5], periodstr, expformat)
      else: raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
      # print feedback to logger
      logmsg = '\n{0:s}   ***   {1:^65s}   ***   \n{0:s}   ***   {2:^65s}   ***   \n'.format(pidstr,datamsgstr,opmsgstr)
      if bc_method:
          logmsg += "{0:s}   ***   {1:^65s}   ***   \n".format(pidstr,bcmsgstr)
      logger.info(logmsg)
      if not lparallel and ldebug: logger.info('\n'+str(source)+'\n')
      
      # create GDAL-enabled target dataset
      sink = Dataset(axes=(source.xlon,source.ylat), name=expname, title=source.title, atts=source.atts.copy())
      addGDALtoDataset(dataset=sink, griddef=source.griddef)
      assert sink.gdal, sink
      
      # apply bias-correction
      if bc_method:
          source = BC.correct(source, asNC=False, varlist=bc_varlist, varmap=bc_varmap) # load bias-corrected variables into memory
        
      # N.B.: for variables that are not bias-corrected, data are not loaded immediately but on demand; this way 
      #       I/O and computing can be further disentangled and not all variables are always needed
      
      # compute intermediate variables, if necessary
      for varname in exp_list:
          variables = None # variable list
          var = None
          # (re-)compute variable, if desired...
          if varname in compute_list:
              if varname == 'precip': var = newvars.computeTotalPrecip(source)
              elif varname == 'waterflx': var = newvars.computeWaterFlux(source)
              elif varname == 'liqwatflx': var = newvars.computeLiquidWaterFlux(source)
              elif varname == 'netrad': var = newvars.computeNetRadiation(source, asVar=True)
              elif varname == 'netrad_bb': var = newvars.computeNetRadiation(source, asVar=True, lrad=False, name='netrad_bb')
              elif varname == 'netrad_bb0': var = newvars.computeNetRadiation(source, asVar=True, lrad=False, lA=False, name='netrad_bb0')
              elif varname == 'vapdef': var = newvars.computeVaporDeficit(source)
              elif varname in ('pet','pet_pm','petrad','petwnd') and 'pet' not in sink:
                  if 'petrad' in exp_list or 'petwnd' in exp_list:
                      variables = newvars.computePotEvapPM(source, lterms=True) # default; returns multiple PET terms
                  else: var = newvars.computePotEvapPM(source, lterms=False) # returns only PET
              elif varname == 'pet_th': var = None # skip for now
                  #var = computePotEvapTh(source) # simplified formula (less prerequisites)
          # ... otherwise load from source file
          if var is None and variables is None and varname in source:
              var = source[varname].load() # load data (may not have to load all)
          #else: raise VariableError, "Unsupported Variable '{:s}'.".format(varname)
          # for now, skip variables that are None
          if var or variables:
              # handle lists as well
              if var and variables: raise VariableError(var, variables)
              elif var: variables = (var,)
              for var in variables:
                  addGDALtoVar(var=var, griddef=sink.griddef)
                  if not var.gdal and isinstance(fileFormat,ASCII_raster):
                      raise GDALError("Exporting to ASCII_raster format requires GDAL-enabled variables.")
                  # add to new dataset
                  sink += var
      # convert units
      if lm3:
          for var in sink:
              if var.units == 'kg/m^2/s':
                  var /= 1000. # divide by water density (1000 kg/m^3) to get m^3/m^2/s; see the check after this function
                  var.units = 'm^3/m^2/s' # update units
      
      # compute seasonal mean if we are in mean-mode
      if mode[-5:] == '-mean': 
          sink = sink.seasonalMean(season=mode[:-5], lclim=True)
          # N.B.: to remain consistent with other output modes, 
          #       we need to prevent renaming of the time axis
          sink = concatDatasets([sink,sink], axis='time', lensembleAxis=True)
          sink.squeeze() # the year-axis was needed until now to distinguish constant fields; remove it here
      
      # print dataset
      if not lparallel and ldebug:
          logger.info('\n'+str(sink)+'\n')
        
      # export new dataset to selected format
      fileFormat.exportDataset(sink)
        
      # write results to file
      writemsg =  "\n{:s}   >>>   Export of Dataset '{:s}' to Format '{:s}' complete.".format(pidstr,expname, expformat)
      writemsg += "\n{:s}   >>>   ('{:s}')\n".format(pidstr,expfolder)
      logger.info(writemsg)      
         
      # clean up and return
      source.unload(); #del source
      return 0 # "exit code"
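
The lm3 conversion above divides a mass flux by the density of liquid water (about 1000 kg/m^3), so kg/m^2/s becomes m^3/m^2/s, which is equivalent to m/s. A minimal self-contained check (the flux value is made up):

# sanity check of the kg/m^2/s -> m^3/m^2/s conversion used in performExport
RHO_WATER = 1000.                    # kg/m^3, density of liquid water assumed by the conversion
mass_flux = 2.5e-5                   # kg/m^2/s, a made-up precipitation rate
volume_flux = mass_flux / RHO_WATER  # m^3/m^2/s, i.e. m/s
assert abs(volume_flux - 2.5e-8) < 1e-20
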
Example #20
0
 def __init__(self, inner_list=None, outer_list=None, **kwargs):
     ''' initialize an ensemble of HGS simulations based on HGS arguments and project descriptors;
     all keyword arguments are automatically expanded based on inner/outer product rules, defined
     using the inner_list/outer_list arguments; the expanded argument lists are used to initialize
     the individual ensemble members; note that a string substitution is applied to all folder 
     variables (incl. 'rundir') prior to constructing the HGS instance, i.e. rundir.format(**kwargs) '''
     self.lreport = kwargs.get('lreport', self.lreport)
     self.loverwrite = kwargs.get('loverwrite', self.loverwrite)
     self.lindicator = kwargs.get('lindicator', self.lindicator)
     self.lrunfailed = kwargs.get('lrunfailed', self.lrunfailed)
     self.lrestart = kwargs.get('lrestart', self.lrestart)
     # expand argument list (plain, nothing special)
     kwargs_list = expandArgumentList(inner_list=inner_list,
                                      outer_list=outer_list,
                                      **kwargs)
     # loop over ensemble members
     self.members = []
     self.rundirs = []
     self.hgsargs = []  # ensemble lists
     for kwargs in kwargs_list:
         # isolate folder variables and perform variable substitution
         for folder_type in ('rundir', 'template_folder', 'input_folder',
                             'pet_folder', 'precip_inc', 'pet_inc',
                             'ic_files'):
             if folder_type in kwargs:
                 folder = kwargs[folder_type]
                 if isinstance(folder, str):
                     # perform keyword substitution with all available arguments
                     if folder_type == 'ic_files':
                         # we need to preserve '{FILETYPE}' for later
                         kwargs[folder_type] = folder.format(
                             FILETYPE='{FILETYPE}', **kwargs)
                     else:
                         kwargs[folder_type] = folder.format(**kwargs)
                 elif folder is None:
                     pass
                 else:
                     raise TypeError(folder)
         # check rundir
         rundir = kwargs['rundir']
         kwargs['restart'] = False  # this keyword argument should be controlled by the Ensemble handler
         if rundir in self.rundirs:
             raise ArgumentError("Multiple occurrences of run directory:\n '{}'".format(rundir))
         # figure out skipping
         if os.path.exists(rundir):
             if self.loverwrite:
                 if self.lreport:
                     print("Overwriting existing experiment folder '{:s}'.".format(rundir))
                 lskip = False
             elif self.lindicator and os.path.exists('{}/SCHEDULED'.format(rundir)):
                 if self.lreport:
                     print("Skipping experiment folder '{:s}' (scheduled).".format(rundir))
                 lskip = True
             elif self.lindicator and os.path.exists('{}/IN_PROGRESS'.format(rundir)):
                 if self.lrestart:
                     shutil.move(os.path.join(rundir, 'IN_PROGRESS'),
                                 os.path.join(rundir, 'RESTARTED'))
                     if self.lreport:
                         print("Restarting experiment in folder '{:s}' (was in progress).".format(rundir))
                     lskip = False
                     kwargs['restart'] = True
                 else:
                     if self.lreport:
                         print("Skipping experiment folder '{:s}' (in progress).".format(rundir))
                     lskip = True
             elif self.lindicator and os.path.exists('{}/COMPLETED'.format(rundir)):
                 if self.lreport:
                     print("Skipping experiment folder '{:s}' (completed).".format(rundir))
                 lskip = True
             elif self.lindicator and os.path.exists('{}/FAILED'.format(rundir)):
                 # this should be the last option, so as to prevent overwriting data
                 if self.lrunfailed:
                     if self.lreport:
                         print("Overwriting failed experiment folder '{:s}'.".format(rundir))
                     lskip = False  # rundir will be deleted
                 else:
                     if self.lreport:
                         print("Skipping experiment folder '{:s}' (failed).".format(rundir))
                     lskip = True
             else:  # no/unknown indicator file
                 if self.lreport:
                     print("Overwriting existing experiment folder '{:s}'.".format(rundir))
                 lskip = False  # rundir will be deleted
         else:
             if self.lreport:
                 print("Creating new experiment folder '{:s}'.".format(rundir))
             lskip = False
         if not lskip:
             self.rundirs.append(rundir)
             # isolate HGS constructor arguments
             hgsargs = inspect.getargspec(HGS.__init__).args  # returns args, varargs, kwargs, defaults
             hgsargs = {arg: kwargs[arg] for arg in hgsargs if arg in kwargs}
             self.hgsargs.append(hgsargs)
             # initialize HGS instance
             hgs = HGS(**hgsargs)
             self.members.append(hgs)
     # final check
     if len(self.members) == 0:
         raise EnsembleError("No experiments to run (empty list).")
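
A minimal sketch of the folder-string substitution described in the docstring above, including the trick used to preserve the '{FILETYPE}' placeholder for a later pass (all names and values are made up):

# hypothetical member kwargs; only the substitution mechanics are illustrated here
member_kwargs = dict(problem='grw', scenario='historical')
rundir = '/scratch/{problem:s}/{scenario:s}/'.format(**member_kwargs)
# -> '/scratch/grw/historical/'
ic_files = '{problem:s}_{FILETYPE}.dat'.format(FILETYPE='{FILETYPE}', **member_kwargs)
# -> 'grw_{FILETYPE}.dat' ('{FILETYPE}' survives this pass and can be substituted later)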