def __call__(self, lparallel=False, NP=None, inner_list=None, outer_list=None, callback=None, **kwargs):
    ''' This method is called instead of a class or instance method; it applies the arguments 'kwargs' to
        each ensemble member; it also supports argument expansion with inner and outer product (prior to
        application to the ensemble) and parallelization using multiprocessing. '''
    # expand kwargs to ensemble list
    kwargs_list = expandArgumentList(inner_list=inner_list, outer_list=outer_list, **kwargs)
    if len(kwargs_list) == 1:
        kwargs_list = kwargs_list * len(self.klass.members)
    elif len(kwargs_list) != len(self.klass.members):
        raise ArgumentError('Length of expanded argument list does not match ensemble size! {} ~= {}'.format(
                            len(kwargs_list), len(self.klass.members)))
    # loop over ensemble members and execute function
    if lparallel:
        # parallelize method execution using multiprocessing
        pool = multiprocessing.Pool(processes=NP) # initialize worker pool
        if callback is not None and not callable(callback): raise TypeError(callback)
        # N.B.: the callback function is passed a result from the apply_method function,
        #       which returns a tuple of the form (member, exit_code)
        # define work loads (function and its arguments) and start tasks
        results = [pool.apply_async(apply_method, (member, self.attr), kwargs, callback=callback)
                   for member, kwargs in zip(self.klass.members, kwargs_list)]
        # N.B.: Beware Pickling!!!
        pool.close()
        pool.join() # wait to finish
        # retrieve and assemble results
        results = [result.get() for result in results]
        # divide members and results (apply_method returns both, in case members were modified)
        self.klass.members = [result[0] for result in results]
        results = [result[1] for result in results]
    else:
        # get instance methods
        methods = [getattr(member, self.attr) for member in self.klass.members]
        # just apply sequentially
        results = [method(**kwargs) for method, kwargs in zip(methods, kwargs_list)]
    if len(results) != len(self.klass.members):
        raise ArgumentError('Length of results list does not match ensemble size! {} ~= {}'.format(
                            len(results), len(self.klass.members)))
    return tuple(results)
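
# The wrapper above intercepts a method call on an ensemble object and dispatches it to every member,
# expanding list-valued keyword arguments via expandArgumentList and optionally running the calls in a
# multiprocessing pool. A hypothetical usage sketch follows; the ensemble class 'EnsembleHGS' and the
# member method 'setupRundir' are placeholders (not defined here), so the sketch is shown commented out:
#
#   ens = EnsembleHGS(outer_list=['PARAM'], PARAM=[1, 2, 3], rundir='{ROOT:s}/exp_{PARAM:d}')
#   # dispatch 'setupRundir' to all three members in parallel, one 'mode' value per member
#   exit_codes = ens.setupRundir(lparallel=True, NP=3, inner_list=['mode'],
#                                mode=['steady-state', 'transient', 'transient'])
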
def timeAxis(start_date=None, end_date=None, sampling=None, date_range=None, time_axis=None,
             llastIncl=True, ntime=None, varatts=None):
    ''' figure out type and dimensions of time axis '''
    # check time input
    if date_range: start_date, end_date, sampling = date_range
    if start_date and end_date and sampling:
        start_year, start_month, start_day = convertDate(start_date)
        start_datetime = np.datetime64(dt.datetime(year=start_year, month=start_month, day=start_day), sampling)
        end_year, end_month, end_day = convertDate(end_date)
        end_datetime = np.datetime64(dt.datetime(year=end_year, month=end_month, day=end_day), sampling)
        if llastIncl: end_datetime += np.timedelta64(1, sampling)
        date_range = np.arange(start_datetime, end_datetime, dtype='datetime64[{}]'.format(sampling))
        assert date_range[0] == start_datetime, date_range[0]
        if ntime:
            if ntime > len(date_range): raise ArgumentError(date_range)
            else: date_range = date_range[0:ntime] # trim
        else:
            ntime = len(date_range)
    elif time_axis == 'datetime':
        raise ArgumentError('Insufficient time axis information!')
    # construct time axis
    atts = varatts['time']
    if time_axis.lower() == 'simple':
        time = Axis(atts=atts, coord=np.arange(1, ntime + 1))
    elif time_axis.lower() == 'datetime':
        if sampling.lower() in ('y', '1y'): units = 'year'
        elif sampling.lower() in ('m', '1m'): units = 'month'
        elif sampling.lower() in ('d', '1d'): units = 'day'
        elif sampling.lower() in ('h', '1h'): units = 'hour'
        else: units = sampling
        long_name = '{}s since {}'.format(units.title(), str(date_range[0])) # hope this makes sense...
        atts.update(long_name=long_name, units=units)
        time = Axis(atts=atts, coord=date_range)
    else:
        raise ArgumentError(time_axis)
    # return time axis
    return time
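
# A minimal usage sketch for the time axis constructor above. The (year, month, day) tuple format for
# start/end dates is an assumption about what convertDate accepts, and 'varatts' is a hypothetical
# attribute dictionary of which only the 'time' entry is used here.
varatts = dict(time=dict(name='time', units='month', long_name='Time'))
# monthly 'datetime' axis from 1979-01 through 1988-12 (llastIncl=True includes the end month)
time_ax = timeAxis(start_date=(1979, 1, 1), end_date=(1988, 12, 1), sampling='M',
                   time_axis='datetime', llastIncl=True, varatts=varatts)
print(len(time_ax)) # 120 monthly steps; units become 'month' and long_name 'Months since 1979-01'
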
def detrend(var, ax=None, lcopy=True, ldetrend=True, ltrend=False, degree=1, rcond=None, w=None,
            lsmooth=False, lresidual=False, window_len=11, window='hanning'):
    ''' subtract a linear trend from a time-series array (in-place only if lcopy=False) '''
    # check input
    if not isinstance(var, np.ndarray): raise NotImplementedError # too many checks
    if lcopy: var = var.copy() # make copy - not in-place!
    # fit over entire array (usually not what we want...)
    if ax is None and ldetrend: ax = np.arange(var.size) # make dummy axis, if necessary
    if var.ndim != 1:
        shape = var.shape
        var = var.ravel() # flatten array, if necessary
    else:
        shape = None
    # apply optional detrending
    if ldetrend or ltrend:
        # fit linear trend
        trend = np.polyfit(ax, var, deg=degree, rcond=rcond, w=w, full=False, cov=False)
        # evaluate and subtract linear trend
        if ldetrend and ltrend:
            raise ArgumentError("Can either return trend/polyfit or residuals, not both.")
        elif ldetrend and not ltrend:
            var -= np.polyval(trend, ax) # subtract fitted trend -> residuals
        elif ltrend and not ldetrend:
            var = np.polyval(trend, ax) # return the fitted trend itself
    # apply optional smoothing
    if lsmooth and lresidual:
        raise ArgumentError("Can either return smoothed array or residuals, not both.")
    elif lsmooth:
        var = smooth(var, window_len=window_len, window=window)
    elif lresidual:
        var -= smooth(var, window_len=window_len, window=window)
    # return detrended and/or smoothed time-series
    if shape is not None: var = var.reshape(shape)
    return var
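
# A minimal usage sketch for detrend (self-contained apart from the function itself; the module's
# ArgumentError and smooth helpers are only needed on error/smoothing paths, which are not exercised here).
import numpy as np
rng = np.random.default_rng(0)
t = np.arange(240, dtype=np.float64) # e.g. 20 years of monthly steps
series = 0.05 * t + rng.normal(scale=0.5, size=t.size) # linear trend plus noise
residuals = detrend(series, ax=t) # lcopy=True by default, so 'series' is left untouched
fitted = detrend(series, ax=t, ldetrend=False, ltrend=True) # return the fitted trend instead
print(abs(np.polyfit(t, residuals, 1)[0]) < 1e-3) # residual slope should be ~0 -> True
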
def getTimeseriesData(self, units='kg/s', lcheck=True, lexpand=True, lfill=True, period=None, lflatten=True): ''' extract time series data and time coordinates from a WSC monthly CSV file ''' if self.monthly_file: # use numpy's CSV functionality # get timeseries data data = np.genfromtxt(self.monthly_file, dtype=np.float32, delimiter=',', skip_header=1, filling_values=np.nan, usecols=np.arange(4,28,2), usemask=True, loose=True, invalid_raise=True) assert data.shape[1] == 12, data.shape # for some reason every value is followed by an extra comma... #data = np.ma.masked_less(data, 10) # remove some invalid values # N.B.: some values appear unrealistically small, however, these are removed in the check- # section below (it appears they consistently fail the ckeck test) if units.lower() == 'kg/s': data *= 1000. # m^3 == 1000 kg (water) elif units.lower() == 'm^3/s': pass # original units else: raise ArgumentError("Unknown units: {}".format(units)) # get time coordinates and verification flag check = np.genfromtxt(self.monthly_file, dtype=np.int, delimiter=',', skip_header=1, filling_values=-9999, usecols=np.arange(1,4,1), usemask=True, loose=True, invalid_raise=True) assert check.shape[0] == data.shape[0], check.shape assert np.all(check >= 0), np.sum(check < 0) time = check[:,2].astype(np.int) # this is the year (time coordinate) # determine valid entries if lcheck: check = np.all(check[:,:2]==1, axis=1) # require all entries to be one # N.B.: I'm not sure what it means if values are not equal to one, but the flow values look # unrealistically small (see above); probably different units... data = data[check,:]; time = time[check] assert time.shape[0] == data.shape[0], check.shape # slice off values outside the period of interest if period: valid = np.logical_and(time >= period[0],time < period[1]) time = time[valid]; data = data[valid] # fill in missing time periods/years if lfill: if period: time0 = period[0]; time1 = period[1] else: time0 = time[0]; time1 = time[-1]+1 idx = np.asarray(time - time0, dtype=np.int32); tlen = time1 - time0 # start at 0; length is last value (+1) pad_time = np.arange(time0,time1) # form continuous sequence #assert np.all( pad_time[idx] == time ), idx # potentially expensive time = pad_time # new continuous time coordinate pad_data = np.ma.zeros((tlen,12), dtype=np.float32)*np.NaN # pre-allocate with NaN pad_data.mask = True # mask everywhere for now pad_data[idx,:] = data; #pad_data.mask[idx,:] = data.mask #assert np.all( pad_data.mask[idx,:] == data.mask ) # potentially expensive data = pad_data # now, expand time coordinate by adding month if lexpand: time = time.reshape((time.size,1)) coord = np.repeat((time-1979)*12, 12, axis=1) + np.arange(0,12).reshape((1,12)) assert coord.shape == data.shape, coord.shape #assert np.all( np.diff(coord.flatten()) == 1 ), coord # potentially expensive time = coord if lflatten: time = time.flatten(); data = data.flatten() # return data array and coordinate vector return data, time else: raise IOError("No timeseries file defined or file not found for gage station '{}'.\n(folder: '{}')".format(self.name,self.folder))
def __init__(self, project=None, filetype='aux', folder=None, bc_method=None, **expargs):
    ''' take arguments that have been passed from caller and initialize parameters '''
    if bc_method:
        if not filetype: filetype = bc_method.lower()
        elif filetype != bc_method.lower(): raise ArgumentError(filetype, bc_method)
    self.bc_method = bc_method
    self.filetype = filetype
    self.folder_pattern = folder
    self.export_arguments = expargs
def binedges(bins=None, binedgs=None, limits=None, lcheckVar=True):
    ''' utility function to generate and validate bins and bin edges from either one '''
    # check input
    if bins is None and binedgs is None:
        raise ArgumentError
    elif bins is not None and binedgs is not None:
        if len(bins) + 1 != len(binedgs): raise ArgumentError(len(bins))
    if bins is not None:
        if limits is not None: vmin, vmax = limits
        else: raise ArgumentError(bins)
        # expand bins (values refer to center of bins)
        if isinstance(bins, (int, np.integer)):
            if bins == 1: bins = np.asarray(((vmin + vmax) / 2.,))
            else: bins = np.linspace(vmin, vmax, bins)
        elif isinstance(bins, (tuple, list)) and 0 < len(bins) < 4:
            bins = np.linspace(*bins)
        elif not isinstance(bins, (list, np.ndarray)):
            raise TypeError(bins)
        if len(bins) == 1:
            tmpbinedgs = np.asarray((vmin, vmax))
        else:
            hbd = np.diff(bins) / 2. # make sure this is a float!
            tmpbinedgs = np.hstack((bins[0] - hbd[0], bins[1:] - hbd, bins[-1] + hbd[-1])) # assuming even spacing
        if binedgs is None: binedgs = tmpbinedgs # computed from bins
        elif lcheckVar: assert isEqual(binedgs, np.asarray(tmpbinedgs, dtype=binedgs.dtype))
    if binedgs is not None:
        # expand bin edges (convert sequences to arrays; anything else must already be an array)
        if isinstance(binedgs, (tuple, list)): binedgs = np.asarray(binedgs)
        elif not isinstance(binedgs, np.ndarray): raise TypeError(binedgs)
        tmpbins = binedgs[1:] - (np.diff(binedgs) / 2.) # make sure this is a float!
        if bins is None: bins = tmpbins # compute from binedgs
        elif lcheckVar: assert isEqual(bins, np.asarray(tmpbins, dtype=bins.dtype))
    # return bins and bin edges
    return bins, binedgs
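
# A minimal usage sketch: generate 10 evenly spaced bin centers over a value range and the corresponding
# 11 edges. lcheckVar=False keeps the example independent of the module's isEqual helper, which is only
# used for the optional consistency check.
centers, edges = binedges(bins=10, limits=(0., 100.), lcheckVar=False)
print(len(centers), len(edges)) # 10 centers, 11 edges
print(edges[0], edges[-1]) # edges extend half a bin width beyond the outermost centers
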
def loadMetadata(well, filename='metadata.dbf', wellname='W{WELL_ID:07d}-{WELL_NO:1d}',
                 llistWells=False, folder=None, conservation_authority=None):
    ''' load and parse well metadata (screen depths and elevations) from a DBF database file '''
    if not folder and conservation_authority:
        folder = ca_folder.format(conservation_authority)
    # clean up well name
    well_id, well_no = getWellName(well)
    well = wellname.format(WELL_ID=well_id, WELL_NO=well_no)
    # open database and find the relevant entry
    # from simpledbf import Dbf5 # alternative DBF reader
    from dbfread import DBF
    filepath = filename if folder is None else os.path.join(folder, filename)
    table = DBF(filepath)
    meta = None
    for record in table:
        if llistWells: print(record['PGMN_WELL'])
        if record['PGMN_WELL'] == well:
            meta = record.copy()
    if meta is None: raise ArgumentError(well)
    # parse screen information
    screen_type, screen_depth = meta['SCREEN_HOL'].split(':')
    meta['Screen'] = screen_type.title()
    screen_hilo = []
    lunit = False
    for hilo in screen_depth.split('-'):
        if hilo[-1] == 'M':
            lunit = True
            screen_hilo.append(float(hilo[:-1]))
        else:
            screen_hilo.append(float(hilo))
    if not lunit: raise ValueError(screen_depth)
    assert len(screen_hilo) == 2, screen_hilo
    meta['screen_top'] = screen_hilo[0]
    meta['screen_bottom'] = screen_hilo[1]
    meta['screen_depth'] = (screen_hilo[0] + screen_hilo[1]) / 2.
    meta['zs'] = float(meta['ELVA_GROUN'])
    meta['z'] = meta['zs'] - meta['screen_depth']
    meta['z_t'] = meta['zs'] - meta['screen_top']
    meta['z_b'] = meta['zs'] - meta['screen_bottom']
    # return metadata dictionary (screen depths and elevations in meters)
    return meta
def __init__(self, basin=None, river=None, name=None, folder=None, lcheck=False):
    ''' initialize gage station based on various input data '''
    if name is None: raise ArgumentError()
    if folder is None: folder = '{:s}/Basins/{:s}/'.format(root_folder, basin)
    if not os.path.isdir(folder): raise IOError(folder) # N.B.: the exception has to be raised, not just instantiated
    if river is not None and river not in name: name = '{:s}_{:s}'.format(river, name)
    self.folder = folder # usually basin folder
    self.name = name # or prefix...
    self.basin_name = basin # has to be a long_name in order to construct the folder
    self.meta_file = '{:s}/{:s}'.format(folder, name + self.meta_ext)
    if not os.path.isfile(self.meta_file):
        if lcheck: raise IOError(self.meta_file)
        else: self.meta_file = None # clear if not available
    self.monthly_file = '{:s}/{:s}'.format(folder, name + self.monthly_ext)
    if not os.path.isfile(self.monthly_file):
        if lcheck: raise IOError(self.monthly_file)
        else: self.monthly_file = None # clear if not available
def loadHGS_StnEns(ensemble=None, station=None, varlist=None, varatts=None, name=None, title=None, period=None, run_period=15, folder=None, obs_period=None, ensemble_list=None, ensemble_args=None, observation_list=None, # ensemble and obs lists for project loadHGS_StnTS=loadHGS_StnTS, loadWSC_StnTS=loadWSC_StnTS, # these can also be overloaded prefix=None, WSC_station=None, basin=None, basin_list=None, **kwargs): ''' a wrapper for the regular HGS loader that can also load gage stations and assemble ensembles ''' if observation_list is None: observation_list = ('obs','observations') if ensemble_list is None: ensemble_list = dict() # empty, i.e. no ensembles elif not isinstance(ensemble_list, dict): raise TypeError(ensemble_list) if ensemble is None: raise ArgumentError("Mandatory argument 'ensemble' is not defined!") # decide what to do, based on inputs if ensemble.lower() in observation_list: # translate parameters station = station if WSC_station is None else WSC_station period = period if obs_period is None else obs_period filetype = 'monthly' # load gage station with slightly altered parameters dataset = loadWSC_StnTS(station=station, name=name, title=title, basin=basin, basin_list=basin_list, varlist=varlist, varatts=varatts, period=period, filetype=filetype) elif ensemble.lower() in ensemble_list: if ensemble_args is None: ensemble_args = dict() # loop over list of experiments in ensemble ens = [] for exp in ensemble_list[ensemble]: # load individual HGS simulation ds = loadHGS_StnTS(station=station, varlist=varlist, varatts=varatts, name=name, title=title, period=period, ENSEMBLE=exp, run_period=run_period, folder=folder, prefix=prefix, WSC_station=WSC_station, basin=basin, basin_list=basin_list, **kwargs) ens.append(ds) # construct ensemble by concatenating time-series ensemble_args.setdefault('name',ds.name.replace(exp,ensemble).replace(exp.title(),ensemble.title())) ensemble_args.setdefault('title',ds.title.replace(exp,ensemble).replace(exp.title(),ensemble.title())) # N.B.: the ensemble name is constructed by replacing the experiment name in specific dataset names with the ensemble name ensemble_args.setdefault('axis','time') dataset = concatDatasets(ens, **ensemble_args) else: # load HGS simulation dataset = loadHGS_StnTS(station=station, varlist=varlist, varatts=varatts, name=name, title=title, period=period, ENSEMBLE=ensemble, run_period=run_period, folder=folder, prefix=prefix, WSC_station=WSC_station, basin=basin, basin_list=basin_list, **kwargs) return dataset
def PCA(data, degree=None, lprewhiten=False, lpostwhiten=False, lEOF=False, lfeedback=False):
    ''' A function to perform principal component analysis and return the time-series of the leading EOF's. '''
    data = np.asarray(data)
    if not data.ndim == 2: raise ArgumentError(data.ndim)
    # pre-whiten features
    if lprewhiten:
        data -= data.mean(axis=0, keepdims=True)
        data /= data.std(axis=0, keepdims=True)
    # compute PCA
    R = np.cov(data.transpose()) # covariance matrix
    eig, eof = la.eigh(R) # eigenvalues, eigenvectors (of symmetric matrix)
    ieig = np.argsort(eig,)[::-1] # sort in descending order
    eig = eig[ieig]
    eof = eof[:, ieig]
    eig /= eig.sum() # normalize by total variance
    # truncate EOF's
    if degree is not None:
        eig = eig[:degree]
        eof = eof[:, :degree]
    # generate report/feedback
    if lfeedback:
        string = "Variance explained by {:s} PCA's: {:s}; total variance explained: {:2.0f}%"
        eiglist = ', '.join('{:.0f}%'.format(e * 100.) for e in eig)
        dgrstr = 'all' if degree is None else "{:d} leading".format(degree)
        print(string.format(dgrstr, eiglist, eig.sum() * 100.))
    # project data onto (leading) EOF's
    pca = np.dot(data, eof) # order is reversed, because the data are transposed
    # post-whiten features
    if lpostwhiten:
        pca -= pca.mean(axis=0, keepdims=True)
        pca /= pca.std(axis=0, keepdims=True)
    # return results
    if lEOF: return pca, eig, eof
    else: return pca, eig
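
# A minimal usage sketch (self-contained apart from the PCA function above; the module's 'la' is assumed
# to be numpy.linalg or scipy.linalg, either of which provides eigh).
import numpy as np
rng = np.random.default_rng(1)
t = np.linspace(0, 20 * np.pi, 500)
# three correlated time series: two share a common oscillation, one is mostly noise
data = np.stack([np.sin(t) + 0.1 * rng.normal(size=t.size),
                 0.8 * np.sin(t) + 0.1 * rng.normal(size=t.size),
                 rng.normal(size=t.size)], axis=1) # shape (time, features)
pcs, variance = PCA(data, degree=2, lprewhiten=True, lfeedback=True)
print(pcs.shape, variance) # (500, 2) and the fraction of total variance per retained component
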
def loadHydro(filename='discharge_out.mpiio', folder=None, nreal=None, ntime=None, dtype=np.float64):
    ''' function to load hydrographs/discharge from EnKF output '''
    if not nreal and not ntime:
        raise ArgumentError("Please specify number of realizations 'nreal' or number of time steps 'ntime'.")
    filepath = os.path.join(folder, filename)
    if isinstance(dtype, str): dtype = getattr(np, dtype)
    # load data
    data = np.fromfile(filepath, dtype=dtype)
    # reshape (need to know number of realizations or time steps)
    n = data.size
    if nreal:
        nt = int(n / nreal); nr = nreal
        if ntime and nt != ntime:
            raise ValueError("Given number of time steps is not consistent with file size or data type ({} != {}).".format(ntime, nt))
    elif ntime:
        nt = ntime; nr = int(n / ntime)
    if nt * nr != n:
        raise ValueError("Given number of realizations ({}) and time steps ({}) do not divide number of data points ({}).".format(nr, nt, n))
    data = data.reshape((nt, nr))
    # return timeseries
    return data
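
# A minimal round-trip sketch: write a fake discharge file and read it back with loadHydro
# (the file layout is simply a flat, row-major binary dump, as implied by np.fromfile/reshape above).
import os, tempfile
import numpy as np
tmpdir = tempfile.mkdtemp()
fake = np.arange(240, dtype=np.float64).reshape(24, 10) # 24 time steps, 10 realizations
fake.tofile(os.path.join(tmpdir, 'discharge_out.mpiio')) # flat binary dump
data = loadHydro(folder=tmpdir, nreal=10)
print(data.shape) # (24, 10), i.e. (ntime, nreal)
assert np.allclose(data, fake)
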
def defaultNamedtuple(typename, field_names, defaults=None):
    ''' wrapper for namedtuple that supports defaults; adapted from stackoverflow:
        https://stackoverflow.com/questions/11351032/named-tuple-and-optional-keyword-arguments '''
    T = col.namedtuple(typename, field_names) # make named tuple
    T.__new__.__defaults__ = (None,) * len(T._fields) # set defaults to None
    # add custom defaults
    if defaults is not None:
        # N.B.: in Python 3 the abstract base classes live in collections.abc
        #       (requires 'import collections.abc' alongside 'import collections as col')
        if isinstance(defaults, col.abc.Mapping): prototype = T(**defaults)
        elif isinstance(defaults, col.abc.Iterable): prototype = T(*defaults)
        else: raise ArgumentError(str(defaults))
        T.__new__.__defaults__ = tuple(prototype)
    # N.B.: self-referential defaults (fields defaulting to the value of another field via a reference
    #       prefix) would have to be resolved in the constructor and are not implemented
    # return namedtuple with defaults
    return T
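
# A minimal usage sketch for the defaults-aware namedtuple factory above.
Record = defaultNamedtuple('Record', ('name', 'units', 'scalefactor'),
                           defaults=dict(units='kg/s', scalefactor=1.))
r = Record(name='discharge') # omitted fields fall back to the defaults
print(r) # Record(name='discharge', units='kg/s', scalefactor=1.0)
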
def loadHGS_StnTS(station=None, varlist=None, varatts=None, folder=None, name=None, title=None, start_date=None, end_date=None, run_period=15, period=None, lskipNaN=False, lcheckComplete=True, basin=None, WSC_station=None, basin_list=None, filename=None, prefix=None, scalefactors=None, **kwargs): ''' Get a properly formatted WRF dataset with monthly time-series at station locations; as in the hgsrun module, the capitalized kwargs can be used to construct folders and/or names ''' if folder is None or ( filename is None and station is None ): raise ArgumentError # try to find meta data for gage station from WSC HGS_station = station if basin is not None and basin_list is not None: station_name = station station = getGageStation(basin=basin, station=station if WSC_station is None else WSC_station, basin_list=basin_list) # only works with registered basins if station_name is None: station_name = station.name # backup, in case we don't have a HGS station name metadata = station.getMetaData() # load station meta data if metadata is None: raise GageStationError(name) else: metadata = dict(); station = None; station_name = None # prepare name expansion arguments (all capitalized) expargs = dict(ROOT_FOLDER=root_folder, STATION=HGS_station, NAME=name, TITLE=title, PREFIX=prefix, BASIN=basin, WSC_STATION=WSC_station) for key,value in metadata.items(): if isinstance(value,basestring): expargs['WSC_'+key.upper()] = value # in particular, this includes WSC_ID if 'WSC_ID' in expargs: if expargs['WSC_ID'][0] == '0': expargs['WSC_ID0'] = expargs['WSC_ID'][1:] else: raise DatasetError('Expected leading zero in WSC station ID: {}'.format(expargs['WSC_ID'])) # exparg preset keys will get overwritten if capitalized versions are defined for key,value in kwargs.items(): KEY = key.upper() # we only use capitalized keywords, and non-capitalized keywords are only used/converted if KEY == key or KEY not in kwargs: expargs[KEY] = value # if no capitalized version is defined # read folder and infer prefix, if necessary folder = folder.format(**expargs) if not os.path.exists(folder): raise IOError(folder) if expargs['PREFIX'] is None: with open('{}/{}'.format(folder,prefix_file), 'r') as pfx: expargs['PREFIX'] = prefix = ''.join(pfx.readlines()).strip() # now assemble file name for station timeseries filename = filename.format(**expargs) filepath = '{}/{}'.format(folder,filename) if not os.path.exists(filepath): IOError(filepath) if station_name is None: station_name = filename[filename.index('hydrograph.')+1:-4] if station is None else station # set meta data (and allow keyword expansion of name and title) metadata['problem'] = prefix metadata['station_name'] = metadata.get('long_name', station_name) if name is not None: name = name.format(**expargs) # name expansion with capitalized keyword arguments else: name = 'HGS_{:s}'.format(station_name) metadata['name'] = name; expargs['Name'] = name.title() # name in title format if title is None: title = '{{Name:s}} (HGS, {problem:s})'.format(**metadata) title = title.format(**expargs) # name expansion with capitalized keyword arguments metadata['long_name'] = metadata['title'] = title # now determine start data for date_parser if end_date is None: if start_date and run_period: end_date = start_date + run_period elif period: end_date = period[1] else: raise ArgumentError("Need to specify either 'start_date' & 'run_period' or 'period' to infer 'end_date'.") end_year,end_month,end_day = convertDate(end_date) if start_date is None: if end_date and run_period: start_date = 
end_date - run_period elif period: start_date = period[0] else: raise ArgumentError("Need to specify either 'end_date' & 'run_period' or 'period' to infer 'start_date'.") start_year,start_month,start_day = convertDate(start_date) if start_day != 1 or end_day != 1: raise NotImplementedError('Currently only monthly data is supported.') # import functools # date_parser = functools.partial(date_parser, year=start_year, month=start_month, day=start_day) # # now load data using pandas ascii reader # data_frame = pd.read_table(filepath, sep='\s+', header=2, dtype=np.float64, index_col=['time'], # date_parser=date_parser, names=ascii_varlist) # # resample to monthly data # data_frame = data_frame.resample(resampling).agg(np.mean) # data = data_frame[flowvar].values # parse header if varlist is None: varlist = variable_list[:] # default list with open(filepath, 'r') as f: line = f.readline(); lline = line.lower() # 1st line if not "hydrograph" in lline: raise GageStationError(line,filepath) # parse variables and determine columns line = f.readline(); lline = line.lower() # 2nd line if not "variables" in lline: raise GageStationError(line) variable_order = [v.strip('"').lower() for v in line[line.find('"'):].strip().split(',')] # figure out varlist and data columns if variable_order[0] == 'time': del variable_order[0] # only keep variables else: raise GageStationError(variable_order) variable_order = [hgs_variables[v] for v in variable_order] # replace HGS names with GeoPy names vardict = {v:i+1 for i,v in enumerate(variable_order)} # column mapping; +1 because time was removed variable_order = [v for v in variable_order if v in varlist or flow_to_flux[v] in varlist] usecols = tuple(vardict[v] for v in variable_order) # variable columns that need to loaded (except time, which is col 0) assert 0 not in usecols, usecols # load data as tab separated values data = np.genfromtxt(filepath, dtype=np.float64, delimiter=None, skip_header=3, usecols = (0,)+usecols) assert data.shape[1] == len(usecols)+1, data.shape if lskipNaN: data = data[np.isnan(data).sum(axis=1)==0,:] elif np.any( np.isnan(data) ): raise DataError("Missing values (NaN) encountered in hydrograph file; use 'lskipNaN' to ignore.\n('{:s}')".format(filepath)) time_series = data[:,0]; flow_data = data[:,1:] assert flow_data.shape == (len(time_series),len(usecols)), flow_data.shape # original time deltas in seconds time_diff = time_series.copy(); time_diff[1:] = np.diff(time_series) # time period between time steps assert np.all( time_diff > 0 ), filepath time_diff = time_diff.reshape((len(time_diff),1)) # reshape to make sure broadcasting works # integrate flow over time steps before resampling flow_data[1:,:] -= np.diff(flow_data, axis=0)/2. 
# get average flow between time steps flow_data *= time_diff # integrate flow in time interval by multiplying average flow with time period flow_data = np.cumsum(flow_data, axis=0) # integrate by summing up total flow per time interval # generate regular monthly time steps start_datetime = np.datetime64(dt.datetime(year=start_year, month=start_month, day=start_day), 'M') end_datetime = np.datetime64(dt.datetime(year=end_year, month=end_month, day=end_day), 'M') time_monthly = np.arange(start_datetime, end_datetime+np.timedelta64(1, 'M'), dtype='datetime64[M]') assert time_monthly[0] == start_datetime, time_monthly[0] assert time_monthly[-1] == end_datetime, time_monthly[-1] # convert monthly time series to regular array of seconds since start date time_monthly = ( time_monthly.astype('datetime64[s]') - start_datetime.astype('datetime64[s]') ) / np.timedelta64(1,'s') assert time_monthly[0] == 0, time_monthly[0] # interpolate integrated flow to new time axis #flow_data = np.interp(time_monthly, xp=time_series[:,0], fp=flow_data[:,0],).reshape((len(time_monthly),1)) time_series = np.concatenate(([0],time_series), axis=0) # integrated flow at time zero must be zero... flow_data = np.concatenate(([[0,]*len(usecols)],flow_data), axis=0) # ... this is probably better than interpolation # N.B.: we are adding zeros here so we don't have to extrapolate to the left; on the right we just fill in NaN's if ( time_monthly[-1] - time_series[-1] ) > 3*86400. and lcheckComplete: warn("Data record ends more than 3 days befor end of period: {} days".format((time_monthly[-1]-time_series[-1])/86400.)) elif (time_monthly[-1]-time_series[-1]) > 5*86400.: if lcheckComplete: raise DataError("Data record ends more than 5 days befor end of period: {} days".format((time_monthly[-1]-time_series[-1])/86400.)) else: warn("Data record ends more than 5 days befor end of period: {} days".format((time_monthly[-1]-time_series[-1])/86400.)) flow_interp = si.interp1d(x=time_series, y=flow_data, kind='linear', axis=0, copy=False, bounds_error=False, fill_value=np.NaN, assume_sorted=True) flow_data = flow_interp(time_monthly) # evaluate with call # compute monthly flow rate from interpolated integrated flow flow_data = np.diff(flow_data, axis=0) / np.diff(time_monthly, axis=0).reshape((len(time_monthly)-1,1)) flow_data *= 1000 # convert from m^3/s to kg/s # construct time axis start_time = 12*(start_year - 1979) + start_month -1 end_time = 12*(end_year - 1979) + end_month -1 time = Axis(name='time', units='month', atts=dict(long_name='Month since 1979-01'), coord=np.arange(start_time, end_time)) # not including the last, e.g. 
1979-01 to 1980-01 is 12 month assert len(time_monthly) == end_time-start_time+1 assert flow_data.shape == (len(time),len(variable_order)), (flow_data.shape,len(time),len(variable_order)) # construct dataset dataset = Dataset(atts=metadata) dataset.station = station # add gage station object, if available (else None) for i,flowvar in enumerate(variable_order): data = flow_data[:,i] fluxvar = flow_to_flux[flowvar] if flowvar in varlist: flowatts = variable_attributes[flowvar] # convert variables and put into dataset (monthly time series) if flowatts['units'] != 'kg/s': raise VariableError("Hydrograph data is read as kg/s; flow variable does not match.\n{}".format(flowatts)) dataset += Variable(data=data, axes=(time,), **flowatts) if fluxvar in varlist and 'shp_area' in metadata: # compute surface flux variable based on drainage area fluxatts = variable_attributes[fluxvar] if fluxatts['units'] == 'kg/s' and fluxatts['units'] != 'kg/m^2/s': raise VariableError(fluxatts) data = data / metadata['shp_area'] # need to make a copy dataset += Variable(data=data, axes=(time,), **fluxatts) # apply analysis period if period is not None: dataset = dataset(years=period) # adjust scalefactors, if necessary if scalefactors: if isinstance(scalefactors,dict): dataset = updateScalefactor(dataset, varlist=scalefactors, scalefactor=None) elif isNumber(scalefactors): scalelist = ('discharge','seepage','flow') dataset = updateScalefactor(dataset, varlist=scalelist, scalefactor=scalefactors) else: raise TypeError(scalefactors) # return completed dataset return dataset
def loadGageStation(basin=None, station=None, varlist=None, varatts=None, mode='climatology', aggregation=None, filetype='monthly', folder=None, name=None, period=None, basin_list=None, lcheck=True, lexpand=True, lfill=True, lflatten=True, lkgs=True, scalefactors=None, title=None): ''' function to load hydrograph climatologies and timeseries for a given basin ''' ## resolve input if mode == 'timeseries' and aggregation: raise ArgumentError('Timeseries does not support aggregation.') # get GageStation instance station = getGageStation(basin=basin, station=station, name=name, folder=folder, river=None, basin_list=basin_list, lcheck=True) # variable attributes if varlist is None: varlist = variable_list elif not isinstance(varlist,(list,tuple)): raise TypeError varlist = list(varlist) # make copy of varlist to avoid interference if varatts is None: if aggregation is None: varatts = variable_attributes_kgs if lkgs else variable_attributes_mms else: varatts = agg_varatts_kgs if lkgs else agg_varatts_mms elif not isinstance(varatts,dict): raise TypeError ## read csv data # time series data and time coordinates lexpand = True; lfill = True if mode == 'climatology': lexpand = False; lfill = False; lflatten = False data, time = station.getTimeseriesData(units='kg/s' if lkgs else 'm^3/s', lcheck=True, lexpand=lexpand, lfill=lfill, period=period, lflatten=lflatten) # station meta data metadata = station.getMetaData(lcheck=True) den = metadata['shp_area'] if lkgs else ( metadata['shp_area'] / 1000. ) ## create dataset for station dataset = Dataset(name='WSC', title=title or metadata['Station Name'], varlist=[], atts=metadata,) if mode.lower() in ('timeseries','time-series'): time = time.flatten(); data = data.flatten() # just to make sure... # make time axis based on time coordinate from csv file timeAxis = Axis(name='time', units='month', coord=time, # time series centered at 1979-01 atts=dict(long_name='Month since 1979-01')) dataset += timeAxis # load mean discharge dataset += Variable(axes=[timeAxis], data=data, atts=varatts['discharge']) # load mean runoff doa = data / den dataset += Variable(axes=[timeAxis], data=doa, atts=varatts['runoff']) elif mode == 'climatology': # N.B.: this is primarily for backwards compatibility; it should not be used anymore... # make common time axis for climatology te = 12 # length of time axis: 12 month climAxis = Axis(name='time', units='month', length=12, coord=np.arange(1,te+1,1)) # monthly climatology dataset.addAxis(climAxis, copy=False) # extract variables (min/max/mean are separate variables) # N.B.: this is mainly for backwards compatibility doa = data / den if aggregation is None or aggregation.lower() == 'mean': # load mean discharge tmpdata = nf.nanmean(data, axis=0) tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['discharge']) dataset.addVariable(tmpvar, copy=False) # load mean runoff tmpdata = nf.nanmean(doa, axis=0) tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['runoff']) dataset.addVariable(tmpvar, copy=False) if aggregation is None or aggregation.lower() == 'std': # load discharge standard deviation tmpdata = nf.nanstd(data, axis=0, ddof=1) # very few values means large uncertainty! 
tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['discstd']) dataset.addVariable(tmpvar, copy=False) # load runoff standard deviation tmpdata = nf.nanstd(doa, axis=0, ddof=1) tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['roff_std']) dataset.addVariable(tmpvar, copy=False) if aggregation is None or aggregation.lower() == 'sem': # load discharge standard deviation tmpdata = nf.nansem(data, axis=0, ddof=1) # very few values means large uncertainty! tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['discsem']) dataset.addVariable(tmpvar, copy=False) # load runoff standard deviation tmpdata = nf.nansem(doa, axis=0, ddof=1) tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['roff_sem']) dataset.addVariable(tmpvar, copy=False) if aggregation is None or aggregation.lower() == 'max': # load maximum discharge tmpdata = nf.nanmax(data, axis=0) tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['discmax']) dataset.addVariable(tmpvar, copy=False) # load maximum runoff tmpdata = nf.nanmax(doa, axis=0) tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['roff_max']) dataset.addVariable(tmpvar, copy=False) if aggregation is None or aggregation.lower() == 'min': # load minimum discharge tmpdata = nf.nanmin(data, axis=0) tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['discmin']) dataset.addVariable(tmpvar, copy=False) # load minimum runoff tmpdata = nf.nanmin(doa, axis=0) tmpvar = Variable(axes=[climAxis], data=tmpdata, atts=varatts['roff_min']) dataset.addVariable(tmpvar, copy=False) else: raise NotImplementedError, "Time axis mode '{}' is not supported.".format(mode) # adjust scalefactors, if necessary if scalefactors: if isinstance(scalefactors,dict): dataset = updateScalefactor(dataset, varlist=scalefactors, scalefactor=None) elif isNumber(scalefactors): scalelist = ('discharge','StdDisc','SEMDisc','MaxDisc','MinDisc',) dataset = updateScalefactor(dataset, varlist=scalelist, scalefactor=scalefactors) else: raise TypeError(scalefactors) # return station dataset return dataset
def rasterVariable(name=None, units=None, axes=None, atts=None, plot=None, dtype=None, projection=None,
                   griddef=None, file_pattern=None, lgzip=None, lgdal=True, lmask=True, fillValue=None,
                   lskipMissing=True, path_params=None, offset=0, scalefactor=1, transform=None,
                   time_axis=None, lfeedback=False, **kwargs):
    ''' function to read multi-dimensional raster data and construct a GDAL-enabled Variable object '''
    # print status
    if lfeedback: print("Loading variable '{}': ".format(name), end='') # no newline
    ## figure out axes arguments and load data
    # figure out axes (list/tuple of axes has to be ordered correctly!)
    axes_list = [ax.name for ax in axes[:-2]]
    # N.B.: the last two axes are the two horizontal map axes (x&y); they can be None and will be inferred from the raster
    # N.B.: coordinate values can be overridden with keyword arguments, but the length must be consistent
    # figure out coordinates for axes
    for ax in axes[:-2]:
        if ax.name in kwargs:
            # just make sure the dimensions match, but use keyword argument
            if not len(kwargs[ax.name]) == len(ax):
                raise AxisError("Length of Variable axis and raster file dimension have to be equal.")
        else:
            # use Axis coordinates and add to kwargs for readRasterArray call
            kwargs[ax.name] = tuple(ax.coord)
    # load raster data
    if lfeedback: print("'{}'".format(file_pattern))
    data, geotransform = readRasterArray(file_pattern, lgzip=lgzip, lgdal=lgdal, dtype=dtype, lmask=lmask,
                                         fillValue=fillValue, lgeotransform=True, axes=axes_list, lna=False,
                                         lskipMissing=lskipMissing, path_params=path_params,
                                         lfeedback=lfeedback, **kwargs)
    # shift and rescale
    if offset != 0: data += offset
    if scalefactor != 1: data *= scalefactor
    ## create Variable object and add GDAL
    # check map axes and generate if necessary
    xlon, ylat = getAxes(geotransform, xlen=data.shape[-1], ylen=data.shape[-2],
                         projected=griddef.isProjected if griddef else bool(projection))
    axes = list(axes)
    if axes[-1] is None: axes[-1] = xlon
    elif len(axes[-1]) != len(xlon): raise AxisError(axes[-1])
    if axes[-2] is None: axes[-2] = ylat
    elif len(axes[-2]) != len(ylat): raise AxisError(axes[-2])
    # create regular Variable with data in memory
    var = Variable(name=name, units=units, axes=axes, data=data, dtype=dtype, mask=None,
                   fillValue=fillValue, atts=atts, plot=plot)
    # apply transform (if any), now that we have axes etc.
    if transform is not None: var = transform(var=var, time_axis=time_axis)
    # add GDAL functionality
    if griddef is not None:
        # perform some consistency checks ...
        if projection is None:
            projection = griddef.projection
        elif projection != griddef.projection:
            raise ArgumentError("Conflicting projection and GridDef!\n {} != {}".format(projection, griddef.projection))
        if not np.isclose(geotransform, griddef.geotransform).all():
            raise ArgumentError("Conflicting geotransform (from raster) and GridDef!\n {} != {}".format(geotransform, griddef.geotransform))
        # ... and use the provided geotransform (due to issues with numerical precision, this is usually better)
        geotransform = griddef.geotransform # if not passed explicitly, it would be recomputed from the axes
    # add GDAL functionality
    var = addGDALtoVar(var, griddef=griddef, projection=projection, geotransform=geotransform, gridfolder=None)
    # return final, GDAL-enabled variable
    return var
def rasterDataset(name=None, title=None, vardefs=None, axdefs=None, atts=None, projection=None, griddef=None, lgzip=None, lgdal=True, lmask=True, fillValue=None, lskipMissing=True, lgeolocator=True, file_pattern=None, lfeedback=True, **kwargs): ''' function to load a set of variables that are stored in raster format in a systematic directory tree into a Dataset Variables and Axis are defined as follows: vardefs[varname] = dict(name=string, units=string, axes=tuple of strings, atts=dict, plot=dict, dtype=np.dtype, fillValue=value) axdefs[axname] = dict(name=string, units=string, atts=dict, coord=array or list) or None The path to raster files is constructed as variable_pattern+axes_pattern, where axes_pattern is defined through the axes, (as in rasterVarialbe) and variable_pattern takes the special keywords VAR, which is the variable key in vardefs. ''' ## prepare input data and axes if griddef: xlon, ylat = griddef.xlon, griddef.ylat if projection is None: projection = griddef.projection elif projection != griddef.projection: raise ArgumentError("Conflicting projection and GridDef!") geotransform = griddef.geotransform isProjected = griddef.isProjected else: xlon = ylat = geotransform = None isProjected = False if projection is None else True # construct axes dict axes = dict() for axname, axdef in axdefs.items(): assert 'coord' in axdef, axdef assert ('name' in axdef and 'units' in axdef) or 'atts' in axdef, axdef if axdef is None: axes[axname] = None else: ax = Axis(**axdef) axes[ax.name] = ax # check for map Axis if isProjected: if 'x' not in axes: axes['x'] = xlon if 'y' not in axes: axes['y'] = ylat else: if 'lon' not in axes: axes['lon'] = xlon if 'lat' not in axes: axes['lat'] = ylat ## load raster data into Variable objects varlist = [] for varname, vardef in vardefs.items(): # check definitions assert 'axes' in vardef and 'dtype' in vardef, vardef assert ('name' in vardef and 'units' in vardef) or 'atts' in vardef, vardef # determine relevant axes vardef = vardef.copy() axes_list = [ None if ax is None else axes[ax] for ax in vardef.pop('axes') ] # define path parameters (with varname) path_params = vardef.pop('path_params', None) path_params = dict() if path_params is None else path_params.copy() if 'VAR' not in path_params: path_params['VAR'] = varname # a special key # add kwargs and relevant axis indices relaxes = [ax.name for ax in axes_list if ax is not None] # relevant axes for key, value in kwargs.items(): if key not in axes or key in relaxes: vardef[key] = value # create Variable object var = rasterVariable(projection=projection, griddef=griddef, file_pattern=file_pattern, lgzip=lgzip, lgdal=lgdal, lmask=lmask, lskipMissing=lskipMissing, axes=axes_list, path_params=path_params, lfeedback=lfeedback, **vardef) # vardef components: name, units, atts, plot, dtype, fillValue varlist.append(var) # check that map axes are correct for ax in var.xlon, var.ylat: if axes[ax.name] is None: axes[ax.name] = ax elif axes[ax.name] != ax: raise AxisError("{} axes are incompatible.".format(ax.name)) if griddef is None: griddef = var.griddef elif griddef != var.griddef: raise AxisError("GridDefs are inconsistent.") if geotransform is None: geotransform = var.geotransform elif geotransform != var.geotransform: raise AxisError( "Conflicting geotransform (from Variable) and GridDef!\n {} != {}" .format(var.geotransform, geotransform)) ## create Dataset # create dataset dataset = Dataset(name=name, title=title, varlist=varlist, axes=axes, atts=atts) # add GDAL functionality dataset = 
addGDALtoDataset(dataset, griddef=griddef, projection=projection, geotransform=geotransform, gridfolder=None, lwrap360=None, geolocator=lgeolocator, lforce=False) # N.B.: for some reason we also need to pass the geotransform, otherwise it is recomputed internally and some consistency # checks fail due to machine-precision differences # return GDAL-enabled Dataset return dataset
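
# A minimal configuration sketch for rasterDataset, following the vardefs/axdefs structure described in
# the docstring above. All names, paths, the file pattern, and 'my_griddef' are hypothetical placeholders;
# real use requires an existing raster directory tree and a matching GridDef, and the exact placeholder
# syntax of 'file_pattern' depends on readRasterArray, so the call itself is shown commented out.
import numpy as np
axdefs = dict(time=dict(name='time', units='month', coord=np.arange(1, 13))) # one non-map axis
vardefs = dict(snow=dict(name='snow', units='kg/m^2', dtype=np.float32, fillValue=-9999.,
                         axes=('time', 'y', 'x'), # the last two axes are the horizontal map axes
                         atts=dict(long_name='Snow Water Equivalent')))
# dataset = rasterDataset(name='example', vardefs=vardefs, axdefs=axdefs, griddef=my_griddef,
#                         file_pattern='{VAR:s}/{VAR:s}_{time:02d}.asc')
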
def expandArgumentList(inner_list=None, outer_list=None, expand_list=None, lproduct='outer', **kwargs):
    ''' A function that generates a list of complete argument dict's, based on given kwargs and certain
        expansion rules: kwargs listed in expand_list are expanded and distributed element-wise, either
        as inner ('inner_list') or outer ('outer_list') product, while other kwargs are repeated in every
        argument dict. Arguments can be expanded simultaneously (in parallel) within an outer product by
        specifying them as a tuple within the outer product argument list ('outer_list'). '''
    if not (expand_list or inner_list or outer_list):
        arg_dicts = [kwargs] # return immediately - nothing to do
    else:
        # handle legacy arguments
        if expand_list is not None:
            if inner_list is not None or outer_list is not None:
                raise ArgumentError("Can not mix input modes!")
            if lproduct.lower() == 'inner': inner_list = expand_list
            elif lproduct.lower() == 'outer': outer_list = expand_list
            else: raise ArgumentError(lproduct)
        outer_list = outer_list or []; inner_list = inner_list or []
        # handle outer product expansion first
        if len(outer_list) > 0:
            kwtmp = {key: value for key, value in kwargs.items() if key not in inner_list}
            # detect variables for parallel expansion
            # N.B.: parallel outer expansion is handled by replacing the arguments in each parallel expansion group
            #       with a single (fake) argument that is a tuple of the original argument values; the tuple is then,
            #       after expansion, disassembled into its former constituent arguments
            par_dict = dict()
            for kw in outer_list:
                if isinstance(kw, (tuple, list)):
                    # retrieve parallel expansion group
                    par_args = [kwtmp.pop(name) for name in kw]
                    if not all(len(args) == len(par_args[0]) for args in par_args):
                        raise ArgumentError("Lists for parallel expansion arguments have to be of same length!")
                    # introduce fake argument and save record
                    fake = 'TMP_' + '_'.join(kw) + '_{:d}'.format(len(kw)) # long name that is unlikely to interfere...
                    par_dict[fake] = kw # store record of parallel expansion for reassembly later
                    kwtmp[fake] = list(zip(*par_args)) # transpose lists to get a list of tuples
                elif not isinstance(kw, str):
                    raise TypeError(kw)
            # replace entries in outer list
            if len(par_dict) > 0:
                outer_list = outer_list[:] # copy list
                for fake, names in par_dict.items():
                    if names in outer_list: outer_list[outer_list.index(names)] = fake
            assert all(isinstance(arg, str) for arg in outer_list)
            outer_list, outer_dict = _prepareList(outer_list, kwtmp)
            lstlen = 1
            for el in outer_list: lstlen *= len(outer_dict[el])
            # execute recursive function for outer product expansion
            list_dict = _loop_recursion(outer_list, **outer_dict) # use copy of
            # N.B.: returns a dictionary where all kwargs have been expanded to lists of appropriate length
            assert all(key in outer_dict for key in list_dict.keys())
            assert all(len(list_dict[el]) == lstlen for el in outer_list) # check length
            assert all(len(ld) == lstlen for ld in list_dict.values()) # check length
            # disassemble parallel expansion tuples and reassemble as individual arguments
            if len(par_dict) > 0:
                for fake, names in par_dict.items():
                    assert fake in list_dict
                    par_args = list(zip(*list_dict.pop(fake))) # transpose, to get an expanded tuple for each argument
                    assert len(par_args) == len(names)
                    for name, args in zip(names, par_args): list_dict[name] = args
        # handle inner product expansion last
        if len(inner_list) > 0:
            kwtmp = kwargs.copy()
            if len(outer_list) > 0:
                kwtmp.update(list_dict)
                inner_list = outer_list + inner_list
                # N.B.: this replaces all outer expansion arguments with lists of appropriate length for inner expansion
            inner_list, inner_dict = _prepareList(inner_list, kwtmp)
            # inner product: essentially no expansion
            lst0 = inner_dict[inner_list[0]]; lstlen = len(lst0)
            for el in inner_list: # check length
                if len(inner_dict[el]) == 1:
                    inner_dict[el] = inner_dict[el] * lstlen # broadcast singleton list
                elif len(inner_dict[el]) != lstlen:
                    raise TypeError('Lists have to be of same length to form inner product!')
            list_dict = inner_dict
        ## generate list of argument dicts
        arg_dicts = []
        for n in range(lstlen):
            # assemble arguments
            lstargs = {key: lst[n] for key, lst in list_dict.items()}
            arg_dict = kwargs.copy()
            arg_dict.update(lstargs)
            arg_dicts.append(arg_dict)
    # return list of arguments
    return arg_dicts
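
# A minimal usage sketch of the expansion rules above (it relies on the module's private helpers
# _prepareList and _loop_recursion, which are not shown here; the counts below follow from the
# product/element-wise semantics described in the docstring).
# outer product: every combination of 'a' and 'b'; 'c' is repeated in each argument dict
arg_dicts = expandArgumentList(outer_list=['a', 'b'], a=[1, 2, 3], b=['x', 'y'], c=42)
print(len(arg_dicts)) # 3 x 2 = 6 argument dicts, each with keys 'a', 'b', and 'c'
# inner product: 'a' and 'b' are paired element-wise (lists must have equal length)
arg_dicts = expandArgumentList(inner_list=['a', 'b'], a=[1, 2, 3], b=['x', 'y', 'z'], c=42)
print(len(arg_dicts)) # 3 argument dicts
# parallel outer expansion: 'a' and 'b' vary together as one group, crossed with 'd'
arg_dicts = expandArgumentList(outer_list=[('a', 'b'), 'd'], a=[1, 2], b=['x', 'y'], d=[True, False])
print(len(arg_dicts)) # 2 x 2 = 4 argument dicts
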
def addDistFit(ensemble=None, lfit=True, lflatten=None, lrescale=False, reference=None, target=None,
               lbootstrap=False, nbs=30, sample_axis=None, lglobalScale=False, lcrossval=False, ncv=0.2,
               dist=None, dist_args=None, load_list=None, lproduct='outer', **kwargs):
    ''' add distribution fits to ensemble; optionally also rescale; kwargs are necessary for correct list expansion '''
    # find appropriate sample axis
    if lflatten:
        if sample_axis is not None: raise ArgumentError(sample_axis)
    elif sample_axis is None: # auto-detect
        for saxis in ('time', 'year'):
            if all([all(ens.hasAxis(saxis)) for ens in ensemble]):
                sample_axis = saxis; break
        if sample_axis is None: raise AxisError("No sample axis detected")
    else:
        if isinstance(sample_axis, str):
            if not all([all(ens.hasAxis(sample_axis)) for ens in ensemble]): raise AxisError(sample_axis)
        elif isinstance(sample_axis, (tuple, list)):
            # check that axes are there
            for ax in sample_axis:
                if not all([all(ens.hasAxis(ax)) for ens in ensemble]): raise AxisError(ax)
            # merge axes
            ensemble = [ens.mergeAxes(axes=sample_axis, new_axis='sample', asVar=True, linplace=False,
                                      lcheckAxis=False) for ens in ensemble]
            sample_axis = 'sample'
        else:
            raise AxisError(sample_axis)
    # perform fit or return dummy
    if dist_args is None: dist_args = dict()
    if lfit:
        fitens = [ens.fitDist(lflatten=lflatten, axis=sample_axis, lcrossval=lcrossval, ncv=ncv,
                              lignoreParams=True, lbootstrap=lbootstrap, nbs=nbs, dist=dist,
                              **dist_args) for ens in ensemble]
    else:
        fitens = [None] * len(ensemble)
    # rescale fitted distribution (according to certain rules)
    if lrescale:
        if not reference: raise ArgumentError(str(reference))
        # expand target list
        if isinstance(target, (list, tuple)):
            expand_list = load_list[:]
            if 'names' in expand_list: expand_list[expand_list.index('names')] = 'target'
            kwarg_list = expandArgumentList(target=target, expand_list=expand_list, lproduct=lproduct, **kwargs)
            targets = [kwarg['target'] for kwarg in kwarg_list]
        else:
            targets = [target] * len(fitens)
        if isinstance(reference, (list, tuple)): raise NotImplementedError # don't expand reference list
        # use global reference, if necessary
        if isinstance(reference, str) and not all(reference in fit for fit in fitens):
            i = 0
            while i < len(fitens) and reference not in fitens[i]: i += 1
            if i >= len(fitens): raise ArgumentError("Reference {:s} not found in any dataset!".format(reference))
            reference = fitens[i][reference]
        sclens = [rescaleDistributions(fit, reference=reference, target=tgt, lglobal=lglobalScale)
                  for fit, tgt in zip(fitens, targets)]
    else:
        sclens = [None] * len(ensemble)
    # return results
    return fitens, sclens
def performExport(dataset, mode, dataargs, expargs, bcargs, loverwrite=False, ldebug=False, lparallel=False, pidstr='', logger=None): ''' worker function to export ASCII rasters for a given dataset ''' # input checking if not isinstance(dataset,basestring): raise TypeError if not isinstance(dataargs,dict): raise TypeError # all dataset arguments are kwargs # logging if logger is None: # make new logger logger = logging.getLogger() # new logger logger.addHandler(logging.StreamHandler()) else: if isinstance(logger,basestring): logger = logging.getLogger(name=logger) # connect to existing one elif not isinstance(logger,logging.Logger): raise TypeError, 'Expected logger ID/handle in logger KW; got {}'.format(str(logger)) ## extract meta data from arguments dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs, lone=False) dataset_name = dataargs.dataset_name; periodstr = dataargs.periodstr; domain = dataargs.domain # figure out bias correction parameters if bcargs: bcargs = bcargs.copy() # first copy, then modify... bc_method = bcargs.pop('method',None) if bc_method is None: raise ArgumentError("Need to specify bias-correction method to use bias correction!") bc_obs = bcargs.pop('obs_dataset',None) if bc_obs is None: raise ArgumentError("Need to specify observational dataset to use bias correction!") bc_reference = bcargs.pop('reference',None) if bc_reference is None: # infer from experiment name if dataset_name[-5:] in ('-2050','-2100'): bc_reference = dataset_name[:-5] # cut of period indicator and hope for the best else: bc_reference = dataset_name bc_grid = bcargs.pop('grid',None) if bc_grid is None: bc_grid = dataargs.grid bc_domain = bcargs.pop('domain',None) if bc_domain is None: bc_domain = domain bc_varlist = bcargs.pop('varlist',None) bc_varmap = bcargs.pop('varmap',None) bc_tag = bcargs.pop('tag',None) # an optional name extension/tag bc_pattern = bcargs.pop('file_pattern',None) # usually default in getPickleFile lgzip = bcargs.pop('lgzip',None) # if pickle is gzipped (None: auto-detect based on file name extension) # get name of pickle file (and folder) picklefolder = dataargs.avgfolder.replace(dataset_name,bc_reference) picklefile = getPickleFileName(method=bc_method, obs_name=bc_obs, gridstr=bc_grid, domain=bc_domain, tag=bc_tag, pattern=bc_pattern) picklepath = '{:s}/{:s}'.format(picklefolder,picklefile) if lgzip: picklepath += '.gz' # add extension if not os.path.exists(picklepath): raise IOError(picklepath) elif lgzip is None: lgzip = False if not os.path.exists(picklepath): lgzip = True # assume gzipped file picklepath += '.gz' # try with extension... if not os.path.exists(picklepath): raise IOError(picklepath) elif not os.path.exists(picklepath): raise IOError(picklepath) pickleage = datetime.fromtimestamp(os.path.getmtime(picklepath)) # determine age of pickle file and compare against source age else: bc_method = False pickleage = srcage # parse export options expargs = expargs.copy() # first copy, then modify... 
lm3 = expargs.pop('lm3') # convert kg/m^2/s to m^3/m^2/s (water flux) expformat = expargs.pop('format') # needed to get FileFormat object exp_list= expargs.pop('exp_list') # this handled outside of export compute_list = expargs.pop('compute_list', []) # variables to be (re-)computed - by default all # initialize FileFormat class instance fileFormat = getFileFormat(expformat, bc_method=bc_method, **expargs) # get folder for target dataset and do some checks expname = '{:s}_d{:02d}'.format(dataset_name,domain) if domain else dataset_name expfolder = fileFormat.defineDataset(dataset=dataset, mode=mode, dataargs=dataargs, lwrite=True, ldebug=ldebug) # prepare destination for new dataset lskip = fileFormat.prepareDestination(srcage=max(srcage,pickleage), loverwrite=loverwrite) # depending on last modification time of file or overwrite setting, start computation, or skip if lskip: # print message skipmsg = "\n{:s} >>> Skipping: Format '{:s} for dataset '{:s}' already exists and is newer than source file.".format(pidstr,expformat,dataset_name) skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr,expfolder) logger.info(skipmsg) else: ## actually load datasets source = loadfct() # load source data # check period if 'period' in source.atts and dataargs.periodstr != source.atts.period: # a NetCDF attribute raise DateError, "Specifed period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr,source.atts.period) # load BiasCorrection object from pickle if bc_method: op = gzip.open if lgzip else open with op(picklepath, 'r') as filehandle: BC = pickle.load(filehandle) # assemble logger entry bcmsgstr = "(performing bias-correction using {:s} from {:s} towards {:s})".format(BC.long_name,bc_reference,bc_obs) # print message if mode == 'climatology': opmsgstr = 'Exporting Climatology ({:s}) to {:s} Format'.format(periodstr, expformat) elif mode == 'time-series': opmsgstr = 'Exporting Time-series to {:s} Format'.format(expformat) elif mode[-5:] == '-mean': opmsgstr = 'Exporting {:s}-Mean ({:s}) to {:s} Format'.format(mode[:-5], periodstr, expformat) else: raise NotImplementedError, "Unrecognized Mode: '{:s}'".format(mode) # print feedback to logger logmsg = '\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr,datamsgstr,opmsgstr) if bc_method: logmsg += "{0:s} *** {1:^65s} *** \n".format(pidstr,bcmsgstr) logger.info(logmsg) if not lparallel and ldebug: logger.info('\n'+str(source)+'\n') # create GDAL-enabled target dataset sink = Dataset(axes=(source.xlon,source.ylat), name=expname, title=source.title, atts=source.atts.copy()) addGDALtoDataset(dataset=sink, griddef=source.griddef) assert sink.gdal, sink # apply bias-correction if bc_method: source = BC.correct(source, asNC=False, varlist=bc_varlist, varmap=bc_varmap) # load bias-corrected variables into memory # N.B.: for variables that are not bias-corrected, data are not loaded immediately but on demand; this way # I/O and computing can be further disentangled and not all variables are always needed # compute intermediate variables, if necessary for varname in exp_list: variables = None # variable list var = None # (re-)compute variable, if desired... 
if varname in compute_list: if varname == 'precip': var = newvars.computeTotalPrecip(source) elif varname == 'waterflx': var = newvars.computeWaterFlux(source) elif varname == 'liqwatflx': var = newvars.computeLiquidWaterFlux(source) elif varname == 'netrad': var = newvars.computeNetRadiation(source, asVar=True) elif varname == 'netrad_bb': var = newvars.computeNetRadiation(source, asVar=True, lrad=False, name='netrad_bb') elif varname == 'netrad_bb0': var = newvars.computeNetRadiation(source, asVar=True, lrad=False, lA=False, name='netrad_bb0') elif varname == 'vapdef': var = newvars.computeVaporDeficit(source) elif varname in ('pet','pet_pm','petrad','petwnd') and 'pet' not in sink: if 'petrad' in exp_list or 'petwnd' in exp_list: variables = newvars.computePotEvapPM(source, lterms=True) # default; returns mutliple PET terms else: var = newvars.computePotEvapPM(source, lterms=False) # returns only PET elif varname == 'pet_th': var = None # skip for now #var = computePotEvapTh(source) # simplified formula (less prerequisites) # ... otherwise load from source file if var is None and variables is None and varname in source: var = source[varname].load() # load data (may not have to load all) #else: raise VariableError, "Unsupported Variable '{:s}'.".format(varname) # for now, skip variables that are None if var or variables: # handle lists as well if var and variables: raise VariableError, (var,variables) elif var: variables = (var,) for var in variables: addGDALtoVar(var=var, griddef=sink.griddef) if not var.gdal and isinstance(fileFormat,ASCII_raster): raise GDALError, "Exporting to ASCII_raster format requires GDAL-enabled variables." # add to new dataset sink += var # convert units if lm3: for var in sink: if var.units == 'kg/m^2/s': var /= 1000. # divide to get m^3/m^2/s var.units = 'm^3/m^2/s' # update units # compute seasonal mean if we are in mean-mode if mode[-5:] == '-mean': sink = sink.seasonalMean(season=mode[:-5], lclim=True) # N.B.: to remain consistent with other output modes, # we need to prevent renaming of the time axis sink = concatDatasets([sink,sink], axis='time', lensembleAxis=True) sink.squeeze() # we need the year-axis until now to distinguish constant fields; now remove # print dataset if not lparallel and ldebug: logger.info('\n'+str(sink)+'\n') # export new dataset to selected format fileFormat.exportDataset(sink) # write results to file writemsg = "\n{:s} >>> Export of Dataset '{:s}' to Format '{:s}' complete.".format(pidstr,expname, expformat) writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr,expfolder) logger.info(writemsg) # clean up and return source.unload(); #del source return 0 # "exit code"
def __init__(self, inner_list=None, outer_list=None, **kwargs): ''' initialize an ensemble of HGS simulations based on HGS arguments and project descriptors; all keyword arguments are automatically expanded based on inner/outer product rules, defined using the inner_list/outer_list arguments; the expanded argument lists are used to initialize the individual ensemble members; note that a string substitution is applied to all folder variables (incl. 'rundir') prior to constructing the HGS instance, i.e. rundir.format(**kwargs) ''' self.lreport = kwargs.get('lreport', self.lreport) self.loverwrite = kwargs.get('loverwrite', self.loverwrite) self.lindicator = kwargs.get('lindicator', self.lindicator) self.lrunfailed = kwargs.get('lrunfailed', self.lrunfailed) self.lrestart = kwargs.get('lrestart', self.lrestart) # expand argument list (plain, nothing special) kwargs_list = expandArgumentList(inner_list=inner_list, outer_list=outer_list, **kwargs) # loop over ensemble members self.members = [] self.rundirs = [] self.hgsargs = [] # ensemble lists for kwargs in kwargs_list: # isolate folder variables and perform variable substitution for folder_type in ('rundir', 'template_folder', 'input_folder', 'pet_folder', 'precip_inc', 'pet_inc', 'ic_files'): if folder_type in kwargs: folder = kwargs[folder_type] if isinstance(folder, str): # perform keyword substitution with all available arguments if folder_type is 'ic_files': # we need to preserve '{FILETYPE}' for later kwargs[folder_type] = folder.format( FILETYPE='{FILETYPE}', **kwargs) else: kwargs[folder_type] = folder.format(**kwargs) elif folder is None: pass else: raise TypeError(folder) # check rundir rundir = kwargs['rundir'] kwargs[ 'restart'] = False # this keyword argument should be controlled by the Ensemble handler if rundir in self.rundirs: raise ArgumentError( "Multiple occurence of run directory:\n '{}'".format( rundir)) # figure out skipping if os.path.exists(rundir): if self.loverwrite: if self.lreport: print( ("Overwriting existing experiment folder '{:s}'.". format(rundir))) lskip = False elif self.lindicator and os.path.exists( '{}/SCHEDULED'.format(rundir)): if self.lreport: print( ("Skipping experiment folder '{:s}' (scheduled).". format(rundir))) lskip = True elif self.lindicator and os.path.exists( '{}/IN_PROGRESS'.format(rundir)): if self.lrestart: shutil.move(os.path.join(rundir, 'IN_PROGRESS'), os.path.join(rundir, 'RESTARTED')) if self.lreport: print(( "Restarting experiment in folder '{:s}' (was in progress)." .format(rundir))) lskip = False kwargs['restart'] = True else: if self.lreport: print(( "Skipping experiment folder '{:s}' (in progress)." .format(rundir))) lskip = True elif self.lindicator and os.path.exists( '{}/COMPLETED'.format(rundir)): if self.lreport: print( ("Skipping experiment folder '{:s}' (completed).". format(rundir))) lskip = True elif self.lindicator and os.path.exists( '{}/FAILED'.format(rundir)): # this should be the last option, so as to prevent overwriting data if self.lrunfailed: if self.lreport: print( ("Overwriting failed experiment folder '{:s}'." .format(rundir))) lskip = False # rundir will be deleted else: if self.lreport: print( ("Skipping experiment folder '{:s}' (failed).". format(rundir))) lskip = True else: # no/unknown indicator file if self.lreport: print( ("Overwriting existing experiment folder '{:s}'.". 
format(rundir))) lskip = False # rundir will be deleted else: if self.lreport: print(("Creating new experiment folder '{:s}'.".format( rundir))) lskip = False if not lskip: self.rundirs.append(rundir) # isolate HGS constructor arguments hgsargs = inspect.getargspec( HGS.__init__ ).args # returns args, varargs, kwargs, defaults hgsargs = { arg: kwargs[arg] for arg in hgsargs if arg in kwargs } self.hgsargs.append(hgsargs) # initialize HGS instance hgs = HGS(**hgsargs) self.members.append(hgs) # final check if len(self.members) == 0: raise EnsembleError("No experiments to run (empty list).")