def __call__(self, load_list=None, lproduct='outer', inner_list=None, outer_list=None,
             lensemble=None, ens_name=None, ens_title=None, **kwargs):
    ''' wrap original function: expand argument list, execute load_fct over argument list,
        and return a list or Ensemble of datasets
        load_list/inner_list/outer_list: argument names to expand (passed to expandArgumentList)
        lproduct: how to combine expanded lists ('outer' product by default)
        lensemble: return an Ensemble instead of a plain list; defaults to True iff ens_name is given
        ens_name/ens_title: name/title for the returned Ensemble
        **kwargs: arguments passed through to the wrapped load function '''
    # decide, what to do
    if load_list is None and inner_list is None and outer_list is None:
        # normal operation: no expansion
        datasets = self.load_fct(**kwargs)
    else:
        # expansion required
        lensemble = ens_name is not None if lensemble is None else lensemble
        # figure out arguments
        kwargs_list = expandArgumentList(expand_list=load_list, lproduct=lproduct,
                                        inner_list=inner_list, outer_list=outer_list, **kwargs)
        # load datasets
        # N.B.: use a distinct loop variable so the expanded argument dicts do not
        #       shadow this function's own **kwargs parameter
        datasets = []
        for load_kwargs in kwargs_list:
            # load dataset
            datasets.append(self.load_fct(**load_kwargs))
        # construct ensemble
        if lensemble:
            datasets = Ensemble(members=datasets, name=ens_name, title=ens_title, basetype='Dataset')
    # return list or ensemble of datasets
    return datasets
def loadEnsembleTS(names=None, name=None, title=None, varlist=None, aggregation=None, season=None,
                   prov=None, slices=None, obsslices=None, years=None, reduction=None, shape=None,
                   station=None, constraints=None, filetypes=None, domain=None, ldataset=False,
                   lcheckVar=False, lwrite=False, ltrimT=True, name_tags=None, dataset_mode='time-series',
                   lminmax=False, master=None, lall=True, ensemble_list=None, ensemble_product='inner',
                   lensembleAxis=False, WRF_exps=None, CESM_exps=None, WRF_ens=None, CESM_ens=None, **kwargs):
    ''' a convenience function to load an ensemble of time-series, based on certain criteria; works
        with either stations or regions; seasonal/climatological aggregation is also supported
        names: dataset name(s) to load; expanded via expandArgumentList when ensemble_list is used
        ldataset: if True, load and return a single Dataset instead of an Ensemble
        reduction: dict mapping axis name -> method name (string) or coordinate value (number)
        returns: an Ensemble of Datasets, or a single Dataset if ldataset=True '''
    # prepare ensemble
    if varlist is not None:
        varlist = list(varlist)[:] # copy list
        if station:
            for var in stn_params: # necessary to select stations
                if var not in varlist: varlist.append(var)
        if shape:
            for var in shp_params: # necessary to select shapes
                if var not in varlist: varlist.append(var)
    # perpare ensemble and arguments
    if ldataset and ensemble_list: raise ArgumentError() # a single dataset cannot be expanded
    elif not ldataset: ensemble = Ensemble(name=name, title=title, basetype='Dataset')
    # expand argument list
    if ensemble_list is None: ensemble_list = ['names'] if not ldataset else None
    loadargs = expandArgumentList(names=names, station=station, prov=prov, shape=shape, varlist=varlist,
                                  mode=dataset_mode, filetypes=filetypes, domains=domain, lwrite=lwrite,
                                  slices=slices, obsslices=obsslices, name_tags=name_tags, ltrimT=ltrimT,
                                  years=years, expand_list=ensemble_list, lproduct=ensemble_product,
                                  lensembleAxis=lensembleAxis)
    for loadarg in loadargs:
        # clean up argumetns
        name = loadarg.pop('names',None); name_tag = loadarg.pop('name_tags',None)
        slcs = loadarg.pop('slices',None); obsslcs = loadarg.pop('obsslices',None)
        # load individual dataset
        dataset = loadDataset(name=name, WRF_exps=WRF_exps, CESM_exps=CESM_exps,
                              WRF_ens=WRF_ens, CESM_ens=CESM_ens, **loadarg)
        if name_tag is not None:
            # a leading underscore appends the tag; otherwise the tag replaces the name
            if name_tag[0] == '_': dataset.name += name_tag
            else: dataset.name = name_tag
        # apply slicing
        # (obs datasets are identified by an 'obs'/'Obs' prefix or an all-uppercase name)
        if obsslcs and ( dataset.name[:3].lower() == 'obs' or dataset.name.isupper() ):
            # copy before updating, so the shared 'slices' dict is not mutated across members
            slcs = dict() if slcs is None else slcs.copy()
            slcs.update(**obsslcs) # add special slices for obs
            # N.B.: currently VarNC's can only be sliced once, because we can't combine slices yet
        if slcs: dataset = dataset(lminmax=lminmax, **slcs) # slice immediately
        if not ldataset: ensemble += dataset.load() # load data and add to ensemble
    # if input was not a list, just return dataset
    if ldataset: ensemble = dataset.load() # load data
    # select specific stations (if applicable)
    if not ldataset and station and constraints:
        from datasets.EC import selectStations
        ensemble = selectStations(ensemble, stnaxis='station', master=master, linplace=False,
                                  lall=lall, lcheckVar=lcheckVar, **constraints)
        # make sure all have cluster meta data
        # NOTE(review): this loop is assumed to belong to the station-selection branch, since the
        #               cluster meta data originates from the EC station datasets — confirm nesting
        for varname in stn_params + shp_params:
            # find valid instance
            var = None
            for ds in ensemble:
                if varname in ds: var = ds[varname]; break
            # give to those who have not
            if var is not None:
                var.load() # load data and add as regular variable (not VarNC)
                for ds in ensemble:
                    if varname not in ds: ds.addVariable(var.copy())
    # apply general reduction operations
    if reduction is not None:
        for ax,op in reduction.iteritems(): # Python 2 API
            # a string selects a reduction method (e.g. 'mean'); a number slices at a coordinate
            if isinstance(op, basestring): ensemble = getattr(ensemble,op)(axis=ax)
            elif isinstance(op, (int,np.integer,float,np.inexact)): ensemble = ensemble(**{ax:op})
    # extract seasonal/climatological values/extrema
    if (ldataset and len(ensemble)==0): raise EmptyDatasetError(varlist)
    if not ldataset and any([len(ds)==0 for ds in ensemble]): raise EmptyDatasetError(ensemble)
    # N.B.: the operations below should work with Ensembles as well as Datasets
    if aggregation:
        # method name is composed, e.g. 'climMean' or 'seasonalMean' (all-caps aggregations kept as-is)
        method = aggregation if aggregation.isupper() else aggregation.title()
        if season is None: ensemble = getattr(ensemble,'clim'+method)(taxis='time', **kwargs)
        else: ensemble = getattr(ensemble,'seasonal'+method)(season=season, taxis='time', **kwargs)
    elif season: # but not aggregation
        ensemble = ensemble.seasonalSample(season=season)
    # return dataset
    return ensemble
def loadEnsembleTS(names=None, name=None, title=None, varlist=None, aggregation=None, season=None,
                   prov=None, slices=None, obsslices=None, years=None, reduction=None, shape=None,
                   station=None, constraints=None, filetypes=None, domain=None, ldataset=False,
                   lcheckVar=False, lwrite=False, ltrimT=True, name_tags=None, dataset_mode='time-series',
                   lminmax=False, master=None, lall=True, ensemble_list=None, ensemble_product='inner',
                   lensembleAxis=False, WRF_exps=None, CESM_exps=None, WRF_ens=None, CESM_ens=None, **kwargs):
    ''' a convenience function to load an ensemble of time-series, based on certain criteria; works
        with either stations or regions; seasonal/climatological aggregation is also supported
        names: dataset name(s) to load; expanded via expandArgumentList when ensemble_list is used
        ldataset: if True, load and return a single Dataset instead of an Ensemble
        reduction: dict mapping axis name -> method name (string) or coordinate value (number)
        returns: an Ensemble of Datasets, or a single Dataset if ldataset=True '''
    # prepare ensemble
    if varlist is not None:
        varlist = list(varlist)[:] # copy list
        if station:
            for var in stn_params: # necessary to select stations
                if var not in varlist: varlist.append(var)
        if shape:
            for var in shp_params: # necessary to select shapes
                if var not in varlist: varlist.append(var)
    # perpare ensemble and arguments
    if ldataset and ensemble_list: raise ArgumentError() # a single dataset cannot be expanded
    elif not ldataset: ensemble = Ensemble(name=name, title=title, basetype='Dataset')
    # expand argument list
    if ensemble_list is None: ensemble_list = ['names'] if not ldataset else None
    loadargs = expandArgumentList(names=names, station=station, prov=prov, shape=shape, varlist=varlist,
                                  mode=dataset_mode, filetypes=filetypes, domains=domain, lwrite=lwrite,
                                  slices=slices, obsslices=obsslices, name_tags=name_tags, ltrimT=ltrimT,
                                  years=years, expand_list=ensemble_list, lproduct=ensemble_product,
                                  lensembleAxis=lensembleAxis)
    for loadarg in loadargs:
        # clean up argumetns
        name = loadarg.pop('names',None); name_tag = loadarg.pop('name_tags',None)
        slcs = loadarg.pop('slices',None); obsslcs = loadarg.pop('obsslices',None)
        # load individual dataset
        dataset = loadDataset(name=name, WRF_exps=WRF_exps, CESM_exps=CESM_exps,
                              WRF_ens=WRF_ens, CESM_ens=CESM_ens, **loadarg)
        if name_tag is not None:
            # a leading underscore appends the tag; otherwise the tag replaces the name
            if name_tag[0] == '_': dataset.name += name_tag
            else: dataset.name = name_tag
        # apply slicing
        # (obs datasets are identified by an 'obs'/'Obs' prefix or an all-uppercase name)
        if obsslcs and ( dataset.name[:3].lower() == 'obs' or dataset.name.isupper() ):
            # BUGFIX: copy before updating; previously this aliased/mutated the shared
            #         'slices'/'obsslices' dicts, leaking obs-slices into other ensemble members
            slcs = dict() if slcs is None else slcs.copy()
            slcs.update(**obsslcs) # add special slices for obs
            # N.B.: currently VarNC's can only be sliced once, because we can't combine slices yet
        if slcs: dataset = dataset(lminmax=lminmax, **slcs) # slice immediately
        if not ldataset: ensemble += dataset.load() # load data and add to ensemble
    # if input was not a list, just return dataset
    if ldataset: ensemble = dataset.load() # load data
    # select specific stations (if applicable)
    if not ldataset and station and constraints:
        from datasets.EC import selectStations
        ensemble = selectStations(ensemble, stnaxis='station', master=master, linplace=False,
                                  lall=lall, lcheckVar=lcheckVar, **constraints)
        # make sure all have cluster meta data (cluster info originates from the EC station data)
        for varname in stn_params + shp_params:
            # find valid instance
            var = None
            for ds in ensemble:
                if varname in ds: var = ds[varname]; break
            # give to those who have not
            if var is not None:
                var.load() # load data and add as regular variable (not VarNC)
                for ds in ensemble:
                    if varname not in ds: ds.addVariable(var.copy())
    # apply general reduction operations
    if reduction is not None:
        for ax,op in reduction.iteritems(): # Python 2 API
            # a string selects a reduction method (e.g. 'mean'); a number slices at a coordinate
            if isinstance(op, basestring): ensemble = getattr(ensemble,op)(axis=ax)
            elif isinstance(op, (int,np.integer,float,np.inexact)): ensemble = ensemble(**{ax:op})
    # extract seasonal/climatological values/extrema
    # (raise in call form — the old 'raise X, arg' statement is Python-2-only syntax)
    if (ldataset and len(ensemble)==0): raise EmptyDatasetError(varlist)
    if not ldataset and any([len(ds)==0 for ds in ensemble]): raise EmptyDatasetError(ensemble)
    # N.B.: the operations below should work with Ensembles as well as Datasets
    if aggregation:
        # method name is composed, e.g. 'climMean' or 'seasonalMean' (all-caps aggregations kept as-is)
        method = aggregation if aggregation.isupper() else aggregation.title()
        if season is None: ensemble = getattr(ensemble,'clim'+method)(taxis='time', **kwargs)
        else: ensemble = getattr(ensemble,'seasonal'+method)(season=season, taxis='time', **kwargs)
    elif season: # but not aggregation
        ensemble = ensemble.seasonalSample(season=season)
    # return dataset
    return ensemble
def selectElements(datasets, axis, testFct=None, master=None, linplace=False, lall=False):
    ''' Extract common points that meet a specific criterion from a list of datasets.
        The test function has to accept the following input: index, dataset, axis
        datasets: list/tuple/Ensemble of Datasets sharing the named axis
        master: index or dataset name whose axis defines the candidate coordinates
                (default: the dataset with the shortest axis, for efficiency)
        lall: apply the test to every dataset (mutually exclusive with master)
        returns: list (or Ensemble, mirroring the input) of sliced datasets '''
    # N.B.: all 'raise' statements use call form; the old 'raise X, arg' syntax is Python-2-only
    if linplace: raise NotImplementedError("Option 'linplace' does not work currently.")
    # check input
    if not isinstance(datasets, (list,tuple,Ensemble)): raise TypeError
    if not all(isinstance(dataset,Dataset) for dataset in datasets): raise TypeError
    if not isCallable(testFct) and testFct is not None: raise TypeError
    if isinstance(axis, Axis): axis = axis.name
    if not isinstance(axis, basestring): raise TypeError
    # BUGFIX: the message previously referred to a nonexistent option 'imaster'
    if lall and master is not None: raise ArgumentError("The options 'lall' and 'master' are mutually exclusive!")
    # save some ensemble parameters for later
    lnotest = testFct is None
    lens = isinstance(datasets,Ensemble)
    if lens:
        enskwargs = dict(basetype=datasets.basetype, idkey=datasets.idkey,
                         name=datasets.name, title=datasets.title)
    # use dataset with shortest axis as master sample (more efficient)
    axes = [dataset.getAxis(axis) for dataset in datasets]
    if master is None:
        imaster = np.argmin([len(ax) for ax in axes]) # find shortest axis
    elif isinstance(master,basestring):
        # translate name of dataset into index
        imaster = None
        for i,dataset in enumerate(datasets):
            if dataset.name == master:
                imaster = i; break
        if imaster is None: raise ArgumentError("Master '{:s}' not found in datasets".format(master))
    else:
        imaster = master
        if not imaster is None and not isinstance(imaster,(int,np.integer)): raise TypeError(imaster)
        elif imaster >= len(datasets) or imaster < 0: raise ValueError
    maxis = axes.pop(imaster) # extraxt shortest axis for loop
    if lall:
        # move the master dataset to the front, so indices line up with the index tuples below
        tmpds = tuple(datasets)
        if imaster != 0: tmpds = (tmpds[imaster],)+tmpds[:imaster]+tmpds[imaster+1:]
        test_fct = lambda i,ds: testFct(i, ds, axis) # prepare test function arguments
    else:
        test_fct = lambda i: testFct(i, datasets[imaster], axis)
    # loop over coordinate axis
    itpls = [] # list of valid index tuple
    for i,x in enumerate(maxis.coord):
        # check other axes
        if all([x in ax.coord for ax in axes]): # only the other axes
            # no condition
            if lnotest:
                # just find and add indices
                itpls.append((i,)+tuple(ax.coord.searchsorted(x) for ax in axes))
            # check condition using shortest dataset
            elif lall:
                # check test condition on all datasets (slower)
                tmpidx = (i,)+tuple(ax.coord.searchsorted(x) for ax in axes)
                if all(test_fct(ii,ds) for ii,ds in zip(tmpidx,tmpds)):
                    # add corresponding indices in each dataset to list
                    itpls.append(tmpidx)
            else:
                # check test condition on only one dataset (faster, default)
                if test_fct(i):
                    # add corresponding indices in each dataset to list
                    itpls.append((i,)+tuple(ax.coord.searchsorted(x) for ax in axes))
                # N.B.: since we can expect exact matches, plain searchsorted is fastest (side='left')
    # check if there is anything left...
    if len(itpls) == 0: raise DatasetError("Aborting: no data points match all criteria!")
    # construct axis indices for each dataset (need to remember to move shortest axis back in line)
    idxs = [[] for ds in datasets] # create unique empty lists
    for itpl in itpls:
        for i,idx in enumerate(itpl): idxs[i].append(idx)
    idxs.insert(imaster,idxs.pop(0)) # move first element back in line (where shortest axis was)
    idxs = [np.asarray(idxlst, dtype='int') for idxlst in idxs]
    # slice datasets using only positive results
    datasets = [ds(lidx=True, linplace=linplace, **{axis:idx}) for ds,idx in zip(datasets,idxs)]
    if lens: datasets = Ensemble(*datasets, **enskwargs)
    # return datasets
    return datasets
def generateStatistics(varname, ens, fit, scl=None, reference=None, mode='Ratio', plot_labels=None,
                       nsamples=None, bootstrap_axis='bootstrap', lflatten=False, sample_axis='time',
                       lcrossval=True):
    ''' Perform K-S test and compute ratio of means; return results in formatted string.
        varname: variable to extract from each dataset in 'ens'/'fit'/'scl'
        ens: Ensemble of Datasets with the raw samples
        fit: Ensemble of Datasets with fitted distributions (VarRV); required (else NotImplementedError)
        scl: optional Ensemble with rescaled distributions (adds a second, rescaled table section)
        reference: index, name, or list of names defining the reference member(s) for comparisons
        mode: 'Ratio' (mean ratios) or 'Shift' (mean differences) in the last table column
        returns: a formatted, multi-line table as a single string '''
    # some average diagnosics
    idkey = 'dataset_name' if ens.basetype is Dataset else 'name'
    varlist = Ensemble(*[ds[varname] for ds in ens if ds is not None and varname in ds], idkey=idkey)
    # pad members with fewer dimensions to the axes of the highest-dimensional member
    if not all(varlist[0].ndim==ndim for ndim in varlist.ndim):
        new_axes = varlist[np.argmax(varlist.ndim)].axes
        varlist = varlist.insertAxes(new_axes=new_axes, lcheckAxis=False)
    mvars = varlist.mean()
    # growth rate
    lratio = mode.lower() == 'ratio'
    lshift = mode.lower() == 'shift'
    if plot_labels is None: plot_labels = dict()
    # figure out fillValue
    # NOTE(review): np.NaN is an alias removed in NumPy 2.0 — consider np.nan when upgrading
    if np.issubdtype(varlist[0].dtype, np.floating): fillValue = np.NaN
    elif np.issubdtype(varlist[0].dtype, np.integer): fillValue = 0
    else: raise TypeError(varlist[0].dtype)
    # define reference
    if isinstance(reference,(list,tuple)):
        # a list of references: the first is the initial reference; later names switch the
        # reference as they are encountered in the member loop
        reflist0 = list(reference); reference = reference[0]
    else: reflist0 = [] # dummy list
    if reference is None: iref0 = 0
    elif isinstance(reference,(int,np.integer)): iref0 = reference
    elif isinstance(reference,str): iref0 = varlist.idkeys.index(reference)
    else: raise ArgumentError
    # goodness of fit, reported on plot panels
    if fit:
        fitlist = Ensemble(*[ds[varname] for ds in fit if ds is not None and varname in ds], idkey=idkey)
        # collapse a bootstrap axis, if present (use the first/original sample)
        if any(fitlist.hasAxis(bootstrap_axis)):
            fitlist = fitlist(**{bootstrap_axis:0, 'lcheckAxis':False})
        if not all(fitlist[0].ndim==ndim for ndim in fitlist.ndim):
            new_axes = fitlist[np.argmax(fitlist.ndim)].axes
            fitlist = fitlist.insertAxes(new_axes=new_axes, lcheckAxis=False)
        # for var in fitlist:
        #   print [ax.name for ax in var.axes], var.shape
        # assert np.all(fitlist[0][1,:] == fitlist[0][2,:])
        assert not isinstance(reference,str) or iref0 == fitlist.idkeys.index(reference), reference
        if any([isinstance(dist,VarRV) for dist in fitlist]) or not scl:
            # build the table header
            names = [plot_labels.get(getattr(dist,idkey),getattr(dist,idkey)) for dist in fitlist]
            lnames = max([len(name) for name in names]) # allocate line space
            headline = 'Sample'; lhead = len(headline) # sample/exp header
            headline += ' '*max(lnames-lhead,0) # 'Exp.'+' '*max(lnames-4,0) if lnames < 8 else 'Experiment'
            string = '{:s} Fit {:s}\n'.format(headline,mode.title())
            namestr = '{{:>{:d}s}} {{:s}} '.format(max(lhead,lnames))
            iref = iref0; reflist = reflist0[:] # copy list
            for i,dist,var,name,mvar in zip(range(len(fitlist)),fitlist,varlist,names,mvars):
                if isinstance(dist,VarRV) or not scl:
                    if isinstance(dist,VarRV):
                        # goodness-of-fit p-value of the fitted distribution vs. the sample
                        pval = dist.fittest(var, nsamples=nsamples, asVar=False, lcrossval=lcrossval) #lflatten=lflatten, axis_idx=var.axisIndex(sample_axis, lcheck=False))
                        # print var.name, pval, pval.mean().__class__.__name__, '{:s}'.format(pval.mean())
                        # pval = '{:3.2f}'.format(float(pval.mean())) # mean is only necessary to convert to scalar
                        pval = '{:3.2f}'.format(float(np.median(pval))) # mean is only necessary to convert to scalar
                        # for some reason masked array scalars appear string-type, rather than numbers...
                    else: pval = ' - '
                    if len(reflist) > 0 and name == reflist[0]:
                        # assign new reference
                        iref = i; del reflist[0] # pop element
                    if isinstance(mvar,np.ma.core.MaskedConstant) or isinstance(mvars[iref],np.ma.core.MaskedConstant):
                        string += namestr.format(name,' N/A\n')
                    elif lratio: string += (namestr+'{:3.2f}\n').format(name,pval,(mvar/mvars[iref]).mean())
                    elif lshift: string += (namestr+'{:+2.1f}\n').format(name,pval,(mvar-mvars[iref]).mean())
        else: string = ''
        # NOTE(review): on this branch 'lhead'/'namestr' remain undefined; the 'scl' section below
        #               would raise a NameError if reached with len(varlist) > 1 — confirm intent
    else: raise NotImplementedError
    if scl:
        scllist = Ensemble(*[ds[varname] for ds in scl if ds is not None and varname in ds], idkey=idkey)
        bs_axes = scllist.axisIndex(bootstrap_axis, lcheck=False) # return None, if not present
        if bs_axes is None: bs_axes = [None]*len(scllist)
        scllist = scllist(**{bootstrap_axis:0, 'lcheckAxis':False})
        if not all(scllist[0].ndim==ndim for ndim in scllist.ndim):
            new_axes = scllist[np.argmax(scllist.ndim)].axes
            scllist = scllist.insertAxes(new_axes=new_axes, lcheckAxis=False)
        assert not isinstance(reference,str) or iref0 == scllist.idkeys.index(reference), reference
        if len(scllist) != len(varlist): raise AxisError(scllist)
        # compute means
        mvars = []
        for svr,var in zip(scllist,varlist):
            if isinstance(svr,VarRV):
                mvar = svr.stats(moments='mv', asVar=False)[...,0] # only first moment
            else:
                # plain variable: rescale the sample mean with the stored location factor
                mvar = var.mean()*svr.atts.get('loc_factor',1.)
            mvars.append(mvar)
        # figure out label width and prepare header
        if len(varlist) > 1: # otherwise no comparison...
            names = [plot_labels.get(getattr(dist,idkey),getattr(dist,idkey)) for dist in scllist]
            lnames = max([len(name) for name in names]) # allocate line space
            namestr = '{{:>{:d}s}} {{:s}} '.format(max(lhead,lnames))
            tmphead = 'Fit to {:s}:' if scl == fit else 'Rescaled to {:s}:' # new heading
            tmphead += ' '*(max(lnames-len(names[iref0]),0)+5)+'\n'
            string += tmphead.format(names[iref0])
            # prepare first reference sample for K-S test
            scale,shape = scllist[iref0].atts.get('scale_factor', 1),scllist[iref0].atts.get('shape_factor', 1)
            if not (scale is None or scale == 1) and not (shape is None or shape == 1):
                raise NotImplementedError("Cannot rescale scale/variance and shape parameters of reference sample!")
            refsmpl = varlist[iref0].getArray(unmask=True, fillValue=fillValue) # only once
            loc0 = scllist[iref0].atts.get('loc_factor', 1)
            refsmpl = _rescaleSample(refsmpl, loc0, bs_axis=bs_axes[iref0]) # apply rescaling (varies, dependign on loc-type)
            # print varlist[iref0].dataset_name, [ax.name for ax in varlist[iref0].axes], refsmpl.shape,
            # start loop
            iref = iref0; reflist = reflist0[:] # copy list
            for i,dist,varsmpl,mvar,bs_axis in zip(range(len(varlist)),scllist,varlist,mvars,bs_axes):
                name = getattr(dist,idkey)
                if len(reflist) > 0 and name == reflist[0]:
                    # assign new reference
                    iref = i; del reflist[0] # pop element
                    # prepare subsequent reference sample for K-S test
                    scale,shape = dist.atts.get('scale_factor', 1),dist.atts.get('shape_factor', 1)
                    if not (scale is None or scale == 1) and not (shape is None or shape == 1):
                        raise NotImplementedError("Cannot rescale scale/variance and shape parameters of reference sample!")
                    refsmpl = varsmpl.getArray(unmask=True, fillValue=fillValue) # only once
                    if not varsmpl.atts.get('rescaled',False):
                        refsmpl = _rescaleSample(refsmpl, dist.atts.get('loc_factor', 1), bs_axis=bs_axis) # apply rescaling (varies, dependign on loc-type)
                elif i != iref:
                    scale,shape = dist.atts.get('scale_factor', 1),dist.atts.get('shape_factor', 1)
                # perform K-S test
                if (scale is None or scale == 1) and (shape is None or shape == 1):
                    # K-S test between actual samples is more realistic, and rescaling of mean is simple
                    smpl = varsmpl.getArray(unmask=True, fillValue=fillValue) # only once
                    if not varsmpl.atts.get('rescaled',False):
                        smpl = _rescaleSample(smpl, dist.atts.get('loc_factor', 1), bs_axis=bs_axis) # apply rescaling (varies, dependign on loc-type)
                    # print varsmpl.dataset_name, [ax.name for ax in varsmpl.axes], smpl.shape
                    # print smpl.shape, np.nanmean(smpl), refsmpl.shape, np.nanmean(refsmpl)
                    # print lflatten, sample_axis
                    pval = ks_2samp(refsmpl, smpl, asVar=False, lflatten=lflatten,
                                    axis_idx=varsmpl.axisIndex(sample_axis, lcheck=False))
                    # print dist.name, pval
                    # pval = '{:3.2f}'.format(float(pval.mean()))
                    pval = '{:3.2f}'.format(float(np.median(pval)))
                else:
                    # no straight-forward way to rescale samples, so have to compare distribution with
                    # reference sample, which means more noise (since the distribution will be randomly sampled)
                    if isinstance(dist,VarRV):
                        pval = '{:3.2f}'.format(float(dist.kstest(refsmpl).mean()))
                    else: pval = ' - '
                # add column with ratio/difference of means after rescaling
                if name in plot_labels: name = plot_labels[name]
                if isinstance(mvar,np.ma.core.MaskedConstant) or isinstance(mvars[iref],np.ma.core.MaskedConstant):
                    string += namestr.format(name,' N/A\n')
                elif lratio: string += (namestr+'{:3.2f}\n').format(name,pval,(mvar/mvars[iref]).mean())
                elif lshift: string += (namestr+'{:+2.1f}\n').format(name,pval,(mvar-mvars[iref]).mean())
    # return formatted table in string
    return string
def loadShapeObservations(obs=None, seasons=None, basins=None, provs=None, shapes=None, varlist=None,
                          slices=None, aggregation='mean', shapetype=None, period=None,
                          variable_list=None, **kwargs):
    ''' convenience function to load shape observations; the main function is to select sensible
        defaults based on 'varlist', if no 'obs' are specified
        obs: observational dataset name(s); 'Observations'/'WSC' triggers heuristic defaults
        varlist: variable names or keys into 'variable_list' (expanded to their .vars)
        returns: an Ensemble of observational Datasets (possibly empty) '''
    # prepare arguments
    if shapetype is None: shapetype = 'shpavg' # really only one in use
    # resolve variable list (no need to maintain order)
    if isinstance(varlist, basestring): varlist = [varlist]
    if varlist is None: varlist = [] # BUGFIX: default varlist=None used to crash the loop below
    variables = set(shp_params)
    for name in varlist:
        # BUGFIX: guard against variable_list=None (default) before membership test
        if variable_list is not None and name in variable_list:
            variables.update(variable_list[name].vars)
        else: variables.add(name)
    variables = list(variables)
    # figure out default datasets
    if obs is None: obs = 'Observations'
    lUnity = lCRU = lWSC = False # N.B.: lCRU is never enabled here; branch kept for compatibility
    if obs[:3].lower() in ('obs', 'wsc'):
        if any(var in CRU_vars for var in variables):
            if aggregation == 'mean' and seasons is None:
                lUnity = True; obs = []
        if basins and any([var in WSC_vars for var in variables]):
            if aggregation.lower() in ('mean', 'std', 'sem', 'min', 'max') and seasons is None:
                lWSC = True; obs = []
    if not isinstance(obs, (list, tuple)): obs = (obs, )
    # configure slicing (extract basin/province/shape and period)
    slices = _configSlices(slices=slices, basins=basins, provs=provs, shapes=shapes, period=period)
    if slices is not None:
        noyears = slices.copy()
        noyears.pop('years', None) # slices for climatologies
    # prepare and load ensemble of observations
    obsens = Ensemble(name='obs', title='Observations', basetype=Dataset)
    if len(obs) > 0:
        # regular operations with user-defined dataset
        try:
            ensemble = loadEnsembleTS(names=obs, season=seasons, aggregation=aggregation, slices=slices,
                                      varlist=variables, shape=shapetype, ldataset=False, **kwargs)
            for ens in ensemble: obsens += ens
        except EmptyDatasetError: pass # no valid observations -> return what we have
    if lUnity:
        # load Unity data instead of averaging CRU data
        if period is None: period = (1979, 1994)
        dataset = loadDataset(name='Unity', varlist=variables, mode='climatology',
                              period=period, shape=shapetype)
        if slices is not None: dataset = dataset(**noyears) # slice immediately
        obsens += dataset.load()
    if lCRU:
        # this is basically regular operations with CRU as default
        obsens += loadEnsembleTS(names='CRU', season=seasons, aggregation=aggregation, slices=slices,
                                 varlist=variables, shape=shapetype, ldataset=True, **kwargs)
    if lWSC:
        # another special case: river hydrographs
        # from datasets.WSC import loadGageStation, GageStationError
        try:
            dataset = loadGageStation(basin=basins, varlist=['runoff'], aggregation=aggregation,
                                      mode='climatology', filetype='monthly')
            if slices is not None: dataset = dataset(**noyears) # slice immediately
            obsens += dataset.load()
        except GageStationError: pass # just ignore, if gage station data is missing
    # return ensembles (will be wrapped in a list, if BatchLoad is used)
    return obsens
def rescaleDistributions(datasets, reference=None, target=None, lscale=False, suffixes=None, lglobal=False):
    ''' Rescale datasets, so that the mean of each variable matches the corresponding variable in
        the reference dataset; if a target is specified, the target scale factors are applied to
        all datasets, if target is None, each dataset is rescaled individually.
        target: None (individual), 'auto' (reuse a parent's factors via name-suffix heuristic),
                a dataset name (if 'datasets' is an Ensemble), or a Dataset
        lscale: also rescale the scale/variance parameter (returns (loc, scale/loc) factors)
        returns: list or Ensemble (mirroring the input) of rescaled datasets '''
    # N.B.: all 'raise' statements use call form; the old 'raise X, arg' syntax is Python-2-only
    if not isinstance(datasets, (list, tuple, Ensemble)): raise TypeError
    if isinstance(datasets, Ensemble) and isinstance(reference, basestring):
        reference = datasets[reference]
    elif not isinstance(reference, Dataset): raise TypeError
    if target is None or target == 'auto':
        pass # every dataset is scaled individually or based on suffixes
    elif isinstance(datasets, Ensemble) and isinstance(target, basestring):
        target = datasets[target]
    elif not isinstance(target, Dataset): raise TypeError(target)
    if suffixes is None:
        # NOTE(review): '2100' lacks the leading dash that '-2050' has — confirm this is intentional
        suffixes = ('-2050', '2100') # suffixes for scaling heuristic
    # determine scale factor
    def scaleFactor(reference, target, lscale=False, lglobal=False):
        ''' internal function to compute rescaling factors for common variables '''
        scalefactors = dict() # return dict with scalefactors for all applicable variables
        for varname, refvar in reference.variables.iteritems():
            if varname in target and isinstance(refvar, VarRV):
                # only varaibles that appear in both sets
                tgtvar = target.variables[varname]
                iloc = 1 if refvar.shape[-1] == 3 else 0 # 3-parameter dists carry shape first
                # insert dummy ensemble axis, if necessary
                refvar = refvar.insertAxes(new_axes=tgtvar.axes, lcopy=True, asVar=True, linplace=False)
                if refvar.axes[-1].name.startswith('params'):
                    refdata = refvar.data_array.take(iloc, axis=-1)
                else: raise AxisError(refvar.axes[-1])
                if refvar.ndim < tgtvar.ndim:
                    # N.B.: this is necessary, because WRF (target) can have an extra ensemble dimension that obs
                    # typically don't have; then we just replicate the obs for each ensemble element
                    from warnings import warn
                    if lglobal:
                        warn("Scalefactors are being averaged over extra target dimensions (e.g. 'ensemble' axis)")
                    dimdiff = tgtvar.ndim - refvar.ndim
                    if refvar.shape != tgtvar.shape[dimdiff:]:
                        raise AxisError("{:s} != {:s}".format(tgtvar, refvar))
                    refdata = refdata.reshape((1, ) * dimdiff + refvar.shape[:-1])
                elif refvar.shape != tgtvar.shape:
                    raise AxisError("{:s} != {:s}".format(tgtvar, refvar))
                tgtdata = tgtvar.data_array.take(iloc, axis=-1)
                # location factor: reference/target (globally averaged or elementwise)
                if lglobal: loc = np.mean(refdata) / np.mean(tgtdata)
                else: loc = refdata / tgtdata
                if lscale:
                    iscale = 2 if refvar.shape[-1] == 3 else 1
                    if lglobal:
                        scale = np.mean(refvar.data_array.take(iscale, axis=-1)) / \
                                np.mean(tgtvar.data_array.take(iscale, axis=-1))
                    else:
                        scale = refvar.data_array.take(iscale, axis=-1) / tgtvar.data_array.take(iscale, axis=-1)
                    scalefactors[varname] = loc, (scale / loc)
                else: scalefactors[varname] = loc
        return scalefactors # return dict with scale factors for variables
    # compute general scalefactors
    if target == 'auto':
        scalefactor_collection = dict()
    elif target is not None:
        scalefactors = scaleFactor(reference, target, lscale=lscale, lglobal=lglobal)
    # loop over datasets
    rescaled_datasets = []
    for dataset in datasets:
        if dataset == reference:
            # determine variables that can be scaled (VarRV's)
            varlist = [varname for varname, var in dataset.variables.iteritems() if isinstance(var, VarRV)]
            rescaled_dataset = dataset.copy(varlist=varlist)
            # add mock scale factors for consistency
            for var in rescaled_dataset.variables.itervalues():
                var.atts['loc_factor'] = 1
                var.atts['scale_factor'] = 1
                var.atts['shape_factor'] = 1
        else:
            # generate new dataset (without variables, and in-memory)
            if isinstance(dataset, DatasetNetCDF):
                rescaled_dataset = dataset.copy(varlist=[], asNC=False)
            else:
                rescaled_dataset = dataset.copy(varlist=[])
            # individual scaling
            if target is None or target == 'auto':
                parent = None
                if target == 'auto' and dataset.name.endswith(suffixes):
                    for suffix in suffixes:
                        if dataset.name.endswith(suffix):
                            # check, which suffix, and remove it
                            parent = dataset.name[:-(len(suffix) + 1)]
                            break
                    if parent and '-' not in parent: parent += '-1' # convention for WRF names
                if parent and parent in scalefactor_collection:
                    scalefactors = scalefactor_collection[parent] # use scale factors from parent
                else:
                    # scale individually
                    scalefactors = scaleFactor(reference, dataset, lscale=lscale, lglobal=lglobal)
                if target == 'auto':
                    scalefactor_collection[dataset.name] = scalefactors # for later use
            # loop over variables
            for varname, scalefactor in scalefactors.iteritems():
                if varname in dataset:
                    # rescale and add variable to new dataset
                    var = dataset.variables[varname]
                    if lscale:
                        rsvar = var.rescale(loc=scalefactor[0], scale=scalefactor[1])
                    else:
                        rsvar = var.rescale(loc=scalefactor)
                    rescaled_dataset.addVariable(rsvar)
        # add dataset to list
        rescaled_datasets.append(rescaled_dataset)
    # put everythign into Ensemble, if input was Ensemble
    if isinstance(datasets, Ensemble):
        rescaled_datasets = Ensemble(*rescaled_datasets, name=datasets.ens_name, title=datasets.ens_title)
    # return datasets/ensemble
    return rescaled_datasets
def loadShapeObservations(obs=None, seasons=None, basins=None, provs=None, shapes=None, stations=None,
                          varlist=None, slices=None, aggregation='mean', dataset_mode='time-series',
                          lWSC=True, WSC_period=None, shapetype=None, variable_list=None, basin_list=None,
                          lforceList=True, obs_ts=None, obs_clim=None, name=None, title=None, obs_list=None,
                          ensemble_list=None, ensemble_product='inner', **kwargs):
    ''' convenience function to load shape observations based on 'aggregation' and 'varlist'
        (mainly add WSC gage data); returns an Ensemble of observational datasets '''
    if obs_list is None: obs_list = observational_datasets
    if name is None: name = 'obs'
    if title is None: title = 'Observations'
    # variables for which ensemble expansion is not supported
    not_supported = ('season','seasons','varlist','mode','dataset_mode','provs','basins','shapes',)
    # resolve variable list (no need to maintain order)
    if varlist is None: varlist = [] # robustness: no extra variables requested
    if isinstance(varlist,str): varlist = [varlist]
    variables = set(shp_params)
    # N.B.: use a dedicated loop variable here, so that the 'name' argument
    #       (used for the Ensemble fallback below) is not clobbered by the loop
    for vname in varlist:
        if vname in variable_list: variables.update(variable_list[vname].vars)
        elif lforceList: raise VariableError("Variable list '{}' does not exist.".format(vname))
        else: variables.add(vname)
    variables = list(variables)
    # determine if we need the gage dataset
    lWSC = isinstance(basins,str) and any([var in WSC_vars for var in variables]) and lWSC # doesn't work if multiple basins are loaded
    # default obs list
    if obs is None: obs = ['Observations',]
    elif isinstance(obs,str): obs = [obs]
    elif isinstance(obs,tuple): obs = list(obs)
    elif not isinstance(obs,list): raise TypeError(obs)
    # configure slicing (extract basin/province/shape and period)
    expand_vars = ('basins','stations','provs','shapes','slices') # variables that need to be added to slices (and expanded first)
    if ensemble_list: expand_list = [varname for varname in expand_vars if varname in ensemble_list]
    if ensemble_list and expand_list:
        # expand the slicing variables together with the ensemble expansion
        local_vars = locals(); exp_args = dict()
        for varname in expand_vars: # copy variables to expand right away
            exp_args[varname] = local_vars[varname]
        for varname in expand_list: # remove entries from ensemble expansion
            if varname != 'slices': ensemble_list.remove(varname) # only 'slices' will continue to be expanded
        if 'slices' not in ensemble_list: ensemble_list.append('slices')
        slices = [_configSlices(**arg_dict) for arg_dict in expandArgumentList(expand_list=expand_list,
                                                                              lproduct=ensemble_product, **exp_args)]
    else:
        slices = _configSlices(slices=slices, basins=basins, provs=provs, shapes=shapes, stations=stations, period=None)
    # substitute default observational dataset and separate aggregation methods
    iobs = None; clim_ens = None
    # N.B.: iterate in reverse order, so that deleting items does not interfere with the indexing
    for i,obs_name in reverse_enumerate(obs):
        if obs_name in obs_aliases or obs_name not in timeseries_datasets:
            if iobs is not None: raise ArgumentError("Can only resolve one default dataset: {}".format(obs))
            if aggregation == 'mean' and seasons is None and obs_clim is not None:
                # remove dataset entry from list (and all the arguments)
                del obs[i]; iobs = i # remember position of default obs in ensemble
                clim_args = kwargs.copy(); slc = slices; shp = shapetype
                # clean up variables for ensemble expansion, if necessary
                if ensemble_list and ensemble_product.lower() == 'inner':
                    if 'names' in ensemble_list:
                        obs_names = [obs_clim]
                        for arg in ensemble_list:
                            if arg in ('slices','shape'): pass # dealt with separately below
                            elif arg in not_supported:
                                raise ArgumentError("Expansion of keyword '{:s}' is currently not supported in ensemble expansion.".format(arg))
                            elif arg in kwargs:
                                clim_args[arg] = kwargs[arg][iobs]; del kwargs[arg][iobs]
                            else:
                                raise ArgumentError("Keyword '{:s}' not found in keyword arguments.".format(arg))
                        if 'slices' in ensemble_list: slc = slices[iobs]; del slices[iobs]
                        if 'shape' in ensemble_list: shp = shapetype[iobs]; del shapetype[iobs]
                        clim_len = 1 # expect length of climatology ensemble
                    else:
                        obs_names = obs_clim # no name expansion
                        clim_len = None # expected length of climatology ensemble (determined below)
                        for arg in ensemble_list:
                            if arg in not_supported:
                                raise ArgumentError("Expansion of keyword '{:s}' is currently not supported in ensemble expansion.".format(arg))
                            # NOTE(review): these used to test "'slices' in ensemble_list" / "'shape' in ensemble_list",
                            #               which is constant inside this loop and defeats the per-argument length check;
                            #               testing the loop variable 'arg' matches the sibling branch above
                            elif arg == 'slices': l = len(slc)
                            elif arg == 'shape': l = len(shp)
                            elif arg in clim_args: l = len(clim_args[arg])
                            else:
                                raise ArgumentError("Keyword '{:s}' not found in keyword arguments.".format(arg))
                            if clim_len is None: clim_len = l # first ensemble keyword sets the length
                            elif l != clim_len: raise ArgumentError(arg,l,clim_len) # inconsistent expansion lengths
                elif ensemble_list and ensemble_product.lower() == 'outer':
                    clim_len = 1
                    for arg in ensemble_list:
                        if arg != 'names':
                            assert isinstance(clim_args[arg],(list,tuple)), clim_args[arg]
                            clim_len *= len(clim_args[arg]) # outer product multiplies lengths
                    obs_names = [obs_clim] if 'names' in ensemble_list else obs_clim
                else:
                    obs_names = [obs_clim]; clim_len = 1
                # now load climatology instead of time-series and skip aggregation
                try:
                    clim_ens = loadEnsemble(names=obs_names, season=seasons, aggregation=None, slices=slc,
                                            varlist=variables, ldataset=False, dataset_mode='climatology', shape=shp,
                                            ensemble_list=ensemble_list, ensemble_product=ensemble_product,
                                            obs_list=obs_list, basin_list=basin_list, **clim_args)
                    assert len(clim_ens) == clim_len, clim_ens
                except EmptyDatasetError: pass
            else:
                obs[i] = obs_ts # trivial: just substitute default name and load time-series
    # prepare and load ensemble of observations
    if len(obs) > 0:
        if len(obs) == 1 and ensemble_list and 'names' not in ensemble_list: obs = obs[0]
        try:
            obsens = loadEnsemble(names=obs, season=seasons, aggregation=aggregation, slices=slices,
                                  varlist=variables, ldataset=False, dataset_mode=dataset_mode, shape=shapetype,
                                  obs_list=obs_list, basin_list=basin_list, ensemble_list=ensemble_list,
                                  ensemble_product=ensemble_product, **kwargs)
        except EmptyDatasetError:
            obsens = Ensemble(name=name, title=title, obs_list=obs_list, basetype=Dataset)
    else:
        obsens = Ensemble(name=name, title=title, obs_list=obs_list, basetype=Dataset)
    # add default obs back in, if they were removed earlier
    if clim_ens is not None:
        for clim_ds in clim_ens[::-1]: # add observations in correct order: adding backwards allows successive insertion ...
            obsens.insertMember(iobs,clim_ds) # ... at the point where the name block starts
    # load stream gage data from WSC; should not interfere with anything else; append to ensemble
    if lWSC: # another special case: river hydrographs
        from datasets.WSC import GageStationError, loadGageStation
        try:
            if aggregation is not None and seasons is None: dataset_mode = 'climatology' # handled differently with gage data
            if WSC_period is None: WSC_period = kwargs.get('obs_period',kwargs.get('period',None))
            # always load runoff/discharge
            dataset = loadGageStation(basin=basins, varlist=['runoff'], aggregation=aggregation, period=WSC_period,
                                      mode=dataset_mode, filetype='monthly', basin_list=basin_list, lfill=True, lexpand=True)
            if seasons:
                method = aggregation if aggregation.isupper() else aggregation.title()
                if aggregation: dataset = getattr(dataset,'seasonal'+method)(season=seasons, taxis='time')
                else: dataset = dataset.seasonalSample(season=seasons)
            if slices is not None: dataset = dataset(**slices) # slice immediately
            obsens += dataset.load()
        except GageStationError:
            pass # just ignore, if gage station data is missing
    # return ensemble (will be wrapped in a list, if BatchLoad is used)
    return obsens
def loadShapeEnsemble(names=None, seasons=None, basins=None, provs=None, shapes=None, varlist=None,
                      aggregation='mean', slices=None, shapetype=None, filetypes=None, period=None,
                      obs_period=None, WSC_period=None, name=None, title=None, variable_list=None,
                      WRF_exps=None, CESM_exps=None, WRF_ens=None, CESM_ens=None, basin_list=None,
                      lforceList=True, obs_list=None, obs_ts=None, obs_clim=None,
                      ensemble_list=None, ensemble_product='inner', **kwargs):
    ''' convenience function to load shape ensembles (in Ensemble container) or observations;
        kwargs are passed to loadEnsembleTS '''
    names = list(names) # make a new list (copy)
    # separate observations from simulations
    if obs_list is None: obs_list = observational_datasets
    obs_names = []; iobs = []; ens_names = []; iens = []
    # N.B.: use a dedicated loop variable here, so that the 'name' argument
    #       (passed on to the loader functions below) is not clobbered by the loop
    for i,dsname in enumerate(names):
        if dsname in obs_list or dsname in obs_aliases:
            obs_names.append(dsname); iobs.append(i) # observational dataset
        else:
            ens_names.append(dsname); iens.append(i) # simulation dataset
    assert len(iens) == len(ens_names) and len(iobs) == len(obs_names)
    if len(obs_names) > 0:
        # assemble arguments
        obs_args = dict(obs=obs_names, seasons=seasons, basins=basins, provs=provs, shapes=shapes,
                        varlist=varlist, slices=slices, aggregation=aggregation, shapetype=shapetype,
                        period=period, obs_period=obs_period, obs_ts=obs_ts, obs_clim=obs_clim,
                        variable_list=variable_list, basin_list=basin_list, WSC_period=WSC_period,
                        ensemble_list=ensemble_list, ensemble_product=ensemble_product, **kwargs)
        # check if we have to modify arguments to preserve ensemble_list expansion
        if ensemble_list and ensemble_product == 'inner' and 'names' in ensemble_list and len(ensemble_list) > 1:
            for key in ensemble_list:
                if key != 'names':
                    ens_list = obs_args[key]
                    obs_args[key] = [ens_list[i] for i in iobs] # only keep entries for observations
        # observations for basins require special treatment to merge basin averages with gage values
        # load observations by redirecting to appropriate loader function
        obsens = loadShapeObservations(name=name, title=title, obs_list=obs_list, **obs_args)
    else:
        obsens = [] # no observations requested
    if len(ens_names) > 0: # has to be a list
        # prepare arguments
        variables, filetypes = _resolveVarlist(varlist=varlist, filetypes=filetypes,
                                               params=shp_params, variable_list=variable_list, lforceList=lforceList)
        # configure slicing (extract basin/province/shape and period)
        slices = _configSlices(slices=slices, basins=basins, provs=provs, shapes=shapes, period=period)
        # assemble arguments
        ens_args = dict(names=ens_names, season=seasons, slices=slices, varlist=variables, shape=shapetype,
                        aggregation=aggregation, period=period, obs_period=obs_period,
                        WRF_exps=WRF_exps, CESM_exps=CESM_exps, WRF_ens=WRF_ens, CESM_ens=CESM_ens,
                        filetypes=filetypes, ensemble_list=ensemble_list, ensemble_product=ensemble_product, **kwargs)
        # check if we have to remove obs datasets to preserve ensemble_list expansion
        if ensemble_list and ensemble_product == 'inner' and 'names' in ensemble_list and len(ensemble_list) > 1:
            for key in ensemble_list:
                if key != 'names':
                    ens_list = ens_args[key]
                    ens_args[key] = [ens_list[i] for i in iens] # only keep entries for simulations
        # load ensemble (no iteration here)
        shpens = loadEnsemble(name=name, title=title, obs_list=obs_list, **ens_args)
    else:
        shpens = Ensemble(name=name, title=title, basetype='Dataset')
    # get resolution tag (will be added below)
    res = None
    for member in shpens:
        if 'resstr' in member.atts:
            if res is None: res = member.atts['resstr'] # first member sets the resolution
            elif res != member.atts['resstr']:
                res = None; break # no common resolution
    # merge observations back into the ensemble at their original positions
    if len(obsens) > 0 and len(shpens) > 0:
        j = -1 # fallback only; obs_names is non-empty whenever obsens has members
        for dsname,j in zip(obs_names,iobs):
            shpens.insertMember(j,obsens[dsname]) # add known observations in correct order
            del obsens[dsname] # remove the ones we already know from list, so we can deal with the rest
        j += 1 # add remaining obs datasets after the last known one
        for i,obs in enumerate(obsens):
            shpens.insertMember(j+i,obs)
    elif len(obsens) > 0 and len(shpens) == 0:
        shpens = obsens
    shpens.resolution = res # add resolution tag now, to make sure it is there
    return shpens