# imports (standard library only; project-internal names - e.g. GridDefinition, Dataset,
# DatasetNetCDF, CentralProcessingUnit, getMetaData, getTargetFile, addGeoLocator,
# addLengthAndNamesOfMonth, newvars, and the custom error classes - are assumed to be
# imported from the surrounding package)
import os
import gzip
import pickle
import logging
from datetime import datetime


def performRegridding(dataset, mode, griddef, dataargs, loverwrite=False, varlist=None,
                      lwrite=True, lreturn=False, ldebug=False, lparallel=False,
                      pidstr='', logger=None):
    ''' worker function to perform regridding for a given dataset and target grid '''
    # input checking
    if not isinstance(dataset, basestring): raise TypeError
    if not isinstance(dataargs, dict): raise TypeError  # all dataset arguments are kwargs
    if not isinstance(griddef, GridDefinition): raise TypeError
    if lparallel:
        if not lwrite: raise IOError('Can only write to disk in parallel mode (i.e. lwrite = True).')
        if lreturn: raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')
    # logging
    if logger is None:  # make new logger
        logger = logging.getLogger()  # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger, basestring):
            logger = logging.getLogger(name=logger)  # connect to existing one
        elif not isinstance(logger, logging.Logger):
            raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))
    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
    dataset_name = dataargs.dataset_name
    periodstr = dataargs.periodstr
    avgfolder = dataargs.avgfolder
    # get filename for target dataset and do some checks
    filename = getTargetFile(dataset=dataset, mode=mode, dataargs=dataargs,
                             lwrite=lwrite, grid=griddef.name.lower())
    # prepare target dataset
    if ldebug: filename = 'test_' + filename
    if not os.path.exists(avgfolder):
        raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if lwrite:
        if lreturn:
            tmpfilename = filename  # no temporary file if dataset is passed on (can't rename the file while it is open!)
        else:
            if lparallel: tmppfx = 'tmp_regrid_{:s}_'.format(pidstr[1:-1])
            else: tmppfx = 'tmp_regrid_'
            tmpfilename = tmppfx + filename
        filepath = avgfolder + filename
        tmpfilepath = avgfolder + tmpfilename
        if os.path.exists(filepath):
            if not loverwrite:
                age = datetime.fromtimestamp(os.path.getmtime(filepath))
                # if the source file is newer than the sink file, or if the sink file is a stub, recompute; otherwise skip
                if age > srcage and os.path.getsize(filepath) > 1e6:
                    lskip = True
                    if hasattr(griddef, 'filepath') and griddef.filepath is not None:
                        gridage = datetime.fromtimestamp(os.path.getmtime(griddef.filepath))
                        if age < gridage: lskip = False
                # N.B.: NetCDF files smaller than 1MB are usually incomplete header fragments from a previous crash
            if not lskip: os.remove(filepath)  # recompute
    # depending on the last modification time of the file or the overwrite setting, start computation or skip
    if lskip:
        # print message
        skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.info(skipmsg)
    else:
        ## actually load datasets
        source = loadfct()  # load source
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError("Specified period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr, source.atts.period))
        # print message
        if mode == 'climatology':
            opmsgstr = 'Regridding Climatology ({:s}) to {:s} Grid'.format(periodstr, griddef.name)
        elif mode == 'time-series':
            opmsgstr = 'Regridding Time-series to {:s} Grid'.format(griddef.name)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')
        ## create new sink/target file
        # set attributes
        atts = source.atts.copy()
        atts['period'] = periodstr
        atts['name'] = dataset_name
        atts['grid'] = griddef.name
        if mode == 'climatology':
            atts['title'] = '{:s} Climatology on {:s} Grid'.format(dataset_name, griddef.name)
        elif mode == 'time-series':
            atts['title'] = '{:s} Time-series on {:s} Grid'.format(dataset_name, griddef.name)
        # make new dataset
        if lwrite:  # write to NetCDF file
            if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
            sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
        else:
            sink = Dataset(atts=atts)  # only create dataset in memory
        # initialize processing
        CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)
        # perform regridding (if the target grid is different from the native grid!)
        if griddef.name != dataset:
            # reproject and resample (regrid) dataset
            CPU.Regrid(griddef=griddef, flush=True)
        # get results
        CPU.sync(flush=True)
        # add geolocators
        sink = addGeoLocator(sink, griddef=griddef, lgdal=True, lreplace=True, lcheck=True)
        # N.B.: WRF datasets come with their own geolocator arrays - we need to replace those!
        # add length and names of month
        if mode == 'climatology' and not sink.hasVariable('length_of_month') and sink.hasVariable('time'):
            addLengthAndNamesOfMonth(sink, noleap=True if dataset.upper() in ('WRF', 'CESM') else False)
        # print dataset
        if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
        # write results to file
        if lwrite:
            sink.sync()
            writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
            writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
            logger.info(writemsg)
            # rename file to proper name
            if not lreturn:
                sink.unload(); sink.close(); del sink  # destroy all references
                if os.path.exists(filepath): os.remove(filepath)  # remove old file
                os.rename(tmpfilepath, filepath)  # this would also overwrite the old file...
            # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed
        # clean up and return
        source.unload(); del source, CPU
        if lreturn:
            return sink  # return dataset for further use (NetCDF file still open!)
        else:
            return 0  # "exit code"
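
# Example usage (an illustrative sketch, not part of the module): regrid a climatology onto a
# predefined target grid. The import path, the helper loadPickledGridDef, the grid name
# 'arb2_d02', and the dataargs contents are assumptions about the local installation; only the
# performRegridding signature above is actually defined in this module.
#
#   from geodata.gdal import loadPickledGridDef  # hypothetical helper; adapt to your setup
#   griddef = loadPickledGridDef(grid='arb2_d02')  # a GridDefinition instance
#   exitcode = performRegridding(dataset='NARR', mode='climatology', griddef=griddef,
#                                dataargs=dict(period=(1979, 2009)),  # dataset-specific kwargs
#                                loverwrite=False, lwrite=True, lreturn=False, ldebug=True)
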
def performExport(dataset, mode, dataargs, expargs, bcargs, loverwrite=False,
                  ldebug=False, lparallel=False, pidstr='', logger=None):
    ''' worker function to export ASCII rasters for a given dataset '''
    # input checking
    if not isinstance(dataset, basestring): raise TypeError
    if not isinstance(dataargs, dict): raise TypeError  # all dataset arguments are kwargs
    # logging
    if logger is None:  # make new logger
        logger = logging.getLogger()  # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger, basestring):
            logger = logging.getLogger(name=logger)  # connect to existing one
        elif not isinstance(logger, logging.Logger):
            raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))
    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs, lone=False)
    dataset_name = dataargs.dataset_name
    periodstr = dataargs.periodstr
    domain = dataargs.domain
    # figure out bias-correction parameters
    if bcargs:
        bcargs = bcargs.copy()  # first copy, then modify...
        bc_method = bcargs.pop('method', None)
        if bc_method is None:
            raise ArgumentError("Need to specify bias-correction method to use bias correction!")
        bc_obs = bcargs.pop('obs_dataset', None)
        if bc_obs is None:
            raise ArgumentError("Need to specify observational dataset to use bias correction!")
        bc_reference = bcargs.pop('reference', None)
        if bc_reference is None:  # infer from experiment name
            if dataset_name[-5:] in ('-2050', '-2100'):
                bc_reference = dataset_name[:-5]  # cut off period indicator and hope for the best
            else:
                bc_reference = dataset_name
        bc_grid = bcargs.pop('grid', None)
        if bc_grid is None: bc_grid = dataargs.grid
        bc_domain = bcargs.pop('domain', None)
        if bc_domain is None: bc_domain = domain
        bc_varlist = bcargs.pop('varlist', None)
        bc_varmap = bcargs.pop('varmap', None)
        bc_tag = bcargs.pop('tag', None)  # an optional name extension/tag
        bc_pattern = bcargs.pop('file_pattern', None)  # usually the default in getPickleFileName
        lgzip = bcargs.pop('lgzip', None)  # if pickle is gzipped (None: auto-detect based on file name extension)
        # get name of pickle file (and folder)
        picklefolder = dataargs.avgfolder.replace(dataset_name, bc_reference)
        picklefile = getPickleFileName(method=bc_method, obs_name=bc_obs, gridstr=bc_grid,
                                       domain=bc_domain, tag=bc_tag, pattern=bc_pattern)
        picklepath = '{:s}/{:s}'.format(picklefolder, picklefile)
        if lgzip:
            picklepath += '.gz'  # add extension
            if not os.path.exists(picklepath): raise IOError(picklepath)
        elif lgzip is None:
            lgzip = False
            if not os.path.exists(picklepath):
                lgzip = True  # assume gzipped file
                picklepath += '.gz'  # try with extension...
                if not os.path.exists(picklepath): raise IOError(picklepath)
        elif not os.path.exists(picklepath):
            raise IOError(picklepath)
        # determine age of pickle file and compare against source age
        pickleage = datetime.fromtimestamp(os.path.getmtime(picklepath))
    else:
        bc_method = False
        pickleage = srcage
    # parse export options
    expargs = expargs.copy()  # first copy, then modify...
    lm3 = expargs.pop('lm3')  # convert kg/m^2/s to m^3/m^2/s (water flux)
    expformat = expargs.pop('format')  # needed to get FileFormat object
    exp_list = expargs.pop('exp_list')  # this is handled outside of export
    compute_list = expargs.pop('compute_list', [])  # variables to be (re-)computed (default: none)
    # initialize FileFormat class instance
    fileFormat = getFileFormat(expformat, bc_method=bc_method, **expargs)
    # get folder for target dataset and do some checks
    expname = '{:s}_d{:02d}'.format(dataset_name, domain) if domain else dataset_name
    expfolder = fileFormat.defineDataset(dataset=dataset, mode=mode, dataargs=dataargs, lwrite=True, ldebug=ldebug)
    # prepare destination for new dataset
    lskip = fileFormat.prepareDestination(srcage=max(srcage, pickleage), loverwrite=loverwrite)
    # depending on the last modification time of the file or the overwrite setting, start computation or skip
    if lskip:
        # print message
        skipmsg = "\n{:s} >>> Skipping: Format '{:s}' for dataset '{:s}' already exists and is newer than source file.".format(pidstr, expformat, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, expfolder)
        logger.info(skipmsg)
    else:
        ## actually load datasets
        source = loadfct()  # load source data
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError("Specified period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr, source.atts.period))
        # load BiasCorrection object from pickle
        if bc_method:
            op = gzip.open if lgzip else open
            with op(picklepath, 'rb') as filehandle:
                BC = pickle.load(filehandle)
            # assemble logger entry
            bcmsgstr = "(performing bias-correction using {:s} from {:s} towards {:s})".format(BC.long_name, bc_reference, bc_obs)
        # print message
        if mode == 'climatology':
            opmsgstr = 'Exporting Climatology ({:s}) to {:s} Format'.format(periodstr, expformat)
        elif mode == 'time-series':
            opmsgstr = 'Exporting Time-series to {:s} Format'.format(expformat)
        elif mode[-5:] == '-mean':
            opmsgstr = 'Exporting {:s}-Mean ({:s}) to {:s} Format'.format(mode[:-5], periodstr, expformat)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logmsg = '\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr, datamsgstr, opmsgstr)
        if bc_method:
            logmsg += "{0:s} *** {1:^65s} *** \n".format(pidstr, bcmsgstr)
        logger.info(logmsg)
        if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')
        # create GDAL-enabled target dataset
        sink = Dataset(axes=(source.xlon, source.ylat), name=expname, title=source.title, atts=source.atts.copy())
        addGDALtoDataset(dataset=sink, griddef=source.griddef)
        assert sink.gdal, sink
        # apply bias-correction
        if bc_method:
            source = BC.correct(source, asNC=False, varlist=bc_varlist, varmap=bc_varmap)  # load bias-corrected variables into memory
        # N.B.: for variables that are not bias-corrected, data are not loaded immediately but on demand; this way
        #       I/O and computing can be further disentangled and not all variables are always needed
        # compute intermediate variables, if necessary
        for varname in exp_list:
            variables = None  # variable list
            var = None
            # (re-)compute variable, if desired...
            if varname in compute_list:
                if varname == 'precip': var = newvars.computeTotalPrecip(source)
                elif varname == 'waterflx': var = newvars.computeWaterFlux(source)
                elif varname == 'liqwatflx': var = newvars.computeLiquidWaterFlux(source)
                elif varname == 'netrad': var = newvars.computeNetRadiation(source, asVar=True)
                elif varname == 'netrad_bb': var = newvars.computeNetRadiation(source, asVar=True, lrad=False, name='netrad_bb')
                elif varname == 'netrad_bb0': var = newvars.computeNetRadiation(source, asVar=True, lrad=False, lA=False, name='netrad_bb0')
                elif varname == 'vapdef': var = newvars.computeVaporDeficit(source)
                elif varname in ('pet', 'pet_pm', 'petrad', 'petwnd') and 'pet' not in sink:
                    if 'petrad' in exp_list or 'petwnd' in exp_list:
                        variables = newvars.computePotEvapPM(source, lterms=True)  # default; returns multiple PET terms
                    else:
                        var = newvars.computePotEvapPM(source, lterms=False)  # returns only PET
                elif varname == 'pet_th':
                    var = None  # skip for now
                    #var = computePotEvapTh(source)  # simplified formula (fewer prerequisites)
            # ... otherwise load from source file
            if var is None and variables is None and varname in source:
                var = source[varname].load()  # load data (may not have to load all)
            #else: raise VariableError("Unsupported Variable '{:s}'.".format(varname))
            # for now, skip variables that are None
            if var or variables:
                # handle lists as well
                if var and variables: raise VariableError(var, variables)
                elif var: variables = (var,)
                for var in variables:
                    addGDALtoVar(var=var, griddef=sink.griddef)
                    if not var.gdal and isinstance(fileFormat, ASCII_raster):
                        raise GDALError("Exporting to ASCII_raster format requires GDAL-enabled variables.")
                    # add to new dataset
                    sink += var
        # convert units
        if lm3:
            for var in sink:
                if var.units == 'kg/m^2/s':
                    var /= 1000.  # divide by density (1000 kg/m^3) to get m^3/m^2/s
                    var.units = 'm^3/m^2/s'  # update units
        # compute seasonal mean if we are in mean-mode
        if mode[-5:] == '-mean':
            sink = sink.seasonalMean(season=mode[:-5], lclim=True)
            # N.B.: to remain consistent with other output modes, we need to prevent renaming of the time axis
            sink = concatDatasets([sink, sink], axis='time', lensembleAxis=True)
            sink.squeeze()  # we needed the year-axis until now to distinguish constant fields; now remove it
        # print dataset
        if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
        # export new dataset to selected format
        fileFormat.exportDataset(sink)
        # print feedback
        writemsg = "\n{:s} >>> Export of Dataset '{:s}' to Format '{:s}' complete.".format(pidstr, expname, expformat)
        writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, expfolder)
        logger.info(writemsg)
        # clean up
        source.unload()  #; del source
    return 0  # "exit code"
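
# Example usage (a sketch under assumptions): export a bias-corrected climatology to ASCII
# rasters. The expargs/bcargs keys mirror the pop() calls above; the format name
# 'ASCII_raster', the experiment name, the observational dataset 'NRCan', and the
# bias-correction method 'AABC' are placeholders for installation-specific values.
#
#   expargs = dict(format='ASCII_raster',            # FileFormat selector
#                  lm3=True,                         # convert water fluxes to m^3/m^2/s
#                  exp_list=['precip', 'pet'],       # variables to export
#                  compute_list=['precip', 'pet'])   # variables to (re-)compute
#   bcargs = dict(method='AABC', obs_dataset='NRCan')  # remaining keys fall back to defaults
#   performExport(dataset='WRF', mode='climatology',
#                 dataargs=dict(experiment='max-ctrl', domains=2, period=(1979, 1994)),
#                 expargs=expargs, bcargs=bcargs, loverwrite=False, ldebug=True)
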
def performExtraction(dataset, mode, stnfct, dataargs, loverwrite=False, varlist=None,
                      lwrite=True, lreturn=False, ldebug=False, lparallel=False,
                      pidstr='', logger=None):
    ''' worker function to extract point data from a gridded dataset '''
    # input checking
    if not isinstance(dataset, basestring): raise TypeError
    if not isinstance(dataargs, dict): raise TypeError  # all dataset arguments are kwargs
    if not callable(stnfct): raise TypeError  # function to load station dataset
    if lparallel:
        if not lwrite: raise IOError('In parallel mode we can only write to disk (i.e. lwrite = True).')
        if lreturn: raise IOError('Can not return datasets in parallel mode (i.e. lreturn = False).')
    # logging
    if logger is None:  # make new logger
        logger = logging.getLogger()  # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger, basestring):
            logger = logging.getLogger(name=logger)  # connect to existing one
        elif not isinstance(logger, logging.Logger):
            raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))
    lclim = False; lts = False
    if mode == 'climatology': lclim = True
    elif mode == 'time-series': lts = True
    else: raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs)
    dataset_name = dataargs.dataset_name
    periodstr = dataargs.periodstr
    avgfolder = dataargs.avgfolder
    # load template dataset
    stndata = stnfct()  # load station dataset from function
    if not isinstance(stndata, Dataset): raise TypeError
    # N.B.: the loading function is necessary, because DatasetNetCDF instances do not pickle well
    # get filename for target dataset and do some checks
    filename = getTargetFile(dataset=dataset, mode=mode, dataargs=dataargs,
                             lwrite=lwrite, station=stndata.name)
    if ldebug: filename = 'test_' + filename
    if not os.path.exists(avgfolder):
        raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if lwrite:
        if lreturn:
            tmpfilename = filename  # no temporary file if dataset is passed on (can't rename the file while it is open!)
        else:
            if lparallel: tmppfx = 'tmp_exstns_{:s}_'.format(pidstr[1:-1])
            else: tmppfx = 'tmp_exstns_'
            tmpfilename = tmppfx + filename
        filepath = avgfolder + filename
        tmpfilepath = avgfolder + tmpfilename
        if os.path.exists(filepath):
            if not loverwrite:
                age = datetime.fromtimestamp(os.path.getmtime(filepath))
                # if the source file is newer than the sink file, or if the sink file is a stub, recompute; otherwise skip
                if age > srcage and os.path.getsize(filepath) > 1e5: lskip = True
                # N.B.: NetCDF files smaller than 100kB are usually incomplete header fragments from a previous crash
            if not lskip: os.remove(filepath)  # recompute
    # depending on the last modification time of the file or the overwrite setting, start computation or skip
    if lskip:
        # print message
        skipmsg = "\n{:s} >>> Skipping: file '{:s}' in dataset '{:s}' already exists and is newer than source file.".format(pidstr, filename, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
        logger.info(skipmsg)
    else:
        ## actually load datasets
        source = loadfct()  # load source
        # check period
        if 'period' in source.atts and dataargs.periodstr != source.atts.period:  # a NetCDF attribute
            raise DateError("Specified period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr, source.atts.period))
        # print message
        if lclim:
            opmsgstr = "Extracting '{:s}'-type Point Data from Climatology ({:s})".format(stndata.name, periodstr)
        elif lts:
            opmsgstr = "Extracting '{:s}'-type Point Data from Time-series".format(stndata.name)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug: logger.info('\n' + str(source) + '\n')
        ## create new sink/target file
        # set attributes
        atts = source.atts.copy()
        atts['period'] = dataargs.periodstr if dataargs.periodstr else 'time-series'
        atts['name'] = dataset_name
        atts['station'] = stndata.name
        atts['title'] = '{:s} (Stations) from {:s} {:s}'.format(stndata.title, dataset_name, mode.title())
        # make new dataset
        if lwrite:  # write to NetCDF file
            if os.path.exists(tmpfilepath): os.remove(tmpfilepath)  # remove old temp files
            sink = DatasetNetCDF(folder=avgfolder, filelist=[tmpfilename], atts=atts, mode='w')
        else:
            sink = Dataset(atts=atts)  # only create dataset in memory
        # initialize processing
        CPU = CentralProcessingUnit(source, sink, varlist=varlist, tmp=False, feedback=ldebug)
        # extract data at station locations
        CPU.Extract(template=stndata, flush=True)
        # get results
        CPU.sync(flush=True)
        # print dataset
        if not lparallel and ldebug: logger.info('\n' + str(sink) + '\n')
        # write results to file
        if lwrite:
            sink.sync()
            writemsg = "\n{:s} >>> Writing to file '{:s}' in dataset {:s}".format(pidstr, filename, dataset_name)
            writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, filepath)
            logger.info(writemsg)
            # rename file to proper name
            if not lreturn:
                sink.unload(); sink.close(); del sink  # destroy all references
                if os.path.exists(filepath): os.remove(filepath)  # remove old file
                os.rename(tmpfilepath, filepath)
            # N.B.: there is no temporary file if the dataset is returned, because an open file can't be renamed
        # clean up and return
        source.unload(); del source  #, CPU
        if lreturn:
            return sink  # return dataset for further use (NetCDF file still open!)
        else:
            return 0  # "exit code"
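
# Example usage (illustrative only): extract a time-series at station locations. The station
# loader is wrapped in a zero-argument function (stnfct), because DatasetNetCDF instances do
# not pickle well under multiprocessing; 'loadEC_StnTS' and the dataargs contents are assumed
# names that depend on the local installation.
#
#   from functools import partial
#   stnfct = partial(loadEC_StnTS, station='ecprecip')  # deferred station-template loading
#   performExtraction(dataset='WRF', mode='time-series', stnfct=stnfct,
#                     dataargs=dict(experiment='max-ctrl', domains=2),
#                     loverwrite=False, lwrite=True, ldebug=True)
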
def generateBiasCorrection(dataset, mode, dataargs, obs_dataset, bc_method, bc_args,
                           loverwrite=False, lgzip=None, tag=None, ldebug=False,
                           lparallel=False, pidstr='', logger=None):
    ''' worker function to generate a bias-correction object for a given dataset '''
    # input checking
    if not isinstance(dataset, basestring): raise TypeError
    if not isinstance(dataargs, dict): raise TypeError  # all dataset arguments are kwargs
    # logging
    if logger is None:  # make new logger
        logger = logging.getLogger()  # new logger
        logger.addHandler(logging.StreamHandler())
    else:
        if isinstance(logger, basestring):
            logger = logging.getLogger(name=logger)  # connect to existing one
        elif not isinstance(logger, logging.Logger):
            raise TypeError('Expected logger ID/handle in logger KW; got {}'.format(str(logger)))
    ## extract meta data from arguments
    dataargs, loadfct, srcage, datamsgstr = getMetaData(dataset, mode, dataargs, lone=False)
    dataset_name = dataargs.dataset_name
    periodstr = dataargs.periodstr
    avgfolder = dataargs.avgfolder
    # parse bias-correction options
    bc_args = bc_args.copy()  # first copy, then modify...
    # initialize BiasCorrection class instance
    BC = getBCmethods(bc_method, **bc_args)
    # get folder for target dataset and do some checks
    picklefile = BC.picklefile(obs_name=obs_dataset.name, gridstr=dataargs.grid,
                               domain=dataargs.domain, tag=tag)
    if ldebug: picklefile = 'test_' + picklefile
    picklepath = '{:s}/{:s}'.format(avgfolder, picklefile)
    # check if we are overwriting an existing file
    if not os.path.exists(avgfolder):
        raise IOError("Dataset folder '{:s}' does not exist!".format(avgfolder))
    lskip = False  # else just go ahead
    if os.path.exists(picklepath) and not loverwrite:
        age = datetime.fromtimestamp(os.path.getmtime(picklepath))
        # if the source file is newer than the sink file, or if the sink file is a stub, recompute; otherwise skip
        if age > srcage:
            lskip = True
            if hasattr(obs_dataset, 'filepath') and obs_dataset.filepath is not None:
                obsage = datetime.fromtimestamp(os.path.getmtime(obs_dataset.filepath))
                if age < obsage: lskip = False
    # depending on the last modification time of the file or the overwrite setting, start computation or skip
    if lskip:
        # print message
        skipmsg = "\n{:s} >>> Skipping: Bias-correction '{:s}' for dataset '{:s}' already exists and is newer than source file.".format(pidstr, BC.long_name, dataset_name)
        skipmsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, picklepath)
        logger.info(skipmsg)
        del BC
    else:
        ## actually load datasets
        dataset = loadfct()  # load source data
        # check period
        if 'period' in dataset.atts and dataargs.periodstr != dataset.atts.period:  # a NetCDF attribute
            raise DateError("Specified period is inconsistent with netcdf records: '{:s}' != '{:s}'".format(periodstr, dataset.atts.period))
        # print message
        if mode == 'climatology':
            opmsgstr = 'Bias-correcting Climatology ({:s}) using {:s}'.format(periodstr, BC.long_name)
        elif mode == 'time-series':
            opmsgstr = 'Bias-correcting Time-series using {:s}'.format(BC.long_name)
        elif mode[-5:] == '-mean':
            opmsgstr = 'Bias-correcting {:s}-Mean ({:s}) using {:s}'.format(mode[:-5], periodstr, BC.long_name)
        else:
            raise NotImplementedError("Unrecognized Mode: '{:s}'".format(mode))
        # print feedback to logger
        logger.info('\n{0:s} *** {1:^65s} *** \n{0:s} *** {2:^65s} *** \n'.format(pidstr, datamsgstr, opmsgstr))
        if not lparallel and ldebug: logger.info('\n' + str(dataset) + '\n')
        # N.B.: data are not loaded immediately but on demand; this way I/O and computing are further
        #       disentangled and not all variables are always needed
        # "train", i.e. optimize fit parameters
        BC.train(dataset, obs_dataset)
        # print bias-correction summary and validation statistics
        if not lparallel and ldebug:
            logger.info('\n' + str(BC) + '\n')
            print("Bias-correction Statistics:")
            BC.validate(dataset, obs_dataset, lprint=True)
            print('')
        ## pickle bias-correction object with trained parameters
        # open file and save pickle
        if os.path.exists(picklepath): os.remove(picklepath)
        if lgzip:
            op = gzip.open
            picklepath += '.gz'
        else:
            op = open
        with op(picklepath, 'wb') as filehandle:
            pickle.dump(BC, filehandle, protocol=-1)  # use the highest available (binary) pickle protocol
        if not os.path.exists(picklepath):
            raise IOError("Error while saving Pickle to '{0:s}'".format(picklepath))
        # print feedback
        writemsg = "\n{:s} >>> Generation of BiasCorrection '{:s}' for Dataset '{:s}' complete.".format(pidstr, bc_method, dataset_name)
        writemsg += "\n{:s} >>> ('{:s}')\n".format(pidstr, picklepath)
        logger.info(writemsg)
        # clean up
        dataset.unload(); del dataset, BC
    return 0  # "exit code"
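
# Example usage (a minimal sketch): train and pickle a bias-correction object against an
# observational climatology. 'loadNRCan', the grid name, and the method name 'Delta' are
# assumptions; only the generateBiasCorrection signature above is defined in this module.
#
#   obs = loadNRCan(grid='arb2_d02', period=(1979, 1994))  # hypothetical obs loader (a Dataset)
#   generateBiasCorrection(dataset='WRF', mode='climatology',
#                          dataargs=dict(experiment='max-ctrl', domains=2, period=(1979, 1994)),
#                          obs_dataset=obs, bc_method='Delta', bc_args=dict(),
#                          loverwrite=False, lgzip=True, tag=None, ldebug=True)
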