Пример #1
0
def cdrop(obj, rm=True, force=False):
    """
    Deletes the cached file for a CliMAF object, if it exists

    Args:
     obj (cobject or string) : object to delete, or its string representation (CRS)

     force (bool) : should we delete the object even if it is 'protected'

     rm (bool) : for advanced use only; should we actually delete (rm) the file,
      or just forget it in CliMAF cache index

    Returns:
     None if object does not exists, False if failing to delete, True if OK

    Example ::

    >>> dg=ds(project='example', simulation='AMIPV6ALB2G', variable='tas', period='1980-1981')
    >>> f=cfile(dg)
    >>> os.system('ls -al '+f)
    >>> cdrop(dg)

    """
    global crs2filename
    global dropped_crs

    # Derive the CRS string from the argument; repr() replaces the
    # Python-2-only backquote syntax
    if isinstance(obj, cobject):
        crs = repr(obj)
        # Datasets are cached through an implicit 'select' operation
        if isinstance(obj, cdataset):
            crs = "select(" + crs + ")"
    elif isinstance(obj, str):
        crs = obj
    else:
        clogger.error("%s is not a CliMAF object" % repr(obj))
        return
    if crs not in crs2filename:
        clogger.info("%s is not cached" % crs)
        return None
    clogger.info("Discarding cached value for %s (expect if protected)" % crs)
    fil = crs2filename[crs]
    if not rm:
        # As documented : only forget the entry in the cache index
        # (the previous code silently did nothing in that case)
        crs2filename.pop(crs)
        dropped_crs.append(crs)
        return True
    try:
        if force:
            os.system("chmod +w " + fil)
        # A write-protected file is considered 'protected' : keep it
        if not os.access(fil, os.W_OK):
            clogger.info("Object %s is protected" % crs)
            return
        path_file = os.path.dirname(fil)
        os.remove(fil)
        crs2filename.pop(crs)
        dropped_crs.append(crs)
        # Best effort : remove the containing directory if now empty
        try:
            os.rmdir(path_file)
        except OSError as ex:
            clogger.warning(ex)
        return True
    except OSError:
        # Narrowed from a bare 'except:' which hid unrelated errors
        clogger.warning(
            "When trying to remove %s : file does not exist in cache" % crs)
        return False
Пример #2
0
def set_variable(obj, varname, format) :
    """
    Rename to VARNAME the variable carried by OBJ.

    FORMAT states how OBJ is represented : 'file' (a NetCDF filename) or
    'MaskedArray' (not yet implemented).  Also sets the variable long_name
    using CF convention (TBD).

    Returns True when the renaming succeeded, None otherwise (including
    when OBJ is None or the variable already has the requested name).
    """
    if obj is None:
        return None
    long_name = CFlongname(varname)
    if format == 'file':
        current_name = varOfFile(obj)
        if current_name == varname:
            return
        rename_cmd = "ncrename -v %s,%s %s >/dev/null 2>&1" % (current_name, varname, obj)
        if os.system(rename_cmd) != 0:
            clogger.error("Issue with changing varname to %s in %s" % (varname, obj))
            return None
        clogger.debug("Varname changed to %s in %s" % (varname, obj))
        # Also set the CF long_name attribute on the renamed variable
        atted_cmd = "ncatted -a long_name,%s,o,c,%s %s" % (varname, long_name, obj)
        if os.system(atted_cmd) != 0:
            clogger.error("Issue with changing long_name for var %s in %s" %
                          (varname, obj))
            return None
        return True
    elif format == 'MaskedArray':
        clogger.warning('TBD - Cannot yet set the varname for MaskedArray')
    else:
        clogger.error('Cannot handle format %s' % format)
Пример #3
0
def csync(update=False) :
    """
    Write cache dictionary to disk

    If arg `update` is True, first updates dictionary from actual
    cache file content

    Returns: None
    """
    import pickle
    global cacheIndexFileName

    # Check whether the cache index matches the files actually present
    # in cache; if not, the index is rebuilt from file content
    if update :
        clogger.warning("Listing crs from files present in cache")
        files_in_cache = sorted(list_cache())
        # crs2filename maps CRS -> filename, so compare filename lists
        files_in_index = sorted(crs2filename.values())
        if files_in_index != files_in_cache:
            clogger.warning("Rebuilding cache index")
            rebuild()

    # Save to disk; a context manager replaces the removed/deprecated
    # file() builtin and guarantees the handle is closed on error too
    with open(os.path.expanduser(cacheIndexFileName), "w") as cacheIndexFile:
        pickle.dump(crs2filename, cacheIndexFile)
Пример #4
0
def generateUniqueFileName_safe(expression, operator=None, format="nc"):
    """ Generate a filename path from string EXPRESSION and FILEFORMAT, unique for the
    expression and the set of cache directories currently listed in cache.cachedirs
    OPERATOR may be a function that provides a prefix, using EXPRESSION

    This uses hashlib.sha224, which are truncated to 3 (or more) characters.
    More characters are used if a shorter name is already in use for another
    expression in one of the known cache directories

    Generated names drive a structure where each directory name 1 or 2
    characters and file names have no more characters

    Exits (raises SystemExit) if uniqueness is unachievable (quite unexpectable !) """
    #
    import sys
    if format is None:
        return ""
    prefix = ""
    if operator is not None:
        prefix2 = operator(expression)
        if prefix2 is not None:
            prefix = prefix2 + "/"
    full = hashlib.sha224(expression).hexdigest()
    number = fileNameLength
    guess = full[0:number - 1]
    existing = searchFile(prefix + stringToPath(guess, directoryNameLength) +
                          "." + format)
    if existing:
        readCRS = getCRS(existing)
        # Update index if needed
        if readCRS not in crs2filename:
            clogger.warning(
                "existing data %s in file %s was not yet registered in cache index"
                % (readCRS, existing))
            crs2filename[readCRS] = existing
    # Lengthen the hash prefix until it is unused, or used by the very
    # same expression
    while (existing is not None) and (readCRS != expression):
        clogger.debug("must skip %s which CRS is %s" %
                      (existing, getCRS(existing)))
        number += 2
        if number >= len(full):
            # Previous code concatenated str + int (a TypeError) and then
            # evaluated the bare name 'exit' (a no-op); format the message
            # properly and really abort, as the docstring promises
            clogger.critical("Critical issue in cache : %d digits is not enough for %s"
                             % (len(full), expression))
            sys.exit(1)
        guess = full[0:number - 1]
        existing = searchFile(prefix +
                              stringToPath(guess, directoryNameLength) + "." +
                              format)
        if existing:
            readCRS = getCRS(existing)
    rep = currentCache + "/" + prefix + stringToPath(
        full[0:number - 1], directoryNameLength) + "." + format
    rep = os.path.expanduser(rep)
    # Create the relevant directory, so that user scripts don't have to care
    dirn = os.path.dirname(rep)
    if not os.path.exists(dirn):
        os.makedirs(dirn)
    clogger.debug("returning %s" % rep)
    return rep
Пример #5
0
def csync(update=False):
    """
    Merges current in-memory cache index and current on-file cache index
    for updating both

    If arg `update` is True, additionally ensures consistency between files
    set and index content, either :

    - if cache.stamping is true, by reading CRS in all files
    - else, by removing files which are not in the index; this may erase
      result files which have been computed by another running
      instance of CliMAF
    """
    #
    import pickle
    global cacheIndexFileName
    global dropped_crs

    # Merge index on file and index in memory; entries dropped during this
    # session must not be resurrected from the on-disk index
    file_index = cload(True)
    for crs in dropped_crs:
        file_index.pop(crs, None)
    crs2filename.update(file_index)

    # check if cache index is up to date; if not enforce consistency
    if update:
        clogger.info("Listing crs from files present in cache")
        files_in_cache = sorted(list_cache())
        files_in_index = sorted(crs2filename.values())
        if files_in_index != files_in_cache:
            if stamping:
                clogger.info("Rebuilding cache index from file content")
                rebuild()
            else:
                clogger.warning(
                    'In no stamp mode, there is no way to seriously identify CRS from files in cache  !'
                )
                # TODO : in no-stamp mode we could remove cache files whose
                # content is not known, but this is an issue in concurrent
                # mode; empty files should also be removed, as soon as file
                # creation will be atomic enough
    # Save index to disk
    fn = os.path.expanduser(cacheIndexFileName)
    try:
        with open(fn, "w") as cacheIndexFile:
            pickle.dump(crs2filename, cacheIndexFile)
        # Dropped entries are now reflected on disk
        dropped_crs = []
    except Exception:
        if update:
            # 'files_in_cache' is bound above whenever update is True
            # (was : len(files_in_cache > 0), a TypeError)
            if os.path.isfile(fn) and len(files_in_cache) > 0:
                clogger.error("Issue when writing cache index %s" % fn)
Пример #6
0
def cimport(cobject, crs):
    """
    Import an external object into CliMAF cache under CRS expression CRS.

    Args:
      cobject : a filename (str), or a numpy MaskedArray (not yet supported)
      crs (str) : the CliMAF Reference Syntax expression to register it under
    """
    # The original calls passed extra args without a %s placeholder, so
    # the argument was never interpolated into the log message
    clogger.debug("cimport called with argument %s", cobject)
    clogger.debug("should check syntax of arg 'crs' -TBD")
    clogger.warning("cimport is not for the dummies - Playing at your own risks !")
    import numpy
    import numpy.ma
    if isinstance(cobject, numpy.ma.MaskedArray):
        clogger.debug("for now, use a file for importing - should revisit - TBD")
        clogger.error("not yet implemented for Masked Arrays - TBD")
    elif isinstance(cobject, str):
        cache.register(cobject, crs)
    else:
        clogger.error("argument is not a Masked Array nor a filename : %s", cobject)
Пример #7
0
def generateUniqueFileName(expression, operator=None, format="nc"):
    """ Generate a filename path from string EXPRESSION and FILEFORMAT, unique for the
    expression and the set of cache directories currently listed in cache.cachedirs
    OPERATOR may be a function that provides a prefix, using EXPRESSION

    This uses hashlib.sha224, which are truncated to 3 (or more) characters.
    More characters are used if a shorter name is already in use for another
    expression in one of the known cache directories

    Generated names drive a structure where each directory name 1 or 2
    characters and file names have no more characters

    Exits (raises SystemExit) if uniqueness is unachievable (quite unexpectable !) """
    #
    import hashlib
    import sys
    directoryNameLength = 2
    #
    if format is None:
        return ""
    prefix = ""
    if operator is not None:
        prefix2 = operator(expression)
        if prefix2 is not None:
            prefix = prefix2 + "/"
    full = hashlib.sha224(expression).hexdigest()
    number = 4
    guess = full[0:number - 1]
    existing = searchFile(prefix + stringToPath(guess, directoryNameLength) + "." + format)
    if existing:
        readCRS = getCRS(existing)
        # Update index if needed
        if readCRS not in crs2filename:
            clogger.warning("existing data %s in file %s was not yet registered in cache index"
                            % (readCRS, existing))
            crs2filename[readCRS] = existing
    # Lengthen the hash prefix until it is unused, or used by the very
    # same expression
    while (existing is not None) and (readCRS != expression):
        clogger.debug("must skip %s which CRS is %s" % (existing, getCRS(existing)))
        number += 2
        if number >= len(full):
            # Was : string + int concatenation (TypeError) followed by the
            # bare name 'exit' (a no-op); format properly and really abort
            clogger.critical("Critical issue in cache : %d digits is not enough for %s"
                             % (len(full), expression))
            sys.exit(1)
        guess = full[0:number - 1]
        existing = searchFile(prefix + stringToPath(guess, directoryNameLength) + "." + format)
        if existing:
            readCRS = getCRS(existing)
    rep = currentCache + "/" + prefix + stringToPath(full[0:number - 1], directoryNameLength) + "." + format
    rep = os.path.expanduser(rep)
    # Create the relevant directory, so that user scripts don't have to care
    dirn = os.path.dirname(rep)
    if not os.path.exists(dirn):
        os.makedirs(dirn)
    clogger.debug("returning %s" % rep)
    return rep
Пример #8
0
def efile(obj, filename, forced=False):
    """
    Create a single file for an ensemble of CliMAF objects (launch computation if needed).

    This is a convenience function. Such files are not handled in CliMAF cache

    Args:

        obj (CliMAF object) : an ensemble of CliMAF objects ('cens' object)

        filename (str) : output filename. It will include a field for each
         ensemble's member, with a variable name suffixed by the member
         label (e.g. : tas_CNRM-CM, tas_IPSL-CM... ) (more formally :
         'var(obj.members[n])'_'obj.labels[n]')

        forced (logical, optional) : if True, CliMAF will override the file
         'filename' if it already exists

    Raises:
        Climaf_Driver_Error : when 'filename' exists and forced is False,
         or when an ncrename/ncks command fails
    """
    # Guard clause (typo fixed : 'objet' -> 'object')
    if not isinstance(obj, classes.cens):
        clogger.warning("object is not a 'cens' object")
        return

    if os.path.isfile(filename):
        if forced:
            os.system("rm -rf %s" % filename)
            clogger.warning("File '%s' already existed and has been overwritten" % filename)
        else:
            raise Climaf_Driver_Error("File '%s' already exists: use 'forced=True' to override it" % filename)

    for memb, lab in zip(obj.members, obj.labels):
        ffile = cfile(memb)
        # Rename the member's variable with the label suffix in a temp file,
        # then append that field to the target file
        f = tempfile.NamedTemporaryFile(suffix=".nc")
        try:
            command = "ncrename -O -v %s,%s_%s %s %s" % (varOf(memb), varOf(memb), lab, ffile, f.name)
            if os.system(command) != 0:
                raise Climaf_Driver_Error("ncrename failed : %s" % command)

            command2 = "ncks -A %s %s" % (f.name, filename)
            if os.system(command2) != 0:
                raise Climaf_Driver_Error("Issue when merging %s and %s (using command: %s)"
                                          % (f.name, filename, command2))
        finally:
            # NamedTemporaryFile is deleted on close; previously the temp
            # file leaked until GC when a command failed
            f.close()
Пример #9
0
def cdrop(obj, rm=True):
    """
    Deletes the cached file for a CliMAF object, if it exists

    Args:
     obj (cobject or string) : object to delete, or its string representation (CRS)

     rm (bool) : for advanced use only; should we actually delete (rm) the file,
      or just forget it in CliMAF cache index

    Returns:
     None if object does not exists, False if failing to delete, True if OK

    Example ::

    >>> dg=ds(project='example', simulation='AMIPV6ALB2G', variable='tas', period='1980-1981')
    >>> f=cfile(dg)
    >>> os.system('ls -al '+f)
    >>> cdrop(dg)

    """
    global crs2filename

    # repr() replaces the Python-2-only backquote syntax
    if isinstance(obj, cobject):
        crs = repr(obj)
        # Datasets are cached through an implicit 'select' operation
        if isinstance(obj, cdataset):
            crs = "select(" + crs + ")"
    elif isinstance(obj, str):
        crs = obj
    else:
        clogger.error("%s is not a CliMAF object" % repr(obj))
        return
    if crs not in crs2filename:
        clogger.info("%s is not cached" % crs)
        return None
    clogger.info("discarding cached value for " + crs)
    # Forget the entry in the index in all cases
    fil = crs2filename.pop(crs)
    if not rm:
        # As documented : the index entry was successfully forgotten
        # (the previous code fell through and returned None here)
        return True
    try:
        os.remove(fil)
        return True
    except OSError:
        # Narrowed from a bare 'except:' which hid unrelated errors
        clogger.warning("When trying to remove %s : file does not exist in cache" % crs)
        return False
Пример #10
0
def rebuild():
    """
    Rebuild the in-memory content of CliMAF cache index

    Returns the rebuilt index (a CRS -> filename dict), or None when not
    in 'stamping' mode (the CRS cannot be read back from files otherwise).
    """
    global crs2filename

    if not stamping:
        clogger.warning(
            "Cannot rebuild cache index, because we are not in 'stamping' mode"
        )
        return None
    # List the cache content first, then start from an empty index
    cached_files = list_cache()
    crs2filename.clear()
    for cached_file in cached_files:
        crs_of_file = getCRS(cached_file)
        if not crs_of_file:
            # A file without a readable CRS stamp is useless : discard it
            os.system('rm -f ' + cached_file)
            clogger.warning("File %s is removed" % cached_file)
        else:
            crs2filename[crs_of_file] = cached_file
    return crs2filename
Пример #11
0
def selectGenericFiles(urls, **kwargs):
    """
    Allow to describe a ``generic`` file organization : the list of files returned 
    by this function is composed of files which :

    - match the patterns in ``url`` once these patterns are instantiated by 
      the values in kwargs, and 

     - contain the ``variable`` provided in kwargs

     - match the `period`` provided in kwargs

    In the pattern strings, no keyword is mandatory

    Example :

    >>> selectGenericFiles(project='my_projet',model='my_model', simulation='lastexp', variable='tas', period='1980', urls=['~/DATA/${project}/${model}/*${variable}*YYYY*.nc)']
    /home/stephane/DATA/my_project/my_model/somefilewith_tas_Y1980.nc

    In the pattern strings, the keywords that can be used in addition to the argument
    names (e.g. ${model}) are:
    
    - ${variable} : use it if the files are split by variable and 
      filenames do include the variable name, as this speed up the search

    - YYYY, YYYYMM, YYYYMMDD : use it for indicating the start date of
      the period covered by each file, if this is applicable in the
      file naming; use a second time for end date, if applicable
      (otherwise the assumption is that the whole year -resp. month or
      day- is included in the file

    - wildcards '?' and '*' for matching respectively one and any number of characters


    """
    rep=[]
    # Normalize the requested period to a period object
    period=kwargs['period']
    if type(period) is str : period=init_period(period)
    variable=kwargs['variable']
    # When a project-specific filename variable is provided, search with it,
    # and require that candidate files actually carry the variable
    mustHaveVariable=False
    if "filenameVar" in kwargs and kwargs['filenameVar'] :
        kwargs['variable']=kwargs['filenameVar']
        mustHaveVariable=True
    for l in urls :
        template=Template(l)
        # There is no use to look for files which path is not specific
        # to the required variable when we know it should
        if l.find("${variable}") < 0 and mustHaveVariable :
            continue
        #
        # Instantiate keywords in pattern with attributes values
        template=template.safe_substitute(**kwargs)
        #print "template after attributes replace : "+template
        #
        # Construct a pattern for globbing dates
        # (date keywords become fixed-width '?' wildcards)
        temp2=template
        dt=dict(YYYY="????",YYYYMM="??????",YYYYMMDD="????????")
        for k in dt : temp2=temp2.replace(k,dt[k])
        clogger.debug("Globbing on : "+temp2)
        lfiles=glob.glob(temp2)
        #
        # Analyze all filenames
        for f in lfiles :
            # print "looking at file"+f
            # Construct regexp for extracting dates from filename
            # NOTE(review): YYYYMMDD captures 10 digits here while the
            # keyword denotes an 8-digit date, and the glob above uses 8
            # '?' - looks inconsistent; confirm intended filename format
            dt=dict(YYYY="([0-9]{4})",YYYYMM="([0-9]{6})",
                    YYYYMMDD="([0-9]{10})")
            regexp=None
            # print "template before searching dates : "+template
            # Try the longest date keywords first, so that e.g. YYYYMM is
            # not mistaken for YYYY followed by two literal characters
            lkeys=dt.keys() ; lkeys.sort(reverse=True)
            for key in lkeys :
                # print "searchin "+key+" in "+template
                start=template.find(key)
                if (start>=0 ) :
                    # print "found "+key
                    # First occurrence of the keyword = start date of file
                    regexp=template.replace(key,dt[key],1)
                    hasEnd=False
                    start=regexp.find(key) 
                    if (start >=0 ) :
                        # Second occurrence = end date of file
                        hasEnd=True
                        regexp=regexp.replace(key,dt[key],1)
                    break
            #
            # Analyze file time period
            fperiod=None
            if regexp :
                # Turn remaining glob wildcards into regex equivalents, then
                # extract start (and maybe end) date by back-substitution
                regexp=regexp.replace("*",".*").replace("?",r".")
                # print "regexp for extracting dates : "+regexp
                start=re.sub(regexp,r'\1',f)
                if hasEnd :
                    end=re.sub(regexp,r'\2',f)
                    fperiod=init_period("%s-%s"%(start,end))
                else :
                    fperiod=init_period(start)
                #
                # Filter file time period against required period
            else :
                # No date keyword in the pattern : cannot filter on time
                if ( 'frequency' in kwargs and kwargs['frequency']=="fx") :
                    # Fixed fields : only check the variable
                    if (l.find("${variable}")>=0) or fileHasVar(f,variable) : 
                        clogger.debug("adding fixed field :"+f)
                        rep.append(f)
                else :
                    clogger.warning("Cannot yet filter files re. time using only file content. TBD")
                    rep.append(f)
            # NOTE(review): when regexp is None, 'not regexp' is True below,
            # so a file appended just above may be appended a second time
            # here if the variable check passes - confirm whether duplicates
            # are deduplicated downstream
            if (fperiod and period.intersects(fperiod)) or not regexp :
                # Filter against variable 
                if (l.find("${variable}")>=0) or fileHasVar(f,variable) : 
                    # Should check time period in the file if not regexp
                    # print "appending "+f
                    rep.append(f)
    return rep
Пример #12
0
    def __init__(self, name, *args, **kwargs):
        """
        Declare a project and its facets/attributes in CliMAF (see below)

        Args:
          name (string) : project name;
           do not use the chosen separator in it (see below)
          args (strings) : attribute names;
           they are free; do not use the chosen separator in it (see below); **CliMAF
           anyway will add attributes :
           project, simulation, variable, period, and domain**
          kwargs (dict) :
           can only be used with keywords :

            - ``sep`` or ``separator`` for indicating the symbol separating
              facets in the dataset syntax. Defaults to ".".
            - ``ensemble`` for declaring a list of attribute
              names which are allowed for defining an ensemble in
              this project ('simulation' is automatically allowed)

        Returns : a cproject object, which string representation is
        the pattern later used in CliMAF Reference Syntax for
        representing datasets in this project

        A 'cproject' is the definition of a set of attributes, or
        facets, which values will completely define a 'dataset' as
        managed by CliMAF. Its name is one of the possible keys
        for describing data locations (see
        :py:class:`~climaf.dataloc.dataloc`)

        For instance, cproject CMIP5, after its Data Reference Syntax,
        has attributes :
        experiment, model, rip (here called simulation), variable, frequency, realm, table, version


        **A number of projects are built-in**. See :py:mod:`~climaf.projects`

        A dataset in a cproject declared as ::

        >>> cproject('MINE','myfreq','myfacet',sep='_')

        will return ::

          ${project}_${simulation}_${variable}_${period}_${domain}_${myfreq}_${myfacet}

        and will have datasets represented as  e.g.::

          'MINE_hist_tas_[1980-1999]_global_decadal_gabu'

        while an example for built-in cproject CMIP5 will be::

          'CMIP5.historical.pr.[1980].global.monthly.CNRM-CM5.r1i1p1.mon.Amon.atmos.last'

        The attributes list should include all facets which are useful
        for distinguishing datasets from each other, and for computing
        datafile pathnames in the 'generic' organization (see
        :py:class:`~climaf.dataloc.dataloc`)

        A default value for a given facet can be specified, by providing a tuple
        (facet_name,default_value) instead of the facet name. This default value is
        however of lower priority than the value set using :py:func:`cdef`

        A project can be declared as having non-standard variable
        names, or variables that should undergo re-scaling; see
        :py:func:`~climaf.classes.calias`

        A project can be declared as having non-standard frequency names (this is
        used when accessing datafiles); see :py:func:`~climaf.classes.cfreqs`)

        """
        if name in cprojects:
            clogger.warning("Redefining project %s" % name)
        self.project = name
        #
        # Facets always provided by CliMAF, in this fixed order
        mandatory = ['project', 'simulation', 'variable', 'period', 'domain']
        self.facets = list(mandatory)
        self.facet_defaults = dict()
        for spec in args:
            # A tuple (facet_name, default_value) also declares a default
            if isinstance(spec, tuple):
                facet_name, facet_default = spec
                self.facet_defaults[facet_name] = facet_default
            else:
                facet_name = spec
            if facet_name not in mandatory:
                self.facets.append(facet_name)
        #
        # 'sep' is checked last, so it overrides 'separator' if both given
        self.separator = "."
        for key in ("separator", "sep"):
            if key in kwargs:
                self.separator = kwargs[key]
        cprojects[name] = self
        # Build the pattern for the datasets CRS for this cproject :
        # ${facet1}<sep>${facet2}<sep>...
        self.crs = self.separator.join("${%s}" % facet for facet in self.facets)
        # Create an attribute holding the list of facets which are allowed
        # for defining an ensemble; 'simulation' is always allowed
        self.attributes_for_ensemble = ['simulation']
        if 'ensemble' in kwargs:
            self.attributes_for_ensemble.extend(kwargs["ensemble"])
Пример #13
0
def clim_average_fast(dat, season):
    """
    Computes climatological averages on the annual cycle of a dataset, on the months
    specified with 'season', either:

    - the annual mean climatology (season => 'ann','annual','climato','clim','climatology','annual_average','anm')
    - seasonal climatologies (e.g. season = 'DJF' or 'djf' to compute the seasonal climatology
      over December-January-February; available seasons: DJF, MAM, JJA, SON, JFM, JAS, JJAS
    - individual monthly climatologies (e.g. season = 'january', 'jan', '1' or 1 to get
      the climatological January)
    - annual maximum or minimum (typically makes sense with the mixed layer depth)

    Note that you can use upper case or lower case characters to specify the months or seasons.

    clim_average computes the annual cycle for you.

      >>> dat= ....   # some dataset, with whatever variable
      >>> climds_JFM = clim_average(dat,'JFM')         # The climatology of dat over January-February-March
      >>> climds_ANM = clim_average(dat,'annual_mean') # The annual mean climatology
      >>> climds_September = clim_average(dat,'September') # The annual mean climatology of September
      >>> climds_September = clim_average(dat,9) # Same as previous example, with a float

    """
    #
    lowered = str(season).lower()
    uppered = str(season).upper()
    # -- Annual mean climatology : no annual cycle needed
    if lowered in ('ann', 'annual', 'climato', 'clim', 'climatology',
                   'annual_average', 'anm', 'annual_mean'):
        return time_average_fast(dat)
    #
    # -- Compute the annual cycle
    scyc = annual_cycle_fast(dat)
    #
    # -- Season name -> months (time steps) to select in the annual cycle;
    #    covers classic atmospheric, oceanic and biogeochemistry seasons
    season_to_months = {
        'DJF': '1,2,12',
        'DJFM': '1,2,3,12',
        'MAM': '3,4,5',
        'JJA': '6,7,8',
        'SON': '9,10,11',
        'JFM': '1,2,3',
        'JAS': '7,8,9',
        'JJAS': '6,7,8,9',
        'NDJ': '11,12,1',
        'AMJ': '4,5,6',
    }
    if uppered == 'DJF':
        clogger.warning('DJF is actually processed as JF....D. Maybe an issue for short periods !')
    selmonths = season_to_months.get(uppered)
    if selmonths:
        avg = ccdo_fast(scyc, operator='timmean -seltimestep,' + selmonths)
        # avg = ccdo(scyc,operator='timmean -selmon,'+selmonths)
    #
    # -- Individual months : accept full name, abbreviation or number
    month_aliases = {
        'january': '1', 'jan': '1', '1': '1',
        'february': '2', 'feb': '2', '2': '2',
        'march': '3', 'mar': '3', '3': '3',
        'april': '4', 'apr': '4', '4': '4',
        'may': '5', '5': '5',
        'june': '6', 'jun': '6', '6': '6',
        'july': '7', 'jul': '7', '7': '7',
        'august': '8', 'aug': '8', '8': '8',
        'september': '9', 'sep': '9', '9': '9',
        'october': '10', 'oct': '10', '10': '10',
        'november': '11', 'nov': '11', '11': '11',
        'december': '12', 'dec': '12', '12': '12',
    }
    selmonth = month_aliases.get(lowered)
    if selmonth:
        avg = ccdo_fast(scyc, operator='selmon,' + selmonth)
    #
    # -- Annual Maximum
    if lowered in ('max', 'annual max', 'annual_max'):
        avg = ccdo_fast(scyc, operator='timmax')
    #
    # -- Annual Minimum
    if lowered in ('min', 'annual min', 'annual_min'):
        avg = ccdo_fast(scyc, operator='timmin')
    #
    return avg
Пример #14
0
def selectLocalFiles(**kwargs):
    """
    Returns the shortest list of (local) files which include the data
    for the list of (facet,value) pairs provided

    Method : 
    
    - use datalocations indexed by :py:func:`~climaf.dataloc.dataloc` to 
      identify data organization and data store urls for these (facet,value) 
      pairs

    - check that data organization is a known one, i.e. is one of 'generic', 
      'CMIP5_DRS' or 'EM'
    
    - derive relevant filenames search function such as :
      py:func:`~climaf.dataloc.selectCmip5DrsFiles` from data
      organization scheme

    - pass urls and relevant facet values to this filenames search function

    Returns a single space-separated string of filenames, or None when no
    data location or no file matches.
    """
    rep=[]
    project=kwargs['project']
    simulation=kwargs['simulation']
    # 'variable' and 'period' are mandatory keys : accessing them raises
    # KeyError when absent (deliberate early validation); values are passed
    # downstream through **kwargs
    variable=kwargs['variable']
    period=kwargs['period']

    # Optional facets default to the wildcard
    if 'model' in kwargs : model=kwargs['model']
    else : model="*"
    if 'frequency' in kwargs : frequency=kwargs['frequency']
    else : frequency="*"

    ofu=getlocs(project=project, model=model, simulation=simulation, frequency=frequency)
    clogger.debug("locs="+repr(ofu))
    if ( len(ofu) == 0 ) :
        clogger.warning("no datalocation found for %s %s %s %s "%(project, model, simulation, frequency))
    for org,freq,urls in ofu :
        kwargs2=kwargs.copy()
        # Convert normalized frequency to project-specific frequency if applicable
        if "frequency" in kwargs and project in classes.frequencies :
            normfreq=kwargs2['frequency'] 
            if normfreq in classes.frequencies[project]: 
                kwargs2['frequency']=classes.frequencies[project][normfreq]
        #
        # Call organization-specific routine
        if (org == "EM") :
            rep.extend(selectEmFiles(**kwargs2))
        elif (org == "CMIP5_DRS") :
            rep.extend(selectCmip5DrsFiles(urls,**kwargs2))
        elif (org == "generic") :
            rep.extend(selectGenericFiles(urls, **kwargs2))
        else :
            raise Climaf_Data_Error("cannot process organization "+org+ \
                " for simulation "+simulation+" and model "+model+\
                " of project "+project)
    if (not ofu) :
        return None
    else :
        if (len(rep) == 0 ) :
            # 'urls' here is the value from the last iterated data location
            clogger.warning("no file found for %s, at these "
                            "data locations %s "%(repr(kwargs), repr(urls)))
            return None
    # Discard duplicates (assumes that sorting is harmless for later processing).
    # Build a new list instead of removing from 'rep' while iterating over it,
    # which skipped entries when three or more equal names were adjacent.
    rep.sort()
    deduped=[]
    for f in rep :
        if not deduped or f != deduped[-1] :
            deduped.append(f)
    # Assemble filenames in one single space-separated string
    return " ".join(deduped)
Пример #15
0
    def __init__(self,name, command, format="nc", canOpendap=False, 
                 commuteWithTimeConcatenation=False, commuteWithSpaceConcatenation=False, **kwargs):
        """
        Declare a script or binary as a 'CliMAF operator', and define a Python function with the same name

        Args:
          name (str): name for the CliMAF operator.
          command (str): script calling sequence, according to the syntax described below.
          format (str): script outputs format -- either 'nc' or 'png' or 'None'; defaults to 'nc'
          canOpendap (bool, optional): is the script able to use OpenDAP URIs ? default to False
          commuteWithTimeConcatenation (bool, optional): can the operation commute with concatenation
            of time periods ? set it to true, if the operator can be applied on time
            chunks separately, in order to allow for incremental computation / time chunking;
            defaults to False
          commuteWithSpaceConcatenation (bool, optional): can the operation commute with concatenation
            of space domains ? defaults to False (see commuteWithTimeConcatenation)
          **kwargs : possible keyword arguments, with keys matching '<outname>_var', for providing
            a format string allowing to compute the variable name for output 'outname' (see below).
        
        Returns:
          None
          
        The script calling sequence pattern string (arg 'command') indicates how to build the system call
        which actually launches the script, with a match between python objects and formal arguments;

        For introducing the syntax, please consider this example, with the following commands::
        
        >>> cscript('mycdo','cdo ${operator} ${in} ${out}')
        >>> # define some dataset
        >>> tas_ds = ds(project='example', simulation='AMIPV6', variable='tas', period='1980-1981')
        >>> # Apply operator 'mycdo' to dataset 'tas_ds', choosing a given 'operator' argument
        >>> tas_avg = mycdo(tas_ds,operator='timavg')
        
        CliMAF will later on launch this call behind the curtain::
        
        $ cdo tim_avg /home/my/tmp/climaf_cache/8a/5.nc /home/my/tmp/climaf_cache/4e/4.nc

        where :

        - the last filename is generated by CliMAF from the formal exprerssion describing 'tas_avg'
        - the first filename provide a file generated by CliMAF which includes the required data fot tas_ds

        There are a number of examples in module :download:`standard_operators
        <../climaf/standard_operators.py>`.

        **Detailed syntax**:

        -  formal arguments appear as : ``${argument}`` (in the example : ``${in}``, ``${out}``, ``${operator}`` )
        
        -  except for reserved keywords, arguments in the pattern will be
           replaced by the values for corresponding keywords used when invoking
           the diagnostic operator:

          - in the example above : argument ``operator`` is replaced by value ``timavg``,
            which is a keyword known to the external binary called, CDO  
        
        -  reserved argument keywords are :
        
         - **in, in_<digit>, ins, ins_<digit>, mmin** : they will be
           replaced by CliMAF managed filenames for input data, as
           deduced from dataset description or upstream computation; these
           filenames can actually be remote URLs (if the script can use
           OpenDAP, see args), local 'raw' data files, or CliMAF cache
           filenames
        
          -  **in** stands for the URL of the first dataset invoked in the
             operator call
        
          -  **in_<digit>** stands for the next ones, in the same order
        
          -  **ins** and **ins_<digit>** stand for the case where the script can
             select input from multiple input files or URLs (e.g. when the
             whole period to process spans over multiple files); in that case,
             a single string (surrounded with double quotes) will carry
             multiple URLs

          - **mmin** stands for the case where the script accepts an
            ensemble of datasets (only for first input stream
            yet). CliMAF will replace the keyword by a string
            composed of the corresponding input filenames (not surrounded
            by quotes - please add them yourself in declaration); see also
            ``labels`` below
        
         -  **var, var_<digit>** : when a script can select a variable in a
            multi-variable input stream, this is declared by adding this
            keyword in the calling sequence; CliMAF will replace it by the
            actual variable name to process; 'var' stands for first input
            stream, 'var_<digit>' for the next ones;

            - in the example above, we assume that external binary CDO is
              not tasked with selecting the variable, and that CliMAF must
              feed CDO with a datafile where it has already performed the
              selection
         
         
         - **period, period_<digit>** : when a script can select a time
           period in the content of a file or stream, it should declare it
           by putting this keyword in the pattern, which will be replaced at
           call time by the period written as <date1>-<date2>, where date is
           formated as YYYYMMDD ;

            - time intervals must be interpreted as [date1, date2[

            - 'period' stands for the first input_stream,

            - 'period_<n>' for the next ones, in the order of actual call;

           - in the example above, this keyword is not used, which means that
             CliMAF has to select the period upstream of feeding CDO with the
             data
         
         - **period_iso, period_iso_<digit>** : as for **period** above,
           except that the date formating fits CDO conventions : 

            - date format is ISO : YYYY-MM-DDTHH:MM:SS

            - interval is [date1,date2_iso], where date2_iso is 1 minute before
              date2

            - separator between dates is : ,  

         - **domain, domain_<digit>** : when a script can select a domain 
           in the input grid, this is declared by adding this
           keyword in the calling sequence; CliMAF will replace it by the
           domain definition if needed, as 'latmin,latmax,lonmin,lonmax' ;
           'domain' stands for first input stream, 'domain_<digit>' for the 
           next ones :

            - in the example above, we assume that external binary CDO is
              not tasked with selecting the domain, and that CliMAF must
              feed CDO with a datafile where it has already performed the
              selection
         
         - **out, out_<word>** : CliMAF provide file names for output
           files (if there is no such field, the script will have
           only 'side effects', e.g. launch a viewer). Main output
           file must be created by the script with the name provided
           at the location of argument ${out}. Using arguments like
           'out_<word>' tells CliMAF that the script provide some
           secondary output, which will be symbolically known in
           CliMAF syntax as an attribute of the main object; by
           default, the variable name of each output equals the name
           of the output (except for the main ouput, which variable
           name is supposed to be the same as for the first input);
           for other cases, see argument \*\*kwargs to provide a
           format string, used to derive the variable name from first
           input variable name as in e.g. :
           ``output2_var='std_dev(%s)'`` for the output labelled
           output2 (i.e. declared as '${out_output2}')

           - in the example above, we just apply the convention used by CDO,
             which expects that you provide an output filename as last
             argument on the command line. See example mean_and_sdev in doc
             for advanced usage.

         - **crs** : will be replaced by the CliMAF Reference Syntax expression
           describing the first input stream; can be useful for plot title
           or legend

         - **alias** : means that the script can make an on the fly re-scaling
           and renaming of a variable. Will be replaced by a string which 
           pattern is : 'new_varname,file_varname,scale,offset'. The script 
           should then transform on reading as new_varname = 
           file_varname * scale + offset

         - **units, units_<digit>** : means that the script can set the units 
           on-the-fly while reading one of the input streams

         - **missing** : means that the script can make an on-the-fly 
           transformation of a givent constant to missing values

         - **labels** : for script accepting ensembles, CliMAF will
           replace this keyword by a string bearing the labels
           associated with the ensemble, with delimiter $ as e.g. in:
           "CNRM-CM5 is fine$IPSL-CM5-LR is not bad$CCSM-29 is ..."

        """
        # Check that script name do not clash with an existing symbol 
        if name in sys.modules['__main__'].__dict__ and name not in scripts :
            clogger.error("trying to define %s as an operator, "
                          "while it exists as smthing else"%name)
            return None
        if name in scripts : clogger.warning("Redefining CliMAF script %s"%name)
        #
        # Check now that script is executable (first word of the command,
        # with any '(' stripped, must be resolvable by 'which')
        scriptcommand=command.split(' ')[0].replace("(","")
        ex=subprocess.Popen(['which',scriptcommand], stdout=subprocess.PIPE)
        if ex.wait() != 0 :
            # NOTE(review): Climaf_Operator_Error is instantiated but not
            # raised here (nor at the similar call sites below) -- confirm
            # whether the exception class raises/logs in its __init__
            Climaf_Operator_Error("defining %s : command %s is not "
                                  "executable"%(name,scriptcommand))
        # Full path of the executable; assigned but not used further in this
        # method (kept for its side effect of draining the pipe)
        executable=ex.stdout.read().replace('\n','')
        #
        # Analyze inputs field keywords and populate dict 
        # attribute 'inputs' with some properties
        # Pattern matches ${in}, ${ins}, ${mmin}, ${in_<n>}, ${ins_<n>} ...
        self.inputs=dict()
        commuteWithEnsemble=True
        it=re.finditer(
            r"\${(?P<keyw>(?P<mult>mm)?in(?P<serie>s)?(_(?P<n>([\d]+)))?)}",
            command)
        for oc in it : 
            # Unnumbered input keyword gets rank 0
            if (oc.group("n") is not None) : rank=int(oc.group("n"))
            else : rank=0
            if rank in self.inputs :
                Climaf_Operator_Error(
                    "When defining %s : duplicate declaration for input #%d"%\
                        (name,rank))
            serie=(oc.group("serie") is not None)       # 'ins' form : multiple files
            multiple=(oc.group("mult") is not None)     # 'mmin' form : ensemble
            if multiple :
                if rank  != 0 : 
                    raise Climaf_Operator_Error(
                        "Only first operand may accept members")
                if serie : 
                    raise Climaf_Operator_Error(
                        "Operand %s cannot both accept"
                        "members and files set"%oc.group("keyw"))
                # An ensemble-accepting script cannot commute with ensembles
                commuteWithEnsemble=False
            self.inputs[rank]=(oc.group("keyw"),multiple,serie)
        if len(self.inputs)==0 : 
            Climaf_Operator_Error(
                "When defining %s : command %s must include at least one of "
                "${in} ${ins} ${mmin} or ${in_..} ... for specifying how CliMAF"
                " will provide the input filename(s)"% (name,command))
        #print self.inputs
        # Check that declared input ranks form a contiguous sequence
        # (either 0,2,3... or 1,2,3...)
        for i in range(len(self.inputs)) :
            if i+1 not in self.inputs and not ( i == 0 and 0  in self.inputs) :
                Climaf_Operator_Error(
                    "When defining %s : error in input sequence for rank %d"%\
                        (name,i+1))
        #
        # Check if command includes an argument allowing for 
        # providing an output filename; without one, the script has only
        # side effects and its declared format is discarded
        if command.find("${out") < 0 : format=None
        #
        # Search in call arguments for keywords matching "<output_name>_var" 
        # which may provide format string for 'computing' outputs variable 
        # name from input variable name
        outvarnames=dict() ; pattern=r"^(.*)_var$"
        for p in kwargs : 
            if re.match(pattern,p):
                outvarnames[re.findall(pattern,p)[0]]=kwargs[p]
        #clogger.debug("outvarnames = "+`outvarnames`)
        #
        # Analyze outputs names , associated variable names 
        # (or format strings), and store it in attribute dict 'outputs' 
        self.outputs=dict()
        it=re.finditer(r"\${out(_(?P<outname>[\w-]*))?}",command)
        for occ in it :
            outname=occ.group("outname") 
            if outname is not None :
                if (outname in outvarnames) : 
                    self.outputs[outname]=outvarnames[outname]
                else :
                    self.outputs[outname]=outname
            else:
                # Main output (plain ${out}) : variable name is the same as
                # the first input's ('%s' format string)
                self.outputs[None]="%s"
        #clogger.debug("outputs = "+`self.outputs`)
        #
        # Derive capability flags from the presence of reserved keywords
        canSelectVar= (command.find("${var}") > 0 )
        canAggregateTime=(command.find("${ins}") > 0 or command.find("${ins_1}") > 0)
        canAlias= (command.find("${alias}") > 0 )
        canMissing= (command.find("${missing}") > 0 )
        canSelectTime=False
        if command.find("${period}") > 0  or command.find("${period_1}") > 0 :
            canSelectTime=True
        if command.find("${period_iso}") > 0  or command.find("${period_iso_1}") > 0 :
            canSelectTime=True
        canSelectDomain=(command.find("${domain}") > 0  or command.find("${domain_1}") > 0)
        #
        self.name=name
        self.command=command
        self.flags=scriptFlags(canOpendap, canSelectVar, canSelectTime, \
            canSelectDomain, canAggregateTime, canAlias, canMissing,\
            commuteWithEnsemble,\
            commuteWithTimeConcatenation, commuteWithSpaceConcatenation )
        self.outputFormat=format
        # Register the operator in the global scripts table
        scripts[name]=self

        # Init doc string for the operator
        doc="CliMAF wrapper for command : %s"%self.command
        # try to get a better doc string from colocated doc/directory
        docfilename=os.path.dirname(__file__)+"/../doc/scripts/"+name+".rst"
        #print "docfilen= "+docfilename
        try:
            docfile=open(docfilename)
            doc=docfile.read()
            docfile.close()
        except:
            pass
        #
        # creates a function named as requested, which will invoke
        # capply with that name and same arguments
        # (Python 2 'exec ... in' statement : define the wrapper in this
        # module's globals, then re-export it into __main__'s namespace)
        defs='def %s(*args,**dic) :\n  """%s"""\n  return driver.capply("%s",*args,**dic)\n'\
                % (name,doc,name)
        exec defs in globals() #
        exec "from climaf.operators import %s"%name in \
            sys.modules['__main__'].__dict__
        clogger.debug("CliMAF script %s has been declared"%name)
Пример #16
0
def derive(project, derivedVar, Operator, *invars, **params) :
    """
    Define that 'derivedVar' is a derived variable in 'project', computed by
    applying 'Operator' to input streams which are datasets whose 
    variable names take the values in ``*invars`` and the parameter/arguments 
    of Operator take the values in ``**params``

    'project' may be the wildcard : '*'

    Example , assuming that operator 'minus' has been defined as ::
    
    >>> cscript('minus','cdo sub ${in_1} ${in_2} ${out}')
    
    which means that ``minus`` uses CDO for substracting the two datasets;
    you may define, for a given project 'CMIP5', a new variable e.g.
    for cloud radiative effect at the surface, named 'rscre',
    using the difference of values of all-sky and clear-sky net
    radiation at the surface by::
    
    >>> derive('CMIP5', 'rscre','minus','rs','rscs')

    You may then use this variable name at any location you 
    would use any other variable name

    Note : you may use wildcard '*' for the project

    Another example is rescaling or renaming some variable; 
    here, let us define how variable 'ta'
    can be derived from ERAI variable 't' :

    >>> derive('erai', 'ta','rescale', 't', scale=1., offset=0.)

    **However, this is not the most efficient way to do that**. 
    See :py:func:`~climaf.classes.calias()`

    Expert use : argument 'derivedVar' may be a dictionary, which
    keys are derived variable names and values are scripts outputs
    names; example ::
    
    >>> cscript('vertical_interp', 'vinterp.sh ${in} surface_pressure=${in_2} ${out_l500} ${out_l850} method=${opt}')
    >>> derive('*', {'z500' : 'l500' , 'z850' : 'l850'},'vertical_interp', 'zg', 'ps', opt='log'}
    
    """
    # Action : register the information in a dedicated dict which keys
    # are single derived variable names, and which will be used at the
    # object evaluation step
    # Also : some consistency checks w.r.t. script definition
    if Operator in scripts :
        if not isinstance(derivedVar,dict) : derivedVar=dict(out=derivedVar)
        for outname in derivedVar :
            if (outname != 'out' and
                (not getattr(Operator,"outvarnames",None)  
                 or outname not in Operator.outvarnames )):
                raise Climaf_Operator_Error(
                    "%s is not a named  ouput for operator %s; type help(%s)"%\
                        (outname,Operator,Operator))
            s=scripts[Operator]
            if s.inputs_number() != len(invars) :
                clogger.error("number of input variables for operator"
                              "%s is %d, which is inconsistent with "
                              "script declaration : %s"\
                              %(s.name,len(invars),s.command))
                return
            # TBD : check parameters number  ( need to build 
            # its list in cscript.init() )
            if project not in derived_variables :  
                derived_variables[project]=dict()
            derived_variables[project][derivedVar[outname]]=(Operator, outname, list(invars), params)
    elif Operator in operators :
        clogger.warning("Cannot yet handle derived variables based on internal operators")
    else : 
        clogger.error("second argument must be a script or operator, already declared")
Пример #17
0
    def __init__(self,
                 name,
                 command,
                 format="nc",
                 canOpendap=False,
                 commuteWithTimeConcatenation=False,
                 commuteWithSpaceConcatenation=False,
                 canSelectVar=False,
                 **kwargs):
        """
        Declare a script or binary as a 'CliMAF operator', and define a Python function with the same name

        Args:
          name (str): name for the CliMAF operator.
          command (str): script calling sequence, according to the syntax described below.
          format (str): script outputs format -- either 'nc', 'png', 'pdf', 'eps', 'None'
            or 'graph' ('graph' allows to the user to choose three different graphic output
            formats: 'png', 'pdf' or 'eps') or 'txt' (the text output are not managed by CliMAF,
            but only displayed - 'txt' allows to use e.g. 'ncdump -h' from inside CliMAF);
            defaults to 'nc'
          canOpendap (bool, optional): is the script able to use OpenDAP URIs ? default to False
          commuteWithTimeConcatenation (bool, optional): can the operation commute with concatenation
            of time periods ? set it to true, if the operator can be applied on time
            chunks separately, in order to allow for incremental computation / time chunking;
            defaults to False
          commuteWithSpaceConcatenation (bool, optional): can the operation commute with concatenation
            of space domains ? defaults to False (see commuteWithTimeConcatenation)
          **kwargs : possible keyword arguments, with keys matching '<outname>_var', for providing
            a format string allowing to compute the variable name for output 'outname' (see below).

        Returns:
          None

        The script calling sequence pattern string (arg 'command') indicates how to build the system call
        which actually launches the script, with a match between python objects and formal arguments;

        For introducing the syntax, please consider this example, with the following commands::

        >>> cscript('mycdo','cdo ${operator} ${in} ${out}')
        >>> # define some dataset
        >>> tas_ds = ds(project='example', simulation='AMIPV6ALB2G', variable='tas', period='1980-1981')
        >>> # Apply operator 'mycdo' to dataset 'tas_ds', choosing a given 'operator' argument
        >>> tas_avg = mycdo(tas_ds,operator='timavg')

        CliMAF will later on launch this call behind the curtain::

        $ cdo tim_avg /home/my/tmp/climaf_cache/8a/5.nc /home/my/tmp/climaf_cache/4e/4.nc

        where :

        - the last filename is generated by CliMAF from the formal expression describing 'tas_avg', and
          will receive the result
        - the first filename provides a file generated by CliMAF which includes the data required for tas_ds

        There are a number of examples declared in module :download:`standard_operators
        <../climaf/standard_operators.py>`.

        **Detailed syntax**:

        -  formal arguments appear as : ``${argument}`` (in the example : ``${in}``, ``${out}``, ``${operator}`` )

        -  except for reserved keywords, arguments in the pattern will be
           replaced by the values for corresponding keywords used when invoking
           the diagnostic operator:

          - in the example above : argument ``operator`` is replaced by value ``timavg``,
            which is a keyword known to the external binary called, CDO

        -  reserved argument keywords are :

         - **in, in_<digit>, ins, ins_<digit>, mmin** : they will be
           replaced by CliMAF managed filenames for input data, as
           deduced from dataset description or upstream computation; these
           filenames can actually be remote URLs (if the script can use
           OpenDAP, see args), local 'raw' data files, or CliMAF cache
           filenames

          -  **in** stands for the URL of the first dataset invoked in the
             operator call

          -  **in_<digit>** stands for the next ones, in the same order

          -  **ins** and **ins_<digit>** stand for the case where the script can
             select input from multiple input files or URLs (e.g. when the
             whole period to process spans over multiple files); in that case,
             a single string (surrounded with double quotes) will carry
             multiple URLs

          - **mmin** stands for the case where the script accepts an
            ensemble of datasets (only for first input stream
            yet). CliMAF will replace the keyword by a string
            composed of the corresponding input filenames (not surrounded
            by quotes - please add them yourself in declaration); see also
            ``labels`` below

         -  **var, var_<digit>** : when a script can select a variable in a
            multi-variable input stream, this is declared by adding this
            keyword in the calling sequence; CliMAF will replace it by the
            actual variable name to process; 'var' stands for first input
            stream, 'var_<digit>' for the next ones;

            - in the example above, we assume that external binary CDO is
              not tasked with selecting the variable, and that CliMAF must
              feed CDO with a datafile where it has already performed the
              selection


         - **period, period_<digit>** : when a script can select a time
           period in the content of a file or stream, it should declare it
           by putting this keyword in the pattern, which will be replaced at
           call time by the period written as <date1>-<date2>, where date is
           formated as YYYYMMDD ;

            - time intervals must be interpreted as [date1, date2[

            - 'period' stands for the first input_stream,

            - 'period_<n>' for the next ones, in the order of actual call;

           - in the example above, this keyword is not used, which means that
             CliMAF has to select the period upstream of feeding CDO with the
             data

         - **period_iso, period_iso_<digit>** : as for **period** above,
           except that the date formating fits CDO conventions :

            - date format is ISO : YYYY-MM-DDTHH:MM:SS

            - interval is [date1,date2_iso], where date2_iso is 1 minute before
              date2

            - separator between dates is : ,

         - **domain, domain_<digit>** : when a script can select a domain
           in the input grid, this is declared by adding this
           keyword in the calling sequence; CliMAF will replace it by the
           domain definition if needed, as 'latmin,latmax,lonmin,lonmax' ;
           'domain' stands for first input stream, 'domain_<digit>' for the
           next ones :

            - in the example above, we assume that external binary CDO is
              not tasked with selecting the domain, and that CliMAF must
              feed CDO with a datafile where it has already performed the
              selection

         - **out, out_<word>** : CliMAF provide file names for output
           files (if there is no such field, the script will have
           only 'side effects', e.g. launch a viewer). Main output
           file must be created by the script with the name provided
           at the location of argument ${out}. Using arguments like
           'out_<word>' tells CliMAF that the script provide some
           secondary output, which will be symbolically known in
           CliMAF syntax as an attribute of the main object; by
           default, the variable name of each output equals the name
           of the output (except for the main ouput, which variable
           name is supposed to be the same as for the first input);
           for other cases, see argument \*\*kwargs to provide a
           format string, used to derive the variable name from first
           input variable name as in e.g. :
           ``output2_var='std_dev(%s)'`` for the output labelled
           output2 (i.e. declared as '${out_output2}') or ``_var='std_dev(%s)'``
           for the default (main) output


           - in the example above, we just apply the convention used by CDO,
             which expects that you provide an output filename as last
             argument on the command line. See example mean_and_sdev in doc
             for advanced usage.

         - **crs** : will be replaced by the CliMAF Reference Syntax expression
           describing the first input stream; can be useful for plot title
           or legend

         - **alias** : used if the script can make an on the fly re-scaling
           and renaming of a variable. Will be replaced by a string which
           pattern is : 'new_varname,file_varname,scale,offset'. The script
           should then transform on reading as new_varname =
           file_varname * scale + offset

         - **units, units_<digit>** : means that the script can set the units
           on-the-fly while reading one of the input streams

         - **missing** : means that the script can make an on-the-fly
           transformation of a given constant to missing values

         - **labels** : for script accepting ensembles, CliMAF will
           replace this keyword by a string bearing the labels
           associated with the ensemble, with delimiter $ as e.g. in:
           "CNRM-CM5 is fine$IPSL-CM5-LR is not bad$CCSM-29 is ..."

        """
        # Check that script name do not clash with an existing symbol
        if name in sys.modules['__main__'].__dict__ and name not in scripts:
            clogger.error("trying to define %s as an operator, "
                          "while it exists as smthing else" % name)
            return None
        if name in scripts:
            clogger.warning("Redefining CliMAF script %s" % name)
        #
        # Check now that script is executable
        scriptcommand = command.split(' ')[0].replace("(", "")
        ex = subprocess.Popen(['which', scriptcommand], stdout=subprocess.PIPE)
        if ex.wait() != 0:
            Climaf_Operator_Error("defining %s : command %s is not "
                                  "executable" % (name, scriptcommand))
        executable = ex.stdout.read().replace('\n', '')
        #
        # Analyze inputs field keywords and populate dict
        # attribute 'inputs' with some properties
        self.inputs = dict()
        commuteWithEnsemble = True
        it = re.finditer(
            r"\${(?P<keyw>(?P<mult>mm)?in(?P<serie>s)?(_(?P<n>([\d]+)))?)}",
            command)
        for oc in it:
            if oc.group("n") is not None:
                rank = int(oc.group("n"))
            else:
                rank = 0
            if rank in self.inputs:
                Climaf_Operator_Error(
                    "When defining %s : duplicate declaration for input #%d" %
                    (name, rank))
            serie = (oc.group("serie") is not None)
            multiple = (oc.group("mult") is not None)
            if multiple:
                if rank != 0:
                    raise Climaf_Operator_Error(
                        "Only first operand may accept members")
                if serie:
                    raise Climaf_Operator_Error("Operand %s cannot both accept"
                                                "members and files set" %
                                                oc.group("keyw"))
                commuteWithEnsemble = False
            self.inputs[rank] = (oc.group("keyw"), multiple, serie)
        if len(self.inputs) == 0:
            Climaf_Operator_Error(
                "When defining %s : command %s must include at least one of "
                "${in} ${ins} ${mmin} or ${in_..} ... for specifying how CliMAF"
                " will provide the input filename(s)" % (name, command))
        # print self.inputs
        for i in range(len(self.inputs)):
            if i + 1 not in self.inputs and not (i == 0 and 0 in self.inputs):
                Climaf_Operator_Error(
                    "When defining %s : error in input sequence for rank %d" %
                    (name, i + 1))
        #
        # Check if command includes an argument allowing for
        # providing an output filename
        if command.find("${out") < 0:
            if format is not "txt":
                format = None
        #
        # Search in call arguments for keywords matching "<output_name>_var"
        # which may provide format string for 'computing' outputs variable
        # name from input variable name
        outvarnames = dict()
        pattern = r"^(.*)_var$"
        for p in kwargs:
            if re.match(pattern, p):
                outvarnames[re.findall(pattern, p)[0]] = kwargs[p]
        clogger.debug("outvarnames for script %s = %s" %
                      (name, repr(outvarnames)))
        #
        # Analyze outputs names , associated variable names
        # (or format strings), and store it in attribute dict 'outputs'
        self.outputs = dict()
        it = re.finditer(r"\${out(_(?P<outname>[\w-]*))?}", command)
        for occ in it:
            outname = occ.group("outname")
            if outname is not None:
                if outname in outvarnames:
                    self.outputs[outname] = outvarnames[outname]
                else:
                    self.outputs[outname] = "%s"  # outname
            else:
                self.outputs[None] = outvarnames.get('', "%s")
                self.outputs[''] = outvarnames.get('', "%s")
        # clogger.debug("outputs = "+`self.outputs`)
        #
        canSelectVar = canSelectVar or (command.find("${var}") > 0)
        canAggregateTime = (command.find("${ins}") > 0
                            or command.find("${ins_1}") > 0)
        canAlias = (command.find("${alias}") > 0)
        canMissing = (command.find("${missing}") > 0)
        canSelectTime = False
        if command.find("${period}") > 0 or command.find("${period_1}") > 0:
            canSelectTime = True
        if command.find("${period_iso}") > 0 or command.find(
                "${period_iso_1}") > 0:
            canSelectTime = True
        canSelectDomain = (command.find("${domain}") > 0
                           or command.find("${domain_1}") > 0)
        #
        self.name = name
        self.command = command
        self.fixedfields = None
        self.flags = scriptFlags(canOpendap, canSelectVar, canSelectTime,
                                 canSelectDomain, canAggregateTime, canAlias,
                                 canMissing, commuteWithEnsemble,
                                 commuteWithTimeConcatenation,
                                 commuteWithSpaceConcatenation)
        if format in known_formats or format in graphic_formats or format in none_formats:
            self.outputFormat = format
        else:
            raise Climaf_Operator_Error(
                "Allowed formats yet are : 'object', 'nc', 'txt', %s" %
                ', '.join([repr(x) for x in graphic_formats]))
        scripts[name] = self

        # Init doc string for the operator
        doc = "CliMAF wrapper for command : %s" % self.command
        # try to get a better doc string from colocated doc/directory
        docfilename = os.path.dirname(
            __file__) + "/../doc/scripts/" + name + ".rst"
        # print "docfilen= "+docfilename
        try:
            docfile = open(docfilename)
            doc = docfile.read()
            docfile.close()
        except:
            pass
        #
        # creates a function named as requested, which will invoke
        # capply with that name and same arguments
        defs = 'def %s(*args,**dic) :\n  """%s"""\n  return driver.capply("%s",*args,**dic)\n' \
               % (name, doc, name)
        exec defs in globals()  #
        exec "from climaf.operators import %s" % name in \
            sys.modules['__main__'].__dict__
        clogger.debug("CliMAF script %s has been declared" % name)
Пример #18
0
def selectGenericFiles(urls, **kwargs):
    """
    Allow to describe a ``generic`` file organization : the list of files returned
    by this function is composed of files which :

    - match the patterns in ``urls`` once these patterns are instantiated by
      the values in kwargs, and

     - contain the ``variable`` provided in kwargs

     - match the ``period`` provided in kwargs

    In the pattern strings, no keyword is mandatory

    Example :

    >>> selectGenericFiles(project='my_project',model='my_model', simulation='lastexp', variable='tas', period='1980', urls=['~/DATA/${project}/${model}/*${variable}*YYYY*.nc'])
    /home/stephane/DATA/my_project/my_model/somefilewith_tas_Y1980.nc

    In the pattern strings, the keywords that can be used in addition to the argument
    names (e.g. ${model}) are:

    - ${variable} : use it if the files are split by variable and
      filenames do include the variable name, as this speed up the search

    - YYYY, YYYYMM, YYYYMMDD : use it for indicating the start date of
      the period covered by each file, if this is applicable in the
      file naming; use a second time for end date, if applicable
      (otherwise the assumption is that the whole year -resp. month or
      day- is included in the file

    - wildcards '?' and '*' for matching respectively one and any number of characters


    """
    rep=[]
    period=kwargs['period']
    # 'period' may be provided as a string; normalize it to a period object
    if type(period) is str : period=init_period(period)
    variable=kwargs['variable']
    # 'filenameVar' is an alternate variable name to try in file names
    altvar=kwargs.get('filenameVar',variable)
    # a dict and an ordered list of date globbing patterns
    # (sorted longest-first so that e.g. YYYYMMDD is replaced before YYYY)
    dt=dict(YYYY="????",YYYYMM="??????",YYYYMMDD="????????")
    # Python 2 : dict.keys() returns a sortable list
    lkeys=dt.keys() ; lkeys.sort(reverse=True)
    # a dict and an ordered list for matching dates
    dr=dict(YYYY="([0-9]{4})",YYYYMM="([0-9]{6})", YYYYMMDD="([0-9]{8})")
    rkeys=dr.keys() ; rkeys.sort(reverse=True)
    #
    for l in urls :
        # Instantiate keywords in pattern with attributes values
        template=Template(l).safe_substitute(**kwargs)
        #print "template after attributes replace : "+template
        #
        # Construct a pattern for globbing dates
        temp2=template ; 
        for k in lkeys : temp2=temp2.replace(k,dt[k])
        lfiles=glob.glob(temp2)
        clogger.debug("Globbing %d files for varname on %s : "%(len(lfiles),temp2))
        #
        # If unsuccessful using varname, try with filenameVar
        if len(lfiles)==0 and "filenameVar" in kwargs and kwargs['filenameVar'] :
            # kwargs is this function's own copy (**kwargs), so this mutation
            # does not leak back to the caller
            kwargs['variable']=kwargs['filenameVar']
            template=Template(l).safe_substitute(**kwargs)
            temp2=template
            for k in lkeys : temp2=temp2.replace(k,dt[k])
            #
            lfiles=glob.glob(temp2)
            clogger.debug("Globbing %d files for filenamevar on %s: "%(len(lfiles),temp2))

        # Construct regexp for extracting dates from filename : the first
        # date keyword found becomes capture group \1 (start date); a second
        # occurrence of the same keyword becomes group \2 (end date)
        regexp=None
        #print "template before searching dates : "+template
        for key in rkeys :
            #print "searchin "+key+" in "+=Template(l)
            start=template.find(key)
            if (start>=0 ) :
                #print "found "+key
                regexp=template.replace(key,dr[key],1)
                hasEnd=False
                start=regexp.find(key)
                if (start >=0 ) :
                    hasEnd=True
                    regexp=regexp.replace(key,dr[key],1)
                break
        #print "regexp before searching dates : "+regexp
        #
        for f in lfiles :
            #print "processing file "+f
            #
            # Analyze file time period
            fperiod=None
            if regexp :
                # Translate remaining glob wildcards to regexp syntax
                regexp0=regexp.replace("*",".*").replace("?",r".")
                #print "regexp for extracting dates : "+regexp
                start=re.sub(regexp0,r'\1',f)
                # re.sub returns the string unchanged when the pattern
                # did not match : treat that as an error
                if start==f:
                    raise Climaf_Data_Error("Start period not found") #? LV
                if hasEnd :
                    end=re.sub(regexp0,r'\2',f)
                    fperiod=init_period("%s-%s"%(start,end))
                else :
                    fperiod=init_period(start)
                #print "period for file %s is %s"%(f,fperiod)
                #
                # Filter file time period against required period
            else :
                # No date keyword in the url : the period cannot be derived
                # from the file name
                if ( 'frequency' in kwargs and ((kwargs['frequency']=="fx") or \
                    kwargs['frequency']=="seasonnal" or kwargs['frequency']=="annual_cycle" )) :
                    # Fixed fields carry no time axis : check variable only
                    if (l.find("${variable}")>=0) or fileHasVar(f,variable) or fileHasVar(f,altvar) : 
                        clogger.debug("adding fixed field :"+f)
                        rep.append(f)
                else :
                    clogger.warning("Cannot yet filter files re. time using only file content. TBD")
                    rep.append(f)
            # NOTE(review) : when regexp is None, a file appended just above
            # may be appended a second time below (only the second branch
            # below guards with 'f not in rep'); duplicates seem to be
            # discarded by the caller (selectFiles) - confirm
            if (fperiod and period.intersects(fperiod)) or not regexp :
                clogger.debug('Period is OK - Considering variable filtering on %s and %s for %s'%(variable,altvar,f)) 
                # Filter against variable 
                if (l.find("${variable}")>=0):
                    clogger.debug('appending %s based on variable in filename'%f)
                    rep.append(f)
                    continue
                if f not in rep and ( fileHasVar(f,variable) or fileHasVar(f,altvar) or ("," in variable)):
                    # Should check time period in the file if not regexp
                    clogger.debug('appending %s based on multi-var or var exists in file '%f)
                    rep.append(f)
            else:
                if not fperiod :
                    clogger.debug('not appending %s because period is None '%f)
                else:
                    if not period.intersects(fperiod) :
                        clogger.debug('not appending %s because period doesn t intersect %s'%(f,period))

    return rep
Пример #19
0
def selectFiles(return_wildcards=None, merge_periods_on=None, **kwargs):
    """
    Returns the shortest list of (local or remote) files which include
    the data for the list of (facet,value) pairs provided

    Method : 
    
    - use datalocations indexed by :py:func:`~climaf.dataloc.dataloc` to 
      identify data organization and data store urls for these (facet,value) 
      pairs

    - check that data organization is a known one, i.e. is one of 'generic', 
      'CMIP5_DRS' or 'EM'
    
    - derive relevant filenames search function such as :
      py:func:`~climaf.dataloc.selectCmip5DrsFiles` from data
      organization scheme

    - pass urls and relevant facet values to this filenames search function

    Returns None when no data location or no file matches; otherwise a
    single space-separated string of sorted, de-duplicated filenames.
    """
    rep=[]
    project=kwargs['project']
    simulation=kwargs['simulation']

    # Optional facets default to the glob wildcard
    model=kwargs.get('model',"*")
    frequency=kwargs.get('frequency',"*")

    ofu=getlocs(project=project, model=model, simulation=simulation, frequency=frequency)
    clogger.debug("locs="+repr(ofu))
    if ( len(ofu) == 0 ) :
        clogger.warning("no datalocation found for %s %s %s %s "%(project, model, simulation, frequency))
    for org,freq,urls in ofu :
        # Fixed bug : was 'org is not "generic"', an identity test against a
        # string literal, which is not a reliable equality test
        if return_wildcards is not None and org != "generic" :
            raise classes.Climaf_Error("Can hanle multipe facet query only for organization=generic ")
        kwargs2=kwargs.copy()
        # Convert normalized frequency to project-specific frequency if applicable
        if "frequency" in kwargs and project in classes.frequencies :
            normfreq=kwargs2['frequency'] 
            if normfreq in classes.frequencies[project]: 
                kwargs2['frequency']=classes.frequencies[project][normfreq]
        # JS # Convert normalized realm to project-specific realm if applicable
        if "realm" in kwargs and project in classes.realms :
            normrealm=kwargs2['realm']
            if normrealm in classes.realms[project]:
                kwargs2['realm']=classes.realms[project][normrealm]
        #
        # Call organization-specific routine
        if (org == "EM") :
            rep.extend(selectEmFiles(**kwargs2))
        elif (org == "CMIP5_DRS") :
            rep.extend(selectCmip5DrsFiles(urls,**kwargs2))
        elif (org == "generic") :
            rep.extend(selectGenericFiles(urls, return_wildcards=return_wildcards, \
                                          merge_periods_on=merge_periods_on,**kwargs2))
        else :
            raise classes.Climaf_Error("Cannot process organization "+org+ \
                " for simulation "+simulation+" and model "+model+\
                " of project "+project)
    if (not ofu) :
        return None
    if (len(rep) == 0 ) :
        # 'urls' here is the value bound by the last loop iteration
        clogger.warning("no file found for %s, at these "
                        "data locations %s "%(repr(kwargs), repr(urls)))
        if any([ kwargs[k] == '' for k in kwargs ]) :
            clogger.warning("Please check these empty attributes %s"%\
                            [ k for k in kwargs if kwargs[k]=='' ])
        return None
    # Discard duplicates (assumes that sorting is harmless for later processing)
    # Fixed bug : the original removed items from 'rep' while iterating over
    # it, which can leave duplicates behind when a file occurs 3+ times
    rep.sort()
    deduped=[]
    for f in rep :
        if not deduped or f != deduped[-1] :
            deduped.append(f)
    # Assemble filenames in one single space-separated string
    # (equivalent to the former string.join(), with its default separator)
    return ' '.join(deduped)
Пример #20
0
def derive(project, derivedVar, Operator, *invars, **params):
    """
    Declare that 'derivedVar' is a derived variable in 'project', computed
    by applying 'Operator' to input datasets whose variable names take the
    values in ``*invars``, with operator parameters ``**params``.

    'project' may be the wildcard : '*'

    'derivedVar' is either a single variable name (mapped to the operator's
    main output 'out'), or - expert use - a dictionary mapping derived
    variable names to named script outputs, e.g. ::

    >>> cscript('minus','cdo sub ${in_1} ${in_2} ${out}')
    >>> derive('CMIP5', 'rscre','minus','rs','rscs')

    >>> cscript('vertical_interp', 'vinterp.sh ${in} surface_pressure=${in_2} ${out_l500} ${out_l850} method=${opt}')
    >>> derive('*', {'z500' : 'l500' , 'z850' : 'l850'},'vertical_interp', 'zg', 'ps', opt='log'}

    For simple rescaling or renaming, prefer :py:func:`~climaf.classes.calias()`
    """
    # Register the information in dict 'derived_variables', keyed by project
    # and then by single derived variable name; it is read at object
    # evaluation time. Also run consistency checks w.r.t. script declaration.
    if Operator not in scripts:
        # Preserve original branch priority : 'scripts' wins over 'operators'
        if Operator in operators:
            clogger.warning(
                "Cannot yet handle derived variables based on internal operators")
        else:
            clogger.error(
                "second argument (%s) must be a script or operator, already declared"
                % repr(Operator))
        return
    mapping = derivedVar if isinstance(derivedVar, dict) else dict(out=derivedVar)
    for target in mapping:
        # NOTE(review): 'Operator' is a name (string), so getattr() on it
        # always yields None and any target other than 'out' raises here;
        # confirm whether scripts[Operator] was intended instead
        declared = getattr(Operator, "outvarnames", None)
        if target != 'out' and (not declared or target not in declared):
            raise Climaf_Operator_Error(
                "%s is not a named  ouput for operator %s; type help(%s)" %
                (target, Operator, Operator))
        script_obj = scripts[Operator]
        if script_obj.inputs_number() != len(invars):
            clogger.error(
                "number of input variables for operator %s is %d, which is inconsistent with "
                "script declaration : %s" %
                (script_obj.name, len(invars), script_obj.command))
            return
        # TBD : check parameters number  ( need to build
        # its list in cscript.init() )
        derived_variables.setdefault(project, dict())
        derived_variables[project][mapping[target]] = (Operator, target,
                                                       list(invars), params)