def instantiateHandler(cls, *args, **extra_args):
    """
    Instantiate the handler class with the specified arguments and extra
    arguments, but filtering out anything that it doesn't support.
    """
    passed_args = {}
    supported_args = inspect.getargspec(cls.__init__).args
    for k, v in extra_args.iteritems():
        if k in supported_args:
            passed_args[k] = v
        else:
            debug("Discarding arg '%s' not supported by handler" % k)
    return cls(*args, **passed_args)
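
# Illustrative sketch (not part of the publisher code): the same keyword-filtering
# idea as instantiateHandler above, shown against a hypothetical ExampleHandler
# class. inspect.getargspec is the Python 2 spelling used above;
# inspect.getfullargspec is its Python 3 replacement.
import inspect


class ExampleHandler(object):
    def __init__(self, name, validate=True):
        self.name = name
        self.validate = validate


def instantiate_filtered(cls, *args, **extra_args):
    # Keep only keyword arguments that the constructor actually accepts.
    supported = inspect.getfullargspec(cls.__init__).args
    passed = {k: v for k, v in extra_args.items() if k in supported}
    for k in set(extra_args) - set(passed):
        print("Discarding arg '%s' not supported by handler" % k)
    return cls(*args, **passed)


# instantiate_filtered(ExampleHandler, "cmip6", validate=False, bogus=1)
# prints: Discarding arg 'bogus' not supported by handler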
def __init__(self, url, port=None, key_file=None, cert_file=None, debug=False):
    """
    Create a Hessian proxy.

    url
      String of the form http[s]://host/path. Note that the port is specified separately.
    port
      Defaults to 80 for http, 443 for https.
    key_file
      Key file in PEM format, if the scheme is https and client authentication is to be used.
    cert_file
      User certificate in PEM format, if the scheme is https and client authentication is to be used.
    debug
      True iff debug info is to be printed.
    """
    # Creates a Hessian proxy object
    global DEBUG
    self.service_type = 'HESSIAN'
    self._url = url
    self._port = port
    self._key_file = key_file
    self._cert_file = cert_file
    # print "Using key file = %s, cert file = %s"%(key_file, cert_file)
    messaging.debug("Using key file = %s, cert file = %s" % (key_file, cert_file))
    if debug:
        DEBUG = True

    # get the uri
    scheme, uri = urllib.splittype(url)
    if scheme not in ["http", "https"]:
        raise IOError("unsupported Hessian protocol")
    self._scheme = scheme
    self._host, self._uri = urllib.splithost(uri)
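
# Illustrative sketch (not part of the proxy code): the URL splitting the
# constructor above relies on. urllib.splittype/splithost are the Python 2
# helpers; urllib.parse.urlsplit is the portable equivalent shown here. The
# default-port logic mirrors the docstring (80 for http, 443 for https) and is
# an assumption for illustration only.
from urllib.parse import urlsplit


def split_hessian_url(url, port=None):
    parts = urlsplit(url)                      # scheme, netloc, path, ...
    if parts.scheme not in ("http", "https"):
        raise IOError("unsupported Hessian protocol")
    if port is None:
        port = 443 if parts.scheme == "https" else 80
    host = parts.hostname
    uri = parts.path or "/"
    return parts.scheme, host, port, uri


# split_hessian_url("https://esgf-node.example.org/esg-node/hessian/remoteMetadataService")
# -> ('https', 'esgf-node.example.org', 443, '/esg-node/hessian/remoteMetadataService')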
def validateFile(self, fileobj):
    """
    For CMIP6, this first verifies whether the data was written by CMOR at the
    correct version set in the ini file. If so, the file is declared valid.
    If not, the file goes through the PrePARE (CV) check; PrePARE runs CFChecker.

    Raises ESGPublishError if settings are missing or the file fails the checks.
    Raises ESGInvalidMetadataFormat if the file cannot be processed by this handler.
    """
    validator = PrePARE.PrePARE
    f = fileobj.path

    if self.replica:
        debug("skipping PrePARE for replica (file %s)" % f)
        return

    # todo refactoring: these could be loaded upfront in the constructor
    config = getConfig()
    project_section = 'project:' + self.name
    project_config_section = 'config:' + self.name
    min_cmor_version = config.get(project_section, "min_cmor_version", default="0.0.0")
    min_ds_version = config.get(project_section, "min_data_specs_version", default="0.0.0")
    data_specs_version = config.get(project_config_section, "data_specs_version", default="master")
    cmor_table_path = config.get(project_config_section, "cmor_table_path", default=DEFAULT_CMOR_TABLE_PATH)

    try:
        file_cmor_version = fileobj.getAttribute('cmor_version', None)
    except:
        file_cmor_version = None
        debug('File %s missing cmor_version attribute; will proceed with PrePARE check' % f)

    passed_cmor = False
    if compareLibVersions(min_cmor_version, file_cmor_version):
        debug('File %s cmor-ized at version %s, passed!' % (f, file_cmor_version))
        passed_cmor = True

    try:
        table = fileobj.getAttribute('table_id', None)
    except:
        raise ESGPublishError("File %s missing required table_id global attribute" % f)

    try:
        variable_id = fileobj.getAttribute('variable_id', None)
    except:
        raise ESGPublishError("File %s missing required variable_id global attribute" % f)

    # data_specs_version drives CMOR table fetching:
    # Behavior A (default): fetch the "master" branch (no "data_specs_version" in esg.ini)
    # Behavior B: fetch the branch given by "data_specs_version=my_branch" in esg.ini
    # Behavior C: fetch the branch named by the file's global attribute, when "data_specs_version=file" is set in esg.ini
    try:
        file_data_specs_version = fileobj.getAttribute('data_specs_version', None)
    except Exception as e:
        raise ESGPublishError("File %s missing required data_specs_version global attribute" % f)

    if not compareLibVersions(min_ds_version, file_data_specs_version):
        raise ESGPublishError("File %s data_specs_version is %s, which is less than the required minimum version of %s" % (f, file_data_specs_version, min_ds_version))

    # At this point the file has the correct data specs version.
    # If it was also CMORized with an acceptable version tag, we can exit.
    if passed_cmor:
        return

    if data_specs_version == "file":
        data_specs_version = file_data_specs_version

    checkAndUpdateRepo(cmor_table_path, data_specs_version)

    try:
        process = validator.checkCMIP6(cmor_table_path)
        if process is None:
            raise ESGPublishError("File %s failed the CV check - object create failure" % f)
        process.ControlVocab(f)
    except:
        raise ESGPublishError("File %s failed the CV check" % f)
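
# Illustrative sketch (not part of the publisher code): compareLibVersions above
# is assumed to answer "is the file's version at least the configured minimum?"
# and to tolerate a missing (None) file version. A minimal stand-in with those
# assumed semantics:
def compare_lib_versions(min_version, found_version):
    """Return True if found_version is a dotted version >= min_version."""
    if not found_version:
        return False

    def as_tuple(v):
        return tuple(int(p) for p in v.split('.') if p.isdigit())

    return as_tuple(found_version) >= as_tuple(min_version)


# compare_lib_versions("3.2.7", "3.3.1") -> True
# compare_lib_versions("3.2.7", None)    -> False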
def validateFile(self, fileobj):
    """
    For CMIP6, this first verifies whether the data was written by CMOR at the
    correct version set in the ini file. If so, the file is declared valid.
    If not, the file goes through the PrePARE (CV) check; PrePARE runs CFChecker.

    Raises ESGPublishError if settings are missing or the file fails the checks.
    Raises ESGInvalidMetadataFormat if the file cannot be processed by this handler.
    """
    validator = PrePARE.PrePARE
    f = fileobj.path

    config = getConfig()
    projectSection = 'project:' + self.name
    min_cmor_version = config.get(projectSection, "min_cmor_version", default="0.0.0")

    file_cmor_version = "0.0.0"
    try:
        file_cmor_version = fileobj.getAttribute('cmor_version', None)
    except:
        debug('File %s missing cmor_version attribute; will proceed with PrePARE check' % f)

    if compareLibVersions(min_cmor_version, file_cmor_version):
        debug('File %s cmor-ized at version %s, passed!' % (f, file_cmor_version))
        return

    # PrePARE is going to handle the CF check now
    # min_cf_version = config.get(projectSection, "min_cf_version", default="")
    # if len(min_cf_version) == 0:
    #     raise ESGPublishError("Minimum CF version not set in esg.ini")
    # fakeversion = ["cfchecker.py", "-v", min_cf_version, "foo"]
    # (badc, coards, uploader, useFileName, standardName, areaTypes, udunitsDat, version, files) = getargs(fakeversion)
    # CF_Chk_obj = CFChecker(uploader=uploader, useFileName=useFileName, badc=badc, coards=coards, cfStandardNamesXML=standardName, cfAreaTypesXML=areaTypes, udunitsDat=udunitsDat, version=version)
    # rc = CF_Chk_obj.checker(f)
    # if (rc > 0):
    #     raise ESGPublishError("File %s fails CF check" % f)

    file_data_specs_version = None
    try:
        file_data_specs_version = fileobj.getAttribute('data_specs_version', None)
    except Exception as e:
        raise ESGPublishError("File %s missing required data_specs_version global attribute" % f)

    table = None
    try:
        table = fileobj.getAttribute('table_id', None)
    except:
        raise ESGPublishError("File %s missing required table_id global attribute" % f)

    try:
        variable_id = fileobj.getAttribute('variable_id', None)
    except:
        raise ESGPublishError("File %s missing required variable_id global attribute" % f)

    project_section = 'config:cmip6'
    cmor_table_path = ""
    try:
        cmor_table_path = config.get(projectSection, "cmor_table_path", default="")
    except:
        debug("Missing cmor_table_path setting. Using default location")

    if cmor_table_path == "":
        cmor_table_path = DEFAULT_CMOR_TABLE_PATH

    checkAndUpdateRepo(cmor_table_path, self, file_data_specs_version)

    table_file = cmor_table_path + '/CMIP6_' + table + '.json'
    fakeargs = ['--variable', variable_id, table_file, f]
    parser = argparse.ArgumentParser(prog='esgpublisher')
    parser.add_argument('--variable')
    parser.add_argument('cmip6_table', action=validator.JSONAction)
    parser.add_argument('infile', action=validator.CDMSAction)
    parser.add_argument('outfile', nargs='?', help='Output file (default stdout)',
                        type=argparse.FileType('w'), default=sys.stdout)
    args = parser.parse_args(fakeargs)

    # print "About to CV check:", f
    try:
        process = validator.checkCMIP6(args)
        if process is None:
            raise ESGPublishError("File %s failed the CV check - object create failure" % f)
        process.ControlVocab()
    except:
        raise ESGPublishError("File %s failed the CV check" % f)
def nodeIterator(top, nodefilt, filefilt, followSymLinks=True, allFiles=False): """Generate an iterator over non-empty directories that match a pattern. Returns an iterator that returns a tuple (*path*, *sample_file*, *groupdict*) at each iteration, where: - *path* is the node (directory) path - *sample_file* is a file in the node that matches the file filter - *groupdict* is the group dictionary generated by the match. For example, if *nodefilt* contains a named group '(?P<model>) that matches 'some_value', then *groupdict* maps 'model' => 'some_value' top A list or tuple of top level directory names. nodefilt A regular expression as defined in the Python re module. Each node returned matches the expression. May also be a list of regular expressions, in which case each node returned matches at least one expression in the list. filefilt A regular expression as defined in the Python re module. Each sample file returned has basename matching the filter. followSymLinks Boolean flag. Symbolic links are followed unless followSymLinks is False. allFiles = False Boolean flag. If True, iterate over all files that match the filter. Otherwise just return the first file that matches. """ try: names = os.listdir(top) except os.error: return if type(nodefilt) is not type([]): nodefilt = [nodefilt] foundOne = False for basename in names: name = os.path.join(top, basename) try: if followSymLinks: st = os.stat(name) else: st = os.lstat(name) except os.error: continue # Search regular files in top directory if stat.S_ISREG(st.st_mode): if not foundOne or allFiles: # Find the first node filter that matches for filt in nodefilt: result = re.match(filt, top) debug("Comparing %s with filter %s ..."%(top, filt)) if result is not None: debug("... match") break debug("... no match") # If the node pattern matches and the file not a directory and the file filter matches: if (result is not None) and (re.match(filefilt, basename) is not None): groupdict = result.groupdict() foundOne = True yield (top, basename, groupdict) # Search subdirectories elif stat.S_ISDIR(st.st_mode): for nodepath, filepath, gdict in nodeIterator(name, nodefilt, filefilt, followSymLinks=followSymLinks): yield (nodepath, filepath, gdict) return
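
# Illustrative usage (hypothetical layout): nodeIterator in action on a
# throwaway DRS-like tree. The node filter's named groups come back in
# groupdict, and the file filter picks the sample file.
import os
import tempfile

root = tempfile.mkdtemp()
leaf = os.path.join(root, 'output', 'MyModel', 'historical')
os.makedirs(leaf)
open(os.path.join(leaf, 'tas_Amon_MyModel_historical.nc'), 'w').close()

nodefilt = r'.*/output/(?P<model>[^/]+)/(?P<experiment>[^/]+)$'
filefilt = r'.*\.nc$'

for path, sample_file, groups in nodeIterator(root, nodefilt, filefilt):
    print("%s %s %s %s" % (path, sample_file, groups['model'], groups['experiment']))
# -> .../output/MyModel/historical  tas_Amon_MyModel_historical.nc  MyModel  historical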
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None): """ Aggregate file variables into variables, and add to the database. Populates the database tables: - variable - file_variable - associated attribute tables Returns a Dataset object. datasetName String dataset identifier. dbSession A database Session. aggregateDimensionName The name of the dimension across which the dataset is aggregated, if any. cfHandler A CFHandler to validate standard names, etc. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. stopEvent Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped. datasetInstance Existing dataset instance. If not provided, the instance is regenerated from the database. """ session = dbSession() info("Aggregating variables") # Lookup the dataset if datasetInstance is None: dset = session.query(Dataset).filter_by(name=datasetName).first() for variable in dset.variables: session.delete(variable) for attrname, attr in dset.attributes.items(): if not attr.is_category: del dset.attributes[attrname] session.commit() dset.variables = [] else: dset = datasetInstance # session.save_or_update(dset) session.add(dset) if dset is None: raise ESGPublishError("Dataset not found: %s"%datasetName) dsetindex = {} # dsetindex[varname] = [(variable, domain), (variable, domain), ...] # where domain = ((dim0, len0, 0), (dim1, len1, 1), ...) # Note: # (1) If a dim0 is the aggregate dimension, len0 is 0 # (2) A dsetindex entry will only have multiple tuples if # there are more than one variable with the same name # and different domains. 
varindex = {} # varindex[(varname, domain, attrname)] = attribute globalAttrIndex = {} # globalAttrIndex[attname] = attval, for global attributes dsetvars = [] # Create variables seq = 0 nfiles = len(dset.getFiles()) for file in dset.getFiles(): for filevar in file.file_variables: # Get the filevar and variable domain fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions) fvdomain.sort(lambda x,y: cmp(x[SEQ], y[SEQ])) filevar.domain = fvdomain if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName: vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length else: vardomain = tuple(fvdomain) # Create the variable if necessary varlist = dsetindex.get(filevar.short_name, None) if varlist is None or vardomain not in [item[1] for item in varlist]: var = Variable(filevar.short_name, filevar.long_name) var.domain = vardomain # Record coordinate variable range if applicable if filevar.coord_type is not None: var.coord_type = filevar.coord_type if var.coord_type=='Z': var.coord_values = filevar.coord_values var.coord_range = filevar.coord_range dsetvars.append(var) if varlist is None: dsetindex[var.short_name] = [(var, vardomain)] else: varlist.append((var, vardomain)) else: for tvar, domain in varlist: if domain==vardomain: var = tvar break # Attach the file variable to the variable var.file_variables.append(filevar) # Create attributes for fvattribute in filevar.attributes: vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None) if vattribute is None: attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length) var.attributes.append(attribute) varindex[(var.short_name, vardomain, attribute.name)] = attribute if attribute.name == 'units': var.units = attribute.value # Create global attributes for fileattr in file.attributes: fattribute = globalAttrIndex.get(fileattr.name, None) if fattribute is None and fileattr.name not in ['readDimension']: attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length) dset.attributes[attribute.name] = attribute globalAttrIndex[attribute.name] = attribute seq += 1 try: issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent) except: session.rollback() session.close() raise # Find the aggregation dimension bounds variable, if any aggDim = lookupVar(aggregateDimensionName, dsetindex) boundsName = lookupAttr(aggDim, 'bounds') aggUnits = lookupAttr(aggDim, 'units') aggDimBounds = lookupVar(boundsName, dsetindex) # Set calendar for time aggregation isTime = cfHandler.axisIsTime(aggDim) if isTime: calendar = cfHandler.getCalendarTag(aggDim) if calendar is None: calendar = "gregorian" else: calendar = None dset.calendar = calendar dset.aggdim_name = aggregateDimensionName dset.aggdim_units = aggUnits cdcalendar = cfHandler.tagToCalendar(calendar) # Add the non-aggregate dimension variables to the dataset for var in dsetvars: if var not in [aggDim, aggDimBounds]: dset.variables.append(var) # Set coordinate ranges for var in dset.variables: for name, length, seq in var.domain: if name==aggregateDimensionName: continue dvar = lookupCoord(name, dsetindex, length) if dvar is not None: units = lookupAttr(dvar, 'units') if units is None: warning("Missing units, variable=%s"%dvar.short_name) units = '' if hasattr(dvar, 'coord_type'): if dvar.coord_type=='X': var.eastwest_range = dvar.coord_range+':'+units elif dvar.coord_type=='Y': var.northsouth_range = 
dvar.coord_range+':'+units elif dvar.coord_type=='Z': var.updown_range = dvar.coord_range+':'+units var.updown_values = dvar.coord_values # Attach aggregate dimension filevars to files if aggDim is not None: for filevar in aggDim.file_variables: filevar.file.aggDim = filevar if aggDimBounds is not None: for filevar in aggDimBounds.file_variables: filevar.file.aggDimBounds = filevar # Combine aggregate dimensions: # Scan all variables with the aggregate dimension in the domain. For each such variable, # create an aggregate dimension variable, and bounds if needed. timevars = [] for var in dset.variables: if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]: aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName) aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName) if aggVar is not None: aggVar.units = aggUnits timevars.append(aggVar) if aggBoundsVar is not None: timevars.append(aggBoundsVar) # Create variable dimensions, aggregating the agg dimension debug("Creating dimensions") i = 0 nvars = len(dset.variables+timevars) for var in dset.variables+timevars: vardomain = var.domain # Increment aggregate dimension length if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]: for filevar in var.file_variables: fvdomain = filevar.domain vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:]) var.domain = vardomain # Create the variable domain for name, length, seq in vardomain: dimension = VariableDimension(name, length, seq) var.dimensions.append(dimension) i += 1 try: issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent) except: session.rollback() session.close() raise # Set variable aggregate dimension ranges debug("Setting aggregate dimension ranges") seq = 0 nvars = len(dset.variables+timevars) for var in dset.variables+timevars: vardomain = var.domain if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName: # Adjust times so they have consistent base units try: filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables] except: for fv in var.file_variables: try: firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar) lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar) except: error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units)) raise mono = cmp(filevarRanges[0][1], filevarRanges[0][2]) if mono<=0: filevarRanges.sort(lambda x, y: cmp(x[1], y[1])) else: filevarRanges.sort(lambda x, y: -cmp(x[1], y[1])) # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated. 
lastValues = numpy.array(map(lambda x: x[2], filevarRanges)) firstValues = numpy.array(map(lambda x: x[1], filevarRanges)) if (var not in [aggDim, aggDimBounds]): if mono<=0: compare = (lastValues[0:-1] >= firstValues[1:]) else: compare = (lastValues[0:-1] <= firstValues[1:]) if compare.any(): overlaps = compare.nonzero()[0] dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE) var.has_errors = True nprint = min(len(overlaps), 3) for i in range(nprint): dset.warning(" %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE) dset.warning(" %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE) if len(overlaps)>nprint: dset.warning(" ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE) # Check monotonicity of last values. else: if mono<=0: compare = (lastValues[0:-1] < lastValues[1:]).all() else: compare = (lastValues[0:-1] > lastValues[1:]).all() if not compare: dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE) var.has_errors = True var.aggdim_first = float(firstValues[0]) var.aggdim_last = float(lastValues[-1]) seq += 1 try: issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent) except: session.rollback() session.close() raise # Combine identical aggregate dimensions and add to the dataset timevardict = {} for var in timevars: timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var for var in timevardict.values(): dset.variables.append(var) # Validate standard names seq = 0 nvars = len(dset.variables) for var in dset.variables: attr = lookupAttr(var, 'standard_name') if (attr is not None): if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)): info("Invalid standard name: %s for variable %s"%(attr, var.short_name)) else: var.standard_name = attr seq += 1 try: issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent) except: session.rollback() session.close() raise debug("Adding variable info to database") session.commit() session.close()
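
# Illustrative sketch (not part of the publisher code): the overlap test used
# in aggregateVariables above, in isolation. After sorting per-file
# (first, last) ranges of the aggregate dimension, neighbouring files overlap
# when a file's last value reaches past the next file's first value.
import numpy

ranges = [('f1.nc', 0.0, 30.0), ('f2.nc', 31.0, 59.0), ('f3.nc', 45.0, 90.0)]
ranges.sort(key=lambda x: x[1])                 # sort by first value (ascending case)

first_values = numpy.array([r[1] for r in ranges])
last_values = numpy.array([r[2] for r in ranges])

overlap = last_values[:-1] >= first_values[1:]  # elementwise neighbour comparison
for i in overlap.nonzero()[0]:
    print("overlap between %s and %s" % (ranges[i][0], ranges[i + 1][0]))
# -> overlap between f2.nc and f3.nc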
def extractFromFile(dataset, openfile, fileobj, session, cfHandler, aggdimName=None, varlocate=None, **context): """ Extract metadata from a file, add to a database. dataset The dataset instance. openfile An open netCDF file object. fileobj A (logical) file instance. session A database session instance. cfHandler A CF handler instance aggdimName The name of the dimension which is split across files, if any. varlocate List with elements [varname, pattern]. The variable will be extracted from the file only if the filename matches the pattern at the start. Example: [['ps', 'ps\_'], ['xyz', 'xyz\_']] context A dictionary with keys project, model, experiment, and run. """ fileVersion = fileobj.versions[-1] # Get the aggregate dimension range if aggdimName is not None and openfile.hasVariable(aggdimName): aggvarFirst = openfile.getVariable(aggdimName, index=0) aggvarLast = openfile.getVariable(aggdimName, index=-1) aggvarLen = openfile.inquireVariableShape(aggdimName)[0] aggvarunits = map_to_charset(openfile.getAttribute("units", aggdimName)) if aggdimName.lower()=="time" or (openfile.hasAttribute("axis", aggdimName) and openfile.getAttribute("axis", aggdimName)=="T"): if abs(aggvarFirst)>1.e12 or abs(aggvarLast)>1.e12: dataset.warning("File: %s has time range: [%f, %f], looks bogus."%(fileVersion.location, aggvarFirst, aggvarLast), WARNING_LEVEL, AGGREGATE_MODULE) if aggdimName is not None and not openfile.hasVariable(aggdimName): info("Aggregate dimension not found: %s"%aggdimName) varlocatedict = {} if varlocate is not None: for varname, pattern in varlocate: varlocatedict[varname] = pattern # For each variable in the file: for varname in openfile.inquireVariableList(): varshape = openfile.inquireVariableShape(varname) debug("%s%s"%(varname, `varshape`)) # Check varlocate if varlocatedict.has_key(varname) and not re.match(varlocatedict[varname], os.path.basename(fileVersion.location)): debug("Skipping variable %s in %s"%(varname, fileVersion.location)) continue # Create a file variable filevar = FileVariable(varname, openfile.getAttribute('long_name', varname, None)) fileobj.file_variables.append(filevar) # Create attributes: for attname in openfile.inquireAttributeList(varname): attvalue = openfile.getAttribute(attname, varname) atttype, attlen = getTypeAndLen(attvalue) attribute = FileVariableAttribute(attname, map_to_charset(attvalue), atttype, attlen) filevar.attributes.append(attribute) debug(' %s.%s = %s'%(varname, attname, `attvalue`)) # Create dimensions seq = 0 dimensionList = openfile.inquireVariableDimensions(varname) for dimname, dimlen in zip(dimensionList, varshape): dimension = FileVariableDimension(dimname, dimlen, seq) filevar.dimensions.append(dimension) if dimname==aggdimName: filevar.aggdim_first = float(aggvarFirst) filevar.aggdim_last = float(aggvarLast) filevar.aggdim_units = aggvarunits seq += 1 # Set coordinate axis range and type if applicable if len(varshape)==1: var0 = openfile.getVariable(varname, index=0) varn = openfile.getVariable(varname, index=-1) if cfHandler.axisIsLatitude(filevar): filevar.coord_range = genCoordinateRange(var0, varn) if not isValidCoordinateRange(var0, varn): warning("Latitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname)) filevar.coord_type = 'Y' elif cfHandler.axisIsLongitude(filevar): filevar.coord_range = genCoordinateRange(var0, varn) if not isValidCoordinateRange(var0, varn): warning("Longitude coordinate range: %s is suspicious, file = %s, variable = 
%s"%(filevar.coord_range, openfile.path, varname)) filevar.coord_type = 'X' elif cfHandler.axisIsLevel(filevar): vararray = openfile.getVariable(varname) filevar.coord_range = genCoordinateRange(var0, varn) if not isValidCoordinateRange(var0, varn): warning("Vertical level coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname)) filevar.coord_type = 'Z' filevar.coord_values = str(vararray)[1:-1] # See set_printoptions call above # Create global attribute for attname in openfile.inquireAttributeList(): attvalue = openfile.getAttribute(attname, None) atttype, attlen = getTypeAndLen(attvalue) attribute = FileAttribute(attname, map_to_charset(attvalue), atttype, attlen) fileobj.attributes.append(attribute) if attname=='tracking_id': fileVersion.tracking_id = attvalue debug('.%s = %s'%(attname, attvalue))
def aggregateVariables(datasetName, dbSession, aggregateDimensionName=None, cfHandler=None, progressCallback=None, stopEvent=None, datasetInstance=None): """ Aggregate file variables into variables, and add to the database. Populates the database tables: - variable - file_variable - associated attribute tables Returns a Dataset object. datasetName String dataset identifier. dbSession A database Session. aggregateDimensionName The name of the dimension across which the dataset is aggregated, if any. cfHandler A CFHandler to validate standard names, etc. progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. stopEvent Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped. datasetInstance Existing dataset instance. If not provided, the instance is regenerated from the database. """ session = dbSession() info("Aggregating variables") # Lookup the dataset if datasetInstance is None: dset = session.query(Dataset).filter_by(name=datasetName).first() for variable in dset.variables: session.delete(variable) for attrname, attr in dset.attributes.items(): if not attr.is_category: del dset.attributes[attrname] session.commit() dset.variables = [] else: dset = datasetInstance # session.save_or_update(dset) session.add(dset) if dset is None: raise ESGPublishError("Dataset not found: %s"%datasetName) dsetindex = {} # dsetindex[varname] = [(variable, domain), (variable, domain), ...] # where domain = ((dim0, len0, 0), (dim1, len1, 1), ...) # Note: # (1) If a dim0 is the aggregate dimension, len0 is 0 # (2) A dsetindex entry will only have multiple tuples if # there are more than one variable with the same name # and different domains. 
varindex = {} # varindex[(varname, domain, attrname)] = attribute globalAttrIndex = {} # globalAttrIndex[attname] = attval, for global attributes dsetvars = [] # list of all target variables of a dataset dset_target_vars = set() # Create variables seq = 0 nfiles = len(dset.getFiles()) for file in dset.getFiles(): for filevar in file.file_variables: if filevar.is_target_variable: dset_target_vars.add(filevar.short_name) # Get the filevar and variable domain fvdomain = map(lambda x: (x.name, x.length, x.seq), filevar.dimensions) fvdomain.sort(lambda x,y: cmp(x[SEQ], y[SEQ])) filevar.domain = fvdomain if len(fvdomain)>0 and fvdomain[0][0]==aggregateDimensionName: vardomain = ((aggregateDimensionName, 0, 0),)+tuple(fvdomain[1:]) # Zero out aggregate dimension length else: vardomain = tuple(fvdomain) # Create the variable if necessary varlist = dsetindex.get(filevar.short_name, None) if varlist is None or vardomain not in [item[1] for item in varlist]: var = Variable(filevar.short_name, filevar.long_name) var.domain = vardomain # Record coordinate variable range if applicable if filevar.coord_type is not None: var.coord_type = filevar.coord_type if var.coord_type=='Z': var.coord_values = filevar.coord_values var.coord_range = filevar.coord_range dsetvars.append(var) if varlist is None: dsetindex[var.short_name] = [(var, vardomain)] else: varlist.append((var, vardomain)) else: for tvar, domain in varlist: if domain==vardomain: var = tvar break # Attach the file variable to the variable var.file_variables.append(filevar) # Create attributes for fvattribute in filevar.attributes: vattribute = varindex.get((var.short_name, vardomain, fvattribute.name), None) if vattribute is None: attribute = VariableAttribute(fvattribute.name, map_to_charset(fvattribute.value), fvattribute.datatype, fvattribute.length) var.attributes.append(attribute) varindex[(var.short_name, vardomain, attribute.name)] = attribute if attribute.name == 'units': var.units = attribute.value # Create global attributes for fileattr in file.attributes: fattribute = globalAttrIndex.get(fileattr.name, None) if fattribute is None and fileattr.name not in ['readDimension']: attribute = DatasetAttribute(fileattr.name, map_to_charset(fileattr.value), fileattr.datatype, fileattr.length) dset.attributes[attribute.name] = attribute globalAttrIndex[attribute.name] = attribute seq += 1 try: issueCallback(progressCallback, seq, nfiles, 0, 0.25, stopEvent=stopEvent) except: session.rollback() session.close() raise # Find the aggregation dimension bounds variable, if any aggDim = lookupVar(aggregateDimensionName, dsetindex) boundsName = lookupAttr(aggDim, 'bounds') aggUnits = lookupAttr(aggDim, 'units') aggDimBounds = lookupVar(boundsName, dsetindex) # Set calendar for time aggregation isTime = cfHandler.axisIsTime(aggDim) if isTime: calendar = cfHandler.getCalendarTag(aggDim) if calendar is None: calendar = "gregorian" else: calendar = None dset.calendar = calendar dset.aggdim_name = aggregateDimensionName dset.aggdim_units = aggUnits cdcalendar = cfHandler.tagToCalendar(calendar) # Add the non-aggregate dimension variables to the dataset for var in dsetvars: if var not in [aggDim, aggDimBounds] and var.short_name in dset_target_vars: dset.variables.append(var) # Set coordinate ranges for var in dset.variables: for name, length, seq in var.domain: if name==aggregateDimensionName: continue dvar = lookupCoord(name, dsetindex, length) if dvar is not None: units = lookupAttr(dvar, 'units') if units is None: warning("Missing units, 
variable=%s"%dvar.short_name) units = '' if hasattr(dvar, 'coord_type'): if dvar.coord_type=='X': var.eastwest_range = dvar.coord_range+':'+units elif dvar.coord_type=='Y': var.northsouth_range = dvar.coord_range+':'+units elif dvar.coord_type=='Z': var.updown_range = dvar.coord_range+':'+units var.updown_values = dvar.coord_values # Attach aggregate dimension filevars to files if aggDim is not None: for filevar in aggDim.file_variables: filevar.file.aggDim = filevar if aggDimBounds is not None: for filevar in aggDimBounds.file_variables: filevar.file.aggDimBounds = filevar # Combine aggregate dimensions: # Scan all variables with the aggregate dimension in the domain. For each such variable, # create an aggregate dimension variable, and bounds if needed. timevars = [] for var in dset.variables: if len(var.domain)>0 and aggregateDimensionName==var.domain[0][NAME]: aggVar = createAggregateVar(var, 'aggDim', aggregateDimensionName) aggBoundsVar = createAggregateVar(var, 'aggDimBounds', aggregateDimensionName) if aggVar is not None: aggVar.units = aggUnits timevars.append(aggVar) if aggBoundsVar is not None: timevars.append(aggBoundsVar) # Create variable dimensions, aggregating the agg dimension debug("Creating dimensions") i = 0 nvars = len(dset.variables+timevars) for var in dset.variables+timevars: vardomain = var.domain # Increment aggregate dimension length if len(vardomain)>0 and aggregateDimensionName==vardomain[0][NAME]: for filevar in var.file_variables: fvdomain = filevar.domain vardomain = ((aggregateDimensionName, vardomain[0][LENGTH]+fvdomain[0][LENGTH], vardomain[0][SEQ]),)+tuple(vardomain[1:]) var.domain = vardomain # Create the variable domain for name, length, seq in vardomain: dimension = VariableDimension(name, length, seq) var.dimensions.append(dimension) i += 1 try: issueCallback(progressCallback, i, nvars, 0.25, 0.5, stopEvent=stopEvent) except: session.rollback() session.close() raise # Set variable aggregate dimension ranges debug("Setting aggregate dimension ranges") seq = 0 nvars = len(dset.variables+timevars) for var in dset.variables+timevars: vardomain = var.domain if len(vardomain)>0 and vardomain[0][NAME]==aggregateDimensionName: # Adjust times so they have consistent base units try: filevarRanges = [(x.file.getLocation(), cfHandler.normalizeTime(x.aggdim_first, x.aggdim_units, aggUnits, calendar=cdcalendar), cfHandler.normalizeTime(x.aggdim_last, x.aggdim_units, aggUnits, calendar=cdcalendar)) for x in var.file_variables] except: for fv in var.file_variables: try: firstt = cfHandler.normalizeTime(fv.aggdim_first, fv.aggdim_units, aggUnits, calendar=cdcalendar) lastt = cfHandler.normalizeTime(fv.aggdim_last, fv.aggdim_units, aggUnits, calendar=cdcalendar) except: error("path=%s, Invalid aggregation dimension value or units: first_value=%f, last_value=%f, units=%s"%(fv.file.getLocation(), fv.aggdim_first, fv.aggdim_last, fv.aggdim_units)) raise mono = cmp(filevarRanges[0][1], filevarRanges[0][2]) if mono<=0: filevarRanges.sort(lambda x, y: cmp(x[1], y[1])) else: filevarRanges.sort(lambda x, y: -cmp(x[1], y[1])) # Check that ranges don't overlap. Aggregate dimension and bounds may be duplicated. 
lastValues = numpy.array(map(lambda x: x[2], filevarRanges)) firstValues = numpy.array(map(lambda x: x[1], filevarRanges)) if (var not in [aggDim, aggDimBounds]): if mono<=0: compare = (lastValues[0:-1] >= firstValues[1:]) else: compare = (lastValues[0:-1] <= firstValues[1:]) if compare.any(): overlaps = compare.nonzero()[0] dset.warning("Variable %s is duplicated:"%(var.short_name), WARNING_LEVEL, AGGREGATE_MODULE) var.has_errors = True nprint = min(len(overlaps), 3) for i in range(nprint): dset.warning(" %s: (%d, %d)"%filevarRanges[overlaps[i]], WARNING_LEVEL, AGGREGATE_MODULE) dset.warning(" %s: (%d, %d)"%filevarRanges[overlaps[i]+1], WARNING_LEVEL, AGGREGATE_MODULE) if len(overlaps)>nprint: dset.warning(" ... (%d duplications total)"%len(overlaps), WARNING_LEVEL, AGGREGATE_MODULE) # Check monotonicity of last values. else: if mono<=0: compare = (lastValues[0:-1] < lastValues[1:]).all() else: compare = (lastValues[0:-1] > lastValues[1:]).all() if not compare: dset.warning("File aggregate dimension ranges are not monotonic for variable %s: %s"%(var.short_name, `filevarRanges`), WARNING_LEVEL, AGGREGATE_MODULE) var.has_errors = True var.aggdim_first = float(firstValues[0]) var.aggdim_last = float(lastValues[-1]) seq += 1 try: issueCallback(progressCallback, seq, nvars, 0.5, 0.75, stopEvent=stopEvent) except: session.rollback() session.close() raise # Combine identical aggregate dimensions and add to the dataset timevardict = {} for var in timevars: timevardict[(var.short_name, var.domain, var.aggdim_first, var.aggdim_last)] = var for var in timevardict.values(): dset.variables.append(var) # Validate standard names seq = 0 nvars = len(dset.variables) for var in dset.variables: attr = lookupAttr(var, 'standard_name') if (attr is not None): if (cfHandler is not None) and (not cfHandler.validateStandardName(attr)): info("Invalid standard name: %s for variable %s"%(attr, var.short_name)) else: var.standard_name = attr seq += 1 try: issueCallback(progressCallback, seq, nvars, 0.75, 1.0, stopEvent=stopEvent) except: session.rollback() session.close() raise debug("Adding variable info to database") session.commit() session.close()
def extractFromFile(dataset, openfile, fileobj, session, handler, cfHandler, aggdimName=None, varlocate=None, exclude_variables=None, perVariable=None, **context): """ Extract metadata from a file, add to a database. dataset The dataset instance. openfile An open netCDF file object. fileobj A (logical) file instance. session A database session instance. cfHandler A CF handler instance handler Project handler aggdimName The name of the dimension which is split across files, if any. varlocate List with elements [varname, pattern]. The variable will be extracted from the file only if the filename matches the pattern at the start. Example: [['ps', 'ps\_'], ['xyz', 'xyz\_']] exclude_variables List of thredds_exclude_variables perVariable Boolean, Try to find a target_variable if true and extract all variables if false context A dictionary with keys project, model, experiment, and run. """ fileVersion = fileobj.versions[-1] # Get the aggregate dimension range if aggdimName is not None and openfile.hasVariable(aggdimName): aggvarFirst = openfile.getVariable(aggdimName, index=0) aggvarLast = openfile.getVariable(aggdimName, index=-1) aggvarLen = openfile.inquireVariableShape(aggdimName)[0] aggvarunits = map_to_charset(openfile.getAttribute("units", aggdimName)) if aggdimName.lower()=="time" or (openfile.hasAttribute("axis", aggdimName) and openfile.getAttribute("axis", aggdimName)=="T"): if abs(aggvarFirst)>1.e12 or abs(aggvarLast)>1.e12: dataset.warning("File: %s has time range: [%f, %f], looks bogus."%(fileVersion.location, aggvarFirst, aggvarLast), WARNING_LEVEL, AGGREGATE_MODULE) if aggdimName is not None and not openfile.hasVariable(aggdimName): info("Aggregate dimension not found: %s"%aggdimName) varlocatedict = {} if varlocate is not None: for varname, pattern in varlocate: varlocatedict[varname.strip()] = pattern.strip() # Create global attribute target_variable = None for attname in openfile.inquireAttributeList(): attvalue = openfile.getAttribute(attname, None) atttype, attlen = getTypeAndLen(attvalue) attribute = FileAttribute(attname, map_to_charset(attvalue), atttype, attlen) fileobj.attributes.append(attribute) if attname == 'tracking_id': fileVersion.tracking_id = attvalue # extract target_variable from global attributes if attname == 'variable_id' and perVariable: target_variable = attvalue debug('Extracted target variable from global attributes: %s' % target_variable) debug('.%s = %s' % (attname, attvalue)) # try to get target_variable from DRS if not found in global attributes if not target_variable and perVariable: config = getConfig() if config is not None: drs_pattern = handler.getFilters()[0][1:-1] drs_file_pattern = '%s/(?P<filename>[\w.-]+)$' % drs_pattern drs_parts = re.search(drs_file_pattern, openfile.path).groupdict() if 'variable' in drs_parts: target_variable = drs_parts['variable'] debug('Extracted target variable from DRS: %s' % target_variable) # target_variable must be present in the file if target_variable not in openfile.inquireVariableList(): target_variable = None # For each variable in the file: for varname in openfile.inquireVariableList(): # we need to extract only target, aggregation and coverage variables if target_variable: is_coverage_variable = check_coverage_variable(varname, openfile) if not is_coverage_variable and varname != target_variable and varname != aggdimName: debug("Skipping variable %s in %s (not target (%s), coverage or aggregation (%s) variable)" % (varname, fileVersion.location, target_variable, aggdimName)) continue varshape = 
openfile.inquireVariableShape(varname) debug("%s%s"%(varname, `varshape`)) # Check varlocate if varlocatedict.has_key(varname) and not re.match(varlocatedict[varname].strip(), os.path.basename(fileVersion.location)): debug("Skipping variable %s in %s"%(varname, fileVersion.location)) continue is_target_variable = True if target_variable and target_variable != varname: is_target_variable = False elif varname in exclude_variables: is_target_variable = False # Create a file variable varstr = openfile.getAttribute('long_name', varname, None) if not varstr is None and len(varstr) > 255: varstr = varstr[0:255] filevar = FileVariable(varname, varstr, is_target_variable=is_target_variable) fileobj.file_variables.append(filevar) # Create attributes: for attname in openfile.inquireAttributeList(varname): attvalue = openfile.getAttribute(attname, varname) atttype, attlen = getTypeAndLen(attvalue) attribute = FileVariableAttribute(attname, map_to_charset(attvalue), atttype, attlen) filevar.attributes.append(attribute) debug(' %s.%s = %s'%(varname, attname, `attvalue`)) # Create dimensions seq = 0 dimensionList = openfile.inquireVariableDimensions(varname) for dimname, dimlen in zip(dimensionList, varshape): dimension = FileVariableDimension(dimname, dimlen, seq) filevar.dimensions.append(dimension) if dimname==aggdimName: filevar.aggdim_first = float(aggvarFirst) filevar.aggdim_last = float(aggvarLast) filevar.aggdim_units = aggvarunits seq += 1 # Set coordinate axis range and type if applicable if len(varshape)==1: var0 = openfile.getVariable(varname, index=0) if var0 is None: continue varn = openfile.getVariable(varname, index=-1) if cfHandler.axisIsLatitude(filevar): filevar.coord_range = genCoordinateRange(var0, varn) if not isValidCoordinateRange(var0, varn): warning("Latitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname)) filevar.coord_type = 'Y' elif cfHandler.axisIsLongitude(filevar): filevar.coord_range = genCoordinateRange(var0, varn) if not isValidCoordinateRange(var0, varn): warning("Longitude coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname)) filevar.coord_type = 'X' elif cfHandler.axisIsLevel(filevar): vararray = openfile.getVariable(varname) filevar.coord_range = genCoordinateRange(var0, varn) if not isValidCoordinateRange(var0, varn): warning("Vertical level coordinate range: %s is suspicious, file = %s, variable = %s"%(filevar.coord_range, openfile.path, varname)) filevar.coord_type = 'Z' filevar.coord_values = str(vararray)[1:-1] # See set_printoptions call above
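
# Illustrative sketch (hypothetical DRS pattern): how the DRS fallback above
# recovers the target variable. In the handler the base pattern comes from
# handler.getFilters()[0]; here a made-up CMIP6-style path regex stands in for
# it, with a named 'variable' group and an appended filename group.
import re

drs_pattern = r'.*/CMIP6/[^/]+/[^/]+/[^/]+/[^/]+/[^/]+/[^/]+/(?P<variable>[^/]+)/[^/]+/[^/]+'
drs_file_pattern = r'%s/(?P<filename>[\w.-]+)$' % drs_pattern

path = ('/data/CMIP6/CMIP/MIROC/MIROC6/historical/r1i1p1f1/Amon/tas/gn/v20190311/'
        'tas_Amon_MIROC6_historical_r1i1p1f1_gn_185001-194912.nc')

m = re.search(drs_file_pattern, path)
if m and 'variable' in m.groupdict():
    print("target variable from DRS: %s" % m.group('variable'))
# -> target variable from DRS: tas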
def extractFromDataset(datasetName, fileIterator, dbSession, handler, cfHandler, aggregateDimensionName=None, offline=False, operation=CREATE_OP, progressCallback=None, stopEvent=None, perVariable=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None, useVersion=-1, forceRescan=False, nodbwrite=False, pid_connector=None, test_publication=False, **context): """ Extract metadata from a dataset represented by a list of files, add to a database. Populates the database tables: - dataset - dataset_version - file - file_version - dataset_file_version - file_variable (partially) - associated attribute tables Returns a Dataset object. datasetName String dataset identifier. fileIterator An iterator that returns an iteration of (file_path, file_size), where file_size is an integer. dbSession A database Session. handler Project handler cfHandler A CF handler instance aggregateDimensionName The name of the dimension across which the dataset is aggregated, if any. offline Boolean, True if the files are offline, cannot be scanned. operation Publication operation, one of CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP progressCallback Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported. stopEvent Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``). If set to True (in another thread) the extraction is stopped. perVariable=None Boolean, overrides ``variable_per_file`` config option. keepVersion Boolean, True if the dataset version should not be incremented. newVersion Set the new version number explicitly. By default the version number is incremented by 1. See keepVersion. extraFields Extra fields dictionary, as from ``readDatasetMap``. masterGateway The gateway that owns the master copy of the datasets. If None, the dataset is not replicated. Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s) as replicated. comment String comment on the dataset version. If the dataset version is not increased, the comment is ignored. useVersion=-1: Integer version number of the dataset version to modify. By default the latest version is modified. forceRescan Boolean, if True force all files to be rescanned on an update. pid_connector ESGF_PID_connector object to register PIDs test_publication Flag whether publication is for production or test context A dictionary with keys ``project``, ``model``, ``experiment``, etc. The context consists of all fields needed to uniquely define the dataset. 
""" session = dbSession() # Get configuration options related to the scan configOptions = {} config = getConfig() if config is not None: section = 'project:%s'%context.get('project') vlstring = config.get(section, 'variable_locate', default=None) if vlstring is not None: fields = splitLine(vlstring) varlocate = [s.split(',') for s in fields] else: varlocate = None line = config.get('DEFAULT', 'checksum', default=None) if line is not None: checksumClient, checksumType = splitLine(line) else: checksumClient = None checksumType = None versionByDate = config.getboolean(section, 'version_by_date', default=False) if not offline: if perVariable is None: perVariable = config.getboolean(section, 'variable_per_file', False) else: perVariable = False else: varlocate = None checksumClient = None checksumType = None versionByDate = False exclude_variables = splitLine(config.get(section, 'thredds_exclude_variables', default=''), sep=',') configOptions['variable_locate'] = varlocate configOptions['checksumClient'] = checksumClient configOptions['checksumType'] = checksumType configOptions['exclude_variables'] = exclude_variables configOptions['perVariable'] = perVariable # Check if the dataset / version is already in the database dset = session.query(Dataset).filter_by(name=datasetName).first() if dset is not None: if operation==CREATE_OP: operation = REPLACE_OP else: if operation in [UPDATE_OP, REPLACE_OP]: operation = CREATE_OP elif operation in [DELETE_OP, RENAME_OP]: raise ESGPublishError("No such dataset: %s"%datasetName) # Cannot add online files to offline dataset, and vice versa if dset is not None and dset.offline != offline: if dset.offline: raise ESGPublishError("Dataset %s is offline, set offline flag or replace the dataset."%dset.name) else: raise ESGPublishError("Dataset %s is online, but offline flag is set."%dset.name) # Cannot publish a replica with the same ID as a local dataset and vice versa if dset is not None and dset.master_gateway != masterGateway: if dset.master_gateway is None: raise ESGPublishError("Dataset %s exists and is not a replica - delete it before publishing a replica of the same name."%dset.name) else: raise ESGPublishError("Dataset %s exists and is a replica. 
Use --replica or delete the existing dataset."%dset.name) createTime = datetime.datetime.now() # DatasetVersion creation_time fobjs = None pathlist = [item for item in fileIterator] if (nodbwrite): dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None), context.get('run_name', None), offline=offline, masterGateway=masterGateway) addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, masterGateway=masterGateway, **context) info("dataset scan complete, not writing to database") return dset elif operation==CREATE_OP: # Create a new dataset info("Creating dataset: %s"%datasetName) dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None), context.get('run_name', None), offline=offline, masterGateway=masterGateway) session.add(dset) # Create an initial dataset version existingVersion = 0 eventFlag = CREATE_DATASET_EVENT addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, masterGateway=masterGateway, useVersion=useVersion, **context) elif operation in [UPDATE_OP, REPLACE_OP]: if operation==REPLACE_OP: versionObj = dset.getVersionObj(-1) else: versionObj = dset.getVersionObj(useVersion) if versionObj is None: raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name)) existingVersion = dset.getVersion() eventFlag = UPDATE_DATASET_EVENT addNewVersion, fobjs = updateDatasetVersion(dset, versionObj, pathlist, session, handler, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, replace=(operation==REPLACE_OP), forceRescan=forceRescan, useVersion=useVersion, **context) elif operation==RENAME_OP: versionObj = dset.getVersionObj(useVersion) if versionObj is None: raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name)) existingVersion = dset.getVersion() eventFlag = UPDATE_DATASET_EVENT addNewVersion = renameFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context) elif operation==DELETE_OP: versionObj = dset.getVersionObj(useVersion) if versionObj is None: raise ESGPublishError("Version %d of dataset %s not found, cannot republish."%(useVersion, dset.name)) existingVersion = dset.getVersion() eventFlag = UPDATE_DATASET_EVENT addNewVersion, fobjs = deleteFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions, aggregateDimensionName=aggregateDimensionName, offline=offline, progressCallback=progressCallback, stopEvent=stopEvent, extraFields=extraFields, **context) else: raise ESGPublishError("Invalid dataset operation: %s"%`operation`) # Create a new dataset version if necessary if useVersion == -1: if keepVersion: if existingVersion<=0: newVersion = getInitialDatasetVersion(versionByDate) else: newVersion = existingVersion elif newVersion is None: newVersion = getNextDatasetVersion(existingVersion, versionByDate) else: 
newVersion = useVersion dset.reaggregate = False if newVersion<existingVersion: versionList = dset.getVersionList() if newVersion in versionList: addNewVersion = False # Add a new version if addNewVersion: datasetTechNotes = datasetTechNotesTitle = None if hasattr(dset, "dataset_tech_notes"): datasetTechNotes = dset.dataset_tech_notes if hasattr(dset, "dataset_tech_notes_title"): datasetTechNotesTitle = dset.dataset_tech_notes_title # if project uses PIDs, generate PID for dataset dataset_pid = None if pid_connector: dataset_pid = pid_connector.make_handle_from_drsid_and_versionnumber(drs_id=datasetName, version_number=newVersion) info("Assigned PID to dataset %s.v%s: %s " % (datasetName, newVersion, dataset_pid)) # if project uses citation, build citation url project_config_section = 'config:%s' %context.get('project') citation_url = handler.get_citation_url(project_config_section, config, datasetName, newVersion, test_publication) newDsetVersionObj = DatasetVersionFactory(dset, version=newVersion, creation_time=createTime, comment=comment, tech_notes=datasetTechNotes, tech_notes_title=datasetTechNotesTitle, pid=dataset_pid, citation_url=citation_url) info("New dataset version = %d"%newDsetVersionObj.version) try: for var in dset.variables: session.delete(var) except IntegrityError as ie: debug("sqlalchemy IntegrityError: " + str(ie)) raise ESGPublishError("Error in creating dataset version, did you already publish this version to the database?") newDsetVersionObj.files.extend(fobjs) event = Event(datasetName, newDsetVersionObj.version, eventFlag) dset.events.append(event) dset.reaggregate = True # Keep the current (latest) version elif addNewVersion and newVersion==existingVersion and operation in [UPDATE_OP, REPLACE_OP]: versionObj.deleteChildren(session) versionObj.reset(creation_time=createTime, comment=comment) info("Keeping dataset version = %d"%versionObj.version) for var in dset.variables: session.delete(var) session.commit() versionObj.files.extend(fobjs) event = Event(datasetName, versionObj.version, eventFlag) dset.events.append(event) dset.reaggregate = True elif masterGateway is not None: # Force version set on replication info("Dataset version = %d"%newVersion) dset.setVersion(newVersion) event = Event(datasetName, newVersion, eventFlag) dset.events.append(event) info("Adding file info to database") session.commit() session.close() return dset
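
# Illustrative sketch (assumed semantics of the version helpers): the version
# bookkeeping in extractFromDataset relies on getInitialDatasetVersion /
# getNextDatasetVersion. A plausible minimal reading, assumed here, is that
# date-based versions are YYYYMMDD integers and other versions simply increment.
import datetime


def get_initial_dataset_version(version_by_date):
    if version_by_date:
        return int(datetime.datetime.now().strftime("%Y%m%d"))
    return 1


def get_next_dataset_version(existing_version, version_by_date):
    if version_by_date:
        today = int(datetime.datetime.now().strftime("%Y%m%d"))
        # Never go backwards if the existing version is already today's date.
        return max(today, existing_version + 1)
    return existing_version + 1


# get_next_dataset_version(20190311, version_by_date=True) -> today's YYYYMMDD
# get_next_dataset_version(3, version_by_date=False)       -> 4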
def checkAndUpdateRepo(cmor_table_path, ds_version):
    """
    Checks for a timestamp file written to a predefined location. If it is not
    present or is too old, pulls the repo at the given path and updates the
    timestamp.
    """
    # This is run during handler initialization and not for each file validation.
    # Pull the repo if it was fetched more than one day ago, or never fetched before.
    if os.path.exists(UPDATE_TIMESTAMP):
        mtime = os.path.getmtime(UPDATE_TIMESTAMP)
        now = time()
        if now - mtime > 86400.0:
            pull_cmor_repo = True
        else:
            pull_cmor_repo = False
    else:
        pull_cmor_repo = True

    if pull_cmor_repo:
        try:
            # Go into the CMOR table path, git-fetch the CMOR table repo,
            # then return to the previous working directory.
            checkedRun('cd {} && git fetch --quiet'.format(cmor_table_path))
            # Update local timestamp
            f = open(UPDATE_TIMESTAMP, "w")
            f.write("CMOR table updated at {}".format(time()))
            f.close()
            debug("Local CMOR table repository fetched or updated")
        except Exception as e:
            warning("Attempt to update the CMOR table repo encountered an error: " + str(e))

    # Change the repo branch in any case
    try:
        # Go into the CMOR table path, stash any changes from a previous checkout,
        # check out the appropriate CMOR table tag, then return to the previous
        # working directory.
        checkedRun('cd {} && git stash --quiet && git checkout {} --quiet'.format(cmor_table_path, ds_version))
        # Update local timestamp
        f = open(UPDATE_TIMESTAMP, "w")
        f.write("CMOR table updated at {}".format(time()))
        f.close()
        debug("Consider CMOR table tag: {}".format(ds_version))
    except Exception as e:
        raise ESGPublishError("Error: data_specs_version tag %s not found in the CMOR tables, or another error occurred. Please contact support" % ds_version)

    # Get the most up-to-date CMIP6_CV in any case
    if ds_version != "master":
        try:
            # PrePARE requires the most up-to-date CMIP6 CV, so update
            # CMIP6_CV.json from the master branch.
            checkedRun('cd {} && git checkout master CMIP6_CV.json --quiet'.format(cmor_table_path))
            debug("CMIP6 CV updated from master")
        except Exception as e:
            raise ESGPublishError("Master branch does not exist, CMIP6_CV.json not found, or another error occurred. Please contact support")
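
# Illustrative sketch (not the publisher code): the "fetch at most once a day"
# gate used above, as a reusable helper. The timestamp path in the usage
# comment is hypothetical.
import os
from time import time

ONE_DAY = 86400.0


def should_refresh(timestamp_path, max_age=ONE_DAY):
    """True if the marker file is missing or older than max_age seconds."""
    if not os.path.exists(timestamp_path):
        return True
    return (time() - os.path.getmtime(timestamp_path)) > max_age


def touch(timestamp_path):
    with open(timestamp_path, "w") as f:
        f.write("CMOR table updated at {}".format(time()))


# if should_refresh("/tmp/cmor_table_update_timestamp"):
#     ...  # git fetch, then: touch("/tmp/cmor_table_update_timestamp")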
def validateFile(self, fileobj): """ for CMIP6, this will first verify if the data is written by CMOR at the correct version set in the ini file. If so, the file is declared valid. If not, file will go through PrePARE (CV) check. PrePARE runs CFChecker Raises ESGPublishError if settings are missing or file fails the checks. Raise ESGInvalidMetadataFormat if the file cannot be processed by this handler. """ validator = PrePARE.PrePARE f = fileobj.path # todo refactoring these could loaded upfront in the constructor config = getConfig() project_section = 'project:' + self.name project_config_section = 'config:' + self.name min_cmor_version = config.get(project_section, "min_cmor_version", default="0.0.0") min_ds_version = config.get(project_section, "min_data_specs_version", default="0.0.0") data_specs_version = config.get(project_config_section, "data_specs_version", default="master") cmor_table_path = config.get(project_config_section, "cmor_table_path", default=DEFAULT_CMOR_TABLE_PATH) force_validation = config.getboolean(project_config_section, "force_validation", default=False) cmor_table_subdirs = config.getboolean(project_config_section, "cmor_table_subdirs", default=False) if not force_validation: if self.replica: info("skipping PrePARE for replica (file %s)" % f) return try: file_cmor_version = fileobj.getAttribute('cmor_version', None) except: file_cmor_version = None debug('File %s missing cmor_version attribute; will proceed with PrePARE check' % f) passed_cmor = False if compareLibVersions(min_cmor_version, file_cmor_version): debug('File %s cmor-ized at version %s, passed!'%(f, file_cmor_version)) passed_cmor = True try: table = fileobj.getAttribute('table_id', None) except: raise ESGPublishError("File %s missing required table_id global attribute" % f) try: variable_id = fileobj.getAttribute('variable_id', None) except: raise ESGPublishError("File %s missing required variable_id global attribute" % f) # data_specs_version drives CMOR table fetching # Behavior A (default): fetches "master" branch" (if not "data_specs_version" in esg.ini") # Behavior A: fetches branch specified by "data_specs_version=my_branch" into esg.ini # Behavior B: fetches branch specified by file global attributes using "data_specs_version=file" into esg.ini try: file_data_specs_version = fileobj.getAttribute('data_specs_version', None) except Exception as e: raise ESGPublishError("File %s missing required data_specs_version global attribute"%f) if not compareLibVersions(min_ds_version, file_data_specs_version): raise ESGPublishError("File %s data_specs_version is %s, which is less than the required minimum version of %s"%(f,file_data_specs_version,min_ds_version)) # at this point the file has the correct data specs version. # if also was CMORized and has the correct version tag, we can exit if (not force_validation) and passed_cmor: return if data_specs_version == "file": data_specs_version = file_data_specs_version table_dir = getTableDir(cmor_table_path, data_specs_version, cmor_table_subdirs) debug("Validating {} using tables dir: {}".format(f, table_dir)) try: process = validator.checkCMIP6(table_dir) if process is None: raise ESGPublishError("File %s failed the CV check - object create failure"%f) process.ControlVocab(f) except: raise ESGPublishError("File %s failed the CV check"%f)
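
# Illustrative sketch (assumed behavior of getTableDir): the newer validateFile
# resolves a per-data_specs_version table directory via
# getTableDir(cmor_table_path, data_specs_version, cmor_table_subdirs). A
# plausible reading, assuming "subdirs" means one checkout per tag under the
# base path; the "/Tables" layout is also an assumption.
import os


def get_table_dir(cmor_table_path, data_specs_version, cmor_table_subdirs):
    if cmor_table_subdirs:
        # e.g. <cmor_table_path>/<data_specs_version>/Tables  (layout assumed)
        return os.path.join(cmor_table_path, data_specs_version, "Tables")
    # Single shared checkout: checkAndUpdateRepo-style branch switching applies.
    return cmor_table_path


# get_table_dir("/usr/local/cmip6-cmor-tables", "01.00.29", True)
# -> '/usr/local/cmip6-cmor-tables/01.00.29/Tables'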