def readContext(self, cdfile):
    "Get a dictionary of key/value pairs from an open file."
    f = cdfile.file
    result = {}
    if hasattr(f, 'title'):
        result['title'] = f.title
    if hasattr(f, 'Conventions'):
        result['Conventions'] = f.Conventions
    if hasattr(f, 'source'):
        result['source'] = f.source
    if hasattr(f, 'history'):
        result['history'] = f.history

    config = getConfig()
    projectSection = 'project:' + self.name
    config_key = "extract_global_attrs"
    if config.has_option(projectSection, config_key):
        cdms_file = cdms_open(self.path)
        for key in splitLine(config.get(projectSection, config_key), ','):
            # check for mapped keys
            if ':' in key:
                parts = key.split(':')
                value = cdms_file.__getattribute__(parts[0])
                result[parts[1]] = value
            else:
                result[key] = cdms_file.__getattribute__(key)
    return result
def getDatasetIdFields(self):
    """Get a list of (lists of) fields associated with the dataset ID.
    This may be passed to ``generateDatasetId``.
    """
    config = getConfig()
    section = 'project:' + self.name
    dataset_id_formats = splitLine(config.get(section, 'dataset_id', raw=True))
    idfields = [re.findall(_patpat, format) for format in dataset_id_formats]
    return idfields, dataset_id_formats
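# A minimal sketch of how a dataset_id format string is decomposed into a field list.
# The _patpat definition and the format value below are assumptions for illustration
# (the real pattern and dataset_id option come from the module and esg.ini).
import re

_patpat = r'%\(([^()]*)\)s'                                # assumed placeholder pattern
fmt = '%(project)s.%(model)s.%(experiment)s.%(version)s'   # hypothetical dataset_id format

print(re.findall(_patpat, fmt))
# ['project', 'model', 'experiment', 'version']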
def initializeFields(self, Session):
    """Initialize field names and options based on the configuration file."""
    from esgcet.model import Model, Experiment
    config = getConfig()
    projectSection = "project:" + self.name
    categoryOption = config.get(projectSection, "categories")
    categorySpecs = splitRecord(categoryOption)
    for category, categoryTypeS, isMandatoryS, isThreddsPropertyS, displayOrderS in categorySpecs:
        categoryType = getCategoryType(categoryTypeS)
        isMandatory = getBoolean(isMandatoryS)
        isThreddsProperty = getBoolean(isThreddsPropertyS)
        displayOrder = string.atoi(displayOrderS)
        self.fieldNames[category] = (categoryType, isMandatory, isThreddsProperty, displayOrder)

    categoryDefaultsOption = config.get(projectSection, "category_defaults", default=None, raw=True)
    if categoryDefaultsOption is not None:
        categoryDefaultsSpecs = splitRecord(categoryDefaultsOption)
        for category, categoryDefault in categoryDefaultsSpecs:
            self.categoryDefaults[category] = categoryDefault

    session = Session()

    # Find any new experiments. This allows experiments to be added to the config file without
    # running esginitialize.
    if self.fieldNames.has_key("experiment") and self.fieldNames["experiment"][WIDGET_TYPE] == ENUM:
        initializeExperiments(config, self.name, session)

    for category in self.getFieldNames():
        # At the moment some fields are predefined
        if category == "project":
            projects = splitRecord(config.get(projectSection, "project_options", default=""))
            self.validValues["project"] = [x[0] for x in projects]
        elif category == "model":
            models = session.query(Model).filter_by(project=self.name).all()
            self.validValues["model"] = [x.name for x in models]
        elif category == "experiment":
            experiments = session.query(Experiment).filter_by(project=self.name).all()
            self.validValues["experiment"] = [x.name for x in experiments]
        elif category == "creator":
            creators = splitRecord(config.get(projectSection, "creator_options", default=""))
            self.validValues["creator"] = [x[0] for x in creators]
            self.validMaps["creator"] = genMap(creators)
        elif category == "publisher":
            publishers = splitRecord(config.get(projectSection, "publisher_options", default=""))
            self.validValues["publisher"] = [x[0] for x in publishers]
            self.validMaps["publisher"] = genMap(publishers)
        else:
            categoryType = self.getFieldType(category)
            if categoryType == ENUM:
                option = category + "_options"
                self.validValues[category] = splitLine(config.get(projectSection, option), ",")
        self.context[category] = ""

    session.close()
def getDirectoryFormatFilters(self):
    """Return a list of regular expression filters associated with the ``directory_format``
    option in the configuration file. This can be passed to ``nodeIterator`` and
    ``processNodeMatchIterator``.
    """
    config = getConfig()
    section = "project:" + self.name
    directory_format = config.get(section, "directory_format", raw=True)
    formats = splitLine(directory_format)
    filters = []
    for format in formats:
        pat = format.strip()
        pat2 = pat.replace("\.", "__ESCAPE_DOT__")
        pat3 = pat2.replace(".", r"\.")
        pat4 = pat3.replace("__ESCAPE_DOT__", r"\.")
        # pattern = re.sub(_patpat, r'(?P<\1>[^/.]*)', pat4)
        pattern = re.sub(_patpat, r"(?P<\1>[^/]*)", pat4)
        filter = "^" + pattern + "$"
        filters.append(filter)
    return filters
def getFilters(self, option='directory_format'):
    """Return a list of regular expression filters associated with the option
    in the configuration file. This can be passed to ``nodeIterator`` and
    ``processNodeMatchIterator``.
    """
    config = getConfig()
    section = 'project:' + self.name
    directory_format = config.get(section, option, raw=True)
    formats = splitLine(directory_format)
    filters = []
    for format in formats:
        pat = format.strip()
        pat2 = pat.replace('\.', '__ESCAPE_DOT__')
        pat3 = pat2.replace('.', r'\.')
        pat4 = pat3.replace('__ESCAPE_DOT__', r'\.')
        # pattern = re.sub(_patpat, r'(?P<\1>[^/.]*)', pat4)
        pattern = re.sub(_patpat, r'(?P<\1>[^/]*)', pat4)
        filter = '^' + pattern + '$'
        filters.append(filter)
    return filters
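# A minimal sketch of the directory_format-to-regex conversion performed above.
# The format string and _patpat definition are assumptions for illustration; real
# values come from esg.ini and the module-level pattern.
import re

_patpat = r'%\(([^()]*)\)s'                                       # assumed placeholder pattern
fmt = '/data/%(project)s/%(model)s/%(experiment)s/%(variable)s'   # hypothetical directory_format

pat = fmt.strip().replace('.', r'\.')   # escape literal dots (the handler also protects pre-escaped '\.')
pattern = '^' + re.sub(_patpat, r'(?P<\1>[^/]*)', pat) + '$'

m = re.match(pattern, '/data/cmip5/mymodel/historical/tas')
print(m.groupdict())
# {'project': 'cmip5', 'model': 'mymodel', 'experiment': 'historical', 'variable': 'tas'}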
def getMaps(self):
    """Get a dictionary of maps from the project section.
    """
    config = getConfig()
    section = 'project:' + self.name
    if self.mapdict is None:
        mapdict = {}
        projectMaps = splitLine(config.get(section, 'maps', default=""), ',')
        for option in projectMaps:
            if option == "":
                continue
            fromcat, tocat, projectMap = splitMap(config.get(section, option))
            for to_index, field in enumerate(tocat):
                value = (fromcat, projectMap, to_index)
                if mapdict.has_key(field):
                    mapdict[field].append(value)
                else:
                    mapdict[field] = [value]
        self.mapdict = mapdict
    return self.mapdict
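# A minimal sketch of the structure built above, using hypothetical values shaped to be
# consistent with how splitMap()'s output is consumed in validateContext(): from-field
# names, to-field names, and a dict keyed by tuples of from-values.
fromcat = ('model',)                                      # source fields of the map (assumed)
tocat = ('model_description', 'institute')                # target fields of the map (assumed)
projectMap = {('mymodel',): ('My Model v1', 'MYINST')}    # from-values -> to-values (assumed shape)

mapdict = {}
for to_index, field in enumerate(tocat):
    mapdict.setdefault(field, []).append((fromcat, projectMap, to_index))

# mapdict['institute'] -> [(('model',), {...}, 1)]: to fill 'institute', look up the
# from-values in projectMap and take element 1 of the resulting tuple.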
def readContext(self, cdfile):
    "Get a dictionary of key/value pairs from an open file."
    f = cdfile.file
    result = {}
    if hasattr(f, 'title'):
        result['title'] = f.title
    if hasattr(f, 'Conventions'):
        result['Conventions'] = f.Conventions
    if hasattr(f, 'source'):
        result['source'] = f.source
    if hasattr(f, 'history'):
        result['history'] = f.history

    config = getConfig()
    projectSection = 'project:' + self.name
    config_key = "extract_global_attrs"
    if config.has_option(projectSection, config_key):
        for key in splitLine(config.get(projectSection, config_key), ','):
            result[key] = cdfile.getAttribute(key, None)
    return result
def validateContext(self, context):
    """
    Validate context values:

    - Mandatory values must be non-blank, and if enumerated must have a valid value
    - Non-mandatory enumerated values must be either blank or one of the valid values

    Raises ESGPublishError if a validation error occurs.

    If the validate configuration option is set to False in the project section,
    validation always succeeds.
    """
    if not self.validate:
        return

    for key in context.keys():
        fieldType = self.getFieldType(key)

        # Ignore non-configured fields
        if fieldType is None:
            continue

        isenum = (fieldType == ENUM)
        if isenum:
            options = self.getFieldOptions(key)
        value = context[key]

        config = getConfig()
        project_section = 'project:%s' % self.name
        delimiter = config.get(project_section, key + "_delimiter", default="")

        if value in ['', None]:
            # if value not in default context, try to get it from key_pattern or *_map
            option = '%s_pattern' % key
            if config.has_option(project_section, option):
                value = config.get(project_section, option, False, context)
                context[key] = value
            elif config.has_option(project_section, 'maps'):
                for map_option in splitLine(config.get(project_section, 'maps', default=''), ','):
                    from_keys, to_keys, value_dict = splitMap(config.get(project_section, map_option))
                    if key in to_keys:
                        from_values = tuple(context[k] for k in from_keys)
                        to_values = value_dict[from_values]
                        value = to_values[to_keys.index(key)]
                        context[key] = value

        if self.isMandatory(key):
            if value in ['', None]:
                if isenum:
                    raise ESGInvalidMandatoryField("Mandatory field '%s' not set, must be one of %s" % (key, `options`))
                else:
                    raise ESGInvalidMandatoryField("Mandatory field '%s' not set" % key)
            elif isenum and not self.compareEnumeratedValue(value, options, delimiter):
                validOptions = self.mapValidFieldOptions(key, options)
                raise ESGInvalidMandatoryField("Invalid value of mandatory field '%s': %s, must be one of %s" % (key, value, `validOptions`))
        elif isenum:  # non-mandatory field
            options += ['', None]
            if not self.compareEnumeratedValue(value, options, delimiter):
                validOptions = self.mapValidFieldOptions(key, options)
                raise ESGPublishError("Invalid value of '%s': %s, must be one of %s" % (key, value, `validOptions`))
def readDatasetMap(mappath, parse_extra_fields=False):
    """Read a dataset map.

    A dataset map is a text file, each line having the form:

      dataset_id | absolute_file_path | size [ | ``from_file`` =<path> [ | extra_field=extra_value ...]]

    where dataset_id has the form dataset_name[#version].

    Returns (if parse_extra_fields=False) a dataset map - a dictionary:

      dataset_id => [(path, size), (path, size), ...]

    If parse_extra_fields=True, returns a tuple (dataset_map, extra_dictionary). See parse_extra_fields.

    mappath
      Name of the dataset map.
    parse_extra_fields
      Boolean; if True then parse any extra fields of the form *extra_field=extra_value*,
      and return a dictionary with items of the form:

        extrafields[(dataset_name, version_number, absolute_file_path, *field_name*)] => field_value

      where *field_name* is one of:

      - ``from_file``
      - ``mod_time``
    """
    datasetMap = {}
    extraFieldMap = {}
    mapfile = open(mappath)
    for line in mapfile.readlines():
        if line[0] == '#' or line.strip() == '':
            continue

        if parse_extra_fields:
            fields = splitLine(line)
            versionName, path, size = fields[0:3]
            datasetName, versionno = parseDatasetVersionId(versionName)
            if len(fields) > 3:
                for field in fields[3:]:
                    efield, evalue = field.split('=')
                    extraFieldMap[(datasetName, versionno, path, efield.strip())] = evalue.strip()
            if datasetMap.has_key((datasetName, versionno)):
                datasetMap[(datasetName, versionno)].append((path, size))
            else:
                datasetMap[(datasetName, versionno)] = [(path, size)]
        else:
            datasetId, path, size = splitLine(line)[0:3]
            versionId = parseDatasetVersionId(datasetId)
            if datasetMap.has_key(versionId):
                datasetMap[versionId].append((path, size))
            else:
                datasetMap[versionId] = [(path, size)]

    mapfile.close()

    for value in datasetMap.values():
        value.sort()

    if parse_extra_fields:
        return (datasetMap, extraFieldMap)
    else:
        return datasetMap
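# A minimal usage sketch, assuming a map file like one written by esgscan_directory.
# The file name and its contents below are hypothetical:
#
#   cmip5.mymodel.historical#20200101 | /data/tas_day.nc | 1024 | mod_time=1500000000.0
#   cmip5.mymodel.historical#20200101 | /data/pr_day.nc | 2048 | mod_time=1500000001.0
dmap, extra = readDatasetMap('mydatasets.map', parse_extra_fields=True)
for key, entries in dmap.items():
    dataset_name, version = key
    print(entries)                                                        # [(path, size), ...]; sizes are strings
    print(extra.get((dataset_name, version, entries[0][0], 'mod_time')))  # extra field for the first file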
def extractFromDataset(datasetName, fileIterator, dbSession, handler, cfHandler, aggregateDimensionName=None,
                       offline=False, operation=CREATE_OP, progressCallback=None, stopEvent=None, perVariable=None,
                       keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None,
                       useVersion=-1, forceRescan=False, nodbwrite=False, pid_connector=None,
                       test_publication=False, **context):
    """
    Extract metadata from a dataset represented by a list of files, add to a database. Populates the database tables:

    - dataset
    - dataset_version
    - file
    - file_version
    - dataset_file_version
    - file_variable (partially)
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.
    fileIterator
      An iterator that returns an iteration of (file_path, file_size), where file_size is an integer.
    dbSession
      A database Session.
    handler
      Project handler.
    cfHandler
      A CF handler instance.
    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.
    offline
      Boolean, True if the files are offline, cannot be scanned.
    operation
      Publication operation, one of CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP.
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``,
      ``initial`` is the initial value reported, ``final`` is the final value reported.
    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``).
      If set to True (in another thread) the extraction is stopped.
    perVariable=None
      Boolean, overrides ``variable_per_file`` config option.
    keepVersion
      Boolean, True if the dataset version should not be incremented.
    newVersion
      Set the new version number explicitly. By default the version number is incremented by 1. See keepVersion.
    extraFields
      Extra fields dictionary, as from ``readDatasetMap``.
    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s) as replicated.
    comment
      String comment on the dataset version. If the dataset version is not increased, the comment is ignored.
    useVersion=-1
      Integer version number of the dataset version to modify. By default the latest version is modified.
    forceRescan
      Boolean, if True force all files to be rescanned on an update.
    pid_connector
      ESGF_PID_connector object to register PIDs.
    test_publication
      Flag whether publication is for production or test.
    context
      A dictionary with keys ``project``, ``model``, ``experiment``, etc. The context consists of all fields
      needed to uniquely define the dataset.
    """
    session = dbSession()

    # Get configuration options related to the scan
    configOptions = {}
    config = getConfig()
    if config is not None:
        section = 'project:%s' % context.get('project')
        vlstring = config.get(section, 'variable_locate', default=None)
        if vlstring is not None:
            fields = splitLine(vlstring)
            varlocate = [s.split(',') for s in fields]
        else:
            varlocate = None
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None
            checksumType = None
        versionByDate = config.getboolean(section, 'version_by_date', default=False)
        if not offline:
            if perVariable is None:
                perVariable = config.getboolean(section, 'variable_per_file', False)
        else:
            perVariable = False
    else:
        varlocate = None
        checksumClient = None
        checksumType = None
        versionByDate = False

    exclude_variables = splitLine(config.get(section, 'thredds_exclude_variables', default=''), sep=',')

    configOptions['variable_locate'] = varlocate
    configOptions['checksumClient'] = checksumClient
    configOptions['checksumType'] = checksumType
    configOptions['exclude_variables'] = exclude_variables
    configOptions['perVariable'] = perVariable

    # Check if the dataset / version is already in the database
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is not None:
        if operation == CREATE_OP:
            operation = REPLACE_OP
    else:
        if operation in [UPDATE_OP, REPLACE_OP]:
            operation = CREATE_OP
        elif operation in [DELETE_OP, RENAME_OP]:
            raise ESGPublishError("No such dataset: %s" % datasetName)

    # Cannot add online files to offline dataset, and vice versa
    if dset is not None and dset.offline != offline:
        if dset.offline:
            raise ESGPublishError("Dataset %s is offline, set offline flag or replace the dataset." % dset.name)
        else:
            raise ESGPublishError("Dataset %s is online, but offline flag is set." % dset.name)

    # Cannot publish a replica with the same ID as a local dataset and vice versa
    if dset is not None and dset.master_gateway != masterGateway:
        if dset.master_gateway is None:
            raise ESGPublishError("Dataset %s exists and is not a replica - delete it before publishing a replica of the same name." % dset.name)
        else:
            raise ESGPublishError("Dataset %s exists and is a replica. Use --replica or delete the existing dataset." % dset.name)

    createTime = datetime.datetime.now()  # DatasetVersion creation_time
    fobjs = None
    pathlist = [item for item in fileIterator]
    if nodbwrite:
        dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None),
                       context.get('run_name', None), offline=offline, masterGateway=masterGateway)
        addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions,
                                             aggregateDimensionName=aggregateDimensionName, offline=offline,
                                             progressCallback=progressCallback, stopEvent=stopEvent,
                                             extraFields=extraFields, masterGateway=masterGateway, **context)
        info("dataset scan complete, not writing to database")
        return dset

    elif operation == CREATE_OP:
        # Create a new dataset
        info("Creating dataset: %s" % datasetName)
        dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None),
                       context.get('run_name', None), offline=offline, masterGateway=masterGateway)
        session.add(dset)

        # Create an initial dataset version
        existingVersion = 0
        eventFlag = CREATE_DATASET_EVENT
        addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions,
                                             aggregateDimensionName=aggregateDimensionName, offline=offline,
                                             progressCallback=progressCallback, stopEvent=stopEvent,
                                             extraFields=extraFields, masterGateway=masterGateway,
                                             useVersion=useVersion, **context)

    elif operation in [UPDATE_OP, REPLACE_OP]:
        if operation == REPLACE_OP:
            versionObj = dset.getVersionObj(-1)
        else:
            versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish." % (useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = updateDatasetVersion(dset, versionObj, pathlist, session, handler, cfHandler, configOptions,
                                                    aggregateDimensionName=aggregateDimensionName, offline=offline,
                                                    progressCallback=progressCallback, stopEvent=stopEvent,
                                                    extraFields=extraFields, replace=(operation == REPLACE_OP),
                                                    forceRescan=forceRescan, useVersion=useVersion, **context)

    elif operation == RENAME_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish." % (useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion = renameFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions,
                                           aggregateDimensionName=aggregateDimensionName, offline=offline,
                                           progressCallback=progressCallback, stopEvent=stopEvent,
                                           extraFields=extraFields, **context)

    elif operation == DELETE_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish." % (useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = deleteFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions,
                                                  aggregateDimensionName=aggregateDimensionName, offline=offline,
                                                  progressCallback=progressCallback, stopEvent=stopEvent,
                                                  extraFields=extraFields, **context)
    else:
        raise ESGPublishError("Invalid dataset operation: %s" % `operation`)

    # Create a new dataset version if necessary
    if useVersion == -1:
        if keepVersion:
            if existingVersion <= 0:
                newVersion = getInitialDatasetVersion(versionByDate)
            else:
                newVersion = existingVersion
        elif newVersion is None:
            newVersion = getNextDatasetVersion(existingVersion, versionByDate)
    else:
        newVersion = useVersion

    dset.reaggregate = False

    if newVersion < existingVersion:
        versionList = dset.getVersionList()
        if newVersion in versionList:
            addNewVersion = False

    # Add a new version
    if addNewVersion:
        datasetTechNotes = datasetTechNotesTitle = None
        if hasattr(dset, "dataset_tech_notes"):
            datasetTechNotes = dset.dataset_tech_notes
        if hasattr(dset, "dataset_tech_notes_title"):
            datasetTechNotesTitle = dset.dataset_tech_notes_title

        # if project uses PIDs, generate PID for dataset
        dataset_pid = None
        if pid_connector:
            dataset_pid = pid_connector.make_handle_from_drsid_and_versionnumber(drs_id=datasetName, version_number=newVersion)
            info("Assigned PID to dataset %s.v%s: %s " % (datasetName, newVersion, dataset_pid))

        # if project uses citation, build citation url
        project_config_section = 'config:%s' % context.get('project')
        citation_url = handler.get_citation_url(project_config_section, config, datasetName, newVersion, test_publication)

        newDsetVersionObj = DatasetVersionFactory(dset, version=newVersion, creation_time=createTime, comment=comment,
                                                  tech_notes=datasetTechNotes, tech_notes_title=datasetTechNotesTitle,
                                                  pid=dataset_pid, citation_url=citation_url)
        info("New dataset version = %d" % newDsetVersionObj.version)
        try:
            for var in dset.variables:
                session.delete(var)
        except IntegrityError as ie:
            debug("sqlalchemy IntegrityError: " + str(ie))
            raise ESGPublishError("Error in creating dataset version, did you already publish this version to the database?")
        newDsetVersionObj.files.extend(fobjs)
        event = Event(datasetName, newDsetVersionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True

    # Keep the current (latest) version
    elif addNewVersion and newVersion == existingVersion and operation in [UPDATE_OP, REPLACE_OP]:
        versionObj.deleteChildren(session)
        versionObj.reset(creation_time=createTime, comment=comment)
        info("Keeping dataset version = %d" % versionObj.version)
        for var in dset.variables:
            session.delete(var)
        session.commit()
        versionObj.files.extend(fobjs)
        event = Event(datasetName, versionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True

    elif masterGateway is not None:
        # Force version set on replication
        info("Dataset version = %d" % newVersion)
        dset.setVersion(newVersion)
        event = Event(datasetName, newVersion, eventFlag)
        dset.events.append(event)

    info("Adding file info to database")
    session.commit()
    session.close()

    return dset
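# A minimal, hypothetical driver for extractFromDataset(), assuming an initialized
# database (initdb), registered project handlers, and a configured esg.ini project.
# The paths, project name, and the getMetadataHandler() accessor are assumptions for
# illustration, not a guaranteed API.
config, Session = initdb(init_file=None)
registerHandlers()

handler = getHandlerByName('cmip5', None, Session)             # hypothetical project
cfHandler = handler.getMetadataHandler(sessionMaker=Session)   # assumed accessor for the CF metadata handler

files = [('/data/tas_day.nc', 1024), ('/data/pr_day.nc', 2048)]  # (file_path, file_size) pairs
dset = extractFromDataset('cmip5.mymodel.historical', iter(files), Session, handler, cfHandler,
                          operation=CREATE_OP, project='cmip5', model='mymodel', experiment='historical')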
def esgscanWrapper(directoryList, **kw):

    if len(directoryList) == 0:
        raise ESGPublishError('No directory specified')

    output = sys.stdout
    appendMap = None
    appendPath = kw.get("appendPath", None)
    if appendPath is not None:
        if os.path.exists(appendPath):
            appendMap = readDatasetMap(appendPath)
        else:
            appendMap = {}
        output = open(appendPath, 'a')
    datasetName = kw.get("datasetName", None)
    filefilt = kw.get("fileFilt", '.*\.nc$')
    init_file = kw.get("initFile", None)
    offline = kw.get("offline", False)
    outputPath = kw.get("outputPath", None)
    if outputPath is not None:
        output = open(outputPath, 'w')
    else:
        output = sys.stdout
    projectName = kw.get("projectName", None)
    readFiles = kw.get("readFiles", False)
    service = kw.get("service", None)

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(directoryList, filefilt, datasetName=datasetName)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(directoryList, filefilt, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()
        for datasetId in keys:
            direcTuple = datasetMap[datasetId]
            direcTuple.sort()
            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet
                    extraStuff = "mod_time=%f" % float(mtime)

                    if checksumClient is not None:
                        csum = checksum(filepath, checksumClient)
                        extraStuff += " | checksum=%s | checksum_type=%s" % (csum, checksumType)

                    # Print the map entry if:
                    # - The map is being created, not appended, or
                    # - The existing map does not have the dataset, or
                    # - The existing map has the dataset, but not the file.
                    if (appendMap is None) or (not appendMap.has_key(datasetId)) or ((filepath, "%d" % size) not in appendMap[datasetId]):
                        print >> output, "%s | %s | %d | %s" % (datasetId, filepath, size, extraStuff)

    else:  # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s" % projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(directoryList)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                  filefilt=filefilt, datasetName=datasetName, offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f" % float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or ((filepath, "%d" % size) not in appendMap[dsetName]):
                print >> output, "%s | %s | %d %s" % (dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
def generateDirectoryMap(self, directoryList, filefilt, initContext=None, datasetName=None, use_version=False):
    """Generate a directory map. Recursively scan each directory in *directoryList*,
    locating each directory with at least one file matching filefilt.

    Returns a directory map (dictionary) mapping

      dataset_id => [(directory_path, filepath), (directory_path, filepath), ...]

    where the dataset_id is generated by matching the 'directory_format' configuration
    option to each directory path. The map has one entry per directory, where it is
    assumed that all files in the directory belong to the same dataset.

    directoryList
      List of directories to scan. The scan searches for directories matching the
      'directory_format' configuration file option for this project, and having at
      least one file matching *filefilt*.
    filefilt
      Regular expression as defined by the Python **re** module. Matched against the file basename.
    initContext
      Dictionary of field => value items. Entries override values found from matching the directory paths.
    datasetName
      Name of the dataset. If not specified, generate with ``generateDatasetId()``.
    """
    from esgcet.publish import nodeIterator

    # If the dataset name is specified, no need to get directory format filters
    if datasetName is None:
        # Get the dataset_id and filters
        filters = self.getFilters()
        config = getConfig()
        section = 'project:' + self.name
        dataset_id_formats = splitLine(config.get(section, 'dataset_id', raw=True))
        idfields = [re.findall(_patpat, format) for format in dataset_id_formats]
    else:
        filters = [r'.*$']

    # Iterate over nodes
    mapdict = self.getMaps()
    datasetMap = {}
    for direc in directoryList:
        if direc[-1] == '/':
            direc = direc[:-1]
        nodeiter = nodeIterator(direc, filters, filefilt)
        for nodepath, filepath, groupdict in nodeiter:
            if initContext is not None:
                groupdict.update(initContext)
            if not groupdict.has_key('project'):
                groupdict['project'] = self.name
            if datasetName is None:
                try:
                    datasetId = self.generateDatasetId('dataset_id', idfields, groupdict, multiformat=dataset_id_formats)
                    if use_version and 'version' in groupdict:
                        drsversion = groupdict['version']
                        if not re.match('^[0-9]+$', drsversion[0]):  # e.g. vYYYYMMDD
                            drsversion = drsversion[1:]
                        datasetId += '#%s' % drsversion
                except:
                    allfields = reduce(lambda x, y: set(x) | set(y), idfields)  # union of fields over all formats
                    missingFields = list((set(allfields) - set(groupdict.keys())) - set(config.options(section)))
                    raise ESGPublishError("Cannot generate a value for dataset_id. One of the following fields could not be determined from the directory structure: %s\nDirectory = %s" % (`missingFields`, nodepath))
            else:
                warning("Empty dataset name. Check that directory hierarchy format matches the configured format string in esg.ini")
                datasetId = datasetName
            if datasetMap.has_key(datasetId):
                datasetMap[datasetId].append((nodepath, filepath))
            else:
                datasetMap[datasetId] = [(nodepath, filepath)]

    if len(datasetMap) == 0:
        warning("Empty datasetMap. Check that directory hierarchy format matches the configured format string in esg.ini")

    return datasetMap
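# A minimal usage sketch, assuming a handler and Session obtained as in esgscanWrapper().
# The directory and project name are hypothetical.
handler = getHandlerByName('cmip5', None, Session)
dmap = handler.generateDirectoryMap(['/data/cmip5/mymodel/historical'], r'.*\.nc$')
for dataset_id, entries in dmap.items():
    print(dataset_id)   # id built from the directory_format fields
    print(entries)      # [(directory_path, file_path), ...]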
def parseDatasetName(self, datasetName, context):
    """Parse a dataset name.

    Returns a dictionary, mapping field => value. The config file option 'dataset_id'
    is used to parse the name into fields.

    datasetName
      String dataset identifier.
    context
      Initial context dictionary. This argument is altered on output.
    """
    config = getConfig()
    section = 'project:' + self.name
    datasetIdFormatList = config.get(section, 'dataset_id', raw=True, default=None)
    if datasetIdFormatList is None:
        # warning("No dataset_id option found for project %s"%self.name)
        return context
    datasetIdFormats = splitLine(datasetIdFormatList)

    formatMatched = False
    for idFormat in datasetIdFormats:

        # '.' => '\.'
        newinit = re.sub(r'\.', r'\.', idFormat.strip())

        # %(name)s => (?P<name>[^.]*)
        newinit = re.sub(_patpat, r'(?P<\1>[^.]*)', newinit)

        # If experiment is enumerated, match on the experiment options. This allows
        # experiment ids to contain periods (.) .
        experimentOptions = self.getFieldOptions('experiment')

        # Map to case-sensitive options
        experimentOptions = self.mapValidFieldOptions('experiment', experimentOptions)
        if idFormat.find('%(experiment)s') != -1 and experimentOptions is not None:
            if len(experimentOptions) > 0:
                optionOr = reduce(lambda x, y: x + '|' + y, experimentOptions)
                experimentPattern = r'(?P<experiment>%s)' % optionOr
                newinit = newinit.replace('(?P<experiment>[^.]*)', experimentPattern)
        if newinit[-1] != '$':
            newinit += '$'

        match = re.match(newinit, datasetName)
        if match is None:
            continue
        else:
            result = match.groupdict()
            formatMatched = True
            for key, value in result.items():
                if context.has_key(key) and value != context[key]:
                    warning("Dataset ID=%s, but %s=%s" % (datasetName, key, context[key]))
                else:
                    context[str(key)] = value
            break

    if not formatMatched:
        warning("Dataset ID: %s does not match the dataset_id format(s): %s" % (datasetName, `datasetIdFormats`))

    return context
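# A worked sketch of the regex construction used above, on a hypothetical dataset_id
# format; _patpat is assumed to be the module-level placeholder pattern.
import re

_patpat = r'%\(([^()]*)\)s'                        # assumed definition
idFormat = '%(project)s.%(model)s.%(experiment)s'  # hypothetical dataset_id format

newinit = re.sub(r'\.', r'\.', idFormat.strip())              # escape literal dots
newinit = re.sub(_patpat, r'(?P<\1>[^.]*)', newinit) + '$'    # placeholders -> named groups

print(re.match(newinit, 'cmip5.mymodel.historical').groupdict())
# {'project': 'cmip5', 'model': 'mymodel', 'experiment': 'historical'}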
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:ehi:o:p:", ['dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',
                                                            'filter=', 'help', 'max-threads=', 'offline', 'output=',
                                                            'project=', 'property=', 'read-directories', 'read-files',
                                                            'service=', 'use-version-dir', 'version='])
    except getopt.error:
        print sys.exc_value
        return

    if len(lastargs) == 0:
        print 'No directory specified'
        return

    appendMap = None
    datasetName = None
    datasetTechNotesURL = None
    datasetTechNotesTitle = None
    filefilt = '.*\.nc$'
    init_file = None
    offline = False
    output = sys.stdout
    projectName = None
    properties = {}
    readFiles = False
    service = None
    max_threads = 4
    version_dir = False
    use_version = None

    for flag, arg in args:
        if flag == '-a':
            if os.path.exists(arg):
                appendMap = readDatasetMap(arg)
            else:
                appendMap = {}
            output = open(arg, 'a')
        elif flag == '--dataset':
            datasetName = arg
        elif flag == '--dataset-tech-notes':
            datasetTechNotesURL = arg
        elif flag == '--dataset-tech-notes-title':
            datasetTechNotesTitle = arg
        elif flag == '--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag == '-i':
            init_file = arg
        elif flag == '--max-threads':
            max_threads = int(arg)
        elif flag in ['-o', '--output']:
            output = open(arg, 'w')
        elif flag == '--offline':
            offline = True
        elif flag == '--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag == '--read-files':
            readFiles = True
        elif flag == '--service':
            service = arg
        elif flag == '--use-version-dir':
            version_dir = True
        elif flag == '--version':
            version_dir = True
            if not re.match('^[0-9]+$', arg[0]):  # e.g. 'vYYYYMMDD'
                use_version = arg[1:]
            else:
                use_version = arg

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            warning("No project name specified!")
            multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties,
                                                      datasetName=datasetName, use_version=version_dir)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties,
                                                               datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()

        datasetMapVersion = {}
        if version_dir:
            # check for version directory
            for dataset_id in keys:
                ds_id_version = dataset_id.split('#')
                if len(ds_id_version) == 2:
                    ds_id, ds_version = ds_id_version
                    if not re.match('^[0-9]+$', ds_version):
                        warning("Version must be an integer. Skipping version %s of dataset %s." % (ds_version, ds_id))
                        continue
                    if use_version and ds_version != use_version:
                        continue
                    if ds_id in datasetMapVersion:
                        datasetMapVersion[ds_id].append(ds_version)
                    else:
                        datasetMapVersion[ds_id] = [ds_version]
                else:
                    error("No version directory found. Skipping dataset %s." % dataset_id)

            if datasetMapVersion:
                keys = datasetMapVersion.keys()
                keys.sort()
            else:
                if use_version:
                    error("Version %s not found. No datasets to process." % use_version)
                else:
                    error("No datasets to process.")
                return

        for dataset_id in keys:
            skip_dataset = False
            dataset_id_version = dataset_id
            path_version = None
            # if multiple versions of the same dataset available use latest version
            if version_dir:
                path_version = sorted(datasetMapVersion[dataset_id])[-1]
                if len(datasetMapVersion[dataset_id]) > 1:
                    info("Multiple versions for %s (%s), processing latest (%s)"
                         % (dataset_id, datasetMapVersion[dataset_id], path_version))
                dataset_id_version = '%s#%s' % (dataset_id, path_version)

            direcTuple = datasetMap[dataset_id_version]
            direcTuple.sort()
            mapfile_md = {}

            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet
                    mapfile_md[filepath] = [size]
                    mapfile_md[filepath].append("mod_time=%f" % float(mtime))
                    if datasetTechNotesURL is not None:
                        mapfile_md[filepath].append('dataset_tech_notes=%s' % datasetTechNotesURL)
                        mapfile_md[filepath].append('dataset_tech_notes_title=%s' % datasetTechNotesTitle)

            if checksumClient is not None:
                pool = ThreadPool(processes=max_threads)
                args = [(filepath, checksumClient) for filepath in mapfile_md]
                checksum_list = pool.map(calc_checksum_wrapper, args)

                for entry in checksum_list:
                    if not entry[1]:
                        error('Calculation of checksum for file %s failed. Skipping dataset %s ...' % (entry[0], dataset_id))
                        skip_dataset = True  # skip entire dataset if we have one file without checksum
                        break
                    mapfile_md[entry[0]].append('checksum=%s' % entry[1])
                    mapfile_md[entry[0]].append('checksum_type=%s' % checksumType)

            for fpath in mapfile_md:
                mapfile_line = '%s | %s | %d' % (dataset_id_version, fpath, mapfile_md[fpath][0])
                for md in mapfile_md[fpath][1:]:
                    mapfile_line += ' | %s' % md

                # Print the map entry if:
                # - Checksum exists for all files of dataset (in case checksumming is enabled)
                # - The map is being created, not appended, or
                # - The existing map does not have the dataset, or
                # - The existing map has the dataset, but not the file.
                if path_version:
                    ds_id = (dataset_id, int(path_version))
                else:
                    ds_id = (dataset_id, -1)
                if not skip_dataset and ((appendMap is None) or (not appendMap.has_key(ds_id))
                                         or ((fpath, "%d" % mapfile_md[fpath][0]) not in appendMap[ds_id])):
                    print >> output, mapfile_line

    else:  # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s" % projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(lastargs)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                  filefilt=filefilt, datasetName=datasetName, offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f" % float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or ((filepath, "%d" % size) not in appendMap[dsetName]):
                print >> output, "%s | %s | %d %s" % (dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
def parseDatasetName(self, datasetName, context): """Parse a dataset name. Returns a dictionary, mapping field => value. The config file option 'dataset_id' is used to parse the name into fields. datasetName String dataset identifier. context Initial context dictionary. This argument is altered on output. """ config = getConfig() section = 'project:'+self.name datasetIdFormatList = config.get(section, 'dataset_id', raw=True, default=None) if datasetIdFormatList is None: # warning("No dataset_id option found for project %s"%self.name) return context datasetIdFormats = splitLine(datasetIdFormatList) formatMatched = False for idFormat in datasetIdFormats: # '.' => '\.' newinit = re.sub(r'\.', r'\.', idFormat.strip()) # %(name)s => (?P<name>[^.]*) newinit = re.sub(_patpat, r'(?P<\1>[^.]*)', newinit) # If experiment is enumerated, match on the experiment options. This allows # experiment ids to contain periods (.) . experimentOptions = self.getFieldOptions('experiment') # Map to case-sensitive options experimentOptions = self.mapValidFieldOptions('experiment', experimentOptions) if idFormat.find('%(experiment)s')!=-1 and experimentOptions is not None: if len(experimentOptions) > 0: optionOr = reduce(lambda x,y: x+'|'+y, experimentOptions) experimentPattern = r'(?P<experiment>%s)'%optionOr newinit = newinit.replace('(?P<experiment>[^.]*)', experimentPattern) if newinit[-1]!='$': newinit += '$' match = re.match(newinit, datasetName) if match is None: continue else: result = match.groupdict() formatMatched = True for key, value in result.items(): if context.has_key(key) and value!=context[key]: warning("Dataset ID=%s, but %s=%s"%(datasetName, key, context[key])) else: context[str(key)] = value break if not formatMatched: warning("Dataset ID: %s does not match the dataset_id format(s): %s"%(datasetName, `datasetIdFormats`)) return context
def extractFromDataset(datasetName, fileIterator, dbSession, handler, cfHandler, aggregateDimensionName=None,
                       offline=False, operation=CREATE_OP, progressCallback=None, stopEvent=None, keepVersion=False,
                       newVersion=None, extraFields=None, masterGateway=None, comment=None, useVersion=-1,
                       forceRescan=False, **context):
    """
    Extract metadata from a dataset represented by a list of files, add to a database. Populates the database tables:

    - dataset
    - dataset_version
    - file
    - file_version
    - dataset_file_version
    - file_variable (partially)
    - associated attribute tables

    Returns a Dataset object.

    datasetName
      String dataset identifier.
    fileIterator
      An iterator that returns an iteration of (file_path, file_size), where file_size is an integer.
    dbSession
      A database Session.
    handler
      Project handler.
    cfHandler
      A CF handler instance.
    aggregateDimensionName
      The name of the dimension across which the dataset is aggregated, if any.
    offline
      Boolean, True if the files are offline, cannot be scanned.
    operation
      Publication operation, one of CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP.
    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``,
      ``initial`` is the initial value reported, ``final`` is the final value reported.
    stopEvent
      Object with boolean attribute ``stop_extract`` (for example, ``utility.StopEvent``).
      If set to True (in another thread) the extraction is stopped.
    keepVersion
      Boolean, True if the dataset version should not be incremented.
    newVersion
      Set the new version number explicitly. By default the version number is incremented by 1. See keepVersion.
    extraFields
      Extra fields dictionary, as from ``readDatasetMap``.
    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s) as replicated.
    comment
      String comment on the dataset version. If the dataset version is not increased, the comment is ignored.
    useVersion=-1
      Integer version number of the dataset version to modify. By default the latest version is modified.
    forceRescan
      Boolean, if True force all files to be rescanned on an update.
    context
      A dictionary with keys ``project``, ``model``, ``experiment``, etc. The context consists of all fields
      needed to uniquely define the dataset.
    """
    session = dbSession()

    # Get configuration options related to the scan
    configOptions = {}
    config = getConfig()
    if config is not None:
        section = 'project:%s' % context.get('project')
        vlstring = config.get(section, 'variable_locate', default=None)
        if vlstring is not None:
            fields = splitLine(vlstring)
            varlocate = [s.split(',') for s in fields]
        else:
            varlocate = None
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None
            checksumType = None
        versionByDate = config.getboolean(section, 'version_by_date', default=False)
    else:
        varlocate = None
        checksumClient = None
        checksumType = None
        versionByDate = False

    configOptions['variable_locate'] = varlocate
    configOptions['checksumClient'] = checksumClient
    configOptions['checksumType'] = checksumType

    # Check if the dataset / version is already in the database
    dset = session.query(Dataset).filter_by(name=datasetName).first()
    if dset is not None:
        if operation == CREATE_OP:
            operation = REPLACE_OP
    else:
        if operation in [UPDATE_OP, REPLACE_OP]:
            operation = CREATE_OP
        elif operation in [DELETE_OP, RENAME_OP]:
            raise ESGPublishError("No such dataset: %s" % datasetName)

    # Cannot add online files to offline dataset, and vice versa
    if dset is not None and dset.offline != offline:
        if dset.offline:
            raise ESGPublishError("Dataset %s is offline, set offline flag or replace the dataset." % dset.name)
        else:
            raise ESGPublishError("Dataset %s is online, but offline flag is set." % dset.name)

    # Cannot publish a replica with the same ID as a local dataset and vice versa
    if dset is not None and dset.master_gateway != masterGateway:
        if dset.master_gateway is None:
            raise ESGPublishError("Dataset %s exists and is not a replica - delete it before publishing a replica of the same name." % dset.name)
        else:
            raise ESGPublishError("Dataset %s exists and is a replica. Use --replica or delete the existing dataset." % dset.name)

    createTime = datetime.datetime.now()  # DatasetVersion creation_time
    fobjs = None
    pathlist = [item for item in fileIterator]
    if operation == CREATE_OP:
        # Create a new dataset
        info("Creating dataset: %s" % datasetName)
        dset = Dataset(datasetName, context.get('project', None), context.get('model', None), context.get('experiment', None),
                       context.get('run_name', None), offline=offline, masterGateway=masterGateway)
        session.add(dset)

        # Create an initial dataset version
        existingVersion = 0
        eventFlag = CREATE_DATASET_EVENT
        addNewVersion, fobjs = createDataset(dset, pathlist, session, handler, cfHandler, configOptions,
                                             aggregateDimensionName=aggregateDimensionName, offline=offline,
                                             progressCallback=progressCallback, stopEvent=stopEvent,
                                             extraFields=extraFields, masterGateway=masterGateway, **context)

    elif operation in [UPDATE_OP, REPLACE_OP]:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish." % (useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = updateDatasetVersion(dset, versionObj, pathlist, session, handler, cfHandler, configOptions,
                                                    aggregateDimensionName=aggregateDimensionName, offline=offline,
                                                    progressCallback=progressCallback, stopEvent=stopEvent,
                                                    extraFields=extraFields, replace=(operation == REPLACE_OP),
                                                    forceRescan=forceRescan, **context)

    elif operation == RENAME_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish." % (useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion = renameFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions,
                                           aggregateDimensionName=aggregateDimensionName, offline=offline,
                                           progressCallback=progressCallback, stopEvent=stopEvent,
                                           extraFields=extraFields, **context)

    elif operation == DELETE_OP:
        versionObj = dset.getVersionObj(useVersion)
        if versionObj is None:
            raise ESGPublishError("Version %d of dataset %s not found, cannot republish." % (useVersion, dset.name))
        existingVersion = dset.getVersion()
        eventFlag = UPDATE_DATASET_EVENT
        addNewVersion, fobjs = deleteFilesVersion(dset, versionObj, pathlist, session, cfHandler, configOptions,
                                                  aggregateDimensionName=aggregateDimensionName, offline=offline,
                                                  progressCallback=progressCallback, stopEvent=stopEvent,
                                                  extraFields=extraFields, **context)
    else:
        raise ESGPublishError("Invalid dataset operation: %s" % `operation`)

    # Create a new dataset version if necessary
    if keepVersion:
        if existingVersion <= 0:
            newVersion = getInitialDatasetVersion(versionByDate)
        else:
            newVersion = existingVersion
    elif newVersion is None:
        newVersion = getNextDatasetVersion(existingVersion, versionByDate)

    dset.reaggregate = False

    # Add a new version
    if addNewVersion and newVersion > existingVersion:
        datasetTechNotes = datasetTechNotesTitle = None
        if hasattr(dset, "dataset_tech_notes"):
            datasetTechNotes = dset.dataset_tech_notes
        if hasattr(dset, "dataset_tech_notes_title"):
            datasetTechNotesTitle = dset.dataset_tech_notes_title
        newDsetVersionObj = DatasetVersionFactory(dset, version=newVersion, creation_time=createTime, comment=comment,
                                                  tech_notes=datasetTechNotes, tech_notes_title=datasetTechNotesTitle)
        info("New dataset version = %d" % newDsetVersionObj.version)
        for var in dset.variables:
            session.delete(var)
        newDsetVersionObj.files.extend(fobjs)
        event = Event(datasetName, newDsetVersionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True

    # Keep the current (latest) version
    elif addNewVersion and newVersion == existingVersion and operation in [UPDATE_OP, REPLACE_OP]:
        versionObj.deleteChildren(session)
        versionObj.reset(creation_time=createTime, comment=comment)
        info("Keeping dataset version = %d" % versionObj.version)
        for var in dset.variables:
            session.delete(var)
        session.commit()
        versionObj.files.extend(fobjs)
        event = Event(datasetName, versionObj.version, eventFlag)
        dset.events.append(event)
        dset.reaggregate = True

    elif masterGateway is not None:
        # Force version set on replication
        info("Dataset version = %d" % newVersion)
        dset.setVersion(newVersion)
        event = Event(datasetName, newVersion, eventFlag)
        dset.events.append(event)

    info("Adding file info to database")
    session.commit()
    session.close()

    return dset