def evt_popup_fields_window(self, parent, projectName):
    if self.evt_fields_flg:
        return
    self.evt_fields_flg = True

    #---------------------------------------------------------------------------------
    # Create the Fields dialog window
    #---------------------------------------------------------------------------------
    self.fields = Pmw.Dialog(parent,
                             buttons=('OK', 'Cancel'),
                             defaultbutton='OK',
                             title='Set Additional Mandatory Fields',
                             command=pub_controls.Command(self.close_fields_dialog, parent))
    self.fields.withdraw()
    self.fields.transient(parent)

    frame = Pmw.ScrolledFrame(self.fields.interior(),
                              usehullsize=1,
                              horizflex='expand',
                              )

    # Add additional mandatory fields to allow the user to set default settings
    handler = getHandlerByName(projectName, None, self.Session)
    list_fields = getQueryFields(handler)
    for x in list_fields:
        if handler.isMandatory(x):
            if x.lower() != "project":
                field_options = handler.getFieldOptions(x)
                if field_options is not None:
                    if x in self.defaultGlobalValues.keys():
                        set_to = self.defaultGlobalValues[x]
                    else:
                        set_to = x + " (Default Global Setting)"
                    field_options.insert(0, x + " (Default Global Setting)")
                    self.dataset_fields[x] = show_field(parent, frame.interior(), x.capitalize(),
                                                        field_options, set_to, 1, 1)

    Pmw.alignlabels(self.dataset_fields.values())

    frame.pack(side='top', expand=1, fill='both')

    #---------------------------------------------------------------------------------
    # Position dialog popup
    #---------------------------------------------------------------------------------
    import string
    parent_geom = parent.geometry()
    geom = string.split(parent_geom, '+')
    d1 = string.atoi(geom[1])
    d2 = string.atoi(geom[2])
    self.fields.geometry("500x200+%d+%d" % (d1, d2))
    self.fields.show()
def updateDatasetFromContext(context, datasetName, Session):
    """
    Update a persistent dataset with values from context (name/value dictionary).
    The context may have fields such as event fields, not associated with the project handler.

    context
      A property (name/value) dictionary.

    datasetName
      String dataset identifier.

    Session
      Database session factory.

    """
    dset = Dataset.lookup(datasetName, Session)
    if dset is None:
        raise ESGQueryError("Dataset not found: %s" % datasetName)
    projectName = dset.get_project(Session)
    handler = getHandlerByName(projectName, None, Session)
    basicHeaders, eventHeaders, categories, derivedHeaders = getQueryFields(handler, return_list=False)
    properties = context.copy()

    # Set basic and event properties
    session = Session()
    session.add(dset)
    for key, value in properties.items():
        if key in basicHeaders:
            if key != 'id':
                if key == 'name':
                    if len(handler.parseDatasetName(value, {})) == 0:
                        warning("Dataset name: %s does not match dataset_id pattern in config file." % value)
                setattr(dset, key, value)
            else:
                warning("Cannot update id field")
            del properties[key]
        elif key in eventHeaders:
            event = dset.events[-1]
            setEvent(event, key, value)
            del properties[key]

    # Set attribute headers
    handler.setContext(properties)
    handler.saveContext(datasetName, Session)

    session.commit()
    session.close()
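# --------------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): how updateDatasetFromContext
# might be driven from a standalone script. It assumes an initialized esgcet
# environment (esg.ini and a populated database); the dataset name
# 'cmip5.output1.MYCENTRE.mymodel.historical' and the property values are
# hypothetical and purely illustrative.
# --------------------------------------------------------------------------------
def _example_update_dataset():
    from esgcet.config import loadConfig, initLogging, registerHandlers
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker

    config = loadConfig(None)                       # read the default esg.ini
    engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)
    registerHandlers()

    # Overwrite a couple of searchable properties on an existing dataset.
    context = {'experiment': 'historical', 'model': 'mymodel'}
    updateDatasetFromContext(context, 'cmip5.output1.MYCENTRE.mymodel.historical', Session)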
def load_configuration(parent):
    import os
    import pub_controls
    from esgcet.config import getHandler, getHandlerByName, registerHandlers, CFHandler
    from sqlalchemy.orm import sessionmaker
    from esgcet.publish import multiDirectoryIterator, datasetMapIterator

    offline = parent.offline
    firstFile = parent.firstFile
    projectName = parent.projectName
    config = parent.config
    Session = parent.Session
    dmap = parent.dmap
    datasetNames = parent.datasetNames
    datasetMapfile = parent.datasetMapfile

    for datasetName in datasetNames:
        # Get a file iterator and sample file
        if datasetMapfile is not None:
            firstFile = dmap[datasetName][0][0]
            fileiter = datasetMapIterator(dmap, datasetName)
        else:
            direcTuples = parent.directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            fileiter = multiDirectoryIterator([direc for direc, sampfile in direcTuples], parent.filefilt)

    # Register project handlers
    registerHandlers()

    # If the project is not specified, try to read it from the first file
    validate = True
    if projectName is not None:
        handler = getHandlerByName(projectName, firstFile, Session, validate=validate, offline=offline)
    else:
        handler = getHandler(firstFile, Session, validate=validate)
    parent.handler = handler

    # View the collection of datasets
    tab_name = "Collection %i" % parent.top_ct
    parent.ntk.new_page(parent, tab_name)
def new_query_page(self, parent, tab_name=None, query_id=None):
    # Start the busy routine to indicate to the users something is happening
    self.parent.parent.busyCursor = 'watch'
    self.parent.parent.busyWidgets = [
        self.parent.parent.pane2.pane('EditPaneTop'),
        self.parent.parent.pane2.pane('EditPaneBottom'),
        self.parent.parent.pane2.pane('EditPaneStatus'),
        self.parent.parent.pane.pane('ControlPane'),
    ]
    pub_busy.busyStart(self.parent.parent)
    try:
        properties = {}
        projectName = self.parent.query_fields['project'].get()  # Must have projectName
        handler = getHandlerByName(projectName, None, self.Session)
        tabcolor = Pmw.Color.changebrightness(self.parent.parent, pub_controls.query_tab_color, 0.6)

        if query_id is None:
            for x in self.parent.query_fields.keys():
                query_string = self.parent.query_fields[x].get().lstrip()
                if (query_string == "-Any-") or (len(query_string) == 0):
                    properties[x] = (2, "%")
                elif query_string != "-Any-":
                    properties[x] = (1, query_string)

            if properties['id'] == (2, "%"):
                del properties['id']  # This causes an error because you cannot modify the 'id'

            listProperties = False

            result, headers = queryDatasets(projectName, handler, self.Session, properties)
            self.new_page(parent, tabName=None, tab_color=tabcolor, page_type="query",
                          query_result=result, list_fields=headers)
        else:
            result, headers = queryDatasets(projectName, handler, self.Session, properties)
            query_id_found = False
            for x in result:
                if query_id == x[0][:-1]:
                    self.new_page(parent, tabName=None, tab_color=tabcolor, page_type="query",
                                  query_result=[x], list_fields=headers)
                    query_id_found = True
                    break
            if query_id_found is False:
                warning("The specified dataset id '%s' was not found." % query_id)

        # Enable the "Data Publication" button
        self.parent.ControlButton3.configure(state='normal')

        datasetNames = []
        for x in result:
            datasetNames.append(x[1])
        dmap, offline_map, extraFields = queryDatasetMap(datasetNames, self.Session, extra_fields=True)

        # Check if offline or not, then set the iteration values for each page
        selected_page = self.parent.parent.main_frame.selected_top_page
        self.parent.parent.hold_offline[selected_page] = offline_map
        self.parent.parent.main_frame.projectName[selected_page] = projectName
        self.parent.parent.main_frame.dmap[selected_page] = dmap
        self.parent.parent.main_frame.extraFields[selected_page] = extraFields
        self.parent.parent.main_frame.datasetMapfile[selected_page] = None
        self.parent.parent.directoryMap[selected_page] = None
        self.parent.parent.main_frame.dirp_firstfile[selected_page] = None
        self.parent.parent.defaultGlobalValues[selected_page] = {}
    except:
        pub_busy.busyEnd(self.parent.parent)  # catch here in order to turn off the busy cursor
        raise
    finally:
        pub_busy.busyEnd(self.parent.parent)
def esgscanWrapper(directoryList, **kw):

    if len(directoryList) == 0:
        raise ESGPublishError('No directory specified')

    output = sys.stdout
    appendMap = None
    appendPath = kw.get("appendPath", None)
    if appendPath is not None:
        if os.path.exists(appendPath):
            appendMap = readDatasetMap(appendPath)
        else:
            appendMap = {}
        output = open(appendPath, 'a')
    datasetName = kw.get("datasetName", None)
    filefilt = kw.get("fileFilt", '.*\.nc$')
    init_file = kw.get("initFile", None)
    offline = kw.get("offline", False)
    outputPath = kw.get("outputPath", None)
    if outputPath is not None:
        output = open(outputPath, 'w')
    elif appendPath is None:
        # Default to stdout only if no append file was opened above
        output = sys.stdout
    projectName = kw.get("projectName", None)
    readFiles = kw.get("readFiles", False)
    service = kw.get("service", None)

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(directoryList, filefilt, datasetName=datasetName)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(directoryList, filefilt, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()
        for datasetId in keys:
            direcTuple = datasetMap[datasetId]
            direcTuple.sort()
            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet
                    extraStuff = "mod_time=%f" % float(mtime)

                    if checksumClient is not None:
                        csum = checksum(filepath, checksumClient)
                        extraStuff += " | checksum=%s | checksum_type=%s" % (csum, checksumType)

                    # Print the map entry if:
                    # - The map is being created, not appended, or
                    # - The existing map does not have the dataset, or
                    # - The existing map has the dataset, but not the file.
                    if (appendMap is None) or (not appendMap.has_key(datasetId)) or \
                            ((filepath, "%d" % size) not in appendMap[datasetId]):
                        print >> output, "%s | %s | %d | %s" % (datasetId, filepath, size, extraStuff)

    else:                               # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")

        listerSection = getOfflineLister(config, "project:%s" % projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(directoryList)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                  filefilt=filefilt, datasetName=datasetName,
                                                                  offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f" % float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or \
                    ((filepath, "%d" % size) not in appendMap[dsetName]):
                print >> output, "%s | %s | %d %s" % (dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
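# --------------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): calling esgscanWrapper
# programmatically instead of through the esgscan_directory command line. The
# directory path, project name and output file are hypothetical; the keyword
# names mirror the kw.get(...) lookups above.
# --------------------------------------------------------------------------------
def _example_scan_directories():
    esgscanWrapper(['/data/cmip5/output1'],
                   projectName='cmip5',
                   fileFilt='.*\.nc$',
                   outputPath='/tmp/cmip5.map',
                   readFiles=False)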
def publishDatasetList(datasetNames, Session, parentId=None, handlerDictionary=None, publish=True, thredds=True,
                       las=False, progressCallback=None, service=None, perVariable=None,
                       threddsCatalogDictionary=None, reinitThredds=None, readFromCatalog=False,
                       restInterface=False, schema=None, pid_connector=None, project_config_section=None):
    """
    Publish a list of datasets:

    - For each dataset, write a THREDDS catalog.
    - Add the new catalogs to the THREDDS catalog tree and reinitialize the THREDDS server.
    - Reinitialize the LAS server.
    - Publish each dataset to the gateway.

    Returns a dictionary: (datasetName, version) => status

    datasetNames
      A list of (string_dataset_name, version) tuples.

    Session
      A database Session.

    parentId
      The string (or dictionary) persistent identifier of the parent of the datasets. If None (the default),
      the parent id for each dataset is generated from ``handler.getParentId()``. If a dictionary, each
      dataset name is used as a key to lookup the respective parent id. If a string, the parent id is set
      to the string for all datasets being published. This function can be overridden in the project handler
      to implement a project-specific dataset hierarchy.

    handlerDictionary
      A dictionary mapping dataset_name => handler.

    publish
      Boolean flag: if true (the default), contact the gateway to publish this dataset.

    thredds
      Boolean flag: if true (the default), write the associated THREDDS catalog.

    las
      Boolean flag: if true, write the associated LAS catalog.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``,
      ``initial`` is the initial value reported, ``final`` is the final value reported.

    service
      String service name. If omitted, the first online/offline service in the configuration is used.

    perVariable
      Boolean, overrides ``variable_per_file`` config option.

    threddsCatalogDictionary
      If not None, just generate catalogs in strings, not the THREDDS directories, and set
      threddsCatalogDictionary[datasetId] = string_catalog

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.
      If None, defaults to value of thredds option.

    readFromCatalog
      Boolean flag. If True, read the TDS catalog definitions from threddsCatalogDictionary.
      threddsCatalogDictionary must also be set.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    schema
      (Optional) String name of the schema to validate against, for RESTful publication calls.

    pid_connector
      esgfpid.Connector object to register PIDs

    project_config_section
      Name of the project config section in esg.ini (for user specific project configs)

    """
    session = Session()
    resultDict = {}
    if readFromCatalog and threddsCatalogDictionary is None:
        raise ESGPublishError("Must set THREDDS catalog dictionary when readFromCatalog is True.")

    # Get handlers for each dataset
    if handlerDictionary is None:
        handlers = {}
        for datasetName, versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()
            if dset is None:
                raise ESGPublishError("Dataset not found: %s" % datasetName)
            handler = getHandlerByName(dset.project, None, Session)
            handlers[datasetName] = handler
    else:
        handlers = handlerDictionary

    # reinitThredds defaults to the value of thredds option
    if reinitThredds is None:
        reinitThredds = thredds

    if thredds:
        for datasetName, versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()

            # If the dataset version is not the latest, publish as a per-time dataset without aggregation,
            # since the dataset variables only relate to the latest dataset version
            latestVersion = dset.getVersion()
            if versionno == -1:
                versionno = latestVersion
            if versionno != latestVersion:
                if perVariable:
                    messaging.info("Generating THREDDS catalog in per-time format, since version %d is not the latest version (%d)" % (versionno, latestVersion))
                perVariable = False

            handler = handlers[datasetName]

            # If threddsCatalogDictionary is not set, create the TDS catalog in the TDS content directory ...
            if threddsCatalogDictionary is None:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                generateThredds(datasetName, Session, threddsOutput, handler, service=service,
                                perVariable=perVariable, versionNumber=versionno, pid_connector=pid_connector)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... else if threddsCatalogDictionary is the catalog source:
            elif readFromCatalog:
                catalogString = threddsCatalogDictionary[(datasetName, versionno)]
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                messaging.info("Writing THREDDS catalog %s" % threddsOutputPath)
                threddsOutput.write(catalogString)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... otherwise write the catalog in a 'string file'
            else:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)  # Creates catalog entry
                threddsOutput = cStringIO.StringIO()
                generateThredds(datasetName, Session, threddsOutput, handler, service=service,
                                perVariable=perVariable, versionNumber=versionno, pid_connector=pid_connector)
                threddsCatalogDictionary[(datasetName, versionno)] = threddsOutput.getvalue()
                threddsOutput.close()

    if reinitThredds:
        updateThreddsMasterCatalog(Session)
        result = reinitializeThredds()

    if las:
        try:
            result = reinitializeLAS()
        except Exception, e:
            messaging.error("Error on LAS reinitialization: %s, new datasets not added." % e)
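# --------------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): a minimal publication
# call, assuming the datasets have already been scanned into the database and
# a Session factory is available. The dataset name is hypothetical; version -1
# selects the latest version, mirroring the versionno handling above.
# --------------------------------------------------------------------------------
def _example_publish(Session):
    datasetNames = [('cmip5.output1.MYCENTRE.mymodel.historical', -1)]
    resultDict = publishDatasetList(datasetNames, Session,
                                    publish=True,       # contact the gateway/index
                                    thredds=True,       # write the TDS catalogs
                                    las=False,
                                    restInterface=True)
    return resultDict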
def __init__(self, parent):
    self.parent = parent
    self.Session = parent.parent.Session
    self.select_button = {}
    self.select_labelV = {}
    self.select_label = {}

    #----------------------------------------------------------------------------------------
    # Begin the creation of the dataset ID pulldown query selection
    #----------------------------------------------------------------------------------------
    glFont = tkFont.Font(self.parent.parent,
                         family=pub_controls.button_group_font_type,
                         size=pub_controls.button_group_font_size,
                         weight=font_weight)
    bnFont = tkFont.Font(self.parent.parent,
                         family=pub_controls.label_button_font_type,
                         size=pub_controls.label_button_font_size,
                         weight=font_weight)
    self.group_query = Pmw.Group(self.parent.control_frame4,
                                 tag_text='Query ID',
                                 tag_font=glFont,
                                 tagindent=25)

    #----------------------------------------------------------------------------------------
    # Create and pack the EntryFields
    #----------------------------------------------------------------------------------------
    self.query_entry = Pmw.EntryField(self.group_query.interior(),
                                      labelpos='w',
                                      label_text='Dataset Id:',
                                      label_font=bnFont,
                                      entry_width=200,
                                      validate=None,
                                      command=pub_controls.Command(self.evt_show_data_set_from_id))
    self.query_entry.pack(side='left', expand=1, padx=10, pady=10)

    #----------------------------------------------------------------------------------------
    # End the creation of the dataset ID pulldown query selection
    #----------------------------------------------------------------------------------------

    #----------------------------------------------------------------------------------------
    # Begin the creation of the project pulldown query selection
    #----------------------------------------------------------------------------------------
    self.button_controls = Pmw.Group(self.parent.control_frame4,
                                     tag_text='Query Project Information',
                                     tag_font=glFont,
                                     tagindent=25)

    #----------------------------------------------------------------------------------------
    # Create an instance of the notebook
    #----------------------------------------------------------------------------------------
    ntk = generate_notebook(parent, self.Session)
    parent.parent.ntk = ntk  # Set the first instance of the notebook

    self.bdataset_sframe = Pmw.ScrolledFrame(self.button_controls.interior())
    self.bdataset_sframe.pack(side='top')
    self.bdataset_frame = self.bdataset_sframe.interior()

    #----------------------------------------------------------------------------------------
    # Display the Project selection
    #----------------------------------------------------------------------------------------
    projectOption = self.parent.parent.config.get('initialize', 'project_options')
    projectSpecs = splitRecord(projectOption)
    projectName = projectSpecs[0][0]
    projectList = []
    for x in projectSpecs:
        projectList.append(x[0])
    projectList.sort()

    parent.query_fields = {}
    parent.validate = {}
    parent.query_fields["project"] = show_field(self.parent, self.bdataset_frame, "Project",
                                                projectList, projectName, 1, 1, for_query=True)
    parent.validate["project"] = 1

    handler = getHandlerByName(projectName, None, self.Session)
    list_fields = getQueryFields(handler)
    validate = []
    mandatory = []
    options = {}
    for x in list_fields:
        if handler.getFieldType(x) is not None:
            validate.append(handler.getFieldType(x))
        else:
            validate.append(2)
        options[x] = handler.getFieldOptions(x)
        mandatory.append(handler.isMandatory(x))

    for j in range(1, 5):
        for i in range(len(list_fields)):
            if list_fields[i] != "project":
                if options[list_fields[i]] is None:
                    value = ""
                else:
                    value = options[list_fields[i]]  # ganz bug fix [0]
                if validate[i] == 1:
                    options[list_fields[i]].insert(0, "-Any-")
                    value = "-Any-"
                if j == validate[i]:
                    parent.query_fields[list_fields[i]] = show_field(self.parent, self.bdataset_frame,
                                                                     list_fields[i].capitalize(),
                                                                     options[list_fields[i]], value,
                                                                     mandatory[i], validate[i],
                                                                     for_query=True)
                    parent.validate[list_fields[i]] = validate[i]

    Pmw.alignlabels(parent.query_fields.values())

    #----------------------------------------------------------------------------------------
    # Create button to update extraction
    #----------------------------------------------------------------------------------------
    w = Tkinter.Button(self.button_controls.interior(),
                       text='Query Data Information',
                       font=bnFont,
                       background="lightblue",
                       command=pub_controls.Command(ntk.new_query_page, parent, None))
    w.pack(side='top', padx=0, pady=3)

    self.button_controls.pack(side="top", fill='x', pady=3)
def iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, operation,
                        filefilt, initcontext, offlineArg, properties, testProgress1=None, testProgress2=None,
                        handlerDictionary=None, keepVersion=False, newVersion=None, extraFields=None,
                        masterGateway=None, comment=None, forceAggregate=False, readFiles=False):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified
    in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``).
    All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first handler
      found that can open a sample file from the dataset.

    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.

    datasetNames
      A list of dataset names identifying the datasets to be scanned.

    Session
      An SQLAlchemy Session.

    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only
      files whose basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained
      in datafiles. Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary. If a boolean flag: if True the files are treated as offline (not local) and
      are not scanned or aggregated. The associated metadata will be a minimal set including file name and size.
      If a dictionary, maps dataset_name => offline flag

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies
      only to the scan phase.

    testProgress2=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies
      only to the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number explicitly. If a dictionary, maps dataset_id => version.
      By default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s) as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    versionIsMap = (type(newVersion) is types.DictType)
    if versionIsMap:
        saveVersionMap = newVersion

    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop in range(ct):
        datasetName, versionno = datasetNames[iloop]

        # If using a version map, lookup the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s" % datasetName)

        context = initcontext.copy()

        # Get offline flag
        if type(offlineArg) is dict:
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets
        if offline:
            forceAggregate = False

        # Get a file iterator and sample file
        if dmap is not None:
            if len(dmap[(datasetName, versionno)]) == 0:
                warning("No files specified for dataset %s, version %d." % (datasetName, versionno))
                continue
            firstFile = dmap[(datasetName, versionno)][0][0]
            fileiter = datasetMapIterator(dmap, datasetName, versionno, extraFields=extraFields, offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                fileiter = multiDirectoryIterator([direc for direc, sampfile in direcTuples], filefilt)
            else:
                fileiter = fnIterator([sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and handlerDictionary.has_key(datasetName):
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
            projectName = handler.name
            info("Using project name = %s" % projectName)
        if prevProject is not None and projectName != prevProject:
            raise ESGPublishError("Multiple projects found: %s, %s. Can only publish from one project" % (prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored' % name)
            else:
                context[name] = value

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        dataset = None
        if testProgress1 is not None:
            testProgress1[1] = (100. / ct) * iloop
            if not offline:
                testProgress1[2] = (100. / ct) * iloop + (50. / ct)
            else:
                testProgress1[2] = (100. / ct) * iloop + (100. / ct)
        dataset = extractFromDataset(datasetName, fileiter, Session, handler, cfHandler,
                                     aggregateDimensionName=aggregateDimension, offline=offline,
                                     operation=operation, progressCallback=testProgress1,
                                     keepVersion=keepVersion, newVersion=newVersion, extraFields=extraFields,
                                     masterGateway=masterGateway, comment=comment, useVersion=versionno,
                                     forceRescan=forceAggregate, **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.
        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceAggregate

        if testProgress2 is not None:
            testProgress2[1] = (100. / ct) * iloop + 50. / ct
            testProgress2[2] = (100. / ct) * (iloop + 1)
        if runAggregate:
            aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension,
                               cfHandler=cfHandler, progressCallback=testProgress2, datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)

        # Save the context with the dataset, so that it can be searched later
        handler.saveContext(datasetName, Session)
        datasets.append(dataset)

    return datasets
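# --------------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): scanning datasets from a
# mapfile with iterateOverDatasets, roughly as esgpublish does further below. The
# mapfile path and project name are hypothetical; CREATE_OP and readDatasetMap are
# assumed to be imported from esgcet.publish, as elsewhere in this code.
# --------------------------------------------------------------------------------
def _example_scan_from_mapfile(Session):
    dmap, extraFields = readDatasetMap('/tmp/cmip5.map', parse_extra_fields=True)
    datasetNames = dmap.keys()          # (dataset_name, version) tuples
    datasets = iterateOverDatasets('cmip5', dmap, None, datasetNames, Session,
                                   'time', CREATE_OP, '.*\.nc$', {}, False, {},
                                   extraFields=extraFields, readFiles=True)
    return datasets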
def return_content2(self, appendOpt=False):
    from esgcet.publish import iterateOverDatasets, processIterator
    from esgcet.config import getHandlerByName
    from esgcet.model import eventName
    from esgcet.config import loadConfig

    # Initialize parameters for iterating over datasets
    initcontext = {}
    aggregateOnly = False
    # appendOpt = False
    properties = {}
    publish = False
    publishOnly = False
    thredds = False
    testProgress1 = [self.parent.parent.statusbar.show, 0, 50]
    testProgress2 = [self.parent.parent.statusbar.show, 50, 100]
    handlerDictionary = {}

    # Get the currently selected tab and the selected datasets
    tab_name = self.parent.parent.top_notebook.getcurselection()
    selected_page = self.parent.parent.main_frame.selected_top_page
    datasetNames = []
    if selected_page is None:
        warning("Must generate a list of datasets to scan before data extraction can occur.")
        return

    if (selected_page is not None) or (self.parent.parent.hold_offline[selected_page] == True):
        extraFields = None
        if (self.parent.parent.hold_offline[selected_page] == False) or \
                (isinstance(self.parent.parent.hold_offline[selected_page], types.DictType)):
            for x in self.parent.parent.main_frame.top_page_id[selected_page]:
                dsetVersionName = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text')

                # Use the version label when it is available, otherwise parse the dataset/version id
                if self.parent.parent.main_frame.version_label[selected_page]:
                    dset_name = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text')
                    dsetVersion = self.parent.parent.main_frame.version_label[selected_page][x].cget('text')
                else:
                    dset_name, dsetVersion = parseDatasetVersionId(dsetVersionName)

                # Retrieve only the datasets that have been selected
                if self.parent.parent.main_frame.top_page_id[selected_page][x].cget('bg') != 'salmon':
                    dsetTuple = parseDatasetVersionId(self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text'))
                    datasetNames.append(dsetTuple)

            dmap = self.parent.parent.main_frame.dmap[selected_page]
            extraFields = self.parent.parent.main_frame.extraFields[selected_page]
            datasetMapfile = self.parent.parent.main_frame.datasetMapfile[selected_page]
            projectName = self.parent.parent.main_frame.projectName[selected_page]
            directoryMap = self.parent.parent.directoryMap[selected_page]

            if dmap is not None:
                for x in datasetNames:
                    dsetId = x[0]
                    datasetName = x
                    try:
                        dmapentry = dmap[datasetName]
                    except:
                        # Check if the dataset map key was changed from (dsetname,-1) to (dsetname,version).
                        # If so, replace the entry with the new key.
                        trykey = (datasetName[0], -1)
                        dmapentry = dmap[trykey]
                        del dmap[trykey]
                        dmap[datasetName] = dmapentry
                    firstFile = dmapentry[0][0]

                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, firstFile, self.Session)
                    handler = self.parent.parent.handlerDictionary[dsetId]

                # Copy the defaultGlobalValues into initcontext
                initcontext = self.parent.parent.main_frame.defaultGlobalValues[selected_page]
            else:
                myholdDirectoryMap = self.parent.parent.directoryMap[selected_page]
                mydatasetNames = [(item) for item in myholdDirectoryMap.keys()]
                for x in mydatasetNames:
                    dsetId = x[0]
                    datasetName = x
                    try:
                        dmapentry = myholdDirectoryMap[datasetName]
                    except:
                        # Check if the dataset map key was changed from (dsetname,-1) to (dsetname,version).
                        # If so, replace the entry with the new key.
                        trykey = (datasetName[0], -1)
                        dmapentry = myholdDirectoryMap[trykey]
                        del myholdDirectoryMap[trykey]
                        myholdDirectoryMap[datasetName] = dmapentry
                    firstFile = dmapentry[0][1]

                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, firstFile, self.Session)
                    handler = self.parent.parent.handlerDictionary[dsetId]
        else:
            # working off-line
            projectName = self.parent.parent.main_frame.projectName[selected_page]
            if self.parent.parent.offline_file_directory[selected_page] == "directory":
                if self.parent.parent.config is None:
                    extraction_controls.call_sessionmaker(self.parent.parent)
                datasetPaths = []
                dmap = {self.parent.parent.offline_datasetName: datasetPaths}
                listerSection = getOfflineLister(self.parent.parent.config, "project:%s" % projectName, None)
                offlineLister = self.parent.parent.config.get(listerSection, 'offline_lister_executable')
                lastargs = self.parent.parent.offline_directories
                commandArgs = "--config-section %s " % listerSection
                commandArgs += " ".join(lastargs)
                for filepath, size in processIterator(offlineLister, commandArgs, filefilt=self.parent.parent.filefilt):
                    datasetPaths.append((filepath, str(size)))
                datasetNames = self.parent.parent.datasetNames
                directoryMap = None

                # get the handler
                for x in datasetNames:
                    dsetId = x[0]
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, None, self.Session, offline=True)
            elif self.parent.parent.offline_file_directory[selected_page] == "file":
                dmap = self.parent.parent.main_frame.dmap[selected_page]
                extraFields = self.parent.parent.main_frame.extraFields[selected_page]
                datasetMapfile = self.parent.parent.main_frame.datasetMapfile[selected_page]
                projectName = self.parent.parent.main_frame.projectName[selected_page]
                directoryMap = None
                if datasetMapfile is not None:
                    dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
                    datasetNames = dmap.keys()

                # get the handlers
                for x in datasetNames:
                    dsetId = x[0]
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, None, self.Session, offline=True)

    # Iterate over datasets
    if appendOpt:
        operation = UPDATE_OP
    else:
        operation = CREATE_OP

    datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, self.Session,
                                   self.parent.parent.aggregateDimension, operation,
                                   self.parent.parent.filefilt, initcontext,
                                   self.parent.parent.hold_offline[selected_page], properties,
                                   comment=self.comments, testProgress1=testProgress1,
                                   testProgress2=testProgress2,
                                   handlerDictionary=self.parent.parent.handlerDictionary,
                                   extraFields=extraFields, readFiles=True)

    # If working on-line then replace the scanned list of datasets with
    # the complete list of datasets
    if not self.parent.parent.hold_offline[selected_page]:
        datasets = []
        versionObjs = []
        for dsetName, version in datasetNames:
            result = Dataset.lookup(dsetName, self.Session, version=version)
            if result is not None:
                entry, versionObj = result
                datasets.append(entry)
                versionObjs.append(versionObj)

    # Get the summary of errors after doing a data extraction
    dset_error = []
    for dset in datasets:
        status = dset.get_publication_status(self.Session)
        status_name = eventName[status]
        if dset.has_warnings(self.Session):
            dset_error.append(dset.get_name(self.Session))

    try:
        list_fields = getQueryFields(handler)
    except:
        handler = getHandlerByName(projectName, None, self.Session)
        list_fields = getQueryFields(handler)

    # Show the extracted datasets
    self.set_column_labels(len(datasets), list_fields)
    self.show_extracted_info(datasets, dset_error, list_fields, versionObjs)

    # Enable the "Data Publication" button
    self.parent.ControlButton3.configure(state='normal')
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:cdehi:m:p:ru", [
            'append', 'create', 'dataset=', 'delete-files', 'echo-sql', 'experiment=', 'filter=', 'help',
            'keep-version', 'log=', 'map=', 'message=', 'model=', 'offline', 'parent=', 'per-time',
            'per-variable', 'project=', 'property=', 'publish', 'new-version=', 'no-thredds-reinit',
            'noscan', 'read-directories', 'read-files', 'rename-files', 'replace', 'replica=', 'rest-api',
            'service=', 'set-replica', 'summarize-errors', 'thredds', 'thredds-reinit', 'update',
            'use-existing=', 'use-list=', 'validate=', 'version-list=', 'nodbwrite'])
    except getopt.error:
        print sys.exc_value
        return

    aggregateDimension = "time"
    datasetMapfile = None
    datasetName = None
    echoSql = False
    filefilt = '.*\.nc$'
    init_file = None
    initcontext = {}
    keepVersion = False
    las = False
    log_filename = None
    masterGateway = None
    message = None
    offline = False
    parent = None
    perVariable = None
    projectName = None
    properties = {}
    publish = False
    publishOnly = False
    publishOp = CREATE_OP
    readFiles = False
    rescan = False
    rescanDatasetName = []
    restApi = None
    schema = None
    service = None
    summarizeErrors = False
    testProgress1 = testProgress2 = None
    thredds = False
    threddsReinit = None
    version = None
    versionList = None
    nodbwrite = False

    for flag, arg in args:
        if flag == '-a':
            aggregateDimension = arg
        elif flag == '--append':
            publishOp = UPDATE_OP
        elif flag in ['-c', '--create']:
            publishOp = CREATE_OP
        elif flag == '--dataset':
            datasetName = arg
        elif flag in ['-d', '--delete-files']:
            publishOp = DELETE_OP
        elif flag == '--echo-sql':
            echoSql = True
        elif flag == '--experiment':
            initcontext['experiment'] = arg
        elif flag == '--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag == '-i':
            init_file = arg
        elif flag == '--keep-version':
            keepVersion = True
        elif flag == '--log':
            log_filename = arg
        elif flag == '--map':
            datasetMapfile = arg
        elif flag in ['-m', '--message']:
            message = arg
        elif flag == '--model':
            initcontext['model'] = arg
        elif flag == '--nodbwrite':
            nodbwrite = True
        elif flag == '--new-version':
            try:
                version = string.atoi(arg)
                if version <= 0:
                    raise ValueError
            except ValueError:
                raise ESGPublishError("Version number must be a positive integer: %s" % arg)
        elif flag == '--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--noscan':
            publishOnly = True
        elif flag == '--offline':
            offline = True
        elif flag == '--parent':
            parent = arg
        elif flag == '--per-time':
            perVariable = False
        elif flag == '--per-variable':
            perVariable = True
        elif flag == '--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag == '--publish':
            publish = True
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag == '--read-files':
            readFiles = True
        elif flag == '--rename-files':
            publishOp = RENAME_OP
        elif flag in ['-r', '--replace']:
            publishOp = REPLACE_OP
        elif flag == '--replica':
            masterGateway = arg
            warning("The --replica option is deprecated. Use --set-replica instead")
        elif flag == '--rest-api':
            restApi = True
        elif flag == '--service':
            service = arg
        elif flag == '--set-replica':
            masterGateway = 'DEFAULT'
        elif flag == '--summarize-errors':
            summarizeErrors = True
        elif flag == '--thredds':
            thredds = True
        elif flag == '--thredds-reinit':
            threddsReinit = True
        elif flag in ['-u', '--update']:
            publishOp = UPDATE_OP
        elif flag == '--use-existing':
            rescan = True
            rescanDatasetName.append(arg)
        elif flag == '--use-list':
            rescan = True
            if arg == '-':
                namelist = sys.stdin
            else:
                namelist = open(arg)
            for line in namelist.readlines():
                line = line.strip()
                if line[0] != '#':
                    rescanDatasetName.append(line)
        elif flag == '--validate':
            schema = arg
            restApi = True
        elif flag == '--version-list':
            versionList = arg

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    if version is not None and versionList is not None:
        raise ESGPublishError("Cannot specify both --new-version and --version-list")

    if versionList is not None:
        version = {}
        f = open(versionList)
        lines = f.readlines()
        f.close()
        for line in lines:
            line = line.strip()
            dsid, vers = line.split('|')
            dsid = dsid.strip()
            vers = int(vers.strip())
            version[dsid] = vers

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if len(lastargs) == 0:
                print "No directories specified."
                return

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=props, datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=props, datasetName=datasetName)
            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map: dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s" % projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(lastargs)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                      filefilt=filefilt, datasetName=datasetName,
                                                                      offline=True):
                size, mtime = sizet
                if dmap.has_key((dsetName, -1)):
                    dmap[(dsetName, -1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName, -1)] = [(filepath, str(size))]

            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")
        min_version = -1
    else:
        min_version = sorted(datasetNames, key=lambda x: x[1])[0][1]

    # Must specify version for replications
    if min_version == -1 and masterGateway is not None and version is None and versionList is None:
        raise ESGPublishError("Must specify version with --new-version (or --version-list) for replicated datasets")

    # Iterate over datasets
    if not publishOnly:
        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session,
                                       aggregateDimension, publishOp, filefilt, initcontext, offline,
                                       properties, keepVersion=keepVersion, newVersion=version,
                                       extraFields=extraFields, masterGateway=masterGateway,
                                       comment=message, readFiles=readFiles, nodbwrite=nodbwrite)

    if not nodbwrite:
        result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las,
                                    parentId=parent, service=service, perVariable=perVariable,
                                    reinitThredds=threddsReinit, restInterface=restApi, schema=schema)

    if summarizeErrors:
        print 'Summary of errors:'
        for name, versionno in datasetNames:
            dset = Dataset.lookup(name, Session)
            print dset.get_name(Session), dset.get_project(Session), dset.get_model(Session), dset.get_experiment(Session), dset.get_run_name(Session)
            if dset.has_warnings(Session):
                print '=== Dataset: %s ===' % dset.name
                for line in dset.get_warnings(Session):
                    print line
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:ehi:o:p:",
                                       ['dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',
                                        'filter=', 'help', 'max-threads=', 'offline', 'output=', 'project=',
                                        'property=', 'read-directories', 'read-files', 'service=',
                                        'use-version-dir', 'version='])
    except getopt.error:
        print sys.exc_value
        return

    if len(lastargs) == 0:
        print 'No directory specified'
        return

    appendMap = None
    datasetName = None
    datasetTechNotesURL = None
    datasetTechNotesTitle = None
    filefilt = '.*\.nc$'
    init_file = None
    offline = False
    output = sys.stdout
    projectName = None
    properties = {}
    readFiles = False
    service = None
    max_threads = 4
    version_dir = False
    use_version = None

    for flag, arg in args:
        if flag == '-a':
            if os.path.exists(arg):
                appendMap = readDatasetMap(arg)
            else:
                appendMap = {}
            output = open(arg, 'a')
        elif flag == '--dataset':
            datasetName = arg
        elif flag == '--dataset-tech-notes':
            datasetTechNotesURL = arg
        elif flag == '--dataset-tech-notes-title':
            datasetTechNotesTitle = arg
        elif flag == '--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag == '-i':
            init_file = arg
        elif flag == '--max-threads':
            max_threads = int(arg)
        elif flag in ['-o', '--output']:
            output = open(arg, 'w')
        elif flag == '--offline':
            offline = True
        elif flag == '--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag == '--read-files':
            readFiles = True
        elif flag == '--service':
            service = arg
        elif flag == '--use-version-dir':
            version_dir = True
        elif flag == '--version':
            version_dir = True
            if not re.match('^[0-9]+$', arg[0]):  # e.g. 'vYYYYMMDD'
                use_version = arg[1:]
            else:
                use_version = arg

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            warning("No project name specified!")
            multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties,
                                                      datasetName=datasetName, use_version=version_dir)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties,
                                                               datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()

        datasetMapVersion = {}
        if version_dir:
            # Check for a version directory component in each dataset id
            for dataset_id in keys:
                ds_id_version = dataset_id.split('#')
                if len(ds_id_version) == 2:
                    ds_id, ds_version = ds_id_version
                    if not re.match('^[0-9]+$', ds_version):
                        warning("Version must be an integer. Skipping version %s of dataset %s."%(ds_version, ds_id))
                        continue
                    if use_version and ds_version != use_version:
                        continue
                    if ds_id in datasetMapVersion:
                        datasetMapVersion[ds_id].append(ds_version)
                    else:
                        datasetMapVersion[ds_id] = [ds_version]
                else:
                    error("No version directory found. Skipping dataset %s."%dataset_id)

            if datasetMapVersion:
                keys = datasetMapVersion.keys()
                keys.sort()
            else:
                if use_version:
                    error("Version %s not found. No datasets to process."%use_version)
                else:
                    error("No datasets to process.")
                return

        for dataset_id in keys:
            skip_dataset = False
            dataset_id_version = dataset_id
            path_version = None
            # If multiple versions of the same dataset are available, use the latest version
            if version_dir:
                path_version = sorted(datasetMapVersion[dataset_id])[-1]
                if len(datasetMapVersion[dataset_id]) > 1:
                    info("Multiple versions for %s (%s), processing latest (%s)"%(dataset_id, datasetMapVersion[dataset_id], path_version))
                dataset_id_version = '%s#%s'%(dataset_id, path_version)

            direcTuple = datasetMap[dataset_id_version]
            direcTuple.sort()
            mapfile_md = {}

            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet

                    mapfile_md[filepath] = [size]
                    mapfile_md[filepath].append("mod_time=%f"%float(mtime))

                    extraStuff = "mod_time=%f"%float(mtime)

                    if datasetTechNotesURL is not None:
                        mapfile_md[filepath].append('dataset_tech_notes=%s'%datasetTechNotesURL)
                        if datasetTechNotesTitle is not None:
                            mapfile_md[filepath].append('dataset_tech_notes_title=%s'%datasetTechNotesTitle)

            if checksumClient is not None:
                pool = ThreadPool(processes=max_threads)
                args = [(filepath, checksumClient) for filepath in mapfile_md]
                checksum_list = pool.map(calc_checksum_wrapper, args)

                for entry in checksum_list:
                    if not entry[1]:
                        error('Calculation of checksum for file %s failed. Skipping dataset %s ...'%(entry[0], dataset_id))
                        skip_dataset = True     # Skip the entire dataset if one file has no checksum
                        break
                    mapfile_md[entry[0]].append('checksum=%s'%entry[1])
                    mapfile_md[entry[0]].append('checksum_type=%s'%checksumType)

            for fpath in mapfile_md:
                mapfile_line = '%s | %s | %d'%(dataset_id_version, fpath, mapfile_md[fpath][0])
                for md in mapfile_md[fpath][1:]:
                    mapfile_line += ' | %s'%md

                # Print the map entry if:
                # - A checksum exists for all files of the dataset (when checksumming is enabled), and
                # - The map is being created, not appended, or
                # - The existing map does not have the dataset, or
                # - The existing map has the dataset, but not the file.
                if path_version:
                    ds_id = (dataset_id, int(path_version))
                else:
                    ds_id = (dataset_id, -1)
                if not skip_dataset and ((appendMap is None) or
                                         (not appendMap.has_key(ds_id)) or
                                         ((fpath, "%d"%mapfile_md[fpath][0]) not in appendMap[ds_id])):  # size is stored at index 0
                    print >>output, mapfile_line

    else:                               # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s"%projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s "%listerSection
        commandArgs += " ".join(lastargs)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                  filefilt=filefilt, datasetName=datasetName, offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f"%float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or ((filepath, "%d"%size) not in appendMap[dsetName]):
                print >>output, "%s | %s | %d %s"%(dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
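#---------------------------------------------------------------------------------
# Hedged usage sketch, not part of the original module: it only illustrates the
# mapfile line format emitted above, i.e. "dataset_id[#version] | path | size"
# followed by optional "key=value" fields (mod_time, checksum, checksum_type).
# The helper name parse_mapfile_line and the sample values are assumptions made
# purely for illustration.
#---------------------------------------------------------------------------------
def parse_mapfile_line(line):
    # Split on the same ' | ' delimiter used when the map entry was written
    fields = [f.strip() for f in line.split('|')]
    dataset_id_version, path, size = fields[0], fields[1], int(fields[2])
    # The remaining fields are optional key=value pairs
    extras = dict(f.split('=', 1) for f in fields[3:])
    return dataset_id_version, path, size, extras

# Example (hypothetical values):
#   parse_mapfile_line("project.sample.dataset#20110101 | /data/tas.nc | 1024 | mod_time=1300000000.000000")
#   -> ('project.sample.dataset#20110101', '/data/tas.nc', 1024, {'mod_time': '1300000000.000000'})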
def iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, operation,
                        filefilt, initcontext, offlineArg, properties, testProgress1=None, testProgress2=None,
                        handlerDictionary=None, perVariable=None, keepVersion=False, newVersion=None,
                        extraFields=None, masterGateway=None, comment=None, forceAggregate=False, readFiles=False,
                        nodbwrite=False, pid_connector=None):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified
    in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``).
    All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first
      handler found that can open a sample file from the dataset.

    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.

    datasetNames
      A list of dataset names identifying the datasets to be scanned.

    Session
      An SQLAlchemy Session.

    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP.

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files
      whose basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained in
      datafiles. Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary. If a boolean flag: if True the files are treated as offline (not local) and are
      not scanned or aggregated. The associated metadata will be a minimal set including file name and size.
      If a dictionary, maps dataset_name => offline flag.

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies
      only to the scan phase.

    testProgress2=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies
      only to the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    perVariable=None
      Boolean, overrides ``variable_per_file`` config option.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number explicitly. If a dictionary, maps dataset_id => version.
      By default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s) as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    pid_connector
      esgfpid.Connector object to register PIDs.
    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    versionIsMap = (type(newVersion) is types.DictType)
    if versionIsMap:
        saveVersionMap = newVersion

    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop in range(ct):
        datasetName, versionno = datasetNames[iloop]

        # Must specify a version for replications
        if masterGateway:
            if not newVersion and versionno < 0:
                raise ESGPublishError("Must specify a version for replicated datasets, e.g. in the mapfile or with --new-version/--version-list.")

        # If using a version map, look up the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s"%datasetName)

        context = initcontext.copy()

        # Get the offline flag
        if type(offlineArg) is dict:
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets
        if offline:
            forceAggregate = False

        # Get a file iterator and sample file
        if dmap is not None:
            if len(dmap[(datasetName, versionno)]) == 0:
                warning("No files specified for dataset %s, version %d."%(datasetName, versionno))
                continue
            firstFile = dmap[(datasetName, versionno)][0][0]
            fileiter = datasetMapIterator(dmap, datasetName, versionno, extraFields=extraFields, offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                fileiter = multiDirectoryIterator([direc for direc, sampfile in direcTuples], filefilt)
            else:
                fileiter = fnIterator([sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and handlerDictionary.has_key(datasetName):
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name
            info("Using project name = %s"%projectName)
        if prevProject is not None and projectName != prevProject:
            raise ESGPublishError("Multiple projects found: %s, %s. Can only publish from one project"%(prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored'%name)
            else:
                context[name] = value

        # Add dataset_version to the context to allow version to be a mandatory field
        if versionno > -1:
            context['dataset_version'] = versionno
        elif newVersion is not None:
            context['dataset_version'] = newVersion

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        dataset = None
        if testProgress1 is not None:
            testProgress1[1] = (100./ct)*iloop
            if not offline:
                testProgress1[2] = (100./ct)*iloop + (50./ct)
            else:
                testProgress1[2] = (100./ct)*iloop + (100./ct)

        dataset = extractFromDataset(datasetName, fileiter, Session, handler, cfHandler,
                                     aggregateDimensionName=aggregateDimension, offline=offline, operation=operation,
                                     progressCallback=testProgress1, perVariable=perVariable, keepVersion=keepVersion,
                                     newVersion=newVersion, extraFields=extraFields, masterGateway=masterGateway,
                                     comment=comment, useVersion=versionno, forceRescan=forceAggregate,
                                     nodbwrite=nodbwrite, pid_connector=pid_connector, **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.
        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceAggregate

        if testProgress2 is not None:
            testProgress2[1] = (100./ct)*iloop + 50./ct
            testProgress2[2] = (100./ct)*(iloop + 1)
        if runAggregate and (not nodbwrite):
            aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension, cfHandler=cfHandler,
                               progressCallback=testProgress2, datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)

        # Save the context with the dataset, so that it can be searched later
        if (not nodbwrite):
            handler.saveContext(datasetName, Session)

        datasets.append(dataset)

    return datasets
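#---------------------------------------------------------------------------------
# Hedged usage sketch, not part of the original module: a minimal scan-only call of
# iterateOverDatasets driven by a dataset map, mirroring the call made in
# esgpublishWrapper below. The import locations, the project name and the mapfile
# and init-file paths are assumptions for illustration only.
#---------------------------------------------------------------------------------
def example_scan_from_mapfile():
    from esgcet.publish import (iterateOverDatasets, readDatasetMap, initdb,
                                registerHandlers, CREATE_OP)  # assumed import location

    config, Session = initdb(init_file='/esg/config/esgcet/esg.ini')  # hypothetical init file
    registerHandlers()

    dmap, extraFields = readDatasetMap('/tmp/test.map', parse_extra_fields=True)  # hypothetical mapfile
    datasetNames = dmap.keys()
    datasetNames.sort()

    # Scan and aggregate; returns a list of persistent Dataset instances
    return iterateOverDatasets('cmip5', dmap, None, datasetNames, Session, 'time',
                               CREATE_OP, '.*\.nc$', {}, False, {},
                               keepVersion=False, newVersion=None,
                               extraFields=extraFields, readFiles=False)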
def esgpublishWrapper(**kw):

    from esgcet.query import queryDatasetMap

    aggregateDimension = kw.get("aggregateDimension", "time")
    datasetMapfile = kw.get("datasetMapfile", None)
    datasetName = kw.get("datasetName", None)
    directoryList = kw.get("directoryList", None)
    echoSql = kw.get("echoSql", False)
    filefilt = kw.get("filefilt", '.*\.nc$')
    init_file = kw.get("init_file", None)
    initcontext = kw.get("initcontext", {})
    keepVersion = kw.get("keepVersion", False)
    las = kw.get("las", False)
    log_filename = kw.get("log_filename", None)
    masterGateway = kw.get("masterGateway", None)
    message = kw.get("message", None)
    offline = kw.get("offline", False)
    parent = kw.get("parent", None)
    perVariable = kw.get("perVariable", None)
    projectName = kw.get("projectName", None)
    properties = kw.get("properties", {})
    publish = kw.get("publish", False)
    publishOnly = kw.get("publishOnly", False)
    publishOp = kw.get("publishOp", CREATE_OP)
    readFiles = kw.get("readFiles", False)
    readFromCatalog = kw.get("readFromCatalog", False)
    reinitThredds = kw.get("reinitThredds", None)
    rescan = kw.get("rescan", False)
    rescanDatasetName = kw.get("rescanDatasetName", [])
    resultThreddsDictionary = None
    service = kw.get("service", None)
    summarizeErrors = kw.get("summarizeErrors", False)
    testProgress1 = kw.get("testProgress1", None)
    testProgress2 = kw.get("testProgress2", None)
    thredds = kw.get("thredds", False)
    threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None)
    version = kw.get("version", None)

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    # Must specify version for replications
    if masterGateway is not None and version is None:
        raise ESGPublishError("Must specify version with --new-version for replicated datasets")

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file, echoSql=echoSql, log_filename=log_filename)

    # Register project handlers
    registerHandlers()

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(directoryList, filefilt, initContext=props,
                                                            datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(directoryList, filefilt, initContext=props,
                                                                     datasetName=datasetName)
            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map: dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s"%projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s "%listerSection
            commandArgs += " ".join(directoryList)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                      filefilt=filefilt, datasetName=datasetName, offline=True):
                size, mtime = sizet
                if dmap.has_key((dsetName, -1)):
                    dmap[(dsetName, -1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName, -1)] = [(filepath, str(size))]

            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")

    # Iterate over datasets
    if not publishOnly:
        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension,
                                       publishOp, filefilt, initcontext, offline, properties,
                                       keepVersion=keepVersion, newVersion=version, extraFields=extraFields,
                                       masterGateway=masterGateway, comment=message, readFiles=readFiles)

    result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent,
                                service=service, perVariable=perVariable,
                                threddsCatalogDictionary=threddsCatalogDictionary,
                                reinitThredds=reinitThredds, readFromCatalog=readFromCatalog)

    return result
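#---------------------------------------------------------------------------------
# Hedged usage sketch, not part of the original module: drives the full
# scan/THREDDS/publish pipeline through esgpublishWrapper with a pre-generated
# mapfile. The import location and all file/project names are assumptions for
# illustration only.
#---------------------------------------------------------------------------------
def example_publish_from_mapfile():
    from esgcet.publish import esgpublishWrapper, CREATE_OP  # assumed import location

    # Returns the publishDatasetList result: (datasetName, version) => status
    return esgpublishWrapper(
        datasetMapfile='/tmp/test.map',           # hypothetical mapfile
        init_file='/esg/config/esgcet/esg.ini',   # hypothetical init file
        projectName='cmip5',                      # hypothetical project
        publishOp=CREATE_OP,
        thredds=True,                             # write TDS catalogs and reinitialize the server
        publish=True)                             # contact the gateway after cataloging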
def publishDatasetList(datasetNames, Session, parentId=None, handlerDictionary=None, publish=True, thredds=True,
                       las=False, progressCallback=None, service=None, perVariable=None,
                       threddsCatalogDictionary=None, reinitThredds=None, readFromCatalog=False,
                       restInterface=False, schema=None):
    """
    Publish a list of datasets:

    - For each dataset, write a THREDDS catalog.
    - Add the new catalogs to the THREDDS catalog tree and reinitialize the THREDDS server.
    - Reinitialize the LAS server.
    - Publish each dataset to the gateway.

    Returns a dictionary: (datasetName, version) => status

    datasetNames
      A list of (string_dataset_name, version) tuples.

    Session
      A database Session.

    parentId
      The string (or dictionary) persistent identifier of the parent of the datasets. If None (the default),
      the parent id for each dataset is generated from ``handler.getParentId()``. If a dictionary, each
      dataset name is used as a key to lookup the respective parent id. If a string, the parent id is set
      to the string for all datasets being published. This function can be overridden in the project handler
      to implement a project-specific dataset hierarchy.

    handlerDictionary
      A dictionary mapping dataset_name => handler.

    publish
      Boolean flag: if true (the default), contact the gateway to publish this dataset.

    thredds
      Boolean flag: if true (the default), write the associated THREDDS catalog.

    las
      Boolean flag: if true, write the associated LAS catalog.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``,
      ``initial`` is the initial value reported, ``final`` is the final value reported.

    service
      String service name. If omitted, the first online/offline service in the configuration is used.

    perVariable
      Boolean, overrides ``variable_per_file`` config option.

    threddsCatalogDictionary
      If not None, just generate catalogs in strings, not the THREDDS directories, and set
      threddsCatalogDictionary[datasetId] = string_catalog

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.
      If None, defaults to the value of the thredds option.

    readFromCatalog
      Boolean flag. If True, read the TDS catalog definitions from threddsCatalogDictionary.
      threddsCatalogDictionary must also be set.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    schema
      (Optional) String name of the schema to validate against, for RESTful publication calls.
    """
    session = Session()
    resultDict = {}
    if readFromCatalog and threddsCatalogDictionary is None:
        raise ESGPublishError("Must set THREDDS catalog dictionary when readFromCatalog is True.")

    # Get handlers for each dataset
    if handlerDictionary is None:
        handlers = {}
        for datasetName, versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()
            if dset is None:
                raise ESGPublishError("Dataset not found: %s"%datasetName)
            handler = getHandlerByName(dset.project, None, Session)
            handlers[datasetName] = handler
    else:
        handlers = handlerDictionary

    # reinitThredds defaults to the value of the thredds option
    if reinitThredds is None:
        reinitThredds = thredds

    if thredds:
        for datasetName, versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()

            # If the dataset version is not the latest, publish as a per-time dataset without aggregation,
            # since the dataset variables only relate to the latest dataset version
            latestVersion = dset.getVersion()
            if versionno == -1:
                versionno = latestVersion
            if versionno != latestVersion:
                if perVariable:
                    messaging.info("Generating THREDDS catalog in per-time format, since version %d is not the latest version (%d)"%(versionno, latestVersion))
                perVariable = False

            handler = handlers[datasetName]

            # If threddsCatalogDictionary is not set, create the TDS catalog in the TDS content directory ...
            if threddsCatalogDictionary is None:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                generateThredds(datasetName, Session, threddsOutput, handler, service=service,
                                perVariable=perVariable, versionNumber=versionno)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... else if threddsCatalogDictionary is the catalog source:
            elif readFromCatalog:
                catalogString = threddsCatalogDictionary[(datasetName, versionno)]
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                messaging.info("Writing THREDDS catalog %s"%threddsOutputPath)
                threddsOutput.write(catalogString)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... otherwise write the catalog in a 'string file'
            else:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)  # Creates catalog entry
                threddsOutput = cStringIO.StringIO()
                generateThredds(datasetName, Session, threddsOutput, handler, service=service,
                                perVariable=perVariable, versionNumber=versionno)
                threddsCatalogDictionary[(datasetName, versionno)] = threddsOutput.getvalue()
                threddsOutput.close()

    if reinitThredds:
        updateThreddsMasterCatalog(Session)
        result = reinitializeThredds()

    if las:
        try:
            result = reinitializeLAS()
        except Exception, e:
            messaging.error("Error on LAS reinitialization: %s, new datasets not added."%e)