def fill_in_data_information_file(self, dirfilename, onoff_line="online"):
    from esgcet.publish import readDatasetMap

    dmap, extraFields = readDatasetMap(dirfilename, parse_extra_fields=True)
    datasetNames = dmap.keys()
    datasetNames.sort()
    self.parent.parent.dmap = dmap
    self.parent.parent.extraFields = extraFields
    self.parent.parent.datasetNames = datasetNames
    self.parent.parent.datasetMapfile = dirfilename

    if onoff_line == "online":
        tab_name = "Collection %i" % self.parent.parent.top_ct
        page_type = "collection"
        tcolor = 'lightgreen'
        onoff_flag = False
    else:
        tab_name = "Offline %i" % self.parent.parent.top_ct
        page_type = "offline"
        tcolor = 'orange'
        onoff_flag = True

    self.parent.parent.ntk.new_page(self.parent.parent, tab_name,
                                    tab_color=tcolor, page_type=page_type)

    # Copy the defaultGlobalValues into initcontext
    setdefaultGlobalValues = {}
    initcontext = {}
    for x in self.defaultGlobalValues.keys():
        if self.defaultGlobalValues[x].find("Default Global Setting") == -1:
            setdefaultGlobalValues[x] = self.defaultGlobalValues[x]
    initcontext.update(setdefaultGlobalValues)    # Update the initcontext

    # Set the iteration values for each page
    selected_page = self.parent.parent.main_frame.selected_top_page
    self.parent.parent.main_frame.dmap[selected_page] = dmap
    self.parent.parent.main_frame.extraFields[selected_page] = extraFields
    self.parent.parent.main_frame.datasetMapfile[selected_page] = dirfilename
    self.parent.parent.main_frame.dirp_firstfile[selected_page] = None
    self.parent.parent.offline_file_directory[selected_page] = "file"
    self.parent.parent.defaultGlobalValues[selected_page] = initcontext
    self.parent.parent.directoryMap[selected_page] = None
    self.parent.parent.hold_offline[selected_page] = onoff_flag
    self.parent.parent.main_frame.projectName[selected_page] = self.project_dataset.get()
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:ehi:o:p:",
                                       ['dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',
                                        'filter=', 'help', 'max-threads=', 'offline', 'output=', 'project=',
                                        'property=', 'read-directories', 'read-files',
                                        'service=', 'use-version-dir', 'version='])
    except getopt.error:
        print sys.exc_value
        return

    if len(lastargs) == 0:
        print 'No directory specified'
        return

    appendMap = None
    datasetName = None
    datasetTechNotesURL = None
    datasetTechNotesTitle = None
    filefilt = '.*\.nc$'
    init_file = None
    offline = False
    output = sys.stdout
    projectName = None
    properties = {}
    readFiles = False
    service = None
    max_threads = 4
    version_dir = False
    use_version = None

    for flag, arg in args:
        if flag == '-a':
            if os.path.exists(arg):
                appendMap = readDatasetMap(arg)
            else:
                appendMap = {}
            output = open(arg, 'a')
        elif flag == '--dataset':
            datasetName = arg
        elif flag == '--dataset-tech-notes':
            datasetTechNotesURL = arg
        elif flag == '--dataset-tech-notes-title':
            datasetTechNotesTitle = arg
        elif flag == '--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag == '-i':
            init_file = arg
        elif flag == '--max-threads':
            max_threads = int(arg)
        elif flag in ['-o', '--output']:
            output = open(arg, 'w')
        elif flag == '--offline':
            offline = True
        elif flag == '--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag == '--read-files':
            readFiles = True
        elif flag == '--service':
            service = arg
        elif flag == '--use-version-dir':
            version_dir = True
        elif flag == '--version':
            version_dir = True
            if not re.match('^[0-9]+$', arg[0]):        # e.g. 'vYYYYMMDD'
                use_version = arg[1:]
            else:
                use_version = arg

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            warning("No project name specified!")
            multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties,
                                                      datasetName=datasetName, use_version=version_dir)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties,
                                                               datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()

        datasetMapVersion = {}
        if version_dir:
            # check for version directory
            for dataset_id in keys:
                ds_id_version = dataset_id.split('#')
                if len(ds_id_version) == 2:
                    ds_id, ds_version = ds_id_version
                    if not re.match('^[0-9]+$', ds_version):
                        warning("Version must be an integer. Skipping version %s of dataset %s." % (ds_version, ds_id))
                        continue
                    if use_version and ds_version != use_version:
                        continue
                    if ds_id in datasetMapVersion:
                        datasetMapVersion[ds_id].append(ds_version)
                    else:
                        datasetMapVersion[ds_id] = [ds_version]
                else:
                    error("No version directory found. Skipping dataset %s." % dataset_id)

            if datasetMapVersion:
                keys = datasetMapVersion.keys()
                keys.sort()
            else:
                if use_version:
                    error("Version %s not found. No datasets to process." % use_version)
                else:
                    error("No datasets to process.")
                return

        for dataset_id in keys:
            skip_dataset = False
            dataset_id_version = dataset_id
            path_version = None
            # if multiple versions of the same dataset available use latest version
            if version_dir:
                path_version = sorted(datasetMapVersion[dataset_id])[-1]
                if len(datasetMapVersion[dataset_id]) > 1:
                    info("Multiple versions for %s (%s), processing latest (%s)"
                         % (dataset_id, datasetMapVersion[dataset_id], path_version))
                dataset_id_version = '%s#%s' % (dataset_id, path_version)

            direcTuple = datasetMap[dataset_id_version]
            direcTuple.sort()
            mapfile_md = {}

            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet

                    mapfile_md[filepath] = [size]
                    mapfile_md[filepath].append("mod_time=%f" % float(mtime))

                    extraStuff = "mod_time=%f" % float(mtime)

                    if datasetTechNotesURL is not None:
                        mapfile_md[filepath].append('dataset_tech_notes=%s' % datasetTechNotesURL)
                    if datasetTechNotesTitle is not None:
                        mapfile_md[filepath].append('dataset_tech_notes_title=%s' % datasetTechNotesTitle)

            # Compute checksums for all files of the dataset in parallel
            if checksumClient is not None:
                pool = ThreadPool(processes=max_threads)
                args = [(filepath, checksumClient) for filepath in mapfile_md]
                checksum_list = pool.map(calc_checksum_wrapper, args)

                for entry in checksum_list:
                    if not entry[1]:
                        error('Calculation of checksum for file %s failed. Skipping dataset %s ...'
                              % (entry[0], dataset_id))
                        skip_dataset = True     # skip entire dataset if we have one file without checksum
                        break
                    mapfile_md[entry[0]].append('checksum=%s' % entry[1])
                    mapfile_md[entry[0]].append('checksum_type=%s' % checksumType)

            for fpath in mapfile_md:
                mapfile_line = '%s | %s | %d' % (dataset_id_version, fpath, mapfile_md[fpath][0])
                for md in mapfile_md[fpath][1:]:
                    mapfile_line += ' | %s' % md

                # Print the map entry if:
                # - Checksum exists for all files of dataset (in case checksumming is enabled)
                # - The map is being created, not appended, or
                # - The existing map does not have the dataset, or
                # - The existing map has the dataset, but not the file.
                if path_version:
                    ds_id = (dataset_id, int(path_version))
                else:
                    ds_id = (dataset_id, -1)
                if not skip_dataset and ((appendMap is None) or
                                         (not appendMap.has_key(ds_id)) or
                                         ((fpath, "%d" % mapfile_md[fpath][0]) not in appendMap[ds_id])):
                    print >>output, mapfile_line

    else:                               # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")

        listerSection = getOfflineLister(config, "project:%s" % projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(lastargs)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                  filefilt=filefilt, datasetName=datasetName,
                                                                  offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f" % float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or \
               ((filepath, "%d" % size) not in appendMap[dsetName]):
                print >>output, "%s | %s | %d %s" % (dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
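# A minimal sketch (not part of the original module) of the mapfile record format emitted
# above: "dataset_id[#version] | path | size [| key=value ...]".  The extra-field names
# mirror the ones this scanner writes (mod_time, checksum, checksum_type); the sample
# dataset id and path in the example below are hypothetical.
def _parse_mapfile_line_example(line):
    fields = [f.strip() for f in line.split('|')]
    dataset_id, path, size = fields[0], fields[1], int(fields[2])
    extras = dict(f.split('=', 1) for f in fields[3:])
    return dataset_id, path, size, extras

# Example:
#   _parse_mapfile_line_example(
#       "cmip5.output1.MYCENTER.mymodel.historical#20110101 | /data/x.nc | 12345 | mod_time=1299624436.000000")
#   -> ('cmip5.output1.MYCENTER.mymodel.historical#20110101', '/data/x.nc', 12345,
#       {'mod_time': '1299624436.000000'})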
def return_content2(self, appendOpt=False):
    from esgcet.publish import iterateOverDatasets, processIterator
    from esgcet.config import getHandlerByName
    from esgcet.model import eventName
    from esgcet.config import loadConfig

    # Initialize parameters for iterating over datasets
    initcontext = {}
    aggregateOnly = False
    # appendOpt = False
    initcontext = {}
    properties = {}
    publish = False
    publishOnly = False
    thredds = False
    testProgress1 = [self.parent.parent.statusbar.show, 0, 50]
    testProgress2 = [self.parent.parent.statusbar.show, 50, 100]
    handlerDictionary = {}

    # Get the currently selected tab and the selected datasets
    tab_name = self.parent.parent.top_notebook.getcurselection()
    selected_page = self.parent.parent.main_frame.selected_top_page
    datasetNames = []
    # datasetNames2 = []

    if selected_page is None:
        warning("Must generate a list of datasets to scan before data extraction can occur.")
        return

    if (selected_page is not None) or (self.parent.parent.hold_offline[selected_page] == True):
        extraFields = None
        if (self.parent.parent.hold_offline[selected_page] == False) or \
           (isinstance(self.parent.parent.hold_offline[selected_page], types.DictType)):
            for x in self.parent.parent.main_frame.top_page_id[selected_page]:
                dsetVersionName = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text')
                # GANZ TODO version_label
                # ganz added this 1/21/11
                if self.parent.parent.main_frame.version_label[selected_page]:
                    dset_name = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text')
                    dsetVersion = self.parent.parent.main_frame.version_label[selected_page][x].cget('text')
                #####################################################################################
                else:
                    dset_name, dsetVersion = parseDatasetVersionId(dsetVersionName)

                # Retrieve all the datasets in the collection for display
                """ ganz test code
                status = pollDatasetPublicationStatus(dset_name, self.Session)
                status_text = pub_controls.return_status_text( status )
                if status_text != 'Error':
                    dsetTuple = parseDatasetVersionId(self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text'))
                    datasetNames2.append(dsetTuple)
                """

                # Retrieve only the datasets that have been selected
                if self.parent.parent.main_frame.top_page_id[selected_page][x].cget('bg') != 'salmon':
                    dsetTuple = parseDatasetVersionId(self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text'))
                    datasetNames.append(dsetTuple)

            dmap = self.parent.parent.main_frame.dmap[selected_page]
            extraFields = self.parent.parent.main_frame.extraFields[selected_page]
            datasetMapfile = self.parent.parent.main_frame.datasetMapfile[selected_page]
            projectName = self.parent.parent.main_frame.projectName[selected_page]
            directoryMap = self.parent.parent.directoryMap[selected_page]

            if dmap is not None:
                for x in datasetNames:
                    dsetId = x[0]
                    datasetName = x
                    try:
                        dmapentry = dmap[datasetName]
                    except:
                        # Check if the dataset map key was changed from (dsetname,-1) to (dsetname,version).
                        # If so, replace the entry with the new key.
                        trykey = (datasetName[0], -1)
                        dmapentry = dmap[trykey]
                        del dmap[trykey]
                        dmap[datasetName] = dmapentry
                    firstFile = dmapentry[0][0]
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, firstFile, self.Session)
                    handler = self.parent.parent.handlerDictionary[dsetId]

                # Copy the defaultGlobalValues into initcontext
                initcontext = self.parent.parent.main_frame.defaultGlobalValues[selected_page]
            else:
                # more test code
                myholdDirectoryMap = self.parent.parent.directoryMap[selected_page]
                # mydatasetNames = [(item,-1) for item in myholdDirectoryMap.keys()]
                mydatasetNames = [(item) for item in myholdDirectoryMap.keys()]
                # end
                for x in mydatasetNames:
                    dsetId = x[0]
                    datasetName = x
                    # ganz this is test code
                    try:
                        dmapentry = myholdDirectoryMap[datasetName]
                    except:
                        # Check if the dataset map key was changed from (dsetname,-1) to (dsetname,version).
                        # If so, replace the entry with the new key.
                        trykey = (datasetName[0], -1)
                        dmapentry = myholdDirectoryMap[trykey]
                        del myholdDirectoryMap[trykey]
                        myholdDirectoryMap[datasetName] = dmapentry
                    firstFile = dmapentry[0][1]
                    # end of test code
                    # firstFile = self.parent.parent.main_frame.dirp_firstfile[selected_page]
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, firstFile, self.Session)
                    handler = self.parent.parent.handlerDictionary[dsetId]
        else:                           # working off-line
            projectName = self.parent.parent.main_frame.projectName[selected_page]
            if self.parent.parent.offline_file_directory[selected_page] == "directory":
                if self.parent.parent.config is None:
                    extraction_controls.call_sessionmaker(self.parent.parent)
                datasetPaths = []
                dmap = {self.parent.parent.offline_datasetName: datasetPaths}
                listerSection = getOfflineLister(self.parent.parent.config, "project:%s" % projectName, None)
                offlineLister = self.parent.parent.config.get(listerSection, 'offline_lister_executable')
                lastargs = self.parent.parent.offline_directories
                commandArgs = "--config-section %s " % listerSection
                commandArgs += " ".join(lastargs)
                for filepath, size in processIterator(offlineLister, commandArgs,
                                                      filefilt=self.parent.parent.filefilt):
                    datasetPaths.append((filepath, str(size)))
                datasetNames = self.parent.parent.datasetNames
                directoryMap = None

                # get the handler
                for x in datasetNames:
                    dsetId = x[0]
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, None, self.Session, offline=True)

            elif self.parent.parent.offline_file_directory[selected_page] == "file":
                dmap = self.parent.parent.main_frame.dmap[selected_page]
                extraFields = self.parent.parent.main_frame.extraFields[selected_page]
                datasetMapfile = self.parent.parent.main_frame.datasetMapfile[selected_page]
                projectName = self.parent.parent.main_frame.projectName[selected_page]
                directoryMap = None

                if datasetMapfile is not None:
                    dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
                    datasetNames = dmap.keys()

                # get the handlers
                for x in datasetNames:
                    dsetId = x[0]
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, None, self.Session, offline=True)

        # Iterate over datasets
        if appendOpt:
            operation = UPDATE_OP
        else:
            operation = CREATE_OP

        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, self.Session,
                                       self.parent.parent.aggregateDimension, operation,
                                       self.parent.parent.filefilt, initcontext,
                                       self.parent.parent.hold_offline[selected_page], properties,
                                       comment=self.comments,
                                       testProgress1=testProgress1, testProgress2=testProgress2,
                                       handlerDictionary=self.parent.parent.handlerDictionary,
                                       extraFields=extraFields, readFiles=True)

        # If working on-line then replace the scanned list of datasets with
        # the complete list of datasets
        # test
        """
        print 'datasetNames:'
        for t1 in datasetNames:
            print t1
        print 'datasetNames2:'
        for t2 in datasetNames2:
            print t2
        """
        if not self.parent.parent.hold_offline[selected_page]:
            datasets = []
            versionObjs = []
            # ganz finally, tested datasetNames2 here
            for dsetName, version in datasetNames:
                result = Dataset.lookup(dsetName, self.Session, version=version)
                if result is not None:
                    entry, versionObj = result
                    datasets.append(entry)
                    versionObjs.append(versionObj)

        # Get the summary of errors after doing a data extraction
        dset_error = []
        for dset in datasets:
            status = dset.get_publication_status(self.Session)
            status_name = eventName[status]
            if dset.has_warnings(self.Session):
                dset_error.append(dset.get_name(self.Session))

        try:
            list_fields = getQueryFields(handler)
        except:
            handler = getHandlerByName(projectName, None, self.Session)
            list_fields = getQueryFields(handler)

        # Display the datasets in the "Collection" page
        # if self.parent.parent.hold_offline[selected_page] == True:
        #     tab_name = "Collection_Offline"
        #     from_tab = "Collection"
        #     pub_editorviewer = self.parent.parent.create_publisher_editor_viewer(
        #         self.parent.parent, tab_name, dataset, from_tab, self.Session)

        # Show the extracted datasets
        self.set_column_labels(len(datasets), list_fields)
        self.show_extracted_info(datasets, dset_error, list_fields, versionObjs)

    # Enable the "Data Publication" button
    self.parent.ControlButton3.configure(state='normal')
def esgpublishWrapper(**kw):
    from esgcet.query import queryDatasetMap

    aggregateDimension = kw.get("aggregateDimension", "time")
    datasetMapfile = kw.get("datasetMapfile", None)
    datasetName = kw.get("datasetName", None)
    directoryList = kw.get("directoryList", None)
    echoSql = kw.get("echoSql", False)
    filefilt = kw.get("filefilt", '.*\.nc$')
    init_file = kw.get("init_file", None)
    initcontext = kw.get("initcontext", {})
    keepVersion = kw.get("keepVersion", False)
    las = kw.get("las", False)
    log_filename = kw.get("log_filename", None)
    masterGateway = kw.get("masterGateway", None)
    message = kw.get("message", None)
    offline = kw.get("offline", False)
    parent = kw.get("parent", None)
    perVariable = kw.get("perVariable", None)
    projectName = kw.get("projectName", None)
    properties = kw.get("properties", {})
    publish = kw.get("publish", False)
    publishOnly = kw.get("publishOnly", False)
    publishOp = kw.get("publishOp", CREATE_OP)
    readFiles = kw.get("readFiles", False)
    readFromCatalog = kw.get("readFromCatalog", False)
    reinitThredds = kw.get("reinitThredds", None)
    rescan = kw.get("rescan", False)
    rescanDatasetName = kw.get("rescanDatasetName", [])
    resultThreddsDictionary = None
    service = kw.get("service", None)
    summarizeErrors = kw.get("summarizeErrors", False)
    testProgress1 = kw.get("testProgress1", None)
    testProgress2 = kw.get("testProgress2", None)
    thredds = kw.get("thredds", False)
    threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None)
    version = kw.get("version", None)

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    # Must specify version for replications
    if masterGateway is not None and version is None:
        raise ESGPublishError("Must specify version with --new-version for replicated datasets")

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file, echoSql=echoSql, log_filename=log_filename)

    # Register project handlers
    registerHandlers()

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()
    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()
    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
                projectName = handler.name
            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(directoryList, filefilt, initContext=props,
                                                            datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(directoryList, filefilt, initContext=props,
                                                                     datasetName=datasetName)
            datasetNames = [(item, -1) for item in directoryMap.keys()]
        # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s" % projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(directoryList)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                      filefilt=filefilt, datasetName=datasetName,
                                                                      offline=True):
                size, mtime = sizet
                if dmap.has_key((dsetName, -1)):
                    dmap[(dsetName, -1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName, -1)] = [(filepath, str(size))]
            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")

    # Iterate over datasets
    if not publishOnly:
        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session,
                                       aggregateDimension, publishOp, filefilt, initcontext, offline,
                                       properties, keepVersion=keepVersion, newVersion=version,
                                       extraFields=extraFields, masterGateway=masterGateway,
                                       comment=message, readFiles=readFiles)

    result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las,
                                parentId=parent, service=service, perVariable=perVariable,
                                threddsCatalogDictionary=threddsCatalogDictionary,
                                reinitThredds=reinitThredds, readFromCatalog=readFromCatalog)

    return result
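# A minimal sketch (not in the original module) showing one way to drive esgpublishWrapper
# from a pre-generated dataset map.  The map file name "mydatasets.map" and the project
# name "cmip5" are hypothetical; the keyword names are the ones read by kw.get(...) above.
def _example_publish_from_mapfile():
    return esgpublishWrapper(
        datasetMapfile="mydatasets.map",    # hypothetical map produced by esgscanWrapper
        projectName="cmip5",                # hypothetical project name
        thredds=True,                       # write THREDDS catalogs
        publish=False)                      # skip the gateway/index publication step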
def esgscanWrapper(directoryList, **kw):
    if len(directoryList) == 0:
        raise ESGPublishError('No directory specified')

    output = sys.stdout
    appendMap = None
    appendPath = kw.get("appendPath", None)
    if appendPath is not None:
        if os.path.exists(appendPath):
            appendMap = readDatasetMap(appendPath)
        else:
            appendMap = {}
        output = open(appendPath, 'a')
    datasetName = kw.get("datasetName", None)
    filefilt = kw.get("fileFilt", '.*\.nc$')
    init_file = kw.get("initFile", None)
    offline = kw.get("offline", False)
    outputPath = kw.get("outputPath", None)
    if outputPath is not None:
        output = open(outputPath, 'w')
    projectName = kw.get("projectName", None)
    readFiles = kw.get("readFiles", False)
    service = kw.get("service", None)

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(directoryList, filefilt, datasetName=datasetName)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(directoryList, filefilt, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()
        for datasetId in keys:
            direcTuple = datasetMap[datasetId]
            direcTuple.sort()
            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet
                    extraStuff = "mod_time=%f" % float(mtime)

                    if checksumClient is not None:
                        csum = checksum(filepath, checksumClient)
                        extraStuff += " | checksum=%s | checksum_type=%s" % (csum, checksumType)

                    # Print the map entry if:
                    # - The map is being created, not appended, or
                    # - The existing map does not have the dataset, or
                    # - The existing map has the dataset, but not the file.
                    if (appendMap is None) or (not appendMap.has_key(datasetId)) or \
                       ((filepath, "%d" % size) not in appendMap[datasetId]):
                        print >>output, "%s | %s | %d | %s" % (datasetId, filepath, size, extraStuff)

    else:                               # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")

        listerSection = getOfflineLister(config, "project:%s" % projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(directoryList)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                  filefilt=filefilt, datasetName=datasetName,
                                                                  offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f" % float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or \
               ((filepath, "%d" % size) not in appendMap[dsetName]):
                print >>output, "%s | %s | %d %s" % (dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
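# A companion sketch (not in the original module): generate the dataset map that
# _example_publish_from_mapfile() above consumes.  The directory, project name, and output
# file are hypothetical; the keyword names (fileFilt, readFiles, outputPath, projectName)
# are the ones read by kw.get(...) above.
def _example_scan_directories():
    esgscanWrapper(["/data/myproject/output"],      # hypothetical directory to scan
                   projectName="cmip5",             # hypothetical project name
                   fileFilt=".*\.nc$",
                   readFiles=False,                 # map each matching file found under the directories
                   outputPath="mydatasets.map")     # hypothetical output map file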
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "hi:",
                                       ['database-delete', 'database-only', 'echo-sql', 'map=',
                                        'no-republish', 'no-thredds-reinit', 'skip-gateway', 'skip-index',
                                        'las', 'log=', 'rest-api', 'skip-thredds', 'sync-thredds',
                                        'use-list='])
    except getopt.error:
        print sys.exc_value
        return

    deleteAll = False
    datasetMap = None
    deleteDset = False
    unpublishOnGateway = False
    echoSql = False
    init_file = None
    gatewayOp = DELETE
    las = False
    log_filename = None
    republish = True
    restApi = None
    thredds = True
    syncThredds = False
    useList = False
    threddsReinit = True

    for flag, arg in args:
        if flag == '--database-delete':
            deleteDset = True
        elif flag == '--database-only':
            gatewayOp = NO_OPERATION
            thredds = False
            deleteDset = True
        elif flag == '--echo-sql':
            echoSql = True
        elif flag in ['-h', '--help']:
            return
        elif flag == '-i':
            init_file = arg
        elif flag == '--map':
            datasetMap = readDatasetMap(arg)
        elif flag == '--skip-gateway':
            gatewayOp = NO_OPERATION
        elif flag == '--skip-index':
            gatewayOp = NO_OPERATION
        elif flag == '--las':
            las = True
        elif flag == '--log':
            log_filename = arg
        elif flag == '--no-republish':
            republish = False
        elif flag == '--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--rest-api':
            restApi = True
        elif flag == '--skip-thredds':
            thredds = False
        elif flag == '--sync-thredds':
            syncThredds = True
        elif flag == '--use-list':
            useList = True
            useListPath = arg

    if gatewayOp != NO_OPERATION and unpublishOnGateway:
        gatewayOp = UNPUBLISH

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    if config is None:
        raise ESGPublishError("No configuration file found.")

    threddsRoot = config.get('DEFAULT', 'thredds_root')

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    if datasetMap is None:
        if not useList:
            datasetNames = [parseDatasetVersionId(item) for item in lastargs]
        else:
            if useListPath == '-':
                namelist = sys.stdin
            else:
                namelist = open(useListPath)
            datasetNames = []
            for line in namelist.readlines():
                versionId = parseDatasetVersionId(line.strip())
                datasetNames.append(versionId)
    else:
        datasetNames = datasetMap.keys()
        datasetNames.sort()

    result = deleteDatasetList(datasetNames, Session, gatewayOp, thredds, las, deleteDset,
                               deleteAll=deleteAll, republish=republish,
                               reinitThredds=threddsReinit, restInterface=restApi)

    # Republish previous versions as needed. This will happen if the latest version
    # was deleted from the database, and is not the only version. In this case the
    # previous version will be rescanned to generate the aggregations.
    if republish:
        statusDict, republishList = result
        if len(republishList) > 0:

            # Register project handlers.
            registerHandlers()

            info("Republishing modified datasets:")
            republishDatasetNames = [generateDatasetVersionId(dsetTuple) for dsetTuple in republishList]
            dmap, offline = queryDatasetMap(republishDatasetNames, Session)
            datasetNames = dmap.keys()
            datasets = iterateOverDatasets(None, dmap, None, republishList, Session, "time",
                                           UPDATE_OP, None, {}, offline, {}, forceAggregate=True)
            republishOp = (gatewayOp != NO_OPERATION)   # Don't republish if skipping the gateway op
            result = publishDatasetList(datasetNames, Session, publish=republishOp, thredds=thredds)

    # Synchronize database and THREDDS catalogs
    if syncThredds:
        threddsRoot = config.get('DEFAULT', 'thredds_root')

        # Make a dictionary of catalogs from the database
        session = Session()
        subcatalogs = session.query(Catalog).select_from(
            join(Catalog, Dataset, Catalog.dataset_name == Dataset.name)).all()
        catdict = {}
        for catalog in subcatalogs:
            location = os.path.join(threddsRoot, catalog.location)
            catdict[location] = 1
        session.close()

        # Scan all XML files in the threddsRoot
        os.path.walk(threddsRoot, cleanupCatalogs, catdict)
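# A minimal sketch (not in the original script) of the dataset identifiers accepted on the
# command line or via --use-list: "dataset_id" or "dataset_id#version", exactly what
# parseDatasetVersionId() above expects.  The dataset ids below are hypothetical.
def _example_use_list_ids():
    sample_lines = [
        "cmip5.output1.MYCENTER.mymodel.historical.r1i1p1#20110512",    # explicit version
        "cmip5.output1.MYCENTER.mymodel.rcp45.r1i1p1",                  # no version part
    ]
    return [parseDatasetVersionId(line.strip()) for line in sample_lines]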
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:cdehi:m:p:ru",
                                       ['append', 'create', 'dataset=', 'delete-files', 'echo-sql',
                                        'experiment=', 'filter=', 'help', 'keep-version', 'log=', 'map=',
                                        'message=', 'model=', 'offline', 'parent=', 'per-time', 'per-variable',
                                        'project=', 'property=', 'publish', 'new-version=', 'no-thredds-reinit',
                                        'noscan', 'read-directories', 'read-files', 'rename-files', 'replace',
                                        'replica=', 'rest-api', 'service=', 'set-replica', 'summarize-errors',
                                        'thredds', 'thredds-reinit', 'update', 'use-existing=', 'use-list=',
                                        'validate=', 'version-list=', 'nodbwrite'])
    except getopt.error:
        print sys.exc_value
        return

    aggregateDimension = "time"
    datasetMapfile = None
    datasetName = None
    echoSql = False
    filefilt = '.*\.nc$'
    init_file = None
    initcontext = {}
    keepVersion = False
    las = False
    log_filename = None
    masterGateway = None
    message = None
    offline = False
    parent = None
    perVariable = None
    projectName = None
    properties = {}
    publish = False
    publishOnly = False
    publishOp = CREATE_OP
    readFiles = False
    rescan = False
    rescanDatasetName = []
    restApi = None
    schema = None
    service = None
    summarizeErrors = False
    testProgress1 = testProgress2 = None
    thredds = False
    threddsReinit = None
    version = None
    versionList = None
    nodbwrite = False

    for flag, arg in args:
        if flag == '-a':
            aggregateDimension = arg
        elif flag == '--append':
            publishOp = UPDATE_OP
        elif flag in ['-c', '--create']:
            publishOp = CREATE_OP
        elif flag == '--dataset':
            datasetName = arg
        elif flag in ['-d', '--delete-files']:
            publishOp = DELETE_OP
        elif flag == '--echo-sql':
            echoSql = True
        elif flag == '--experiment':
            initcontext['experiment'] = arg
        elif flag == '--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag == '-i':
            init_file = arg
        elif flag == '--keep-version':
            keepVersion = True
        elif flag == '--log':
            log_filename = arg
        elif flag == '--map':
            datasetMapfile = arg
        elif flag in ['-m', '--message']:
            message = arg
        elif flag == '--model':
            initcontext['model'] = arg
        elif flag == '--nodbwrite':
            nodbwrite = True
        elif flag == '--new-version':
            try:
                version = string.atoi(arg)
                if version <= 0:
                    raise ValueError
            except ValueError:
                raise ESGPublishError("Version number must be a positive integer: %s" % arg)
        elif flag == '--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--noscan':
            publishOnly = True
        elif flag == '--offline':
            offline = True
        elif flag == '--parent':
            parent = arg
        elif flag == '--per-time':
            perVariable = False
        elif flag == '--per-variable':
            perVariable = True
        elif flag == '--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag == '--publish':
            publish = True
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag == '--read-files':
            readFiles = True
        elif flag == '--rename-files':
            publishOp = RENAME_OP
        elif flag in ['-r', '--replace']:
            publishOp = REPLACE_OP
        elif flag == '--replica':
            masterGateway = arg
            warning("The --replica option is deprecated. Use --set-replica instead")
        elif flag == '--rest-api':
            restApi = True
        elif flag == '--service':
            service = arg
        elif flag == '--set-replica':
            masterGateway = 'DEFAULT'
        elif flag == '--summarize-errors':
            summarizeErrors = True
        elif flag == '--thredds':
            thredds = True
        elif flag == '--thredds-reinit':
            threddsReinit = True
        elif flag in ['-u', '--update']:
            publishOp = UPDATE_OP
        elif flag == '--use-existing':
            rescan = True
            rescanDatasetName.append(arg)
        elif flag == '--use-list':
            rescan = True
            if arg == '-':
                namelist = sys.stdin
            else:
                namelist = open(arg)
            for line in namelist.readlines():
                line = line.strip()
                if line[0] != '#':
                    rescanDatasetName.append(line)
        elif flag == '--validate':
            schema = arg
            restApi = True
        elif flag == '--version-list':
            versionList = arg

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    if version is not None and versionList is not None:
        raise ESGPublishError("Cannot specify both --new-version and --version-list")

    if versionList is not None:
        version = {}
        f = open(versionList)
        lines = f.readlines()
        f.close()
        for line in lines:
            line = line.strip()
            dsid, vers = line.split('|')
            dsid = dsid.strip()
            vers = int(vers.strip())
            version[dsid] = vers

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()
    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()
    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if len(lastargs) == 0:
                print "No directories specified."
                return
            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
                projectName = handler.name
            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=props,
                                                            datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=props,
                                                                     datasetName=datasetName)
            datasetNames = [(item, -1) for item in directoryMap.keys()]
        # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s" % projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(lastargs)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler,
                                                                      filefilt=filefilt, datasetName=datasetName,
                                                                      offline=True):
                size, mtime = sizet
                if dmap.has_key((dsetName, -1)):
                    dmap[(dsetName, -1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName, -1)] = [(filepath, str(size))]
            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")
        min_version = -1
    else:
        min_version = sorted(datasetNames, key=lambda x: x[1])[0][1]

    # Must specify version for replications
    if min_version == -1 and masterGateway is not None and version is None and versionList is None:
        raise ESGPublishError("Must specify version with --new-version (or --version-list) for replicated datasets")

    # Iterate over datasets
    if not publishOnly:
        # pdb.set_trace()
        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session,
                                       aggregateDimension, publishOp, filefilt, initcontext, offline,
                                       properties, keepVersion=keepVersion, newVersion=version,
                                       extraFields=extraFields, masterGateway=masterGateway,
                                       comment=message, readFiles=readFiles, nodbwrite=nodbwrite)

    if not nodbwrite:
        result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las,
                                    parentId=parent, service=service, perVariable=perVariable,
                                    reinitThredds=threddsReinit, restInterface=restApi, schema=schema)
        # print `result`

    if summarizeErrors:
        print 'Summary of errors:'
        for name, versionno in datasetNames:
            dset = Dataset.lookup(name, Session)
            print dset.get_name(Session), dset.get_project(Session), dset.get_model(Session), dset.get_experiment(Session), dset.get_run_name(Session)
            if dset.has_warnings(Session):
                print '=== Dataset: %s ===' % dset.name
                for line in dset.get_warnings(Session):
                    print line
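# A minimal sketch (not in the original script) of the --version-list file format parsed
# above: one "dataset_id | version" pair per line.  The dataset ids, versions, and the
# file name "versions.txt" are hypothetical.
def _example_parse_version_list(path="versions.txt"):
    # versions.txt might contain, e.g.:
    #   cmip5.output1.MYCENTER.mymodel.historical.r1i1p1 | 20110512
    #   cmip5.output1.MYCENTER.mymodel.rcp45.r1i1p1      | 20110601
    version = {}
    for line in open(path).readlines():
        dsid, vers = line.strip().split('|')
        version[dsid.strip()] = int(vers.strip())
    return version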