def esgpublishWrapper(**kw):

    from esgcet.query import queryDatasetMap

    aggregateDimension = kw.get("aggregateDimension", "time")
    datasetMapfile = kw.get("datasetMapfile", None)
    datasetName = kw.get("datasetName", None)
    directoryList = kw.get("directoryList", None)
    echoSql = kw.get("echoSql", False)
    filefilt = kw.get("filefilt", r'.*\.nc$')
    init_file = kw.get("init_file", None)
    initcontext = kw.get("initcontext", {})
    keepVersion = kw.get("keepVersion", False)
    las = kw.get("las", False)
    log_filename = kw.get("log_filename", None)
    masterGateway = kw.get("masterGateway", None)
    message = kw.get("message", None)
    offline = kw.get("offline", False)
    parent = kw.get("parent", None)
    perVariable = kw.get("perVariable", None)
    projectName = kw.get("projectName", None)
    properties = kw.get("properties", {})
    publish = kw.get("publish", False)
    publishOnly = kw.get("publishOnly", False)
    publishOp = kw.get("publishOp", CREATE_OP)
    readFiles = kw.get("readFiles", False)
    readFromCatalog = kw.get("readFromCatalog", False)
    reinitThredds = kw.get("reinitThredds", None)
    rescan = kw.get("rescan", False)
    rescanDatasetName = kw.get("rescanDatasetName", [])
    resultThreddsDictionary = None
    service = kw.get("service", None)
    summarizeErrors = kw.get("summarizeErrors", False)
    testProgress1 = kw.get("testProgress1", None)
    testProgress2 = kw.get("testProgress2", None)
    thredds = kw.get("thredds", False)
    threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None)
    version = kw.get("version", None)

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    # Must specify version for replications
    if masterGateway is not None and version is None:
        raise ESGPublishError("Must specify version with --new-version for replicated datasets")

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file, echoSql=echoSql, log_filename=log_filename)

    # Register project handlers
    registerHandlers()

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()
    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()
    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(directoryList, filefilt, initContext=props, datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(directoryList, filefilt, initContext=props, datasetName=datasetName)
            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map: dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s" % projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(directoryList)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
                size, mtime = sizet
                if (dsetName, -1) in dmap:
                    dmap[(dsetName, -1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName, -1)] = [(filepath, str(size))]
            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")

    # Iterate over datasets
    if not publishOnly:
        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, publishOp, filefilt, initcontext, offline, properties, keepVersion=keepVersion, newVersion=version, extraFields=extraFields, masterGateway=masterGateway, comment=message, readFiles=readFiles)

    result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent, service=service, perVariable=perVariable, threddsCatalogDictionary=threddsCatalogDictionary, reinitThredds=reinitThredds, readFromCatalog=readFromCatalog)

    return result
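
# Usage sketch for esgpublishWrapper (not part of the original module): the
# config path, map file, and project name below are hypothetical placeholders.
# It shows the common flow of scanning a dataset map file and publishing the
# results to THREDDS, using only keywords accepted by the function above.
def _example_publish_from_map():
    return esgpublishWrapper(
        init_file="/esg/config/esgcet/esg.ini",  # hypothetical config location
        datasetMapfile="datasets.map",           # hypothetical dataset map file
        projectName="cmip5",                     # hypothetical project name
        thredds=True,
        publish=True,
    )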
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "hi:", ['database-delete', 'database-only', 'echo-sql', 'map=', 'no-republish', 'no-thredds-reinit', 'skip-gateway', 'skip-index', 'las', 'log=', 'rest-api', 'skip-thredds', 'sync-thredds', 'use-list='])
    except getopt.error:
        print sys.exc_value
        return

    deleteAll = False
    datasetMap = None
    deleteDset = False
    unpublishOnGateway = False
    echoSql = False
    init_file = None
    gatewayOp = DELETE
    las = False
    log_filename = None
    republish = True
    restApi = None
    thredds = True
    syncThredds = False
    useList = False
    threddsReinit = True

    for flag, arg in args:
        if flag == '--database-delete':
            deleteDset = True
        elif flag == '--database-only':
            gatewayOp = NO_OPERATION
            thredds = False
            deleteDset = True
        elif flag == '--echo-sql':
            echoSql = True
        elif flag in ['-h', '--help']:
            return
        elif flag == '-i':
            init_file = arg
        elif flag == '--map':
            datasetMap = readDatasetMap(arg)
        elif flag == '--skip-gateway':
            gatewayOp = NO_OPERATION
        elif flag == '--skip-index':
            gatewayOp = NO_OPERATION
        elif flag == '--las':
            las = True
        elif flag == '--log':
            log_filename = arg
        elif flag == '--no-republish':
            republish = False
        elif flag == '--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--rest-api':
            restApi = True
        elif flag == '--skip-thredds':
            thredds = False
        elif flag == '--sync-thredds':
            syncThredds = True
        elif flag == '--use-list':
            useList = True
            useListPath = arg

    if gatewayOp != NO_OPERATION and unpublishOnGateway:
        gatewayOp = UNPUBLISH

    # Load the configuration and set up a database connection.
    # Check for a missing configuration before using it.
    config = loadConfig(init_file)
    if config is None:
        raise ESGPublishError("No configuration file found.")
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    threddsRoot = config.get('DEFAULT', 'thredds_root')

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    if datasetMap is None:
        if not useList:
            datasetNames = [parseDatasetVersionId(item) for item in lastargs]
        else:
            if useListPath == '-':
                namelist = sys.stdin
            else:
                namelist = open(useListPath)
            datasetNames = []
            for line in namelist.readlines():
                versionId = parseDatasetVersionId(line.strip())
                datasetNames.append(versionId)
    else:
        datasetNames = datasetMap.keys()
    datasetNames.sort()

    result = deleteDatasetList(datasetNames, Session, gatewayOp, thredds, las, deleteDset, deleteAll=deleteAll, republish=republish, reinitThredds=threddsReinit, restInterface=restApi)

    # Republish previous versions as needed. This will happen if the latest version
    # was deleted from the database, and is not the only version. In this case the
    # previous version will be rescanned to generate the aggregations.
    if republish:
        statusDict, republishList = result
        if len(republishList) > 0:
            # Register project handlers.
            registerHandlers()
            info("Republishing modified datasets:")
            republishDatasetNames = [generateDatasetVersionId(dsetTuple) for dsetTuple in republishList]
            dmap, offline = queryDatasetMap(republishDatasetNames, Session)
            datasetNames = dmap.keys()
            datasets = iterateOverDatasets(None, dmap, None, republishList, Session, "time", UPDATE_OP, None, {}, offline, {}, forceAggregate=True)
            republishOp = (gatewayOp != NO_OPERATION)  # Don't republish if skipping the gateway op
            result = publishDatasetList(datasetNames, Session, publish=republishOp, thredds=thredds)

    # Synchronize database and THREDDS catalogs
    if syncThredds:
        threddsRoot = config.get('DEFAULT', 'thredds_root')

        # Make a dictionary of catalogs from the database
        session = Session()
        subcatalogs = session.query(Catalog).select_from(join(Catalog, Dataset, Catalog.dataset_name == Dataset.name)).all()
        catdict = {}
        for catalog in subcatalogs:
            location = os.path.join(threddsRoot, catalog.location)
            catdict[location] = 1
        session.close()

        # Scan all XML files in the threddsRoot
        os.path.walk(threddsRoot, cleanupCatalogs, catdict)
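
# Invocation sketch for the main() above (not part of the original script): the
# dataset id is a hypothetical placeholder. This mirrors running
#     esgunpublish --database-delete <dataset_id>
# which removes the dataset from the node database as well as THREDDS and the gateway.
def _example_unpublish():
    main(['--database-delete', 'cmip5.output1.PCMDI.example.v1'])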
def new_query_page(self, parent, tab_name=None, query_id=None):
    # Start the busy routine to indicate to the users something is happening
    self.parent.parent.busyCursor = 'watch'
    self.parent.parent.busyWidgets = [
        self.parent.parent.pane2.pane('EditPaneTop'),
        self.parent.parent.pane2.pane('EditPaneBottom'),
        self.parent.parent.pane2.pane('EditPaneStatus'),
        self.parent.parent.pane.pane('ControlPane'),
    ]
    pub_busy.busyStart(self.parent.parent)
    try:
        properties = {}
        projectName = self.parent.query_fields['project'].get()  # Must have projectName
        handler = getHandlerByName(projectName, None, self.Session)
        tabcolor = Pmw.Color.changebrightness(self.parent.parent, pub_controls.query_tab_color, 0.6)

        if query_id is None:
            # Build the query properties from the GUI fields
            for x in self.parent.query_fields.keys():
                query_string = self.parent.query_fields[x].get().lstrip()
                if (query_string == "-Any-") or (len(query_string) == 0):
                    properties[x] = (2, "%")
                else:
                    properties[x] = (1, query_string)

            if properties['id'] == (2, "%"):
                del properties['id']  # This causes an error because you cannot modify the 'id'

            result, headers = queryDatasets(projectName, handler, self.Session, properties)
            self.new_page(parent, tabName=None, tab_color=tabcolor, page_type="query", query_result=result, list_fields=headers)
        else:
            result, headers = queryDatasets(projectName, handler, self.Session, properties)
            query_id_found = False
            for x in result:
                if query_id == x[0][:-1]:
                    self.new_page(parent, tabName=None, tab_color=tabcolor, page_type="query", query_result=[x], list_fields=headers)
                    query_id_found = True
                    break
            if not query_id_found:
                warning("The specified dataset id '%s' was not found." % query_id)

        # Enable the "Data Publication" button
        self.parent.ControlButton3.configure(state='normal')

        datasetNames = []
        for x in result:
            datasetNames.append(x[1])
        dmap, offline_map, extraFields = queryDatasetMap(datasetNames, self.Session, extra_fields=True)

        # Check if offline or not, then set the iteration values for each page
        selected_page = self.parent.parent.main_frame.selected_top_page
        self.parent.parent.hold_offline[selected_page] = offline_map
        self.parent.parent.main_frame.projectName[selected_page] = projectName
        self.parent.parent.main_frame.dmap[selected_page] = dmap
        self.parent.parent.main_frame.extraFields[selected_page] = extraFields
        self.parent.parent.main_frame.datasetMapfile[selected_page] = None
        self.parent.parent.directoryMap[selected_page] = None
        self.parent.parent.main_frame.dirp_firstfile[selected_page] = None
        self.parent.parent.defaultGlobalValues[selected_page] = {}
    finally:
        # Always turn off the busy cursor, even if an exception was raised
        pub_busy.busyEnd(self.parent.parent)
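
# Note on the property encoding used in new_query_page (inferred from the code
# above, not documented in the original): queryDatasets receives a dictionary
# mapping each query field to a (match_type, value) tuple, where (2, "%")
# appears to act as a wildcard match and (1, value) as an exact match, e.g.:
#
#     properties = {
#         "experiment": (1, "historical"),  # match this value exactly
#         "model": (2, "%"),                # match any model
#     }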
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:cdehi:m:p:ru", ['append', 'create', 'dataset=', 'delete-files', 'echo-sql', 'experiment=', 'filter=', 'help', 'keep-version', 'log=', 'map=', 'message=', 'model=', 'offline', 'parent=', 'per-time', 'per-variable', 'project=', 'property=', 'publish', 'new-version=', 'no-thredds-reinit', 'noscan', 'read-directories', 'read-files', 'rename-files', 'replace', 'replica=', 'rest-api', 'service=', 'set-replica', 'summarize-errors', 'thredds', 'thredds-reinit', 'update', 'use-existing=', 'use-list=', 'validate=', 'version-list=', 'nodbwrite'])
    except getopt.error:
        print sys.exc_value
        return

    aggregateDimension = "time"
    datasetMapfile = None
    datasetName = None
    echoSql = False
    filefilt = r'.*\.nc$'
    init_file = None
    initcontext = {}
    keepVersion = False
    las = False
    log_filename = None
    masterGateway = None
    message = None
    offline = False
    parent = None
    perVariable = None
    projectName = None
    properties = {}
    publish = False
    publishOnly = False
    publishOp = CREATE_OP
    readFiles = False
    rescan = False
    rescanDatasetName = []
    restApi = None
    schema = None
    service = None
    summarizeErrors = False
    testProgress1 = testProgress2 = None
    thredds = False
    threddsReinit = None
    version = None
    versionList = None
    nodbwrite = False

    for flag, arg in args:
        if flag == '-a':
            aggregateDimension = arg
        elif flag == '--append':
            publishOp = UPDATE_OP
        elif flag in ['-c', '--create']:
            publishOp = CREATE_OP
        elif flag == '--dataset':
            datasetName = arg
        elif flag in ['-d', '--delete-files']:
            publishOp = DELETE_OP
        elif flag == '--echo-sql':
            echoSql = True
        elif flag == '--experiment':
            initcontext['experiment'] = arg
        elif flag == '--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag == '-i':
            init_file = arg
        elif flag == '--keep-version':
            keepVersion = True
        elif flag == '--log':
            log_filename = arg
        elif flag == '--map':
            datasetMapfile = arg
        elif flag in ['-m', '--message']:
            message = arg
        elif flag == '--model':
            initcontext['model'] = arg
        elif flag == '--nodbwrite':
            nodbwrite = True
        elif flag == '--new-version':
            try:
                version = int(arg)
                if version <= 0:
                    raise ValueError
            except ValueError:
                raise ESGPublishError("Version number must be a positive integer: %s" % arg)
        elif flag == '--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--noscan':
            publishOnly = True
        elif flag == '--offline':
            offline = True
        elif flag == '--parent':
            parent = arg
        elif flag == '--per-time':
            perVariable = False
        elif flag == '--per-variable':
            perVariable = True
        elif flag == '--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag == '--publish':
            publish = True
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag == '--read-files':
            readFiles = True
        elif flag == '--rename-files':
            publishOp = RENAME_OP
        elif flag in ['-r', '--replace']:
            publishOp = REPLACE_OP
        elif flag == '--replica':
            masterGateway = arg
            warning("The --replica option is deprecated. Use --set-replica instead")
        elif flag == '--rest-api':
            restApi = True
        elif flag == '--service':
            service = arg
        elif flag == '--set-replica':
            masterGateway = 'DEFAULT'
        elif flag == '--summarize-errors':
            summarizeErrors = True
        elif flag == '--thredds':
            thredds = True
        elif flag == '--thredds-reinit':
            threddsReinit = True
        elif flag in ['-u', '--update']:
            publishOp = UPDATE_OP
        elif flag == '--use-existing':
            rescan = True
            rescanDatasetName.append(arg)
        elif flag == '--use-list':
            rescan = True
            if arg == '-':
                namelist = sys.stdin
            else:
                namelist = open(arg)
            for line in namelist.readlines():
                line = line.strip()
                # Skip blank lines and comments
                if len(line) > 0 and line[0] != '#':
                    rescanDatasetName.append(line)
        elif flag == '--validate':
            schema = arg
            restApi = True
        elif flag == '--version-list':
            versionList = arg

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    if version is not None and versionList is not None:
        raise ESGPublishError("Cannot specify both --new-version and --version-list")

    # Read a per-dataset version list of the form dataset_id | version
    if versionList is not None:
        version = {}
        f = open(versionList)
        lines = f.readlines()
        f.close()
        for line in lines:
            line = line.strip()
            dsid, vers = line.split('|')
            dsid = dsid.strip()
            vers = int(vers.strip())
            version[dsid] = vers

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()
    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()
    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if len(lastargs) == 0:
                print "No directories specified."
                return
            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=props, datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=props, datasetName=datasetName)
            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map: dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s" % projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(lastargs)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
                size, mtime = sizet
                if (dsetName, -1) in dmap:
                    dmap[(dsetName, -1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName, -1)] = [(filepath, str(size))]
            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")
        min_version = -1
    else:
        min_version = sorted(datasetNames, key=lambda x: x[1])[0][1]

    # Must specify version for replications
    if min_version == -1 and masterGateway is not None and version is None and versionList is None:
        raise ESGPublishError("Must specify version with --new-version (or --version-list) for replicated datasets")

    # Iterate over datasets
    if not publishOnly:
        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, publishOp, filefilt, initcontext, offline, properties, keepVersion=keepVersion, newVersion=version, extraFields=extraFields, masterGateway=masterGateway, comment=message, readFiles=readFiles, nodbwrite=nodbwrite)

    if not nodbwrite:
        result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent, service=service, perVariable=perVariable, reinitThredds=threddsReinit, restInterface=restApi, schema=schema)

    if summarizeErrors:
        print 'Summary of errors:'
        for name, versionno in datasetNames:
            dset = Dataset.lookup(name, Session)
            print dset.get_name(Session), dset.get_project(Session), dset.get_model(Session), dset.get_experiment(Session), dset.get_run_name(Session)
            if dset.has_warnings(Session):
                print '=== Dataset: %s ===' % dset.name
                for line in dset.get_warnings(Session):
                    print line
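
# Representative command lines handled by the main() above (a sketch; the
# config path, project name, and data directory are hypothetical placeholders):
#
#     esgpublish -i /esg/config/esgcet/esg.ini --project cmip5 \
#         --read-files --thredds --publish /data/cmip5/output1
#
#     esgpublish --map datasets.map --noscan --publish
#
# The first scans a directory tree and publishes it; the second skips scanning
# (--noscan) and publishes datasets already described by a dataset map file.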