def start_harvest(self, parent):
        """Publish the datasets currently selected in the GUI.

        Collects the rows on the active notebook page that are not flagged
        as errors (background color 'salmon'), publishes them with
        publishDatasetList, then updates each row's status label and its
        Ok / Warning / Error button.

        parent -- accepted for interface compatibility; not used here.
        """
        from esgcet.publish import publishDatasetList
        from esgcet.model import Dataset, PUBLISH_FAILED_EVENT, ERROR_LEVEL

        # Background color used for the 'Ok' (no warnings) button state.
        dcolor1 = Pmw.Color.changebrightness(self.parent.parent, 'aliceblue',
                                             0.8)

        # Start the busy routine to indicate to the users something is happening
        self.parent.parent.busyCursor = 'watch'
        self.parent.parent.busyWidgets = [
            self.parent.parent.pane2.pane('EditPaneTop'),
            self.parent.parent.pane2.pane('EditPaneBottom'),
            self.parent.parent.pane2.pane('EditPaneStatus'),
            self.parent.parent.pane.pane('ControlPane')
        ]
        pub_busy.busyStart(self.parent.parent)
        try:
            # Generate the list of (name, version) tuples to be published.
            datasetNames = []
            GUI_line = {}  # dataset name -> row index on the selected page
            tab_name = self.parent.parent.top_notebook.getcurselection()
            selected_page = self.parent.parent.main_frame.selected_top_page

            if selected_page is None:
                warning(
                    "Must generate a list of datasets to scan before publishing can occur."
                )
                # The finally clause below turns the busy cursor off.
                return

            for x in self.parent.parent.main_frame.top_page_id[selected_page]:
                # 'salmon' background marks rows with errors; skip those.
                if self.parent.parent.main_frame.top_page_id[selected_page][x].cget(
                        'bg'
                ) != 'salmon' and self.parent.parent.main_frame.top_page_id2[
                        selected_page][x].cget('bg') != 'salmon':
                    dset_name = self.parent.parent.main_frame.top_page_id2[
                        selected_page][x].cget('text')
                    versionNum = self.parent.parent.main_frame.version_label[
                        selected_page][x].cget('text')
                    dsetTuple = (dset_name, versionNum)
                    datasetNames.append(dsetTuple)
                    GUI_line[dset_name] = x
                else:
                    # Reset the selection checkbox on error rows.
                    if self.parent.parent.main_frame.top_page_id2[
                            selected_page][x].cget('bg') == 'salmon':
                        self.parent.parent.main_frame.top_page_id[
                            selected_page][x].configure(relief='raised',
                                                        background='salmon',
                                                        image=self.off)

            # Publish collection of datasets
            testProgress = (self.parent.parent.statusbar.show, 0, 100)
            publishThredds = (quality_control_widgets.get_CheckBox3() == 1)
            publishGateway = (quality_control_widgets.get_CheckBox2() == 1)
            if publishThredds:
                print('publishing to Thredds')
            if publishGateway:
                print('publishing to Gateway')

            status_dict = publishDatasetList(datasetNames,
                                             self.Session,
                                             publish=publishGateway,
                                             thredds=publishThredds,
                                             progressCallback=testProgress)

            # Show the published status for every dataset processed.
            for x in status_dict.keys():
                status = status_dict[x]
                dsetName, versionNo = x
                dsetVersionName = generateDatasetVersionId(x)
                guiLine = GUI_line[dsetName]

                self.parent.parent.main_frame.status_label[selected_page][
                    guiLine].configure(
                        text=pub_controls.return_status_text(status))
                dset = Dataset.lookup(dsetName, self.Session)
                if dset.has_warnings(self.Session):
                    # Pink/'Error' when any warning reaches ERROR_LEVEL,
                    # otherwise yellow/'Warning'; button opens the error viewer.
                    warningLevel = dset.get_max_warning_level(self.Session)
                    if warningLevel >= ERROR_LEVEL:
                        buttonColor = "pink"
                        buttonText = "Error"
                    else:
                        buttonColor = "yellow"
                        buttonText = "Warning"
                    self.parent.parent.main_frame.ok_err[selected_page][
                        guiLine].configure(
                            text=buttonText,
                            bg=buttonColor,
                            relief='raised',
                            command=pub_controls.Command(
                                self.parent.parent.pub_buttonexpansion.
                                extraction_widgets.error_extraction_button,
                                dset))
                else:
                    self.parent.parent.main_frame.ok_err[selected_page][
                        guiLine].configure(
                            text='Ok',
                            bg=dcolor1,
                            highlightcolor=dcolor1,
                            relief='sunken',
                        )
        finally:
            # BUG FIX: busyEnd was previously called twice on both the early
            # return and the exception path (an extra call in an 'except'
            # clause and another before 'return'); this single finally call
            # is sufficient and always runs.
            pub_busy.busyEnd(self.parent.parent)
            self.my_refresh()
# ===== Example #2 =====
def esgpublishWrapper(**kw):
    """Scan and/or publish a collection of datasets, driven by keyword args.

    The recognized keywords mirror the esgpublish command-line options,
    most notably:
      datasetMapfile -- path of a dataset map file to read (scan input)
      directoryList  -- directories to scan when no map file is given
      rescan         -- reuse dataset info already in the database
      offline        -- dataset files are not locally accessible
      publish / thredds / las -- which publication targets to hit
      publishOnly    -- skip the scan step and publish directly

    Returns the result of publishDatasetList (per-dataset status).
    Raises ESGPublishError on invalid option combinations.
    """

    from esgcet.query import queryDatasetMap

    aggregateDimension = kw.get("aggregateDimension", "time")
    datasetMapfile = kw.get("datasetMapfile", None)
    datasetName = kw.get("datasetName", None)
    directoryList = kw.get("directoryList", None)
    echoSql = kw.get("echoSql", False)
    filefilt = kw.get("filefilt", r'.*\.nc$')  # raw string: regex, not escapes
    init_file = kw.get("init_file", None)
    initcontext = kw.get("initcontext", {})
    keepVersion = kw.get("keepVersion", False)
    las = kw.get("las", False)
    log_filename = kw.get("log_filename", None)
    masterGateway = kw.get("masterGateway", None)
    message = kw.get("message", None)
    offline = kw.get("offline", False)
    parent = kw.get("parent", None)
    perVariable = kw.get("perVariable", None)
    projectName = kw.get("projectName", None)
    properties = kw.get("properties", {})
    publish = kw.get("publish", False)
    publishOnly = kw.get("publishOnly", False)
    publishOp = kw.get("publishOp", CREATE_OP)
    readFiles = kw.get("readFiles", False)
    readFromCatalog = kw.get("readFromCatalog", False)
    reinitThredds = kw.get("reinitThredds", None)
    rescan = kw.get("rescan", False)
    rescanDatasetName = kw.get("rescanDatasetName", [])
    resultThreddsDictionary = None
    service = kw.get("service", None)
    # Accepted for interface compatibility but currently unused below:
    summarizeErrors = kw.get("summarizeErrors", False)
    testProgress1 = kw.get("testProgress1", None)
    testProgress2 = kw.get("testProgress2", None)
    thredds = kw.get("thredds", False)
    threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None)
    version = kw.get("version", None)

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError(
            "Must specify project with --project for offline datasets")

    # Must specify version for replications
    if masterGateway is not None and version is None:
        raise ESGPublishError(
            "Must specify version with --new-version for replicated datasets")

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file,
                             echoSql=echoSql,
                             log_filename=log_filename)

    # Register project handlers
    registerHandlers()

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile,
                                           parse_extra_fields=True)
        datasetNames = list(dmap.keys())

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file
        # comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = list(dmap.keys())

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                # No project given: infer it from the first scanned file.
                multiIter = multiDirectoryIterator(directoryList,
                                                   filefilt=filefilt)
                # builtin next() works on Python 2.6+ and 3 (was .next()).
                firstFile, size = next(multiIter)
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError(
                        "No project found in file %s, specify with --project."
                        % firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(
                    directoryList,
                    filefilt,
                    initContext=props,
                    datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(
                    directoryList,
                    filefilt,
                    initContext=props,
                    datasetName=datasetName)

            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map:
        #   dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName,
                                       None,
                                       Session,
                                       offline=True)
            dmap = {}
            listerSection = getOfflineLister(config,
                                             "project:%s" % projectName,
                                             service)
            offlineLister = config.get(listerSection,
                                       'offline_lister_executable')
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(directoryList)
            for dsetName, filepath, sizet in processNodeMatchIterator(
                    offlineLister,
                    commandArgs,
                    handler,
                    filefilt=filefilt,
                    datasetName=datasetName,
                    offline=True):
                size, mtime = sizet
                # setdefault replaces the Python-2-only dict.has_key idiom.
                dmap.setdefault((dsetName, -1), []).append(
                    (filepath, str(size)))

            datasetNames = list(dmap.keys())

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")

    # Iterate over datasets (scan step), unless publish-only was requested.
    if not publishOnly:
        datasets = iterateOverDatasets(projectName,
                                       dmap,
                                       directoryMap,
                                       datasetNames,
                                       Session,
                                       aggregateDimension,
                                       publishOp,
                                       filefilt,
                                       initcontext,
                                       offline,
                                       properties,
                                       keepVersion=keepVersion,
                                       newVersion=version,
                                       extraFields=extraFields,
                                       masterGateway=masterGateway,
                                       comment=message,
                                       readFiles=readFiles)

    result = publishDatasetList(
        datasetNames,
        Session,
        publish=publish,
        thredds=thredds,
        las=las,
        parentId=parent,
        service=service,
        perVariable=perVariable,
        threddsCatalogDictionary=threddsCatalogDictionary,
        reinitThredds=reinitThredds,
        readFromCatalog=readFromCatalog)

    return result
# ===== Example #3 =====
def esgpublishWrapper(**kw):
    """Scan and/or publish a collection of datasets, driven by keyword args.

    Keywords mirror the esgpublish command-line options (datasetMapfile,
    directoryList, rescan, offline, publish/thredds/las, publishOnly, ...).
    Returns the result of publishDatasetList (per-dataset status).
    Raises ESGPublishError on invalid option combinations.
    """

    from esgcet.query import queryDatasetMap

    aggregateDimension = kw.get("aggregateDimension", "time")
    datasetMapfile = kw.get("datasetMapfile", None)
    datasetName = kw.get("datasetName", None)
    directoryList = kw.get("directoryList", None)
    echoSql = kw.get("echoSql", False)
    filefilt = kw.get("filefilt", r".*\.nc$")  # raw string: regex, not escapes
    init_file = kw.get("init_file", None)
    initcontext = kw.get("initcontext", {})
    keepVersion = kw.get("keepVersion", False)
    las = kw.get("las", False)
    log_filename = kw.get("log_filename", None)
    masterGateway = kw.get("masterGateway", None)
    message = kw.get("message", None)
    offline = kw.get("offline", False)
    parent = kw.get("parent", None)
    perVariable = kw.get("perVariable", None)
    projectName = kw.get("projectName", None)
    properties = kw.get("properties", {})
    publish = kw.get("publish", False)
    publishOnly = kw.get("publishOnly", False)
    publishOp = kw.get("publishOp", CREATE_OP)
    readFiles = kw.get("readFiles", False)
    readFromCatalog = kw.get("readFromCatalog", False)
    reinitThredds = kw.get("reinitThredds", None)
    rescan = kw.get("rescan", False)
    rescanDatasetName = kw.get("rescanDatasetName", [])
    resultThreddsDictionary = None
    service = kw.get("service", None)
    # Accepted for interface compatibility but currently unused below:
    summarizeErrors = kw.get("summarizeErrors", False)
    testProgress1 = kw.get("testProgress1", None)
    testProgress2 = kw.get("testProgress2", None)
    thredds = kw.get("thredds", False)
    threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None)
    version = kw.get("version", None)

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    # Must specify version for replications
    if masterGateway is not None and version is None:
        raise ESGPublishError("Must specify version with --new-version for replicated datasets")

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file, echoSql=echoSql, log_filename=log_filename)

    # Register project handlers
    registerHandlers()

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = list(dmap.keys())

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = list(dmap.keys())

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                # No project given: infer it from the first scanned file.
                multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
                # builtin next() works on Python 2.6+ and 3 (was .next()).
                firstFile, size = next(multiIter)
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(
                    directoryList, filefilt, initContext=props, datasetName=datasetName
                )
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(
                    directoryList, filefilt, initContext=props, datasetName=datasetName
                )

            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s" % projectName, service)
            offlineLister = config.get(listerSection, "offline_lister_executable")
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(directoryList)
            for dsetName, filepath, sizet in processNodeMatchIterator(
                offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True
            ):
                size, mtime = sizet
                # setdefault replaces the Python-2-only dict.has_key idiom.
                dmap.setdefault((dsetName, -1), []).append((filepath, str(size)))

            datasetNames = list(dmap.keys())

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")

    # Iterate over datasets (scan step), unless publish-only was requested.
    if not publishOnly:
        datasets = iterateOverDatasets(
            projectName,
            dmap,
            directoryMap,
            datasetNames,
            Session,
            aggregateDimension,
            publishOp,
            filefilt,
            initcontext,
            offline,
            properties,
            keepVersion=keepVersion,
            newVersion=version,
            extraFields=extraFields,
            masterGateway=masterGateway,
            comment=message,
            readFiles=readFiles,
        )

    result = publishDatasetList(
        datasetNames,
        Session,
        publish=publish,
        thredds=thredds,
        las=las,
        parentId=parent,
        service=service,
        perVariable=perVariable,
        threddsCatalogDictionary=threddsCatalogDictionary,
        reinitThredds=reinitThredds,
        readFromCatalog=readFromCatalog,
    )

    return result
# ===== Example #4 =====
def main(argv):
    """Entry point for esgunpublish: delete and/or unpublish datasets.

    Parses command-line flags, deletes the selected datasets via
    deleteDatasetList, republishes previous versions where required, and
    optionally synchronizes the THREDDS catalog tree with the database.

    argv -- the argument list (sys.argv[1:] style).
    Raises ESGPublishError when no configuration file is found.
    """
    try:
        args, lastargs = getopt.getopt(argv, "hi:", [
            'database-delete', 'database-only', 'echo-sql', 'map=',
            'no-republish', 'no-thredds-reinit', 'skip-gateway', 'skip-index',
            'las', 'log=', 'rest-api', 'skip-thredds', 'sync-thredds',
            'use-list='
        ])
    except getopt.error as e:
        # 'as e' replaces the Python-2-only sys.exc_value attribute.
        print(e)
        return

    # Option defaults.
    deleteAll = False
    datasetMap = None
    deleteDset = False
    unpublishOnGateway = False
    echoSql = False
    init_file = None
    gatewayOp = DELETE
    las = False
    log_filename = None
    republish = True
    restApi = None
    thredds = True
    syncThredds = False
    useList = False
    threddsReinit = True
    for flag, arg in args:
        if flag == '--database-delete':
            deleteDset = True
        elif flag == '--database-only':
            gatewayOp = NO_OPERATION
            thredds = False
            deleteDset = True
        elif flag == '--echo-sql':
            echoSql = True
        elif flag in ['-h', '--help']:
            return
        elif flag == '-i':
            init_file = arg
        elif flag == '--map':
            datasetMap = readDatasetMap(arg)
        elif flag == '--skip-gateway':
            gatewayOp = NO_OPERATION
        elif flag == '--skip-index':
            gatewayOp = NO_OPERATION
        elif flag == '--las':
            las = True
        elif flag == '--log':
            log_filename = arg
        elif flag == '--no-republish':
            republish = False
        elif flag == '--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--rest-api':
            restApi = True
        elif flag == '--skip-thredds':
            thredds = False
        elif flag == '--sync-thredds':
            syncThredds = True
        elif flag == '--use-list':
            useList = True
            useListPath = arg

    if gatewayOp != NO_OPERATION and unpublishOnGateway:
        gatewayOp = UNPUBLISH

    # Load the configuration and set up a database connection.
    # BUG FIX: the original dereferenced config (config.getdburl) before
    # checking it for None; check immediately after loading instead.
    config = loadConfig(init_file)
    if config is None:
        raise ESGPublishError("No configuration file found.")
    engine = create_engine(config.getdburl('extract'),
                           echo=echoSql,
                           pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    threddsRoot = config.get('DEFAULT', 'thredds_root')

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    # Build the list of (name, version) targets from the map file,
    # a name-list file / stdin, or the trailing command-line arguments.
    if datasetMap is None:
        if not useList:
            datasetNames = [parseDatasetVersionId(item) for item in lastargs]
        else:
            if useListPath == '-':
                namelist = sys.stdin
            else:
                namelist = open(useListPath)
            datasetNames = []
            for line in namelist.readlines():
                versionId = parseDatasetVersionId(line.strip())
                datasetNames.append(versionId)
            # BUG FIX: close the list file (but never close stdin).
            if namelist is not sys.stdin:
                namelist.close()
    else:
        datasetNames = list(datasetMap.keys())
        datasetNames.sort()
    result = deleteDatasetList(datasetNames,
                               Session,
                               gatewayOp,
                               thredds,
                               las,
                               deleteDset,
                               deleteAll=deleteAll,
                               republish=republish,
                               reinitThredds=threddsReinit,
                               restInterface=restApi)

    # Republish previous versions as needed. This will happen if the latest
    # version was deleted from the database, and is not the only version.
    # In this case the previous version will be rescanned to generate the
    # aggregations.
    if republish:
        statusDict, republishList = result
        if len(republishList) > 0:

            # Register project handlers.
            registerHandlers()

            info("Republishing modified datasets:")
            republishDatasetNames = [
                generateDatasetVersionId(dsetTuple)
                for dsetTuple in republishList
            ]
            dmap, offline = queryDatasetMap(republishDatasetNames, Session)
            datasetNames = list(dmap.keys())
            datasets = iterateOverDatasets(None,
                                           dmap,
                                           None,
                                           republishList,
                                           Session,
                                           "time",
                                           UPDATE_OP,
                                           None, {},
                                           offline, {},
                                           forceAggregate=True)
            # Don't republish if skipping the gateway op.
            republishOp = (gatewayOp != NO_OPERATION)
            result = publishDatasetList(datasetNames,
                                        Session,
                                        publish=republishOp,
                                        thredds=thredds)

    # Synchronize database and THREDDS catalogs
    if syncThredds:
        threddsRoot = config.get('DEFAULT', 'thredds_root')

        # Make a dictionary of catalogs from the database
        session = Session()
        subcatalogs = session.query(Catalog).select_from(
            join(Catalog, Dataset,
                 Catalog.dataset_name == Dataset.name)).all()
        catdict = {}
        for catalog in subcatalogs:
            location = os.path.join(threddsRoot, catalog.location)
            catdict[location] = 1
        session.close()

        # Scan all XML files in the threddsroot; remove catalogs
        # not present in the database (cleanupCatalogs callback).
        os.path.walk(threddsRoot, cleanupCatalogs, catdict)
# ===== Example #5 =====
def main(argv):

    try:
        args, lastargs = getopt.getopt(argv, "a:cdehi:m:p:ru", ['append', 'create', 'dataset=', 'delete-files', 'echo-sql', 'experiment=', 'filter=', 'help', 'keep-version', 'log=', 'map=', 'message=', 'model=', 'offline',  'parent=', 'per-time', 'per-variable', 'project=', 'property=', 'publish', 'new-version=', 'no-thredds-reinit', 'noscan', 'read-directories', 'read-files', 'rename-files', 'replace', 'replica=', 'rest-api', 'service=', 'set-replica', 'summarize-errors', 'thredds', 'thredds-reinit', 'update', 'use-existing=', 'use-list=', 'validate=', 'version-list=', 'nodbwrite'])
    except getopt.error:
        print sys.exc_value
        return

    aggregateDimension = "time"
    datasetMapfile = None
    datasetName = None
    echoSql = False
    filefilt = '.*\.nc$'
    init_file = None
    initcontext = {}
    keepVersion = False
    las = False
    log_filename = None
    masterGateway = None
    message = None
    offline = False
    parent = None
    perVariable = None
    projectName = None
    properties = {}
    publish = False
    publishOnly = False
    publishOp = CREATE_OP
    readFiles = False
    rescan = False
    rescanDatasetName = []
    restApi = None
    schema = None
    service = None
    summarizeErrors = False
    testProgress1 = testProgress2 = None
    thredds = False
    threddsReinit = None
    version = None
    versionList = None
    nodbwrite = False

    for flag, arg in args:
        if flag=='-a':
            aggregateDimension = arg
        elif flag=='--append':
            publishOp = UPDATE_OP
        elif flag in ['-c', '--create']:
            publishOp = CREATE_OP
        elif flag=='--dataset':
            datasetName = arg
        elif flag in ['-d', '--delete-files']:
            publishOp = DELETE_OP
        elif flag=='--echo-sql':
            echoSql = True
        elif flag=='--experiment':
            initcontext['experiment'] = arg
        elif flag=='--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='-i':
            init_file = arg
        elif flag=='--keep-version':
            keepVersion = True
        elif flag=='--log':
            log_filename = arg
        elif flag=='--map':
            datasetMapfile = arg
        elif flag in ['-m', '--message']:
            message = arg
        elif flag=='--model':
            initcontext['model'] = arg
        elif flag=='--nodbwrite':
            nodbwrite = True
        elif flag=='--new-version':
            try:
                version = string.atoi(arg)
                if version <=0:
                    raise ValueError
            except ValueError:
                raise ESGPublishError("Version number must be a positive integer: %s"%arg)
        elif flag=='--no-thredds-reinit':
            threddsReinit = False
        elif flag=='--noscan':
            publishOnly = True
        elif flag=='--offline':
            offline = True
        elif flag=='--parent':
            parent = arg
        elif flag=='--per-time':
            perVariable = False
        elif flag=='--per-variable':
            perVariable = True
        elif flag=='--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag=='--publish':
            publish = True
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag=='--read-files':
            readFiles = True
        elif flag=='--rename-files':
            publishOp = RENAME_OP
        elif flag in ['-r', '--replace']:
            publishOp = REPLACE_OP
        elif flag=='--replica':
            masterGateway = arg
            warning("The --replica option is deprecated. Use --set-replica instead")
        elif flag=='--rest-api':
            restApi = True
        elif flag=='--service':
            service = arg
        elif flag=='--set-replica':
            masterGateway = 'DEFAULT'
        elif flag=='--summarize-errors':
            summarizeErrors = True
        elif flag=='--thredds':
            thredds = True
        elif flag=='--thredds-reinit':
            threddsReinit = True
        elif flag in ['-u', '--update']:
            publishOp = UPDATE_OP
        elif flag=='--use-existing':
            rescan = True
            rescanDatasetName.append(arg)
        elif flag=='--use-list':
            rescan = True
            if arg=='-':
                namelist=sys.stdin
            else:
                namelist = open(arg)
            for line in namelist.readlines():
                line = line.strip()
                if line[0]!='#':
                    rescanDatasetName.append(line)
        elif flag=='--validate':
            schema = arg
            restApi = True
        elif flag=='--version-list':
            versionList = arg

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    if version is not None and versionList is not None:
        raise ESGPublishError("Cannot specify both --new-version and --version-list")

    if versionList is not None:
        version = {}
        f = open(versionList)
        lines = f.readlines()
        f.close()
        for line in lines:
            line = line.strip()
            dsid, vers = line.split('|')
            dsid = dsid.strip()
            vers = int(vers.strip())
            version[dsid] = vers

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if len(lastargs)==0:
                print "No directories specified."
                return

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=props, datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=props, datasetName=datasetName)
            datasetNames = [(item,-1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s"%projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s "%listerSection
            commandArgs += " ".join(lastargs)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
                size, mtime = sizet
                if dmap.has_key((dsetName,-1)):
                    dmap[(dsetName,-1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName,-1)] = [(filepath, str(size))]

            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames)==0:
        warning("No datasets found.")
        min_version = -1
    else:
        min_version = sorted(datasetNames, key=lambda x: x[1])[0][1]

    # Must specify version for replications
    if min_version == -1 and masterGateway is not None and version is None and versionList is None:
        raise ESGPublishError("Must specify version with --new-version (or --version-list) for replicated datasets")
    
    # Iterate over datasets
    if not publishOnly:

#        pdb.set_trace()

        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, publishOp, filefilt, initcontext, offline, properties, keepVersion=keepVersion, newVersion=version, extraFields=extraFields, masterGateway=masterGateway, comment=message, readFiles=readFiles, nodbwrite=nodbwrite)


    if (not nodbwrite):
        result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent, service=service, perVariable=perVariable, reinitThredds=threddsReinit, restInterface=restApi, schema=schema)
    # print `result`

    if summarizeErrors:
        print 'Summary of errors:'
        for name,versionno in datasetNames:
            dset = Dataset.lookup(name, Session)
            print dset.get_name(Session), dset.get_project(Session), dset.get_model(Session), dset.get_experiment(Session), dset.get_run_name(Session)
            if dset.has_warnings(Session):
                print '=== Dataset: %s ==='%dset.name
                for line in dset.get_warnings(Session):
                    print line
    def start_harvest( self, parent ):
        from esgcet.publish import publishDatasetList
        from esgcet.model import Dataset, PUBLISH_FAILED_EVENT, ERROR_LEVEL

        dcolor1 = Pmw.Color.changebrightness(self.parent.parent, 'aliceblue', 0.8 )

        # Make sure the publisher is logged in
       # if not self.parent.parent.password_flg:
       #    self.parent.parent.menu.login_menu.evt_login( self.parent.parent )

        # Start the busy routine to indicate to the users something is happening
        self.parent.parent.busyCursor = 'watch'
        self.parent.parent.busyWidgets = [self.parent.parent.pane2.pane( 'EditPaneTop' ), self.parent.parent.pane2.pane( 'EditPaneBottom' ), self.parent.parent.pane2.pane( 'EditPaneStatus' ), self.parent.parent.pane.pane( 'ControlPane' )]
        pub_busy.busyStart( self.parent.parent )
        try:
        # Generate the list of datasets to be published
           datasetNames=[]
           GUI_line = {}
           tab_name = self.parent.parent.top_notebook.getcurselection()
           selected_page = self.parent.parent.main_frame.selected_top_page

           if (selected_page is None):
              warning("Must generate a list of datasets to scan before publishing can occur.")
              pub_busy.busyEnd( self.parent.parent )
              return

           for x in self.parent.parent.main_frame.top_page_id[selected_page]:

               if self.parent.parent.main_frame.top_page_id[selected_page][x].cget('bg') != 'salmon' and self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('bg') != 'salmon':
                   dset_name = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text')
                #######################################
                                  # ganz added this 1/18/11    
                   versionNum = self.parent.parent.main_frame.version_label[selected_page][x].cget('text') 
                   dsetTuple = (dset_name, versionNum)
                #dsetName = generateDatasetVersionId(dsetTuple)                
                  #####################################################################################
                # dsetTuple = parseDatasetVersionId(dset_name) # ganz no longer necessary
                   datasetNames.append(dsetTuple)
                   GUI_line[ dset_name ] = x
               else:
                   if self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('bg') == 'salmon':
                      self.parent.parent.main_frame.top_page_id[selected_page][x].configure(relief = 'raised', background = 'salmon', image = self.off)

        # Publish collection of datasets
           testProgress = (self.parent.parent.statusbar.show, 0, 100)
           publishThredds = (quality_control_widgets.get_CheckBox3()==1)
           publishGateway = (quality_control_widgets.get_CheckBox2()==1)
           if (publishThredds):
               print 'publishing to Thredds'
           if (publishGateway):
               print 'publishing to Gateway'  
                      
           status_dict = publishDatasetList(datasetNames, self.Session, publish=publishGateway, thredds=publishThredds, progressCallback=testProgress)

        # Show the published status
           for x in status_dict.keys():
                status = status_dict[ x ]
                dsetName, versionNo = x
                dsetVersionName = generateDatasetVersionId(x)
                guiLine = GUI_line[dsetName] # dsetVersionName]
            
                self.parent.parent.main_frame.status_label[selected_page][guiLine].configure(text=pub_controls.return_status_text( status) )
                dset = Dataset.lookup(dsetName, self.Session)
                if dset.has_warnings(self.Session):
                    warningLevel = dset.get_max_warning_level(self.Session)
                    if warningLevel>=ERROR_LEVEL:
                        buttonColor = "pink"
                        buttonText = "Error"
                    else:
                        buttonColor = "yellow"
                        buttonText = "Warning"
                    self.parent.parent.main_frame.ok_err[selected_page][guiLine].configure(
                        text = buttonText,
                        bg = buttonColor,
                        relief = 'raised',
                        command = pub_controls.Command( self.parent.parent.pub_buttonexpansion.extraction_widgets.error_extraction_button, dset ) )
                else:
                    self.parent.parent.main_frame.ok_err[selected_page][guiLine].configure(
                        text = 'Ok',
                        bg = dcolor1,
                        highlightcolor = dcolor1,
                        relief = 'sunken',
                        )
        except:
            pub_busy.busyEnd( self.parent.parent )  # catch here in order to turn off the busy cursor ganz
            raise
        finally:
           pub_busy.busyEnd( self.parent.parent )
           self.my_refresh()