Example #1
    def evt_popup_fields_window(self, parent, projectName):
        if self.evt_fields_flg: return
        self.evt_fields_flg = True

        #---------------------------------------------------------------------------------
        # Create the Fields dialog window
        #---------------------------------------------------------------------------------
        self.fields = Pmw.Dialog(parent,
                                 buttons=('OK', 'Cancel'),
                                 defaultbutton='OK',
                                 title='Set Additional Mandatory Fields',
                                 command=pub_controls.Command(
                                     self.close_fields_dialog, parent))

        self.fields.withdraw()
        self.fields.transient(parent)

        frame = Pmw.ScrolledFrame(
            self.fields.interior(),
            usehullsize=1,
            horizflex='expand',
        )

        # Add additional mandatory fields to allow the user to set default settings
        handler = getHandlerByName(projectName, None, self.Session)
        list_fields = getQueryFields(handler)
        for x in list_fields:
            if handler.isMandatory(x):
                if x.lower() != "project":
                    field_options = handler.getFieldOptions(x)
                    if field_options is not None:
                        if x in self.defaultGlobalValues.keys():
                            set_to = self.defaultGlobalValues[x]
                        else:
                            set_to = x + " (Default Global Setting)"
                        field_options.insert(0,
                                             x + " (Default Global Setting)")
                        self.dataset_fields[x] = show_field(
                            parent, frame.interior(), x.capitalize(),
                            field_options, set_to, 1, 1)

        Pmw.alignlabels(self.dataset_fields.values())

        frame.pack(side='top', expand=1, fill='both')

        #---------------------------------------------------------------------------------
        # Position dialog popup
        #---------------------------------------------------------------------------------
        import string
        parent_geom = parent.geometry()
        geom = string.split(parent_geom, '+')
        d1 = string.atoi(geom[1])
        d2 = string.atoi(geom[2])
        self.fields.geometry("500x200+%d+%d" % (d1, d2))
        self.fields.show()
Example #2
    def evt_popup_fields_window(self, parent, projectName):
        if self.evt_fields_flg:
            return
        self.evt_fields_flg = True

        # ---------------------------------------------------------------------------------
        # Create the Fields dialog window
        # ---------------------------------------------------------------------------------
        self.fields = Pmw.Dialog(
            parent,
            buttons=("OK", "Cancel"),
            defaultbutton="OK",
            title="Set Additional Mandatory Fields",
            command=pub_controls.Command(self.close_fields_dialog, parent),
        )

        self.fields.withdraw()
        self.fields.transient(parent)

        frame = Pmw.ScrolledFrame(self.fields.interior(), usehullsize=1, horizflex="expand")

        # Add additional mandatory fields to allow the user to set default settings
        handler = getHandlerByName(projectName, None, self.Session)
        list_fields = getQueryFields(handler)
        for x in list_fields:
            if handler.isMandatory(x):
                if x.lower() != "project":
                    field_options = handler.getFieldOptions(x)
                    if field_options is not None:
                        if x in self.defaultGlobalValues.keys():
                            set_to = self.defaultGlobalValues[x]
                        else:
                            set_to = x + " (Default Global Setting)"
                        field_options.insert(0, x + " (Default Global Setting)")
                        self.dataset_fields[x] = show_field(
                            parent, frame.interior(), x.capitalize(), field_options, set_to, 1, 1
                        )

        Pmw.alignlabels(self.dataset_fields.values())

        frame.pack(side="top", expand=1, fill="both")

        # ---------------------------------------------------------------------------------
        # Position dialog popup
        # ---------------------------------------------------------------------------------
        import string

        parent_geom = parent.geometry()
        geom = string.split(parent_geom, "+")
        d1 = string.atoi(geom[1])
        d2 = string.atoi(geom[2])
        self.fields.geometry("500x200+%d+%d" % (d1, d2))
        self.fields.show()
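Note: the geometry parsing above relies on the deprecated Python 2 `string.split` / `string.atoi` functions. A minimal sketch of the same dialog-positioning step using plain string methods; the helper name is illustrative, not part of the original code:

import re

def place_relative_to_parent(dialog, parent, width=500, height=200):
    # Parse a Tk geometry string such as "800x600+120+80" and reuse the
    # parent's x/y offsets, mirroring the split-on-'+' logic above
    # (which, like this regex, assumes non-negative offsets).
    match = re.match(r"(\d+)x(\d+)\+(\d+)\+(\d+)", parent.geometry())
    if match is None:
        return
    x_off, y_off = int(match.group(3)), int(match.group(4))
    dialog.geometry("%dx%d+%d+%d" % (width, height, x_off, y_off))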
Example #3
def updateDatasetFromContext(context, datasetName, Session):
    """

    Update a persistent dataset with values from context (a name/value dictionary). The context may
    contain fields, such as event fields, that are not associated with the project handler.

    context
      A property (name/value) dictionary.

    datasetName
      String dataset identifier.

    Session
      Database session factory.

    """

    dset = Dataset.lookup(datasetName, Session)
    if dset is None:
        raise ESGQueryError("Dataset not found: %s" % datasetName)
    projectName = dset.get_project(Session)
    handler = getHandlerByName(projectName, None, Session)
    basicHeaders, eventHeaders, categories, derivedHeaders = getQueryFields(
        handler, return_list=False)
    properties = context.copy()

    # Set basic and event properties
    session = Session()
    session.add(dset)
    for key, value in properties.items():
        if key in basicHeaders:
            if key != 'id':
                if key == 'name':
                    if len(handler.parseDatasetName(value, {})) == 0:
                        warning(
                            "Dataset name: %s does not match dataset_id pattern in config file."
                            % value)
                setattr(dset, key, value)
            else:
                warning("Cannot update id field")
            del properties[key]
        elif key in eventHeaders:
            event = dset.events[-1]
            setEvent(event, key, value)
            del properties[key]

    # Set attribute headers
    handler.setContext(properties)
    handler.saveContext(datasetName, Session)

    session.commit()
    session.close()
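A minimal usage sketch for updateDatasetFromContext, assuming an initialized esgcet database; the init file path, context field names, and dataset name below are placeholders:

# Hypothetical call; initdb() comes from esgcet, the values are placeholders.
config, Session = initdb(init_file="esg.ini")
context = {"experiment": "historical", "description": "rescanned output"}
updateDatasetFromContext(context, "my.project.dataset", Session)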
Example #5
def load_configuration(parent):
    import os
    import pub_controls
    from esgcet.config import getHandler, getHandlerByName, registerHandlers, CFHandler
    from sqlalchemy.orm import sessionmaker
    from esgcet.publish import multiDirectoryIterator, datasetMapIterator

    offline = parent.offline
    firstFile = parent.firstFile
    projectName = parent.projectName
    config = parent.config
    Session = parent.Session

    dmap = parent.dmap
    datasetNames = parent.datasetNames
    datasetMapfile = parent.datasetMapfile

    for datasetName in datasetNames:

        # Get a file iterator and sample file
        if datasetMapfile is not None:
            firstFile = dmap[datasetName][0][0]
            fileiter = datasetMapIterator(dmap, datasetName)
        else:
            direcTuples = parent.directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            fileiter = multiDirectoryIterator(
                [direc for direc, sampfile in direcTuples], parent.filefilt)

        # Register project handlers
        registerHandlers()

        # If the project is not specified, try to read it from the first file
        validate = True
        if projectName is not None:
            handler = getHandlerByName(projectName,
                                       firstFile,
                                       Session,
                                       validate=validate,
                                       offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=validate)

        parent.handler = handler

    # View the collection of datasets
    tab_name = "Collection %i" % parent.top_ct
    parent.ntk.new_page(parent, tab_name)
Example #6
def load_configuration( parent ):
    import os
    import pub_controls
    from esgcet.config import getHandler, getHandlerByName, registerHandlers, CFHandler
    from sqlalchemy.orm import sessionmaker
    from esgcet.publish import multiDirectoryIterator, datasetMapIterator

    offline = parent.offline
    firstFile = parent.firstFile
    projectName = parent.projectName
    config = parent.config
    Session = parent.Session

    dmap = parent.dmap
    datasetNames = parent.datasetNames
    datasetMapfile = parent.datasetMapfile

    for datasetName in datasetNames:

        # Get a file iterator and sample file
        if datasetMapfile is not None:
            firstFile = dmap[datasetName][0][0]
            fileiter = datasetMapIterator(dmap, datasetName)
        else:
            direcTuples = parent.directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            fileiter  = multiDirectoryIterator([direc for direc, sampfile in direcTuples],parent.filefilt)

        # Register project handlers
        registerHandlers()

        # If the project is not specified, try to read it from the first file
        validate = True
        if projectName is not None:
            handler = getHandlerByName(projectName,firstFile,Session,validate=validate,offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=validate)

        parent.handler = handler

    # View the collection of datasets
    tab_name= "Collection %i" % parent.top_ct
    parent.ntk.new_page( parent, tab_name )
Example #7
    def new_query_page(self, parent, tab_name=None, query_id=None):
        # Start the busy routine to indicate to the users something is happening

        self.parent.parent.busyCursor = "watch"
        self.parent.parent.busyWidgets = [
            self.parent.parent.pane2.pane("EditPaneTop"),
            self.parent.parent.pane2.pane("EditPaneBottom"),
            self.parent.parent.pane2.pane("EditPaneStatus"),
            self.parent.parent.pane.pane("ControlPane"),
        ]
        pub_busy.busyStart(self.parent.parent)

        try:
            properties = {}
            projectName = self.parent.query_fields["project"].get()  # Must have projectName
            handler = getHandlerByName(projectName, None, self.Session)
            tabcolor = Pmw.Color.changebrightness(self.parent.parent, pub_controls.query_tab_color, 0.6)

            # works up to here

            if query_id is None:
                for x in self.parent.query_fields.keys():
                    query_string = self.parent.query_fields[x].get().lstrip()
                    if (query_string == "-Any-") or (len(query_string) == 0):
                        properties[x] = (2, "%")
                    elif query_string != "-Any-":
                        properties[x] = (1, query_string)

                if properties["id"] == (2, "%"):
                    del properties["id"]  # This causes an error because you cannot modify the 'id'

                listProperties = False

                result, headers = queryDatasets(projectName, handler, self.Session, properties)
                # works up to here

                # running this causes it to fail!
                self.new_page(
                    parent,
                    tabName=None,
                    tab_color=tabcolor,
                    page_type="query",
                    query_result=result,
                    list_fields=headers,
                )

            else:
                result, headers = queryDatasets(projectName, handler, self.Session, properties)
                query_id_found = False
                for x in result:
                    if query_id == x[0][:-1]:
                        self.new_page(
                            parent,
                            tabName=None,
                            tab_color=tabcolor,
                            page_type="query",
                            query_result=[x],
                            list_fields=headers,
                        )
                        query_id_found = True
                        break
                if query_id_found is False:
                    warning("The specified dataset id '%s' was not found.", query_id)

            # fails here

            # Enable the "Data Publication" button
            self.parent.ControlButton3.configure(state="normal")

            datasetNames = []
            for x in result:
                datasetNames.append(x[1])
            dmap, offline_map, extraFields = queryDatasetMap(datasetNames, self.Session, extra_fields=True)
            # Check if offline or not, then set the iteration values for each page

            selected_page = self.parent.parent.main_frame.selected_top_page
            self.parent.parent.hold_offline[selected_page] = offline_map
            self.parent.parent.main_frame.projectName[selected_page] = projectName
            self.parent.parent.main_frame.dmap[selected_page] = dmap
            self.parent.parent.main_frame.extraFields[selected_page] = extraFields
            self.parent.parent.main_frame.datasetMapfile[selected_page] = None
            self.parent.parent.directoryMap[selected_page] = None
            self.parent.parent.main_frame.dirp_firstfile[selected_page] = None
            self.parent.parent.defaultGlobalValues[selected_page] = {}

        finally:
            # Always turn off the busy cursor, whether the query succeeded or failed (ganz)
            pub_busy.busyEnd(self.parent.parent)
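The query loop above maps each query field to a (type, value) tuple, using (2, "%") as a SQL-style wildcard for unconstrained fields and dropping the wildcard entry for 'id', which cannot be modified. A standalone sketch of that translation step; the helper name is illustrative:

def build_query_properties(query_fields):
    # Mirror the loop in new_query_page: exact selections become (1, value),
    # empty or "-Any-" selections become the (2, "%") wildcard.
    properties = {}
    for name, field in query_fields.items():
        value = field.get().lstrip()
        if value == "-Any-" or len(value) == 0:
            properties[name] = (2, "%")
        else:
            properties[name] = (1, value)
    # 'id' cannot be modified, so drop its wildcard entry before querying.
    if properties.get("id") == (2, "%"):
        del properties["id"]
    return properties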
Example #8
def esgscanWrapper(directoryList, **kw):

    if len(directoryList) == 0:
        raise ESGPublishError('No directory specified')

    output = sys.stdout
    appendMap = None
    appendPath = kw.get("appendPath", None)
    if appendPath is not None:
        if os.path.exists(appendPath):
            appendMap = readDatasetMap(appendPath)
        else:
            appendMap = {}
        output = open(appendPath, 'a')
    datasetName = kw.get("datasetName", None)
    filefilt = kw.get("fileFilt", '.*\.nc$')
    init_file = kw.get("initFile", None)
    offline = kw.get("offline", False)
    outputPath = kw.get("outputPath", None)
    if outputPath is not None:
        output = open(outputPath, 'w')
    else:
        output = sys.stdout
    projectName = kw.get("projectName", None)
    readFiles = kw.get("readFiles", False)
    service = kw.get("service", None)

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            multiIter = multiDirectoryIterator(directoryList,
                                               filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError(
                    "No project found in file %s, specify with --project." %
                    firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(directoryList,
                                                      filefilt,
                                                      datasetName=datasetName)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(
                directoryList, filefilt, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()
        for datasetId in keys:
            direcTuple = datasetMap[datasetId]
            direcTuple.sort()
            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath,
                                            filefilt=filefilt,
                                            followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet
                    extraStuff = "mod_time=%f" % float(mtime)

                    if checksumClient is not None:
                        csum = checksum(filepath, checksumClient)
                        extraStuff += " | checksum=%s | checksum_type=%s" % (
                            csum, checksumType)

                    # Print the map entry if:
                    # - The map is being created, not appended, or
                    # - The existing map does not have the dataset, or
                    # - The existing map has the dataset, but not the file.
                    if (appendMap is
                            None) or (not appendMap.has_key(datasetId)) or (
                                (filepath, "%d" % size)
                                not in appendMap[datasetId]):
                        print >> output, "%s | %s | %d | %s" % (
                            datasetId, filepath, size, extraStuff)
    else:  # offline
        if projectName is not None:
            handler = getHandlerByName(projectName,
                                       None,
                                       Session,
                                       offline=True)
        else:
            raise ESGPublishError(
                "Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s" % projectName,
                                         service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(directoryList)
        for dsetName, filepath, sizet in processNodeMatchIterator(
                offlineLister,
                commandArgs,
                handler,
                filefilt=filefilt,
                datasetName=datasetName,
                offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f" % float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or (
                (filepath, "%d" % size) not in appendMap[dsetName]):
                print >> output, "%s | %s | %d %s" % (dsetName, filepath, size,
                                                      extrastuff)

    if output is not sys.stdout:
        output.close()
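A minimal sketch of invoking esgscanWrapper to write a dataset map; the directory, project name, and output path are placeholders, and the keyword names follow the kw.get() calls above:

# Hypothetical call; paths and project name are placeholders.
esgscanWrapper(
    ["/data/myproject/output"],
    projectName="myproject",
    outputPath="myproject.map",
    fileFilt=r".*\.nc$",
)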
Example #9
def publishDatasetList(datasetNames, Session, parentId=None, handlerDictionary=None, publish=True, thredds=True, las=False, progressCallback=None,
                       service=None, perVariable=None, threddsCatalogDictionary=None, reinitThredds=None, readFromCatalog=False, restInterface=False,
                       schema=None, pid_connector=None, project_config_section=None):
    """
    Publish a list of datasets:

    - For each dataset, write a THREDDS catalog.
    - Add the new catalogs to the THREDDS catalog tree and reinitialize the THREDDS server.
    - Reinitialize the LAS server.
    - Publish each dataset to the gateway.

    Returns a dictionary: (datasetName, version) => status
    
    datasetNames
      A list of (string_dataset_name, version) tuples.

    Session
      A database Session.

    parentId
      The string (or dictionary) persistent identifier of the parent of the datasets. If None (the default),
      the parent id for each dataset is generated from ``handler.getParentId()``. If a dictionary, each
      dataset name is used as a key to lookup the respective parent id. If a string, the parent id is
      set to the string for all datasets being published. This function
      can be overridden in the project handler to implement a project-specific dataset hierarchy.

    handlerDictionary
      A dictionary mapping dataset_name => handler.

    publish
      Boolean flag: if true (the default), contact the gateway to publish this dataset.

    thredds
      Boolean flag: if true (the default), write the associated THREDDS catalog.

    las
      Boolean flag: if true, write the associated LAS catalog.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    service
      String service name. If omitted, the first online/offline service in the configuration is used.

    perVariable
      Boolean, overrides ``variable_per_file`` config option.

    threddsCatalogDictionary
      If not None, just generate catalogs in strings, not the THREDDS directories, and set
      threddsCatalogDictionary[datasetId] = string_catalog

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.
      If None, defaults to value of thredds option.

    readFromCatalog
      Boolean flag. If True, read the TDS catalog definitions from threddsCatalogDictionary. 
      threddsCatalogDictionary must also be set.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    schema
      (Optional) String name of the schema to validate against, for RESTful publication calls.

    pid_connector
      esgfpid.Connector object used to register PIDs.

    project_config_section
      Name of the project config section in esg.ini (for user-specific project configs).

    """

    session = Session()
    resultDict = {}
    if readFromCatalog and threddsCatalogDictionary is None:
            raise ESGPublishError("Must set THREDDS catalog dictionary when readFromCatalog is True.")

    # Get handlers for each dataset
    if handlerDictionary is None:
        handlers = {}
        for datasetName,versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()
            if dset is None:
                raise ESGPublishError("Dataset not found: %s"%datasetName)
            handler = getHandlerByName(dset.project, None, Session)
            handlers[datasetName] = handler
    else:
        handlers = handlerDictionary

    # reinitThredds defaults to the value of thredds option
    if reinitThredds is None:
        reinitThredds = thredds

    if thredds:
        for datasetName,versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()

            # If the dataset version is not the latest, publish as a per-time dataset without aggregation,
            # since the dataset variables only relate to the latest dataset version
            latestVersion = dset.getVersion()
            if versionno==-1:
                versionno=latestVersion
            if versionno!=latestVersion:
                if perVariable:
                    messaging.info("Generating THREDDS catalog in per-time format, since version %d is not the latest version (%d)"%(versionno,latestVersion))
                perVariable = False

            handler = handlers[datasetName]

            # If threddsCatalogDictionary is not set, create the TDS catalog in the TDS content directory ...
            if threddsCatalogDictionary is None:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno,
                                pid_connector=pid_connector)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... else if threddsCatalogDictionary is the catalog source:
            elif readFromCatalog:
                catalogString = threddsCatalogDictionary[(datasetName,versionno)]
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                messaging.info("Writing THREDDS catalog %s"%threddsOutputPath)
                threddsOutput.write(catalogString)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... otherwise write the catalog in a 'string file'
            else:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler) # Creates catalog entry
                threddsOutput = cStringIO.StringIO()
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno,
                                pid_connector=pid_connector)
                threddsCatalogDictionary[(datasetName,versionno)] = threddsOutput.getvalue()
                threddsOutput.close()

    if reinitThredds:
        updateThreddsMasterCatalog(Session)
        result = reinitializeThredds()

    if las:    
        try:
            result = reinitializeLAS()
        except Exception, e:
            messaging.error("Error on LAS reinitialization: %s, new datasets not added."%e)
Example #10
    def __init__(self, parent):
        self.parent = parent
        self.Session = parent.parent.Session
        self.select_button = {}
        self.select_labelV = {}
        self.select_label = {}

        # ----------------------------------------------------------------------------------------
        # Begin the creation of the dataset ID pulldown query selection
        # ----------------------------------------------------------------------------------------
        glFont = tkFont.Font(
            self.parent.parent,
            family=pub_controls.button_group_font_type,
            size=pub_controls.button_group_font_size,
            weight=font_weight,
        )
        bnFont = tkFont.Font(
            self.parent.parent,
            family=pub_controls.label_button_font_type,
            size=pub_controls.label_button_font_size,
            weight=font_weight,
        )
        self.group_query = Pmw.Group(self.parent.control_frame4, tag_text="Query ID", tag_font=glFont, tagindent=25)

        # ----------------------------------------------------------------------------------------
        # Create and pack the EntryFields
        # ----------------------------------------------------------------------------------------
        self.query_entry = Pmw.EntryField(
            self.group_query.interior(),
            labelpos="w",
            label_text="Dataset Id:",
            label_font=bnFont,
            entry_width=200,
            validate=None,
            command=pub_controls.Command(self.evt_show_data_set_from_id),
        )

        self.query_entry.pack(side="left", expand=1, padx=10, pady=10)
        # ----------------------------------------------------------------------------------------
        # End the creation of the dataset ID pulldown query selection
        # ----------------------------------------------------------------------------------------

        # ----------------------------------------------------------------------------------------
        # Begin the creation of the project pulldown query selection
        # ----------------------------------------------------------------------------------------
        self.button_controls = Pmw.Group(
            self.parent.control_frame4, tag_text="Query Project Information", tag_font=glFont, tagindent=25
        )

        # ----------------------------------------------------------------------------------------
        # Create an instance of the notebook
        # ----------------------------------------------------------------------------------------
        ntk = generate_notebook(parent, self.Session)
        parent.parent.ntk = ntk  # Set the first instance of the notebook

        self.bdataset_sframe = Pmw.ScrolledFrame(self.button_controls.interior())
        self.bdataset_sframe.pack(side="top")
        self.bdataset_frame = self.bdataset_sframe.interior()

        # ----------------------------------------------------------------------------------------
        # Display the Project selection
        # ----------------------------------------------------------------------------------------
        projectOption = self.parent.parent.config.get("initialize", "project_options")
        projectSpecs = splitRecord(projectOption)
        projectName = projectSpecs[0][0]
        projectList = []
        for x in projectSpecs:
            projectList.append(x[0])
        projectList.sort()

        parent.query_fields = {}
        parent.validate = {}
        parent.query_fields["project"] = show_field(
            self.parent, self.bdataset_frame, "Project", projectList, projectName, 1, 1, for_query=True
        )
        parent.validate["project"] = 1

        handler = getHandlerByName(projectName, None, self.Session)
        list_fields = getQueryFields(handler)

        validate = []
        mandatory = []
        options = {}
        for x in list_fields:
            if handler.getFieldType(x) is not None:
                validate.append(handler.getFieldType(x))
            else:
                validate.append(2)
            options[x] = handler.getFieldOptions(x)
            mandatory.append(handler.isMandatory(x))

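        # Lay the query fields out grouped by their validation type (j = 1..4),
        # skipping "project", which was added above.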
        for j in range(1, 5):
            for i in range(len(list_fields)):
                if list_fields[i] != "project":
                    if options[list_fields[i]] is None:
                        value = ""
                    else:
                        value = options[list_fields[i]]  # ganz bug fix [0]
                    if validate[i] == 1:
                        options[list_fields[i]].insert(0, "-Any-")
                        value = "-Any-"
                    if j == validate[i]:
                        parent.query_fields[list_fields[i]] = show_field(
                            self.parent,
                            self.bdataset_frame,
                            list_fields[i].capitalize(),
                            options[list_fields[i]],
                            value,
                            mandatory[i],
                            validate[i],
                            for_query=True,
                        )
                        parent.validate[list_fields[i]] = validate[i]

        Pmw.alignlabels(parent.query_fields.values())

        # ----------------------------------------------------------------------------------------
        # Create button to update extraction
        # ----------------------------------------------------------------------------------------
        w = Tkinter.Button(
            self.button_controls.interior(),
            text="Query Data Information",
            font=bnFont,
            background="lightblue",
            command=pub_controls.Command(ntk.new_query_page, parent, None),
        )
        w.pack(side="top", padx=0, pady=3)

        self.button_controls.pack(side="top", fill="x", pady=3)
Example #11
def iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, operation, filefilt, initcontext, offlineArg, properties, testProgress1=None, testProgress2=None, handlerDictionary=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None, forceAggregate=False, readFiles=False):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified
    in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``).
    All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first handler found that
      can open a sample file from the dataset.
      
    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.
      
    datasetNames
      A list of dataset names identifying the datasets to be scanned.

    Session
      An SQLAlchemy Session.
      
    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose
      basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles.
      Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary
      
      If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated
      metadata will be a minimal set including file name and size.

      If a dictionary, maps dataset_name => offline flag

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the scan phase.

    testProgress2=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number
      explicitly. If a dictionary, maps dataset_id => version. By
      default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    versionIsMap = (type(newVersion) is types.DictType)
    if versionIsMap:
        saveVersionMap = newVersion

    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop in range(ct): 
        datasetName,versionno = datasetNames[iloop]

        # If using a version map, lookup the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s"%datasetName)
            
        context = initcontext.copy()

        # Get offline flag
        if type(offlineArg) is dict:
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets
        if offline:
            forceAggregate=False

        # Get a file iterator and sample file
        if dmap is not None:
            if len(dmap[(datasetName,versionno)])==0:
                warning("No files specified for dataset %s, version %d."%(datasetName,versionno))
                continue
            firstFile = dmap[(datasetName,versionno)][0][0]
            fileiter = datasetMapIterator(dmap, datasetName, versionno, extraFields=extraFields, offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                fileiter  = multiDirectoryIterator([direc for direc, sampfile in direcTuples], filefilt)
            else:
                fileiter = fnIterator([sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and handlerDictionary.has_key(datasetName):
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name
            info("Using project name = %s"%projectName)
        if prevProject is not None and projectName!=prevProject:
            raise ESGPublishError("Multiple projects found: %s, %s. Can only publish from one project"%(prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored'%name)
            else:
                context[name] = value

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        dataset=None
        if testProgress1 is not None:
           testProgress1[1] = (100./ct)*iloop
           if not offline:
              testProgress1[2] = (100./ct)*iloop + (50./ct)
           else:
              testProgress1[2] = (100./ct)*iloop + (100./ct)
        dataset = extractFromDataset(datasetName, fileiter, Session, handler, cfHandler, aggregateDimensionName=aggregateDimension, offline=offline, operation=operation, progressCallback=testProgress1, keepVersion=keepVersion, newVersion=newVersion, extraFields=extraFields, masterGateway=masterGateway, comment=comment, useVersion=versionno, forceRescan=forceAggregate, **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.
        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceAggregate

        if testProgress2 is not None:
           testProgress2[1] = (100./ct)*iloop + 50./ct
           testProgress2[2] = (100./ct)*(iloop + 1)
        if runAggregate:
            aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension, cfHandler=cfHandler, progressCallback=testProgress2, datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)
            
        # Save the context with the dataset, so that it can be searched later
        handler.saveContext(datasetName, Session)
        datasets.append(dataset)

    return datasets
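A minimal sketch of a dataset-map driven call to iterateOverDatasets, assuming dmap and extraFields were produced by readDatasetMap; the map file name is a placeholder and the project name is inferred from the first file:

# Hypothetical call; "datasets.map" is a placeholder.
dmap, extraFields = readDatasetMap("datasets.map", parse_extra_fields=True)
datasetNames = dmap.keys()
datasets = iterateOverDatasets(
    None,                     # projectName: infer from the first file
    dmap, None, datasetNames, Session,
    "time", CREATE_OP, r".*\.nc$",
    {},                       # initcontext
    False,                    # offlineArg
    {},                       # properties
    extraFields=extraFields,
)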
Example #12
    def return_content2(self, appendOpt=False):
        from esgcet.publish import iterateOverDatasets, processIterator
        from esgcet.config import getHandlerByName
        from esgcet.model import eventName
        from esgcet.config import loadConfig

        # Initialize parameters for iterating over datasets
        initcontext = {}
        aggregateOnly = False
        # appendOpt = False
        initcontext = {}
        properties = {}
        publish = False
        publishOnly = False
        thredds = False
        testProgress1 = [self.parent.parent.statusbar.show, 0, 50]
        testProgress2 = [self.parent.parent.statusbar.show, 50, 100]
        handlerDictionary = {}

        # Get the currently selected tab and the selected datasets
        tab_name = self.parent.parent.top_notebook.getcurselection()
        selected_page = self.parent.parent.main_frame.selected_top_page
        datasetNames = []
       # datasetNames2 = []
        if (selected_page is None):
           warning("Must generate a list of datasets to scan before data extraction can occur.")
           return

        if (selected_page is not None) or (self.parent.parent.hold_offline[selected_page] == True):
           extraFields = None 
           if (self.parent.parent.hold_offline[selected_page] == False) or (isinstance(self.parent.parent.hold_offline[selected_page], types.DictType)):
              for x in self.parent.parent.main_frame.top_page_id[selected_page]:
                dsetVersionName = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text') # GANZ TODO version_label
                
                # ganz added this 1/21/11
                if self.parent.parent.main_frame.version_label[selected_page]:
                    dset_name = self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text')
                    dsetVersion = self.parent.parent.main_frame.version_label[selected_page][x].cget('text')
                else:
                    dset_name, dsetVersion = parseDatasetVersionId(dsetVersionName)

                # Retrieve all the datasets in the collection for display
                """ ganz test code
                status = pollDatasetPublicationStatus(dset_name, self.Session)
                status_text = pub_controls.return_status_text( status )
                if status_text != 'Error':
                   dsetTuple = parseDatasetVersionId(self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text'))
                   datasetNames2.append(dsetTuple)
                """
                # Retrieve only the datasets that have been selected
                if self.parent.parent.main_frame.top_page_id[selected_page][x].cget('bg') != 'salmon':
                   dsetTuple =  parseDatasetVersionId(self.parent.parent.main_frame.top_page_id2[selected_page][x].cget('text'))
                   datasetNames.append(dsetTuple)

              dmap = self.parent.parent.main_frame.dmap[selected_page]
              extraFields = self.parent.parent.main_frame.extraFields[selected_page]
              datasetMapfile = self.parent.parent.main_frame.datasetMapfile[selected_page]
              projectName = self.parent.parent.main_frame.projectName[selected_page]
              directoryMap = self.parent.parent.directoryMap[selected_page]

              if dmap is not None:
                 for x in datasetNames:
                    dsetId = x[0] 
                    datasetName = x
                    try:
                        dmapentry = dmap[datasetName]
                    except:

                        # Check if the dataset map key was changed from (dsetname,-1) to (dsetname,version).
                        # If so, replace the entry with the new key.
                        trykey = (datasetName[0], -1)
                        dmapentry = dmap[trykey]
                        del dmap[trykey]
                        dmap[datasetName] = dmapentry
                    firstFile = dmapentry[0][0]
  
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, firstFile, self.Session)
                    handler = self.parent.parent.handlerDictionary[dsetId]
                 # Copy the defaultGlobalValues into initcontext
                 initcontext = self.parent.parent.main_frame.defaultGlobalValues[selected_page]
              else:
                  # more test code
                 myholdDirectoryMap = self.parent.parent.directoryMap[selected_page] 
                 #mydatasetNames = [(item,-1) for item in myholdDirectoryMap.keys()]
                 mydatasetNames = [(item) for item in myholdDirectoryMap.keys()]
                 #end
                 for x in mydatasetNames:
                    dsetId = x[0] 
                    datasetName = x
                    # ganz this is test code
                    try:
                        dmapentry = myholdDirectoryMap[datasetName]
                    except:

                        # Check if the dataset map key was changed from (dsetname,-1) to (dsetname,version).
                        # If so, replace the entry with the new key.
                        
                        trykey = (datasetName[0], -1)
                        dmapentry = myholdDirectoryMap[trykey]
                        del myholdDirectoryMap[trykey]
                        myholdDirectoryMap[datasetName] = dmapentry
                        
                    firstFile = dmapentry[0][1]
                    #end of test code
                    
                    #firstFile = self.parent.parent.main_frame.dirp_firstfile[selected_page]
 
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, firstFile, self.Session)
                    handler = self.parent.parent.handlerDictionary[dsetId]
           else:      # working off-line
              projectName = self.parent.parent.main_frame.projectName[selected_page]
              if self.parent.parent.offline_file_directory[selected_page] == "directory":
                 if self.parent.parent.config is None:
                    extraction_controls.call_sessionmaker( self.parent.parent )
                 datasetPaths = []
                 dmap = {self.parent.parent.offline_datasetName : datasetPaths}
                 listerSection = getOfflineLister(self.parent.parent.config, "project:%s"%projectName, None)
                 offlineLister = self.parent.parent.config.get(listerSection, 'offline_lister_executable')
                 lastargs = self.parent.parent.offline_directories
                 commandArgs = "--config-section %s "%listerSection
                 commandArgs += " ".join(lastargs)
                 for filepath, size in processIterator(offlineLister, commandArgs, filefilt=self.parent.parent.filefilt):
                   datasetPaths.append((filepath, str(size)))
                 datasetNames = self.parent.parent.datasetNames
                 directoryMap = None

                 # get the handler
                 for x in datasetNames:
                    dsetId = x[0] 
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, None, self.Session, offline=True)

              elif self.parent.parent.offline_file_directory[selected_page] == "file":
                 dmap = self.parent.parent.main_frame.dmap[selected_page]
                 extraFields = self.parent.parent.main_frame.extraFields[selected_page]
                 datasetMapfile = self.parent.parent.main_frame.datasetMapfile[selected_page]
                 projectName = self.parent.parent.main_frame.projectName[selected_page]
                 directoryMap = None
                 if datasetMapfile is not None:
                     dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
                     datasetNames = dmap.keys()

                 # get the handlers
                 for x in datasetNames:
                    dsetId = x[0] 
                    self.parent.parent.handlerDictionary[dsetId] = getHandlerByName(projectName, None, self.Session, offline=True)


           # Iterate over datasets
           if appendOpt:
               operation = UPDATE_OP
           else:
               operation = CREATE_OP
        
           datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, self.Session, self.parent.parent.aggregateDimension, operation, self.parent.parent.filefilt, initcontext, self.parent.parent.hold_offline[selected_page], properties, comment=self.comments, testProgress1=testProgress1, testProgress2=testProgress2 , handlerDictionary=self.parent.parent.handlerDictionary, extraFields=extraFields, readFiles=True)

           # If working on-line then replace the scanned list of datasets with 
           # the complete list of datasets
           #test
           """
           print 'datasetNames:'
           for t1 in datasetNames:
               print t1
           print 'datasetNames2:'    
           for t2 in datasetNames2:
               print t2
           """   
           if not self.parent.parent.hold_offline[selected_page]:
              datasets = []
              versionObjs = []
              # ganz finally, tested datasetNames2 here
              for dsetName, version in datasetNames:
                  result = Dataset.lookup(dsetName, self.Session, version=version)
                  if result is not None:
                      entry, versionObj = result
                      datasets.append(entry)
                      versionObjs.append(versionObj)

           # Get the summary of errors after doing a data extraction
           dset_error = []
           for dset in datasets:
               status = dset.get_publication_status(self.Session)
               status_name = eventName[status]
               if dset.has_warnings(self.Session):
                   dset_error.append(dset.get_name(self.Session))

           try:
              list_fields = getQueryFields( handler )
           except:
              handler = getHandlerByName(projectName, None, self.Session)
              list_fields = getQueryFields( handler )

           # Display the datasets in the "Collection" page
#           if self.parent.parent.hold_offline[selected_page] == True:
#              tab_name = "Collection_Offline"
#              from_tab = "Collection"
#              pub_editorviewer = self.parent.parent.create_publisher_editor_viewer( self.parent.parent, tab_name, dataset, from_tab, self.Session)

           # Show the extracted datasets
           self.set_column_labels( len(datasets), list_fields )
           self.show_extracted_info(datasets, dset_error, list_fields, versionObjs)

        # Enable the "Data Publication" button
        self.parent.ControlButton3.configure( state = 'normal' )
Example #13
def esgscanWrapper(directoryList, **kw):

    if len(directoryList) == 0:
        raise ESGPublishError("No directory specified")

    output = sys.stdout
    appendMap = None
    appendPath = kw.get("appendPath", None)
    if appendPath is not None:
        if os.path.exists(appendPath):
            appendMap = readDatasetMap(appendPath)
        else:
            appendMap = {}
        output = open(appendPath, "a")
    datasetName = kw.get("datasetName", None)
    filefilt = kw.get("fileFilt", ".*\.nc$")
    init_file = kw.get("initFile", None)
    offline = kw.get("offline", False)
    outputPath = kw.get("outputPath", None)
    if outputPath is not None:
        output = open(outputPath, "w")
    else:
        output = sys.stdout
    projectName = kw.get("projectName", None)
    readFiles = kw.get("readFiles", False)
    service = kw.get("service", None)

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get("DEFAULT", "checksum", default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(directoryList, filefilt, datasetName=datasetName)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(directoryList, filefilt, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()
        for datasetId in keys:
            direcTuple = datasetMap[datasetId]
            direcTuple.sort()
            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet
                    extraStuff = "mod_time=%f" % float(mtime)

                    if checksumClient is not None:
                        csum = checksum(filepath, checksumClient)
                        extraStuff += " | checksum=%s | checksum_type=%s" % (csum, checksumType)

                    # Print the map entry if:
                    # - The map is being created, not appended, or
                    # - The existing map does not have the dataset, or
                    # - The existing map has the dataset, but not the file.
                    if (
                        (appendMap is None)
                        or (not appendMap.has_key(datasetId))
                        or ((filepath, "%d" % size) not in appendMap[datasetId])
                    ):
                        print >> output, "%s | %s | %d | %s" % (datasetId, filepath, size, extraStuff)
    else:  # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s" % projectName, service)
        offlineLister = config.get(listerSection, "offline_lister_executable")
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(directoryList)
        for dsetName, filepath, sizet in processNodeMatchIterator(
            offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True
        ):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f" % float(mtime)
            if (
                (appendMap is None)
                or (not appendMap.has_key(dsetName))
                or ((filepath, "%d" % size) not in appendMap[dsetName])
            ):
                print >> output, "%s | %s | %d %s" % (dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
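
A minimal sketch of how the esgscanWrapper above might be invoked; the scan directory, project name, and output mapfile are hypothetical placeholders, and the keyword names follow the kw.get() calls in the function.

esgscanWrapper(["/data/cmip5/output1"],     # hypothetical scan directory
               projectName="cmip5",         # hypothetical project name
               fileFilt=".*\.nc$",          # scan only netCDF files
               outputPath="cmip5.map",      # hypothetical mapfile to write
               readFiles=False)             # build the map from directories rather than individual files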
Example #14
def main(argv):

    try:
        args, lastargs = getopt.getopt(argv, "a:cdehi:m:p:ru", ['append', 'create', 'dataset=', 'delete-files', 'echo-sql', 'experiment=', 'filter=', 'help', 'keep-version', 'log=', 'map=', 'message=', 'model=', 'offline',  'parent=', 'per-time', 'per-variable', 'project=', 'property=', 'publish', 'new-version=', 'no-thredds-reinit', 'noscan', 'read-directories', 'read-files', 'rename-files', 'replace', 'replica=', 'rest-api', 'service=', 'set-replica', 'summarize-errors', 'thredds', 'thredds-reinit', 'update', 'use-existing=', 'use-list=', 'validate=', 'version-list=', 'nodbwrite'])
    except getopt.error:
        print sys.exc_value
        return

    aggregateDimension = "time"
    datasetMapfile = None
    datasetName = None
    echoSql = False
    filefilt = '.*\.nc$'
    init_file = None
    initcontext = {}
    keepVersion = False
    las = False
    log_filename = None
    masterGateway = None
    message = None
    offline = False
    parent = None
    perVariable = None
    projectName = None
    properties = {}
    publish = False
    publishOnly = False
    publishOp = CREATE_OP
    readFiles = False
    rescan = False
    rescanDatasetName = []
    restApi = None
    schema = None
    service = None
    summarizeErrors = False
    testProgress1 = testProgress2 = None
    thredds = False
    threddsReinit = None
    version = None
    versionList = None
    nodbwrite = False

    for flag, arg in args:
        if flag=='-a':
            aggregateDimension = arg
        elif flag=='--append':
            publishOp = UPDATE_OP
        elif flag in ['-c', '--create']:
            publishOp = CREATE_OP
        elif flag=='--dataset':
            datasetName = arg
        elif flag in ['-d', '--delete-files']:
            publishOp = DELETE_OP
        elif flag=='--echo-sql':
            echoSql = True
        elif flag=='--experiment':
            initcontext['experiment'] = arg
        elif flag=='--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='-i':
            init_file = arg
        elif flag=='--keep-version':
            keepVersion = True
        elif flag=='--log':
            log_filename = arg
        elif flag=='--map':
            datasetMapfile = arg
        elif flag in ['-m', '--message']:
            message = arg
        elif flag=='--model':
            initcontext['model'] = arg
        elif flag=='--nodbwrite':
            nodbwrite = True
        elif flag=='--new-version':
            try:
                version = string.atoi(arg)
                if version <=0:
                    raise ValueError
            except ValueError:
                raise ESGPublishError("Version number must be a positive integer: %s"%arg)
        elif flag=='--no-thredds-reinit':
            threddsReinit = False
        elif flag=='--noscan':
            publishOnly = True
        elif flag=='--offline':
            offline = True
        elif flag=='--parent':
            parent = arg
        elif flag=='--per-time':
            perVariable = False
        elif flag=='--per-variable':
            perVariable = True
        elif flag=='--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag=='--publish':
            publish = True
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag=='--read-files':
            readFiles = True
        elif flag=='--rename-files':
            publishOp = RENAME_OP
        elif flag in ['-r', '--replace']:
            publishOp = REPLACE_OP
        elif flag=='--replica':
            masterGateway = arg
            warning("The --replica option is deprecated. Use --set-replica instead")
        elif flag=='--rest-api':
            restApi = True
        elif flag=='--service':
            service = arg
        elif flag=='--set-replica':
            masterGateway = 'DEFAULT'
        elif flag=='--summarize-errors':
            summarizeErrors = True
        elif flag=='--thredds':
            thredds = True
        elif flag=='--thredds-reinit':
            threddsReinit = True
        elif flag in ['-u', '--update']:
            publishOp = UPDATE_OP
        elif flag=='--use-existing':
            rescan = True
            rescanDatasetName.append(arg)
        elif flag=='--use-list':
            rescan = True
            if arg=='-':
                namelist=sys.stdin
            else:
                namelist = open(arg)
            for line in namelist.readlines():
                line = line.strip()
                if line and line[0]!='#':
                    rescanDatasetName.append(line)
        elif flag=='--validate':
            schema = arg
            restApi = True
        elif flag=='--version-list':
            versionList = arg

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    if version is not None and versionList is not None:
        raise ESGPublishError("Cannot specify both --new-version and --version-list")

    if versionList is not None:
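        # Each line of the version-list file is expected to have the form: dataset_id | version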
        version = {}
        f = open(versionList)
        lines = f.readlines()
        f.close()
        for line in lines:
            line = line.strip()
            dsid, vers = line.split('|')
            dsid = dsid.strip()
            vers = int(vers.strip())
            version[dsid] = vers

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if len(lastargs)==0:
                print "No directories specified."
                return

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=props, datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=props, datasetName=datasetName)
            datasetNames = [(item,-1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s"%projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s "%listerSection
            commandArgs += " ".join(lastargs)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
                size, mtime = sizet
                if dmap.has_key((dsetName,-1)):
                    dmap[(dsetName,-1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName,-1)] = [(filepath, str(size))]

            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames)==0:
        warning("No datasets found.")
        min_version = -1
    else:
        min_version = sorted(datasetNames, key=lambda x: x[1])[0][1]

    # Must specify version for replications
    if min_version == -1 and masterGateway is not None and version is None and versionList is None:
        raise ESGPublishError("Must specify version with --new-version (or --version-list) for replicated datasets")
    
    # Iterate over datasets
    if not publishOnly:

#        pdb.set_trace()

        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, publishOp, filefilt, initcontext, offline, properties, keepVersion=keepVersion, newVersion=version, extraFields=extraFields, masterGateway=masterGateway, comment=message, readFiles=readFiles, nodbwrite=nodbwrite)


    if (not nodbwrite):
        result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent, service=service, perVariable=perVariable, reinitThredds=threddsReinit, restInterface=restApi, schema=schema)
    # print `result`

    if summarizeErrors:
        print 'Summary of errors:'
        for name,versionno in datasetNames:
            dset = Dataset.lookup(name, Session)
            print dset.get_name(Session), dset.get_project(Session), dset.get_model(Session), dset.get_experiment(Session), dset.get_run_name(Session)
            if dset.has_warnings(Session):
                print '=== Dataset: %s ==='%dset.name
                for line in dset.get_warnings(Session):
                    print line
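
A hedged sketch of driving the publication main() above; the mapfile and project name are hypothetical placeholders, and in the real script this would normally be called as main(sys.argv[1:]).

main(["--map", "cmip5.map",       # hypothetical dataset map produced by a scan step
      "--project", "cmip5",       # hypothetical project name
      "--thredds",                # write THREDDS catalogs
      "--publish"])               # publish to the gateway
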
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:ehi:o:p:", ['dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',\
            'filter=', 'help', 'max-threads=', 'offline', 'output=', 'project=', 'property=', 'read-directories', 'read-files',\
            'service=', 'use-version-dir', 'version='])
    except getopt.error:
        print sys.exc_value
        return

    if len(lastargs)==0:
        print 'No directory specified'
        return

    appendMap = None
    datasetName = None
    datasetTechNotesURL = None
    datasetTechNotesTitle = None
    filefilt = '.*\.nc$'
    init_file = None
    offline = False
    output = sys.stdout
    projectName = None
    properties = {}
    readFiles = False
    service = None
    max_threads = 4
    version_dir = False
    use_version = None
    
    for flag, arg in args:
        if flag=='-a':
            if os.path.exists(arg):
                appendMap = readDatasetMap(arg)
            else:
                appendMap = {}
            output = open(arg, 'a')
        elif flag=='--dataset':
            datasetName = arg
        elif flag=='--dataset-tech-notes':
            datasetTechNotesURL = arg
        elif flag=='--dataset-tech-notes-title':
            datasetTechNotesTitle = arg
        elif flag=='--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='-i':
            init_file = arg
        elif flag=='--max-threads':
            max_threads = int(arg)
        elif flag in ['-o', '--output']:
            output = open(arg, 'w')
        elif flag=='--offline':
            offline = True
        elif flag=='--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag=='--read-files':
            readFiles = True
        elif flag=='--service':
            service = arg
        elif flag=='--use-version-dir':
            version_dir = True
        elif flag=='--version':
            version_dir = True
            if not re.match('^[0-9]+$', arg[0]): # e.g. 'vYYYYMMDD'
                use_version = arg[1:]
            else:
                use_version = arg
    
    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            warning("No project name specified!")
            multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties, datasetName=datasetName, use_version=version_dir)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()

        datasetMapVersion = {}
        if version_dir:
            # check for version directory
            for dataset_id in keys:
                ds_id_version = dataset_id.split('#')
                if len(ds_id_version) == 2:
                    ds_id, ds_version = ds_id_version
                    if not re.match('^[0-9]+$', ds_version):
                        warning("Version must be an integer. Skipping version %s of dataset %s."%(ds_version, ds_id))
                        continue
                    if use_version and ds_version != use_version:
                        continue
                    if ds_id in datasetMapVersion:
                        datasetMapVersion[ds_id].append(ds_version)
                    else:
                        datasetMapVersion[ds_id] = [ds_version]
                else:
                    error("No version directory found. Skipping dataset %s."%dataset_id)

            if datasetMapVersion:
                keys = datasetMapVersion.keys()
                keys.sort()
            else:
                if use_version:
                    error("Version %s not found. No datasets to process."%use_version)
                else:
                    error("No datasets to process.")
                return

        for dataset_id in keys:
            skip_dataset = False
            dataset_id_version = dataset_id
            path_version = None
            # if multiple versions of the same dataset available use latest version
            if version_dir:
                path_version = sorted(datasetMapVersion[dataset_id])[-1]
                if len(datasetMapVersion[dataset_id]) > 1:
                    info("Multiple versions for %s (%s), processing latest (%s)"%(dataset_id, datasetMapVersion[dataset_id], path_version))
                dataset_id_version = '%s#%s'%(dataset_id, path_version)

            direcTuple = datasetMap[dataset_id_version]
            direcTuple.sort()
            mapfile_md = {}

            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet

                    mapfile_md[filepath] = [size]
                    mapfile_md[filepath].append("mod_time=%f"%float(mtime))

                    extraStuff = "mod_time=%f"%float(mtime)

                    if datasetTechNotesURL is not None:
                        mapfile_md[filepath].append('dataset_tech_notes=%s'%datasetTechNotesURL)
                        if datasetTechNotesTitle is not None:
                            mapfile_md[filepath].append('dataset_tech_notes_title=%s'%datasetTechNotesTitle)

            if checksumClient is not None:
                pool = ThreadPool(processes=max_threads)
                args = [(filepath, checksumClient) for filepath in mapfile_md]
                checksum_list = pool.map(calc_checksum_wrapper, args)

                for entry in checksum_list:
                    if not entry[1]:
                        error('Calculation of checksum for file %s failed. Skipping dataset %s ...'%(entry[0], dataset_id))
                        skip_dataset = True     # skip entire dataset if we have one file without checksum
                        break
                    mapfile_md[entry[0]].append('checksum=%s'%entry[1])
                    mapfile_md[entry[0]].append('checksum_type=%s'%checksumType)

            for fpath in mapfile_md:
                mapfile_line = '%s | %s | %d'%(dataset_id_version, fpath, mapfile_md[fpath][0])

                for md in mapfile_md[fpath][1:]:
                    mapfile_line+=' | %s'%md

                # Print the map entry if:
                # - Checksum exists for all files of dataset (in case checksumming is enabled)
                # - The map is being created, not appended, or
                # - The existing map does not have the dataset, or
                # - The existing map has the dataset, but not the file.
                if path_version:
                    ds_id = (dataset_id, int(path_version))
                else:
                    ds_id = (dataset_id, -1)
                if not skip_dataset and (
                    (appendMap is None)
                    or (not appendMap.has_key(ds_id))
                    or ((fpath, "%d" % mapfile_md[fpath][0]) not in appendMap[ds_id])
                ):
                    print >>output, mapfile_line

    else:                               # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s"%projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s "%listerSection
        commandArgs += " ".join(lastargs)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f"%float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or ((filepath, "%d"%size) not in appendMap[dsetName]):
                print >>output, "%s | %s | %d %s"%(dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
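
A hedged sketch of driving the mapfile-generating main() above; the project name, output mapfile, thread count, and data directory are hypothetical placeholders, and the real script would normally call main(sys.argv[1:]).

main(["--project", "cmip5",       # hypothetical project name
      "--max-threads", "8",       # checksum up to 8 files in parallel
      "-o", "cmip5.map",          # hypothetical output mapfile
      "/data/cmip5/output1"])     # hypothetical scan directory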
Example #16
def iterateOverDatasets(projectName,
                        dmap,
                        directoryMap,
                        datasetNames,
                        Session,
                        aggregateDimension,
                        operation,
                        filefilt,
                        initcontext,
                        offlineArg,
                        properties,
                        testProgress1=None,
                        testProgress2=None,
                        handlerDictionary=None,
                        perVariable=None,
                        keepVersion=False,
                        newVersion=None,
                        extraFields=None,
                        masterGateway=None,
                        comment=None,
                        forceAggregate=False,
                        readFiles=False,
                        nodbwrite=False,
                        pid_connector=None):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified
    in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``).
    All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first handler found that
      can open a sample file from the dataset.
      
    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.
      
    datasetNames
      A list of dataset names identifying the datasets to be scanned.

    Session
      An SQLAlchemy Session.
      
    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose
      basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles.
      Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary
      
      If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated
      metadata will be a minimal set including file name and size.

      If a dictionary, maps dataset_name => offline flag

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the scan phase.

    testProgress2=None
      Tuple (callback, initial, final) where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. This callback applies only to
      the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    perVariable=None
      Boolean, overrides ``variable_per_file`` config option.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number
      explicitly. If a dictionary, maps dataset_id => version. By
      default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    pid_connector
      esgfpid.Connector object used to register PIDs.

    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    versionIsMap = (type(newVersion) is types.DictType)
    if versionIsMap:
        saveVersionMap = newVersion

    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop in range(ct):
        datasetName, versionno = datasetNames[iloop]

        # Must specify version for replications
        if masterGateway:
            if not newVersion and versionno < 0:
                raise ESGPublishError(
                    "Must specify a version for replicated datasets, e.g. in the mapfile or with --new-version/--version-list."
                )

        # If using a version map, lookup the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s" %
                                      datasetName)

        context = initcontext.copy()

        # Get offline flag
        if type(offlineArg) is dict:
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets
        if offline:
            forceAggregate = False

        # Get a file iterator and sample file
        if dmap is not None:
            if len(dmap[(datasetName, versionno)]) == 0:
                warning("No files specified for dataset %s, version %d." %
                        (datasetName, versionno))
                continue
            firstFile = dmap[(datasetName, versionno)][0][0]
            fileiter = datasetMapIterator(dmap,
                                          datasetName,
                                          versionno,
                                          extraFields=extraFields,
                                          offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                fileiter = multiDirectoryIterator(
                    [direc for direc, sampfile in direcTuples], filefilt)
            else:
                fileiter = fnIterator(
                    [sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and handlerDictionary.has_key(
                datasetName):
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName,
                                       firstFile,
                                       Session,
                                       validate=True,
                                       offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError(
                    "No project found in file %s, specify with --project." %
                    firstFile)
            projectName = handler.name
            info("Using project name = %s" % projectName)
        if prevProject is not None and projectName != prevProject:
            raise ESGPublishError(
                "Multiple projects found: %s, %s. Can only publish from one project"
                % (prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored' % name)
            else:
                context[name] = value

        # add dataset_version to context to allow version to be a mandatory field
        if versionno > -1:
            context['dataset_version'] = versionno
        elif newVersion is not None:
            context['dataset_version'] = newVersion

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        dataset = None
        if testProgress1 is not None:
            testProgress1[1] = (100. / ct) * iloop
            if not offline:
                testProgress1[2] = (100. / ct) * iloop + (50. / ct)
            else:
                testProgress1[2] = (100. / ct) * iloop + (100. / ct)

        dataset = extractFromDataset(datasetName,
                                     fileiter,
                                     Session,
                                     handler,
                                     cfHandler,
                                     aggregateDimensionName=aggregateDimension,
                                     offline=offline,
                                     operation=operation,
                                     progressCallback=testProgress1,
                                     perVariable=perVariable,
                                     keepVersion=keepVersion,
                                     newVersion=newVersion,
                                     extraFields=extraFields,
                                     masterGateway=masterGateway,
                                     comment=comment,
                                     useVersion=versionno,
                                     forceRescan=forceAggregate,
                                     nodbwrite=nodbwrite,
                                     pid_connector=pid_connector,
                                     **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.

        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceAggregate

        if testProgress2 is not None:
            testProgress2[1] = (100. / ct) * iloop + 50. / ct
            testProgress2[2] = (100. / ct) * (iloop + 1)
        if runAggregate and (not nodbwrite):
            aggregateVariables(datasetName,
                               Session,
                               aggregateDimensionName=aggregateDimension,
                               cfHandler=cfHandler,
                               progressCallback=testProgress2,
                               datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)

        # Save the context with the dataset, so that it can be searched later
        if (not nodbwrite):
            handler.saveContext(datasetName, Session)
        datasets.append(dataset)

    return datasets
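
A minimal sketch of a scan-only call to iterateOverDatasets, based on the signature and docstring above; the project name and dataset id are hypothetical, and directoryMap and Session are assumed to come from a setup like the esgpublish examples earlier.

datasetNames = [("cmip5.output1.MYCENTRE.MYMODEL.historical", -1)]   # hypothetical (name, version)
datasets = iterateOverDatasets(
    "cmip5",              # projectName (hypothetical)
    None,                 # dmap: not using a dataset map here
    directoryMap,         # from handler.generateDirectoryMap(...), as in the examples above
    datasetNames,
    Session,              # SQLAlchemy sessionmaker, as set up in the other examples
    "time",               # aggregateDimension
    CREATE_OP,            # publication operation (esgcet.publish.CREATE_OP)
    ".*\.nc$",            # filefilt
    {},                   # initcontext
    False,                # offlineArg: online datasets
    {})                   # properties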
Example #17
def esgpublishWrapper(**kw):

    from esgcet.query import queryDatasetMap

    aggregateDimension = kw.get("aggregateDimension", "time")
    datasetMapfile = kw.get("datasetMapfile", None)
    datasetName = kw.get("datasetName", None)
    directoryList = kw.get("directoryList", None)
    echoSql = kw.get("echoSql", False)
    filefilt = kw.get("filefilt", ".*\.nc$")
    init_file = kw.get("init_file", None)
    initcontext = kw.get("initcontext", {})
    keepVersion = kw.get("keepVersion", False)
    las = kw.get("las", False)
    log_filename = kw.get("log_filename", None)
    masterGateway = kw.get("masterGateway", None)
    message = kw.get("message", None)
    offline = kw.get("offline", False)
    parent = kw.get("parent", None)
    perVariable = kw.get("perVariable", None)
    projectName = kw.get("projectName", None)
    properties = kw.get("properties", {})
    publish = kw.get("publish", False)
    publishOnly = kw.get("publishOnly", False)
    publishOp = kw.get("publishOp", CREATE_OP)
    readFiles = kw.get("readFiles", False)
    readFromCatalog = kw.get("readFromCatalog", False)
    reinitThredds = kw.get("reinitThredds", None)
    rescan = kw.get("rescan", False)
    rescanDatasetName = kw.get("rescanDatasetName", [])
    resultThreddsDictionary = None
    service = kw.get("service", None)
    summarizeErrors = kw.get("summarizeErrors", False)
    testProgress1 = kw.get("testProgress1", None)
    testProgress2 = kw.get("testProgress2", None)
    thredds = kw.get("thredds", False)
    threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None)
    version = kw.get("version", None)

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    # Must specify version for replications
    if masterGateway is not None and version is None:
        raise ESGPublishError("Must specify version with --new-version for replicated datasets")

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file, echoSql=echoSql, log_filename=log_filename)

    # Register project handlers
    registerHandlers()

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(
                    directoryList, filefilt, initContext=props, datasetName=datasetName
                )
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(
                    directoryList, filefilt, initContext=props, datasetName=datasetName
                )

            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s" % projectName, service)
            offlineLister = config.get(listerSection, "offline_lister_executable")
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(directoryList)
            for dsetName, filepath, sizet in processNodeMatchIterator(
                offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True
            ):
                size, mtime = sizet
                if dmap.has_key((dsetName, -1)):
                    dmap[(dsetName, -1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName, -1)] = [(filepath, str(size))]

            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")

    # Iterate over datasets
    if not publishOnly:
        datasets = iterateOverDatasets(
            projectName,
            dmap,
            directoryMap,
            datasetNames,
            Session,
            aggregateDimension,
            publishOp,
            filefilt,
            initcontext,
            offline,
            properties,
            keepVersion=keepVersion,
            newVersion=version,
            extraFields=extraFields,
            masterGateway=masterGateway,
            comment=message,
            readFiles=readFiles,
        )

    result = publishDatasetList(
        datasetNames,
        Session,
        publish=publish,
        thredds=thredds,
        las=las,
        parentId=parent,
        service=service,
        perVariable=perVariable,
        threddsCatalogDictionary=threddsCatalogDictionary,
        reinitThredds=reinitThredds,
        readFromCatalog=readFromCatalog,
    )

    return result
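
A hedged sketch of calling the esgpublishWrapper above with a previously generated dataset map; the mapfile and project name are hypothetical placeholders.

result = esgpublishWrapper(datasetMapfile="cmip5.map",   # hypothetical mapfile from a scan step
                           projectName="cmip5",          # hypothetical project name
                           thredds=True,                 # write THREDDS catalogs
                           publish=False)                # scan and catalog only, skip the gateway step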
Example #18
def publishDatasetList(datasetNames, Session, parentId=None, handlerDictionary=None, publish=True, thredds=True, las=False, progressCallback=None, service=None, perVariable=None, threddsCatalogDictionary=None, reinitThredds=None, readFromCatalog=False, restInterface=False, schema=None):
    """
    Publish a list of datasets:

    - For each dataset, write a THREDDS catalog.
    - Add the new catalogs to the THREDDS catalog tree and reinitialize the THREDDS server.
    - Reinitialize the LAS server.
    - Publish each dataset to the gateway.

    Returns a dictionary: (datasetName, version) => status
    
    datasetNames
      A list of (string_dataset_name, version) tuples.

    Session
      A database Session.

    parentId
      The string (or dictionary) persistent identifier of the parent of the datasets. If None (the default),
      the parent id for each dataset is generated from ``handler.getParentId()``. If a dictionary, each
      dataset name is used as a key to lookup the respective parent id. If a string, the parent id is
      set to the string for all datasets being published. This function
      can be overridden in the project handler to implement a project-specific dataset hierarchy.

    handlerDictionary
      A dictionary mapping dataset_name => handler.

    publish
      Boolean flag: if true (the default), contact the gateway to publish this dataset.

    thredds
      Boolean flag: if true (the default), write the associated THREDDS catalog.

    las
      Boolean flag: if true, reinitialize the LAS server. The default is false.

    progressCallback
      Tuple (callback, initial, final) where ``callback`` is a function of the form ``callback(progress)``, ``initial`` is the initial value reported, ``final`` is the final value reported.

    service
      String service name. If omitted, the first online/offline service in the configuration is used.

    perVariable
      Boolean, overrides ``variable_per_file`` config option.

    threddsCatalogDictionary
      If not None, just generate catalogs in strings, not the THREDDS directories, and set
      threddsCatalogDictionary[datasetId] = string_catalog

    reinitThredds
      Boolean flag. If True, create the TDS master catalog and reinitialize the TDS server.
      If None, defaults to value of thredds option.

    readFromCatalog
      Boolean flag. If True, read the TDS catalog definitions from threddsCatalogDictionary. 
      threddsCatalogDictionary must also be set.

    restInterface
      Boolean flag. If True, publish datasets with the RESTful publication services.

    schema
      (Optional) String name of the schema to validate against, for RESTful publication calls.

    """

    session = Session()
    resultDict = {}
    if readFromCatalog and threddsCatalogDictionary is None:
            raise ESGPublishError("Must set THREDDS catalog dictionary when readFromCatalog is True.")

    # Get handlers for each dataset
    if handlerDictionary is None:
        handlers = {}
        for datasetName,versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()
            if dset is None:
                raise ESGPublishError("Dataset not found: %s"%datasetName)
            handler = getHandlerByName(dset.project, None, Session)
            handlers[datasetName] = handler
    else:
        handlers = handlerDictionary

    # reinitThredds defaults to the value of thredds option
    if reinitThredds is None:
        reinitThredds = thredds

    if thredds:
        for datasetName,versionno in datasetNames:
            dset = session.query(Dataset).filter_by(name=datasetName).first()

            # If the dataset version is not the latest, publish as a per-time dataset without aggregation,
            # since the dataset variables only relate to the latest dataset version
            latestVersion = dset.getVersion()
            if versionno==-1:
                versionno=latestVersion
            if versionno!=latestVersion:
                if perVariable:
                    messaging.info("Generating THREDDS catalog in per-time format, since version %d is not the latest version (%d)"%(versionno,latestVersion))
                perVariable = False

            handler = handlers[datasetName]

            # If threddsCatalogDictionary is not set, create the TDS catalog in the TDS content directory ...
            if threddsCatalogDictionary is None:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... else if threddsCatalogDictionary is the catalog source:
            elif readFromCatalog:
                catalogString = threddsCatalogDictionary[(datasetName,versionno)]
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler)
                threddsOutput = open(threddsOutputPath, "w")
                messaging.info("Writing THREDDS catalog %s"%threddsOutputPath)
                threddsOutput.write(catalogString)
                threddsOutput.close()
                try:
                    os.chmod(threddsOutputPath, 0664)
                except:
                    pass

            # ... otherwise write the catalog in a 'string file'
            else:
                threddsOutputPath = generateThreddsOutputPath(datasetName, versionno, Session, handler) # Creates catalog entry
                threddsOutput = cStringIO.StringIO()
                generateThredds(datasetName, Session, threddsOutput, handler, service=service, perVariable=perVariable, versionNumber=versionno)
                threddsCatalogDictionary[(datasetName,versionno)] = threddsOutput.getvalue()
                threddsOutput.close()

    if reinitThredds:
        updateThreddsMasterCatalog(Session)
        result = reinitializeThredds()

    if las:    
        try:
            result = reinitializeLAS()
        except Exception, e:
            messaging.error("Error on LAS reinitialization: %s, new datasets not added."%e)
Example #19
def esgpublishWrapper(**kw):

    from esgcet.query import queryDatasetMap

    aggregateDimension = kw.get("aggregateDimension", "time")
    datasetMapfile = kw.get("datasetMapfile", None)
    datasetName = kw.get("datasetName", None)
    directoryList = kw.get("directoryList", None)
    echoSql = kw.get("echoSql", False)
    filefilt = kw.get("filefilt", '.*\.nc$')
    init_file = kw.get("init_file", None)
    initcontext = kw.get("initcontext", {})
    keepVersion = kw.get("keepVersion", False)
    las = kw.get("las", False)
    log_filename = kw.get("log_filename", None)
    masterGateway = kw.get("masterGateway", None)
    message = kw.get("message", None)
    offline = kw.get("offline", False)
    parent = kw.get("parent", None)
    perVariable = kw.get("perVariable", None)
    projectName = kw.get("projectName", None)
    properties = kw.get("properties", {})
    publish = kw.get("publish", False)
    publishOnly = kw.get("publishOnly", False)
    publishOp = kw.get("publishOp", CREATE_OP)
    readFiles = kw.get("readFiles", False)
    readFromCatalog = kw.get("readFromCatalog", False)
    reinitThredds = kw.get("reinitThredds", None)
    rescan = kw.get("rescan", False)
    rescanDatasetName = kw.get("rescanDatasetName", [])
    resultThreddsDictionary = None
    service = kw.get("service", None)
    summarizeErrors = kw.get("summarizeErrors", False)
    testProgress1 = kw.get("testProgress1", None)
    testProgress2 = kw.get("testProgress2", None)
    thredds = kw.get("thredds", False)
    threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None)
    version = kw.get("version", None)

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError(
            "Must specify project with --project for offline datasets")

    # Must specify version for replications
    if masterGateway is not None and version is None:
        raise ESGPublishError(
            "Must specify version with --new-version for replicated datasets")

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file,
                             echoSql=echoSql,
                             log_filename=log_filename)

    # Register project handlers
    registerHandlers()

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile,
                                           parse_extra_fields=True)
        datasetNames = dmap.keys()

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(directoryList,
                                                   filefilt=filefilt)
                firstFile, size = multiIter.next()
                listIter = list(multiIter)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError(
                        "No project found in file %s, specify with --project."
                        % firstFile)
                projectName = handler.name

            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(
                    directoryList,
                    filefilt,
                    initContext=props,
                    datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(
                    directoryList,
                    filefilt,
                    initContext=props,
                    datasetName=datasetName)

            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName,
                                       None,
                                       Session,
                                       offline=True)
            dmap = {}
            listerSection = getOfflineLister(config,
                                             "project:%s" % projectName,
                                             service)
            offlineLister = config.get(listerSection,
                                       'offline_lister_executable')
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(directoryList)
            for dsetName, filepath, sizet in processNodeMatchIterator(
                    offlineLister,
                    commandArgs,
                    handler,
                    filefilt=filefilt,
                    datasetName=datasetName,
                    offline=True):
                size, mtime = sizet
                if dmap.has_key((dsetName, -1)):
                    dmap[(dsetName, -1)].append((filepath, str(size)))
                else:
                    dmap[(dsetName, -1)] = [(filepath, str(size))]

            datasetNames = dmap.keys()

    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")

    # Iterate over datasets
    if not publishOnly:
        datasets = iterateOverDatasets(projectName,
                                       dmap,
                                       directoryMap,
                                       datasetNames,
                                       Session,
                                       aggregateDimension,
                                       publishOp,
                                       filefilt,
                                       initcontext,
                                       offline,
                                       properties,
                                       keepVersion=keepVersion,
                                       newVersion=version,
                                       extraFields=extraFields,
                                       masterGateway=masterGateway,
                                       comment=message,
                                       readFiles=readFiles)

    result = publishDatasetList(
        datasetNames,
        Session,
        publish=publish,
        thredds=thredds,
        las=las,
        parentId=parent,
        service=service,
        perVariable=perVariable,
        threddsCatalogDictionary=threddsCatalogDictionary,
        reinitThredds=reinitThredds,
        readFromCatalog=readFromCatalog)

    return result