示例#1
0
def load_configuration(parent):
    """
    Resolve a project handler for each dataset listed on ``parent`` and then
    add a "Collection" page through ``parent.ntk.new_page``.

    parent
      Object carrying the publisher state read here: ``offline``,
      ``firstFile``, ``projectName``, ``config``, ``Session``, ``dmap``,
      ``datasetNames``, ``datasetMapfile``, ``directoryMap``, ``filefilt``,
      ``top_ct`` and ``ntk``.

    The handler found for each dataset is stored in ``parent.handler``, so
    after the loop it holds the handler of the last dataset processed.
    """
    import os
    import pub_controls
    from esgcet.config import getHandler, getHandlerByName, registerHandlers, CFHandler
    from sqlalchemy.orm import sessionmaker
    from esgcet.publish import multiDirectoryIterator, datasetMapIterator

    # Snapshot the state held on the parent before looping.
    is_offline = parent.offline
    sample_path = parent.firstFile
    project = parent.projectName
    config = parent.config
    session_factory = parent.Session
    dataset_map = parent.dmap
    names = parent.datasetNames
    using_mapfile = parent.datasetMapfile is not None

    for name in names:

        # Pick a sample file and build the file iterator, either from the
        # dataset map or from the directory map.
        if using_mapfile:
            sample_path = dataset_map[name][0][0]
            file_iter = datasetMapIterator(dataset_map, name)
        else:
            pairs = parent.directoryMap[name]
            first_dir, first_name = pairs[0]
            sample_path = os.path.join(first_dir, first_name)
            directories = [direc for direc, _ in pairs]
            file_iter = multiDirectoryIterator(directories, parent.filefilt)

        # Project handlers are (re-)registered for every dataset.
        registerHandlers()

        # Without an explicit project name the handler is looked up from
        # the sample file itself.
        if project is None:
            found = getHandler(sample_path, session_factory, validate=True)
        else:
            found = getHandlerByName(project,
                                     sample_path,
                                     session_factory,
                                     validate=True,
                                     offline=is_offline)

        parent.handler = found

    # View the collection of datasets
    page_title = "Collection %i" % parent.top_ct
    parent.ntk.new_page(parent, page_title)
def load_configuration( parent ):
    """
    Resolve a project handler for each dataset listed on ``parent`` and then
    add a "Collection" page through ``parent.ntk.new_page``.

    parent
      Object carrying the publisher state read here: ``offline``,
      ``firstFile``, ``projectName``, ``config``, ``Session``, ``dmap``,
      ``datasetNames``, ``datasetMapfile``, ``directoryMap``, ``filefilt``,
      ``top_ct`` and ``ntk``.

    The handler found for each dataset is stored in ``parent.handler``, so
    after the loop it holds the handler of the last dataset processed.
    """
    import os
    import pub_controls
    from esgcet.config import getHandler, getHandlerByName, registerHandlers, CFHandler
    from sqlalchemy.orm import sessionmaker
    from esgcet.publish import multiDirectoryIterator, datasetMapIterator

    # Snapshot the state held on the parent before looping.
    is_offline = parent.offline
    sample_path = parent.firstFile
    project = parent.projectName
    config = parent.config
    session_factory = parent.Session
    dataset_map = parent.dmap
    names = parent.datasetNames
    using_mapfile = parent.datasetMapfile is not None

    for name in names:

        # Pick a sample file and build the file iterator, either from the
        # dataset map or from the directory map.
        if using_mapfile:
            sample_path = dataset_map[name][0][0]
            file_iter = datasetMapIterator(dataset_map, name)
        else:
            pairs = parent.directoryMap[name]
            first_dir, first_name = pairs[0]
            sample_path = os.path.join(first_dir, first_name)
            directories = [direc for direc, _ in pairs]
            file_iter = multiDirectoryIterator(directories, parent.filefilt)

        # Project handlers are (re-)registered for every dataset.
        registerHandlers()

        # Without an explicit project name the handler is looked up from
        # the sample file itself.
        if project is None:
            found = getHandler(sample_path, session_factory, validate=True)
        else:
            found = getHandlerByName(project,
                                     sample_path,
                                     session_factory,
                                     validate=True,
                                     offline=is_offline)

        parent.handler = found

    # View the collection of datasets
    page_title = "Collection %i" % parent.top_ct
    parent.ntk.new_page(parent, page_title)
def main(argv):
    try:
        args, lastargs = getopt.getopt(argv, "a:ehi:o:p:", ['dataset=', 'dataset-tech-notes=', 'dataset-tech-notes-title=',\
            'filter=', 'help', 'max-threads=', 'offline', 'output=', 'project=', 'property=', 'read-directories', 'read-files',\
            'service=', 'use-version-dir', 'version='])
    except getopt.error:
        print sys.exc_value
        return

    if len(lastargs)==0:
        print 'No directory specified'
        return

    appendMap = None
    datasetName = None
    datasetTechNotesURL = None
    datasetTechNotesTitle = None
    filefilt = '.*\.nc$'
    init_file = None
    offline = False
    output = sys.stdout
    projectName = None
    properties = {}
    readFiles = False
    service = None
    max_threads = 4
    version_dir = False
    use_version = None
    
    for flag, arg in args:
        if flag=='-a':
            if os.path.exists(arg):
                appendMap = readDatasetMap(arg)
            else:
                appendMap = {}
            output = open(arg, 'a')
        elif flag=='--dataset':
            datasetName = arg
        elif flag=='--dataset-tech-notes':
            datasetTechNotesURL = arg
        elif flag=='--dataset-tech-notes-title':
            datasetTechNotesTitle = arg
        elif flag=='--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print usage
            sys.exit(0)
        elif flag=='-i':
            init_file = arg
        elif flag=='--max-threads':
            max_threads = int(arg)
        elif flag in ['-o', '--output']:
            output = open(arg, 'w')
        elif flag=='--offline':
            offline = True
        elif flag=='--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            name, value = arg.split('=')
            properties[name] = value
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag=='--read-files':
            readFiles = True
        elif flag=='--service':
            service = arg
        elif flag=='--use-version-dir':
            version_dir = True
        elif flag=='--version':
            version_dir = True
            if not re.match('^[0-9]+$', arg[0]): # e.g. 'vYYYYMMDD'
                use_version = arg[1:]
            else:
                use_version = arg
    
    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=False, pool_recycle=3600)
    initLogging('extract', override_sa=engine)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    if not offline:

        # Determine if checksumming is enabled
        line = config.get('DEFAULT', 'checksum', default=None)
        if line is not None:
            checksumClient, checksumType = splitLine(line)
        else:
            checksumClient = None

        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            warning("No project name specified!")
            multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
            firstFile, size = multiIter.next()
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name

        if not readFiles:
            datasetMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=properties, datasetName=datasetName, use_version=version_dir)
        else:
            datasetMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=properties, datasetName=datasetName)

        # Output the map
        keys = datasetMap.keys()
        keys.sort()

        datasetMapVersion = {}
        if version_dir:
            # check for version directory
            for dataset_id in keys:
                ds_id_version = dataset_id.split('#')
                if len(ds_id_version) == 2:
                    ds_id, ds_version = ds_id_version
                    if not re.match('^[0-9]+$', ds_version):
                        warning("Version must be an integer. Skipping version %s of dataset %s."%(ds_version, ds_id))
                        continue
                    if use_version and ds_version != use_version:
                            continue
                    if ds_id in datasetMapVersion:
                        datasetMapVersion[ds_id].append(ds_version)
                    else:
                        datasetMapVersion[ds_id] = [ds_version]
                else:
                    error("No version directory found. Skipping dataset %s."%dataset_id)

            if datasetMapVersion:
                keys = datasetMapVersion.keys()
                keys.sort()
            else:
                if use_version:
                    error("Version %s not found. No datasets to process."%use_version)
                else:
                    error("No datasets to process.")
                return

        for dataset_id in keys:
            skip_dataset = False
            dataset_id_version = dataset_id
            path_version = None
            # if multiple versions of the same dataset available use latest version
            if version_dir:
                path_version = sorted(datasetMapVersion[dataset_id])[-1]
                if len(datasetMapVersion[dataset_id]) > 1:
                    info("Multiple versions for %s (%s), processing latest (%s)"%(dataset_id, datasetMapVersion[dataset_id], path_version))
                dataset_id_version = '%s#%s'%(dataset_id, path_version)

            direcTuple = datasetMap[dataset_id_version]
            direcTuple.sort()
            mapfile_md = {}

            for nodepath, filepath in direcTuple:

                # If readFiles is not set, generate a map entry for each file in the directory
                # that matches filefilt ...
                if not readFiles:
                    itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                # ... otherwise if readFiles is set, generate a map entry for each file
                else:
                    itr = fnIterator([filepath])

                for filepath, sizet in itr:
                    size, mtime = sizet

                    mapfile_md[filepath] = [size]
                    mapfile_md[filepath].append("mod_time=%f"%float(mtime))

                    extraStuff = "mod_time=%f"%float(mtime)

                    if datasetTechNotesURL is not None:
                        mapfile_md[filepath].append('dataset_tech_notes=%s'%datasetTechNotesURL)
                        if datasetTechNotesURL is not None:
                            mapfile_md[filepath].append('dataset_tech_notes_title=%s'%datasetTechNotesTitle)

            if checksumClient is not None:
                pool = ThreadPool(processes=max_threads)
                args = [(filepath, checksumClient) for filepath in mapfile_md]
                checksum_list = pool.map(calc_checksum_wrapper, args)

                for entry in checksum_list:
                    if not entry[1]:
                        error('Calculation of checksum for file %s failed. Skipping dataset %s ...'%(entry[0], dataset_id))
                        skip_dataset = True     # skip entire dataset if we have one file without checksum
                        break
                    mapfile_md[entry[0]].append('checksum=%s'%entry[1])
                    mapfile_md[entry[0]].append('checksum_type=%s'%checksumType)

            for fpath in mapfile_md:
                mapfile_line = '%s | %s | %d'%(dataset_id_version, fpath, mapfile_md[fpath][0])

                for md in mapfile_md[fpath][1:]:
                    mapfile_line+=' | %s'%md

                # Print the map entry if:
                # - Checksum exists for all files of dataset (in case checksumming is enabled)
                # - The map is being created, not appended, or
                # - The existing map does not have the dataset, or
                # - The existing map has the dataset, but not the file.
                if path_version:
                    ds_id = (dataset_id, int(path_version))
                else:
                    ds_id = (dataset_id, -1)
                if not skip_dataset and ( (appendMap is None) or (not appendMap.has_key(ds_id)) or (( fpath, "%d"% mapfile_md[fpath][1]) not in appendMap[ds_id]) ):
                    print >>output, mapfile_line

    else:                               # offline
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session, offline=True)
        else:
            raise ESGPublishError("Must specify --project for offline datasets.")
        listerSection = getOfflineLister(config, "project:%s"%projectName, service)
        offlineLister = config.get(listerSection, 'offline_lister_executable')
        commandArgs = "--config-section %s "%listerSection
        commandArgs += " ".join(lastargs)
        for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
            size, mtime = sizet
            extrastuff = ""
            if mtime is not None:
                extrastuff = "| mod_time=%f"%float(mtime)
            if (appendMap is None) or (not appendMap.has_key(dsetName)) or ((filepath, "%d"%size) not in appendMap[dsetName]):
                print >>output, "%s | %s | %d %s"%(dsetName, filepath, size, extrastuff)

    if output is not sys.stdout:
        output.close()
    def fill_in_data_information_directory(self, dirp, readFiles, onoff_line="online"):
        """
        Create a new top-level page for the directory ``dirp`` and record the
        per-page bookkeeping for it.

        dirp
          Directory to scan (online mode only).

        readFiles
          If true the directory map is built with
          ``generateDirectoryMapFromFiles``, otherwise with
          ``generateDirectoryMap``.

        onoff_line
          ``"online"`` scans ``dirp`` and creates a "Collection" page; any
          other value creates an "Offline" page without scanning.

        Raises ESGPublishError if no handler is found for the first file.
        """
        from esgcet.publish import multiDirectoryIterator
        from esgcet.config import getHandler, getHandlerByName

        root = self.parent.parent

        if onoff_line != "online":
            # Offline: nothing is scanned, the page only names the dataset.
            directory_map = None
            sample_file = None
            page_kind = "offline"
            page_title = "Offline %i" % root.top_ct
            root.datasetNames = [root.offline_datasetName]
            tab_colour = "orange"
            is_offline = True
        else:
            # Build the file filter from the first configured extension.
            root.filefilt = ".*\\" + root.extension[0][1:] + "$"

            search_dirs = [dirp]
            file_iter = multiDirectoryIterator(search_dirs, filefilt=root.filefilt)
            sample_file, size = file_iter.next()
            handler = getHandler(sample_file, self.Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % sample_file)
            projectName = handler.name

            root.multiIter = file_iter
            root.firstFile = sample_file

            # Initial context: every default global value except the
            # "Default Global Setting" placeholders.
            props = dict(
                (key, value)
                for key, value in self.defaultGlobalValues.items()
                if value.find("Default Global Setting") == -1
            )

            if readFiles:
                build_map = handler.generateDirectoryMapFromFiles
            else:
                build_map = handler.generateDirectoryMap
            directory_map = build_map(search_dirs, root.filefilt, initContext=props)

            # Each dataset is listed with version -1.
            root.datasetNames = sorted((item, -1) for item in directory_map.keys())
            page_title = "Collection %i" % root.top_ct
            page_kind = "collection"
            tab_colour = "lightgreen"
            is_offline = False

        root.ntk.new_page(root, page_title, tab_color=tab_colour, page_type=page_kind)

        # Set the iteration values for each page
        page = root.main_frame.selected_top_page
        root.main_frame.dmap[page] = None
        root.main_frame.extraFields[page] = None
        root.main_frame.datasetMapfile[page] = None
        root.main_frame.dirp_firstfile[page] = sample_file
        root.offline_file_directory[page] = "directory"
        root.directoryMap[page] = directory_map
        root.hold_offline[page] = is_offline
        root.main_frame.projectName[page] = self.project_dataset.get()
示例#5
0
def iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, operation, filefilt, initcontext, offlineArg, properties, testProgress1=None, testProgress2=None, handlerDictionary=None, keepVersion=False, newVersion=None, extraFields=None, masterGateway=None, comment=None, forceAggregate=False, readFiles=False):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified
    in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``).
    All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first handler found that
      can open a sample file from the dataset.

    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``, keyed by (dataset_name, version).
      If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.

    datasetNames
      A list of (dataset_name, version) tuples identifying the datasets to be scanned.

    Session
      An SQLAlchemy Session.

    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose
      basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles.
      Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary

      If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated
      metadata will be a minimal set including file name and size.

      If a dictionary, maps dataset_name => offline flag

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Mutable sequence [callback, initial, final] where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. The ``initial`` and ``final``
      items are overwritten for each dataset. This callback applies only to the scan phase.

    testProgress2=None
      Mutable sequence [callback, initial, final] where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. The ``initial`` and ``final``
      items are overwritten for each dataset. This callback applies only to the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number
      explicitly. If a dictionary, maps dataset_id => version. By
      default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless. Ignored for offline datasets.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    Raises ESGPublishError if a dataset is missing from the version map, if no project handler can be
    found for a dataset, or if the datasets belong to more than one project.

    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    versionIsMap = isinstance(newVersion, dict)
    if versionIsMap:
        saveVersionMap = newVersion

    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop, (datasetName, versionno) in enumerate(datasetNames):

        # If using a version map, lookup the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s"%datasetName)

        context = initcontext.copy()

        # Get offline flag
        if isinstance(offlineArg, dict):
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets. The decision is made per
        # dataset: an offline dataset must not switch off forced aggregation
        # for the online datasets that follow it.
        if offline:
            forceThisDataset = False
        else:
            forceThisDataset = forceAggregate

        # Get a file iterator and sample file
        if dmap is not None:
            if len(dmap[(datasetName,versionno)])==0:
                warning("No files specified for dataset %s, version %d."%(datasetName,versionno))
                continue
            firstFile = dmap[(datasetName,versionno)][0][0]
            # NOTE(review): the raw offlineArg (possibly a dictionary) is passed
            # here rather than the per-dataset flag -- confirm that this is what
            # datasetMapIterator expects.
            fileiter = datasetMapIterator(dmap, datasetName, versionno, extraFields=extraFields, offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                fileiter = multiDirectoryIterator([direc for direc, sampfile in direcTuples], filefilt)
            else:
                fileiter = fnIterator([sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and datasetName in handlerDictionary:
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName, firstFile, Session, validate=True, offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project."%firstFile)
            projectName = handler.name
            info("Using project name = %s"%projectName)
        if prevProject is not None and projectName!=prevProject:
            raise ESGPublishError("Multiple projects found: %s, %s. Can only publish from one project"%(prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored'%name)
            else:
                context[name] = value

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        # The scan phase reports the first half of this dataset's share of
        # the progress range when online, and the whole share when offline.
        if testProgress1 is not None:
            testProgress1[1] = (100./ct)*iloop
            if not offline:
                testProgress1[2] = (100./ct)*iloop + (50./ct)
            else:
                testProgress1[2] = (100./ct)*iloop + (100./ct)
        dataset = extractFromDataset(datasetName, fileiter, Session, handler, cfHandler, aggregateDimensionName=aggregateDimension, offline=offline, operation=operation, progressCallback=testProgress1, keepVersion=keepVersion, newVersion=newVersion, extraFields=extraFields, masterGateway=masterGateway, comment=comment, useVersion=versionno, forceRescan=forceThisDataset, **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.
        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceThisDataset

        if testProgress2 is not None:
            testProgress2[1] = (100./ct)*iloop + 50./ct
            testProgress2[2] = (100./ct)*(iloop + 1)
        if runAggregate:
            aggregateVariables(datasetName, Session, aggregateDimensionName=aggregateDimension, cfHandler=cfHandler, progressCallback=testProgress2, datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)

        # Save the context with the dataset, so that it can be searched later
        handler.saveContext(datasetName, Session)
        datasets.append(dataset)

    return datasets
示例#6
0
    def fill_in_data_information_directory(self,
                                           dirp,
                                           readFiles,
                                           onoff_line="online"):
        """
        Create a new top-level page for the directory ``dirp`` and record the
        per-page bookkeeping for it.

        dirp
          Directory to scan (online mode only).

        readFiles
          If true the directory map is built with
          ``generateDirectoryMapFromFiles``, otherwise with
          ``generateDirectoryMap``.

        onoff_line
          ``"online"`` scans ``dirp`` and creates a "Collection" page; any
          other value creates an "Offline" page without scanning.

        Raises ESGPublishError if no handler is found for the first file.
        """
        from esgcet.publish import multiDirectoryIterator
        from esgcet.config import getHandler, getHandlerByName

        root = self.parent.parent

        if onoff_line != "online":
            # Offline: nothing is scanned, the page only names the dataset.
            directory_map = None
            sample_file = None
            page_kind = "offline"
            page_title = "Offline %i" % root.top_ct
            root.datasetNames = [root.offline_datasetName]
            tab_colour = 'orange'
            is_offline = True
        else:
            # Build the file filter from the first configured extension.
            root.filefilt = ".*\\" + root.extension[0][1:] + "$"

            search_dirs = [dirp]
            file_iter = multiDirectoryIterator(search_dirs,
                                               filefilt=root.filefilt)
            sample_file, size = file_iter.next()
            handler = getHandler(sample_file, self.Session, validate=True)
            if handler is None:
                raise ESGPublishError(
                    "No project found in file %s, specify with --project." %
                    sample_file)
            projectName = handler.name

            root.multiIter = file_iter
            root.firstFile = sample_file

            # Initial context: every default global value except the
            # "Default Global Setting" placeholders.
            props = dict(
                (key, value)
                for key, value in self.defaultGlobalValues.items()
                if value.find("Default Global Setting") == -1)

            if readFiles:
                build_map = handler.generateDirectoryMapFromFiles
            else:
                build_map = handler.generateDirectoryMap
            directory_map = build_map(search_dirs,
                                      root.filefilt,
                                      initContext=props)

            # Each dataset is listed with version -1.
            root.datasetNames = sorted(
                (item, -1) for item in directory_map.keys())
            page_title = "Collection %i" % root.top_ct
            page_kind = "collection"
            tab_colour = 'lightgreen'
            is_offline = False

        root.ntk.new_page(root,
                          page_title,
                          tab_color=tab_colour,
                          page_type=page_kind)

        # Set the iteration values for each page
        page = root.main_frame.selected_top_page
        root.main_frame.dmap[page] = None
        root.main_frame.extraFields[page] = None
        root.main_frame.datasetMapfile[page] = None
        root.main_frame.dirp_firstfile[page] = sample_file
        root.offline_file_directory[page] = "directory"
        root.directoryMap[page] = directory_map
        root.hold_offline[page] = is_offline
        root.main_frame.projectName[page] = self.project_dataset.get()
示例#7
0
def esgpublishWrapper(**kw):
    """
    Keyword-driven front end: build the list of datasets, scan them into the
    database (unless ``publishOnly``), then hand them to ``publishDatasetList``.

    The datasets are determined in one of four ways, checked in this order:

    1. ``datasetMapfile`` is set: the map file is read with ``readDatasetMap``.
    2. ``rescan`` is true: the map (and the offline flag) come from
       ``queryDatasetMap(rescanDatasetName, Session)``.
    3. Online (``offline`` false): a directory map is generated from
       ``directoryList`` by the project handler. If ``projectName`` is None the
       handler is chosen from the first file matching ``filefilt``.
    4. Offline: the project's offline lister is run over ``directoryList`` and
       its output is collected into a dataset map keyed by ``(name, -1)``.

    Recognized keywords (defaults in parentheses):
      aggregateDimension ("time"), datasetMapfile (None), datasetName (None),
      directoryList (None), echoSql (False), filefilt (r'.*\\.nc$'),
      init_file (None), initcontext ({}), keepVersion (False), las (False),
      log_filename (None), masterGateway (None), message (None),
      offline (False), parent (None), perVariable (None), projectName (None),
      properties ({}), publish (False), publishOnly (False),
      publishOp (CREATE_OP), readFiles (False), readFromCatalog (False),
      reinitThredds (None), rescan (False), rescanDatasetName ([]),
      service (None), thredds (False), threddsCatalogDictionary (None),
      version (None).

    ``summarizeErrors``, ``testProgress1`` and ``testProgress2`` may be passed
    but are not used by this function.

    Returns whatever ``publishDatasetList`` returns.

    Raises ESGPublishError if an offline publication has no project name, if a
    replication (``masterGateway`` set) has no ``version``, if no file matches
    ``filefilt`` while auto-detecting the project, or if no project handler
    recognizes the first file.
    """

    from esgcet.query import queryDatasetMap

    aggregateDimension = kw.get("aggregateDimension", "time")
    datasetMapfile = kw.get("datasetMapfile", None)
    datasetName = kw.get("datasetName", None)
    directoryList = kw.get("directoryList", None)
    echoSql = kw.get("echoSql", False)
    filefilt = kw.get("filefilt", r'.*\.nc$')
    init_file = kw.get("init_file", None)
    initcontext = kw.get("initcontext", {})
    keepVersion = kw.get("keepVersion", False)
    las = kw.get("las", False)
    log_filename = kw.get("log_filename", None)
    masterGateway = kw.get("masterGateway", None)
    message = kw.get("message", None)
    offline = kw.get("offline", False)
    parent = kw.get("parent", None)
    perVariable = kw.get("perVariable", None)
    projectName = kw.get("projectName", None)
    properties = kw.get("properties", {})
    publish = kw.get("publish", False)
    publishOnly = kw.get("publishOnly", False)
    publishOp = kw.get("publishOp", CREATE_OP)
    readFiles = kw.get("readFiles", False)
    readFromCatalog = kw.get("readFromCatalog", False)
    reinitThredds = kw.get("reinitThredds", None)
    rescan = kw.get("rescan", False)
    rescanDatasetName = kw.get("rescanDatasetName", [])
    service = kw.get("service", None)
    thredds = kw.get("thredds", False)
    threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None)
    version = kw.get("version", None)

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError(
            "Must specify project with --project for offline datasets")

    # Must specify version for replications
    if masterGateway is not None and version is None:
        raise ESGPublishError(
            "Must specify version with --new-version for replicated datasets")

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file,
                             echoSql=echoSql,
                             log_filename=log_filename)

    # Register project handlers
    registerHandlers()

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile,
                                           parse_extra_fields=True)
        datasetNames = dmap.keys()

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()

    # ... otherwise generate the directory map.
    elif not offline:
        # Online dataset(s)
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            # Only the first matching file is needed to identify the project;
            # the handler walks the directories itself further below.
            multiIter = multiDirectoryIterator(directoryList,
                                               filefilt=filefilt)
            try:
                firstFile, _ = next(multiIter)
            except StopIteration:
                raise ESGPublishError(
                    "No files matching the file filter %s were found, cannot determine the project."
                    % filefilt)
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError(
                    "No project found in file %s, specify with --project."
                    % firstFile)
            projectName = handler.name

        # initcontext entries take precedence over properties with the same key
        props = properties.copy()
        props.update(initcontext)
        if readFiles:
            generateMap = handler.generateDirectoryMapFromFiles
        else:
            generateMap = handler.generateDirectoryMap
        directoryMap = generateMap(directoryList,
                                   filefilt,
                                   initContext=props,
                                   datasetName=datasetName)

        # -1 is the "no explicit version" marker used throughout the dataset lists
        datasetNames = [(item, -1) for item in directoryMap.keys()]

    # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
    else:
        handler = getHandlerByName(projectName,
                                   None,
                                   Session,
                                   offline=True)
        dmap = {}
        listerSection = getOfflineLister(config,
                                         "project:%s" % projectName,
                                         service)
        offlineLister = config.get(listerSection,
                                   'offline_lister_executable')
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(directoryList)
        for dsetName, filepath, sizet in processNodeMatchIterator(
                offlineLister,
                commandArgs,
                handler,
                filefilt=filefilt,
                datasetName=datasetName,
                offline=True):
            size, _ = sizet
            dmap.setdefault((dsetName, -1), []).append((filepath, str(size)))

        datasetNames = dmap.keys()

    datasetNames = sorted(datasetNames)
    if not datasetNames:
        warning("No datasets found.")

    # Iterate over datasets
    if not publishOnly:
        iterateOverDatasets(projectName,
                            dmap,
                            directoryMap,
                            datasetNames,
                            Session,
                            aggregateDimension,
                            publishOp,
                            filefilt,
                            initcontext,
                            offline,
                            properties,
                            keepVersion=keepVersion,
                            newVersion=version,
                            extraFields=extraFields,
                            masterGateway=masterGateway,
                            comment=message,
                            readFiles=readFiles)

    return publishDatasetList(
        datasetNames,
        Session,
        publish=publish,
        thredds=thredds,
        las=las,
        parentId=parent,
        service=service,
        perVariable=perVariable,
        threddsCatalogDictionary=threddsCatalogDictionary,
        reinitThredds=reinitThredds,
        readFromCatalog=readFromCatalog)
示例#8
0
def esgscanWrapper(directoryList, **kw):
    """
    Scan ``directoryList`` and write dataset map entries, one line per file.

    directoryList
      Non-empty list of directories (or lister arguments, when offline).

    Recognized keywords (defaults in parentheses):

    appendPath (None)
      Path of an existing or new map file. Entries already present in it
      (same dataset and same ``(path, size)``) are not written again. Unless
      ``outputPath`` is given, new entries are appended to this file.

    outputPath (None)
      Path of a file to (over)write with the entries. Takes precedence over
      ``appendPath`` as the destination. If neither is given, entries go to
      ``sys.stdout``.

    datasetName (None), fileFilt (r'.*\\.nc$'), initFile (None),
    offline (False), projectName (None), readFiles (False), service (None)

    Online entries are written as ``dataset | path | size | mod_time=...``,
    followed by ``| checksum=... | checksum_type=...`` when the ``checksum``
    option is set in the DEFAULT configuration section. Offline entries are
    written as ``dataset | path | size`` plus ``| mod_time=...`` when the
    lister reports a modification time.

    Raises ESGPublishError if ``directoryList`` is empty, if the project
    cannot be determined, or if ``offline`` is set without ``projectName``.
    """

    if not directoryList:
        raise ESGPublishError('No directory specified')

    appendMap = None
    appendPath = kw.get("appendPath", None)
    if appendPath is not None:
        if os.path.exists(appendPath):
            appendMap = readDatasetMap(appendPath)
        else:
            appendMap = {}
    datasetName = kw.get("datasetName", None)
    filefilt = kw.get("fileFilt", r'.*\.nc$')
    init_file = kw.get("initFile", None)
    offline = kw.get("offline", False)
    outputPath = kw.get("outputPath", None)
    projectName = kw.get("projectName", None)
    readFiles = kw.get("readFiles", False)
    service = kw.get("service", None)

    # Pick exactly one destination. Previously the append handle was always
    # replaced (by outputPath or by stdout), so append mode never appended
    # and the handle opened on appendPath was leaked.
    if outputPath is not None:
        output = open(outputPath, 'w')
    elif appendPath is not None:
        output = open(appendPath, 'a')
    else:
        output = sys.stdout

    def isNewEntry(datasetId, path, size):
        # Write the map entry if:
        # - The map is being created, not appended, or
        # - The existing map does not have the dataset, or
        # - The existing map has the dataset, but not the file.
        if appendMap is None or datasetId not in appendMap:
            return True
        return (path, "%d" % size) not in appendMap[datasetId]

    try:
        # Load the configuration and set up a database connection
        config, Session = initdb(init_file=init_file)

        # Register project handlers
        registerHandlers()

        if not offline:

            # Determine if checksumming is enabled
            checksumClient = None
            checksumType = None
            line = config.get('DEFAULT', 'checksum', default=None)
            if line is not None:
                checksumClient, checksumType = splitLine(line)

            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(directoryList,
                                                   filefilt=filefilt)
                try:
                    firstFile, _ = next(multiIter)
                except StopIteration:
                    raise ESGPublishError(
                        "No files matching the file filter %s were found, cannot determine the project."
                        % filefilt)
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError(
                        "No project found in file %s, specify with --project." %
                        firstFile)
                projectName = handler.name

            if not readFiles:
                datasetMap = handler.generateDirectoryMap(
                    directoryList, filefilt, datasetName=datasetName)
            else:
                datasetMap = handler.generateDirectoryMapFromFiles(
                    directoryList, filefilt, datasetName=datasetName)

            # Output the map
            for datasetId in sorted(datasetMap.keys()):
                direcTuple = datasetMap[datasetId]
                direcTuple.sort()
                for nodepath, samplePath in direcTuple:

                    # If readFiles is not set, generate a map entry for each file in the directory
                    # that matches filefilt ...
                    if not readFiles:
                        itr = directoryIterator(nodepath,
                                                filefilt=filefilt,
                                                followSubdirectories=False)
                    # ... otherwise if readFiles is set, generate a map entry for each file
                    else:
                        itr = fnIterator([samplePath])

                    for filepath, sizet in itr:
                        size, mtime = sizet
                        extraStuff = "mod_time=%f" % float(mtime)

                        if checksumClient is not None:
                            csum = checksum(filepath, checksumClient)
                            extraStuff += " | checksum=%s | checksum_type=%s" % (
                                csum, checksumType)

                        if isNewEntry(datasetId, filepath, size):
                            output.write("%s | %s | %d | %s\n" % (
                                datasetId, filepath, size, extraStuff))
        else:  # offline
            if projectName is None:
                raise ESGPublishError(
                    "Must specify --project for offline datasets.")
            handler = getHandlerByName(projectName,
                                       None,
                                       Session,
                                       offline=True)
            listerSection = getOfflineLister(config,
                                             "project:%s" % projectName,
                                             service)
            offlineLister = config.get(listerSection,
                                       'offline_lister_executable')
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(directoryList)
            for dsetName, filepath, sizet in processNodeMatchIterator(
                    offlineLister,
                    commandArgs,
                    handler,
                    filefilt=filefilt,
                    datasetName=datasetName,
                    offline=True):
                size, mtime = sizet
                extrastuff = ""
                if mtime is not None:
                    extrastuff = "| mod_time=%f" % float(mtime)
                if isNewEntry(dsetName, filepath, size):
                    output.write("%s | %s | %d %s\n" % (
                        dsetName, filepath, size, extrastuff))
    finally:
        # Close the destination even if scanning fails part-way through
        if output is not sys.stdout:
            output.close()
示例#9
0
def iterateOverDatasets(projectName,
                        dmap,
                        directoryMap,
                        datasetNames,
                        Session,
                        aggregateDimension,
                        operation,
                        filefilt,
                        initcontext,
                        offlineArg,
                        properties,
                        testProgress1=None,
                        testProgress2=None,
                        handlerDictionary=None,
                        perVariable=None,
                        keepVersion=False,
                        newVersion=None,
                        extraFields=None,
                        masterGateway=None,
                        comment=None,
                        forceAggregate=False,
                        readFiles=False,
                        nodbwrite=False,
                        pid_connector=None):
    """
    Scan and aggregate (if possible) a list of datasets. The datasets and associated files are specified
    in one of two ways: either as a *dataset map* (see ``dmap``) or a *directory map* (see ``directoryMap``).
    All dataset information is persisted in the database. This is a 'helper' routine for esgpublish[_gui].

    Returns a list of persistent Dataset instances. Datasets that have no files in ``dmap`` are skipped
    with a warning and do not appear in the list.

    Raises ESGPublishError if a replicated dataset has no version, if a dataset is missing from the
    version map, if no project handler is found for a sample file, or if the datasets belong to more
    than one project. An ESGInvalidMandatoryField raised by context validation is propagated.

    projectName
      String name of the project associated with the datasets. If None, it is determined by the first handler found that
      can open a sample file from the dataset.

    dmap
      A dictionary dataset map, as returned from ``readDatasetMap``. If None, ``directoryMap`` must be specified.

    directoryMap
      A dictionary directory map, as returned from ``ProjectHandler.generateDirectoryMap``.

    datasetNames
      A list of (dataset_name, version) tuples identifying the datasets to be scanned. A negative version
      means no version was given for that dataset.

    Session
      An SQLAlchemy Session.

    aggregateDimension
      Name of the dimension on which to aggregate the datasets.

    operation
      The publication operation, one of esgcet.publish.CREATE_OP, DELETE_OP, RENAME_OP, UPDATE_OP

    filefilt
      String regular expression as defined by the Python re module. If a ``directoryMap`` is specified, only files whose
      basename matches the filter are scanned. If ``dmap`` is specified, the filter is ignored.

    initcontext
      Dictionary of initial context values for *all* datasets. These values will override metadata contained in datafiles.
      Contrast with ``properties``.

    offlineArg
      Boolean flag or dictionary

      If a boolean flag: if True the files are treated as offline (not local) and are not scanned or aggregated. The associated
      metadata will be a minimal set including file name and size.

      If a dictionary, maps dataset_name => offline flag

    properties
      Dictionary of property/value pairs. The properties must be configured in the initialization file section
      corresponding to the project, and do not override existing metadata values. Contrast with ``initcontext``.

    testProgress1=None
      Mutable sequence [callback, initial, final] where ``callback`` is a function of the form *callback(progress)*,
      ``initial`` is the initial value reported, ``final`` is the final value reported. Items 1 and 2 are
      overwritten for each dataset, so a tuple cannot be used. This callback applies only to the scan phase.

    testProgress2=None
      Mutable sequence [callback, initial, final], as for ``testProgress1``. This callback applies only to
      the aggregation phase.

    handlerDictionary=None
      A dictionary mapping datasetName => handler. If None, handlers are determined by project name.

    perVariable=None
      Boolean, overrides ``variable_per_file`` config option.

    keepVersion
      Boolean, True if the dataset version should not be incremented.

    newVersion
      Integer or dictionary. Set the new version number
      explicitly. If a dictionary, maps dataset_id => version. By
      default the version number is incremented by 1. See keepVersion.

    extraFields
      Extra dataset map fields, as from **readDatasetMap**.

    masterGateway
      The gateway that owns the master copy of the datasets. If None, the dataset is not replicated.
      Otherwise the TDS catalog is written with a 'master_gateway' property, flagging the dataset(s)
      as replicated.

    comment=None
      String comment to associate with new datasets created.

    forceAggregate=False
      If True, run the aggregation step regardless. Never applied to offline datasets.

    readFiles=False
      If True, interpret directoryMap as having one entry per file, instead of one per directory.

    nodbwrite=False
      If True, the aggregation step and the saving of the handler context are skipped. The flag is
      also passed on to ``extractFromDataset``.

    pid_connector
        esgfpid.Connector object to register PIDs

    """
    from esgcet.publish import extractFromDataset, aggregateVariables

    versionIsMap = isinstance(newVersion, dict)
    if versionIsMap:
        saveVersionMap = newVersion

    offlineIsMap = isinstance(offlineArg, dict)

    prevProject = None
    datasets = []
    ct = len(datasetNames)
    for iloop, (datasetName, versionno) in enumerate(datasetNames):

        # Must specify version for replications
        if masterGateway:
            if not newVersion and versionno < 0:
                raise ESGPublishError(
                    "Must specify a version for replicated datasets, e.g. in the mapfile or with --new-version/--version-list."
                )

        # If using a version map, lookup the version for this dataset
        if versionIsMap:
            try:
                newVersion = saveVersionMap[datasetName]
            except KeyError:
                raise ESGPublishError("Dataset not found in version map: %s" %
                                      datasetName)

        context = initcontext.copy()

        # Get offline flag
        if offlineIsMap:
            offline = offlineArg[datasetName]
        else:
            offline = offlineArg

        # Don't try to aggregate offline datasets. The decision is made per
        # dataset: clearing forceAggregate itself would also switch off forced
        # aggregation for every online dataset that follows an offline one.
        forceThisDataset = False if offline else forceAggregate

        # Get a file iterator and sample file
        if dmap is not None:
            mapKey = (datasetName, versionno)
            if len(dmap[mapKey]) == 0:
                warning("No files specified for dataset %s, version %d." %
                        (datasetName, versionno))
                continue
            firstFile = dmap[mapKey][0][0]
            # NOTE(review): offlineArg (possibly a dictionary) is passed here
            # rather than this dataset's own flag - confirm that
            # datasetMapIterator expects that.
            fileiter = datasetMapIterator(dmap,
                                          datasetName,
                                          versionno,
                                          extraFields=extraFields,
                                          offline=offlineArg)
        else:
            direcTuples = directoryMap[datasetName]
            firstDirec, sampleFile = direcTuples[0]
            firstFile = os.path.join(firstDirec, sampleFile)
            if not readFiles:
                fileiter = multiDirectoryIterator(
                    [direc for direc, sampfile in direcTuples], filefilt)
            else:
                fileiter = fnIterator(
                    [sampfile for direc, sampfile in direcTuples])

        # If the project is not specified, try to read it from the first file
        if handlerDictionary is not None and datasetName in handlerDictionary:
            handler = handlerDictionary[datasetName]
        elif projectName is not None:
            handler = getHandlerByName(projectName,
                                       firstFile,
                                       Session,
                                       validate=True,
                                       offline=offline)
        else:
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError(
                    "No project found in file %s, specify with --project." %
                    firstFile)
            # The detected project is kept for the remaining datasets
            projectName = handler.name
            info("Using project name = %s" % projectName)
        if prevProject is not None and projectName != prevProject:
            raise ESGPublishError(
                "Multiple projects found: %s, %s. Can only publish from one project"
                % (prevProject, projectName))
        prevProject = projectName

        # Generate the initial context from the dataset name
        context = handler.parseDatasetName(datasetName, context)

        # Load the rest of the context from the first file, if possible
        context = handler.getContext(**context)

        # Add properties from the command line
        fieldNames = handler.getFieldNames()
        for name, value in properties.items():
            if name not in fieldNames:
                warning('Property not configured: %s, was ignored' % name)
            else:
                context[name] = value

        # add dataset_version to context to allow version to be a mandatory field
        if versionno > -1:
            context['dataset_version'] = versionno
        elif newVersion is not None:
            context['dataset_version'] = newVersion

        # Update the handler context and fill in default values
        handler.updateContext(context, True)

        # Ensure that fields are valid:
        try:
            handler.validateContext(context)
        except ESGInvalidMandatoryField:
            if offline:
                error("Dataset id has a missing or invalid mandatory field")
            raise

        # Create a CFHandler for validation of standard names, checking time axes, etc.
        cfHandler = handler.getMetadataHandler(sessionMaker=Session)

        # Each dataset gets an equal slice of the 0-100 progress range. For
        # online datasets the scan takes the first half of the slice and the
        # aggregation the second half; offline scans take the whole slice.
        if testProgress1 is not None:
            testProgress1[1] = (100. / ct) * iloop
            if not offline:
                testProgress1[2] = (100. / ct) * iloop + (50. / ct)
            else:
                testProgress1[2] = (100. / ct) * iloop + (100. / ct)

        dataset = extractFromDataset(datasetName,
                                     fileiter,
                                     Session,
                                     handler,
                                     cfHandler,
                                     aggregateDimensionName=aggregateDimension,
                                     offline=offline,
                                     operation=operation,
                                     progressCallback=testProgress1,
                                     perVariable=perVariable,
                                     keepVersion=keepVersion,
                                     newVersion=newVersion,
                                     extraFields=extraFields,
                                     masterGateway=masterGateway,
                                     comment=comment,
                                     useVersion=versionno,
                                     forceRescan=forceThisDataset,
                                     nodbwrite=nodbwrite,
                                     pid_connector=pid_connector,
                                     **context)

        # If republishing an existing version, only aggregate if online and no variables exist (yet) for the dataset.

        runAggregate = (not offline)
        if hasattr(dataset, 'reaggregate'):
            runAggregate = (runAggregate and dataset.reaggregate)
        runAggregate = runAggregate or forceThisDataset

        if testProgress2 is not None:
            testProgress2[1] = (100. / ct) * iloop + 50. / ct
            testProgress2[2] = (100. / ct) * (iloop + 1)
        if runAggregate and (not nodbwrite):
            aggregateVariables(datasetName,
                               Session,
                               aggregateDimensionName=aggregateDimension,
                               cfHandler=cfHandler,
                               progressCallback=testProgress2,
                               datasetInstance=dataset)
        elif testProgress2 is not None:
            # Just finish the progress GUI
            issueCallback(testProgress2, 1, 1, 0.0, 1.0)

        # Save the context with the dataset, so that it can be searched later
        if not nodbwrite:
            handler.saveContext(datasetName, Session)
        datasets.append(dataset)

    return datasets
示例#10
0
def esgpublishWrapper(**kw):
    """
    Keyword-driven front end: build the list of datasets, scan them into the
    database (unless ``publishOnly``), then hand them to ``publishDatasetList``.

    The datasets are determined in one of four ways, checked in this order:

    1. ``datasetMapfile`` is set: the map file is read with ``readDatasetMap``.
    2. ``rescan`` is true: the map (and the offline flag) come from
       ``queryDatasetMap(rescanDatasetName, Session)``.
    3. Online (``offline`` false): a directory map is generated from
       ``directoryList`` by the project handler. If ``projectName`` is None the
       handler is chosen from the first file matching ``filefilt``.
    4. Offline: the project's offline lister is run over ``directoryList`` and
       its output is collected into a dataset map keyed by ``(name, -1)``.

    Recognized keywords (defaults in parentheses):
      aggregateDimension ("time"), datasetMapfile (None), datasetName (None),
      directoryList (None), echoSql (False), filefilt (r".*\\.nc$"),
      init_file (None), initcontext ({}), keepVersion (False), las (False),
      log_filename (None), masterGateway (None), message (None),
      offline (False), parent (None), perVariable (None), projectName (None),
      properties ({}), publish (False), publishOnly (False),
      publishOp (CREATE_OP), readFiles (False), readFromCatalog (False),
      reinitThredds (None), rescan (False), rescanDatasetName ([]),
      service (None), thredds (False), threddsCatalogDictionary (None),
      version (None).

    ``summarizeErrors``, ``testProgress1`` and ``testProgress2`` may be passed
    but are not used by this function.

    Returns whatever ``publishDatasetList`` returns.

    Raises ESGPublishError if an offline publication has no project name, if a
    replication (``masterGateway`` set) has no ``version``, if no file matches
    ``filefilt`` while auto-detecting the project, or if no project handler
    recognizes the first file.
    """

    from esgcet.query import queryDatasetMap

    aggregateDimension = kw.get("aggregateDimension", "time")
    datasetMapfile = kw.get("datasetMapfile", None)
    datasetName = kw.get("datasetName", None)
    directoryList = kw.get("directoryList", None)
    echoSql = kw.get("echoSql", False)
    filefilt = kw.get("filefilt", r".*\.nc$")
    init_file = kw.get("init_file", None)
    initcontext = kw.get("initcontext", {})
    keepVersion = kw.get("keepVersion", False)
    las = kw.get("las", False)
    log_filename = kw.get("log_filename", None)
    masterGateway = kw.get("masterGateway", None)
    message = kw.get("message", None)
    offline = kw.get("offline", False)
    parent = kw.get("parent", None)
    perVariable = kw.get("perVariable", None)
    projectName = kw.get("projectName", None)
    properties = kw.get("properties", {})
    publish = kw.get("publish", False)
    publishOnly = kw.get("publishOnly", False)
    publishOp = kw.get("publishOp", CREATE_OP)
    readFiles = kw.get("readFiles", False)
    readFromCatalog = kw.get("readFromCatalog", False)
    reinitThredds = kw.get("reinitThredds", None)
    rescan = kw.get("rescan", False)
    rescanDatasetName = kw.get("rescanDatasetName", [])
    service = kw.get("service", None)
    thredds = kw.get("thredds", False)
    threddsCatalogDictionary = kw.get("threddsCatalogDictionary", None)
    version = kw.get("version", None)

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    # Must specify version for replications
    if masterGateway is not None and version is None:
        raise ESGPublishError("Must specify version with --new-version for replicated datasets")

    # Load the configuration and set up a database connection
    config, Session = initdb(init_file=init_file, echoSql=echoSql, log_filename=log_filename)

    # Register project handlers
    registerHandlers()

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = dmap.keys()

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = dmap.keys()

    # ... otherwise generate the directory map.
    elif not offline:
        # Online dataset(s)
        if projectName is not None:
            handler = getHandlerByName(projectName, None, Session)
        else:
            # Only the first matching file is needed to identify the project;
            # the handler walks the directories itself further below.
            multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
            try:
                firstFile, _ = next(multiIter)
            except StopIteration:
                raise ESGPublishError(
                    "No files matching the file filter %s were found, cannot determine the project." % filefilt
                )
            handler = getHandler(firstFile, Session, validate=True)
            if handler is None:
                raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
            projectName = handler.name

        # initcontext entries take precedence over properties with the same key
        props = properties.copy()
        props.update(initcontext)
        if readFiles:
            generateMap = handler.generateDirectoryMapFromFiles
        else:
            generateMap = handler.generateDirectoryMap
        directoryMap = generateMap(directoryList, filefilt, initContext=props, datasetName=datasetName)

        # -1 is the "no explicit version" marker used throughout the dataset lists
        datasetNames = [(item, -1) for item in directoryMap.keys()]

    # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
    else:
        handler = getHandlerByName(projectName, None, Session, offline=True)
        dmap = {}
        listerSection = getOfflineLister(config, "project:%s" % projectName, service)
        offlineLister = config.get(listerSection, "offline_lister_executable")
        commandArgs = "--config-section %s " % listerSection
        commandArgs += " ".join(directoryList)
        for dsetName, filepath, sizet in processNodeMatchIterator(
            offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True
        ):
            size, _ = sizet
            dmap.setdefault((dsetName, -1), []).append((filepath, str(size)))

        datasetNames = dmap.keys()

    datasetNames = sorted(datasetNames)
    if not datasetNames:
        warning("No datasets found.")

    # Iterate over datasets
    if not publishOnly:
        iterateOverDatasets(
            projectName,
            dmap,
            directoryMap,
            datasetNames,
            Session,
            aggregateDimension,
            publishOp,
            filefilt,
            initcontext,
            offline,
            properties,
            keepVersion=keepVersion,
            newVersion=version,
            extraFields=extraFields,
            masterGateway=masterGateway,
            comment=message,
            readFiles=readFiles,
        )

    return publishDatasetList(
        datasetNames,
        Session,
        publish=publish,
        thredds=thredds,
        las=las,
        parentId=parent,
        service=service,
        perVariable=perVariable,
        threddsCatalogDictionary=threddsCatalogDictionary,
        reinitThredds=reinitThredds,
        readFromCatalog=readFromCatalog,
    )
示例#11
0
def _isNewMapEntry(appendMap, datasetId, filepath, size):
    """Return True if a map entry for this file should be written.

    An entry is written when no existing map is being appended to
    (``appendMap`` is None), when the existing map does not contain
    ``datasetId``, or when it contains the dataset but not the pair
    ``(filepath, "<size>")``.
    """
    if appendMap is None or datasetId not in appendMap:
        return True
    return (filepath, "%d" % size) not in appendMap[datasetId]


def esgscanWrapper(directoryList, **kw):
    """Scan ``directoryList`` and write a dataset map, one line per file.

    Recognized keyword arguments (all optional):

    - ``appendPath``: path of a map file to extend. If the file exists it is
      read with ``readDatasetMap`` and entries it already lists are not
      written again. Unless ``outputPath`` is given, new entries are appended
      to this file.
    - ``outputPath``: path of a file to (over)write with the generated
      entries. Takes precedence over ``appendPath`` as the destination.
      With neither path, entries go to ``sys.stdout``.
    - ``datasetName``: forwarded to the handler / offline lister.
    - ``fileFilt``: file filter pattern, default ``.*\\.nc$``.
    - ``initFile``: forwarded to ``initdb`` as ``init_file``.
    - ``offline``: if true, list files through the project's offline lister
      instead of the local directory iterators.
    - ``projectName``: project handler name; required when ``offline`` is set.
    - ``readFiles``: if true, build the map with
      ``generateDirectoryMapFromFiles`` and emit one entry per sample file
      rather than one per matching file in each directory.
    - ``service``: forwarded to ``getOfflineLister``.

    Raises ESGPublishError if no directory is given, if no project can be
    determined from the first file, or if ``offline`` is set without a
    project name.
    """
    if not directoryList:
        raise ESGPublishError("No directory specified")

    appendPath = kw.get("appendPath", None)
    datasetName = kw.get("datasetName", None)
    filefilt = kw.get("fileFilt", r".*\.nc$")
    init_file = kw.get("initFile", None)
    offline = kw.get("offline", False)
    outputPath = kw.get("outputPath", None)
    projectName = kw.get("projectName", None)
    readFiles = kw.get("readFiles", False)
    service = kw.get("service", None)

    # Load the existing map (if any) before the file is opened for writing.
    appendMap = None
    if appendPath is not None:
        if os.path.exists(appendPath):
            appendMap = readDatasetMap(appendPath)
        else:
            appendMap = {}

    # Choose exactly one destination so that an append-mode handle is never
    # opened and then silently replaced by stdout or another file.
    if outputPath is not None:
        output = open(outputPath, "w")
    elif appendPath is not None:
        output = open(appendPath, "a")
    else:
        output = sys.stdout

    try:
        # Load the configuration and set up a database connection
        config, Session = initdb(init_file=init_file)

        # Register project handlers
        registerHandlers()

        if not offline:

            # Determine if checksumming is enabled
            line = config.get("DEFAULT", "checksum", default=None)
            if line is not None:
                checksumClient, checksumType = splitLine(line)
            else:
                checksumClient = checksumType = None

            # Without an explicit project, identify it from the first file found.
            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(directoryList, filefilt=filefilt)
                firstFile, size = multiIter.next()
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
                projectName = handler.name

            if not readFiles:
                datasetMap = handler.generateDirectoryMap(directoryList, filefilt, datasetName=datasetName)
            else:
                datasetMap = handler.generateDirectoryMapFromFiles(directoryList, filefilt, datasetName=datasetName)

            # Output the map in sorted dataset order
            for datasetId in sorted(datasetMap.keys()):
                direcTuple = datasetMap[datasetId]
                direcTuple.sort()
                for nodepath, samplePath in direcTuple:

                    # If readFiles is not set, generate a map entry for each file in the
                    # directory that matches filefilt ...
                    if not readFiles:
                        itr = directoryIterator(nodepath, filefilt=filefilt, followSubdirectories=False)
                    # ... otherwise generate a map entry for the file itself.
                    else:
                        itr = fnIterator([samplePath])

                    for filepath, sizet in itr:
                        size, mtime = sizet
                        extraStuff = "mod_time=%f" % float(mtime)

                        if checksumClient is not None:
                            csum = checksum(filepath, checksumClient)
                            extraStuff += " | checksum=%s | checksum_type=%s" % (csum, checksumType)

                        if _isNewMapEntry(appendMap, datasetId, filepath, size):
                            output.write("%s | %s | %d | %s\n" % (datasetId, filepath, size, extraStuff))
        else:  # offline
            if projectName is None:
                raise ESGPublishError("Must specify --project for offline datasets.")
            handler = getHandlerByName(projectName, None, Session, offline=True)
            listerSection = getOfflineLister(config, "project:%s" % projectName, service)
            offlineLister = config.get(listerSection, "offline_lister_executable")
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(directoryList)
            for dsetName, filepath, sizet in processNodeMatchIterator(
                offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True
            ):
                size, mtime = sizet
                # The modification time is only reported when the lister supplies one.
                extrastuff = ""
                if mtime is not None:
                    extrastuff = "| mod_time=%f" % float(mtime)
                if _isNewMapEntry(appendMap, dsetName, filepath, size):
                    output.write("%s | %s | %d %s\n" % (dsetName, filepath, size, extrastuff))
    finally:
        # Close the map file even when scanning fails part-way through.
        if output is not sys.stdout:
            output.close()
示例#12
0
def main(argv):
    """Parse the publisher command line in ``argv`` and scan/publish datasets.

    ``argv`` is the argument list without the program name. Positional
    arguments remaining after option parsing are the directories to scan.

    The datasets to process come from one of three sources, in priority
    order: a dataset map file (``--map``), datasets already in the database
    (``--use-existing`` / ``--use-list``), or a scan of the given directories
    (online, or through the offline lister when ``--offline`` is set).
    Unless ``--noscan`` is given the datasets are passed to
    ``iterateOverDatasets``; unless ``--nodbwrite`` is given they are then
    passed to ``publishDatasetList``. With ``--summarize-errors`` the
    warnings recorded for each dataset are printed at the end.

    An unrecognized option, or an online scan with no directories, prints a
    message and returns without doing anything.

    Raises ESGPublishError for an invalid ``--new-version`` value, for
    ``--offline`` without ``--project``, for ``--new-version`` combined with
    ``--version-list``, when no project can be identified from the first
    file, and when a replica is requested without a version.
    """
    longOptions = [
        'append', 'create', 'dataset=', 'delete-files', 'echo-sql',
        'experiment=', 'filter=', 'help', 'keep-version', 'log=', 'map=',
        'message=', 'model=', 'offline', 'parent=', 'per-time',
        'per-variable', 'project=', 'property=', 'publish', 'new-version=',
        'no-thredds-reinit', 'noscan', 'read-directories', 'read-files',
        'rename-files', 'replace', 'replica=', 'rest-api', 'service=',
        'set-replica', 'summarize-errors', 'thredds', 'thredds-reinit',
        'update', 'use-existing=', 'use-list=', 'validate=', 'version-list=',
        'nodbwrite',
    ]
    try:
        args, lastargs = getopt.getopt(argv, "a:cdehi:m:p:ru", longOptions)
    except getopt.error:
        print(sys.exc_info()[1])
        return

    aggregateDimension = "time"
    datasetMapfile = None
    datasetName = None
    echoSql = False
    filefilt = r'.*\.nc$'
    init_file = None
    initcontext = {}
    keepVersion = False
    las = False
    log_filename = None
    masterGateway = None
    message = None
    offline = False
    parent = None
    perVariable = None
    projectName = None
    properties = {}
    publish = False
    publishOnly = False
    publishOp = CREATE_OP
    readFiles = False
    rescan = False
    rescanDatasetName = []
    restApi = None
    schema = None
    service = None
    summarizeErrors = False
    testProgress1 = testProgress2 = None
    thredds = False
    threddsReinit = None
    version = None
    versionList = None
    nodbwrite = False

    for flag, arg in args:
        if flag == '-a':
            aggregateDimension = arg
        elif flag == '--append':
            publishOp = UPDATE_OP
        elif flag in ['-c', '--create']:
            publishOp = CREATE_OP
        elif flag == '--dataset':
            datasetName = arg
        elif flag in ['-d', '--delete-files']:
            publishOp = DELETE_OP
        elif flag == '--echo-sql':
            echoSql = True
        elif flag == '--experiment':
            initcontext['experiment'] = arg
        elif flag == '--filter':
            filefilt = arg
        elif flag in ['-h', '--help']:
            print(usage)
            sys.exit(0)
        elif flag == '-i':
            init_file = arg
        elif flag == '--keep-version':
            keepVersion = True
        elif flag == '--log':
            log_filename = arg
        elif flag == '--map':
            datasetMapfile = arg
        elif flag in ['-m', '--message']:
            message = arg
        elif flag == '--model':
            initcontext['model'] = arg
        elif flag == '--nodbwrite':
            nodbwrite = True
        elif flag == '--new-version':
            try:
                version = int(arg)
                if version <= 0:
                    raise ValueError
            except ValueError:
                raise ESGPublishError("Version number must be a positive integer: %s" % arg)
        elif flag == '--no-thredds-reinit':
            threddsReinit = False
        elif flag == '--noscan':
            publishOnly = True
        elif flag == '--offline':
            offline = True
        elif flag == '--parent':
            parent = arg
        elif flag == '--per-time':
            perVariable = False
        elif flag == '--per-variable':
            perVariable = True
        elif flag == '--project':
            projectName = arg
        elif flag in ['-p', '--property']:
            # Split on the first '=' only so that values may contain '='.
            name, value = arg.split('=', 1)
            properties[name] = value
        elif flag == '--publish':
            publish = True
        elif flag in ['-e', '--read-directories']:
            readFiles = False
        elif flag == '--read-files':
            readFiles = True
        elif flag == '--rename-files':
            publishOp = RENAME_OP
        elif flag in ['-r', '--replace']:
            publishOp = REPLACE_OP
        elif flag == '--replica':
            masterGateway = arg
            warning("The --replica option is deprecated. Use --set-replica instead")
        elif flag == '--rest-api':
            restApi = True
        elif flag == '--service':
            service = arg
        elif flag == '--set-replica':
            masterGateway = 'DEFAULT'
        elif flag == '--summarize-errors':
            summarizeErrors = True
        elif flag == '--thredds':
            thredds = True
        elif flag == '--thredds-reinit':
            threddsReinit = True
        elif flag in ['-u', '--update']:
            publishOp = UPDATE_OP
        elif flag == '--use-existing':
            rescan = True
            rescanDatasetName.append(arg)
        elif flag == '--use-list':
            rescan = True
            # '-' means read the dataset names from standard input.
            if arg == '-':
                namelist = sys.stdin
            else:
                namelist = open(arg)
            try:
                for line in namelist.readlines():
                    line = line.strip()
                    # Skip blank lines and '#' comment lines.
                    if line and not line.startswith('#'):
                        rescanDatasetName.append(line)
            finally:
                if namelist is not sys.stdin:
                    namelist.close()
        elif flag == '--validate':
            # Setting a validation schema also selects the REST interface.
            schema = arg
            restApi = True
        elif flag == '--version-list':
            versionList = arg

    # If offline, the project must be specified
    if offline and (projectName is None):
        raise ESGPublishError("Must specify project with --project for offline datasets")

    if version is not None and versionList is not None:
        raise ESGPublishError("Cannot specify both --new-version and --version-list")

    # A version list maps dataset ids to versions, one 'dataset_id | version' per line.
    if versionList is not None:
        version = {}
        f = open(versionList)
        try:
            lines = f.readlines()
        finally:
            f.close()
        for line in lines:
            line = line.strip()
            if not line:
                continue
            dsid, vers = line.split('|')
            version[dsid.strip()] = int(vers.strip())

    # Load the configuration and set up a database connection
    config = loadConfig(init_file)
    engine = create_engine(config.getdburl('extract'), echo=echoSql, pool_recycle=3600)
    initLogging('extract', override_sa=engine, log_filename=log_filename)
    Session = sessionmaker(bind=engine, autoflush=True, autocommit=False)

    # Register project handlers
    registerHandlers()

    # Get the default publication interface (REST or Hessian)
    if restApi is None:
        restApi = config.getboolean('DEFAULT', 'use_rest_api', default=False)

    # If the dataset map is input, just read it ...
    dmap = None
    directoryMap = None
    extraFields = None
    if datasetMapfile is not None:
        dmap, extraFields = readDatasetMap(datasetMapfile, parse_extra_fields=True)
        datasetNames = list(dmap.keys())

    elif rescan:
        # Note: No need to get the extra fields, such as mod_time, since
        # they are already in the database, and will be used for file comparison if necessary.
        dmap, offline = queryDatasetMap(rescanDatasetName, Session)
        datasetNames = list(dmap.keys())

    # ... otherwise generate the directory map.
    else:
        # Online dataset(s)
        if not offline:
            if len(lastargs) == 0:
                print("No directories specified.")
                return

            # Without an explicit project, identify it from the first file found.
            if projectName is not None:
                handler = getHandlerByName(projectName, None, Session)
            else:
                multiIter = multiDirectoryIterator(lastargs, filefilt=filefilt)
                firstFile, size = multiIter.next()
                handler = getHandler(firstFile, Session, validate=True)
                if handler is None:
                    raise ESGPublishError("No project found in file %s, specify with --project." % firstFile)
                projectName = handler.name

            # Command-line properties and the initial context together seed the handler context.
            props = properties.copy()
            props.update(initcontext)
            if not readFiles:
                directoryMap = handler.generateDirectoryMap(lastargs, filefilt, initContext=props, datasetName=datasetName)
            else:
                directoryMap = handler.generateDirectoryMapFromFiles(lastargs, filefilt, initContext=props, datasetName=datasetName)
            datasetNames = [(item, -1) for item in directoryMap.keys()]

        # Offline dataset. Format the spec as a dataset map : dataset_name => [(path, size), (path, size), ...]
        else:
            handler = getHandlerByName(projectName, None, Session, offline=True)
            dmap = {}
            listerSection = getOfflineLister(config, "project:%s" % projectName, service)
            offlineLister = config.get(listerSection, 'offline_lister_executable')
            commandArgs = "--config-section %s " % listerSection
            commandArgs += " ".join(lastargs)
            for dsetName, filepath, sizet in processNodeMatchIterator(offlineLister, commandArgs, handler, filefilt=filefilt, datasetName=datasetName, offline=True):
                size, mtime = sizet
                dmap.setdefault((dsetName, -1), []).append((filepath, str(size)))

            datasetNames = list(dmap.keys())

    # Dataset names are (name, version) pairs; -1 marks an unspecified version.
    datasetNames.sort()
    if len(datasetNames) == 0:
        warning("No datasets found.")
        min_version = -1
    else:
        min_version = min([item[1] for item in datasetNames])

    # Must specify version for replications
    if min_version == -1 and masterGateway is not None and version is None and versionList is None:
        raise ESGPublishError("Must specify version with --new-version (or --version-list) for replicated datasets")

    # Iterate over datasets
    if not publishOnly:
        datasets = iterateOverDatasets(projectName, dmap, directoryMap, datasetNames, Session, aggregateDimension, publishOp, filefilt, initcontext, offline, properties, keepVersion=keepVersion, newVersion=version, extraFields=extraFields, masterGateway=masterGateway, comment=message, readFiles=readFiles, nodbwrite=nodbwrite)

    if not nodbwrite:
        result = publishDatasetList(datasetNames, Session, publish=publish, thredds=thredds, las=las, parentId=parent, service=service, perVariable=perVariable, reinitThredds=threddsReinit, restInterface=restApi, schema=schema)

    if summarizeErrors:
        print('Summary of errors:')
        for name, versionno in datasetNames:
            dset = Dataset.lookup(name, Session)
            print("%s %s %s %s %s" % (dset.get_name(Session), dset.get_project(Session), dset.get_model(Session), dset.get_experiment(Session), dset.get_run_name(Session)))
            if dset.has_warnings(Session):
                print('=== Dataset: %s ===' % dset.name)
                for line in dset.get_warnings(Session):
                    print(line)