Exemplo n.º 1
0
def split_by_lumi(config, dataset_info, task_list):
    """Split a dataset into tasks of a fixed number of lumis and save them.

    Files are walked in the order given by ``dataset_info.files``.  Their
    lumis are accumulated and a task is emitted every time at least
    ``config['lumis per task']`` lumis are pending.  Whatever is still
    pending once the files are exhausted becomes one final, smaller task.

    Args:
        config: mapping providing 'lumis per task' and, optionally,
            'lumi mask' (handed to ``LumiList(filename=...)``).
        dataset_info: object exposing ``files`` (iterable of file names),
            ``lumis`` (mapping file name -> list of lumis) and
            ``total_lumis``.  When a lumi mask is configured, ``lumis`` is
            replaced by its filtered version and ``total_lumis`` is
            recomputed.
        task_list: path of the JSON file the list of tasks is dumped to.

    Returns:
        The number of tasks written.

    Raises:
        ValueError: if 'lumis per task' is not a positive number (the
            splitting loop could never terminate with such a value).
    """
    if 'lumi mask' in config:
        lumi_mask = LumiList(filename=config['lumi mask'])
        dataset_info.total_lumis = 0
        for name in dataset_info.files:
            dataset_info.lumis[name] = lumi_mask.filterLumis(dataset_info.lumis[name])
            dataset_info.total_lumis += len(dataset_info.lumis[name])

    lumis_per_task = config['lumis per task']
    if lumis_per_task <= 0:
        raise ValueError("'lumis per task' must be positive, got %r" % (lumis_per_task,))

    lumis_processed = 0
    tasks = []
    files = iter(dataset_info.files)
    # Both accumulators start empty and are owned by this function, so
    # extending them never touches the lists stored in dataset_info.lumis.
    input_files_this_task = []
    task_lumis_remaining = []
    while lumis_processed < dataset_info.total_lumis:
        # Drop the files whose lumis have all been handed out already.
        # A new list is built instead of removing entries while iterating.
        pending = set(task_lumis_remaining)
        input_files_this_task = [name for name in input_files_this_task
                                 if pending.intersection(dataset_info.lumis[name])]

        while lumis_per_task <= len(task_lumis_remaining):
            task_lumis = LumiList(lumis=task_lumis_remaining[:lumis_per_task])
            task_lumis_remaining = task_lumis_remaining[lumis_per_task:]
            # Store a snapshot: the working list keeps changing afterwards.
            tasks.append((list(input_files_this_task),
                          task_lumis.getVLuminosityBlockRange()))
            lumis_processed += lumis_per_task

        try:
            name = next(files)
        except StopIteration:
            if not task_lumis_remaining:
                # Nothing left to hand out; stop even if total_lumis
                # claims there should be more.
                break
            # Last round: flush whatever is still pending as one task.
            lumis_per_task = len(task_lumis_remaining)
        else:
            input_files_this_task.append(name)
            task_lumis_remaining.extend(dataset_info.lumis[name])

    with open(task_list, 'w') as json_file:
        json.dump(tasks, json_file)

    return len(tasks)
Exemplo n.º 2
0
    def fetchDBSInfo(self):
        """
        Contact DBS (DBS3 only) and fill the per-file / per-block tables.

        Reads the ``CMSSW.*`` options from ``self.cfg_params``, queries the
        dataset through ``self.queryDbs3`` and then fills ``self.parent``,
        ``self.lumis``, ``self.eventsPerBlock``, ``self.eventsPerFile``,
        ``self.blocksinfo``, ``self.maxEvents`` and ``self.maxLumis``.

        Raises:
            CrabException: if split_by_run is requested without a run
                selection, if the unsupported ``CMSSW.fileblocks_file``
                option is set, if real data is not split by run, lumi or
                event, or if no (new) file block is found for the dataset.
        """
        # verify_dbs_url still reports DBS2 settings, but DBS2 is gone:
        # only the DBS3 URL is used and the former DBS2 code path (which
        # could never run) has been removed.
        (_useDBS2, _useDBS3, _dbs2_url, dbs3_url) = verify_dbs_url(self)
        dbs_url = dbs3_url

        common.logger.info("Accessing DBS at: %s" % dbs_url)

        ## check if runs are selected
        runselection = []
        if 'CMSSW.runselection' in self.cfg_params:
            runselection = parseRange2(self.cfg_params['CMSSW.runselection'])
            if len(runselection) > 1000000:
                common.logger.info("ERROR: runselection range has more then 1M numbers")
                common.logger.info("ERROR: Too large. runselection is ignored")
                runselection = []

        ## check if various lumi parameters are set
        self.lumiMask = self.cfg_params.get('CMSSW.lumi_mask', None)
        self.lumiParams = self.cfg_params.get('CMSSW.total_number_of_lumis', None) or \
                          self.cfg_params.get('CMSSW.lumis_per_job', None)

        lumiList = None
        if self.lumiMask:
            lumiList = LumiList(filename=self.lumiMask)
        runList = None
        if runselection:
            runList = LumiList(runs=runselection)

        self.splitByRun = int(self.cfg_params.get('CMSSW.split_by_run', 0))
        self.splitDataByEvent = int(self.cfg_params.get('CMSSW.split_by_event', 0))
        common.logger.log(10-1, "runselection is: %s" % runselection)

        # splitByLumi is only (re)computed here when not splitting by run
        if not self.splitByRun:
            self.splitByLumi = self.lumiMask or self.lumiParams or self.ads

        if self.splitByRun and not runselection:
            msg = "Error: split_by_run must be combined with a runselection"
            raise CrabException(msg)

        ## check if has been requested to use the parent info
        useparent = int(self.cfg_params.get('CMSSW.use_parent', 0))

        defaultName = common.work_space.shareDir()+'AnalyzedBlocks.txt'
        ## a non default file to store/read analyzed fileBlocks is not supported
        if self.cfg_params.get('CMSSW.fileblocks_file'):
            msg = "CMSSW.fileblocks_file option non supported"
            raise CrabException(msg)
        fileBlocks_FileName = os.path.abspath(defaultName)

        ## look the dataset up in DBS3 and time the query
        from dbs.apis.dbsClient import DbsApi
        start_time = time.time()
        api3 = DbsApi(dbs3_url)
        self.files = self.queryDbs3(api3, path=self.datasetPath,
                                    runselection=runselection, useParent=useparent)
        elapsed = time.time() - start_time
        common.logger.info("DBS3 lookup took %5.2f sec" % elapsed)

        # Check to see what the dataset is
        dataType = api3.listDataTypes(dataset=self.datasetPath)[0]['data_type']

        common.logger.info("Datatype is %s" % dataType)
        if dataType == 'data' and not \
            (self.splitByRun or self.splitByLumi or self.splitDataByEvent):
            msg = 'Data must be split by lumi or by run. ' \
                  'Please see crab -help for the correct settings'
            raise CrabException(msg)

        anFileBlocks = []
        if self.skipBlocks:
            anFileBlocks = readTXTfile(self, fileBlocks_FileName)

        # parse files and fill arrays
        for fileInfo in self.files:
            fileblock = fileInfo['Block']['Name']
            # skip already analyzed blocks
            if fileblock in anFileBlocks:
                continue

            filename = fileInfo['LogicalFileName']

            # list of parents for the given child, only when asked for
            parList = []
            if useparent == 1:
                parList = [x['LogicalFileName'] for x in fileInfo['ParentList']]
            self.parent[filename] = parList

            fileLumis = []  # List of (run, lumi section) tuples
            if self.splitByLumi:
                fileLumis = [(x['RunNumber'], x['LumiSectionNumber'])
                             for x in fileInfo['LumiList']]
            # Intersect with the lumi mask first, then with the run selection
            if self.lumiMask:
                fileLumis = lumiList.filterLumis(fileLumis)
            if runselection:
                fileLumis = runList.filterLumis(fileLumis)
            self.lumis[filename] = fileLumis

            # file names containing '.dat' are left out of the event counts
            if '.dat' in filename:
                continue

            events = fileInfo['NumberOfEvents']
            # Count number of events per block (direct dict membership,
            # no intermediate key list)
            if fileblock in self.eventsPerBlock:
                self.eventsPerBlock[fileblock] += events
            else:
                self.eventsPerBlock[fileblock] = events
            # Number of events per file
            self.eventsPerFile[filename] = events

            # List of files per block
            self.blocksinfo.setdefault(fileblock, []).append(filename)

            # total number of events and lumis
            self.maxEvents += events
            self.maxLumis += len(self.lumis[filename])

        if self.skipBlocks and not self.eventsPerBlock:
            msg = "No new fileblocks available for dataset: "+str(self.datasetPath)
            raise CrabException(msg)

        if len(self.eventsPerBlock) <= 0:
            msg = "No data for %s in DBS\n Check datasetpath parameter in crab.cfg" % self.datasetPath
            raise CrabException(msg)
Exemplo n.º 3
0
    def fetchDBSInfo(self):
        """
        Query DBS for the dataset and fill the file/block bookkeeping.

        Options are read from ``self.cfg_params`` (``CMSSW.*`` keys).  The
        files returned by ``self.queryDbs`` are used to fill
        ``self.parent``, ``self.lumis``, ``self.eventsPerBlock``,
        ``self.eventsPerFile``, ``self.blocksinfo``, ``self.maxEvents`` and
        ``self.maxLumis``.

        Raises:
            CrabException: split_by_run without a run selection, real data
                not split by run/lumi/event, or no new file block left
                when ``self.skipBlocks`` is set.
            NotExistingDatasetError: no file block at all was found.
        """
        cfg = self.cfg_params

        ## DBS endpoint: the user's choice, else the global instance
        dbs_url = cfg.get(
            'CMSSW.dbs_url',
            "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet")
        common.logger.info("Accessing DBS at: " + dbs_url)

        ## optional run selection; an oversized one is dropped with a notice
        runselection = []
        if 'CMSSW.runselection' in cfg:
            runselection = parseRange2(cfg['CMSSW.runselection'])
            if len(runselection) > 1000000:
                for notice in (
                        "ERROR: runselection range has more then 1M numbers",
                        "ERROR: Too large. runselection is ignored"):
                    common.logger.info(notice)
                runselection = []

        ## lumi related settings
        self.lumiMask = cfg.get('CMSSW.lumi_mask', None)
        self.lumiParams = (cfg.get('CMSSW.total_number_of_lumis', None)
                           or cfg.get('CMSSW.lumis_per_job', None))

        lumiList = LumiList(filename=self.lumiMask) if self.lumiMask else None
        if runselection:
            runList = LumiList(runs=runselection)

        self.splitByRun = int(cfg.get('CMSSW.split_by_run', 0))
        self.splitDataByEvent = int(cfg.get('CMSSW.split_by_event', 0))
        common.logger.log(10 - 1, "runselection is: %s" % runselection)

        # splitByLumi is only assigned here when not splitting by run
        if not self.splitByRun:
            self.splitByLumi = self.lumiMask or self.lumiParams or self.ads
        elif not runselection:
            raise CrabException(
                "Error: split_by_run must be combined with a runselection")

        ## parent information requested?
        useparent = int(cfg.get('CMSSW.use_parent', 0))

        ## file used to store/read the already analyzed fileBlocks
        defaultName = common.work_space.shareDir() + 'AnalyzedBlocks.txt'
        fileBlocks_FileName = os.path.abspath(
            cfg.get('CMSSW.fileblocks_file', defaultName))

        ## service API
        api = DBSAPI.dbsApi.DbsApi({'url': dbs_url, 'level': 'CRITICAL'})
        self.files = self.queryDbs(api,
                                   path=self.datasetPath,
                                   runselection=runselection,
                                   useParent=useparent)

        # The primary dataset type tells whether this is real data
        pdsName = self.datasetPath.split("/")[1]
        dataType = api.listPrimaryDatasets(pdsName)[0]['Type']
        common.logger.debug("Datatype is %s" % dataType)
        splitRequested = (self.splitByRun or self.splitByLumi
                          or self.splitDataByEvent)
        if dataType == 'data' and not splitRequested:
            raise CrabException('Data must be split by lumi or by run. '
                                'Please see crab -help for the correct settings')

        anFileBlocks = []
        if self.skipBlocks:
            anFileBlocks = readTXTfile(self, fileBlocks_FileName)

        # walk the files and fill the tables
        for entry in self.files:
            fileblock = entry['Block']['Name']
            if fileblock in anFileBlocks:
                # block already analyzed
                continue

            filename = entry['LogicalFileName']

            # parents of this file, only when asked for
            parList = []
            if useparent == 1:
                parList = [p['LogicalFileName'] for p in entry['ParentList']]
            self.parent[filename] = parList

            # (run, lumi section) tuples, only when splitting by lumi
            fileLumis = []
            if self.splitByLumi:
                fileLumis = [(l['RunNumber'], l['LumiSectionNumber'])
                             for l in entry['LumiList']]

            # intersect with the lumi mask first, then the run selection
            if self.lumiMask:
                fileLumis = lumiList.filterLumis(fileLumis)
            if runselection:
                fileLumis = runList.filterLumis(fileLumis)
            self.lumis[filename] = fileLumis

            # names containing '.dat' do not enter the event bookkeeping
            if '.dat' in filename:
                continue

            events = entry['NumberOfEvents']

            # events per block
            if fileblock in self.eventsPerBlock:
                self.eventsPerBlock[fileblock] += events
            else:
                self.eventsPerBlock[fileblock] = events

            # events per file
            self.eventsPerFile[filename] = events

            # files per block
            self.blocksinfo.setdefault(fileblock, []).append(filename)

            # running totals
            self.maxEvents += events
            self.maxLumis += len(self.lumis[filename])

        if len(self.eventsPerBlock) == 0:
            if self.skipBlocks:
                raise CrabException(
                    "No new fileblocks available for dataset: "
                    + str(self.datasetPath))
            raise NotExistingDatasetError(
                "\nNo data for %s in DBS\nPlease check"
                " dataset path variables in crab.cfg" % self.datasetPath)
Exemplo n.º 4
0
    def fetchDBSInfo(self):
        """
        Look the dataset up in DBS and record its files, blocks and lumis.

        Configuration comes from the ``CMSSW.*`` keys of
        ``self.cfg_params``.  On return ``self.files`` holds the result of
        ``self.queryDbs`` and ``self.parent``, ``self.lumis``,
        ``self.eventsPerBlock``, ``self.eventsPerFile``,
        ``self.blocksinfo``, ``self.maxEvents`` and ``self.maxLumis`` have
        been updated from it.

        Raises:
            CrabException: split_by_run without a run selection, real data
                not split by run/lumi/event, or (with ``self.skipBlocks``)
                no file block that was not analyzed yet.
            NotExistingDatasetError: no file block found for the dataset.
        """
        option = self.cfg_params.get

        ## get DBS URL (global instance unless configured otherwise)
        dbs_url = option(
            'CMSSW.dbs_url',
            "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet")
        common.logger.info("Accessing DBS at: "+dbs_url)

        ## runs selected by the user, if any
        if 'CMSSW.runselection' in self.cfg_params:
            runselection = parseRange2(self.cfg_params['CMSSW.runselection'])
        else:
            runselection = []

        ## lumi mask and lumi based splitting parameters
        self.lumiMask = option('CMSSW.lumi_mask', None)
        self.lumiParams = option('CMSSW.total_number_of_lumis', None) or \
                          option('CMSSW.lumis_per_job', None)

        # Filters applied, in this order, to the lumis of every file
        lumiFilters = []
        if self.lumiMask:
            lumiFilters.append(LumiList(filename=self.lumiMask))
        if runselection:
            lumiFilters.append(LumiList(runs=runselection))

        self.splitByRun = int(option('CMSSW.split_by_run', 0))
        self.splitDataByEvent = int(option('CMSSW.split_by_event', 0))
        common.logger.log(10-1, "runselection is: %s" % runselection)

        if self.splitByRun:
            if not runselection:
                msg = "Error: split_by_run must be combined with a runselection"
                raise CrabException(msg)
        else:
            # only assigned here when not splitting by run
            self.splitByLumi = self.lumiMask or self.lumiParams or self.ads

        ## service API arguments
        args = {'url': dbs_url, 'level': 'CRITICAL'}

        ## parent info requested?
        useparent = int(option('CMSSW.use_parent', 0))

        ## file to store/read analyzed fileBlocks (default in the share dir)
        defaultName = common.work_space.shareDir()+'AnalyzedBlocks.txt'
        fileBlocks_FileName = os.path.abspath(
            option('CMSSW.fileblocks_file', defaultName))

        api = DBSAPI.dbsApi.DbsApi(args)
        self.files = self.queryDbs(api, path=self.datasetPath,
                                   runselection=runselection,
                                   useParent=useparent)

        # Check to see what the dataset is
        pdsName = self.datasetPath.split("/")[1]
        primDSs = api.listPrimaryDatasets(pdsName)
        dataType = primDSs[0]['Type']
        common.logger.debug("Datatype is %s" % dataType)
        if dataType == 'data':
            if not (self.splitByRun or self.splitByLumi or self.splitDataByEvent):
                msg = 'Data must be split by lumi or by run. ' \
                      'Please see crab -help for the correct settings'
                raise CrabException(msg)

        if self.skipBlocks:
            anFileBlocks = readTXTfile(self, fileBlocks_FileName)
        else:
            anFileBlocks = []

        # parse files and fill arrays
        for record in self.files:
            fileblock = record['Block']['Name']
            # skip already analyzed blocks
            if fileblock in anFileBlocks:
                continue

            filename = record['LogicalFileName']

            # parents of the file, when requested
            if useparent == 1:
                self.parent[filename] = [
                    parent['LogicalFileName'] for parent in record['ParentList']]
            else:
                self.parent[filename] = []

            # List of (run, lumi section) tuples
            fileLumis = []
            if self.splitByLumi:
                for lumi in record['LumiList']:
                    fileLumis.append((lumi['RunNumber'], lumi['LumiSectionNumber']))
            # lumi mask first, run selection second
            for lumiFilter in lumiFilters:
                fileLumis = lumiFilter.filterLumis(fileLumis)
            self.lumis[filename] = fileLumis

            # names containing '.dat' are not counted
            if filename.find('.dat') >= 0:
                continue

            events = record['NumberOfEvents']

            # Number of events per block
            if fileblock in self.eventsPerBlock:
                self.eventsPerBlock[fileblock] += events
            else:
                self.eventsPerBlock[fileblock] = events

            # Number of events per file
            self.eventsPerFile[filename] = events

            # List of files per block
            if fileblock in self.blocksinfo:
                self.blocksinfo[fileblock].append(filename)
            else:
                self.blocksinfo[fileblock] = [filename]

            # totals
            self.maxEvents += events
            self.maxLumis += len(self.lumis[filename])

        nothingFound = len(self.eventsPerBlock) == 0

        if self.skipBlocks and nothingFound:
            msg = "No new fileblocks available for dataset: "+str(self.datasetPath)
            raise CrabException(msg)

        if nothingFound:
            msg = "\nNo data for %s in DBS\nPlease check dataset path variables in crab.cfg"
            raise NotExistingDatasetError(msg % self.datasetPath)
Exemplo n.º 5
0
    def fetchDBSInfo(self):
        """
        Contact DBS

        Decides which DBS instance(s) to query from the ``CMSSW.dbs_url``,
        ``CMSSW.use_dbs3``, ``CMSSW.verify_dbs23`` and ``CMSSW.use_das``
        options, looks the dataset up and fills ``self.files``,
        ``self.parent``, ``self.lumis``, ``self.eventsPerBlock``,
        ``self.eventsPerFile``, ``self.blocksinfo``, ``self.maxEvents`` and
        ``self.maxLumis``.

        Raises:
            CrabException: if split_by_run is requested without a run
                selection, if the unsupported ``CMSSW.fileblocks_file``
                option is set, if real data is not split by run, lumi or
                event, or if no (new) file block is found for the dataset.
        """
        # make assumption that same host won't be used for both
        # this check should catch most deployed servers
        DBS2HOST = 'cmsdbsprod.cern.ch'
        DBS3HOST = 'cmsweb.cern.ch'
        useDBS2 = False
        useDBS3 = False
        verifyDBS23 = False
        useDAS = False

        # known DBS end-points
        global_dbs2 = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
        global_dbs3 = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
        caf_dbs2_01 = "http://cmsdbsprod.cern.ch/cms_dbs_caf_analysis_01/servlet/DBSServlet"
        local_dbs2_01 = "http://cmsdbsprod.cern.ch/cms_dbs_ph_analysis_01/servlet/DBSServlet"
        local_dbs2_02 = "http://cmsdbsprod.cern.ch/cms_dbs_ph_analysis_02/servlet/DBSServlet"
        caf_dbs3_01   = "https://cmsweb.cern.ch/dbs/prod/caf01/DBSReader"
        local_dbs3_01 = "https://cmsweb.cern.ch/dbs/prod/phys01/DBSReader"
        local_dbs3_02 = "https://cmsweb.cern.ch/dbs/prod/phys02/DBSReader"
        local_dbs3_03 = "https://cmsweb.cern.ch/dbs/prod/phys03/DBSReader"
        # every end-point exactly once (local_dbs3_02 used to be missing)
        known_dbs_urls = [
            global_dbs2, caf_dbs2_01, local_dbs2_01, local_dbs2_02,
            global_dbs3, caf_dbs3_01, local_dbs3_01, local_dbs3_02, local_dbs3_03,
            ]

        ## correspondence maps of DBS2/3 instances. Only genuine DBS2 -> DBS3
        ## pairs are listed, so the reverse map does not depend on the
        ## iteration order of the dictionary.
        dbs2to3 = {
            global_dbs2: global_dbs3,
            caf_dbs2_01: caf_dbs3_01,
            local_dbs2_01: local_dbs3_01,
            local_dbs2_02: local_dbs3_02,
            }
        dbs3to2 = dict((url3, url2) for url2, url3 in dbs2to3.items())

        # shortcuts accepted for local scope DBS's
        dbs_shortcuts = {
            "dbs2_caf_01": caf_dbs2_01,
            "analysis_01": local_dbs2_01,
            "analysis_02": local_dbs2_02,
            "caf01": caf_dbs3_01,
            "phys01": local_dbs3_01,
            "phys02": local_dbs3_02,
            "phys03": local_dbs3_03,
            }

        ## get DBS URL specified by user (default to global DBS2)
        dbs_url = self.cfg_params.get('CMSSW.dbs_url', global_dbs2)
        # expand shortcuts before any comparison against the known URLs
        dbs_url = dbs_shortcuts.get(dbs_url, dbs_url)

        if self.cfg_params.get('CMSSW.use_dbs3'):
            useDBS3 = int(self.cfg_params.get('CMSSW.use_dbs3'))==1

        if self.cfg_params.get('CMSSW.verify_dbs23'):
            verifyDBS23 = int(self.cfg_params.get('CMSSW.verify_dbs23'))==1
        if verifyDBS23 and not dbs_url in known_dbs_urls:
            common.logger.info ("automatic verification DBS2/3 not possible for non standard dbs_url=%s"%dbs_url)
            verifyDBS23 = False

        # if user asked for DBS3, remap a known DBS2 URL to its DBS3
        # counterpart; anything else (DBS3 or unknown URL) is kept as is
        if useDBS3:
            dbs_url = dbs2to3.get(dbs_url, dbs_url)
        common.logger.info("Accessing DBS at: "+dbs_url)

        endpoint_components = urlparse.urlsplit(dbs_url)

        # None means "no counterpart known for this instance"
        dbs_url_2 = None
        dbs_url_3 = None
        if endpoint_components.hostname == DBS3HOST or useDBS3:
            useDBS3=True
            dbs_url_3 = dbs_url
            dbs_url_2 = dbs3to2.get(dbs_url)
        elif endpoint_components.hostname == DBS2HOST:
            useDBS2=True
            dbs_url_2 = dbs_url
            dbs_url_3 = dbs2to3.get(dbs_url)
        else:
            # if we do not know this URL, better be a DBS3 test instance
            useDBS3=True
            dbs_url_3 = dbs_url

        if useDBS2 and useDBS3:
            msg = "trying to use DBS2 and DBS3 at same time ?"
            raise  CrabException(msg)

        # the cross check needs both instances (e.g. phys03 has no DBS2 twin)
        if verifyDBS23 and (dbs_url_2 is None or dbs_url_3 is None):
            common.logger.info("automatic verification DBS2/3 not possible: no DBS2/DBS3 counterpart known for dbs_url=%s" % dbs_url)
            verifyDBS23 = False

        if self.cfg_params.get('CMSSW.use_das'):
            useDAS = int(self.cfg_params.get('CMSSW.use_das'))==1

        if useDBS2:
            common.logger.info("Will do Data Discovery using  DBS2")
        if useDBS3:
            common.logger.info("Will do Data Discovery using  DBS3")
        if useDAS :
            common.logger.info("will use DAS to talk to DBS")
        if verifyDBS23:
            common.logger.info("Will verify that DBS2 and DBS3 return same information")

        ## check if runs are selected
        runselection = []
        if 'CMSSW.runselection' in self.cfg_params:
            runselection = parseRange2(self.cfg_params['CMSSW.runselection'])
            if len(runselection)>1000000:
                common.logger.info("ERROR: runselection range has more then 1M numbers")
                common.logger.info("ERROR: Too large. runselection is ignored")
                runselection=[]

        ## check if various lumi parameters are set
        self.lumiMask = self.cfg_params.get('CMSSW.lumi_mask',None)
        self.lumiParams = self.cfg_params.get('CMSSW.total_number_of_lumis',None) or \
                          self.cfg_params.get('CMSSW.lumis_per_job',None)

        lumiList = None
        if self.lumiMask:
            lumiList = LumiList(filename=self.lumiMask)
        runList = None
        if runselection:
            runList = LumiList(runs = runselection)

        self.splitByRun = int(self.cfg_params.get('CMSSW.split_by_run', 0))
        self.splitDataByEvent = int(self.cfg_params.get('CMSSW.split_by_event', 0))
        common.logger.log(10-1,"runselection is: %s"%runselection)

        # splitByLumi is only (re)computed here when not splitting by run
        if not self.splitByRun:
            self.splitByLumi = self.lumiMask or self.lumiParams or self.ads

        if self.splitByRun and not runselection:
            msg = "Error: split_by_run must be combined with a runselection"
            raise CrabException(msg)

        ## service API
        if useDBS2 or verifyDBS23:
            args = {}
            args['url']     = dbs_url_2
            args['level']   = 'CRITICAL'

        ## check if has been requested to use the parent info
        useparent = int(self.cfg_params.get('CMSSW.use_parent',0))

        defaultName = common.work_space.shareDir()+'AnalyzedBlocks.txt'
        ## a non default file to store/read analyzed fileBlocks is not supported
        if self.cfg_params.get('CMSSW.fileblocks_file') :
            msg = "CMSSW.fileblocks_file option non supported"
            raise CrabException(msg)
        fileBlocks_FileName = os.path.abspath(defaultName)

        if useDBS2 or verifyDBS23:
            import DBSAPI.dbsApi
            import DBSAPI.dbsApiException
            start_time=time.time()
            api2 = DBSAPI.dbsApi.DbsApi(args)
            files2 = self.queryDbs(api2,path=self.datasetPath,runselection=runselection,useParent=useparent)
            elapsed=time.time() - start_time
            common.logger.info("DBS2 lookup took %5.2f sec" % elapsed)
            if useDBS2:
                self.files = files2
        if useDBS3 or verifyDBS23:
            from dbs.apis.dbsClient import DbsApi
            start_time=time.time()
            api3 = DbsApi(dbs_url_3)
            files3 = self.queryDbs3(api3,path=self.datasetPath,runselection=runselection,useParent=useparent)
            elapsed=time.time() - start_time
            common.logger.info("DBS3 lookup took %5.2f sec" % elapsed)
            if useDBS3:
                self.files = files3
        if useDAS :
            # the DAS result replaces whatever the direct DBS query returned
            self.files = self.queryDas(path=self.datasetPath,runselection=runselection,useParent=useparent)

        if verifyDBS23:
            if not self.compareFilesStructure(files2,files3):
                common.logger.info("ERROR: DBS2 - DB3 comparsion failed, please run crab -uploadLog and report to crabFeedback")

        # Check to see what the dataset is
        pdsName = self.datasetPath.split("/")[1]
        if useDBS2 :
            primDSs = api2.listPrimaryDatasets(pdsName)
            dataType = primDSs[0]['Type']
        elif useDBS3 :
            dataType=api3.listDataTypes(dataset=self.datasetPath)[0]['data_type']

        common.logger.info("Datatype is %s" % dataType)
        if dataType == 'data' and not \
            (self.splitByRun or self.splitByLumi or self.splitDataByEvent):
            msg = 'Data must be split by lumi or by run. ' \
                  'Please see crab -help for the correct settings'
            raise  CrabException(msg)

        anFileBlocks = []
        if self.skipBlocks:
            anFileBlocks = readTXTfile(self, fileBlocks_FileName)

        # parse files and fill arrays
        for fileInfo in self.files :
            fileblock = fileInfo['Block']['Name']
            # skip already analyzed blocks
            if fileblock in anFileBlocks :
                continue

            filename = fileInfo['LogicalFileName']

            # list of parents for the given child, only when asked for
            parList = []
            if useparent==1:
                parList = [x['LogicalFileName'] for x in fileInfo['ParentList']]
            self.parent[filename] = parList

            fileLumis = [] # List of (run, lumi section) tuples
            if self.splitByLumi:
                fileLumis = [ (x['RunNumber'], x['LumiSectionNumber'])
                             for x in fileInfo['LumiList'] ]
            # Intersect with the lumi mask first, then with the run selection
            if self.lumiMask:
                fileLumis = lumiList.filterLumis(fileLumis)
            if runselection:
                fileLumis = runList.filterLumis(fileLumis)
            self.lumis[filename] = fileLumis

            # file names containing '.dat' are left out of the event counts
            if '.dat' in filename:
                continue

            events = fileInfo['NumberOfEvents']
            # Count number of events per block (direct dict membership,
            # no intermediate key list)
            if fileblock in self.eventsPerBlock:
                self.eventsPerBlock[fileblock] += events
            else :
                self.eventsPerBlock[fileblock] = events
            # Number of events per file
            self.eventsPerFile[filename] = events

            # List of files per block
            self.blocksinfo.setdefault(fileblock, []).append(filename)

            # total number of events and lumis
            self.maxEvents += events
            self.maxLumis  += len(self.lumis[filename])

        if self.skipBlocks and not self.eventsPerBlock:
            msg = "No new fileblocks available for dataset: "+str(self.datasetPath)
            raise  CrabException(msg)

        if len(self.eventsPerBlock) <= 0:
            msg="No data for %s in DBS\n Check datasetpath parameter in crab.cfg" % self.datasetPath
            raise  CrabException(msg)
Exemplo n.º 6
0
    def fetchDBSInfo(self):
        """
        Contact DBS (version 3) and fill the per-file / per-block tables.

        Reads the run selection, lumi mask and splitting options from
        self.cfg_params, queries DBS3 for the files of self.datasetPath
        and then fills:

          self.files          -- result of self.queryDbs3
          self.parent         -- filename -> list of parent file names
          self.lumis          -- filename -> (run, lumi) tuples, filtered by
                                 the lumi mask and/or the run selection
          self.eventsPerBlock -- block name -> summed number of events
          self.eventsPerFile  -- filename -> number of events
          self.blocksinfo     -- block name -> list of file names
          self.maxEvents      -- running total of events
          self.maxLumis       -- running total of selected lumis

        Files whose name contains '.dat' get entries in self.parent and
        self.lumis only; they are left out of the event/block bookkeeping.

        Raises CrabException when:
          - split_by_run is requested without a runselection,
          - the unsupported CMSSW.fileblocks_file option is set,
          - the dataset type is 'data' and no splitting mode is enabled,
          - no (new) file block with events was found for the dataset.
        """
        # DBS2 is gone: only the DBS3 url returned by verify_dbs_url is used,
        # the remaining values are deliberately ignored.
        (_useDBS2, _useDBS3, _dbs2_url, dbs3_url) = verify_dbs_url(self)
        dbs_url = dbs3_url

        common.logger.info("Accessing DBS at: %s" % dbs_url)

        ## check if runs are selected
        runselection = []
        if 'CMSSW.runselection' in self.cfg_params:
            runselection = parseRange2(self.cfg_params['CMSSW.runselection'])
            if len(runselection) > 1000000:
                # An oversized selection is dropped rather than treated as fatal.
                common.logger.info(
                    "ERROR: runselection range has more then 1M numbers")
                common.logger.info("ERROR: Too large. runselection is ignored")
                runselection = []

        ## check if various lumi parameters are set
        self.lumiMask = self.cfg_params.get('CMSSW.lumi_mask', None)
        self.lumiParams = self.cfg_params.get('CMSSW.total_number_of_lumis', None) or \
                          self.cfg_params.get('CMSSW.lumis_per_job', None)

        # Both filters are always bound; None means "do not filter".
        lumiList = None
        if self.lumiMask:
            lumiList = LumiList(filename=self.lumiMask)
        runList = None
        if runselection:
            runList = LumiList(runs=runselection)

        self.splitByRun = int(self.cfg_params.get('CMSSW.split_by_run', 0))
        self.splitDataByEvent = int(
            self.cfg_params.get('CMSSW.split_by_event', 0))
        common.logger.log(10 - 1, "runselection is: %s" % runselection)

        # self.splitByLumi is only (re)computed here when not splitting by
        # run; otherwise whatever value it already holds is kept.
        if not self.splitByRun:
            self.splitByLumi = self.lumiMask or self.lumiParams or self.ads

        if self.splitByRun and not runselection:
            msg = "Error: split_by_run must be combined with a runselection"
            raise CrabException(msg)

        ## check if has been requested to use the parent info
        useparent = int(self.cfg_params.get('CMSSW.use_parent', 0))

        ## a non default file to store/read analyzed fileBlocks is not supported
        defaultName = common.work_space.shareDir() + 'AnalyzedBlocks.txt'
        if self.cfg_params.get('CMSSW.fileblocks_file'):
            msg = "CMSSW.fileblocks_file option non supported"
            raise CrabException(msg)
        fileBlocks_FileName = os.path.abspath(defaultName)

        # Imported here, as in the original, so the DBS client is only
        # needed when this method actually runs.
        from dbs.apis.dbsClient import DbsApi
        start_time = time.time()
        api3 = DbsApi(dbs3_url)
        self.files = self.queryDbs3(api3,
                                    path=self.datasetPath,
                                    runselection=runselection,
                                    useParent=useparent)
        elapsed = time.time() - start_time
        common.logger.info("DBS3 lookup took %5.2f sec" % elapsed)

        # Check to see what the dataset is
        dataType = api3.listDataTypes(
            dataset=self.datasetPath)[0]['data_type']

        common.logger.info("Datatype is %s" % dataType)
        if dataType == 'data' and not \
            (self.splitByRun or self.splitByLumi or self.splitDataByEvent):
            msg = 'Data must be split by lumi or by run. ' \
                  'Please see crab -help for the correct settings'
            raise CrabException(msg)

        anFileBlocks = []
        if self.skipBlocks:
            anFileBlocks = readTXTfile(self, fileBlocks_FileName)

        # parse files and fill arrays
        for fileInfo in self.files:
            # skip already analyzed blocks
            fileblock = fileInfo['Block']['Name']
            if fileblock in anFileBlocks:
                continue

            filename = fileInfo['LogicalFileName']

            # parents are only collected when explicitly requested
            parList = []
            if useparent == 1:
                parList = [x['LogicalFileName'] for x in fileInfo['ParentList']]

            fileLumis = []  # List of (run, lumi) tuples
            if self.splitByLumi:
                fileLumis = [(x['RunNumber'], x['LumiSectionNumber'])
                             for x in fileInfo['LumiList']]
            self.parent[filename] = parList

            # Apply the lumi mask first, then the run selection; with both
            # set this is the intersection of the two lists.
            selectedLumis = fileLumis
            if lumiList is not None:
                selectedLumis = lumiList.filterLumis(selectedLumis)
            if runList is not None:
                selectedLumis = runList.filterLumis(selectedLumis)
            self.lumis[filename] = selectedLumis

            # '.dat' files are not counted in the event/block bookkeeping
            if '.dat' in filename:
                continue

            events = fileInfo['NumberOfEvents']
            # Count number of events per block
            self.eventsPerBlock[fileblock] = \
                self.eventsPerBlock.get(fileblock, 0) + events
            # Number of events per file
            self.eventsPerFile[filename] = events
            # List of files per block
            self.blocksinfo.setdefault(fileblock, []).append(filename)

            # total number of events and lumis
            self.maxEvents += events
            self.maxLumis += len(self.lumis[filename])

        if not self.eventsPerBlock:
            # When blocks were being skipped, an empty result means nothing
            # new is available; otherwise DBS had no data for the dataset.
            if self.skipBlocks:
                msg = "No new fileblocks available for dataset: " + str(
                    self.datasetPath)
            else:
                msg = "No data for %s in DBS\n Check datasetpath parameter in crab.cfg" % self.datasetPath
            raise CrabException(msg)