Example #1
    def getBlockSitesFromLocalDBS3(self,dbs_url):

        ## find the location for each block in the list
        from dbs.apis.dbsClient import DbsApi
        api = DbsApi(dbs_url)

        from NodeNameUtils import getMapOfSEHostName2PhedexNodeNameFromPhEDEx

        se2pnn = getMapOfSEHostName2PhedexNodeNameFromPhEDEx()

        blockSites = {}
        for block in self.Listfileblocks:
            blockInfo = api.listBlocks(block_name=block, detail=True)
            location = blockInfo[0]['origin_site_name']
            if location == 'UNKNOWN':
                blockSites[block] = []
            else:
                #if locationIsValidPNN:
                if location.startswith('T2_') or location.startswith('T3_'):
                    blockSites[block] = [location]
                else:
                    if location in se2pnn:
                        blockSites[block] = [se2pnn[location]]
                    else:
                        msg = "ERROR: unknown location for block: %s. Skip this block" % location
                        common.logger.info(msg)
                        blockSites[block] = []

        return blockSites
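
The method above expects a class that provides Listfileblocks and a module-level common logger; a minimal standalone sketch of the same DBS lookup (the DBS URL and block name below are placeholders) could look like this:

from dbs.apis.dbsClient import DbsApi

# Placeholders: any DBSReader instance and a fully qualified block name will do.
dbs_url = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'
block = '/Primary/Processed-v1/TIER#00000000-0000-0000-0000-000000000000'

api = DbsApi(url=dbs_url)
blockInfo = api.listBlocks(block_name=block, detail=True)
if blockInfo:
    print("Origin site: %s" % blockInfo[0]['origin_site_name'])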
Example #2
def getSize(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the block list for the dataset and add up the block sizes
    blocks = dbsapi.listBlocks(dataset=dataset, detail=True)
    total_size = 0
    for block in blocks:
        total_size = total_size + block['block_size']
    return total_size
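
A hypothetical call of getSize(), assuming the function and the DbsApi import live in the same module and dbs3_url points at a DBSReader instance:

from dbs.apis.dbsClient import DbsApi

dbs3_url = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'  # assumed instance

size_bytes = getSize('/EGamma/Run2018A-v1/RAW')  # dataset name is only an example
print("Total dataset size: %.2f TB" % (size_bytes / 1e12))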
Example #3
    def getBlockSitesFromLocalDBS3(self,dbs_url):

        ## find the location for each block in the list
        from dbs.apis.dbsClient import DbsApi
        api = DbsApi(dbs_url)

        blockSites = {}
        for block in self.Listfileblocks:
            blockInfo = api.listBlocks(block_name=block, detail=True)
            location = blockInfo[0]['origin_site_name']
            blockSites[block] = [location]

        return blockSites
Example #4
def hasAllBlocksClosed(dataset):
    """
    checks if a given dataset has all blocks closed
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the block list for the dataset
    reply = dbsapi.listBlocks(dataset=dataset, detail=True)
    for block in reply:
        #print block['block_name']
        #print block['open_for_writing']
        if block['open_for_writing']:
            return False
    return True
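
A hypothetical driver for hasAllBlocksClosed(), again assuming dbs3_url and the DbsApi import are defined in the same module:

dataset = '/EGamma/Run2018A-v1/RAW'  # example dataset, reused from the snippets below
if hasAllBlocksClosed(dataset):
    print("All blocks of %s are closed" % dataset)
else:
    print("%s still has open blocks" % dataset)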
Example #5
def main():
    if len(sys.argv) != 2:
        print("You must provide a dataset name. E.g.: python listDsetLumis.py /EGamma/Run2018A-v1/RAW")
        sys.exit(1)
    dset = sys.argv[1]

    dbsApi = DbsApi(url='https://cmsweb-testbed.cern.ch/dbs/int/global/DBSReader/')
    listBlocks = dbsApi.listBlocks(dataset=dset)
    for block in listBlocks:
        print("\nBlock: %s" % block['block_name'])
        blockInfo = dbsApi.listFileLumis(block_name=block['block_name'])
        for info in blockInfo:
            print("LFN: %s" % info['logical_file_name'])
            print("    Total lumis: %s\tLumis: %s" % (len(info['lumi_section_num']),
                                                      sorted(info['lumi_section_num'])))
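
Building on the same two calls, a sketch that collects the unique lumi sections per block instead of printing them per file (URL and dataset are placeholders):

from dbs.apis.dbsClient import DbsApi

dbsApi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader/')
dset = '/EGamma/Run2018A-v1/RAW'  # placeholder dataset

for block in dbsApi.listBlocks(dataset=dset):
    lumis = set()
    for info in dbsApi.listFileLumis(block_name=block['block_name']):
        lumis.update(info['lumi_section_num'])
    print("%s: %d unique lumis" % (block['block_name'], len(lumis)))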
Example #6
def getEventsDetails(acquisitionEra, dataTierName, searchStr, date):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the blocks for the data tier within the hard-coded creation-time window
    reply = dbsapi.listBlocks(data_tier_name=dataTierName, min_cdate=1394818770, max_cdate=1395407514)

    sum = 0

    for block in reply:
        if acquisitionEra in block["block_name"] and (
            searchStr == "NA" or (searchStr != "NA" and searchStr in block["block_name"])
        ):
            events = getEventCountBlock(block["block_name"])
            #          print ' - block - ',block['block_name'],events
            sum = sum + events

    return sum
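
getEventCountBlock() is not shown in this example. One plausible implementation, based on the listBlockSummaries call used in a later example and assuming dbs3_url and the DbsApi import are defined as above, would be:

def getEventCountBlock(block_name):
    # Hypothetical helper, not part of the original example: sum the event
    # counts reported by the block summary.
    dbsapi = DbsApi(url=dbs3_url)
    summaries = dbsapi.listBlockSummaries(block_name=block_name, detail=1)
    return sum(s['num_event'] for s in summaries)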
Example #7
def getEventsDetails(acquisitionEra, dataTierName, searchStr, date):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the blocks for the data tier within the hard-coded creation-time window
    reply = dbsapi.listBlocks(data_tier_name=dataTierName,
                              min_cdate=1394818770,
                              max_cdate=1395407514)

    sum = 0

    for block in reply:
        if (acquisitionEra in block['block_name'] and
            (searchStr == 'NA' or
             (searchStr != 'NA' and searchStr in block['block_name']))):
            events = getEventCountBlock(block['block_name'])
            #          print ' - block - ',block['block_name'],events
            sum = sum + events

    return sum
Example #8
def getMaxLumi(dataset):
    """
    Gets the number of the last lumi in a given dataset
    This is useful for appending new events to dataset
    without collision
    """
    dbsapi = DbsApi(url=dbs3_url)
    reply = dbsapi.listBlocks(dataset=dataset)
    maxl = 0
    for b in reply:
        reply2 = dbsapi.listFileLumis(block_name=b['block_name'])
        #retrieve lumis for each file
        for f in reply2:
            lumis = f['lumi_section_num']
            #check max of lumi
            lumi = max(lumis)
            if lumi > maxl:
                maxl = lumi
    return maxl
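
Hypothetical usage of getMaxLumi(), with the same assumptions about dbs3_url and the DbsApi import as the function itself:

dataset = '/EGamma/Run2018A-v1/RAW'  # placeholder dataset
last_lumi = getMaxLumi(dataset)
print("Highest lumi section in %s: %d" % (dataset, last_lumi))
# New events appended to this dataset could then start at lumi last_lumi + 1.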
Example #9
def main():
    if len(sys.argv) != 2:
        print("You must provide a dataset name. E.g.: python listEmptyDBSBlocks.py /EGamma/Run2018A-v1/RAW")
        sys.exit(1)
    dset = sys.argv[1]

    summaryLoss = {}
    badBlocks = []
    dbsApi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader/')
    listBlocks = dbsApi.listBlocks(dataset=dset)
    for block in listBlocks:
        blockInfo = dbsApi.listFileSummaries(block_name=block['block_name'], validFileOnly=1)
        if not blockInfo or not blockInfo[0]['num_file']:
            blockInfo = dbsApi.listFileSummaries(block_name=block['block_name'], validFileOnly=0)
            print("Block %s doesn't contain any valid files. Block summary: %s" % (block['block_name'], blockInfo))
            updateSummary(summaryLoss, blockInfo)
            badBlocks.append(block['block_name'])
    print("List of blocks that will be skipped:\n%s\n" % list(set(badBlocks)))
    print("Summary of blocks with all files invalid: %s" % summaryLoss)
Example #11
def getDatasetInfo(dataset):
    """
    Gets a summary of a dataset. Returns a tuple of:
    (open for writing: 1 if at least one block is open for writing,
    creation date: the earliest block creation date,
    last modified: the latest block modification date)
    """
    dbsapi = DbsApi(url=dbs3_url)
    reply = dbsapi.listBlocks(dataset=dataset, detail=True)
    if not reply:
        return (0,0,0)
    #first block
    max_last_modified = reply[0]['last_modification_date']
    min_creation_date = reply[0]['creation_date']
    open_for_writing = reply[0]['open_for_writing']
    #for all the blocks, get the details
    for block in reply:
        max_last_modified = max(max_last_modified, block['last_modification_date'])
        min_creation_date = min(min_creation_date, block['creation_date'])
        open_for_writing |= block['open_for_writing']
    return (open_for_writing, min_creation_date, max_last_modified)
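
The two timestamps returned by getDatasetInfo() are Unix epoch seconds; a hypothetical caller might format them like this (same dbs3_url/DbsApi assumptions as above):

import time

is_open, created, modified = getDatasetInfo('/EGamma/Run2018A-v1/RAW')  # placeholder dataset
print("Open for writing: %s" % bool(is_open))
print("Earliest block created: %s" % time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(created)))
print("Last block modified:    %s" % time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(modified)))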
Example #13
def main():

	usage="%prog <options>"

	parser = OptionParser(usage=usage)
	parser.add_option("-u", "--url", dest="url", help="DBS Instance url. default is https://cmsweb.cern.ch/dbs/prod/global/DBSReader", metavar="<url>")
	parser.add_option("-l", "--length", dest="length", help="Number of days for calculate the accumated events. It is Optional, default is 30 days.", metavar="<length>")
	parser.add_option("-d", "--dataset", dest="dataset", help="The dataset name for cacluate the events. Can be optional if datatier is used.", metavar="<dataset>")
	parser.add_option("-t", "--datatier", dest="datatier", help="The datatier name for cacluate the events. Can be optional if dataset is used. In this version datatier is not supported yet.", metavar="<data_tier_name>")
	parser.add_option("-a", "--access_type", dest="ds_access_type", help="Dataset access types: VALID, PRODUCTION or ALL(VALID+PRODUCTION). Default is ALL", metavar="<dataset_access_type>")
	parser.set_defaults(url="https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
	parser.set_defaults(length=30)
	parser.set_defaults(ds_access_type="ALL")

	(opts, args) = parser.parse_args()
	if not (opts.dataset or opts.datatier):
		parser.print_help()
		parser.error('either --dataset or --datatier is required')

	dataset	 = opts.dataset
	#seconds per day    
	sdays = 86400
	lenth = int(opts.length)
	now = time.time()
	#now = 1391353032
	then = now - sdays*lenth
	url = opts.url
	api=DbsApi(url=url)
	outputDataSets = []
    
	f = [0 for x in range(lenth)]
	min_cdate = int(then)
	max_cdate = int(now)
	if (opts.ds_access_type == "ALL"):
		outputDataSetsValid = api.listDatasets(dataset=dataset, min_cdate=min_cdate-30*sdays, 
                          max_cdate=max_cdate, dataset_access_type="VALID")
		outputDataSetsProd = api.listDatasets(dataset=dataset, min_cdate=min_cdate-30*sdays,
                          max_cdate=max_cdate, dataset_access_type="PRODUCTION")
		outputDataSets = outputDataSetsValid + outputDataSetsProd
	elif (opts.ds_access_type == "VALID"):
		outputDataSets = api.listDatasets(dataset=dataset, min_cdate=min_cdate-30*sdays,
                          max_cdate=max_cdate, dataset_access_type="VALID")
	elif (opts.ds_access_type == "PRODUCTION"):
		outputDataSets = api.listDatasets(dataset=dataset, min_cdate=min_cdate-30*sdays,
                          max_cdate=max_cdate, dataset_access_type="PRODUCTION")
	for dataset in outputDataSets:
		outputBlocks = api.listBlocks(dataset=dataset["dataset"], detail=1, min_cdate=min_cdate, max_cdate=max_cdate)
		blockList = []
		blockCdate = {}
		for block in outputBlocks:
			blockList.append(block["block_name"])
			blockCdate[block["block_name"]] = block["creation_date"]
		blockSum = []
		if blockList: 
			blockSum = api.listBlockSummaries(block_name=blockList, detail=1)
		for b in blockSum:
			cdate= blockCdate[b["block_name"]]
			day = int((now-cdate)/sdays)
			f[day] = f[day] + b["num_event"] 
	for i in range(lenth):
		#print (lenth-1)-i, ":  ", f[i], "  ", sum(item['all'] for item in f[i:lenth]) 
		print(i, ": ", f[(lenth-1)-i], " ", sum(item for item in f[(lenth-1)-i:lenth]))
	sys.exit(0)
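
The per-day bucketing above maps each block's creation time (a Unix timestamp) to a day index relative to now. A tiny illustration with a made-up creation time:

import time

sdays = 86400                     # seconds per day
now = time.time()
cdate = now - 3 * sdays - 100     # a block created a bit over 3 days ago
day = int((now - cdate) / sdays)  # -> 3, i.e. the "3 days old" bucket
print(day)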
Example #14
class RequestQuery:

    def __init__(self,config):
        self.br=Browser()

        self.config = config
        
        # Initialise connections
        self.mySiteDB = SiteDBJSON()
        self.phedex = PhEDEx({"endpoint":"https://cmsweb.cern.ch/phedex/datasvc/json/prod/"}, "json")
        self.dbsPhys01 = DbsApi(url = dbs_base_url+"phys01/DBSReader/")
        self.dbsPhys02 = DbsApi(url = dbs_base_url+"phys02/DBSReader/")
        self.dbsPhys03 = DbsApi(url = dbs_base_url+"phys03/DBSReader/")
        
    def __del__(self):
        self.br.close()

    def login2Savannah(self):
        """
        login2Savannah logs into Savannah with the given parameters in the config (username and password).
        The user must have admin privileges for Store Results requests.
        """
        login_page='https://savannah.cern.ch/account/login.php?uri=%2F'
        savannah_page='https://savannah.cern.ch/task/?group=cms-storeresults'
        
        self.br.open(login_page)

        ## 'Search' form is form 0
        ## login form is form 1
        self.br.select_form(nr=1)

        username = self.config["SavannahUser"]
    
        self.br['form_loginname']=username
        self.br['form_pw']=self.config["SavannahPasswd"]
        
        self.br.submit()
        
        response = self.br.open(savannah_page)
        
        # Check to see if login was successful
        if not re.search('Logged in as ' + username, response.read()):
            print('login unsuccessful, please check your username and password')
            return False
        else:
            return True
    
    def selectQueryForm(self,**kargs):       
        """
        selectQueryForm creates the browser view to get all the Store Results tickets from Savannah.
        """
        if self.isLoggedIn:
            self.br.select_form(name="bug_form")

            ## Use right query form labelled Test
            control = self.br.find_control("report_id",type="select")

            for item in control.items:
                if item.attrs['label'] == "Test":
                    control.value = [item.attrs['value']]
                    
            ##select number of entries displayed per page
            control = self.br.find_control("chunksz",type="text")
            control.value = "150"

            ##check additional searching parameter
            for arg in kargs:
                if arg == "approval_status":
                    control = self.br.find_control("resolution_id",type="select")
                    for item in control.items:
                        if item.attrs['label'] == kargs[arg].strip():
                            control.value = [item.attrs['value']]

                elif arg == "task_status":
                    control = self.br.find_control("status_id",type="select")
                    for item in control.items:
                        if item.attrs['label'] == kargs[arg].strip():
                            control.value = [item.attrs['value']]
                            
                elif arg == "team":
                    control = self.br.find_control("custom_sb5",type="select")
                    for item in control.items:
                        if item.attrs['label'] == kargs[arg].strip():
                            control.value = [item.attrs['value']]

            response = self.br.submit()
            response.read()

        return

    def getScramArchByCMSSW(self):
        """
        From the list of available CMSSW releases,
        return a dictionary of ScramArch by CMSSW release.
        """
        
        # Set a temporary connection to the server and get the response from cmstags
        url = 'https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML'
        br = Browser()
        br.set_handle_robots(False)
        response=br.open(url)
        soup = BeautifulSoup(response.read())
        
        # Dictionary form
        # {'CMSSW_X_X_X':[slc5_amd64_gcc472], ... }
        archByCmssw={}
        
        # Fill the dictionary
        for arch in soup.find_all('architecture'): 
            for cmssw in arch.find_all('project'): 
                # CMSSW release
                cmsswLabel = cmssw.get('label').encode('ascii', 'ignore')
                if cmsswLabel not in archByCmssw:
                    archByCmssw[cmsswLabel]=[]
                # ScramArch related to this CMSSW release
                archName = arch.get('name').encode('ascii', 'ignore')
                archByCmssw[cmsswLabel].append(archName)
        
        return archByCmssw
      
    def createValueDicts(self):       
        """
        Init dictionaries by value/label:
        - Releases by Value
        - Physics group by value
        - DBS url by value
        - DBS url by label
        - Status of savannah request by value 
        - Status of savannah ticket by value (Open/Closed/Any)
        """      
        if self.isLoggedIn:
            self.br.select_form(name="bug_form")
            
            control = self.br.find_control("custom_sb2",type="select")
            self.ReleaseByValueDict = self.getLabelByValueDict(control)

            control = self.br.find_control("custom_sb3",type="select")
            self.GroupByValueDict = self.getLabelByValueDict(control)

            control = self.br.find_control("custom_sb4",type="select")
            self.DBSByValueDict = self.getLabelByValueDict(control)
            self.DBSByLabelDict = self.getValueByLabelDict(control)

            control = self.br.find_control("resolution_id",type="select")
            self.StatusByValueDict = self.getLabelByValueDict(control)

            control = self.br.find_control("status_id",type="select")
            self.TicketStatusByLabelDict = self.getValueByLabelDict(control)

        return
    
    def getDatasetOriginSites(self, dbs_url, data):
        """
        Get the origin sites for each block of the dataset.
        Return the list of PSNs and the list of block origin sites (PNNs).
        """
        
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listBlocks(detail=True,dataset=data)

        pnnList = set()
        for block in response:
            pnnList.add(block['origin_site_name'])
        psnList = self.mySiteDB.PNNstoPSNs(pnnList)
        
        return psnList, list(pnnList)

    def phEDExNodetocmsName(self, nodeList):
        """
        Convert PhEDEx node name list to cms names list 
        """
        names = []
        for node in nodeList:
            name = node.replace('_MSS',
                                '').replace('_Disk',
                                    '').replace('_Buffer',
                                        '').replace('_Export', '')
            if name not in names:
                names.append(name)
        return names
    
    def setGlobalTagFromOrigin(self, dbs_url,input_dataset):
        """
        Get the global tag of the dataset from the source dbs url. If it is not set, then set global tag to 'UNKNOWN'
        """
        
        globalTag = ""
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listOutputConfigs(dataset=input_dataset)
        
        globalTag = response[0]['global_tag']
        # GlobalTag cannot be empty
        if globalTag == '':
            globalTag = 'UNKNOWN'
            
        return globalTag
    
    def isDataAtUrl(self, dbs_url,input_dataset):
        """
        Returns True if the dataset is at the dbs url, if not returns False
        """
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listDatasets(dataset=input_dataset)
        # This means that the dataset is not at the url
        if not response:
            return False
        else:
            return True
         
    def getLabelByValueDict(self, control):
        """
        From control items, create a dictionary by values
        """   
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[value] = label
                
        return d
    
    def getValueByLabelDict(self, control):
        """
        From control items, create a dictionary by labels
        """
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[label] = value

        return d
    
    def getRequests(self,**kargs):
        """
        getRequests actually goes through all the Savannah requests and creates JSON files if the
        ticket is not Closed and the status of the item is Done.
        It also reports back a summary of the requests in Savannah.
        """
        requests = []
        
        # Open Browser and login into Savannah
        self.br=Browser()
        self.isLoggedIn = self.login2Savannah()
        
        if self.isLoggedIn:
            if not kargs:
                self.selectQueryForm(approval_status='1',task_status='0')
            else:
                self.selectQueryForm(**kargs)
            self.createValueDicts()
        
            self.br.select_form(name="bug_form")
            response = self.br.submit()

            html_ouput = response.read()
            
            scramArchByCMSSW = self.getScramArchByCMSSW()
            self.nodeMappings = self.phedex.getNodeMap()
            
            for link in self.br.links(text_regex="#[0-9]+"):
                response = self.br.follow_link(link)
                
                try:
                    ## Get Information
                    self.br.select_form(name="item_form")

                    ## remove leading &nbsp and # from task
                    task = link.text.replace('#','').decode('utf-8').strip()
                    print("Processing ticket: %s" % task)
                    
                    ## Get input dataset name
                    control = self.br.find_control("custom_tf1",type="text")
                    input_dataset = control.value
                    input_primary_dataset = input_dataset.split('/')[1].replace(' ','')
                    input_processed_dataset = input_dataset.split('/')[2].replace(' ','')
                    data_tier = input_dataset.split('/')[3].replace(' ','')
                    
                    ## Get DBS URL by Drop Down
                    control = self.br.find_control("custom_sb4",type="select")
                    dbs_url = self.DBSByValueDict[control.value[0]]

                    ## Get DBS URL by text field (for old entries)
                    if dbs_url=='None':
                        control = self.br.find_control("custom_tf4",type="text")
                        dbs_url = control.value.replace(' ','')
                    else: # Transform input value to a valid DBS url
                        #dbs_url = "https://cmsweb.cern.ch/dbs/prod/"+dbs_url+"/DBSReader"
                        dbs_url = dbs_base_url+dbs_url+"/DBSReader"
                        
                    ## Get Release
                    control = self.br.find_control("custom_sb2",type="select")
                    release_id = control.value
                    
                    ## Get current request status
                    control = self.br.find_control("status_id",type="select")
                    request_status_id = control.value
                    RequestStatusByValueDict = self.getLabelByValueDict(control)
                    
                    # close the request if deprecated release was used
                    try:
                        release = self.ReleaseByValueDict[release_id[0]]
                    except:
                        if len(self.ReleaseByValueDict)>0 and RequestStatusByValueDict[request_status_id[0]] != "Closed":
                            msg = "Your request is not valid anymore, since the given CMSSW release is deprecated. If your request should be still processed, please reopen the request and update the CMSSW release to a more recent *working* release.\n"
                            msg+= "\n"
                            msg+= "Thanks,\n"
                            msg+= "Your StoreResults team"
                            self.closeRequest(task,msg)
                            self.br.back()
                            print("I tried to Close ticket %s due to CMSSW not valid" % task)
                            continue
                    
                    # close the request if release has not ScramArch match
                    if release not in scramArchByCMSSW:
                        if len(self.ReleaseByValueDict)>0 and RequestStatusByValueDict[request_status_id[0]] != "Closed":
                            msg = "Your request is not valid, there is no ScramArch match for the given CMSSW release.\n"
                            msg+= "If your request should be still processed, please reopen the request and update the CMSSW release according to: https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML \n"
                            msg+= "\n"
                            msg+= "Thanks,\n"
                            msg+= "Your StoreResults team"
                            self.closeRequest(task,msg)
                            self.br.back()
                            print("I tried to Close ticket %s due to ScramArch mismatch" % task)
                            continue
                    else: 
                        index=len(scramArchByCMSSW[release])
                        scram_arch = scramArchByCMSSW[release][index-1]

                    # close the request if dataset is not at dbs url
                    try:
                        data_at_url = self.isDataAtUrl(dbs_url,input_dataset)
                    except:
                        print('I got an error trying to look for dataset %s at %s, please look at this ticket: %s' %(input_dataset,dbs_url,task))
                        continue
                    if not data_at_url:
                        msg = "Your request is not valid, I could not find the given dataset at %s\n" % dbs_url
                        msg+= "If your request should be still processed, please reopen the request and change DBS url properly \n"
                        msg+= "\n"
                        msg+= "Thanks,\n"
                        msg+= "Your StoreResults team"
                        self.closeRequest(task,msg)
                        self.br.back()
                        print("I tried to Close ticket %s, dataset is not at DBS url" % task)
                        continue
                        
                    # Avoid not approved Tickets
                    #if not RequestStatusByValueDict[request_status_id[0]] == "Done":
                    #    continue

                    ## Get Physics Group
                    control = self.br.find_control("custom_sb3",type="select")
                    group_id = control.value[0]
                    group_squad = 'cms-storeresults-'+self.GroupByValueDict[group_id].replace("-","_").lower()

                    ## Get Dataset Version
                    control = self.br.find_control("custom_tf3",type="text")
                    dataset_version = control.value.replace(' ','')
                    if dataset_version == "": dataset_version = '1'
                                        
                    ## Get current status
                    control = self.br.find_control("resolution_id",type="select")
                    status_id = control.value

                    ## Get assigned to
                    control = self.br.find_control("assigned_to",type="select")
                    AssignedToByValueDict = self.getLabelByValueDict(control)
                    assignedTo_id = control.value

                    ##Assign task to the physics group squad
                    if AssignedToByValueDict[assignedTo_id[0]]!=group_squad:
                        assignedTo_id = [self.getValueByLabelDict(control)[group_squad]]
                        control.value = assignedTo_id
                        self.br.submit()

                    # Set default Acquisition Era for StoreResults
                    acquisitionEra = "StoreResults"

                    ## Construction of the new dataset name (ProcessingString)
                    ## remove leading hypernews or physics group name and StoreResults+Version
                    if input_processed_dataset.find(self.GroupByValueDict[group_id])==0:
                        new_dataset = input_processed_dataset.replace(self.GroupByValueDict[group_id],"",1)
                    else:
                        stripped_dataset = input_processed_dataset.split("-")[1:]
                        new_dataset = '_'.join(stripped_dataset)
                    
                except Exception as ex:
                    self.br.back()
                    print("There is a problem with this ticket %s, please have a look to the error:" % task)
                    print(str(ex))
                    print(traceback.format_exc())
                    continue
                
                self.br.back()
                
                # Get dataset site info:
                psnList, pnnList = self.getDatasetOriginSites(dbs_url,input_dataset)
                
                infoDict = {}
                # Build store results json
                # First add all the defaults values
                infoDict["RequestType"] = "StoreResults"
                infoDict["UnmergedLFNBase"] = "/store/unmerged" 
                infoDict["MergedLFNBase"] = "/store/results/" + self.GroupByValueDict[group_id].replace("-","_").lower()
                infoDict["MinMergeSize"] = 1500000000
                infoDict["MaxMergeSize"] = 5000000000
                infoDict["MaxMergeEvents"] = 100000
                infoDict["TimePerEvent"] = 40
                infoDict["SizePerEvent"] = 512.0
                infoDict["Memory"] = 2394
                infoDict["CmsPath"] = "/uscmst1/prod/sw/cms"                                        
                infoDict["Group"] = "DATAOPS"
                infoDict["DbsUrl"] = dbs_url
                
                # Add all the information pulled from Savannah
                infoDict["AcquisitionEra"] = acquisitionEra
                infoDict["GlobalTag"] = self.setGlobalTagFromOrigin(dbs_url,input_dataset)
                infoDict["DataTier"] = data_tier
                infoDict["InputDataset"] = input_dataset
                infoDict["ProcessingString"] = new_dataset
                infoDict["CMSSWVersion"] = release
                infoDict["ScramArch"] = scram_arch
                infoDict["ProcessingVersion"] = dataset_version                    
                infoDict["SiteWhitelist"] = psnList
                
                # Create report for Migration2Global
                report = {}
                 
                #Fill json file, if status is done
                if self.StatusByValueDict[status_id[0]]=='Done' and RequestStatusByValueDict[request_status_id[0]] != "Closed":
                    self.writeJSONFile(task, infoDict)
                    report["json"] = 'y'
                else:
                    report["json"] = 'n'
                    
                report["task"] = int(task)
                report["InputDataset"] = input_dataset
                report["ProcessingString"] = new_dataset
                report["ticketStatus"] = self.StatusByValueDict[status_id[0]]
                report["assignedTo"] = AssignedToByValueDict[assignedTo_id[0]]
                report["localUrl"] = dbs_url
                report["sites"] = psnList
                report["pnns"] = pnnList

                # if the request is closed, change the item status to report to Closed
                if report["ticketStatus"] == "Done" and RequestStatusByValueDict[request_status_id[0]] == "Closed":
                    report["ticketStatus"] = "Closed"

                requests.append(report)
                    
            # Print out report
            self.printReport(requests)
        # Close connections
        self.br.close()
        
        return requests

    def closeRequest(self,task,msg):
        """
        This closes a specific Savannah ticket
        and inserts a message in the ticket.
        """
        if self.isLoggedIn:
            #self.createValueDicts()
            
            response = self.br.open('https://savannah.cern.ch/task/?'+str(task))

            html = response.read()

            self.br.select_form(name="item_form")

            control = self.br.find_control("status_id",type="select")
            control.value = [self.TicketStatusByLabelDict["Closed"]]

            #Put reason to the comment field
            control = self.br.find_control("comment",type="textarea")
            control.value = msg
                        
            #DBS Drop Down is a mandatory field, if set to None (for old requests), it is not possible to close the request
            self.setDBSDropDown()
                        
            self.br.submit()

            #remove JSON ticket
            self.removeJSONFile(task)
            
            self.br.back()
        return

    def setDBSDropDown(self):
        ## Get DBS URL by Drop Down
        control = self.br.find_control("custom_sb4",type="select")
        dbs_url = self.DBSByValueDict[control.value[0]]

        ## Get DBS URL by text field (for old entries)
        if dbs_url=='None':
            tmp = self.br.find_control("custom_tf4",type="text")
            dbs_url = tmp.value.replace(' ','')

            if dbs_url.find("phys01")!=-1:
                control.value = [self.DBSByLabelDict["phys01"]]
            elif dbs_url.find("phys02")!=-1:
                control.value = [self.DBSByLabelDict["phys02"]]
            elif dbs_url.find("phys03")!=-1:
                control.value = [self.DBSByLabelDict["phys03"]]
            else:
                msg = 'DBS URL of the old request is neither phys01, phys02 nor phys03. Please, check!'
                print(msg)
                raise RuntimeError(msg)

        return

    def writeJSONFile(self, task, infoDict):
        """
        This writes a JSON file at ComponentDir
        """
        ##check if file already exists
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'
        if not os.access(filename,os.F_OK):
            jsonfile = open(filename,'w')
            request = {'createRequest':infoDict} ## CHECK THIS BEFORE FINISHING
            jsonfile.write(json.dumps(request,sort_keys=True, indent=4))
            jsonfile.close()

        return

    def removeJSONFile(self,task):
        """
        This removes the JSON file at ComponentDir if it was created
        """
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'

        if os.access(filename,os.F_OK):
            os.remove(filename)

        return

    def printReport(self, requests):
        """
        Print out a report
        """
        print("%20s %10s %5s %35s %10s %50s %50s" %( 'Savannah Ticket','Status','json','Assigned to','local DBS','Sites','pnns')) 
        print("%20s %10s %5s %35s %10s %50s %50s" %( '-'*20,'-'*10,'-'*5,'-'*35,'-'*10,'-'*50,'-'*50 ))
        
        for report in requests:
            
            json = report["json"]
            ticket = report["task"]
            status = report["ticketStatus"]
            assigned = report["assignedTo"]
            localUrl = report["localUrl"].split('/')[5]
            site = ', '.join(report["sites"])
            pnns = ', '.join(report["pnns"])
            print("%20s %10s %5s %35s %10s %50s %50s" %(ticket,status,json,assigned,localUrl,site,pnns))  
Example #15
class RequestQuery:

    def __init__(self,config):
        self.br=Browser()

        self.config = config
        
        # Initialise connections
        self.mySiteDB = SiteDBJSON()
        self.dbsPhys01 = DbsApi(url = dbs_base_url+"phys01/DBSReader/")
        self.dbsPhys02 = DbsApi(url = dbs_base_url+"phys02/DBSReader/")
        self.dbsPhys03 = DbsApi(url = dbs_base_url+"phys03/DBSReader/")
        
    def __del__(self):
        self.br.close()

    def getScramArchByCMSSW(self):
        """
        From the list of available CMSSW releases,
        return a dictionary of ScramArch by CMSSW release.
        """
        
        # Set a temporary connection to the server and get the response from cmstags
        url = 'https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML'
        br = Browser()
        br.set_handle_robots(False)
        response=br.open(url)
        soup = BeautifulSoup(response.read())
        
        # Dictionary form
        # {'CMSSW_X_X_X':[slc5_amd64_gcc472], ... }
        archByCmssw={}
        
        # Fill the dictionary
        for arch in soup.find_all('architecture'): 
            for cmssw in arch.find_all('project'): 
                # CMSSW release
                cmsswLabel = cmssw.get('label').encode('ascii', 'ignore')
                if cmsswLabel not in archByCmssw:
                    archByCmssw[cmsswLabel]=[]
                # ScramArch related to this CMSSW release
                archName = arch.get('name').encode('ascii', 'ignore')
                archByCmssw[cmsswLabel].append(archName)
        
        return archByCmssw
      
    def getDatasetOriginSites(self, dbs_url, data):
        """
        Get the origin sites for each block of the dataset.
        Return the list of PSNs and the list of block origin sites (PNNs).
        """
        
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listBlocks(detail=True,dataset=data)
        
        pnnList = set()
        for block in response:
            pnnList.add(block['origin_site_name'])
        psnList = self.mySiteDB.PNNstoPSNs(pnnList)
        
        return psnList, list(pnnList)
    
    def setGlobalTagFromOrigin(self, dbs_url,input_dataset):
        """
        Get the global tag of the dataset from the source dbs url. If it is not set, then set global tag to 'UNKNOWN'
        """
        
        globalTag = ""
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listOutputConfigs(dataset=input_dataset)
        
        globalTag = response[0]['global_tag']
        # GlobalTag cannot be empty
        if globalTag == '':
            globalTag = 'UNKNOWN'
            
        return globalTag
    
    def isDataAtUrl(self, dbs_url,input_dataset):
        """
        Returns True if the dataset is at the dbs url, if not returns False
        """
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listDatasets(dataset=input_dataset)
        # This means that the dataset is not at the url
        if not response:
            return False
        else:
            return True
         
    def getLabelByValueDict(self, control):
        """
        From control items, create a dictionary by values
        """   
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[value] = label
                
        return d
    
    def getValueByLabelDict(self, control):
        """
        From control items, create a dictionary by labels
        """
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[label] = value

        return d
    
    def createRequestJSON(self, ticket, input_dataset, dbs_url, cmssw_release, group_name, version = 1):
        """
        Creates a JSON file 'Ticket_#TICKET.json' with the needed
        information for creating a request on ReqMgr.
        Input:
            - ticket: the ticket #, for instance 110773 on https://ggus.eu/?mode=ticket_info&ticket_id=110773
            - input_dataset
            - dbs_url: only the instance name, for example "phys01" for
             https://cmsweb.cern.ch/dbs/prod/phys01/DBSReader
            - cmssw_release
            - group_name: the physics group name
            - version: the dataset version, 1 by default.
        It returns a dictionary that contains the request information.
        """

        scramArchByCMSSW = self.getScramArchByCMSSW()
        task = ticket
        print("Processing ticket: %s" % task)
        
        #splitting input dataset       
        input_primary_dataset = input_dataset.split('/')[1].replace(' ','')
        input_processed_dataset = input_dataset.split('/')[2].replace(' ','')
        data_tier = input_dataset.split('/')[3].replace(' ','')
                
        # Transform input value to a valid DBS url
        #dbs_url = "https://cmsweb.cern.ch/dbs/prod/"+dbs_url+"/DBSReader"
        dbs_url = dbs_base_url+dbs_url+"/DBSReader"
        release_id = cmssw_release
                
        # check if deprecated release was used
        release = cmssw_release
        # check if release has not ScramArch match
        if release not in scramArchByCMSSW:
            raise Exception("Error on ticket %s due to ScramArch mismatch" % task)
        else:
            scram_arch = scramArchByCMSSW[release][-1]

        # check if dataset is not at dbs url
        try:
            data_at_url = self.isDataAtUrl(dbs_url,input_dataset)
        except:
            raise Exception('Error on ticket %s, dataset %s not available at %s' %(task, input_dataset,dbs_url))

        if not data_at_url:
            raise Exception('Error on ticket %s, dataset %s not available at %s' %(task, input_dataset,dbs_url))
                    
        ## Get Physics Group
        group_squad = 'cms-storeresults-'+group_name.replace("-","_").lower()

        ## Get Dataset Version
        dataset_version = str(version)

        # Set default Acquisition Era for StoreResults
        acquisitionEra = "StoreResults"

        ## Construction of the new dataset name (ProcessingString)
        ## remove leading hypernews or physics group name and StoreResults+Version
        if input_processed_dataset.find(group_name)==0:
            new_dataset = input_processed_dataset.replace(group_name,"",1)
        else:
            stripped_dataset = input_processed_dataset.split("-")[1:]
            new_dataset = '_'.join(stripped_dataset)
                        
        # Get dataset site info:
        psnList, pnnList = self.getDatasetOriginSites(dbs_url,input_dataset)

        infoDict = {}
        # Build store results json
        # First add all the defaults values
        infoDict["RequestType"] = "StoreResults"
        infoDict["UnmergedLFNBase"] = "/store/unmerged" 
        infoDict["MergedLFNBase"] = "/store/results/" + group_name.replace("-","_").lower()
        infoDict["MinMergeSize"] = 1500000000
        infoDict["MaxMergeSize"] = 5000000000
        infoDict["MaxMergeEvents"] = 100000
        infoDict["TimePerEvent"] = 40
        infoDict["SizePerEvent"] = 512.0
        infoDict["Memory"] = 2394
        infoDict["CmsPath"] = "/uscmst1/prod/sw/cms"                                        
        infoDict["Group"] = "DATAOPS"
        infoDict["DbsUrl"] = dbs_url
        
        # Add all the information pulled from Savannah
        infoDict["AcquisitionEra"] = acquisitionEra
        infoDict["GlobalTag"] = self.setGlobalTagFromOrigin(dbs_url, input_dataset)
        infoDict["DataTier"] = data_tier
        infoDict["InputDataset"] = input_dataset
        infoDict["ProcessingString"] = new_dataset
        infoDict["CMSSWVersion"] = release
        infoDict["ScramArch"] = scram_arch
        infoDict["ProcessingVersion"] = dataset_version                    
        infoDict["SiteWhitelist"] = psnList
        
        # Create report for Migration2Global
        report = {}
         
        #Fill json file, if status is done
        self.writeJSONFile(task, infoDict)
        report["json"] = 'y'
        report["task"] = int(task)
        report["InputDataset"] = input_dataset
        report["ProcessingString"] = new_dataset
        report["localUrl"] = dbs_url
        report["sites"] = psnList
        report["pnns"] = pnnList

        return report

    def writeJSONFile(self, task, infoDict):
        """
        This writes a JSON file at ComponentDir
        """
        ##check if file already exists
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'
        if not os.access(filename,os.F_OK):
            jsonfile = open(filename,'w')
            request = {'createRequest':infoDict} ## CHECK THIS BEFORE FINISHING
            jsonfile.write(json.dumps(request,sort_keys=True, indent=4))
            jsonfile.close()

        return

    def removeJSONFile(self,task):
        """
        This removes the JSON file at ComponentDir if it was created
        """
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'

        if os.access(filename,os.F_OK):
            os.remove(filename)
        return

    def printReport(self, report):
        """
        Print out a report
        """
        print("%20s %5s %10s %50s %50s" %( 'Ticket','json','local DBS','Sites','pnns')) 
        print("%20s %5s %10s %50s %50s" %( '-'*20,'-'*5,'-'*10,'-'*50,'-'*50 ))
        
        json = report["json"]
        ticket = report["task"]
        #status = report["ticketStatus"]
        localUrl = report["localUrl"].split('/')[5]
        site = ', '.join(report["sites"])
        pnns = ', '.join(report["pnns"])
        print("%20s %5s %10s %50s %50s" %(ticket,json,localUrl,site,pnns))  
Example #16
class DBS3Reader(object):
    """
    _DBSReader_

    General API for reading data from DBS
    """
    # cache all the datatiers known by DBS
    _datatiers = {}

    def __init__(self, url, **contact):

        # instantiate dbs api object
        try:
            self.dbsURL = url
            self.dbs = DbsApi(url, **contact)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # connection to PhEDEx (Use default endpoint url)
        self.phedex = PhEDEx(responseType="json")

    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        currently only takes one LFN, but the DBS API needs to be updated
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName,
                                                   validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(
                        self.dbs.listFileLumiArray(logical_file_name=slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            if lumisItem.get('event_count', None) is not None:
                item['EventCount'] = lumisItem['event_count']
            lumiDict[lumisItem['logical_file_name']].append(item)
            # TODO: add key for lumi and event pair.
        return lumiDict

    def checkDBSServer(self):
        """
        check whether dbs server is up and running
        returns {"dbs_instance": "prod/global", "dbs_version": "3.3.144"}
        """
        try:
            return self.dbs.serverinfo()
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBS server is not up: %s" % self.dbsURL
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listPrimaryDatasets(self, match='*'):
        """
        _listPrimaryDatasets_

        Return a list of primary datasets. The full dataset name must be provided;
        pattern-based matching is no longer supported.
        If no expression is provided, all datasets are returned.
        """
        try:
            result = self.dbs.listPrimaryDatasets(primary_ds_name=match)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % match
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['primary_ds_name'] for x in result]
        return result

    def matchProcessedDatasets(self, primary, tier, process):
        """
        _matchProcessedDatasets_

        return a list of Processed datasets
        """
        result = []
        try:
            datasets = self.dbs.listDatasets(primary_ds_name=primary,
                                             data_tier_name=tier,
                                             detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for dataset in datasets:
            dataset = remapDBS3Keys(dataset, processed_ds_name='Name')
            dataset['PathList'] = [dataset['dataset']]
            if dataset['Name'] == process:
                result.append(dataset)
        return result

    def listRuns(self, dataset=None, block=None):
        """
        It gets a list of DbsRun objects, but for our purpose
        only the list of run numbers is collected.
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        runs = []
        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        for x in results:
            runs.extend(x['run_num'])
        return runs

    def listRunLumis(self, dataset=None, block=None):
        """
        It gets a list of DBSRun objects and returns the number of lumisections per run
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        # Pointless code in python3
        if isinstance(block, str):
            block = unicode(block)
        if isinstance(dataset, str):
            dataset = unicode(dataset)

        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # send runDict format as result; this format is for sync with the dbs2 call,
        # which has {run_number: num_lumis}, but the dbs3 call doesn't return the number of lumis,
        # so it returns {run_number: None}
        # TODO: After DBS2 is completely removed, change the return format to a more sensible one

        runDict = {}
        for x in results:
            for runNumber in x["run_num"]:
                runDict[runNumber] = None
        return runDict

    def listProcessedDatasets(self, primary, dataTier='*'):
        """
        _listProcessedDatasets_

        return a list of Processed datasets for the primary and optional
        data tier value

        """
        try:
            result = self.dbs.listDatasets(primary_ds_name=primary,
                                           data_tier_name=dataTier)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['dataset'].split('/')[2] for x in result]
        return result

    def listDatasetFiles(self, datasetPath):
        """
        _listDatasetFiles_

        Get list of files for dataset

        """
        return [
            x['logical_file_name']
            for x in self.dbs.listFileArray(dataset=datasetPath)
        ]

    @staticmethod
    def listDatatiers(dbsUrl=None):
        """
        _listDatatiers_

        Get a list of datatiers known by DBS.
        """
        if dbsUrl is None:
            msg = "Error in DBSReader.listDatatiers(). DBS Url not set."
            raise DBSReaderError(msg)

        timenow = int(time.time())
        if DBS3Reader._datatiers and timenow - 7200 < DBS3Reader._datatiers[
                'ts']:
            return DBS3Reader._datatiers['tiers']

        try:
            DBS3Reader._setDatatiersCache(timenow, dbsUrl)
        except Exception as ex:
            if not DBS3Reader._datatiers:
                msg = "Error in DBSReader.listDatatiers\n%s" % formatEx3(ex)
                raise DBSReaderError(msg)
        return DBS3Reader._datatiers['tiers']

    @staticmethod
    def _setDatatiersCache(ts, dbsUrl):
        """
        Set a timestamp and update the list of datatiers cached in
        the class property
        """
        dbs = DbsApi(dbsUrl)
        DBS3Reader._datatiers['ts'] = ts
        DBS3Reader._datatiers['tiers'] = [
            tier['data_tier_name'] for tier in dbs.listDataTiers()
        ]

        return

    def listDatasetFileDetails(self,
                               datasetPath,
                               getParents=False,
                               validFileOnly=1):
        """
        TODO: This is completely wrong and needs to be redone, or removed - getting the whole dataset
        at once might be too costly

        _listDatasetFileDetails_

        Get list of lumis, events, and parents for each file in a dataset
        Return a dict where the keys are the files, and for each file we have something like:
            { 'NumberOfEvents': 545,
              'BlockName': '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace',
              'Lumis': {173658: [8, 12, 9, 14, 19, 109, 105]},
              'Parents': [],
              'Checksum': '22218315',
              'Adler32': 'a41a1446',
              'FileSize': 286021145,
              'ValidFile': 1
            }

        """
        fileDetails = self.getFileListByDataset(dataset=datasetPath,
                                                validFileOnly=validFileOnly,
                                                detail=True)
        blocks = set()  # the set of blocks of the dataset
        # Iterate over the files and prepare the set of blocks and a dict where the keys are the files
        files = {}
        for f in fileDetails:
            blocks.add(f['block_name'])
            files[f['logical_file_name']] = remapDBS3Keys(f, stringify=True)
            files[f['logical_file_name']]['ValidFile'] = f['is_file_valid']
            files[f['logical_file_name']]['Lumis'] = {}
            files[f['logical_file_name']]['Parents'] = []

        # Iterate over the blocks and get parents and lumis
        for blockName in blocks:
            # get the parents
            if getParents:
                parents = self.dbs.listFileParents(block_name=blockName)
                for p in parents:
                    if p['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        files[p['logical_file_name']]['Parents'].extend(
                            p['parent_logical_file_name'])
            # get the lumis
            file_lumis = self.dbs.listFileLumis(block_name=blockName)
            for f in file_lumis:
                if f['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                    if f['run_num'] in files[f['logical_file_name']]['Lumis']:
                        files[f['logical_file_name']]['Lumis'][
                            f['run_num']].extend(f['lumi_section_num'])
                    else:
                        files[f['logical_file_name']]['Lumis'][
                            f['run_num']] = f['lumi_section_num']

        return files

    def crossCheck(self, datasetPath, *lfns):
        """
        _crossCheck_

        For the dataset provided, check that the lfns listed all exist
        in the dataset.

        Return the list of lfns that are in the dataset

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath,
                                         validFileOnly=1,
                                         detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        return list(setOfAllLfns.intersection(setOfKnownLfns))

    def crossCheckMissing(self, datasetPath, *lfns):
        """
        _crossCheckMissing_

        As cross check, but return value is a list of files that
        are *not* known by DBS

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath,
                                         validFileOnly=1,
                                         detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        knownFiles = setOfAllLfns.intersection(setOfKnownLfns)
        unknownFiles = setOfKnownLfns.difference(knownFiles)
        return list(unknownFiles)

    def getDBSSummaryInfo(self, dataset=None, block=None):
        """
        Get dataset summary includes # of files, events, blocks and total size
        """
        # FIXME: Doesn't raise exceptions on missing data as the old api did
        if dataset:
            self.checkDatasetPath(dataset)
        try:
            if block:
                summary = self.dbs.listFileSummaries(block_name=block,
                                                     validFileOnly=1)
            else:  # dataset case dataset shouldn't be None
                summary = self.dbs.listFileSummaries(dataset=dataset,
                                                     validFileOnly=1)
        except Exception as ex:
            msg = "Error in DBSReader.getDBSSummaryInfo(%s, %s)\n" % (dataset,
                                                                      block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        if not summary or summary[0].get(
                'file_size') is None:  # appears to indicate missing dataset
            msg = "DBSReader.listDatasetSummary(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, block))
        result = remapDBS3Keys(summary[0], stringify=True)
        result['path'] = dataset if dataset else ''
        result['block'] = block if block else ''
        return result

    def getFileBlocksInfo(self,
                          dataset,
                          onlyClosedBlocks=False,
                          blockName=None,
                          locations=True):
        """
        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': True}
        if blockName:
            args['block_name'] = blockName
        try:
            blocks = self.dbs.listBlocks(**args)
        except Exception as ex:
            msg = "Error in DBSReader.getFileBlocksInfo(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        blocks = [
            remapDBS3Keys(block, stringify=True, block_name='Name')
            for block in blocks
        ]
        # only raise if blockName not specified - mimic dbs2 error handling
        if not blocks and not blockName:
            msg = "DBSReader.getFileBlocksInfo(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, blockName))
        if locations:
            for block in blocks:
                block['PhEDExNodeList'] = [{
                    'Name': x
                } for x in self.listFileBlockLocation(block['Name'])]

        if onlyClosedBlocks:
            return [x for x in blocks if str(x['OpenForWriting']) != "1"]

        return blocks

    def listFileBlocks(self, dataset, onlyClosedBlocks=False, blockName=None):
        """
        _listFileBlocks_

        Retrieve a list of fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': False}
        if blockName:
            args['block_name'] = blockName
        if onlyClosedBlocks:
            args['detail'] = True
        try:
            blocks = self.dbs.listBlocks(**args)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if onlyClosedBlocks:
            result = [
                x['block_name'] for x in blocks
                if str(x['open_for_writing']) != "1"
            ]

        else:
            result = [x['block_name'] for x in blocks]

        return result

    def listOpenFileBlocks(self, dataset):
        """
        _listOpenFileBlocks_

        Retrieve a list of open fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        try:
            blocks = self.dbs.listBlocks(dataset=dataset, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [
            x['block_name'] for x in blocks
            if str(x['open_for_writing']) == "1"
        ]

        return result

    def blockExists(self, fileBlockName):
        """
        _blockExists_

        Check to see if block with name provided exists in the DBS
        Instance.

        Return True if exists, False if not

        """
        self.checkBlockName(fileBlockName)
        try:

            blocks = self.dbs.listBlocks(block_name=fileBlockName)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockExists(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if len(blocks) == 0:
            return False
        return True

    def listFilesInBlock(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlock_

        Get a list of files in the named fileblock
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it is always True.
        We need to clean this code up once dbs2 is completely deprecated;
        fetching lumis just for the run number is expensive.
        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            files = self.dbs.listFileArray(block_name=fileBlockName,
                                           validFileOnly=validFileOnly,
                                           detail=True)
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if lumis:
            lumiDict = self._getLumiList(blockName=fileBlockName,
                                         validFileOnly=validFileOnly)

        result = []
        for fileInfo in files:
            if lumis:
                fileInfo["LumiList"] = lumiDict[fileInfo['logical_file_name']]
            result.append(remapDBS3Keys(fileInfo, stringify=True))
        return result

    def listFilesInBlockWithParents(self,
                                    fileBlockName,
                                    lumis=True,
                                    validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of that file.
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it is always True.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only the valid block for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis,
                                                validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName, )
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])

        parentsLFNs = childByParents.keys()

        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slice parentsLFNs until the DBS API can handle large lists itself.
        # Remove the slicing once the DBS API handles it.
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(
                self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[
                fileInfo['logical_file_name']]

        return fileDetails

    def lfnsInBlock(self, fileBlockName):
        """
        _lfnsInBlock_

        LFN list only for block, details = False => faster query

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.lfnsInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            lfns = self.dbs.listFileArray(block_name=fileBlockName,
                                          validFileOnly=1,
                                          detail=False)
            return lfns
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listFileBlockLocation(self, fileBlockNames, dbsOnly=False):
        """
        _listFileBlockLocation_

        Get origin_site_name of a block

        """

        singleBlockName = None
        if isinstance(fileBlockNames, basestring):
            singleBlockName = fileBlockNames
            fileBlockNames = [fileBlockNames]

        for block in fileBlockNames:
            self.checkBlockName(block)

        locations = {}
        node_filter = set(['UNKNOWN', None])

        if dbsOnly:
            blocksInfo = {}
            try:
                for block in fileBlockNames:
                    blocksInfo.setdefault(block, [])
                    # there should be only one element with a single origin site string ...
                    for blockInfo in self.dbs.listBlockOrigin(
                            block_name=block):
                        blocksInfo[block].append(blockInfo['origin_site_name'])
            except dbsClientException as ex:
                msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(
                    block=fileBlockNames, complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % str(ex)
                raise Exception(msg)

        for block in fileBlockNames:
            valid_nodes = set(blocksInfo.get(block, [])) - node_filter
            locations[block] = list(valid_nodes)

        # returning single list if a single block is passed
        if singleBlockName:
            return locations[singleBlockName]

        return locations

    def getFileBlock(self, fileBlockName, dbsOnly=False):
        """
        _getFileBlock_

        dbsOnly flag is mostly meant for StoreResults, since there is no
        data in TMDB.

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : { LFN : Events },
             }
        }


        """
        # Pointless code in python3
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {
            fileBlockName: {
                "PhEDExNodeNames":
                self.listFileBlockLocation(fileBlockName, dbsOnly),
                "Files":
                self.listFilesInBlock(fileBlockName),
                "IsOpen":
                self.blockIsOpen(fileBlockName)
            }
        }
        return result

    def getFileBlockWithParents(self, fileBlockName):
        """
        _getFileBlockWithParents_

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : dictionaries representing each file
             }
        }

        files

        """
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)

        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {
            fileBlockName: {
                "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName),
                "Files": self.listFilesInBlockWithParents(fileBlockName),
                "IsOpen": self.blockIsOpen(fileBlockName)
            }
        }
        return result

    def getFiles(self, dataset, onlyClosedBlocks=False):
        """
        _getFiles_

        Returns a dictionary of block names for the dataset where
        each block consists of a dictionary containing the PhEDExNodeNames
        for that block and the files in that block by LFN mapped to NEvents

        """
        result = {}
        blocks = self.listFileBlocks(dataset, onlyClosedBlocks)

        for x in blocks:
            result.update(self.getFileBlock(x))

        return result

    def listBlockParents(self, blockName):
        """Get parent blocks for block"""
        result = []
        self.checkBlockName(blockName)
        blocks = self.dbs.listBlockParents(block_name=blockName)
        for block in blocks:
            toreturn = {'Name': block['parent_block_name']}
            toreturn['PhEDExNodeList'] = self.listFileBlockLocation(
                toreturn['Name'])
            result.append(toreturn)
        return result

    def blockIsOpen(self, blockName):
        """
        _blockIsOpen_

        Return True if the named block is open, False if not, or if the block
        doesn't exist

        """
        self.checkBlockName(blockName)
        blockInstance = self.dbs.listBlocks(block_name=blockName, detail=True)
        if len(blockInstance) == 0:
            return False
        blockInstance = blockInstance[0]
        isOpen = blockInstance.get('open_for_writing', 1)
        if isOpen == 0:
            return False
        return True

    def blockToDatasetPath(self, blockName):
        """
        _blockToDatasetPath_

        Given a block name, get the dataset Path associated with that
        Block.

        Returns the dataset path, or None if not found

        """
        self.checkBlockName(blockName)
        try:
            blocks = self.dbs.listBlocks(block_name=blockName, detail=True)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockToDatasetPath(%s)\n" % blockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if blocks == []:
            return None

        pathname = blocks[-1].get('dataset', None)
        return pathname

    def listDatasetLocation(self, datasetName, dbsOnly=False):
        """
        _listDatasetLocation_

        List the origin SEs where there is at least a block of the given
        dataset.
        """
        self.checkDatasetPath(datasetName)

        locations = set()

        if dbsOnly:
            try:
                blocksInfo = self.dbs.listBlockOrigin(dataset=datasetName)
            except dbsClientException as ex:
                msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)

            if not blocksInfo:  # no data location from dbs
                return list()

            for blockInfo in blocksInfo:
                locations.add(blockInfo['origin_site_name'])

            locations.difference_update(
                ['UNKNOWN', None])  # remove entry when SE name is 'UNKNOWN'
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(
                    dataset=[datasetName], complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for dataset=%s)\n" % datasetName
                msg += "%s\n" % str(ex)
                raise Exception(msg)

            if blocksInfo:
                for blockSites in blocksInfo.values():
                    locations.update(blockSites)

        return list(locations)

    def checkDatasetPath(self, pathName):
        """
         _checkDatasetPath_
        """
        if pathName in ("", None):
            raise DBSReaderError("Invalid Dataset Path name: => %s <=" %
                                 pathName)
        else:
            try:
                result = self.dbs.listDatasets(dataset=pathName,
                                               dataset_access_type='*')
                if len(result) == 0:
                    raise DBSReaderError("Dataset %s doesn't exist in DBS %s" %
                                         (pathName, self.dbsURL))
            except (dbsClientException, HTTPError) as ex:
                msg = "Error in "
                msg += "DBSReader.checkDatasetPath(%s)\n" % pathName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        return

    def checkBlockName(self, blockName):
        """
         _checkBlockName_
        """
        if blockName in ("", "*", None):
            raise DBSReaderError("Invalid Block name: => %s <=" % blockName)

    def getFileListByDataset(self, dataset, validFileOnly=1, detail=True):
        """
        _getFileListByDataset_

        Given a dataset, retrieves all blocks, lfns and number of events (among other
        not really important info).
        Returns a list of dict.
        """

        try:
            fileList = self.dbs.listFileArray(dataset=dataset,
                                              validFileOnly=validFileOnly,
                                              detail=detail)
            return fileList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.getFileListByDataset(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listDatasetParents(self, childDataset):
        """
        list the parent dataset paths for the given childDataset
        """
        try:
            parentList = self.dbs.listDatasetParents(dataset=childDataset)
            return parentList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listDatasetParents(%s)\n" % childDataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
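
A minimal usage sketch for the read-only DBS3Reader methods above; the import path, DBS URL and dataset name are illustrative assumptions, not part of the original example:

# Hedged sketch: exercise a few read-only DBS3Reader calls.
# Import path, URL and dataset below are assumptions for illustration only.
from WMCore.Services.DBS.DBS3Reader import DBS3Reader

reader = DBS3Reader("https://cmsweb.cern.ch/dbs/prod/global/DBSReader")  # assumed DBS instance
dataset = "/HighPileUp/Run2011A-v1/RAW"  # dataset path taken from the docstrings above

print(reader.getDBSSummaryInfo(dataset=dataset))  # files, events, blocks and total size
closedBlocks = reader.listFileBlocks(dataset, onlyClosedBlocks=True)
if closedBlocks:
    print(reader.listFileBlockLocation(closedBlocks[0]))  # PhEDEx node names for the first closed block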
Example #17
0
File: DBS3Reader.py Project: dmwm/WMCore
class DBS3Reader(object):
    """
    _DBSReader_

    General API for reading data from DBS
    """
    # cache all the datatiers known by DBS
    _datatiers = {}

    def __init__(self, url, logger=None, **contact):

        # instantiate dbs api object
        try:
            self.dbsURL = url
            self.dbs = DbsApi(url, **contact)
            self.logger = logger or logging.getLogger(self.__class__.__name__)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # connection to PhEDEx (Use default endpoint url)
        self.phedex = PhEDEx(responseType="json", dbsUrl=self.dbsURL)

    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        currently only takes one lfn, but the dbs api needs to be updated
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName, validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(self.dbs.listFileLumiArray(logical_file_name=slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
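        # build a map of LFN -> list of {'RunNumber', 'LumiSectionNumber', 'EventCount'} entries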
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            if lumisItem.get('event_count', None) is not None:
                item['EventCount'] = lumisItem['event_count']
            lumiDict[lumisItem['logical_file_name']].append(item)
            # TODO: add key for lumi and event pair.
        return lumiDict

    def checkDBSServer(self):
        """
        check whether dbs server is up and running
        returns {"dbs_instance": "prod/global", "dbs_version": "3.3.144"}
        """
        try:
            return self.dbs.serverinfo()
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBS server is not up: %s" % self.dbsURL
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listPrimaryDatasets(self, match='*'):
        """
        _listPrimaryDatasets_

        return a list of primary datasets. The full dataset name must be provided;
        pattern-based matching is no longer supported.
        If no expression is provided, all datasets are returned
        """
        try:
            result = self.dbs.listPrimaryDatasets(primary_ds_name=match)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % match
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['primary_ds_name'] for x in result]
        return result

    def matchProcessedDatasets(self, primary, tier, process):
        """
        _matchProcessedDatasets_

        return a list of Processed datasets
        """
        result = []
        try:
            datasets = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=tier, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for dataset in datasets:
            dataset = remapDBS3Keys(dataset, processed_ds_name='Name')
            dataset['PathList'] = [dataset['dataset']]
            if dataset['Name'] == process:
                result.append(dataset)
        return result

    def listRuns(self, dataset=None, block=None):
        """
        It gets a list of DbsRun objects, but for our purpose
        only the list of run numbers is collected.
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        runs = []
        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        for x in results:
            runs.extend(x['run_num'])
        return runs

    def listRunLumis(self, dataset=None, block=None):
        """
        It gets a list of DBSRun objects and returns the number of lumisections per run
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        # Pointless code in python3
        if isinstance(block, str):
            block = unicode(block)
        if isinstance(dataset, str):
            dataset = unicode(dataset)

        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # Return the runDict format as the result. This format is kept for sync with the dbs2 call,
        # which returned {run_number: num_lumis}, but the dbs3 call doesn't return the number of lumis,
        # so it returns {run_number: None}.
        # TODO: After DBS2 is completely removed, change the return format to a more sensible one

        runDict = {}
        for x in results:
            for runNumber in x["run_num"]:
                runDict[runNumber] = None
        return runDict

    def listProcessedDatasets(self, primary, dataTier='*'):
        """
        _listProcessedDatasets_

        return a list of Processed datasets for the primary and optional
        data tier value

        """
        try:
            result = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=dataTier)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['dataset'].split('/')[2] for x in result]
        return result

    def listDatasetFiles(self, datasetPath):
        """
        _listDatasetFiles_

        Get list of files for dataset

        """
        return [x['logical_file_name'] for x in self.dbs.listFileArray(dataset=datasetPath)]

    @staticmethod
    def listDatatiers(dbsUrl=None):
        """
        _listDatatiers_

        Get a list of datatiers known by DBS.
        """
        if dbsUrl is None:
            msg = "Error in DBSReader.listDatatiers(). DBS Url not set."
            raise DBSReaderError(msg)

        timenow = int(time.time())
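        # reuse the cached datatier list if it is less than two hours (7200 s) old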
        if DBS3Reader._datatiers and timenow - 7200 < DBS3Reader._datatiers['ts']:
            return DBS3Reader._datatiers['tiers']

        try:
            DBS3Reader._setDatatiersCache(timenow, dbsUrl)
        except Exception as ex:
            if not DBS3Reader._datatiers:
                msg = "Error in DBSReader.listDatatiers\n%s" % formatEx3(ex)
                raise DBSReaderError(msg)
        return DBS3Reader._datatiers['tiers']

    @staticmethod
    def _setDatatiersCache(ts, dbsUrl):
        """
        Set a timestamp and update the list of datatiers cached in
        the class property
        """
        dbs = DbsApi(dbsUrl)
        DBS3Reader._datatiers['ts'] = ts
        DBS3Reader._datatiers['tiers'] = [tier['data_tier_name'] for tier in dbs.listDataTiers()]

        return

    def listDatasetFileDetails(self, datasetPath, getParents=False, getLumis=True, validFileOnly=1):
        """
        TODO: This is completely wrong and needs to be redone or removed - getting the whole dataset at once
        might be too costly

        _listDatasetFileDetails_

        Get list of lumis, events, and parents for each file in a dataset
        Return a dict where the keys are the files, and for each file we have something like:
            { 'NumberOfEvents': 545,
              'BlockName': '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace',
              'Lumis': {173658: [8, 12, 9, 14, 19, 109, 105]},
              'Parents': [],
              'Checksum': '22218315',
              'Adler32': 'a41a1446',
              'FileSize': 286021145,
              'ValidFile': 1
            }

        """
        fileDetails = self.getFileListByDataset(dataset=datasetPath, validFileOnly=validFileOnly, detail=True)
        blocks = set()  # the set of blocks of the dataset
        # Iterate over the files and prepare the set of blocks and a dict where the keys are the files
        files = {}
        for f in fileDetails:
            blocks.add(f['block_name'])
            files[f['logical_file_name']] = remapDBS3Keys(f, stringify=True)
            files[f['logical_file_name']]['ValidFile'] = f['is_file_valid']
            files[f['logical_file_name']]['Lumis'] = {}
            files[f['logical_file_name']]['Parents'] = []

        # Iterate over the blocks and get parents and lumis
        for blockName in blocks:
            # get the parents
            if getParents:
                parents = self.dbs.listFileParents(block_name=blockName)
                for p in parents:
                    if p['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        files[p['logical_file_name']]['Parents'].extend(p['parent_logical_file_name'])

            if getLumis:
                # get the lumis
                file_lumis = self.dbs.listFileLumis(block_name=blockName)
                for f in file_lumis:
                    if f['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        if f['run_num'] in files[f['logical_file_name']]['Lumis']:
                            files[f['logical_file_name']]['Lumis'][f['run_num']].extend(f['lumi_section_num'])
                        else:
                            files[f['logical_file_name']]['Lumis'][f['run_num']] = f['lumi_section_num']

        return files

    def crossCheck(self, datasetPath, *lfns):
        """
        _crossCheck_

        For the dataset provided, check that the lfns listed all exist
        in the dataset.

        Return the list of lfns that are in the dataset

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        return list(setOfAllLfns.intersection(setOfKnownLfns))

    def crossCheckMissing(self, datasetPath, *lfns):
        """
        _crossCheckMissing_

        As cross check, but return value is a list of files that
        are *not* known by DBS

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        knownFiles = setOfAllLfns.intersection(setOfKnownLfns)
        unknownFiles = setOfKnownLfns.difference(knownFiles)
        return list(unknownFiles)

    def getDBSSummaryInfo(self, dataset=None, block=None):
        """
        Get dataset summary includes # of files, events, blocks and total size
        """
        if dataset:
            self.checkDatasetPath(dataset)
        try:
            if block:
                summary = self.dbs.listFileSummaries(block_name=block, validFileOnly=1)
            else:
                summary = self.dbs.listFileSummaries(dataset=dataset, validFileOnly=1)
        except Exception as ex:
            msg = "Error in DBSReader.getDBSSummaryInfo(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if not summary:  # missing data or all files invalid
            return {}

        result = remapDBS3Keys(summary[0], stringify=True)
        result['path'] = dataset if dataset else ''
        result['block'] = block if block else ''
        return result

    def getFileBlocksInfo(self, dataset, onlyClosedBlocks=False,
                          blockName=None, locations=True):
        """
        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': True}
        if blockName:
            args['block_name'] = blockName
        try:
            blocks = self.dbs.listBlocks(**args)
        except Exception as ex:
            msg = "Error in DBSReader.getFileBlocksInfo(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        blocks = [remapDBS3Keys(block, stringify=True, block_name='Name') for block in blocks]
        # only raise if blockName not specified - mimic dbs2 error handling
        if not blocks and not blockName:
            msg = "DBSReader.getFileBlocksInfo(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, blockName))
        if locations:
            for block in blocks:
                block['PhEDExNodeList'] = [{'Name': x} for x in self.listFileBlockLocation(block['Name'])]

        if onlyClosedBlocks:
            return [x for x in blocks if str(x['OpenForWriting']) != "1"]

        return blocks

    def listFileBlocks(self, dataset, onlyClosedBlocks=False, blockName=None):
        """
        _listFileBlocks_

        Retrieve a list of fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': False}
        if blockName:
            args['block_name'] = blockName
        if onlyClosedBlocks:
            args['detail'] = True
        try:
            blocks = self.dbs.listBlocks(**args)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if onlyClosedBlocks:
            result = [x['block_name'] for x in blocks if str(x['open_for_writing']) != "1"]

        else:
            result = [x['block_name'] for x in blocks]

        return result

    def listOpenFileBlocks(self, dataset):
        """
        _listOpenFileBlocks_

        Retrieve a list of open fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        try:
            blocks = self.dbs.listBlocks(dataset=dataset, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['block_name'] for x in blocks if str(x['open_for_writing']) == "1"]

        return result

    def blockExists(self, fileBlockName):
        """
        _blockExists_

        Check to see if block with name provided exists in the DBS
        Instance.

        Return True if exists, False if not

        """
        self.checkBlockName(fileBlockName)
        try:

            blocks = self.dbs.listBlocks(block_name=fileBlockName)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockExists(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if len(blocks) == 0:
            return False
        return True

    def listFilesInBlock(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlock_

        Get a list of files in the named fileblock
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it is always True.
        We need to clean this code up once dbs2 is completely deprecated;
        fetching lumis just for the run number is expensive.
        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            files = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=validFileOnly, detail=True)
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if lumis:
            lumiDict = self._getLumiList(blockName=fileBlockName, validFileOnly=validFileOnly)

        result = []
        for fileInfo in files:
            if lumis:
                fileInfo["LumiList"] = lumiDict[fileInfo['logical_file_name']]
            result.append(remapDBS3Keys(fileInfo, stringify=True))
        return result

    def listFilesInBlockWithParents(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of that file.
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it is always True.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only the valid block for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis, validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName,)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])

        parentsLFNs = childByParents.keys()

        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slice parentsLFNs until the DBS API can handle large lists itself.
        # Remove the slicing once the DBS API handles it.
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[fileInfo['logical_file_name']]

        return fileDetails

    def lfnsInBlock(self, fileBlockName):
        """
        _lfnsInBlock_

        LFN list only for block, details = False => faster query

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.lfnsInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            lfns = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=1, detail=False)
            return lfns
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listFileBlockLocation(self, fileBlockNames, dbsOnly=False):
        """
        _listFileBlockLocation_

        Get origin_site_name of a block

        """

        singleBlockName = None
        if isinstance(fileBlockNames, basestring):
            singleBlockName = fileBlockNames
            fileBlockNames = [fileBlockNames]

        for block in fileBlockNames:
            self.checkBlockName(block)

        locations = {}
        node_filter = set(['UNKNOWN', None])

        if dbsOnly:
            blocksInfo = {}
            try:
                for block in fileBlockNames:
                    blocksInfo.setdefault(block, [])
                    # there should be only one element with a single origin site string ...
                    for blockInfo in self.dbs.listBlockOrigin(block_name=block):
                        blocksInfo[block].append(blockInfo['origin_site_name'])
            except dbsClientException as ex:
                msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(block=fileBlockNames, complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % str(ex)
                raise Exception(msg)

        for block in fileBlockNames:
            valid_nodes = set(blocksInfo.get(block, [])) - node_filter
            locations[block] = list(valid_nodes)

        # returning single list if a single block is passed
        if singleBlockName:
            return locations[singleBlockName]

        return locations

    def getFileBlock(self, fileBlockName, dbsOnly=False):
        """
        _getFileBlock_

        dbsOnly flag is mostly meant for StoreResults, since there is no
        data in TMDB.

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : { LFN : Events },
             }
        }


        """
        # Pointless code in python3
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName, dbsOnly),
            "Files": self.listFilesInBlock(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
        }
        }
        return result

    def getFileBlockWithParents(self, fileBlockName):
        """
        _getFileBlockWithParents_

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : dictionaries representing each file
             }
        }

        files

        """
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)

        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName),
            "Files": self.listFilesInBlockWithParents(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
        }
        }
        return result

    def getFiles(self, dataset, onlyClosedBlocks=False):
        """
        _getFiles_

        Returns a dictionary of block names for the dataset where
        each block consists of a dictionary containing the PhEDExNodeNames
        for that block and the files in that block by LFN mapped to NEvents

        """
        result = {}
        blocks = self.listFileBlocks(dataset, onlyClosedBlocks)

        for x in blocks:
            result.update(self.getFileBlock(x))

        return result

    def listBlockParents(self, blockName):
        """Get parent blocks for block"""
        result = []
        self.checkBlockName(blockName)
        blocks = self.dbs.listBlockParents(block_name=blockName)
        for block in blocks:
            toreturn = {'Name': block['parent_block_name']}
            toreturn['PhEDExNodeList'] = self.listFileBlockLocation(toreturn['Name'])
            result.append(toreturn)
        return result

    def blockIsOpen(self, blockName):
        """
        _blockIsOpen_

        Return True if the named block is open, False if not, or if the block
        doesn't exist

        """
        self.checkBlockName(blockName)
        blockInstance = self.dbs.listBlocks(block_name=blockName, detail=True)
        if len(blockInstance) == 0:
            return False
        blockInstance = blockInstance[0]
        isOpen = blockInstance.get('open_for_writing', 1)
        if isOpen == 0:
            return False
        return True

    def blockToDatasetPath(self, blockName):
        """
        _blockToDatasetPath_

        Given a block name, get the dataset Path associated with that
        Block.

        Returns the dataset path, or None if not found

        """
        self.checkBlockName(blockName)
        try:
            blocks = self.dbs.listBlocks(block_name=blockName, detail=True)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockToDatasetPath(%s)\n" % blockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if blocks == []:
            return None

        pathname = blocks[-1].get('dataset', None)
        return pathname

    def listDatasetLocation(self, datasetName, dbsOnly=False):
        """
        _listDatasetLocation_

        List the origin SEs where there is at least a block of the given
        dataset.
        """
        self.checkDatasetPath(datasetName)

        locations = set()

        if dbsOnly:
            try:
                blocksInfo = self.dbs.listBlockOrigin(dataset=datasetName)
            except dbsClientException as ex:
                msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)

            if not blocksInfo:  # no data location from dbs
                return list()

            for blockInfo in blocksInfo:
                locations.add(blockInfo['origin_site_name'])

            locations.difference_update(['UNKNOWN', None])  # remove entry when SE name is 'UNKNOWN'
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(dataset=[datasetName], complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for dataset=%s)\n" % datasetName
                msg += "%s\n" % str(ex)
                raise Exception(msg)

            if blocksInfo:
                for blockSites in blocksInfo.values():
                    locations.update(blockSites)

        return list(locations)

    def checkDatasetPath(self, pathName):
        """
         _checkDatasetPath_
        """
        if pathName in ("", None):
            raise DBSReaderError("Invalid Dataset Path name: => %s <=" % pathName)
        else:
            try:
                result = self.dbs.listDatasets(dataset=pathName, dataset_access_type='*')
                if len(result) == 0:
                    raise DBSReaderError("Dataset %s doesn't exist in DBS %s" % (pathName, self.dbsURL))
            except (dbsClientException, HTTPError) as ex:
                msg = "Error in "
                msg += "DBSReader.checkDatasetPath(%s)\n" % pathName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        return

    def checkBlockName(self, blockName):
        """
         _checkBlockName_
        """
        if blockName in ("", "*", None):
            raise DBSReaderError("Invalid Block name: => %s <=" % blockName)

    def getFileListByDataset(self, dataset, validFileOnly=1, detail=True):

        """
        _getFileListByDataset_

        Given a dataset, retrieves all blocks, lfns and number of events (among other
        not really important info).
        Returns a list of dict.
        """

        try:
            fileList = self.dbs.listFileArray(dataset=dataset, validFileOnly=validFileOnly, detail=detail)
            return fileList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.getFileListByDataset(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listDatasetParents(self, childDataset):
        """
        list the parent dataset paths for the given childDataset
        """
        try:
            parentList = self.dbs.listDatasetParents(dataset=childDataset)
            return parentList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listDatasetParents(%s)\n" % childDataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    # def getListFilesByLumiAndDataset(self, dataset, files):
    #     "Unsing pycurl to get all the child parents pair for given dataset"
    #
    #     urls = ['%s/data/dbs/fileparentbylumis?block_name=%s' % (
    #              self.dbsURL, b["block_name"]) for b in self.dbs.listBlocks(dataset=dataset)]
    #
    #     data = multi_getdata(urls, ckey(), cert())
    #     rdict = {}
    #     for row in data:
    #         try:
    #             data = json.loads(row['data'])
    #             rdict[req] = data['result'][0]  # we get back {'result': [workflow]} dict
    #         except Exception as exp:
    #             print("ERROR: fail to load data as json record, error=%s" % str(exp))
    #             print(row)
    #     return rdict

    def getParentFilesGivenParentDataset(self, parentDataset, childLFNs):
        """
        returns parent files for the given childLFNs when DBS doesn't have a direct parent-child relationship in the DB
        Only use this for finding missing parents

        :param parentDataset: parent dataset of the child files
        :param childLFNs: one or more files in the child dataset
        :return: dict mapping each child LFN to the set of its parent files
        """
        fInfo = self.dbs.listFileLumiArray(logical_file_name=childLFNs)
        parentFiles = defaultdict(set)
        for f in fInfo:
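            # parentage by lumi: look up parent-dataset files that cover the same run/lumi sections as this child file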
            pFileList = self.dbs.listFiles(dataset=parentDataset, run_num=f['run_num'], lumi_list=f['lumi_section_num'])
            pFiles = set([x['logical_file_name'] for x in pFileList])
            parentFiles[f['logical_file_name']] = parentFiles[f['logical_file_name']].union(pFiles)
        return parentFiles

    def getParentFilesByLumi(self, childLFN):
        """
        Get the parent files' lfns by lumi (this might not be the actual parentage relation in DBS, just parentage by lumis).
        Use this only for specific lfns, for validation purposes; for the parentage fix use findAndInsertMissingParentage
        :param childLFN: a file in the child dataset
        :return: list of dictionaries with the parent files for the given child LFN and parent dataset, e.g.
        [{"ParentDataset": "/abc/bad/ddd", "ParentFiles": [alf, baf, ...]}]
        """
        childDatasets = self.dbs.listDatasets(logical_file_name=childLFN)
        result = []
        for i in childDatasets:
            parents = self.dbs.listDatasetParents(dataset=i["dataset"])
            for parent in parents:
                parentFiles = self.getParentFilesGivenParentDataset(parent['parent_dataset'], childLFN)
                result.append({"ParentDataset": parent['parent_dataset'], "ParentFiles": list(parentFiles)})
        return result

    def listParentsByLumi(self, childBlockName, childLFNs=None):
        """
        :param childBlockName: child block name
        :param childLFNs: list of child lfns; if not specified, all the files in the block will be used,
               if specified, dbs validates the child lfns against the childBlockName
        :return: list of [child_id, parent_id] pairs, i.e. [[1,2], [3,4]...]
        """
        childLFNs = childLFNs or []
        return self.dbs.listFileParentsByLumi(block_name=childBlockName, logical_file_name=childLFNs)

    def insertFileParents(self, childBlockName, childParentsIDPairs):
        """
        :param childBlockName: child block name
        :param childParentsIDPairs: list of [child_id, parent_id] file id pairs, i.e. [[1,2], [3,4]...];
                dbs validates the child ids against the childBlockName
        :return: None
        """
        return self.dbs.insertFileParents({"block_name": childBlockName, "child_parent_id_list": childParentsIDPairs})

    def findAndInsertMissingParentage(self, childBlockName, childLFNs=None, insertFlag=True):
        """
        :param childBlockName: child block name
        :param childLFNs: list of child lfns; if not specified, all the files in the block will be used,
               if specified, dbs validates the child lfns against the childBlockName
        :return: number of file-parent pairs inserted (or that would be inserted when insertFlag is False)
        """
        childLFNs = childLFNs or []
        fileParents = self.dbs.listFileParentsByLumi(block_name=childBlockName, logical_file_name=childLFNs)
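        # the first record returned for this block carries the [child_id, parent_id] file pairs in 'child_parent_id_list'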
        childParentsIDPairs = fileParents[0]["child_parent_id_list"]

        if insertFlag:
            self.dbs.insertFileParents({"block_name": childBlockName, "child_parent_id_list": childParentsIDPairs})
        return len(childParentsIDPairs)

    def listBlocksWithNoParents(self, childDataset):
        """
        :param childDataset: child dataset to check for blocks without parents
        :return: set of child blocks with no parent block
        """
        allBlocks = self.dbs.listBlocks(dataset=childDataset)
        blockNames = []
        for block in allBlocks:
            blockNames.append(block['block_name'])
        parentBlocks = self.dbs.listBlockParents(block_name=blockNames)

        cblock = set()
        for pblock in parentBlocks:
            cblock.add(pblock['this_block_name'])

        noParentBlocks = set(blockNames) - cblock
        return noParentBlocks

    def listFilesWithNoParents(self, childBlockName):
        """
        :param childBlockName: child block name
        :return: list of LFNs in the block that have no parent files
        """
        allFiles = self.dbs.listFiles(block_name=childBlockName)
        parentFiles = self.dbs.listFileParents(block_name=childBlockName)

        allFileNames = set()
        for fInfo in allFiles:
            allFileNames.add(fInfo['logical_file_name'])

        cfile = set()
        for pFile in parentFiles:
            cfile.add(pFile['logical_file_name'])

        noParentFiles = allFileNames - cfile
        return list(noParentFiles)

    def fixMissingParentageDatasets(self, childDataset, insertFlag=True):
        """
        :param childDataset: child dataset whose parentage needs to be fixed
        :param insertFlag: if True, insert the parentage into DBS; if False, only compute it
        :return: blocks which failed to insert parentage, to be used for retries
        """
        pDatasets = self.listDatasetParents(childDataset)
        # print("parent datasets %s\n" % pDatasets)
        # pDatasets format is
        # [{'this_dataset': '/SingleMuon/Run2016D-03Feb2017-v1/MINIAOD', 'parent_dataset_id': 13265209, 'parent_dataset': '/SingleMuon/Run2016D-23Sep2016-v1/AOD'}]
        if not pDatasets:
            self.logger.warning("No parent dataset found for child dataset %s", childDataset)
            return {}

        blocks = self.listBlocksWithNoParents(childDataset)
        failedBlocks = []
        for blockName in blocks:
            try:
                numFiles = self.findAndInsertMissingParentage(blockName, insertFlag=insertFlag)
                self.logger.debug("%s file parentage added for block %s" % (numFiles, blockName))
            except Exception as ex:
                self.logger.exception("Parentage updated failed for block %s", blockName)
                failedBlocks.append(blockName)

        return failedBlocks

    def insertMissingParentageForAllFiles(self, childDataset, filterFilesWithParents=True, insertFlag=False):
        """
        :param childDataset: child dataset whose parentage needs to be fixed
        :param filterFilesWithParents: if True, only select files without parents; if False, use all files in the dataset
        :param insertFlag: if True, insert into DBS; if False, just get the list of file parentage without inserting
        :return: blocks which failed to insert parentage, to be used for retries
        """
        blocks = [b['block_name'] for b in self.dbs.listBlocks(dataset=childDataset)]
        failedBlocks = []
        print("Handling %d blocks" % len(blocks))
        totalFiles = 0
        for blockName in blocks:
            try:
                if filterFilesWithParents:
                    childLFNs = self.listFilesWithNoParents(blockName)
                    if len(childLFNs) == 0:
                        continue
                else:
                    childLFNs = []

                numFiles = self.findAndInsertMissingParentage(blockName, childLFNs=childLFNs, insertFlag=insertFlag)
                print("%s file parentage added for block %s" % (numFiles, blockName))
                totalFiles += numFiles
            except Exception as e:
                print(traceback.format_exc())
                failedBlocks.append(blockName)
        print("Total pairs: ", totalFiles)
        return failedBlocks
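
# --- Usage sketch (not part of the original snippet) -------------------------
# A minimal, hypothetical driver for the parentage-fix helpers above. It
# assumes `fixer` is an instance of the class these methods belong to
# (constructed earlier in this example with a DBS URL); the dataset path in
# the commented call is a placeholder. With insertFlag=False nothing is
# written to DBS; the parentage is only computed.
def runParentageFix(fixer, childDataset, insertFlag=False):
    # First pass over blocks that have no parent block at all
    failedBlocks = fixer.fixMissingParentageDatasets(childDataset, insertFlag=insertFlag)
    # Retry once for any block whose parentage insert failed
    stillFailing = []
    for blockName in failedBlocks:
        try:
            fixer.findAndInsertMissingParentage(blockName, insertFlag=insertFlag)
        except Exception:
            stillFailing.append(blockName)
    return stillFailing

# Example (placeholders): runParentageFix(fixer, "/Primary/Era-Proc-v1/MINIAODSIM")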
예제 #18
0
class DBS3Reader(object):
    """
    _DBSReader_

    General API for reading data from DBS
    """
    # cache all the datatiers known by DBS
    _datatiers = {}

    def __init__(self, url, **contact):

        # instantiate dbs api object
        try:
            self.dbsURL = url
            self.dbs = DbsApi(url, **contact)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # connection to PhEDEx (Use default endpoint url)
        self.phedex = PhEDEx(responseType="json")

    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        currently only takes one LFN at a time, but the DBS API needs to be updated
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName, validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
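                # listFileLumiArray is called in chunks of 50 LFNs to keep individual DBS requests small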
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(self.dbs.listFileLumiArray(logical_file_name = slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            lumiDict[lumisItem['logical_file_name']].append(item)
        return lumiDict

    def checkDBSServer(self):
        """
        check whether dbs server is up and running
        returns {"dbs_instance": "prod/global", "dbs_version": "3.3.144"}
        """
        try:
            return self.dbs.serverinfo()
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBS server is not up: %s" % self.dbsURL
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listPrimaryDatasets(self, match='*'):
        """
        _listPrimaryDatasets_

        return a list of primary datasets. The full primary dataset name must be provided;
        pattern-based matching is no longer supported.
        If no name is provided, all primary datasets are returned
        """
        try:
            result = self.dbs.listPrimaryDatasets(primary_ds_name=match)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % match
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['primary_ds_name'] for x in result]
        return result

    def matchProcessedDatasets(self, primary, tier, process):
        """
        _matchProcessedDatasets_

        return a list of Processed datasets
        """
        result = []
        try:
            datasets = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=tier, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for dataset in datasets:
            dataset = remapDBS3Keys(dataset, processed_ds_name='Name')
            dataset['PathList'] = [dataset['dataset']]
            if dataset['Name'] == process:
                result.append(dataset)
        return result

    def listRuns(self, dataset=None, block=None):
        """
        It gets a list of DbsRun objects, but for our purpose
        only the list of run numbers is collected.
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRun,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        runs = []
        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        [runs.extend(x['run_num']) for x in results]
        return runs

    def listRunLumis(self, dataset=None, block=None):
        """
        It gets a list of DBSRun objects and returns the number of lumisections per run
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRun,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        # Pointless code in python3
        if isinstance(block, str):
            block = unicode(block)
        if isinstance(dataset, str):
            dataset = unicode(dataset)

        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # Return the result in the runDict format used for sync with the DBS2 call,
        # which had {run_number: num_lumis}; the DBS3 call doesn't return the number
        # of lumis, so this returns {run_number: None}
        # TODO: After DBS2 is completely removed, change the return format to a more sensible one

        runDict = {}
        for x in results:
            for runNumber in x["run_num"]:
                runDict[runNumber] = None
        return runDict

    def listProcessedDatasets(self, primary, dataTier='*'):
        """
        _listProcessedDatasets_

        return a list of Processed datasets for the primary and optional
        data tier value

        """
        try:
            result = self.dbs.listDatasets(primary_ds_name=primary, data_tier_name=dataTier)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['dataset'].split('/')[2] for x in result]
        return result

    def listDatasetFiles(self, datasetPath):
        """
        _listDatasetFiles_

        Get list of files for dataset

        """
        return [x['logical_file_name'] for x in self.dbs.listFileArray(dataset=datasetPath)]

    @staticmethod
    def listDatatiers(dbsUrl=None):
        """
        _listDatatiers_

        Get a list of datatiers known by DBS.
        """
        if dbsUrl is None:
            msg = "Error in DBSReader.listDatatiers(). DBS Url not set."
            raise DBSReaderError(msg)

        timenow = int(time.time())
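        # serve the cached tier list if it was refreshed less than two hours (7200 s) ago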
        if DBS3Reader._datatiers and timenow - 7200 < DBS3Reader._datatiers['ts']:
            return DBS3Reader._datatiers['tiers']

        try:
            DBS3Reader._setDatatiersCache(timenow, dbsUrl)
        except Exception as ex:
            if not DBS3Reader._datatiers:
                msg = "Error in DBSReader.listDatatiers\n%s" % formatEx3(ex)
                raise DBSReaderError(msg)
        return DBS3Reader._datatiers['tiers']

    @staticmethod
    def _setDatatiersCache(ts, dbsUrl):
        """
        Set a timestamp and update the list of datatiers cached in
        the class property
        """
        dbs = DbsApi(dbsUrl)
        DBS3Reader._datatiers['ts'] = ts
        DBS3Reader._datatiers['tiers'] = [tier['data_tier_name'] for tier in dbs.listDataTiers()]

        return

    def listDatasetFileDetails(self, datasetPath, getParents=False, validFileOnly=1):
        """
        TODO: This is completely wrong and needs to be redone, or be removed - getting the whole dataset
        at once might be too costly

        _listDatasetFileDetails_

        Get list of lumis, events, and parents for each file in a dataset
        Return a dict where the keys are the files, and for each file we have something like:
            { 'NumberOfEvents': 545,
              'BlockName': '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace',
              'Lumis': {173658: [8, 12, 9, 14, 19, 109, 105]},
              'Parents': [],
              'Checksum': '22218315',
              'Adler32': 'a41a1446',
              'FileSize': 286021145,
              'ValidFile': 1
            }

        """
        fileDetails = self.getFileListByDataset(dataset=datasetPath, validFileOnly=validFileOnly, detail=True)
        blocks = set()  # the set of blocks of the dataset
        # Iterate over the files and prepare the set of blocks and a dict where the keys are the files
        files = {}
        for f in fileDetails:
            blocks.add(f['block_name'])
            files[f['logical_file_name']] = remapDBS3Keys(f, stringify=True)
            files[f['logical_file_name']]['ValidFile'] = f['is_file_valid']
            files[f['logical_file_name']]['Lumis'] = {}
            files[f['logical_file_name']]['Parents'] = []

        # Iterate over the blocks and get parents and lumis
        for blockName in blocks:
            # get the parents
            if getParents:
                parents = self.dbs.listFileParents(block_name=blockName)
                for p in parents:
                    if p['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        files[p['logical_file_name']]['Parents'].extend(p['parent_logical_file_name'])
            # get the lumis
            file_lumis = self.dbs.listFileLumis(block_name=blockName)
            for f in file_lumis:
                if f['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                    if f['run_num'] in files[f['logical_file_name']]['Lumis']:
                        files[f['logical_file_name']]['Lumis'][f['run_num']].extend(f['lumi_section_num'])
                    else:
                        files[f['logical_file_name']]['Lumis'][f['run_num']] = f['lumi_section_num']

        return files

    def crossCheck(self, datasetPath, *lfns):
        """
        _crossCheck_

        For the dataset provided, check that the lfns listed all exist
        in the dataset.

        Return the list of lfns that are in the dataset

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        return list(setOfAllLfns.intersection(setOfKnownLfns))

    def crossCheckMissing(self, datasetPath, *lfns):
        """
        _crossCheckMissing_

        As cross check, but return value is a list of files that
        are *not* known by DBS

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath, validFileOnly=1, detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        knownFiles = setOfAllLfns.intersection(setOfKnownLfns)
        unknownFiles = setOfKnownLfns.difference(knownFiles)
        return list(unknownFiles)

    def getDBSSummaryInfo(self, dataset=None, block=None):
        """
        Get dataset summary includes # of files, events, blocks and total size
        """
        # FIXME: Doesn't raise exceptions on missing data as the old API did
        if dataset:
            self.checkDatasetPath(dataset)
        try:
            if block:
                summary = self.dbs.listFileSummaries(block_name=block, validFileOnly=1)
            else:  # dataset case dataset shouldn't be None
                summary = self.dbs.listFileSummaries(dataset=dataset, validFileOnly=1)
        except Exception as ex:
            msg = "Error in DBSReader.getDBSSummaryInfo(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        if not summary or summary[0].get('file_size') is None:  # appears to indicate missing dataset
            msg = "DBSReader.listDatasetSummary(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, block))
        result = remapDBS3Keys(summary[0], stringify=True)
        result['path'] = dataset if dataset else ''
        result['block'] = block if block else ''
        return result

    def getFileBlocksInfo(self, dataset, onlyClosedBlocks=False,
                          blockName=None, locations=True):
        """
        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': True}
        if blockName:
            args['block_name'] = blockName
        try:
            blocks = self.dbs.listBlocks(**args)
        except Exception as ex:
            msg = "Error in DBSReader.getFileBlocksInfo(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        blocks = [remapDBS3Keys(block, stringify=True, block_name='Name') for block in blocks]
        # only raise if blockName not specified - mimic dbs2 error handling
        if not blocks and not blockName:
            msg = "DBSReader.getFileBlocksInfo(%s, %s): No matching data"
            raise DBSReaderError(msg % (dataset, blockName))
        if locations:
            for block in blocks:
                block['PhEDExNodeList'] = [{'Name': x} for x in self.listFileBlockLocation(block['Name'])]

        if onlyClosedBlocks:
            return [x for x in blocks if str(x['OpenForWriting']) != "1"]

        return blocks

    def listFileBlocks(self, dataset, onlyClosedBlocks=False, blockName=None):
        """
        _listFileBlocks_

        Retrieve a list of fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': False}
        if blockName:
            args['block_name'] = blockName
        if onlyClosedBlocks:
            args['detail'] = True
        try:
            blocks = self.dbs.listBlocks(**args)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if onlyClosedBlocks:
            result = [x['block_name'] for x in blocks if str(x['open_for_writing']) != "1"]

        else:
            result = [x['block_name'] for x in blocks]

        return result

    def listOpenFileBlocks(self, dataset):
        """
        _listOpenFileBlocks_

        Retrieve a list of open fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        try:
            blocks = self.dbs.listBlocks(dataset=dataset, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['block_name'] for x in blocks if str(x['open_for_writing']) == "1"]

        return result

    def blockExists(self, fileBlockName):
        """
        _blockExists_

        Check to see if block with name provided exists in the DBS
        Instance.

        Return True if exists, False if not

        """
        self.checkBlockName(fileBlockName)
        try:

            blocks = self.dbs.listBlocks(block_name=fileBlockName)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockExists(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if len(blocks) == 0:
            return False
        return True

    def listFilesInBlock(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlock_

        Get a list of files in the named fileblock
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it will always be True.
        We need to clean this code up when DBS2 is completely deprecated;
        calling lumis just for the run number is expensive.
        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            files = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=validFileOnly, detail=True)
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if lumis:
            lumiDict = self._getLumiList(blockName=fileBlockName, validFileOnly=validFileOnly)

        result = []
        for fileInfo in files:
            if lumis:
                fileInfo["LumiList"] = lumiDict[fileInfo['logical_file_name']]
            result.append(remapDBS3Keys(fileInfo, stringify=True))
        return result

    def listFilesInBlockWithParents(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of that file.
        TODO: lumis can be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it will always be True.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only valid blocks for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis, validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName,)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])
        
        parentsLFNs = childByParents.keys()
        
        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slicing parentsLFNs until the DBS API handles that.
        # Remove the slicing once the DBS API handles it
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[fileInfo['logical_file_name']]

        return fileDetails

    def lfnsInBlock(self, fileBlockName):
        """
        _lfnsInBlock_

        LFN list only for block, details = False => faster query

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.lfnsInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            lfns = self.dbs.listFileArray(block_name=fileBlockName, validFileOnly=1, detail=False)
            return lfns
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listFileBlockLocation(self, fileBlockNames, dbsOnly=False):
        """
        _listFileBlockLocation_

        Get the location(s) of one or more blocks, either from PhEDEx (default)
        or from the DBS origin_site_name when dbsOnly=True

        """

        singleBlockName = None
        if isinstance(fileBlockNames, basestring):
            singleBlockName = fileBlockNames
            fileBlockNames = [fileBlockNames]

        for block in fileBlockNames:
            self.checkBlockName(block)

        locations = {}
        node_filter = set(['UNKNOWN', None])

        if dbsOnly:
            blocksInfo = {}
            try:
                for block in fileBlockNames:
                    blocksInfo.setdefault(block, [])
                    # there should be only one element with a single origin site string ...
                    for blockInfo in self.dbs.listBlockOrigin(block_name=block):
                        blocksInfo[block].append(blockInfo['origin_site_name'])
            except dbsClientException as ex:
                msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(block=fileBlockNames, complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for block_name=%s)\n" % fileBlockNames
                msg += "%s\n" % str(ex)
                raise Exception(msg)

        for block in fileBlockNames:
            valid_nodes = set(blocksInfo.get(block, [])) - node_filter
            locations[block] = list(valid_nodes)

        # returning single list if a single block is passed
        if singleBlockName:
            return locations[singleBlockName]

        return locations

    def getFileBlock(self, fileBlockName, dbsOnly=False):
        """
        _getFileBlock_

        dbsOnly flag is mostly meant for StoreResults, since there is no
        data in TMDB.

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : { LFN : Events },
             }
        }


        """
        # Pointless code in python3
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName, dbsOnly),
            "Files": self.listFilesInBlock(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
            }
        }
        return result

    def getFileBlockWithParents(self, fileBlockName):
        """
        _getFileBlockWithParents_

        return a dictionary:
        { blockName: {
             "PhEDExNodeNames" : [<pnn list>],
             "Files" : dictionaries representing each file
             }
        }

        """
        if isinstance(fileBlockName, str):
            fileBlockName = unicode(fileBlockName)

        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {fileBlockName: {
            "PhEDExNodeNames": self.listFileBlockLocation(fileBlockName),
            "Files": self.listFilesInBlockWithParents(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
            }
        }
        return result

    def getFiles(self, dataset, onlyClosedBlocks=False):
        """
        _getFiles_

        Returns a dictionary of block names for the dataset where
        each block consists of a dictionary containing the PhEDExNodeNames
        for that block and the files in that block by LFN mapped to NEvents

        """
        result = {}
        blocks = self.listFileBlocks(dataset, onlyClosedBlocks)

        [result.update(self.getFileBlock(x)) for x in blocks]

        return result

    def listBlockParents(self, blockName):
        """Get parent blocks for block"""
        result = []
        self.checkBlockName(blockName)
        blocks = self.dbs.listBlockParents(block_name=blockName)
        for block in blocks:
            toreturn = {'Name': block['parent_block_name']}
            toreturn['PhEDExNodeList'] = self.listFileBlockLocation(toreturn['Name'])
            result.append(toreturn)
        return result

    def blockIsOpen(self, blockName):
        """
        _blockIsOpen_

        Return True if the named block is open, False if not or if the block
        doesn't exist

        """
        self.checkBlockName(blockName)
        blockInstance = self.dbs.listBlocks(block_name=blockName, detail=True)
        if len(blockInstance) == 0:
            return False
        blockInstance = blockInstance[0]
        isOpen = blockInstance.get('open_for_writing', 1)
        if isOpen == 0:
            return False
        return True

    def blockToDatasetPath(self, blockName):
        """
        _blockToDatasetPath_

        Given a block name, get the dataset Path associated with that
        Block.

        Returns the dataset path, or None if not found

        """
        self.checkBlockName(blockName)
        try:
            blocks = self.dbs.listBlocks(block_name=blockName, detail=True)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockToDatasetPath(%s)\n" % blockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if blocks == []:
            return None

        pathname = blocks[-1].get('dataset', None)
        return pathname

    def listDatasetLocation(self, datasetName, dbsOnly=False):
        """
        _listDatasetLocation_

        List the locations where there is at least one block of the given
        dataset.
        """
        self.checkDatasetPath(datasetName)

        locations = set()

        if dbsOnly:
            try:
                blocksInfo = self.dbs.listBlockOrigin(dataset=datasetName)
            except dbsClientException as ex:
                msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)

            if not blocksInfo:  # no data location from dbs
                return list()

            for blockInfo in blocksInfo:
                locations.update(blockInfo['origin_site_name'])

            locations.difference_update(['UNKNOWN', None])  # remove entry when SE name is 'UNKNOWN'
        else:
            try:
                blocksInfo = self.phedex.getReplicaPhEDExNodesForBlocks(dataset=[datasetName], complete='y')
            except Exception as ex:
                msg = "Error while getting block location from PhEDEx for dataset=%s)\n" % datasetName
                msg += "%s\n" % str(ex)
                raise Exception(msg)

            if blocksInfo:
                for blockSites in blocksInfo.values():
                    locations.update(blockSites)

        return list(locations)

    def checkDatasetPath(self, pathName):
        """
         _checkDatasetPath_
        """
        if pathName in ("", None):
            raise DBSReaderError("Invalid Dataset Path name: => %s <=" % pathName)
        else:
            try:
                result = self.dbs.listDatasets(dataset=pathName, dataset_access_type='*')
                if len(result) == 0:
                    raise DBSReaderError("Dataset %s doesn't exist in DBS %s" % (pathName, self.dbsURL))
            except (dbsClientException, HTTPError) as ex:
                msg = "Error in "
                msg += "DBSReader.checkDatasetPath(%s)\n" % pathName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        return

    def checkBlockName(self, blockName):
        """
         _checkBlockName_
        """
        if blockName in ("", "*", None):
            raise DBSReaderError("Invalid Block name: => %s <=" % blockName)

    def getFileListByDataset(self, dataset, validFileOnly=1, detail=True):

        """
        _getFileListByDataset_

        Given a dataset, retrieves all blocks, LFNs and number of events (among other
        details).
        Returns a list of dicts.
        """

        try:
            fileList = self.dbs.listFileArray(dataset=dataset, validFileOnly=validFileOnly, detail=detail)
            return fileList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.getFileListByDataset(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listDatasetParents(self, childDataset):
        """
        List the parent dataset paths for the given childDataset
        """
        try:
            parentList = self.dbs.listDatasetParents(dataset=childDataset)
            return parentList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listDatasetParents(%s)\n" % childDataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
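
# --- Usage sketch (not part of the original snippet) -------------------------
# A minimal, hypothetical example of driving the DBS3Reader class above. The
# DBS instance URL and the dataset path are placeholders; point them at the
# instance and dataset you actually want to query.
if __name__ == "__main__":
    reader = DBS3Reader("https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
    dataset = "/HighPileUp/Run2011A-v1/RAW"  # placeholder dataset path
    print("Dataset summary:", reader.getDBSSummaryInfo(dataset=dataset))
    for blockName in reader.listFileBlocks(dataset, onlyClosedBlocks=True):
        print("closed block:", blockName)
        print("  locations:", reader.listFileBlockLocation(blockName))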
예제 #19
0
class DBSUploadPoller(BaseWorkerThread):
    """
    Handles poll-based DBSUpload

    """

    def __init__(self, config):
        """
        Initialise class members
        """
        logging.info("Running __init__ for DBS3 Uploader")
        BaseWorkerThread.__init__(self)
        self.config = config

        # This is slightly dangerous, but DBSUpload depends
        # on DBSInterface anyway
        self.dbsUrl = self.config.DBS3Upload.dbsUrl

        self.dbsUtil = DBSBufferUtil()

        myThread = threading.currentThread()
        self.daoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                     logger=myThread.logger,
                                     dbinterface=myThread.dbi)

        self.pool = []
        self.blocksToCheck = []
        self.workInput = None
        self.workResult = None
        self.nProc = getattr(self.config.DBS3Upload, 'nProcesses', 4)
        self.wait = getattr(self.config.DBS3Upload, 'dbsWaitTime', 2)
        self.nTries = getattr(self.config.DBS3Upload, 'dbsNTries', 300)
        self.physicsGroup = getattr(self.config.DBS3Upload, "physicsGroup", "NoGroup")
        self.datasetType = getattr(self.config.DBS3Upload, "datasetType", "PRODUCTION")
        self.primaryDatasetType = getattr(self.config.DBS3Upload, "primaryDatasetType", "mc")
        self.blockCount = 0
        self.dbsApi = DbsApi(url=self.dbsUrl)

        # List of blocks currently in processing
        self.queuedBlocks = []

        # Set up the pool of worker processes
        self.setupPool()

        # Setting up any cache objects
        self.blockCache = {}

        self.filesToUpdate = []

        self.produceCopy = getattr(self.config.DBS3Upload, 'dumpBlock', False)

        self.copyPath = os.path.join(getattr(self.config.DBS3Upload, 'componentDir', '/data/srv/'),
                                     'dbsuploader_block.json')

        self.timeoutWaiver = 1

        return

    def setupPool(self):
        """
        _setupPool_

        Set up the processing pool for work
        """
        if len(self.pool) > 0:
            # Then something already exists.  Continue
            return

        self.workInput = multiprocessing.Queue()
        self.workResult = multiprocessing.Queue()

        # Starting up the pool:
        for _ in range(self.nProc):
            p = multiprocessing.Process(target=uploadWorker,
                                        args=(self.workInput,
                                              self.workResult,
                                              self.dbsUrl))
            p.start()
            self.pool.append(p)

        return

    def __del__(self):
        """
        __del__

        Trigger a close of connections if necessary
        """
        self.close()
        return

    def close(self):
        """
        _close_

        Kill all connections and terminate
        """
        terminate = False
        for _ in self.pool:
            try:
                self.workInput.put('STOP')
            except Exception as ex:
                # Something very strange happens here
                # It's like it raises a blank exception
                # Upon being told to return
                msg = "Hit some exception in deletion\n"
                msg += str(ex)
                logging.debug(msg)
                terminate = True
        try:
            self.workInput.close()
            self.workResult.close()
        except Exception:
            # What are you going to do?
            pass
        for proc in self.pool:
            if terminate:
                proc.terminate()
            else:
                proc.join()
        self.pool = []
        self.workInput = None
        self.workResult = None
        return

    def terminate(self, params):
        """
        Do one more pass, then terminate

        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        First, check blocks that may be already uploaded
        Then, load blocks
        Then, load files
        Then, move files into blocks
        Then add new blocks in DBSBuffer
        Then add blocks to DBS
        Then mark blocks as done in DBSBuffer
        """
        try:
            logging.info("Starting the DBSUpload Polling Cycle")
            self.checkBlocks()
            self.loadBlocks()
            self.loadFiles()
            self.checkBlockCompletion()
            self.inputBlocks()
            self.retrieveBlocks()
        except WMException:
            raise
        except Exception as ex:
            msg = "Unhandled Exception in DBSUploadPoller!\n"
            msg += str(ex)
            msg += str(str(traceback.format_exc()))
            logging.error(msg)
            raise DBSUploadException(msg)

    def loadBlocks(self):
        """
        _loadBlocks_

        Find all blocks; make sure they're in the cache
        """
        openBlocks = self.dbsUtil.findOpenBlocks()
        logging.info("Found %d open blocks.", len(openBlocks))
        logging.debug("These are the openblocks: %s", openBlocks)

        # Load them if we don't have them
        blocksToLoad = []
        for block in openBlocks:
            if not block['blockname'] in self.blockCache.keys():
                blocksToLoad.append(block['blockname'])

        # Now load the blocks
        try:
            loadedBlocks = self.dbsUtil.loadBlocks(blocksToLoad)
            logging.info("Loaded %d blocks.", len(loadedBlocks))
        except WMException:
            raise
        except Exception as ex:
            msg = "Unhandled exception while loading blocks.\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Blocks to load: %s\n", blocksToLoad)
            raise DBSUploadException(msg)

        for blockInfo in loadedBlocks:
            block = DBSBufferBlock(name=blockInfo['block_name'],
                                   location=blockInfo['origin_site_name'],
                                   datasetpath=blockInfo['datasetpath'])
            block.FillFromDBSBuffer(blockInfo)
            blockname = block.getName()

            # Now we have to load files...
            try:
                files = self.dbsUtil.loadFilesByBlock(blockname=blockname)
                logging.info("Have %i files for block %s", len(files), blockname)
            except WMException:
                raise
            except Exception as ex:
                msg = "Unhandled exception while loading files for existing blocks.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Blocks being loaded: %s\n", blockname)
                raise DBSUploadException(msg)

            # Add the loaded files to the block
            for f in files:
                block.addFile(f, self.datasetType, self.primaryDatasetType)

            # Add to the cache
            self.blockCache[blockInfo['block_name']] = block

        return

    def loadFiles(self):
        """
        _loadFiles_

        Load all files that need to be loaded.  I will do this by DatasetPath
        to break the monstrous calls down into smaller chunks.
        """
        dspList = self.dbsUtil.findUploadableDAS()

        readyBlocks = []
        for dspInfo in dspList:

            datasetpath = dspInfo['DatasetPath']

            # Get the files
            try:
                loadedFiles = self.dbsUtil.findUploadableFilesByDAS(datasetpath=datasetpath)
            except WMException:
                raise
            except Exception as ex:
                msg = "Unhandled exception while loading uploadable files for DatasetPath.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("DatasetPath being loaded: %s\n", datasetpath)
                raise DBSUploadException(msg)

            # Sort the files and blocks by location
            fileDict = sortListByKey(loadedFiles, 'locations')

            # Now add each file
            for location in fileDict.keys():

                files = fileDict.get(location)

                if len(files) > 0:

                    currentBlock = self.getBlock(files[0], location, True)
                    currentBlock.setAcquisitionEra(era=dspInfo['AcquisitionEra'])
                    currentBlock.setProcessingVer(procVer=dspInfo['ProcessingVer'])

                    for newFile in files:

                        if newFile.get('block', 1) is not None:
                            # Then this file already has a block
                            # It should be accounted for somewhere
                            # Or loaded with the block
                            continue

                        # Check if we can put files in this block
                        if not self.isBlockOpen(newFile=newFile,
                                                block=currentBlock):
                            # Then we have to close the block and get a new one
                            currentBlock.setPendingAndCloseBlock()
                            readyBlocks.append(currentBlock)
                            currentBlock = self.getBlock(newFile=newFile,
                                                         location=location)
                            currentBlock.setAcquisitionEra(era=dspInfo['AcquisitionEra'])
                            currentBlock.setProcessingVer(procVer=dspInfo['ProcessingVer'])

                        # Now deal with the file
                        currentBlock.addFile(newFile, self.datasetType, self.primaryDatasetType)
                        self.filesToUpdate.append({'filelfn': newFile['lfn'],
                                                   'block': currentBlock.getName()})
                    # Done with the location
                    readyBlocks.append(currentBlock)

        for block in readyBlocks:
            self.blockCache[block.getName()] = block

        return

    def checkBlockCompletion(self):
        """
        _checkBlockCompletion_

        Mark Open blocks as Pending if they have timed out or their workflows have completed
        """
        completedWorkflows = self.dbsUtil.getCompletedWorkflows()
        for block in self.blockCache.values():
            if block.status == "Open":
                if (block.getTime() > block.getMaxBlockTime()) or any(
                                key in completedWorkflows for key in block.workflows):
                    block.setPendingAndCloseBlock()

        return

    def isBlockOpen(self, newFile, block, doTime=False):
        """
        _isBlockOpen_

        Check and see if a block is full.
        This will check on time, but that's disabled by default.
        The plan is to do a time check after we do everything else,
        so open blocks about to time out can still get more
        files put in them.
        """

        if block.getMaxBlockFiles() is None or block.getMaxBlockNumEvents() is None or \
                        block.getMaxBlockSize() is None or block.getMaxBlockTime() is None:
            return True
        if block.status != 'Open':
            # Then somebody has dumped this already
            return False
        if block.getSize() + newFile['size'] > block.getMaxBlockSize():
            return False
        if block.getNumEvents() + newFile['events'] > block.getMaxBlockNumEvents():
            return False
        if block.getNFiles() >= block.getMaxBlockFiles():
            # Then we have to dump it because this file
            # will put it over the limit.
            return False
        if block.getTime() > block.getMaxBlockTime() and doTime:
            return False

        return True

    def getBlock(self, newFile, location, skipOpenCheck=False):
        """
        _getBlock_

        Retrieve a block if one exists with a matching datasetpath/location and is open.
        If no such block is found create and return a new one.
        """
        datasetpath = newFile["datasetPath"]

        for block in self.blockCache.values():
            if datasetpath == block.getDatasetPath() and location == block.getLocation():
                if not self.isBlockOpen(newFile=newFile, block=block) and not skipOpenCheck:
                    # Block isn't open anymore.  Mark it as pending so that it gets uploaded.
                    block.setPendingAndCloseBlock()
                else:
                    return block

        # A suitable open block does not exist.  Create a new one.
        blockname = "%s#%s" % (datasetpath, makeUUID())
        newBlock = DBSBufferBlock(name=blockname,
                                  location=location,
                                  datasetpath=datasetpath)
        self.blockCache[blockname] = newBlock
        return newBlock

    def inputBlocks(self):
        """
        _inputBlocks_

        Loop through all of the "active" blocks and sort them so we can act
        appropriately on them.  Everything will be sorted based on the
        following:
         Queued - Block is already being acted on by another process.  We just
          ignore it.
         Pending, not in DBSBuffer - Block that has been closed and needs to
           be injected into DBS and also written to DBSBuffer.  We'll do both.
         Pending, in DBSBuffer - Block has been closed and written to
           DBSBuffer.  We just need to inject it into DBS.
         Open, not in DBSBuffer - Newly created block that needs to be written
           to DBSBuffer.
         Open, in DBSBuffer - Newly created block that has already been
           written to DBSBuffer.  We don't have to do anything with it.
        """
        myThread = threading.currentThread()

        createBlocksDAO = self.daoFactory(classname="CreateBlocks")
        updateBlocksDAO = self.daoFactory(classname="UpdateBlocks")
        setBlockFilesDAO = self.daoFactory(classname="SetBlockFiles")

        createInDBS = []
        createInDBSBuffer = []
        updateInDBSBuffer = []
        for block in self.blockCache.values():
            if block.getName() in self.queuedBlocks:
                # Block is already being dealt with by another process.  We'll
                # ignore it here.
                continue
            if block.status == 'Pending':
                # All pending blocks need to be injected into DBS.
                createInDBS.append(block)

                # If this is a new block it needs to be added to DBSBuffer
                # otherwise it just needs to be updated in DBSBuffer.
                if not block.inBuff:
                    createInDBSBuffer.append(block)
                else:
                    updateInDBSBuffer.append(block)
            if block.status == 'Open' and not block.inBuff:
                # New block that needs to be added to DBSBuffer.
                createInDBSBuffer.append(block)

        # Build the pool if it was closed
        if len(self.pool) == 0:
            self.setupPool()

        # First handle new and updated blocks
        if len(createInDBSBuffer) > 0 or len(updateInDBSBuffer) > 0:
            try:
                myThread.transaction.begin()
                if len(createInDBSBuffer) > 0:
                    createBlocksDAO.execute(blocks=createInDBSBuffer,
                                            conn=myThread.transaction.conn,
                                            transaction=True)
                if len(updateInDBSBuffer) > 0:
                    updateBlocksDAO.execute(blocks=updateInDBSBuffer,
                                            conn=myThread.transaction.conn,
                                            transaction=True)
            except WMException:
                myThread.transaction.rollback()
                raise
            except Exception as ex:
                myThread.transaction.rollback()
                msg = "Unhandled exception while writing new blocks into DBSBuffer\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Blocks for DBSBuffer: %s\n", createInDBSBuffer)
                logging.debug("Blocks for Update: %s\n", updateInDBSBuffer)
                raise DBSUploadException(msg)
            else:
                myThread.transaction.commit()

        # Update block status in the block cache.  Mark the blocks that we have
        # added to DBSBuffer as being in DBSBuffer.
        for block in createInDBSBuffer:
            self.blockCache.get(block.getName()).inBuff = True

        # Record new file/block associations in DBSBuffer.
        if len(self.filesToUpdate) > 0:
            try:
                myThread.transaction.begin()
                setBlockFilesDAO.execute(binds=self.filesToUpdate,
                                         conn=myThread.transaction.conn,
                                         transaction=True)
                self.filesToUpdate = []
            except WMException:
                myThread.transaction.rollback()
                raise
            except Exception as ex:
                myThread.transaction.rollback()
                msg = "Unhandled exception while setting blocks in files.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Files to Update: %s\n", self.filesToUpdate)
                raise DBSUploadException(msg)
            else:
                myThread.transaction.commit()

        # Finally upload blocks to DBS.
        for block in createInDBS:
            if len(block.files) < 1:
                # What are we doing?
                logging.debug("Skipping empty block")
                continue
            if block.getDataset() == None:
                # Then we have to fix the dataset
                dbsFile = block.files[0]
                block.setDataset(datasetName=dbsFile['datasetPath'],
                                 primaryType=self.primaryDatasetType,
                                 datasetType=self.datasetType,
                                 physicsGroup=dbsFile.get('physicsGroup', None),
                                 prep_id=dbsFile.get('prep_id', None))
            logging.debug("Found block %s in blocks", block.getName())
            block.setPhysicsGroup(group=self.physicsGroup)

            encodedBlock = block.convertToDBSBlock()
            logging.info("About to insert block %s", block.getName())
            self.workInput.put({'name': block.getName(), 'block': encodedBlock})
            self.blockCount += 1
            if self.produceCopy:
                with open(self.copyPath, 'w') as jo:
                    json.dump(encodedBlock, jo, indent=2)
            self.queuedBlocks.append(block.getName())

        # And all work is in and we're done for now
        return

    def retrieveBlocks(self):
        """
        _retrieveBlocks_

        Once blocks are in DBS, we have to retrieve them and see what's
        in them.  What we do is get everything out of the result queue,
        and then update it in DBSBuffer.

        To do this, the result queue needs to pass back the blockname
        """
        myThread = threading.currentThread()

        updateBlocksDAO = self.daoFactory(classname="UpdateBlocks")
        updateFilesDAO = self.daoFactory(classname="UpdateFiles")

        blocksToClose = []
        emptyCount = 0
        while self.blockCount > 0:
            if emptyCount > self.nTries:

                # When timeoutWaiver is 0, raise an error.
                # It can take a long time to upload data to DBS if a lot of files
                # have accumulated in the buffer; the first try may be slow,
                # but subsequent tries should be faster.
                # timeoutWaiver is set as a component variable - it only resets when the component restarts.
                # The reason is that a backlog only builds up when this component is down
                # for a long time while the other components keep running and feeding data into
                # dbsbuffer

                if self.timeoutWaiver == 0:
                    msg = "Exceeded max number of waits while waiting for DBS to finish"
                    raise DBSUploadException(msg)
                else:
                    self.timeoutWaiver = 0
                    return
            try:
                # Get stuff out of the queue with a ridiculously
                # short wait time
                blockresult = self.workResult.get(timeout=self.wait)
                blocksToClose.append(blockresult)
                self.blockCount -= 1
                logging.debug("Got a block to close")
            except Queue.Empty:
                # This means the queue has no current results
                time.sleep(2)
                emptyCount += 1
                continue

        loadedBlocks = []
        for result in blocksToClose:
            # Remove from list of work being processed
            self.queuedBlocks.remove(result.get('name'))
            if result["success"] == "uploaded":
                block = self.blockCache.get(result.get('name'))
                block.status = 'InDBS'
                loadedBlocks.append(block)
            elif result["success"] == "check":
                block = result["name"]
                self.blocksToCheck.append(block)
            else:
                logging.error("Error found in multiprocess during process of block %s", result.get('name'))
                logging.error(result['error'])
                # Continue to the next block
                # Block will remain in pending status until it is transferred

        if len(loadedBlocks) > 0:
            try:
                myThread.transaction.begin()
                updateFilesDAO.execute(blocks=loadedBlocks, status="InDBS",
                                       conn=myThread.transaction.conn,
                                       transaction=True)
                updateBlocksDAO.execute(blocks=loadedBlocks,
                                        conn=myThread.transaction.conn,
                                        transaction=True)
            except Exception as ex:
                myThread.transaction.rollback()
                # possible deadlock with PhEDExInjector, retry once after 10s
                logging.warning("Oracle exception, possible deadlock due to race condition, retry after 10s sleep")
                time.sleep(10)
                try:
                    myThread.transaction.begin()
                    updateFilesDAO.execute(blocks=loadedBlocks, status="InDBS",
                                           conn=myThread.transaction.conn,
                                           transaction=True)
                    updateBlocksDAO.execute(blocks=loadedBlocks,
                                            conn=myThread.transaction.conn,
                                            transaction=True)
                except Exception as ex:
                    myThread.transaction.rollback()
                    msg = "Unhandled exception while finished closed blocks in DBSBuffer\n"
                    msg += str(ex)
                    logging.error(msg)
                    logging.debug("Blocks for Update: %s\n", loadedBlocks)
                    raise DBSUploadException(msg)
                else:
                    myThread.transaction.commit()

            else:
                myThread.transaction.commit()

        for block in loadedBlocks:
            # Clean things up
            name = block.getName()
            del self.blockCache[name]

        # Clean up the pool so we don't have stuff waiting around
        if len(self.pool) > 0:
            self.close()

        # And we're done
        return

    def checkBlocks(self):
        """
        _checkBlocks_

        Check with DBS3 if the blocks marked as check are
        uploaded or not.
        """
        myThread = threading.currentThread()

        updateBlocksDAO = self.daoFactory(classname="UpdateBlocks")
        updateFilesDAO = self.daoFactory(classname="UpdateFiles")

        blocksUploaded = []

        # See if there is anything to check
        for block in self.blocksToCheck:
            logging.debug("Checking block existence: %s", block)
            # Check in DBS if the block was really inserted
            try:
                result = self.dbsApi.listBlocks(block_name=block)
                for blockResult in result:
                    if blockResult['block_name'] == block:
                        loadedBlock = self.blockCache.get(block)
                        loadedBlock.status = 'InDBS'
                        blocksUploaded.append(loadedBlock)
                        break
            except Exception as ex:
                exString = str(ex)
                msg = "Error trying to check block %s through DBS.\n" % block
                msg += exString
                logging.error(msg)
                logging.error(str(traceback.format_exc()))

        # Update the status of those blocks that were truly inserted
        if len(blocksUploaded) > 0:
            try:
                myThread.transaction.begin()
                updateBlocksDAO.execute(blocks=blocksUploaded,
                                        conn=myThread.transaction.conn,
                                        transaction=True)
                updateFilesDAO.execute(blocks=blocksUploaded, status="InDBS",
                                       conn=myThread.transaction.conn,
                                       transaction=True)
            except WMException:
                myThread.transaction.rollback()
                raise
            except Exception as ex:
                myThread.transaction.rollback()
                msg = "Unhandled exception while finished closed blocks in DBSBuffer\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Blocks for Update: %s\n", blocksUploaded)
                raise DBSUploadException(msg)
            else:
                myThread.transaction.commit()

        for block in blocksUploaded:
            # Clean things up
            name = block.getName()
            del self.blockCache[name]

        # Clean the check list
        self.blocksToCheck = []

        # We're done
        return
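For quick manual debugging, the existence test that checkBlocks relies on can also be run standalone against the DBS reader. A minimal sketch, assuming the production global reader instance and an illustrative (not real) block name:

from dbs.apis.dbsClient import DbsApi

dbsApi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
blockName = '/Primary/Processed-v1/TIER#0000-aaaa'  # illustrative block name, not a real one
# listBlocks returns an empty list when the block is unknown to DBS
matches = dbsApi.listBlocks(block_name=blockName)
print(any(b['block_name'] == blockName for b in matches))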
Example #20
0
def process_dataset(dataset, globalTag, **kwargs):
    '''
    Script to call validation of multiple runs
      kwargs:
        argument    type      default    comment
        run         int       0          if nonzero, will process a single run, else, process all available
        force       bool      False      process the run even if it was already processed or has too few events
        triggers    list(str) []         list of additional triggers to run over
    '''
    singleRun = kwargs.pop('run', 0)
    force = kwargs.pop('force', False)
    dryRun = kwargs.get('dryRun', False)
    curTime = time.time()

    url = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'
    dbsclient = DbsApi(url)

    # get stream info
    [filler, stream, version, eventContent] = dataset.split('/')
    if "GEN" in dataset: stream = fix_stream(dataset)

    # setup working directory for stream
    python_mkdir(stream)

    # begin running
    start = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())
    print "CSCVal job initiated at " + start
    os.chdir(stream)

    print "Reading previously processed runs"
    procFile = 'processedRuns.txt'
    open(procFile, 'a').close()
    with open(procFile, 'r') as file:
        procRuns = file.readlines()
    procRuns = [x.rstrip() for x in procRuns]  # format: RUNUM_NUMEVTS

    print "Reading previous process time"
    timeFile = 'processTime.txt'
    open(timeFile, 'a').close()
    with open(timeFile, 'r') as file:
        procTimes = file.readlines()
    procTimes = [x.rstrip() for x in procTimes]
    prevTime = float(procTimes[-1]) - 12 * 60 * 60 if procTimes else float(
        time.time()
    ) - 7 * 24 * 60 * 60  # default to 7 days before now or 12 hours before last run
    print prevTime

    # run each individual validation
    if singleRun:
        files = dbsclient.listFiles(dataset=dataset,
                                    run_num=singleRun,
                                    validFileOnly=1,
                                    detail=True)
        num = sum([f['event_count'] for f in files])
        input_files = [f['logical_file_name'] for f in files]
        print "Processing run %s" % str(singleRun)
        if force: print "Forcing reprocessing"
        run_validation(dataset,
                       globalTag,
                       str(singleRun),
                       stream,
                       eventContent,
                       str(num),
                       input_files,
                       force=force,
                       **kwargs)
    else:
        # first get new blocks since a time
        blocks = dbsclient.listBlocks(dataset=dataset, min_cdate=int(prevTime))

        # iterate over each block
        updatedRuns = set()
        for block in blocks:
            # get runs in block
            runs = dbsclient.listRuns(block_name=block['block_name'])
            updatedRuns.update(set(runs[0]['run_num']))

        # iterate over runs
        updatedRuns = sorted(updatedRuns)
        fileRunMap = {}
        eventRunMap = {}
        files = dbsclient.listFiles(dataset=dataset,
                                    run_num=updatedRuns,
                                    validFileOnly=1,
                                    detail=True)
        if "GEN" in dataset: updatedRuns = [1]
        for run in updatedRuns:
            eventRunMap[run] = sum(
                [f['event_count'] for f in files if f['run_num'] == run])
            fileRunMap[run] = [
                f['logical_file_name'] for f in files if f['run_num'] == run
            ]

        runsToUpdate = [
            run for run in updatedRuns
            if fileRunMap[run] and eventRunMap[run] > 25000
        ]
        if "GEN" in dataset:
            runsToUpdate = [run for run in updatedRuns if fileRunMap[run]]

        print 'Runs to update:'
        for run in runsToUpdate:
            print '    Run {0}: {1} files, {2} events'.format(
                run, len(fileRunMap[run]), eventRunMap[run])

        for run in runsToUpdate:
            if int(run) < MINRUN: continue
            print "Processing run %s" % run
            run_validation(dataset,
                           globalTag,
                           str(run),
                           stream,
                           eventContent,
                           str(eventRunMap[run]),
                           fileRunMap[run],
                           force=force,
                           **kwargs)

    with open(timeFile, 'a') as file:
        if not dryRun: file.write('{0}\n'.format(curTime))
    os.chdir('../')

    # now finish up
    end = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime())
    print "CSCVal job finished at " + end
Example #21
0
def QueryForRquestedEventsPerDay(dbsurl,couchurl,outputdict,data_regexp):
    #
    # query couch DB and extract list of requests per day

    # these status values are for rejected workflows
    rejected_status = ['rejected','rejected-archived']

    basenames_to_print = ['SUS-Spring14miniaod-00017_00029_v0_']

    # load requests from json
    header = {'Content-type': 'application/json', 'Accept': 'application/json'}
    conn = httplib.HTTPConnection(couchurl)
    conn.request("GET", '/latency_analytics/_design/latency/_view/maria', headers= header)
    response = conn.getresponse()
    data = response.read()
    conn.close()
    myString = data.decode('utf-8')
    workflows = json.loads(myString)['rows']
    
    # first extract workflows per workflow basename to identify the actual requests in case of clones or other duplicates
    basenames = {}
    for entry in workflows:
        # extract information
        workflowname = entry['id']
        info = entry['value']
        workflow_dict = {
                          'Campaign' : info[0],
                          'Tier' : info[1],
                          'Task type' : info[2],
                          'Status' : info[3],
                          'Priority' : info[4],
                          'Requested events' : info[5],
                          '% Complete' : info[6],
                          'Completed events' : 0,
                          'Request date' : time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(info[7])),
                          'Processing dataset name' : '',
                          'Input Dataset' : info[8],
                          'Output Datasets' : info[9],
                          'Filter efficiency' : info[10],
                          'Run white list' : info[11],
                          }
        if workflowname == 'pdmvserv_SUS-Spring14miniaod-00016_00029_v0__140728_120018_4477':
            print workflowname,workflow_dict

        # filter for data_regexp
        match = False
        try:
          for output_dataset in workflow_dict['Output Datasets']:
              if re.compile(data_regexp).match(output_dataset) is not None:
                  match = True
                  break
        except:
          for output_dataset in workflow_dict['Output Datasets']:
              if re.compile(data_regexp).match(output_dataset[0]) is not None:
                  match = True
                  break

        if match == False: continue

        # extract workflow basename, split by '_', remove first field that is the username who injected the workflow, and the last 3 fields that are date, time and fractions of a second (?)
        workflowname_array = workflowname.split('_')
        basename_array = workflowname_array[1:-3]

        # continue if basename_array length == 0
        if len(basename_array) == 0: continue

        # filter out ACDC and tests
        if workflowname.lower().count('acdc') > 0: continue
        if workflowname.lower().count('test') > 0: continue

        # Jen's username is jen_a, from split above a_ could remain, remove
        if basename_array[0].lower() == 'a':
            basename_array = basename_array[1:]

        # if extension, remove EXT from beginning of basename
        if basename_array[0].lower() == 'ext':
            basename_array = basename_array[1:]

        basename = '_'.join(basename_array)
        requestdatetime = int(workflowname_array[-1]) + int(workflowname_array[-2]) * 1E4 + int(workflowname_array[-3]) * 1E10
        if basename not in basenames.keys(): basenames[basename] = {}
        basenames[basename][requestdatetime] = [workflowname,workflow_dict]
    
    # select the original workflow removing clones, etc
    selected = {}
    rejected = {}
    for basename in basenames.keys():
        if basename in basenames_to_print:
            print 'selected basename:',basename
            for date in sorted(basenames[basename].keys()):
                print basenames[basename][date]

        if basename in selected.keys() or basename in rejected.keys(): continue

        # look at all the workflow names of a basename ordered by injection time

        # if the first workflow name of a basename ordered by injection time is not a rejected status, select it
        if basenames[basename][sorted(basenames[basename].keys())[0]][1]['Status'] not in rejected_status:
            selected[basename] = basenames[basename][sorted(basenames[basename].keys())[0]]
        else :
            # if the last workflow is not in rejected status (indication that the workflow never started to run), choose the first workflow as reference
            if basenames[basename][sorted(basenames[basename].keys())[-1]][1]['Status'] not in rejected_status:
                selected[basename] = basenames[basename][sorted(basenames[basename].keys())[0]]
            else :
                # if there is only one workflow for the basename and if the status is rejected
                if len(basenames[basename]) ==  1 and basenames[basename][basenames[basename].keys()[0]][1]['Status'] in rejected_status:
                    rejected[basename] = basenames[basename][basenames[basename].keys()[0]]
                else :
                    # go through workflowname per basename ordered by status, select the first status that is not a rejected status
                    firstvalidentry = None
                    for entry in sorted(basenames[basename].keys()):
                        if basenames[basename][entry][1]['Status'] not in rejected_status:
                            firstvalidentry = entry
                            break
                    if firstvalidentry != None:
                        selected[basename] = basenames[basename][firstvalidentry]
                    else:
                        # check if there are only workflownames per basename that are in a rejected status
                        nonrejectedstatus = False
                        for entry in basenames[basename].keys():
                            if basenames[basename][entry][1]['Status'] not in rejected_status:
                                nonrejectedstatus = True
                                break
                        if nonrejectedstatus == False :
                            # select last one
                            rejected[basename] = basenames[basename][sorted(basenames[basename].keys())[-1]]
                            
        if basename in selected.keys() or basename in rejected.keys(): continue
        print 'could not decide which workflow is the original workflow for basename:',basename
        for date in sorted(basenames[basename].keys()):
            print basenames[basename][date]
        sys.exit(1)
    
    # loop over selected workflows and fill requested events per day
    # only fill day if defined as key of outputdict
    api=DbsApi(url=dbsurl)
    for basename in selected.keys():
        print 'selected basename:',basename
        for date in sorted(basenames[basename].keys()):
            print basenames[basename][date]
        workflowname = selected[basename][0]
        workflow_dict = selected[basename][1]

        # extract unix time of start of day of request date
        request_date = datetime.datetime.strptime(workflow_dict['Request date'],"%Y-%m-%d %H:%M:%S")
        request_date = request_date.replace(tzinfo=pytz.timezone('UTC'))
        request_day = int(datetime.datetime(request_date.year, request_date.month, request_date.day,0,0,0,0, tzinfo=pytz.timezone('UTC')).strftime("%s"))
        if str(request_day) not in outputdict.keys(): continue
        if 'REQUESTED' not in outputdict[str(request_day)].keys(): outputdict[str(request_day)]['REQUESTED'] = 0
        if 'WORKFLOWS' not in outputdict[str(request_day)].keys(): outputdict[str(request_day)]['WORKFLOWS'] = []
        outputdict[str(request_day)]['WORKFLOWS'].append(workflowname)
        request_events = int(workflow_dict['Requested events'])
        if request_events == 0 and workflow_dict['Input Dataset'] != '':
            blocks = api.listBlocks(dataset=workflow_dict['Input Dataset'], detail=False)
            for block in blocks:
                reply= api.listBlockSummaries(block_name=block['block_name'])
                request_events += reply[0]['num_event']
        if workflow_dict['Filter efficiency'] == None :
            outputdict[str(request_day)]['REQUESTED'] += int(request_events)
        else:
            outputdict[str(request_day)]['REQUESTED'] += int(request_events) * float(workflow_dict['Filter efficiency'])
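When 'Requested events' is zero, the code above falls back to counting the events of the input dataset block by block. The same lookup can be checked in isolation; a sketch with a placeholder dataset name:

from dbs.apis.dbsClient import DbsApi

api = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
input_dataset = '/SingleMuon/Run2016D-23Sep2016-v1/AOD'  # placeholder input dataset

total_events = 0
for block in api.listBlocks(dataset=input_dataset, detail=False):
    # listBlockSummaries returns one summary record for the given block
    summary = api.listBlockSummaries(block_name=block['block_name'])
    total_events += summary[0]['num_event']
print(total_events)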
Example #22
0
        writeUrl = url + 'DBSWriter'

    readApi  = DbsApi(url=readUrl)
    writeApi = DbsApi(url=writeUrl)

    dataset = options.dataset
    new_location = options.new_location  # may be None if no new location was requested

    ###sanitize input
    # dataset name
    Lexicon.dataset(dataset)
    
    # PNN
    if new_location:
        Lexicon.cmsname(new_location)

    # process dataset by blocks

    blockDicts = readApi.listBlocks(dataset=dataset, detail=True)
    for block in blockDicts:
        blName = block['block_name']
        location = block['origin_site_name']
        logging.debug('block %s at location: %s' % (blName, location))
        if new_location:
            writeApi.updateBlockSiteName(block_name=blName, origin_site_name=new_location)
            logging.debug('location set to %s' % (new_location))
        

    logging.info("Done")
Example #23
0
def dbs3():
    dbsApi = DbsApi(url = 'https://cmsweb-testbed.cern.ch/dbs/prod/global/DBSWriter')
    result = dbsApi.listBlocks(block_name = '/PYTHIA6_Tauola_TTbar_TuneZ2star_14TeV/Summer13-UpgrdPhase2LB4PS_POSTLS261_V2-v1/GEN-SIM#1f7f8d76-c40a-11e2-83c6-003048f0e3f4')
    print result
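With detail=True the same query also returns the block metadata used elsewhere in these examples (dataset, origin_site_name, open_for_writing). A hedged variant of the test above, pointed at the reader instance:

def dbs3_detailed():
    # hedged variant of dbs3() above, using the reader instance and detail=True
    dbsApi = DbsApi(url='https://cmsweb-testbed.cern.ch/dbs/prod/global/DBSReader')
    blocks = dbsApi.listBlocks(block_name='/PYTHIA6_Tauola_TTbar_TuneZ2star_14TeV/Summer13-UpgrdPhase2LB4PS_POSTLS261_V2-v1/GEN-SIM#1f7f8d76-c40a-11e2-83c6-003048f0e3f4',
                               detail=True)
    for block in blocks:
        print(block['dataset'], block['origin_site_name'], block['open_for_writing'])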
Example #24
0
class DBS3Reader(object):
    """
    _DBSReader_

    General API for reading data from DBS
    """
    def __init__(self, url, logger=None, **contact):

        # instantiate dbs api object
        try:
            self.dbsURL = url.replace("cmsweb.cern.ch", "cmsweb-prod.cern.ch")
            self.dbs = DbsApi(self.dbsURL, **contact)
            self.logger = logger or logging.getLogger(self.__class__.__name__)
        except dbsClientException as ex:
            msg = "Error in DBSReader with DbsApi\n"
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def _getLumiList(self, blockName=None, lfns=None, validFileOnly=1):
        """
        currently only take one lfn but dbs api need be updated
        """
        try:
            if blockName:
                lumiLists = self.dbs.listFileLumis(block_name=blockName,
                                                   validFileOnly=validFileOnly)
            elif lfns:
                lumiLists = []
                for slfn in grouper(lfns, 50):
                    lumiLists.extend(
                        self.dbs.listFileLumiArray(logical_file_name=slfn))
            else:
                # shouldn't call this with both blockName and lfns empty
                # but still returns empty dict for that case
                return {}
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFileLumiArray(%s)\n" % lfns
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        lumiDict = {}
        for lumisItem in lumiLists:
            lumiDict.setdefault(lumisItem['logical_file_name'], [])
            item = {}
            item["RunNumber"] = lumisItem['run_num']
            item['LumiSectionNumber'] = lumisItem['lumi_section_num']
            if lumisItem.get('event_count', None) is not None:
                item['EventCount'] = lumisItem['event_count']
            lumiDict[lumisItem['logical_file_name']].append(item)
            # TODO: add key for lumi and event pair.
        return lumiDict

    def checkDBSServer(self):
        """
        check whether dbs server is up and running
        returns {"dbs_instance": "prod/global", "dbs_version": "3.3.144"}
        """
        try:
            return self.dbs.serverinfo()
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBS server is not up: %s" % self.dbsURL
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listPrimaryDatasets(self, match='*'):
        """
        _listPrimaryDatasets_

        return a list of primary datasets. The full primary dataset name must be provided;
        pattern-based matching is no longer supported.
        If no name is provided, all primary datasets are returned
        """
        try:
            result = self.dbs.listPrimaryDatasets(primary_ds_name=match)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listPrimaryDataset(%s)\n" % match
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['primary_ds_name'] for x in result]
        return result

    def matchProcessedDatasets(self, primary, tier, process):
        """
        _matchProcessedDatasets_

        return a list of Processed datasets
        """
        result = []
        try:
            datasets = self.dbs.listDatasets(primary_ds_name=primary,
                                             data_tier_name=tier,
                                             detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for dataset in datasets:
            dataset = remapDBS3Keys(dataset, processed_ds_name='Name')
            dataset['PathList'] = [dataset['dataset']]
            if dataset['Name'] == process:
                result.append(dataset)
        return result

    def listRuns(self, dataset=None, block=None):
        """
        it gets a list of DbsRun objects, but for our purpose
        only the list of run numbers is collected.
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        runs = []
        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)
        for x in results:
            runs.extend(x['run_num'])
        return runs

    def listRunLumis(self, dataset=None, block=None):
        """
        It gets a list of DBSRun objects and returns the number of lumisections per run
        DbsRun (RunNumber,
                NumberOfEvents,
                NumberOfLumiSections,
                TotalLuminosity,
                StoreNumber,
                StartOfRungetLong,
                EndOfRun,
                CreationDate,
                CreatedBy,
                LastModificationDate,
                LastModifiedBy
                )
        """
        # Pointless code in python3
        block = decodeBytesToUnicode(block)
        dataset = decodeBytesToUnicode(dataset)

        try:
            if block:
                results = self.dbs.listRuns(block_name=block)
            else:
                results = self.dbs.listRuns(dataset=dataset)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listRuns(%s, %s)\n" % (dataset, block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        # send the result in the runDict format; this format is kept for sync with the dbs2 call,
        # which returned {run_number: num_lumis}, but the dbs3 call doesn't return the number of lumis,
        # so it returns {run_number: None}
        # TODO: After DBS2 is completely removed, change the return format to a more sensible one

        runDict = {}
        for x in results:
            for runNumber in x["run_num"]:
                runDict[runNumber] = None
        return runDict

    def listProcessedDatasets(self, primary, dataTier='*'):
        """
        _listProcessedDatasets_

        return a list of Processed datasets for the primary and optional
        data tier value

        """
        try:
            result = self.dbs.listDatasets(primary_ds_name=primary,
                                           data_tier_name=dataTier)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listProcessedDatasets(%s)\n" % primary
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [x['dataset'].split('/')[2] for x in result]
        return result

    def listDatasetFiles(self, datasetPath):
        """
        _listDatasetFiles_

        Get list of files for dataset

        """
        return [
            x['logical_file_name']
            for x in self.dbs.listFileArray(dataset=datasetPath)
        ]

    def listDatatiers(self):
        """
        _listDatatiers_

        Get a list of datatiers known by DBS.
        """
        return [tier['data_tier_name'] for tier in self.dbs.listDataTiers()]

    def listDatasetFileDetails(self,
                               datasetPath,
                               getParents=False,
                               getLumis=True,
                               validFileOnly=1):
        """
        TODO: This is completely wrong and needs to be redone, or removed - fetching the whole dataset
        at once might be too costly

        _listDatasetFileDetails_

        Get list of lumis, events, and parents for each file in a dataset
        Return a dict where the keys are the files, and for each file we have something like:
            { 'NumberOfEvents': 545,
              'BlockName': '/HighPileUp/Run2011A-v1/RAW#dd6e0796-cbcc-11e0-80a9-003048caaace',
              'Lumis': {173658: [8, 12, 9, 14, 19, 109, 105]},
              'Parents': [],
              'Checksum': '22218315',
              'Adler32': 'a41a1446',
              'FileSize': 286021145,
              'ValidFile': 1
            }

        """
        fileDetails = self.getFileListByDataset(dataset=datasetPath,
                                                validFileOnly=validFileOnly,
                                                detail=True)
        blocks = set()  # the set of blocks of the dataset
        # Iterate over the files and prepare the set of blocks and a dict where the keys are the files
        files = {}
        for f in fileDetails:
            blocks.add(f['block_name'])
            files[f['logical_file_name']] = remapDBS3Keys(f, stringify=True)
            files[f['logical_file_name']]['ValidFile'] = f['is_file_valid']
            files[f['logical_file_name']]['Lumis'] = {}
            files[f['logical_file_name']]['Parents'] = []

        # Iterate over the blocks and get parents and lumis
        for blockName in blocks:
            # get the parents
            if getParents:
                parents = self.dbs.listFileParents(block_name=blockName)
                for p in parents:
                    if p['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        files[p['logical_file_name']]['Parents'].extend(
                            p['parent_logical_file_name'])

            if getLumis:
                # get the lumis
                file_lumis = self.dbs.listFileLumis(block_name=blockName)
                for f in file_lumis:
                    if f['logical_file_name'] in files:  # invalid files are not there if validFileOnly=1
                        if f['run_num'] in files[
                                f['logical_file_name']]['Lumis']:
                            files[f['logical_file_name']]['Lumis'][
                                f['run_num']].extend(f['lumi_section_num'])
                        else:
                            files[f['logical_file_name']]['Lumis'][
                                f['run_num']] = f['lumi_section_num']

        return files

    def crossCheck(self, datasetPath, *lfns):
        """
        _crossCheck_

        For the dataset provided, check that the lfns listed all exist
        in the dataset.

        Return the list of lfns that are in the dataset

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath,
                                         validFileOnly=1,
                                         detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        return list(setOfAllLfns.intersection(setOfKnownLfns))

    def crossCheckMissing(self, datasetPath, *lfns):
        """
        _crossCheckMissing_

        As cross check, but return value is a list of files that
        are *not* known by DBS

        """
        allLfns = self.dbs.listFileArray(dataset=datasetPath,
                                         validFileOnly=1,
                                         detail=False)
        setOfAllLfns = set(allLfns)
        setOfKnownLfns = set(lfns)
        knownFiles = setOfAllLfns.intersection(setOfKnownLfns)
        unknownFiles = setOfKnownLfns.difference(knownFiles)
        return list(unknownFiles)

    def getDBSSummaryInfo(self, dataset=None, block=None):
        """
        Get the dataset summary, including # of files, events, blocks and total size
        """
        if dataset:
            self.checkDatasetPath(dataset)
        try:
            if block:
                summary = self.dbs.listFileSummaries(block_name=block,
                                                     validFileOnly=1)
            else:
                summary = self.dbs.listFileSummaries(dataset=dataset,
                                                     validFileOnly=1)
        except Exception as ex:
            msg = "Error in DBSReader.getDBSSummaryInfo(%s, %s)\n" % (dataset,
                                                                      block)
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if not summary:  # missing data or all files invalid
            return {}

        result = remapDBS3Keys(summary[0], stringify=True)
        result['path'] = dataset if dataset else ''
        result['block'] = block if block else ''
        return result

    def listFileBlocks(self, dataset, onlyClosedBlocks=False, blockName=None):
        """
        _listFileBlocks_

        Retrieve a list of fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        args = {'dataset': dataset, 'detail': False}
        if blockName:
            args['block_name'] = blockName
        if onlyClosedBlocks:
            args['detail'] = True
        try:
            blocks = self.dbs.listBlocks(**args)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if onlyClosedBlocks:
            result = [
                x['block_name'] for x in blocks
                if str(x['open_for_writing']) != "1"
            ]

        else:
            result = [x['block_name'] for x in blocks]

        return result

    def listOpenFileBlocks(self, dataset):
        """
        _listOpenFileBlocks_

        Retrieve a list of open fileblock names for a dataset

        """
        self.checkDatasetPath(dataset)
        try:
            blocks = self.dbs.listBlocks(dataset=dataset, detail=True)
        except dbsClientException as ex:
            msg = "Error in DBSReader.listFileBlocks(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        result = [
            x['block_name'] for x in blocks
            if str(x['open_for_writing']) == "1"
        ]

        return result

    def blockExists(self, fileBlockName):
        """
        _blockExists_

        Check to see if block with name provided exists in the DBS
        Instance.

        Return True if exists, False if not

        """
        self.checkBlockName(fileBlockName)
        try:

            blocks = self.dbs.listBlocks(block_name=fileBlockName)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockExists(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if len(blocks) == 0:
            return False
        return True

    def listFilesInBlock(self, fileBlockName, lumis=True, validFileOnly=1):
        """
        _listFilesInBlock_

        Get a list of files in the named fileblock
        TODO: lumis could be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it is always True.
        The code should be cleaned up once dbs2 is completely deprecated;
        calling lumis just for the run number is expensive.
        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            files = self.dbs.listFileArray(block_name=fileBlockName,
                                           validFileOnly=validFileOnly,
                                           detail=True)
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if lumis:
            lumiDict = self._getLumiList(blockName=fileBlockName,
                                         validFileOnly=validFileOnly)

        result = []
        for fileInfo in files:
            if lumis:
                fileInfo["LumiList"] = lumiDict[fileInfo['logical_file_name']]
            result.append(remapDBS3Keys(fileInfo, stringify=True))
        return result

    def listFilesInBlockWithParents(self,
                                    fileBlockName,
                                    lumis=True,
                                    validFileOnly=1):
        """
        _listFilesInBlockWithParents_

        Get a list of files in the named fileblock including
        the parents of that file.
        TODO: lumis could be False when lumi splitting is not required.
        However, WMBSHelper expects file['LumiList'] to get the run number,
        so for now it is always True.

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.listFilesInBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            # TODO: should we get only valid blocks for this?
            files = self.dbs.listFileParents(block_name=fileBlockName)
            fileDetails = self.listFilesInBlock(fileBlockName, lumis,
                                                validFileOnly)

        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n" % (
                fileBlockName, )
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        childByParents = defaultdict(list)
        for f in files:
            # Probably a child can have more than 1 parent file
            for fp in f['parent_logical_file_name']:
                childByParents[fp].append(f['logical_file_name'])

        parentsLFNs = list(childByParents)

        if len(parentsLFNs) == 0:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlockWithParents(%s)\n There is no parents files" % (
                fileBlockName)
            raise DBSReaderError(msg)

        parentFilesDetail = []
        # TODO: slice parentsLFNs until the DBS api handles large lists itself.
        # Remove the slicing once the DBS api handles it.
        for pLFNs in grouper(parentsLFNs, 50):
            parentFilesDetail.extend(
                self.dbs.listFileArray(logical_file_name=pLFNs, detail=True))

        if lumis:
            parentLumis = self._getLumiList(lfns=parentsLFNs)

        parentsByLFN = defaultdict(list)

        for pf in parentFilesDetail:
            parentLFN = pf['logical_file_name']
            dbsFile = remapDBS3Keys(pf, stringify=True)
            if lumis:
                dbsFile["LumiList"] = parentLumis[parentLFN]

            for childLFN in childByParents[parentLFN]:
                parentsByLFN[childLFN].append(dbsFile)

        for fileInfo in fileDetails:
            fileInfo["ParentList"] = parentsByLFN[
                fileInfo['logical_file_name']]

        return fileDetails

    def lfnsInBlock(self, fileBlockName):
        """
        _lfnsInBlock_

        LFN list only for block, details = False => faster query

        """
        if not self.blockExists(fileBlockName):
            msg = "DBSReader.lfnsInBlock(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        try:
            lfns = self.dbs.listFileArray(block_name=fileBlockName,
                                          validFileOnly=1,
                                          detail=False)
            return lfns
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listFilesInBlock(%s)\n" % fileBlockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listFileBlockLocation(self, fileBlockNames):
        """
        _listFileBlockLocation_

        Get origin_site_name of a block

        """

        singleBlockName = None
        if isinstance(fileBlockNames, (str, bytes)):
            singleBlockName = fileBlockNames
            fileBlockNames = [fileBlockNames]

        for block in fileBlockNames:
            self.checkBlockName(block)

        locations = {}
        node_filter = set(['UNKNOWN', None])

        blocksInfo = {}
        try:
            for block in fileBlockNames:
                blocksInfo.setdefault(block, [])
                # there should be only one element with a single origin site string ...
                for blockInfo in self.dbs.listBlockOrigin(block_name=block):
                    blocksInfo[block].append(blockInfo['origin_site_name'])
        except dbsClientException as ex:
            msg = "Error in DBS3Reader: self.dbs.listBlockOrigin(block_name=%s)\n" % fileBlockNames
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        for block in fileBlockNames:
            valid_nodes = set(blocksInfo.get(block, [])) - node_filter
            locations[block] = list(valid_nodes)

        # returning single list if a single block is passed
        if singleBlockName:
            return locations[singleBlockName]

        return locations

    def getFileBlock(self, fileBlockName):
        """
        Retrieve a list of files in the block and a flag whether the
        block is still open or not; this call used to also resolve the
        block location via PhEDEx.

        :return: a dictionary in the format of:
            {"PhEDExNodeNames" : [],
             "Files" : { LFN : Events },
             "IsOpen" : True|False}
        """
        result = {
            "PhEDExNodeNames": [],  # FIXME: we better get rid of this line!
            "Files": self.listFilesInBlock(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
        }
        return result

    def getFileBlockWithParents(self, fileBlockName):
        """
        Retrieve a list of parent files in the block and a flag whether the
        block is still open or not; this call used to also resolve the
        block location via PhEDEx.

        :return: a dictionary in the format of:
            {"PhEDExNodeNames" : [],
             "Files" : { LFN : Events },
             "IsOpen" : True|False}
        """
        fileBlockName = decodeBytesToUnicode(fileBlockName)

        if not self.blockExists(fileBlockName):
            msg = "DBSReader.getFileBlockWithParents(%s): No matching data"
            raise DBSReaderError(msg % fileBlockName)

        result = {
            "PhEDExNodeNames": [],  # FIXME: we better get rid of this line!
            "Files": self.listFilesInBlockWithParents(fileBlockName),
            "IsOpen": self.blockIsOpen(fileBlockName)
        }
        return result

    def listBlockParents(self, blockName):
        """
        Return a list of parent blocks for a given child block name
        """
        # FIXME: note the different returned data structure
        result = []
        self.checkBlockName(blockName)
        blocks = self.dbs.listBlockParents(block_name=blockName)
        result = [block['parent_block_name'] for block in blocks]
        return result

    def blockIsOpen(self, blockName):
        """
        _blockIsOpen_

        Return True if the named block is open, False if not or if the
        block doesn't exist

        """
        self.checkBlockName(blockName)
        blockInstance = self.dbs.listBlocks(block_name=blockName, detail=True)
        if len(blockInstance) == 0:
            return False
        blockInstance = blockInstance[0]
        isOpen = blockInstance.get('open_for_writing', 1)
        if isOpen == 0:
            return False
        return True

    def blockToDatasetPath(self, blockName):
        """
        _blockToDatasetPath_

        Given a block name, get the dataset Path associated with that
        Block.

        Returns the dataset path, or None if not found

        """
        self.checkBlockName(blockName)
        try:
            blocks = self.dbs.listBlocks(block_name=blockName, detail=True)
        except Exception as ex:
            msg = "Error in "
            msg += "DBSReader.blockToDatasetPath(%s)\n" % blockName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if blocks == []:
            return None

        pathname = blocks[-1].get('dataset', None)
        return pathname

    def listDatasetLocation(self, datasetName):
        """
        _listDatasetLocation_

        List the origin SEs hosting at least one block of the given
        dataset.
        """
        self.checkDatasetPath(datasetName)

        locations = set()
        try:
            blocksInfo = self.dbs.listBlockOrigin(dataset=datasetName)
        except dbsClientException as ex:
            msg = "Error in DBSReader: dbsApi.listBlocks(dataset=%s)\n" % datasetName
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

        if not blocksInfo:  # no data location from dbs
            return list()

        for blockInfo in blocksInfo:
            locations.update(blockInfo['origin_site_name'])

        locations.difference_update(
            ['UNKNOWN', None])  # remove entry when SE name is 'UNKNOWN'

        return list(locations)

    def checkDatasetPath(self, pathName):
        """
         _checkDatasetPath_
        """
        if pathName in ("", None):
            raise DBSReaderError("Invalid Dataset Path name: => %s <=" %
                                 pathName)
        else:
            try:
                result = self.dbs.listDatasets(dataset=pathName,
                                               dataset_access_type='*')
                if len(result) == 0:
                    raise DBSReaderError("Dataset %s doesn't exist in DBS %s" %
                                         (pathName, self.dbsURL))
            except (dbsClientException, HTTPError) as ex:
                msg = "Error in "
                msg += "DBSReader.checkDatasetPath(%s)\n" % pathName
                msg += "%s\n" % formatEx3(ex)
                raise DBSReaderError(msg)
        return

    def checkBlockName(self, blockName):
        """
         _checkBlockName_
        """
        if blockName in ("", "*", None):
            raise DBSReaderError("Invalid Block name: => %s <=" % blockName)

    def getFileListByDataset(self, dataset, validFileOnly=1, detail=True):
        """
        _getFileListByDataset_

        Given a dataset, retrieves all blocks, lfns and number of events (among other
        not really important info).
        Returns a list of dict.
        """

        try:
            fileList = self.dbs.listFileArray(dataset=dataset,
                                              validFileOnly=validFileOnly,
                                              detail=detail)
            return fileList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.getFileListByDataset(%s)\n" % dataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    def listDatasetParents(self, childDataset):
        """
        list the parent dataset paths for the given childDataset
        """
        try:
            parentList = self.dbs.listDatasetParents(dataset=childDataset)
            return parentList
        except dbsClientException as ex:
            msg = "Error in "
            msg += "DBSReader.listDatasetParents(%s)\n" % childDataset
            msg += "%s\n" % formatEx3(ex)
            raise DBSReaderError(msg)

    # def getListFilesByLumiAndDataset(self, dataset, files):
    #     "Unsing pycurl to get all the child parents pair for given dataset"
    #
    #     urls = ['%s/data/dbs/fileparentbylumis?block_name=%s' % (
    #              self.dbsURL, b["block_name"]) for b in self.dbs.listBlocks(dataset=dataset)]
    #
    #     data = multi_getdata(urls, ckey(), cert())
    #     rdict = {}
    #     for row in data:
    #         try:
    #             data = json.loads(row['data'])
    #             rdict[req] = data['result'][0]  # we get back {'result': [workflow]} dict
    #         except Exception as exp:
    #             print("ERROR: fail to load data as json record, error=%s" % str(exp))
    #             print(row)
    #     return rdict

    def getParentFilesGivenParentDataset(self, parentDataset, childLFNs):
        """
        returns parent files for the given childLFNs when DBS doesn't have a direct parent-child relationship in the DB
        Only use this for finding missing parents

        :param parentDataset: parent dataset for the childLFNs
        :param childLFNs: files in the child dataset
        :return: dict mapping each childLFN to the set of its parent files
        """
        fInfo = self.dbs.listFileLumiArray(logical_file_name=childLFNs)
        parentFiles = defaultdict(set)
        for f in fInfo:
            pFileList = self.dbs.listFiles(dataset=parentDataset,
                                           run_num=f['run_num'],
                                           lumi_list=f['lumi_section_num'])
            pFiles = set([x['logical_file_name'] for x in pFileList])
            parentFiles[f['logical_file_name']] = parentFiles[
                f['logical_file_name']].union(pFiles)
        return parentFiles

    def getParentFilesByLumi(self, childLFN):
        """
        get the parent files' lfns by lumi (this might not be the actual parentage relation in DBS, just parentage by lumis).
        Use only for a specific lfn for validation purposes; for the parentage fix use findAndInsertMissingParentage
        :param childLFN:
        :return: list of dictionaries with the parent files and parent dataset for the given child LFN, e.g.
        [{"ParentDataset": "/abc/bad/ddd", "ParentFiles": ["alf", "baf", ...]}]
        """
        childDatasets = self.dbs.listDatasets(logical_file_name=childLFN)
        result = []
        for i in childDatasets:
            parents = self.dbs.listDatasetParents(dataset=i["dataset"])
            for parent in parents:
                parentFiles = self.getParentFilesGivenParentDataset(
                    parent['parent_dataset'], childLFN)
                result.append({
                    "ParentDataset": parent['parent_dataset'],
                    "ParentFiles": list(parentFiles)
                })
        return result

    def insertFileParents(self, childBlockName, childParentsIDPairs):
        """
        :param childBlockName: child block name
        :param childParentsIDPairs: list of list child and parent file ids, i.e. [[1,2], [3,4]...]
                dbs validate child ids from the childBlockName
        :return: None
        """
        return self.dbs.insertFileParents({
            "block_name":
            childBlockName,
            "child_parent_id_list":
            childParentsIDPairs
        })

    def findAndInsertMissingParentage(self,
                                      childBlockName,
                                      parentData,
                                      insertFlag=True):
        """
        :param childBlockName: child block name
        :param parentData: a dictionary with complete parent dataset file/run/lumi information
        :param insertFlag: boolean to allow parentage insertion into DBS or not
        :return: number of child files for which parent pairs were found (and inserted when insertFlag is True)
        """
        # in the format of: {'fileid': [[run_num1, lumi1], [run_num1, lumi2], ...]}
        # e.g. {'554307997': [[1, 557179], [1, 557178], [1, 557181], ...]}
        childBlockData = self.dbs.listBlockTrio(block_name=childBlockName)

        # build the actual mapping, like {"child_id": ["parent_id", "parent_id2", ...], ...}
        mapChildParent = {}
        # there should be only 1 item, but we better be safe
        for item in childBlockData:
            for childFileID in item:
                for runLumiPair in item[childFileID]:
                    frozenKey = frozenset(runLumiPair)
                    parentId = parentData.get(frozenKey)
                    if parentId is None:
                        msg = "Child file id: %s, with run/lumi: %s, has no match in the parent dataset"
                        self.logger.warning(msg, childFileID, frozenKey)
                        continue
                    mapChildParent.setdefault(childFileID, set())
                    mapChildParent[childFileID].add(parentId)

        if insertFlag and mapChildParent:
            # convert dictionary to list of unique childID, parentID tuples
            listChildParent = []
            for childID in mapChildParent:
                for parentID in mapChildParent[childID]:
                    listChildParent.append([int(childID), int(parentID)])
            self.dbs.insertFileParents({
                "block_name": childBlockName,
                "child_parent_id_list": listChildParent
            })
        return len(mapChildParent)

    def listBlocksWithNoParents(self, childDataset):
        """
        :param childDataset: child dataset to check
        :return: set of child blocks with no parent block
        """
        allBlocks = self.dbs.listBlocks(dataset=childDataset)
        blockNames = []
        for block in allBlocks:
            blockNames.append(block['block_name'])
        parentBlocks = self.dbs.listBlockParents(block_name=blockNames)

        cblock = set()
        for pblock in parentBlocks:
            cblock.add(pblock['this_block_name'])

        noParentBlocks = set(blockNames) - cblock
        return noParentBlocks

    def listFilesWithNoParents(self, childBlockName):
        """
        :param childBlockName:
        :return:
        """
        allFiles = self.dbs.listFiles(block_name=childBlockName)
        parentFiles = self.dbs.listFileParents(block_name=childBlockName)

        allFileNames = set()
        for fInfo in allFiles:
            allFileNames.add(fInfo['logical_file_name'])

        cfile = set()
        for pFile in parentFiles:
            cfile.add(pFile['logical_file_name'])

        noParentFiles = allFileNames - cfile
        return list(noParentFiles)

    def fixMissingParentageDatasets(self, childDataset, insertFlag=True):
        """
        :param childDataset: child dataset whose parentage needs to be set correctly.
        :return: blocks for which the parentage insertion failed, for retry
        """
        pDatasets = self.listDatasetParents(childDataset)
        self.logger.info("Parent datasets for %s are: %s", childDataset,
                         pDatasets)
        # print("parent datasets %s\n" % pDatasets)
        # pDatasets format is
        # [{'this_dataset': '/SingleMuon/Run2016D-03Feb2017-v1/MINIAOD', 'parent_dataset_id': 13265209, 'parent_dataset': '/SingleMuon/Run2016D-23Sep2016-v1/AOD'}]
        if not pDatasets:
            self.logger.warning("No parent dataset found for child dataset %s",
                                childDataset)
            return {}

        parentFullInfo = self.getParentDatasetTrio(childDataset)
        blocks = self.listBlocksWithNoParents(childDataset)
        failedBlocks = []
        self.logger.info("Found %d blocks without parentage information",
                         len(blocks))
        for blockName in blocks:
            try:
                self.logger.info("Fixing parentage for block: %s", blockName)
                numFiles = self.findAndInsertMissingParentage(
                    blockName, parentFullInfo, insertFlag=insertFlag)
                self.logger.debug("%s file parentage added for block %s",
                                  numFiles, blockName)
            except Exception as ex:
                self.logger.exception("Parentage updated failed for block %s",
                                      blockName)
                failedBlocks.append(blockName)

        return failedBlocks

    def getParentDatasetTrio(self, childDataset):
        """
        Provided a dataset name, return all the parent dataset information, such as:
          - file ids, run number and lumi section
        NOTE: This API is meant to be used by the StepChainParentage thread only!!!
        :param childDataset: name of the child dataset
        :return: a dictionary where the key is a frozenset of run/lumi and the value is the parent file id
        """
        # this will return data in the format of:
        # {'554307997': [[1, 557179], [1, 557178], ...], ...}
        # such that: the key is the file id and each list entry is [run_number, lumi_section_number].
        parentFullInfo = self.dbs.listParentDSTrio(dataset=childDataset)

        # build the lookup from frozenset(run/lumi) to the parent file id
        parentFrozenData = {}
        for item in parentFullInfo:
            for fileId in item:
                for runLumiPair in item[fileId]:
                    frozenKey = frozenset(runLumiPair)
                    parentFrozenData[frozenKey] = fileId
        return parentFrozenData
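A hedged usage sketch of the reader class above, assuming the standard production global DBS reader instance and a placeholder dataset (error handling omitted):

reader = DBS3Reader('https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
print(reader.checkDBSServer())                     # e.g. {'dbs_instance': 'prod/global', ...}
print(len(reader.listDatatiers()))                 # number of data tiers known to DBS
dataset = '/SingleMuon/Run2016D-23Sep2016-v1/AOD'  # placeholder dataset
print(reader.getDBSSummaryInfo(dataset=dataset))
print(reader.listFileBlocks(dataset, onlyClosedBlocks=True)[:3])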
Example #25
0
datatiers = {}
datatiers['data'] = ['RAW','RECO','AOD','RAW-RECO','USER']
datatiers['mc'] = ['GEN','GEN-SIM','GEN-RAW','GEN-SIM-RECO','AODSIM']
separations = ['PromptReco','PromptSkim']
exclusion_strings = {}
exclusion_strings['mc'] = ['test','backfill','jobrobot','sam','bunnies','penguins']
exclusion_strings['data'] = ['test','backfill','StoreResults','monitor','Error/','Scouting','MiniDaq','/Alca','L1Accept','/Hcal','express','Interfill']

api3 = DbsApi(url)

if read == None:
	for category in datatiers.keys():
		if category not in dbs_query_results.keys(): dbs_query_results[category] = {}
		for datatier in datatiers[category]:
			if datatier not in dbs_query_results[category].keys(): dbs_query_results[category][datatier] = {}
			blocks = api3.listBlocks(data_tier_name=datatier,min_cdate=startdate.strftime("%s"),max_cdate=enddate.strftime("%s"))
			for block in blocks:
				exclude = False
				for exclusion_string in exclusion_strings[category]:
					if exclusion_string.lower() in block['block_name'].lower():
						if verbose == True: print 'blockname was rejected:',block['block_name']
						exclude = True
						continue
				if exclude == True: continue
				if verbose == True: print 'Querying for the summary for block:',block['block_name'],'!'
				properties = api3.listBlockSummaries(block_name=block['block_name'])
				dbs_query_results[category][datatier][block['block_name']] = properties
	
	if persist != None:
		outputfile = open(persist,'w')
		json.dump(dbs_query_results,outputfile)
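If a persist file was written, the cached summaries can be reloaded later instead of querying DBS again. A small sketch, assuming persist still holds the same path used above:

import json

with open(persist) as inputfile:
    dbs_query_results = json.load(inputfile)
print(list(dbs_query_results.keys()))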
Example #26
0
import sys
import optparse

from dbs.apis.dbsClient import DbsApi

parser = optparse.OptionParser()
parser.add_option('--dataset', dest='dataset')
parser.add_option('--runwhitelist', dest='runwhitelist')
parser.add_option('--output_fname', dest='output_fname')

(options, args) = parser.parse_args()

if options.dataset == None or options.runwhitelist == None or options.output_fname == None:
    print "Usage: python2.6 get_list_of_blocks.py --dataset DATASETNAME --runwhitelist RUNWHITELIST --output_fname OUTPUTFILENAME"
    sys.exit(0)

dataset = options.dataset
runwhitelist = options.runwhitelist
output_fname = options.output_fname

dbsApi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')

blocks = dbsApi.listBlocks(dataset=dataset, run_num=runwhitelist)

with open(output_fname, 'w') as output_file:
    for block in blocks:
        output_file.write(block['block_name'] + '\n')

sys.exit(0)
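The script above passes the raw --runwhitelist value straight through to listBlocks. If the whitelist is supplied as a comma-separated string of run numbers (an assumption; the expected input format is not shown here), it could be normalised into a list of integers first, along these lines:

# Hypothetical pre-processing, assuming a comma-separated run list such as "316199,316200"
# and that listBlocks accepts a list of run numbers for run_num.
runwhitelist = [int(run) for run in options.runwhitelist.split(',') if run.strip()]
blocks = dbsApi.listBlocks(dataset=dataset, run_num=runwhitelist)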
Example #27
0
class RequestQuery:

    def __init__(self,config):
        self.br=Browser()

        self.config = config
        
        # Initialise connections
        self.phedex = PhEDEx({"endpoint":"https://cmsweb.cern.ch/phedex/datasvc/json/prod/"}, "json")
        self.dbsPhys01 = DbsApi(url = dbs_base_url+"phys01/DBSReader/")
        self.dbsPhys02 = DbsApi(url = dbs_base_url+"phys02/DBSReader/")
        self.dbsPhys03 = DbsApi(url = dbs_base_url+"phys03/DBSReader/")
        
    def __del__(self):
        self.br.close()

    def getScramArchByCMSSW(self):
        """
        Get from the list of available CMSSW releases
        return a dictionary of ScramArchitecture by CMSSW
        """
        
        # Set a temporary connection to the server and get the response from cmstags
        url = 'https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML'
        br = Browser()
        br.set_handle_robots(False)
        response=br.open(url)
        soup = BeautifulSoup(response.read())
        
        # Dictionary form
        # {'CMSSW_X_X_X':[slc5_amd64_gcc472], ... }
        archByCmssw={}
        
        # Fill the dictionary
        for arch in soup.find_all('architecture'): 
            for cmssw in arch.find_all('project'): 
                # CMSSW release
                cmsswLabel = cmssw.get('label').encode('ascii', 'ignore')
                if cmsswLabel not in archByCmssw:
                    archByCmssw[cmsswLabel]=[]
                # ScramArch related to this CMSSW release
                archName = arch.get('name').encode('ascii', 'ignore')
                archByCmssw[cmsswLabel].append(archName)
        
        return archByCmssw
      
    def getDatasetOriginSites(self, dbs_url, data):
        """
        Get the origin sites for each block of the dataset.
        Return the PhEDEx node names and the SE names of the block origin sites.
        """
        
        sites=[]
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listBlocks(detail=True,dataset=data)
        
        seList = []
        for block in response:
            if block['origin_site_name'] not in seList:
                seList.append(block['origin_site_name'])
        
        siteNames = []
        for node in self.nodeMappings['phedex']['node']:
            if node['se'] in seList:
                siteNames.append(node['name']) 
        
        return siteNames, seList
    
    def phEDExNodetocmsName(self, nodeList):
        """
        Convert PhEDEx node name list to cms names list 
        """
        names = []
        for node in nodeList:
            name = node.replace('_MSS',
                                '').replace('_Disk',
                                    '').replace('_Buffer',
                                        '').replace('_Export', '')
            if name not in names:
                names.append(name)
        return names
    
    def setGlobalTagFromOrigin(self, dbs_url,input_dataset):
        """
        Get the global tag of the dataset from the source dbs url. If it is not set, then set global tag to 'UNKNOWN'
        """
        
        globalTag = ""
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listOutputConfigs(dataset=input_dataset)
        
        globalTag = response[0]['global_tag']
        # GlobalTag cannot be empty
        if globalTag == '':
            globalTag = 'UNKNOWN'
            
        return globalTag
    
    def isDataAtUrl(self, dbs_url,input_dataset):
        """
        Returns True if the dataset is at the dbs url, if not returns False
        """
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listDatasets(dataset=input_dataset)
        # This means that the dataset is not at the url
        if not response:
            return False
        else:
            return True
         
    def getLabelByValueDict(self, control):
        """
        From control items, create a dictionary by values
        """   
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[value] = label
                
        return d
    
    def getValueByLabelDict(self, control):
        """
        From control items, create a dictionary by labels
        """
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[label] = value

        return d
    
    def createRequestJSON(self, ticket, input_dataset, dbs_url, cmssw_release, group_name, version = 1):
        """
        Creates a JSON file 'Ticket_#TICKET.json' with the needed
        information for creating a request on ReqMgr.
        Input:
            - ticket: the ticket #, for instance 110773 on https://ggus.eu/?mode=ticket_info&ticket_id=110773
            - input_dataset
            - dbs_url: only the instance name, for example "phys01" for
              https://cmsweb.cern.ch/dbs/prod/phys01/DBSReader
            - cmssw_release
            - group_name: the physics group name
            - version: the dataset version, 1 by default.
        It returns a dictionary that contains the request information.
        """

        scramArchByCMSSW = self.getScramArchByCMSSW()
        self.nodeMappings = self.phedex.getNodeMap()
        task = ticket
        print "Processing ticket: %s" % task
        
        #splitting input dataset       
        input_primary_dataset = input_dataset.split('/')[1].replace(' ','')
        input_processed_dataset = input_dataset.split('/')[2].replace(' ','')
        data_tier = input_dataset.split('/')[3].replace(' ','')
                
        # Transform input value to a valid DBS url
        #dbs_url = "https://cmsweb.cern.ch/dbs/prod/"+dbs_url+"/DBSReader"
        dbs_url = dbs_base_url+dbs_url+"/DBSReader"
        release_id = cmssw_release
                
        # check if deprecated release was used
        release = cmssw_release
        # check if release has not ScramArch match
        if release not in scramArchByCMSSW:
            raise Exception("Error on ticket %s due to ScramArch mismatch" % task)
        else:
            scram_arch = scramArchByCMSSW[release][-1]

        # check if dataset is not at dbs url
        try:
            data_at_url = self.isDataAtUrl(dbs_url,input_dataset)
        except:
            raise Exception('Error on ticket %s, dataset %s not available at %s' %(task, input_dataset,dbs_url))

        if not data_at_url:
            raise Exception('Error on ticket %s, dataset %s not available at %s' %(task, input_dataset,dbs_url))
                    
        ## Get Physics Group
        group_squad = 'cms-storeresults-'+group_name.replace("-","_").lower()

        ## Get Dataset Version
        dataset_version = str(version)

        # Set default Acquisition Era for StoreResults
        acquisitionEra = "StoreResults"

        ## Construction of the new dataset name (ProcessingString)
        ## remove leading hypernews or physics group name and StoreResults+Version
        if input_processed_dataset.find(group_name)==0:
            new_dataset = input_processed_dataset.replace(group_name,"",1)
        else:
            stripped_dataset = input_processed_dataset.split("-")[1:]
            new_dataset = '_'.join(stripped_dataset)
                        
        # Get dataset site info:
        phedex_map, se_names = self.getDatasetOriginSites(dbs_url,input_dataset)
        sites = self.phEDExNodetocmsName(phedex_map)
        
        infoDict = {}
        # Build store results json
        # First add all the defaults values
        infoDict["RequestType"] = "StoreResults"
        infoDict["UnmergedLFNBase"] = "/store/unmerged" 
        infoDict["MergedLFNBase"] = "/store/results/" + group_name.replace("-","_").lower()
        infoDict["MinMergeSize"] = 1500000000
        infoDict["MaxMergeSize"] = 5000000000
        infoDict["MaxMergeEvents"] = 100000
        infoDict["TimePerEvent"] = 40
        infoDict["SizePerEvent"] = 512.0
        infoDict["Memory"] = 2394
        infoDict["CmsPath"] = "/uscmst1/prod/sw/cms"                                        
        infoDict["Group"] = "DATAOPS"
        infoDict["DbsUrl"] = dbs_url
        
        # Add all the information pulled from Savannah
        infoDict["AcquisitionEra"] = acquisitionEra
        infoDict["GlobalTag"] = self.setGlobalTagFromOrigin(dbs_url, input_dataset)
        infoDict["DataTier"] = data_tier
        infoDict["InputDataset"] = input_dataset
        infoDict["ProcessingString"] = new_dataset
        infoDict["CMSSWVersion"] = release
        infoDict["ScramArch"] = scram_arch
        infoDict["ProcessingVersion"] = dataset_version                    
        infoDict["SiteWhitelist"] = list(sites)
        
        # Create report for Migration2Global
        report = {}
         
        #Fill json file, if status is done
        self.writeJSONFile(task, infoDict)
        report["json"] = 'y'
        report["task"] = int(task)
        report["InputDataset"] = input_dataset
        report["ProcessingString"] = new_dataset
        report["localUrl"] = dbs_url
        report["sites"] = list(sites)
        report["se_names"] = list(se_names)

        return report

    def writeJSONFile(self, task, infoDict):
        """
        This writes a JSON file at ComponentDir
        """
        ##check if file already exists
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'
        if not os.access(filename, os.F_OK):
            request = {'createRequest': infoDict}  ## CHECK THIS BEFORE FINISHING
            with open(filename, 'w') as jsonfile:
                jsonfile.write(json.dumps(request, sort_keys=True, indent=4))

        return

    def removeJSONFile(self,task):
        """
        This removes the JSON file at ComponentDir if it was created
        """
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'

        if os.access(filename,os.F_OK):
            os.remove(filename)
        return

    def printReport(self, report):
        """
        Print out a report
        """
        print "%20s %5s %10s %50s %50s" %( 'Ticket','json','local DBS','Sites','se_names') 
        print "%20s %5s %10s %50s %50s" %( '-'*20,'-'*5,'-'*10,'-'*50,'-'*50 )
        
        json = report["json"]
        ticket = report["task"]
        #status = report["ticketStatus"]
        localUrl = report["localUrl"].split('/')[5]
        site = ', '.join(report["sites"])
        se_names = ', '.join(report["se_names"])
        print "%20s %5s %10s %50s %50s" %(ticket,json,localUrl,site,se_names)  
Example #28
0
class DBSUploadPoller(BaseWorkerThread):
    """
    Handles poll-based DBSUpload

    """


    def __init__(self, config, dbsconfig = None):
        """
        Initialise class members
        """
        logging.info("Running __init__ for DBS3 Uploader")
        BaseWorkerThread.__init__(self)
        self.config     = config

        # This is slightly dangerous, but DBSUpload depends
        # on DBSInterface anyway
        self.dbsUrl           = self.config.DBS3Upload.dbsUrl

        self.dbsUtil = DBSBufferUtil()


        self.pool   = []
        self.blocksToCheck = []
        self.input  = None
        self.result = None
        self.nProc  = getattr(self.config.DBS3Upload, 'nProcesses', 4)
        self.wait   = getattr(self.config.DBS3Upload, 'dbsWaitTime', 2)
        self.nTries = getattr(self.config.DBS3Upload, 'dbsNTries', 300)
        self.dbs3UploadOnly = getattr(self.config.DBS3Upload, "dbs3UploadOnly", False)
        self.physicsGroup   = getattr(self.config.DBS3Upload, "physicsGroup", "NoGroup")
        self.datasetType    = getattr(self.config.DBS3Upload, "datasetType", "PRODUCTION")
        self.primaryDatasetType = getattr(self.config.DBS3Upload, "primaryDatasetType", "mc")
        self.blockCount     = 0
        self.dbsApi = DbsApi(url = self.dbsUrl)

        # List of blocks currently in processing
        self.queuedBlocks = []

        # Set up the pool of worker processes
        self.setupPool()

        # Setting up any cache objects
        self.blockCache = {}
        self.dasCache   = {}

        self.filesToUpdate = []

        self.produceCopy = getattr(self.config.DBS3Upload, 'copyBlock', False)
        self.copyPath    = getattr(self.config.DBS3Upload, 'copyBlockPath',
                                   '/data/mnorman/block.json')
        
        self.timeoutWaiver = 1

        return

    def setupPool(self):
        """
        _setupPool_

        Set up the processing pool for work
        """
        if len(self.pool) > 0:
            # Then something already exists.  Continue
            return

        self.input  = multiprocessing.Queue()
        self.result = multiprocessing.Queue()

        # Starting up the pool:
        for _ in range(self.nProc):
            p = multiprocessing.Process(target = uploadWorker,
                                        args = (self.input,
                                                self.result, self.dbsUrl))
            p.start()
            self.pool.append(p)

        return

    def __del__(self):
        """
        __del__

        Trigger a close of connections if necessary
        """
        self.close()
        return

    def close(self):
        """
        _close_

        Kill all connections and terminate
        """
        terminate = False
        for _ in self.pool:
            try:
                self.input.put('STOP')
            except Exception as ex:
                # Something very strange happens here
                # It's like it raises a blank exception
                # Upon being told to return
                msg =  "Hit some exception in deletion\n"
                msg += str(ex)
                logging.debug(msg)
                terminate = True
        try:
            self.input.close()
            self.result.close()
        except:
            # What are you going to do?
            pass
        for proc in self.pool:
            if terminate:
                proc.terminate()
            else:
                proc.join()
        self.pool   = []
        self.input  = None
        self.result = None
        return



    def terminate(self, params):
        """
        Do one more pass, then terminate

        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)


    def algorithm(self, parameters = None):
        """
        _algorithm_

        First, check blocks that may be already uploaded
        Then, load blocks
        Then, load files
        Then, move files into blocks
        Then add new blocks in DBSBuffer
        Then add blocks to DBS
        Then mark blocks as done in DBSBuffer
        """
        try:
            logging.info("Starting the DBSUpload Polling Cycle")
            self.checkBlocks()
            self.loadBlocks()

            # The following two functions will actually place new files into
            # blocks.  In DBS3 upload mode we rely on something else to do that
            # for us and will skip this step.
            if not self.dbs3UploadOnly:
                self.loadFiles()
                self.checkTimeout()
                self.checkCompleted()

            self.inputBlocks() 
            self.retrieveBlocks()
        except WMException:
            raise
        except Exception as ex:
            msg =  "Unhandled Exception in DBSUploadPoller!\n"
            msg += str(ex)
            msg += str(str(traceback.format_exc()))
            logging.error(msg)
            raise DBSUploadException(msg)

    def loadBlocks(self):
        """
        _loadBlocks_

        Find all blocks; make sure they're in the cache
        """
        openBlocks = self.dbsUtil.findOpenBlocks(self.dbs3UploadOnly)
        logging.info("These are the openblocks: %s" % openBlocks)

        # Load them if we don't have them
        blocksToLoad = []
        for block in openBlocks:
            if not block['blockname'] in self.blockCache.keys():
                blocksToLoad.append(block['blockname'])


        # Now load the blocks
        try:
            loadedBlocks = self.dbsUtil.loadBlocks(blocksToLoad, self.dbs3UploadOnly)
            logging.info("Loaded blocks: %s" % loadedBlocks)
        except WMException:
            raise
        except Exception as ex:
            msg =  "Unhandled exception while loading blocks.\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Blocks to load: %s\n" % blocksToLoad)
            raise DBSUploadException(msg)

        for blockInfo in loadedBlocks:
            das  = blockInfo['DatasetAlgo']
            loc  = blockInfo['origin_site_name']
            workflow =  blockInfo['workflow']
            block = DBSBlock(name = blockInfo['block_name'],
                             location = loc, das = das, workflow = workflow)
            block.FillFromDBSBuffer(blockInfo)
            blockname = block.getName()

            # Now we have to load files...
            try:
                files = self.dbsUtil.loadFilesByBlock(blockname = blockname)
                logging.info("Have %i files for block %s" % (len(files), blockname))
            except WMException:
                raise
            except Exception as ex:
                msg =  "Unhandled exception while loading files for existing blocks.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Blocks being loaded: %s\n" % blockname)
                raise DBSUploadException(msg)

            # Add the loaded files to the block
            for file in files:
                block.addFile(file, self.datasetType, self.primaryDatasetType)

            # Add to the cache
            self.addNewBlock(block = block)

        # All blocks should now be loaded and present
        # in both the block cache (which has all the info)
        # and the dasCache (which is a list of name pointers
        # to the keys in the block cache).
        return


    def loadFiles(self):
        """
        _loadFiles_

        Load all files that need to be loaded.  I will do this by DAS for now to
        break the monstrous calls down into smaller chunks.
        """
        # Grab all the Dataset-Algo combinations
        dasList = self.dbsUtil.findUploadableDAS()

        if len(dasList) < 1:
            # Then there's nothing to do
            return []

        readyBlocks = []
        for dasInfo in dasList:

            dasID = dasInfo['DAS_ID']

            # Get the files
            try:
                loadedFiles = self.dbsUtil.findUploadableFilesByDAS(das = dasID)
            except WMException:
                raise
            except Exception as ex:
                msg =  "Unhandled exception while loading uploadable files for DAS.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("DAS being loaded: %s\n" % dasID)
                raise DBSUploadException(msg)

            # Sort the files and blocks by location
            fileDict = sortListByKey(input = loadedFiles, key = 'locations')

            # Now add each file
            for location in fileDict.keys():
                files = fileDict.get(location)

                if len(files) < 1:
                    # Nothing to do here
                    continue

                currentBlock = self.getBlock(files[0], location, dasID, True)
                currentBlock.setAcquisitionEra(era = dasInfo['AcquisitionEra'])
                currentBlock.setProcessingVer(procVer = dasInfo['ProcessingVer'])

                for newFile in files:
                    if newFile.get('block', 1) is not None:
                        # Then this file already has a block
                        # It should be accounted for somewhere
                        # Or loaded with the block
                        continue

                    # Check if we can put files in this block
                    if not self.isBlockOpen(newFile = newFile,
                                            block = currentBlock):
                        # Then we have to close the block and get a new one
                        currentBlock.setPendingAndCloseBlock()
                        readyBlocks.append(currentBlock)
                        currentBlock = self.getBlock(newFile = newFile,
                                                     location = location,
                                                     das = dasID)
                        currentBlock.setAcquisitionEra(era = dasInfo['AcquisitionEra'])
                        currentBlock.setProcessingVer(procVer = dasInfo['ProcessingVer'])

                    # Now deal with the file
                    currentBlock.addFile(newFile, self.datasetType, self.primaryDatasetType)
                    self.filesToUpdate.append({'filelfn': newFile['lfn'],
                                               'block': currentBlock.getName()})
                # Done with the location
                readyBlocks.append(currentBlock)

            # Should be done with the DAS once we've added all files

        # Update the blockCache with what is now ready.
        for block in readyBlocks:
            self.blockCache[block.getName()] = block
        return

    def checkTimeout(self):
        """
        _checkTimeout_

        Loop all Open blocks and mark them as Pending if they have timed out.
        """
        for block in self.blockCache.values():
            if block.status == "Open" and block.getTime() > block.getMaxBlockTime():
                block.setPendingAndCloseBlock()
                self.blockCache[block.getName()] = block
    
    def checkCompleted(self):
        """
        _checkCompleted_

        Loop over all Open blocks and mark them as Pending if their workflow has completed.
        """
        completedWorkflows = self.dbsUtil.getCompletedWorkflows()
        for block in self.blockCache.values():
            if block.status == "Open":
                if block.workflow in completedWorkflows:
                    block.setPendingAndCloseBlock()
                    self.blockCache[block.getName()] = block

    def addNewBlock(self, block):
        """
        _addNewBlock_

        Add a new block everywhere it has to go
        """
        name     = block.getName()
        location = block.getLocation()
        das      = block.das

        self.blockCache[name] = block
        if not das in self.dasCache.keys():
            self.dasCache[das] = {}
            self.dasCache[das][location] = []
        elif not location in self.dasCache[das].keys():
            self.dasCache[das][location] = []
        if name not in self.dasCache[das][location]:
            self.dasCache[das][location].append(name)

        return

    def isBlockOpen(self, newFile, block, doTime = False):
        """
        _isBlockOpen_

        Check and see if a block is full
        This will check on time, but that's disabled by default
        The plan is to do a time check after we do everything else,
        so open blocks about to time out can still get more
        files put in them.
        """

        if block.getMaxBlockFiles() is None or block.getMaxBlockNumEvents() is None or \
            block.getMaxBlockSize() is None or block.getMaxBlockTime() is None:
            return True
        if block.status != 'Open':
            # Then somebody has dumped this already
            return False
        if block.getSize() + newFile['size'] > block.getMaxBlockSize():
            return False
        if block.getNumEvents() + newFile['events'] > block.getMaxBlockNumEvents():
            return False
        if block.getNFiles() >= block.getMaxBlockFiles():
            # Then we have to dump it because this file
            # will put it over the limit.
            return False
        if block.getTime() > block.getMaxBlockTime() and doTime:
            return False

        return True

    def getBlock(self, newFile, location, das, skipOpenCheck = False):
        """
        _getBlock_

        Retrieve a block is one exists and is open.  If no open block is found
        create and return a new one.
        """
        if das in self.dasCache.keys() and location in self.dasCache[das].keys():
            for blockName in self.dasCache[das][location]:
                block = self.blockCache.get(blockName)
                if not self.isBlockOpen(newFile = newFile, block = block) and not skipOpenCheck:
                    # Block isn't open anymore.  Mark it as pending so that it gets
                    # uploaded.
                    block.setPendingAndCloseBlock()
                    self.blockCache[blockName] = block
                else:
                    return block

        # A suitable open block does not exist.  Create a new one.
        blockname = "%s#%s" % (newFile["datasetPath"], makeUUID())
        newBlock = DBSBlock(name = blockname, location = location, 
                            das = das, workflow = newFile["workflow"])
        self.addNewBlock(block = newBlock)
        return newBlock

    def inputBlocks(self):
        """
        _inputBlocks_

        Loop through all of the "active" blocks and sort them so we can act
        appropriately on them.  Everything will be sorted based on the
        following:
         Queued - Block is already being acted on by another process.  We just
          ignore it.
         Pending, not in DBSBuffer - Block that has been closed and needs to
           be injected into DBS and also written to DBSBuffer.  We'll do both.
         Pending, in DBSBuffer - Block has been closed and written to
           DBSBuffer.  We just need to inject it into DBS.
         Open, not in DBSBuffer - Newly created block that needs to be written
           to DBSBuffer.
         Open, in DBSBuffer - Newly created block that has already been
           written to DBSBuffer.  We don't have to do anything with it.
        """
        myThread = threading.currentThread()

        createInDBS = []
        createInDBSBuffer = []
        updateInDBSBuffer = []
        for block in self.blockCache.values():
            if block.getName() in self.queuedBlocks:
                # Block is already being dealt with by another process.  We'll
                # ignore it here.
                continue
            if block.status == 'Pending':
                # All pending blocks need to be injected into DBS.
                createInDBS.append(block)

                # If this is a new block it needs to be added to DBSBuffer
                # otherwise it just needs to be updated in DBSBuffer.
                if not block.inBuff:
                    createInDBSBuffer.append(block)
                else:
                    updateInDBSBuffer.append(block)
            if block.status == 'Open' and not block.inBuff:
                # New block that needs to be added to DBSBuffer.
                createInDBSBuffer.append(block)

        # Build the pool if it was closed
        if len(self.pool) == 0:
            self.setupPool()

        # First handle new and updated blocks
        try:
            myThread.transaction.begin()
            self.dbsUtil.createBlocks(blocks = createInDBSBuffer)
            self.dbsUtil.updateBlocks(blocks = updateInDBSBuffer,
                                      dbs3UploadOnly = self.dbs3UploadOnly)
            myThread.transaction.commit()
        except WMException:
            myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg =  "Unhandled exception while writing new blocks into DBSBuffer\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Blocks for DBSBuffer: %s\n" % createInDBSBuffer)
            logging.debug("Blocks for Update: %s\n" % updateInDBSBuffer)
            myThread.transaction.rollback()
            raise DBSUploadException(msg)

        # Update block status in the block cache.  Mark the blocks that we have
        # added to DBSBuffer as being in DBSBuffer.
        for block in createInDBSBuffer:
            self.blockCache.get(block.getName()).inBuff = True

        # Record new file/block associations in DBSBuffer.
        try:
            myThread.transaction.begin()
            self.dbsUtil.setBlockFiles(binds = self.filesToUpdate)
            self.filesToUpdate = []
            myThread.transaction.commit()
        except WMException:
            myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg =  "Unhandled exception while setting blocks in files.\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Files to Update: %s\n" % self.filesToUpdate)
            myThread.transaction.rollback()
            raise DBSUploadException(msg)

        # Finally upload blocks to DBS.
        for block in createInDBS:
            if len(block.files) < 1:
                # What are we doing?
                logging.debug("Skipping empty block")
                continue
            if block.getDataset() == None:
                # Then we have to fix the dataset
                dbsFile = block.files[0]
                block.setDataset(datasetName  = dbsFile['datasetPath'],
                                 primaryType  = self.primaryDatasetType,
                                 datasetType  = self.datasetType,
                                 physicsGroup = dbsFile.get('physicsGroup', None),
                                 prep_id = dbsFile.get('prep_id', None))
            logging.debug("Found block %s in blocks" % block.getName())
            block.setPhysicsGroup(group = self.physicsGroup)
            
            encodedBlock = block.convertToDBSBlock()
            logging.info("About to insert block %s" % block.getName())
            self.input.put({'name': block.getName(), 'block': encodedBlock})
            self.blockCount += 1
            if self.produceCopy:
                import json
                f = open(self.copyPath, 'w')
                f.write(json.dumps(encodedBlock))
                f.close()
            self.queuedBlocks.append(block.getName())

        # And all work is in and we're done for now
        return

    def retrieveBlocks(self):
        """
        _retrieveBlocks_

        Once blocks are in DBS, we have to retrieve them and see what's
        in them.  What we do is get everything out of the result queue,
        and then update it in DBSBuffer.

        To do this, the result queue needs to pass back the blockname
        """
        myThread = threading.currentThread()

        blocksToClose = []
        emptyCount    = 0
        while self.blockCount > 0:
            if emptyCount > self.nTries:
                
                # When timeoutWaiver is 0, raise an error.
                # Uploading data to DBS can take a long time if many files have
                # accumulated in the buffer; the first pass may run out of tries,
                # but subsequent passes should be faster.
                # timeoutWaiver is a component-level variable and only resets when
                # the component is restarted. A backlog only builds up when this
                # component is down for a long time while the other components keep
                # feeding data to dbsbuffer.
        
                if self.timeoutWaiver == 0:
                    msg = "Exceeded max number of waits while waiting for DBS to finish"
                    raise DBSUploadException(msg)
                else:
                    self.timeoutWaiver = 0
                    return
            try:
                # Get stuff out of the queue with a ridiculously
                # short wait time
                blockresult = self.result.get(timeout = self.wait)
                blocksToClose.append(blockresult)
                self.blockCount -= 1
                logging.debug("Got a block to close")
            except Queue.Empty:
                # This means the queue has no current results
                time.sleep(2)
                emptyCount += 1
                continue

        loadedBlocks = []
        for result in blocksToClose:
            # Remove from list of work being processed
            self.queuedBlocks.remove(result.get('name'))
            if result["success"] == "uploaded":
                block = self.blockCache.get(result.get('name'))
                block.status = 'InDBS'
                loadedBlocks.append(block)
            elif result["success"] == "check":
                block = result["name"]
                self.blocksToCheck.append(block)
            else:
                logging.error("Error found in multiprocess during process of block %s" % result.get('name'))
                logging.error(result['error'])
                # Continue to the next block
                # Block will remain in pending status until it is transferred

        try:
            myThread.transaction.begin()
            self.dbsUtil.updateBlocks(loadedBlocks, self.dbs3UploadOnly)
            if not self.dbs3UploadOnly:
                self.dbsUtil.updateFileStatus(loadedBlocks, "InDBS")
            myThread.transaction.commit()
        except WMException:
            myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg =  "Unhandled exception while finished closed blocks in DBSBuffer\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Blocks for Update: %s\n" % loadedBlocks)
            myThread.transaction.rollback()
            raise DBSUploadException(msg)

        for block in loadedBlocks:
            # Clean things up
            name     = block.getName()
            location = block.getLocation()
            das      = block.das
            self.dasCache[das][location].remove(name)
            del self.blockCache[name]

        # Clean up the pool so we don't have stuff waiting around
        if len(self.pool) > 0:
            self.close()

        # And we're done
        return

    def checkBlocks(self):
        """
        _checkBlocks_

        Check with DBS3 if the blocks marked as check are
        uploaded or not.
        """
        myThread = threading.currentThread()
        blocksUploaded = []

        # See if there is anything to check
        for block in self.blocksToCheck:
            logging.debug("Checking block existence: %s" % block)
            # Check in DBS if the block was really inserted
            try:
                result = self.dbsApi.listBlocks(block_name = block)
                for blockResult in result:
                    if blockResult['block_name'] == block:
                        loadedBlock = self.blockCache.get(block)
                        loadedBlock.status = 'InDBS'
                        blocksUploaded.append(loadedBlock)
                        break
            except Exception as ex:
                exString = str(ex)
                msg =  "Error trying to check block %s through DBS.\n" % block
                msg += exString
                logging.error(msg)
                logging.error(str(traceback.format_exc()))
        # Update the status of those blocks that were truly inserted
        try:
            myThread.transaction.begin()
            self.dbsUtil.updateBlocks(blocksUploaded, self.dbs3UploadOnly)
            if not self.dbs3UploadOnly:
                self.dbsUtil.updateFileStatus(blocksUploaded, "InDBS")
            myThread.transaction.commit()
        except WMException:
            myThread.transaction.rollback()
            raise
        except Exception as ex:
            msg =  "Unhandled exception while finished closed blocks in DBSBuffer\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Blocks for Update: %s\n" % blocksUploaded)
            myThread.transaction.rollback()
            raise DBSUploadException(msg)

        for block in blocksUploaded:
            # Clean things up
            name     = block.getName()
            location = block.getLocation()
            das      = block.das
            self.dasCache[das][location].remove(name)
            del self.blockCache[name]

        # Clean the check list
        self.blocksToCheck = []

        # We're done
        return
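For orientation, a minimal sketch of the queue protocol the poller above relies on: inputBlocks puts items of the form {'name': ..., 'block': ...} on the input queue, and retrieveBlocks expects results of the form {'name': ..., 'success': 'uploaded'|'check'|..., 'error': ...}. This is not the real WMCore uploadWorker, only an illustration of the assumed contract:

def sketchUploadWorker(workInput, workResult, dbsUrl):
    """
    Illustrative stand-in for the worker process started by setupPool.
    It consumes work items until it sees 'STOP' and reports back
    dictionaries with the keys retrieveBlocks expects.
    """
    from dbs.apis.dbsClient import DbsApi
    dbsApi = DbsApi(url=dbsUrl)

    while True:
        work = workInput.get()
        if work == 'STOP':
            break
        name = work['name']
        try:
            dbsApi.insertBulkBlock(blockDump=work['block'])
            workResult.put({'name': name, 'success': 'uploaded', 'error': None})
        except Exception as ex:
            # The real worker also distinguishes blocks that may already exist
            # in DBS (reported back with success='check'); here every failure
            # is simply reported as an error.
            workResult.put({'name': name, 'success': 'error', 'error': str(ex)})
    return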
Example #30
0
class DBSUploadPoller(BaseWorkerThread):
    """
    Handles poll-based DBSUpload

    """
    def __init__(self, config):
        """
        Initialise class members
        """
        logging.info("Running __init__ for DBS3 Uploader")
        BaseWorkerThread.__init__(self)
        self.config = config

        # This is slightly dangerous, but DBSUpload depends
        # on DBSInterface anyway
        self.dbsUrl = self.config.DBS3Upload.dbsUrl

        # Tier0 Agent don't need this
        if hasattr(self.config, "Tier0Feeder"):
            self.wmstatsServerSvc = None
        else:
            wmstatsSvcURL = self.config.General.centralWMStatsURL.replace(
                "couchdb/wmstats", "wmstatsserver")
            self.wmstatsServerSvc = WMStatsServer(wmstatsSvcURL)

        self.dbsUtil = DBSBufferUtil()

        myThread = threading.currentThread()
        daoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                logger=myThread.logger,
                                dbinterface=myThread.dbi)
        self.updateBlocksDAO = daoFactory(classname="UpdateBlocks")
        self.updateFilesDAO = daoFactory(classname="UpdateFiles")
        self.createBlocksDAO = daoFactory(classname="CreateBlocks")
        self.setBlockFilesDAO = daoFactory(classname="SetBlockFiles")

        self.pool = []
        self.blocksToCheck = []
        self.workInput = None
        self.workResult = None
        self.nProc = getattr(self.config.DBS3Upload, 'nProcesses', 4)
        self.wait = getattr(self.config.DBS3Upload, 'dbsWaitTime', 2)
        self.nTries = getattr(self.config.DBS3Upload, 'dbsNTries', 300)
        self.physicsGroup = getattr(self.config.DBS3Upload, "physicsGroup",
                                    "NoGroup")
        self.datasetType = getattr(self.config.DBS3Upload, "datasetType",
                                   "PRODUCTION")
        self.primaryDatasetType = getattr(self.config.DBS3Upload,
                                          "primaryDatasetType", "mc")
        self.blockCount = 0
        self.dbsApi = DbsApi(url=self.dbsUrl)

        # List of blocks currently in processing
        self.queuedBlocks = []

        # Set up the pool of worker processes
        self.setupPool()

        # Setting up any cache objects
        self.blockCache = {}

        self.filesToUpdate = []

        self.produceCopy = getattr(self.config.DBS3Upload, 'dumpBlock', False)

        self.copyPath = os.path.join(
            getattr(self.config.DBS3Upload, 'componentDir', '/data/srv/'),
            'dbsuploader_block.json')

        self.timeoutWaiver = 1

        self.datasetParentageCache = {}

        return

    def setupPool(self):
        """
        _setupPool_

        Set up the processing pool for work
        """
        if self.pool:
            # Then something already exists.  Continue
            return

        self.workInput = multiprocessing.Queue()
        self.workResult = multiprocessing.Queue()

        # Starting up the pool:
        for _ in range(self.nProc):
            p = multiprocessing.Process(target=uploadWorker,
                                        args=(self.workInput, self.workResult,
                                              self.dbsUrl))
            p.start()
            self.pool.append(p)

        return

    def __del__(self):
        """
        __del__

        Trigger a close of connections if necessary
        """
        self.close()
        return

    def close(self):
        """
        _close_

        Kill all connections and terminate
        """
        terminate = False
        for _ in self.pool:
            try:
                self.workInput.put('STOP')
            except Exception as ex:
                # Something very strange happens here
                # It's like it raises a blank exception
                # Upon being told to return
                msg = "Hit some exception in deletion\n"
                msg += str(ex)
                logging.debug(msg)
                terminate = True
        try:
            self.workInput.close()
            self.workResult.close()
        except Exception:
            # What are you going to do?
            pass
        for proc in self.pool:
            if terminate:
                proc.terminate()
            else:
                proc.join()
        self.pool = []
        self.workInput = None
        self.workResult = None
        return

    def terminate(self, parameters):
        """
        Do one more pass, then terminate

        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(parameters)

    @timeFunction
    def algorithm(self, parameters=None):
        """
        _algorithm_

        First, check blocks that may be already uploaded
        Then, load blocks
        Then, load files
        Then, move files into blocks
        Then add new blocks in DBSBuffer
        Then add blocks to DBS
        Then mark blocks as done in DBSBuffer
        """
        logging.info("Starting the DBSUpload Polling Cycle")
        # refreshing parentageCache every cycle
        if self.updateDatasetParentageCache() is False:
            return

        logging.debug("Dataset parentage map: %s", self.datasetParentageCache)
        try:
            self.checkBlocks()
            self.loadBlocks()
            self.loadFiles()
            self.checkBlockCompletion()
            self.inputBlocks()
            self.retrieveBlocks()
        except WMException:
            raise
        except Exception as ex:
            msg = "Unhandled Exception in DBSUploadPoller! Error: %s" % str(ex)
            logging.exception(msg)
            raise DBSUploadException(msg)

    def updateDatasetParentageCache(self):
        """
        Return True to indicate it successfully fetched the parentage
        map. If there was an exception, return False
        """
        myThread = threading.currentThread()

        success = True
        if not self.wmstatsServerSvc:
            self.datasetParentageCache = {}
            return success

        try:
            self.datasetParentageCache = self.wmstatsServerSvc.getChildParentDatasetMap()
        except Exception as ex:
            excReason = getattr(ex, 'reason', '')
            errorMsg = 'Failed to fetch parentage map from WMStats, skipping this cycle. '
            errorMsg += 'Exception: {}. Reason: {}. Error: {}. '.format(
                type(ex).__name__, excReason, str(ex))
            if isPassiveError(ex):
                logging.warning(errorMsg)
            else:
                errorMsg += 'Hit a terminal exception in DBSUploadPoller.'
                raise DBSUploadException(errorMsg)
            myThread.logdbClient.post("DBS3Upload_parentMap", errorMsg,
                                      "warning")
            success = False
        else:
            myThread.logdbClient.delete("DBS3Upload_parentMap",
                                        "warning",
                                        this_thread=True)

        return success

    def loadBlocks(self):
        """
        _loadBlocks_

        Find all blocks; make sure they're in the cache
        """
        openBlocks = self.dbsUtil.findOpenBlocks()
        logging.info("Found %d open blocks.", len(openBlocks))
        logging.debug("These are the openblocks: %s", openBlocks)

        # Load them if we don't have them
        blocksToLoad = []
        for block in openBlocks:
            if block['blockname'] not in self.blockCache:
                blocksToLoad.append(block['blockname'])

        # Now load the blocks
        try:
            loadedBlocks = self.dbsUtil.loadBlocks(blocksToLoad)
            logging.info("Loaded %d blocks.", len(loadedBlocks))
        except WMException:
            raise
        except Exception as ex:
            msg = "Unhandled exception while loading blocks.\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Blocks to load: %s\n", blocksToLoad)
            raise DBSUploadException(msg)

        for blockInfo in loadedBlocks:
            block = DBSBufferBlock(name=blockInfo['block_name'],
                                   location=blockInfo['origin_site_name'],
                                   datasetpath=blockInfo['datasetpath'])

            parent = self.datasetParentageCache.get(blockInfo['datasetpath'])
            if parent:
                block.addDatasetParent(parent)
                logging.debug(
                    "Load block: Child dataset %s, Parent dataset %s",
                    blockInfo['datasetpath'], parent)
            block.FillFromDBSBuffer(blockInfo)
            blockname = block.getName()

            # Now we have to load files...
            try:
                files = self.dbsUtil.loadFilesByBlock(blockname=blockname)
                logging.info("Have %i files for block %s", len(files),
                             blockname)
            except WMException:
                raise
            except Exception as ex:
                msg = "Unhandled exception while loading files for existing blocks.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Blocks being loaded: %s\n", blockname)
                raise DBSUploadException(msg)

            # Add the loaded files to the block
            for f in files:
                block.addFile(f, self.datasetType, self.primaryDatasetType)

            # Add to the cache
            self.blockCache[blockInfo['block_name']] = block

        return

    def loadFiles(self):
        """
        _loadFiles_

        Load all files that need to be loaded.  I will do this by DatasetPath
        to break the monstrous calls down into smaller chunks.
        """
        dspList = self.dbsUtil.findUploadableDAS()

        readyBlocks = []
        for dspInfo in dspList:

            datasetpath = dspInfo['DatasetPath']

            # Get the files
            try:
                loadedFiles = self.dbsUtil.findUploadableFilesByDAS(
                    datasetpath=datasetpath)
            except WMException:
                raise
            except Exception as ex:
                msg = "Unhandled exception while loading uploadable files for DatasetPath.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("DatasetPath being loaded: %s\n", datasetpath)
                raise DBSUploadException(msg)

            # Sort the files and blocks by location
            fileDict = sortListByKey(loadedFiles, 'locations')

            # Now add each file
            for location in fileDict.keys():

                files = fileDict.get(location)

                if files:
                    currentBlock = self.getBlock(files[0], location, True)
                    currentBlock.setAcquisitionEra(
                        era=dspInfo['AcquisitionEra'])
                    currentBlock.setProcessingVer(
                        procVer=dspInfo['ProcessingVer'])

                    for newFile in files:

                        if newFile.get('block', 1) is not None:
                            # Then this file already has a block
                            # It should be accounted for somewhere
                            # Or loaded with the block
                            continue

                        # Check if we can put files in this block
                        if not self.isBlockOpen(newFile=newFile,
                                                block=currentBlock):
                            # Then we have to close the block and get a new one
                            currentBlock.setPendingAndCloseBlock()
                            readyBlocks.append(currentBlock)
                            currentBlock = self.getBlock(newFile=newFile,
                                                         location=location)
                            currentBlock.setAcquisitionEra(
                                era=dspInfo['AcquisitionEra'])
                            currentBlock.setProcessingVer(
                                procVer=dspInfo['ProcessingVer'])

                        # Now deal with the file
                        currentBlock.addFile(newFile, self.datasetType,
                                             self.primaryDatasetType)
                        self.filesToUpdate.append({
                            'filelfn':
                            newFile['lfn'],
                            'block':
                            currentBlock.getName()
                        })
                    # Done with the location
                    readyBlocks.append(currentBlock)

        for block in readyBlocks:
            self.blockCache[block.getName()] = block

        return

    def checkBlockCompletion(self):
        """
        _checkBlockCompletion_

        Mark Open blocks as Pending if they have timed out or their workflows have completed
        """
        completedWorkflows = self.dbsUtil.getCompletedWorkflows()
        for block in self.blockCache.values():
            if block.status == "Open":
                if (block.getTime() > block.getMaxBlockTime()) or any(
                        key in completedWorkflows for key in block.workflows):
                    block.setPendingAndCloseBlock()

        return

    def isBlockOpen(self, newFile, block, doTime=False):
        """
        _isBlockOpen_

        Check and see if a block is full
        This will check on time, but that's disabled by default
        The plan is to do a time check after we do everything else,
        so open blocks about to time out can still get more
        files put in them.
        """

        if block.getMaxBlockFiles() is None or block.getMaxBlockNumEvents() is None or \
                block.getMaxBlockSize() is None or block.getMaxBlockTime() is None:
            return True
        if block.status != 'Open':
            # Then somebody has dumped this already
            return False
        if block.getSize() + newFile['size'] > block.getMaxBlockSize():
            return False
        if block.getNumEvents() + newFile['events'] > block.getMaxBlockNumEvents():
            return False
        if block.getNFiles() >= block.getMaxBlockFiles():
            # Then we have to dump it because this file
            # will put it over the limit.
            return False
        if block.getTime() > block.getMaxBlockTime() and doTime:
            return False

        return True

    def getBlock(self, newFile, location, skipOpenCheck=False):
        """
        _getBlock_

        Retrieve a block is one exists with matching datasetpath/location and is open.
        If no such block is found create and return a new one.
        """
        datasetpath = newFile["datasetPath"]

        for block in self.blockCache.values():
            if datasetpath == block.getDatasetPath() and location == block.getLocation():
                if not self.isBlockOpen(newFile=newFile,
                                        block=block) and not skipOpenCheck:
                    # Block isn't open anymore.  Mark it as pending so that it gets uploaded.
                    block.setPendingAndCloseBlock()
                else:
                    return block

        # A suitable open block does not exist.  Create a new one.
        blockname = "%s#%s" % (datasetpath, makeUUID())
        newBlock = DBSBufferBlock(name=blockname,
                                  location=location,
                                  datasetpath=datasetpath)

        parent = self.datasetParentageCache.get(datasetpath)
        if parent:
            newBlock.addDatasetParent(parent)
            logging.debug("Get block: Child dataset %s, Parent dataset %s",
                          datasetpath, parent)

        self.blockCache[blockname] = newBlock
        return newBlock

    def inputBlocks(self):
        """
        _inputBlocks_

        Loop through all of the "active" blocks and sort them so we can act
        appropriately on them.  Everything will be sorted based on the
        following:
         Queued - Block is already being acted on by another process.  We just
          ignore it.
         Pending, not in DBSBuffer - Block that has been closed and needs to
           be injected into DBS and also written to DBSBuffer.  We'll do both.
         Pending, in DBSBuffer - Block has been closed and written to
           DBSBuffer.  We just need to inject it into DBS.
         Open, not in DBSBuffer - Newly created block that needs to be written
           to DBSBuffer.
         Open, in DBSBuffer - Newly created block that has already been
           written to DBSBuffer.  We don't have to do anything with it.
        """
        if not self.blockCache:
            return

        myThread = threading.currentThread()

        createInDBS = []
        createInDBSBuffer = []
        updateInDBSBuffer = []
        for block in self.blockCache.values():
            if block.getName() in self.queuedBlocks:
                # Block is already being dealt with by another process.  We'll
                # ignore it here.
                continue
            if block.status == 'Pending':
                # All pending blocks need to be injected into DBS.
                createInDBS.append(block)

                # If this is a new block it needs to be added to DBSBuffer
                # otherwise it just needs to be updated in DBSBuffer.
                if not block.inBuff:
                    createInDBSBuffer.append(block)
                else:
                    updateInDBSBuffer.append(block)
            if block.status == 'Open' and not block.inBuff:
                # New block that needs to be added to DBSBuffer.
                createInDBSBuffer.append(block)

        # First handle new and updated blocks
        if createInDBSBuffer or updateInDBSBuffer:
            try:
                myThread.transaction.begin()
                if createInDBSBuffer:
                    self.createBlocksDAO.execute(
                        blocks=createInDBSBuffer,
                        conn=myThread.transaction.conn,
                        transaction=True)
                if updateInDBSBuffer:
                    self.updateBlocksDAO.execute(
                        blocks=updateInDBSBuffer,
                        conn=myThread.transaction.conn,
                        transaction=True)
            except WMException:
                myThread.transaction.rollback()
                raise
            except Exception as ex:
                myThread.transaction.rollback()
                msg = "Unhandled exception while writing new blocks into DBSBuffer\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Blocks for DBSBuffer: %s\n", createInDBSBuffer)
                logging.debug("Blocks for Update: %s\n", updateInDBSBuffer)
                raise DBSUploadException(msg)
            else:
                myThread.transaction.commit()

        # Update block status in the block cache.  Mark the blocks that we have
        # added to DBSBuffer as being in DBSBuffer.
        for block in createInDBSBuffer:
            self.blockCache.get(block.getName()).inBuff = True

        # Record new file/block associations in DBSBuffer.
        if self.filesToUpdate:
            try:
                myThread.transaction.begin()
                self.setBlockFilesDAO.execute(binds=self.filesToUpdate,
                                              conn=myThread.transaction.conn,
                                              transaction=True)
                self.filesToUpdate = []
            except WMException:
                myThread.transaction.rollback()
                raise
            except Exception as ex:
                myThread.transaction.rollback()
                msg = "Unhandled exception while setting blocks in files.\n"
                msg += str(ex)
                logging.error(msg)
                logging.debug("Files to Update: %s\n", self.filesToUpdate)
                raise DBSUploadException(msg)
            else:
                myThread.transaction.commit()

        if not createInDBS:
            # then there is nothing else to do
            return

        # Build the pool if it was closed
        if not self.pool:
            self.setupPool()

        # Finally upload blocks to DBS.
        for block in createInDBS:
            if not block.files:
                # Nothing to upload for an empty block, so skip it.
                logging.debug("Skipping empty block")
                continue
            if block.getDataset() is None:
                # Then we have to fix the dataset
                dbsFile = block.files[0]
                block.setDataset(datasetName=dbsFile['datasetPath'],
                                 primaryType=self.primaryDatasetType,
                                 datasetType=self.datasetType,
                                 physicsGroup=dbsFile.get(
                                     'physicsGroup', None),
                                 prep_id=dbsFile.get('prep_id', None))
            logging.debug("Found block %s in blocks", block.getName())
            block.setPhysicsGroup(group=self.physicsGroup)

            encodedBlock = block.convertToDBSBlock()
            logging.info("About to insert block %s", block.getName())
            self.workInput.put({
                'name': block.getName(),
                'block': encodedBlock
            })
            self.blockCount += 1
            if self.produceCopy:
                with open(self.copyPath, 'w') as jo:
                    json.dump(encodedBlock, jo, indent=2)
            self.queuedBlocks.append(block.getName())

        # And all work is in and we're done for now
        return

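A stripped-down sketch of the sorting described in the inputBlocks docstring above, using a hypothetical namedtuple in place of the real DBSBufferBlock objects (the names and fields below are assumptions for illustration only):

from collections import namedtuple

# Hypothetical minimal block record, only for illustrating the bucketing logic
FakeBlock = namedtuple("FakeBlock", ["name", "status", "inBuff"])

def sortBlocks(blocks, queuedNames):
    createInDBS, createInDBSBuffer, updateInDBSBuffer = [], [], []
    for block in blocks:
        if block.name in queuedNames:
            continue                                # Queued: another process owns it
        if block.status == 'Pending':
            createInDBS.append(block)               # every pending block goes to DBS
            if not block.inBuff:
                createInDBSBuffer.append(block)     # new block: also insert into DBSBuffer
            else:
                updateInDBSBuffer.append(block)     # known block: just update DBSBuffer
        elif block.status == 'Open' and not block.inBuff:
            createInDBSBuffer.append(block)         # new open block: record it, nothing else
    return createInDBS, createInDBSBuffer, updateInDBSBuffer

# sortBlocks([FakeBlock("/A/B/C#1", "Pending", False)], queuedNames=set())
# -> one block in createInDBS and one in createInDBSBuffer, nothing to update
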
    def retrieveBlocks(self):
        """
        _retrieveBlocks_

        Once blocks are in DBS, we have to retrieve them and see what's
        in them.  What we do is get everything out of the result queue,
        and then update it in DBSBuffer.

        To do this, the result queue needs to pass back the blockname
        """
        myThread = threading.currentThread()

        blocksToClose = []
        emptyCount = 0
        while self.blockCount > 0:
            if emptyCount > self.nTries:

                # Raise an error only when timeoutWaiver is 0.
                # Uploading to DBS can take a long time if many files have
                # accumulated in the buffer; the first attempt may be slow,
                # but later attempts should be faster.
                # timeoutWaiver is a component-level variable and is only reset
                # when the component restarts.  A backlog only builds up when
                # this component has been down for a long time while the other
                # components keep feeding data into DBSBuffer.

                if self.timeoutWaiver == 0:
                    msg = "Exceeded max number of waits while waiting for DBS to finish"
                    raise DBSUploadException(msg)
                else:
                    self.timeoutWaiver = 0
                    return
            try:
                # Get stuff out of the queue with a ridiculously
                # short wait time
                blockresult = self.workResult.get(timeout=self.wait)
                blocksToClose.append(blockresult)
                self.blockCount -= 1
                logging.debug("Got a block to close")
            except queue.Empty:
                # This means the queue has no current results
                time.sleep(2)
                emptyCount += 1
                continue

        loadedBlocks = []
        for result in blocksToClose:
            # Remove from list of work being processed
            self.queuedBlocks.remove(result.get('name'))
            if result["success"] == "uploaded":
                block = self.blockCache.get(result.get('name'))
                block.status = 'InDBS'
                loadedBlocks.append(block)
            elif result["success"] == "check":
                block = result["name"]
                self.blocksToCheck.append(block)
            else:
                logging.error(
                    "Error found in multiprocess during process of block %s",
                    result.get('name'))
                logging.error(result['error'])
                # Continue to the next block
                # Block will remain in pending status until it is transferred

        if loadedBlocks:
            try:
                myThread.transaction.begin()
                self.updateFilesDAO.execute(blocks=loadedBlocks,
                                            status="InDBS",
                                            conn=myThread.transaction.conn,
                                            transaction=True)
                self.updateBlocksDAO.execute(blocks=loadedBlocks,
                                             conn=myThread.transaction.conn,
                                             transaction=True)
            except Exception as ex:
                myThread.transaction.rollback()
                # possible deadlock with PhEDExInjector, retry once after 10s
                logging.warning(
                    "Oracle exception, possible deadlock due to race condition, retry after 10s sleep"
                )
                time.sleep(10)
                try:
                    myThread.transaction.begin()
                    self.updateFilesDAO.execute(blocks=loadedBlocks,
                                                status="InDBS",
                                                conn=myThread.transaction.conn,
                                                transaction=True)
                    self.updateBlocksDAO.execute(
                        blocks=loadedBlocks,
                        conn=myThread.transaction.conn,
                        transaction=True)
                except Exception as ex:
                    myThread.transaction.rollback()
                    msg = "Unhandled exception while updating closed blocks in DBSBuffer\n"
                    msg += str(ex)
                    logging.error(msg)
                    logging.debug("Blocks for Update: %s\n", loadedBlocks)
                    raise DBSUploadException(msg)
                else:
                    myThread.transaction.commit()

            else:
                myThread.transaction.commit()

        for block in loadedBlocks:
            # Clean things up
            name = block.getName()
            del self.blockCache[name]

        # Clean up the pool so we don't have stuff waiting around
        if self.pool:
            self.close()

        # And we're done
        return

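The drain loop in retrieveBlocks boils down to pulling results off a queue with a short timeout and giving up after a bounded number of consecutive empty reads. A self-contained sketch of that pattern (the parameter names and defaults here are assumptions, not the component's real configuration):

import queue
import time

def drain_results(work_result, expected, wait=0.1, max_empty_tries=3):
    # Pull up to `expected` results off the queue, tolerating at most
    # max_empty_tries consecutive empty reads before giving up.
    results, empty = [], 0
    while expected > 0:
        if empty > max_empty_tries:
            break
        try:
            results.append(work_result.get(timeout=wait))
            expected -= 1
        except queue.Empty:
            time.sleep(2)
            empty += 1
    return results
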
    def checkBlocks(self):
        """
        _checkBlocks_

        Check with DBS3 if the blocks marked as check are
        uploaded or not.
        """
        myThread = threading.currentThread()

        blocksUploaded = []

        # See if there is anything to check
        for block in self.blocksToCheck:
            logging.debug("Checking block existence: %s", block)
            # Check in DBS if the block was really inserted
            try:
                result = self.dbsApi.listBlocks(block_name=block)
                # it is an empty list if block cannot be found
                if result:
                    loadedBlock = self.blockCache.get(block)
                    loadedBlock.status = 'InDBS'
                    blocksUploaded.append(loadedBlock)
            except Exception as ex:
                msg = "Error trying to check block %s through DBS. Error: %s" % (
                    block, str(ex))
                logging.exception(msg)

        # Update the status of those blocks that were truly inserted
        if blocksUploaded:
            try:
                myThread.transaction.begin()
                self.updateBlocksDAO.execute(blocks=blocksUploaded,
                                             conn=myThread.transaction.conn,
                                             transaction=True)
                self.updateFilesDAO.execute(blocks=blocksUploaded,
                                            status="InDBS",
                                            conn=myThread.transaction.conn,
                                            transaction=True)
            except WMException:
                myThread.transaction.rollback()
                raise
            except Exception as ex:
                myThread.transaction.rollback()
                msg = "Unhandled exception while updating closed blocks in DBSBuffer\n"
                msg += str(ex)
                logging.exception(msg)
                logging.debug("Blocks for Update: %s\n", blocksUploaded)
                raise DBSUploadException(msg)
            else:
                myThread.transaction.commit()

        for block in blocksUploaded:
            # Clean things up
            name = block.getName()
            del self.blockCache[name]

        # Clean the check list
        self.blocksToCheck = []

        # We're done
        return
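
The existence check above relies on listBlocks returning an empty list for unknown blocks. A minimal standalone sketch of the same check (the DBS reader URL is an assumed example, not taken from this code):

from dbs.apis.dbsClient import DbsApi

def blockExists(block_name,
                dbs_url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader/'):
    # listBlocks returns an empty list when the block is not known to DBS
    api = DbsApi(url=dbs_url)
    return bool(api.listBlocks(block_name=block_name))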
Example #31
0
        writeUrl = url + 'DBSWriter'

    readApi = DbsApi(url=readUrl)
    writeApi = DbsApi(url=writeUrl)

    dataset = options.dataset
    # ensure new_location is always defined, even when the option is not given
    new_location = options.new_location if options.new_location else None

    ###sanitize input
    # dataset name
    Lexicon.dataset(dataset)

    # PNN
    if new_location:
        Lexicon.cmsname(new_location)

    # process dataset by blocks

    blockDicts = readApi.listBlocks(dataset=dataset, detail=True)
    for block in blockDicts:
        blName = block['block_name']
        location = block['origin_site_name']
        logging.debug('block %s at location: %s' % (blName, location))
        if new_location:
            writeApi.updateBlockSiteName(block_name=blName,
                                         origin_site_name=new_location)
            logging.debug('location set to %s' % (new_location))

    logging.info("Done")
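
Before rewriting origin sites as in the example above, it can be useful to dry-run the same listBlocks call and simply tally where the blocks currently claim to live. A small sketch along those lines (the dataset and URL in the usage comment are illustrative assumptions):

from collections import Counter

from dbs.apis.dbsClient import DbsApi

def summarizeBlockSites(dataset, read_url):
    # Count how many blocks of `dataset` report each origin_site_name,
    # e.g. as a sanity check before calling updateBlockSiteName.
    api = DbsApi(url=read_url)
    counts = Counter(blk['origin_site_name']
                     for blk in api.listBlocks(dataset=dataset, detail=True))
    return dict(counts)

# summarizeBlockSites('/EGamma/Run2018A-v1/RAW',
#                     'https://cmsweb.cern.ch/dbs/prod/global/DBSReader/')
# might return something like {'T2_CH_CERN': 12, 'UNKNOWN': 1}  (illustrative only)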