Exemplo n.º 1
0
def batchor(url):
    UC = unifiedConfiguration()
    SI = global_SI()
    CI = campaignInfo()
    BI = batchInfo()
    ## get all workflows in assignment-approved with SubRequestType = relval
    all_wfs = []
    for user in UC.get("user_relval"):
        all_wfs.extend(
            getWorkflows(url,
                         'assignment-approved',
                         details=True,
                         user=user,
                         rtype='TaskChain'))

    wfs = filter(
        lambda r: r['SubRequestType'] == 'RelVal'
        if 'SubRequestType' in r else False, all_wfs)
    ## need a special treatment for those
    hi_wfs = filter(
        lambda r: r['SubRequestType'] == 'HIRelVal'
        if 'SubRequestType' in r else False, all_wfs)

    by_campaign = defaultdict(set)
    by_hi_campaign = defaultdict(set)
    for wf in wfs:
        print "Relval:", wf['RequestName'], wf['Campaign']
        by_campaign[wf['Campaign']].add(wf['PrepID'])

    for wf in hi_wfs:
        print "HI Relval:", wf['RequestName'], wf['Campaign']
        by_hi_campaign[wf['Campaign']].add(wf['PrepID'])

    default_setup = {
        "go": True,
        "parameters": {
            "SiteWhitelist": ["T1_US_FNAL"],
            "MergedLFNBase": "/store/relval",
            "Team": "relval",
            "NonCustodialGroup": "RelVal"
        },
        "custodial_override": "notape",
        "phedex_group": "RelVal",
        "lumisize": -1,
        "fractionpass": 0.0,
        "maxcopies": 1
    }
    default_hi_setup = copy.deepcopy(default_setup)

    add_on = {}
    relval_routing = UC.get('relval_routing')

    def pick_one_site(p):
        ## modify the parameters on the spot to have only one site
        if "parameters" in p and "SiteWhitelist" in p["parameters"] and len(
                p["parameters"]["SiteWhitelist"]) > 1:
            choose_from = list(
                set(p["parameters"]["SiteWhitelist"]) & set(SI.sites_ready))
            picked = random.choice(choose_from)
            print "picked", picked, "from", choose_from
            p["parameters"]["SiteWhitelist"] = [picked]

    batches = BI.all()
    for campaign in by_campaign:
        if campaign in batches: continue
        ## get a bunch of information
        setup = copy.deepcopy(default_setup)

        for key in relval_routing:
            if key in campaign:
                ## augment with the routing information
                augment_with = relval_routing[key]
                print "Modifying the batch configuration because of keyword", key
                print "with", augment_with
                setup = deep_update(setup, augment_with)

        pick_one_site(setup)
        add_on[campaign] = setup
        sendLog('batchor',
                'Adding the relval campaigns %s with parameters \n%s' %
                (campaign, json.dumps(setup, indent=2)),
                level='critical')
        BI.update(campaign, by_campaign[campaign])

    for campaign in by_hi_campaign:
        if campaign in batches: continue
        ## get a bunch of information
        setup = copy.deepcopy(default_hi_setup)
        possible_sites = set(["T1_DE_KIT", "T1_FR_CCIN2P3"])
        hi_site = random.choice(list(possible_sites))
        setup["parameters"]["SiteWhitelist"] = [hi_site]

        pick_one_site(setup)
        add_on[campaign] = setup
        sendLog('batchor',
                'Adding the HI relval campaigns %s with parameters \n%s' %
                (campaign, json.dumps(setup, indent=2)),
                level='critical')
        BI.update(campaign, by_hi_campaign[campaign])

    ## only new campaigns in announcement
    for new_campaign in list(
            set(add_on.keys()) - set(CI.all(c_type='relval'))):
        ## this is new, and can be announced as such
        print new_campaign, "is new stuff"
        subject = "Request of RelVal samples batch %s" % new_campaign
        text = """Dear all, 
A new batch of relval workflows was requested.

Batch ID:

%s

Details of the workflows:

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

This is an automated message""" % (
            new_campaign,
            new_campaign,
        )

        print subject
        print text
        to = ['*****@*****.**']
        sendEmail(subject, text, destination=to)
        sendLog('batchor', text, level='critical')

    ## go through all existing campaigns and remove the ones not in use anymore ?
    for old_campaign in CI.all(c_type='relval'):
        all_in_batch = getWorkflowByCampaign(url, old_campaign, details=True)
        if not all_in_batch: continue
        is_batch_done = all(
            map(
                lambda s: not s in [
                    'completed', 'force-complete', 'running-open',
                    'running-closed', 'acquired', 'assigned',
                    'assignment-approved'
                ], [wf['RequestStatus'] for wf in all_in_batch]))
        ## check all statuses
        if is_batch_done:
            #print "batch",old_campaign,"can be closed or removed if necessary"
            #campaigns[old_campaign]['go'] = False ## disable
            CI.pop(old_campaign)  ## or just drop it all together ?
            BI.pop(old_campaign)
            print "batch", old_campaign, " configuration was removed"

    ## merge all anyways
    CI.update(add_on, c_type='relval')
Exemplo n.º 2
0
def assignor(url ,specific = None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go: return

    n_assigned = 0
    n_stalled = 0

    wfos=[]
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered','staging'])
    if specific:
        fetch_from.extend(['considered-tried'])
    
    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from",fetch_from

    for status in fetch_from:
        wfos.extend(session.query(Workflow).filter(Workflow.status==status).all())

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read())
    aaa_mapping = json.loads(open('%s/equalizor.json'%monitor_pub_dir).read())['mapping']

    all_stuck = set()
    all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() ))
    all_stuck.update( getAllStuckDataset()) 

    max_per_round = UC.get('max_per_round').get('assignor',None)
    max_cpuh_block = UC.get('max_cpuh_block')
    random.shuffle( wfos )
    for wfo in wfos:
        
        if options.limit and (n_stalled+n_assigned)>options.limit:
            break

        if max_per_round and (n_stalled+n_assigned)>max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo( url, wfo.name)

        if options.priority and int(wfh.request['RequestPriority']) < options.priority:
            continue

        options_text=""
        if options.early: options_text+=", early option is ON"
        if options.partial: 
            options_text+=", partial option is ON"
            options_text+=", good fraction is %.2f"%options.good_enough
        


        wfh.sendLog('assignor',"%s to be assigned%s"%(wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))
        
        
        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',"%s are stuck input"%(','.join( is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = False
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update( CI.campaigns[campaign] )

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update( CI.campaigns[campaign]['secondaries'] )
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]:
                banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers))
                if banned_tier:
                    no_go=True
                    wfh.sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)))
                    sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)), level='critical')

        if secondary and check_secondary:
            if (set(secondary)&set(allowed_secondary.keys())!=set(secondary)):
                wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))))
                sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))), level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:# and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update( allowed_secondary[sec] )

        if no_go:
            n_stalled+=1 
            continue


            
        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] !='assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name,wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version=wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor',"cannot decide on version number")
                n_stalled+=1
                wfo.status = 'trouble'
                session.commit()
                continue


        original_sites_allowed = copy.deepcopy( sites_allowed )
        wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', [])

        blocks = []
        if 'BlockWhitelist' in wfh.request:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) ))

        wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed))
        secondary_locations=None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns:
            assign_parameters.update( CI.campaigns[wfh.request['Campaign']] )

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog('assignor',"Overiding partial copy assignment to %.2f fraction"% do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))
            
        do_partial = options.good_enough if options.partial else do_partial


        for sec in list(secondary):
            if override_sec_location: 
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass
            if secondary_aaa:
                #just continue without checking
                continue

            presence = getDatasetPresence( url, sec )
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations==None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]
            
        wfh.sendLog('assignor',"From secondary requirement, now Allowed%s"%sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy( sites_allowed )
        sites_with_data = copy.deepcopy( sites_allowed )
        sites_with_any_data = copy.deepcopy( sites_allowed )
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc' ## by default
        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor",dataset_endpoints[prim]
                endpoints.update( dataset_endpoints[prim] )
            set_lfn = getLFNbase( prim )
            presence = getDatasetPresence( url, prim , only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] =  getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks)
            #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]]
            sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))))
            if primary_locations==None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys() ))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites=[]
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            elif primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            else:
                opportunistic_sites = []
            wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites))
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite for osite in opportunistic_sites if osite in SI.sites_not_ready]))
                down_time = True
                ## should this be send back to considered ?
                

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted,cpuh = wfh.getNCopies()
        wfh.sendLog('assignor',"we need %s CPUh"%cpuh)
        if cpuh>max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical')
            wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)
        
        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor',"Allowed sites :%s"% sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_all_data:
                wfh.sendLog('assignor',"Overiding the primary on AAA setting to Off")
                primary_aaa=False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update( aaa_mapping.get(site,[]) )
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog('assignor',"Selected to read primary through xrootd %s"%sorted(sites_allowed))
                
        if not primary_aaa:
            sites_allowed = sites_with_any_data
            wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints",sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowin the endpoints"
                n_stalled+=1
                continue
            
            
        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue


        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog('assignor',"The workflow can run at %s under low pressure currently"%( ','.join( allowed_and_low )))
            copies_wanted = max(1., copies_wanted-1.)


        if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available>=1. for available in available_fractions.values()])
            above_good = all([available >= do_partial for available in available_fractions.values()])
            wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting")
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay')
                n_stalled+=1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good):
                    wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                
                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor',"setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is",wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled+=1
                    continue

        if not len(sites_allowed):
            if not options.early:
                wfh.sendLog('assignor',"cannot be assign with no matched sites")
                sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
            n_stalled+=1
            continue


        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
            
            
        wfh.sendLog('assignor',"Placing the output on %s"%sites_out)
        parameters={
            'SiteWhitelist' : sites_allowed,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : set_lfn,
            'ProcessingVersion' : version,
            }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog('assignor',"Reading primary through xrootd at %s"%sorted(sites_allowed))            

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog('assignor',"Reading secondary through xrootd at %s"%sorted(sites_allowed))            


        if 'parameters' in assign_parameters:
            parameters.update( assign_parameters['parameters'] )

        ## plain assignment here
        team='production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team

        if False and 'T2_CH_CERN' in parameters['SiteWhitelist']:
            ## add some check on 
            ### the amount pending to HLT
            ### the size of the request
            ### the priority of the request (maybe not if we decide to overflow during runs)
            parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT']
            team = 'hlt'
            ## reduce the splitting by factor of 4, regardless of type of splitting
            sendEmail("sending work to HLT","%s was assigned to HLT"%wfo.name)
            

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v=getattr(options,key)
                if v!=None:
                    if type(v)==str and ',' in v: 
                        parameters[key] = filter(None,v.split(','))
                    else: 
                        parameters[key] = v

        if lheinput:
            ## throttle reading LHE article 
            wfh.sendLog('assignor', 'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        ## pick up campaign specific assignment parameters
        #parameters.update( CI.parameters(wfh.request['Campaign']) )
        parameters.update( assign_parameters.get('parameters',{}) )

        if not options.test:
            parameters['execute'] = True

        split_check = wfh.checkWorkflowSplitting()
        if split_check!=True:
            parameters.update( split_check )
            if 'NoGo' in split_check.values():
                wfh.sendLog('assignor', "Failing splitting check")
                sendLog('assignor','the workflow %s is failing the splitting check. Verify in the logs'% wfo.name, level='critical')
                n_stalled+=1
                continue

            if 'EventBased' in split_check.values():
                wfh.sendLog('assignor', "Falling back to event splitting.")
                #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
                sendLog('assignor','the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting ?'%wfo.name, level='critical')
                ## we have a problem here, that EventBased should never be used as a backup
                if not options.go:  
                    n_stalled+=1
                    continue
                continue ## skip all together
            elif 'EventsPerJob' in split_check.values():
                wfh.sendLog('assignor', "Modifying the number of events per job")
                #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name)
                sendLog('assignor',"the workflow %s is too heavy in number of jobs explosion"%wfo.name, level='critical')
            elif 'EventsPerLumi' in split_check.values():
                wfh.sendLog('assignor', "Modifying the number of events per lumi to be able to process this")

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents/(reqJobs*1.4))
                lumisPerJob = int(eventsPerJob/eventsPerLumi)
                if lumisPerJob==0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical')
                    wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical')
                        wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical')
                        wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.")


        
        
        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)


        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned+=1
                wfh.sendLog('assignor',"Properly assigned\n%s"%(json.dumps( parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo( url, wfo.name)
                    (_,prim,_,sec) = new_wfi.getIO()
                    for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']:
                        ## lock all outputs flat
                        #NLI.lock( secure )
                        LI.lock( secure, reason = 'assigning')
                    #for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                    #    for output in new_wfi.request['OutputDatasets']:
                    #        LI.lock( output, site, 'dataset in production')
                    #    for primary in prim:
                    #        LI.lock( primary, site, 'dataset used in input')
                    #    for secondary in sec:
                    #        LI.lock( secondary, site, 'required for mixing' )

                except Exception as e:
                    print "fail in locking output"
                    
                    print str(e)
                    sendEmail("failed locking of output",str(e))


            else:
                wfh.sendLog('assignor',"Failed to assign. Please check the logs")
                print "ERROR could not assign",wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
Exemplo n.º 3
0
def spawn_harvesting(url, wfi, sites_for_DQMHarvest):
    SI = global_SI()

    all_OK = {}
    requests = []
    outputs = wfi.request['OutputDatasets']
    if ('EnableHarvesting' in wfi.request
            and not wfi.request['EnableHarvesting']) and (
                'DQMConfigCacheID' in wfi.request
                and wfi.request['DQMConfigCacheID']):
        if not 'MergedLFNBase' in wfi.request:
            print "f****d up"
            sendEmail('screwed up wl cache',
                      '%s wl cache is bad' % (wfi.request['RequestName']))
            all_OK['fake'] = False
            return all_OK, requests

        wfi = workflowInfo(url, wfi.request['RequestName'])
        dqms = [out for out in outputs if '/DQM' in out]
        #if not all([in_full[dqm_input] for dqm_input in dqms]):
        #    wfi.sendLog('closor',"will not be able to assign the harvesting: holding up")
        #    for dqm_input in dqms:
        #        all_OK[dqm_input] = False
        ## raise the subscription to high priority

        for dqm_input in dqms:
            ## handle it properly
            harvesting_schema = {
                'Requestor': os.getenv('USER'),
                'RequestType': 'DQMHarvest',
                'Group': 'DATAOPS'
            }
            copy_over = [
                'AcquisitionEra',
                'ProcessingString',
                'DQMUploadUrl',
                'CMSSWVersion',
                'CouchDBName',
                'CouchWorkloadDBName',
                'ConfigCacheUrl',
                'DbsUrl',
                'inputMode',
                'DQMConfigCacheID',
                'OpenRunningTimeout',
                'ScramArch',
                'CMSSWVersion',
                'Campaign',
                'Memory',  #dummy
                'SizePerEvent',  #dummy
                'GlobalTag',  #dummy
            ]
            for item in copy_over:
                if item in wfi.request:
                    harvesting_schema[item] = copy.deepcopy(wfi.request[item])
                else:
                    print item, "is not in initial schema"

            harvesting_schema['InputDataset'] = dqm_input
            harvesting_schema['TimePerEvent'] = 1
            harvesting_schema['PrepID'] = 'Harvest-' + wfi.request['PrepID']
            if len(wfi.request['RequestString']) > 60:
                wfi.request['RequestString'] = wfi.request[
                    'RequestString'][:60]
                print "truncating request string", wfi.request['RequestString']

            harvesting_schema[
                'RequestString'] = 'HARVEST-' + wfi.request['RequestString']
            harvesting_schema['DQMHarvestUnit'] = 'byRun'
            harvesting_schema['RequestPriority'] = min(
                wfi.request['RequestPriority'] * 10, 999999)

            harvest_request = reqMgrClient.submitWorkflow(
                url, harvesting_schema)
            if not harvest_request:
                print "Error in making harvesting for", wfi.request[
                    'RequestName']
                print "schema"
                print json.dumps(harvesting_schema, indent=2)
                harvest_request = reqMgrClient.submitWorkflow(
                    url, harvesting_schema)
                if not harvest_request:
                    print "Error twice in harvesting for", wfi.request[
                        'RequestName']
                    print "schema"
                    print json.dumps(harvesting_schema, indent=2)

            if harvest_request:
                requests.append(harvest_request)
                ## should we protect for setting approved ? no, it's notified below, assignment will fail, likely
                data = reqMgrClient.setWorkflowApproved(url, harvest_request)
                print "created", harvest_request, "for harvesting of", dqm_input
                wfi.sendLog(
                    'closor', "created %s for harvesting of %s" %
                    (harvest_request, dqm_input))
                ## assign it directly
                team = wfi.request['Team']
                parameters = {
                    'SiteWhitelist': [
                        SI.SE_to_CE(se)
                        for se in wfi.request['NonCustodialSites']
                    ],
                    'AcquisitionEra':
                    wfi.acquisitionEra(),
                    'ProcessingString':
                    wfi.processingString(),
                    'MergedLFNBase':
                    wfi.request['MergedLFNBase'],
                    'ProcessingVersion':
                    wfi.request['ProcessingVersion'],
                    'execute':
                    True
                }
                #if in_full[dqm_input]:
                #    print "using full copy at",in_full[dqm_input]
                #    parameters['SiteWhitelist'] = [SI.SE_to_CE(se) for se in in_full[dqm_input]]
                #else:
                #    print "cannot do anything if not having a full copy somewhere"

                #    all_OK[dqm_input]=False
                #    continue
                parameters['SiteWhitelist'] = sites_for_DQMHarvest

                result = reqMgrClient.assignWorkflow(url, harvest_request,
                                                     team, parameters)
                if not result:
                    #sendEmail('harvesting request created','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch'])
                    wfi.sendLog(
                        'closor',
                        '%s was created at announcement of %s in %s, failed to assign'
                        % (harvest_request, dqm_input,
                           wfi.request['RequestName']))
                    sendLog(
                        'closor',
                        '%s was created at announcement of %s in %s, failed to assign'
                        % (harvest_request, dqm_input,
                           wfi.request['RequestName']),
                        level='critical')
                else:
                    #sendEmail('harvesting request assigned','%s was created at announcement of %s in %s, and assigned'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch'])
                    wfi.sendLog(
                        'closor',
                        '%s was created at announcement of %s in %s, and assigned'
                        % (harvest_request, dqm_input,
                           wfi.request['RequestName']))

            else:
                #print "could not make the harvesting for",wfo.name,"not announcing"
                wfi.sendLog('closor', "could not make the harvesting request")
                sendLog('closor',
                        "could not make the harvesting request for %s" %
                        wfi.request['RequestName'],
                        level='critical')
                all_OK[dqm_input] = False
    return (all_OK, requests)
Exemplo n.º 4
0
def batchor( url ):
    UC = unifiedConfiguration()
    SI = global_SI()
    ## get all workflows in assignment-approved with SubRequestType = relval
    all_wfs = []
    for user in UC.get("user_relval"):
        all_wfs.extend( getWorkflows(url, 'assignment-approved', details=True, user=user, rtype='TaskChain') )

    wfs = filter( lambda r :r['SubRequestType'] == 'RelVal' if 'SubRequestType' in r else False, all_wfs)
    ## need a special treatment for those
    hi_wfs = filter( lambda r :r['SubRequestType'] == 'HIRelVal' if 'SubRequestType' in r else False, all_wfs)

    by_campaign = defaultdict(set)
    by_hi_campaign = defaultdict(set)
    for wf in wfs:
        print "Relval:",wf['RequestName'], wf['Campaign']
        #by_campaign[wf['Campaign']].add( wf['RequestName'] )
        by_campaign[wf['Campaign']].add( wf['PrepID'] )


    for wf in hi_wfs:
        print "HI Relval:",wf['RequestName'], wf['Campaign']
        #by_hi_campaign[wf['Campaign']].add( wf['RequestName'] )
        by_hi_campaign[wf['Campaign']].add( wf['PrepID'] )
        
    default_setup = {
        "go" :True,
        "parameters" : {
            "SiteWhitelist": [ "T1_US_FNAL" ],
            "MergedLFNBase": "/store/relval",
            "Team" : "relval",
            "NonCustodialGroup" : "RelVal"
            },
        "custodial" : "T1_US_FNAL_MSS",
        "custodial_override" : ["DQMIO"],
        "phedex_group" : "RelVal",
        "lumisize" : -1,
        "fractionpass" : 0.0,
        "maxcopies" : 1
        }
    default_hi_setup = copy.deepcopy( default_setup )

    add_on = {}
    batches = json.loads( open('batches.json').read() )
    relval_routing = UC.get('relval_routing')
    def pick_one_site( p):
        ## modify the parameters on the spot to have only one site
        if "parameters" in p and "SiteWhitelist" in p["parameters"] and len(p["parameters"]["SiteWhitelist"])>1:
            choose_from = list(set(p["parameters"]["SiteWhitelist"]) & set(SI.sites_ready))
            picked = random.choice( choose_from )
            print "picked",picked,"from",choose_from
            p["parameters"]["SiteWhitelist"] = [picked]
            
    for campaign in by_campaign:
        if campaign in batches: continue
        ## get a bunch of information
        setup  = copy.deepcopy( default_setup )

        for key in relval_routing:
            if key in campaign:
                ## augment with the routing information
                augment_with = relval_routing[key]
                print "Modifying the batch configuration because of keyword",key
                print "with",augment_with
                setup = deep_update( setup, augment_with )
        #if 'cc7' in campaign: setup["parameters"]["SiteWhitelist"] = ["T2_US_Nebraska"]
        pick_one_site( setup )
        add_on[campaign] = setup
        sendLog('batchor','Adding the relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)),level='critical')
        if not campaign in batches: batches[campaign] = []
        batches[campaign] = list(set(list(copy.deepcopy( by_campaign[campaign] )) + batches[campaign] ))

    for campaign in by_hi_campaign:
        if campaign in batches: continue
        ## get a bunch of information
        setup  = copy.deepcopy( default_hi_setup )
        hi_site = random.choice(["T1_DE_KIT","T1_FR_CCIN2P3"])
        setup["parameters"]["SiteWhitelist"]=[ hi_site ]
        #setup["parameters"]["SiteWhitelist"]=["T1_DE_KIT","T1_FR_CCIN2P3"]

        pick_one_site( setup )
        add_on[campaign] = setup
        sendLog('batchor','Adding the HI relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)),level='critical')
        if not campaign in batches: batches[campaign] = []
        batches[campaign] = list(set(list(copy.deepcopy( by_hi_campaign[campaign] )) + batches[campaign] ))
        
    
    open('batches.json','w').write( json.dumps( batches , indent=2 ) )

    ## open the campaign configuration 
    campaigns = json.loads( open('campaigns.relval.json').read() )


    ## protect for overwriting ??
    for new_campaign in list(set(add_on.keys())-set(campaigns.keys())):
        ## this is new, and can be announced as such
        print new_campaign,"is new stuff"
        subject = "Request of RelVal samples batch %s"% new_campaign
        text="""Dear all, 
A new batch of relval workflows was requested.

Batch ID:

%s

Details of the workflows:

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

This is an automated message"""%( new_campaign, 
                                  new_campaign,
                                  )


        print subject
        print text
        to = ['*****@*****.**']
        sendEmail(subject, text, destination=to)
        sendLog('batchor',text, level='critical')

    ## go through all existing campaigns and remove the ones not in use anymore ?
    for old_campaign in campaigns.keys():
        all_in_batch = getWorkflowByCampaign(url, old_campaign, details=True)
        is_batch_done = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [wf['RequestStatus']for wf in all_in_batch]))
        ## check all statuses
        if is_batch_done:
            #print "batch",old_campaign,"can be closed or removed if necessary"
            #campaigns[old_campaign]['go'] = False ## disable
            campaigns.pop( old_campaign ) ## or just drop it all together ?
            print "batch",old_campaign," configuration was removed"

    ## merge all anyways
    campaigns.update( add_on )

    ## write it out for posterity
    open('campaigns.json.updated','w').write(json.dumps( campaigns , indent=2))

    ## read back
    rread = json.loads(open('campaigns.json.updated').read())

    os.system('mv campaigns.json.updated campaigns.relval.json')
Exemplo n.º 5
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock() and not options.manual: return
    mlock = moduleLock()
    if mlock() and not options.manual: return
    if not componentInfo().check() and not options.manual: return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = siteInfo()
    SI = global_SI()
    ###NLI = newLockInfo()
    ###if not NLI.free() and not options.go: return
    LI = lockInfo()
    #if not LI.free() and not options.go and not options.manual: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    aaa_mapping = json.loads(eosRead('%s/equalizor.json' %
                                     monitor_pub_dir))['mapping']
    all_stuck = set()
    all_stuck.update(
        json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)))

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    # Temporarily switch off prioritization
    random.shuffle(wfos)
    ##order by priority instead of random
    """
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]
        def rank( wfn ):
            return cache.index( wfn ) if wfn in cache else 0

        wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True)
        print "10 first",[wfo.name for wfo in wfos[:10]]
        print "10 last",[wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle( wfos )
    """

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue

        if not options.manual and 'rucio' in (wfo.name).lower(): continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"

        wfh.sendLog('assignor',
                    "%s to be assigned %s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed,
         sites_not_allowed) = wfh.getSiteWhiteList()

        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('assignor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('assignor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))

        blocks = wfh.getBlocks()
        if blocks:
            wfh.sendLog(
                'assignor',
                "Needs {} blocks in input {}".format(len(blocks),
                                                     '\n'.join(blocks)))
        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters and primary:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']

        wfh.sendLog(
            'assignor',
            "Initial values for primary_AAA=%s and secondary_AAA=%s" %
            (primary_aaa, secondary_aaa))

        if primary_aaa:
            if "T2_CH_CERN_HLT" in sites_allowed:
                sites_allowed.remove("T2_CH_CERN_HLT")
            if "T2_CH_CERN_HLT" not in sites_not_allowed:
                sites_not_allowed.append("T2_CH_CERN_HLT")

        ## keep track of this, after secondary input location restriction : that's how you want to operate it
        initial_sites_allowed = copy.deepcopy(sites_allowed)

        set_lfn = '/store/mc'  ## by default

        for prim in list(primary):
            set_lfn = getLFNbase(prim)
            ## if they are requested for processing, they should bbe all closed already
            # FIXME: remove this closeAllBlocks
            #closeAllBlocks(url, prim, blocks)

        ## should be 2 but for the time-being let's lower it to get things going
        _copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        # TODO Alan on 1/april/2020: keep the AAA functionality
        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_allowed:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_allowed)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if isStoreResults:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1t2_only = [
            ce for ce in sites_allowed
            if [ce.startswith('T1') or ce.startswith('T2')]
        ]
        if t1t2_only:
            # try to pick from T1T2 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])]
            # then pick any otherwise
        else:
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        print "available=", SI.disk[sites_out[0]]
        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'SiteBlacklist': sites_not_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            # Do not set TrustPUSitelist to True if there is no secondary
            if secondary:
                parameters['TrustPUSitelists'] = True
                wfh.sendLog(
                    'assignor', "Reading secondary through xrootd at %s" %
                    sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ##parse options entered in command line if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    # FIXME: decide which of the lines below needs to remain...
                    eventsPerJobEstimated = spl[
                        'events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl[
                        'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))
        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                if wfh.producePremix() and (not wfh.isRelval()):
                    title = "Heavy workflow assigned to {}".format(
                        parameters['SiteWhitelist'])
                    body = "Workflow name: {}".format(
                        wfh.request['RequestName'])
                    body += "\nOutput dataset(s): {}".format(
                        wfh.request['OutputDatasets'])
                    body += "\nAssigned to: {}".format(
                        parameters['SiteWhitelist'])
                    sendEmail(
                        title,
                        body,
                        destination=[
                            '*****@*****.**'
                        ])

                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
Exemplo n.º 6
0
def parse_one(url, wfn, options=None):

    SI = global_SI()
    wfi = workflowInfo( url , wfn)
    where_to_run, missing_to_run,missing_to_run_at = wfi.getRecoveryInfo()       
    all_blocks,needed_blocks,files_in_blocks,files_notin_dbs = wfi.getRecoveryBlocks()

    ancestor = workflowInfo( url , wfn)
    lhe,prim,_,sec = ancestor.getIO()
    high_order_acdc = 0
    while ancestor.request['RequestType'] == 'Resubmission':
        ancestor = workflowInfo(url, ancestor.request['OriginalRequestName'])
        lhe,prim,_,sec = ancestor.getIO()
        high_order_acdc += 1

    no_input = (not lhe) and len(prim)==0 and len(sec)==0

    cache = 0
    if options:
        cache = options.cache
    print "cache timeout", cache

    err= wfi.getWMErrors(cache=cache)
    stat = wfi.getWMStats(cache=cache)
    #adcd = wfi.getRecoveryDoc()

    total_by_code_dash = defaultdict( int )
    total_by_site_dash = defaultdict( int )
    r_dashb =defaultdict( lambda : defaultdict( int ))
    dash_board_h = 1
    if True :#'pdmvserv_TOP-RunIISummer15wmLHEGS-00103_00183_v0__161005_165048_809' in wfn:
        ## NB get the since from when the wf has started, not a fixed value
        ## no dashboard until we get a better api
        #dashb = wfi.getFullPicture(since=dash_board_h,cache=cache)
        dashb = {}
        #print json.dumps( dashb , indent=2)
        for site,sinfo in dashb.items():
            for s_code,counts in sinfo.items():
                d_statuses = ['submitted','pending','app-unknown','done']
                total_by_code_dash[str(s_code)]+= counts.get('submitted',0)
                total_by_site_dash[site] += counts.get('submitted',0)
                r_dashb[str(s_code)][site] += counts.get('submitted',0)

        print json.dumps(total_by_code_dash , indent=2)
        print json.dumps(total_by_site_dash , indent=2)

    status_per_task = defaultdict(lambda : defaultdict(int))
    
    if not 'AgentJobInfo' in stat:
        stat['AgentJobInfo'] = {}
        #print "bad countent ?"
        #print json.dumps(  stat,  indent=2)

    for agent in stat['AgentJobInfo']:
        for task in stat['AgentJobInfo'][agent]['tasks']:
            if not 'status' in stat['AgentJobInfo'][agent]['tasks'][task]: continue
            for status in stat['AgentJobInfo'][agent]['tasks'][task]['status']:
                info = stat['AgentJobInfo'][agent]['tasks'][task]['status'][status]
                #print status,stat['AgentJobInfo'][agent]['tasks'][task]['status'][status]
                if type(info)==dict:
                    status_per_task[task][status] += sum( stat['AgentJobInfo'][agent]['tasks'][task]['status'][status].values())
                else:
                    status_per_task[task][status] += stat['AgentJobInfo'][agent]['tasks'][task]['status'][status]

    #print json.dumps( status_per_task, indent=2)
    db_total_per_site = defaultdict(int) 
    db_total_per_code = defaultdict(int)
    ## cannot do that since there is no task count in dashboard and we have to take away the submitted
    #for site in dashb:
    #    for error in dashb[site]:
    #        db_total_per_site[site] += dashb[site][error] 
    #        db_total_per_code[code] += dashb[site][error]
    
    print "ACDC Information"
    print json.dumps( where_to_run , indent=2)         
    print json.dumps(missing_to_run , indent=2)        
    print json.dumps(missing_to_run_at , indent=2)        
    
    task_error_site_count ={}
    one_explanation = defaultdict(set)

    do_JL = True
    do_CL = True
    do_all_error_code = False
    if options: 
        do_JL = not options.no_JL
        do_CL = not options.no_CL
        do_all_error_code = options.all_errors
    if high_order_acdc>=1:
        print high_order_acdc,"order request, pulling down all logs"
        do_all_error_code = True


    n_expose = 1
    if options:
        n_expose = options.expose 
    expose_archive_code = {'134':defaultdict(lambda : n_expose),#seg fault
                           '139':defaultdict(lambda : n_expose),# ???
                           '99109':defaultdict(lambda : n_expose),#stageout
                           '99303' : defaultdict(lambda : n_expose),#no pkl report. if you are lucky
                           '60450' : defaultdict(lambda : n_expose),#new
                           '50513':defaultdict(lambda : n_expose),#new
                           '8001': defaultdict(lambda : n_expose),# the usual exception in cmsRun
                           '11003': defaultdict(lambda : n_expose),# job extraction
                           '73': defaultdict(lambda : n_expose),# job extraction
                           }
    expose_condor_code = {'99109':defaultdict(lambda : n_expose),#stageout
                          '99303':defaultdict(lambda : n_expose),#no pkl report
                          '60450':defaultdict(lambda : n_expose),#new
                          '50513':defaultdict(lambda : n_expose),#new
                          '11003': defaultdict(lambda : n_expose),
                          }
    
    tasks = sorted(set(err.keys() + missing_to_run.keys()))

    if not tasks:
        print "no task to look at"
        #return task_error_site_count
        
    html="<html> <center><h1><a href=https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s>%s</a><br><a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s>%s</a><br>"%(
        wfn,
        wfn,
        wfi.request['PrepID'],
        wfi.request['PrepID']
        )
    if wfi.request['RequestType'] in ['ReReco']:
        html += '<a href=../datalumi/lumi.%s.html>Lumisection Summary</a><br>'% wfi.request['PrepID']
        
    html+= '</center><hr>'
    if prim:
        html+='Reads in primary<br>'
        for dataset in prim:
            html +='<b>%s</b>'%dataset
            available = getDatasetBlocksFraction(url, dataset)
            html +='<br><br>Available %.2f (>1 more than one copy, <1 not in full on disk)<br>'% available
            html +='<ul>'
            presence = getDatasetPresence(url, dataset)

            for site in sorted(presence.keys()):
                html += '<li>%s : %.2f %%'%( site, presence[site][1] )
            html+='</ul><br>'

            
    if sec:
        html+='Reads in secondary<br>'
        for dataset in sec:
            presence = getDatasetPresence(url, dataset)
            html +='<b>%s</b><ul>'%dataset
            for site in sorted(presence.keys()):
                html += '<li>%s : %.2f %%'%( site, presence[site][1] )
            html+='</ul>'
        
    html += "Updated on %s (GMT)" % ( time.asctime(time.gmtime()) )
    html += """
<ul>
<li> <b><i>dashboard numbers over %d days</b></i>
<li> &uarr; %% with respect to total number of error in the code
<li> &rarr; %% with respect to total number of error at the site
</ul>
"""%(dash_board_h)

    html += '<hr><br>'

    if tasks:
        min_rank = min([task.count('/') for task in tasks])
    for task in tasks:  
        #print task
        task_rank = task.count('/')
        task_short = task.split('/')[-1]
        total_per_site = defaultdict(int)
        for agent in stat['AgentJobInfo']:
            if not task in stat['AgentJobInfo'][agent]['tasks']: continue
            if not 'sites' in stat['AgentJobInfo'][agent]['tasks'][task]:continue
            for site in stat['AgentJobInfo'][agent]['tasks'][task]['sites']:

                info = stat['AgentJobInfo'][agent]['tasks'][task]['sites'][site]
                #if site in ['T2_BE_IIHE']:                    print task,json.dumps( info, indent=2)
                #print info.keys()
                for s in ['success','failure','cooloff','submitted']:
                    if not s in info: continue
                    data = info[s]
                    #print s,data
                    if type(data)==dict:
                        total_per_site[site] += sum( data.values() )
                    else:
                        total_per_site[site] += data

        #is the task relevant to recover (discard log, cleanup)
        if any([v in task.lower() for v in ['logcol','cleanup']]): continue


        total_count= defaultdict(int)
        error_site_count = defaultdict( lambda : defaultdict(int))
        if not task in err:
            print task,"has not reported error"
            err[task] = {}
        #print err[task].keys()
        
        for exittype in err[task]:
            #print "\t",err[task][exittype].keys()
            for errorcode_s in err[task][exittype]:
                if errorcode_s == '0' : continue
                #print "\t\t",err[task][exittype][errorcode_s].keys()
                for site in err[task][exittype][errorcode_s]:
                    ce = SI.SE_to_CE(site)
                    count = err[task][exittype][errorcode_s][site]['errorCount']
                    total_count[errorcode_s] += count
                    #error_site_count[errorcode_s][site] += count
                    error_site_count[errorcode_s][ce] += count
                    for sample in err[task][exittype][errorcode_s][site]['samples']:
                        #print sample.keys()
                        for step in sample['errors']:
                            for report in  sample['errors'][step]:
                                if report['type'] == 'CMSExeption': continue
                                #if int(report['exitCode']) == int(errorcode_s):
                                one_explanation[errorcode_s].add("%s (Exit code: %s) \n%s"%(report['type'], report['exitCode'], report['details']))
                                #one_explanation[errorcode_s].add( report['details'] )
                                #else:
                                #one_explanation[
                        agent = sample['agent_name']
                        wmbs = sample['wmbsid']
                        workflow = sample['workflow']

                        if do_CL and ((errorcode_s in expose_condor_code and expose_condor_code[errorcode_s][agent]) or do_all_error_code) and 'cern' in agent:
                            os.system('ssh %s %s/WmAgentScripts/Unified/exec_expose.sh %s %s %s %s %s %s'%( agent, base_dir, workflow, wmbs, errorcode_s, base_dir, monitor_dir, task_short))
                            if errorcode_s in expose_condor_code:
                                expose_condor_code[errorcode_s][agent]-=1

                        for out in sample['output']:
                            #print out
                            if out['type'] == 'logArchive':
                                if do_JL and ((errorcode_s in expose_archive_code and expose_archive_code[errorcode_s][agent]) or (do_all_error_code)):
                                    if errorcode_s in expose_archive_code:
                                        expose_archive_code[errorcode_s][agent]-=1
                                    os.system('mkdir -p /tmp/%s'%(os.getenv('USER')))
                                    local = '/tmp/%s/%s'%(os.getenv('USER'),out['lfn'].split('/')[-1])
                                    command = 'xrdcp root://cms-xrd-global.cern.ch/%s %s'%( out['lfn'], local)
                                    ## get the file
                                    os.system( command )
                                    ## if this actually fail, let's get the file from eos using the new log mapping
                                    ## expose the content
                                    label=out['lfn'].split('/')[-1].split('.')[0]
                                    m_dir = '%s/joblogs/%s/%s/%s/%s'%(monitor_dir, 
                                                                   wfn, 
                                                                   errorcode_s,
                                                                   task_short,
                                                                   label)
                                    os.system('mkdir -p %s'%(m_dir))
                                    os.system('tar zxvf %s -C %s'%(local,m_dir))
                                    ## truncate the content ??
                                    for fn in os.popen('find %s -type f'%(m_dir)).read().split('\n'):
                                        if not fn: continue
                                        if any([p in fn for p in ['stdout.log']]):
                                            trunc = '/tmp/%s/%s'%(os.getenv('USER'), label)
                                            #print fn
                                            #print trunc
                                            head = tail = 1000
                                            os.system('(head -%d ; echo;echo;echo "<snip>";echo;echo ; tail -%d ) < %s > %s'%(head, tail, fn, trunc))
                                            os.system('mv %s %s'%(trunc, fn))

        #print task
        #print json.dumps( total_count, indent=2)
        #print json.dumps( explanations , indent=2)
        all_sites = set()
        all_codes = set()
        for code in error_site_count:
            for site in error_site_count[code]:
                all_sites.add( site )
                if code != '0':
                    all_codes.add( code)

        ## parse the dashboard data
        for site in total_by_site_dash:
            ## no. cannot discriminate by task in dashboard...
            #all_sites.add( site )
            pass

        ## parse the acdc data
        notreported='NotReported'
        all_missing_stats = set()
        for site in missing_to_run_at[task]:
            if not missing_to_run_at[task][site]: continue
            ce = SI.SE_to_CE( site )
            #all_sites.add( ce )
            all_missing_stats.add( ce )

            error_site_count[notreported][ce] = 0
            all_codes.add(notreported)
            ## no error code at that point
            
        all_missing_stats = all_missing_stats &set(SI.all_sites)
        all_not_reported = all_missing_stats - all_sites 
        #print task
        #print "site with no report",sorted(all_not_reported)
        #print sorted(all_sites)
        #print sorted(all_missing_stats)
        all_sites = all_missing_stats | all_sites
        all_sites = all_sites & set(SI.all_sites)
        #success = total_count['0']
        #total_jobs = sum(total_count.values())
        #print total_jobs,"jobs in total,",success,"successes"
        #miss = "{:,}".format(missing_to_run[task]) if task in missing_to_run else "N/A"

        ## show the total
        s_per_code =defaultdict(int)
        for site in all_sites:
            for code in sorted(all_codes):
                s_per_code[code] += error_site_count[code][site]

        #no_error = (sum(s_per_code.values())==0)
        no_error = len(all_not_reported)!=0

        if not no_error and notreported in all_codes:
            all_codes.remove( notreported )
        missing_events = missing_to_run[task] if task in missing_to_run else 0
        html += "<b>%s</b>"%task.split('/')[-1]
        if missing_events:
            html += " is missing <b>%s events</b>"%( "{:,}".format(missing_events) )
            if no_error:
                html +="<br><b><font color=red> and has UNreported error</font></b>"


        html += "<br><table border=1><thead><tr><th>Sites/Errors</th>"

        #for site in all_sites:
        #    html+='<th>%s</th>'%site
        for code in sorted(all_codes):
            html+='<th><a href="#%s">%s</a>'%(code,code)
            if str(code) in expose_archive_code or do_all_error_code:
                html += ' <a href=../joblogs/%s/%s/%s>, JL</a>'%( wfn, code, task_short )
            if str(code) in expose_condor_code or do_all_error_code:
                html += ' <a href=../condorlogs/%s/%s/%s>, CL</a>'%( wfn, code, task_short )
            html += '</th>'

        html+='<th>Total jobs</th><th>Site Ready</th>'
        html+='</tr></thead>\n'

        html+='<tr><td>Total</td>'
        for code in sorted(all_codes):
            html += '<td bgcolor=orange width=100>%d'%(s_per_code[code])
            if code in total_by_code_dash:
                html += ' (<b><i>%d</i></b>)'% total_by_code_dash[code]
            html += '</td>'

        ulist='<ul>'
        grand=0
        for status in sorted(status_per_task[task].keys()):
            ulist+='<li> %s %d'%( status, status_per_task[task][status])
            grand+= status_per_task[task][status]
        ulist+='<li><b> Total %d </b>'%grand
        ulist+='</ul>'
        #html += '<td bgcolor=orange> %.2f%% </td>'% (100.*(float(sum(s_per_code.values()))/sum(total_per_site.values())) if sum(total_per_site.values()) else 0.)
        html += '<td bgcolor=orange> &rarr; %.2f%% &larr; </td>'% (100.*(float(sum(s_per_code.values()))/ grand) if grand else 0.)
        html += '<td bgcolor=orange> %s </td>'% ulist
        
        html+='</tr>'


        def palette(frac):
            _range = { 
                0.0 : 'green',
                0.5 : 'green',
                0.6 : 'darkgreen',
                0.7 : 'orange',
                0.8 : 'salmon',
                0.9 : 'red'
                }
            which = [k for k in _range.keys() if k<=frac]
            if which:
                there = max(which)
            else:
                there=max(_range.keys())
            return _range[there]

        for site in sorted(all_sites):
            site_in = 'Yes'
            color = 'bgcolor=lightblue'
            if not site in SI.sites_ready:
                color = 'bgcolor=indianred'
                site_in ='<b>No</b>'
                if missing_to_run_at[task][SI.CE_to_SE(site)] == 0 or min_rank == task_rank:
                    color = 'bgcolor=aquamarine'
                    site_in = '<b>No</b> but fine'

            if not no_error:
                site_in +=" (%s events)"%"{:,}".format(missing_to_run_at[task][SI.CE_to_SE(site)])
            html+='<tr><td %s>%s</td>'%(color,site)
            for code in sorted(all_codes):
                if code == notreported:
                    html += '<td %s width=200>%s events </td>' %(color, "{:,}".format(missing_to_run_at[task][SI.CE_to_SE(site)]))
                else:
                    if error_site_count[code][site]:
                        er_frac = float(error_site_count[code][site])/s_per_code[code] if s_per_code[code] else 0.
                        si_frac = float(error_site_count[code][site])/total_per_site[site] if total_per_site[site] else 0.
                        html += '<td %s width=200>%d'%(color, error_site_count[code][site])
                        if code in r_dashb and site in r_dashb[code]:
                            html += ' (<b><i>%d</i></b>)'%( r_dashb[code][site] )

                        html += ', <font color=%s>&uarr; %.1f%%</font>, <font color=%s>&rarr; %.1f%%</font></td>'% (
                            palette(er_frac),100.*er_frac,
                            palette(si_frac), 100.*si_frac
                            )
                    else:
                        html += '<td %s>0</td>'% color
            html += '<td bgcolor=orange>%d</td>'% total_per_site[site]
            html += '<td %s>%s</td>'% (color, site_in)
            html +='</tr>\n'
        html+='</table><br>'
        task_error_site_count[task] = error_site_count

    html += '<hr><br>'
    html += "<b>Blocks (%d/%d) needed for recovery</b><br>"%( len(needed_blocks), len(all_blocks))
    for block in sorted(needed_blocks):
        html +='%s<br>'%block
    html += "<br><b>Files in no block</b><br>"
    for f in sorted(files_notin_dbs):
        html +='%s<br>'%f


    html += '<hr><br>'
    html += '<table border=1>'
    for code in one_explanation:
        html +='<tr><td><a name="%s">%s</a></td><td>%s</td></tr>'% ( code, code, '<br><br>'.join(one_explanation[code]).replace('\n','<br>' ))
        #explanations[code].update( one_explanation[code] )
    html+='</table>'
    html+=('<br>'*30)
    html +='</html>'
    wfi.sendLog( 'error', html, show=False)
    fn = '%s'% wfn
    open('%s/report/%s'%(monitor_dir,fn),'w').write( html )

    return task_error_site_count, one_explanation
Exemplo n.º 7
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = json.loads(
        open('%s/dataset_endpoints.json' % monitor_dir).read())
    aaa_mapping = json.loads(
        open('%s/equalizor.json' % monitor_pub_dir).read())['mapping']

    all_stuck = set()
    all_stuck.update(
        json.loads(open('%s/stuck_transfers.json' % monitor_pub_dir).read()))
    all_stuck.update(getAllStuckDataset())

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ##order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True),
                       key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]

        def rank(wfn):
            return cache.index(wfn) if wfn in cache else 0

        wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True)
        print "10 first", [wfo.name for wfo in wfos[:10]]
        print "10 last", [wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle(wfos)

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"
        if options.partial:
            options_text += ", partial option is ON"
            options_text += ", good fraction is %.2f" % options.good_enough

        wfh.sendLog('assignor',
                    "%s to be assigned%s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                wfh.sendLog(
                    'assignor', '%s is not an allowed secondary' %
                    (', '.join(set(secondary) -
                               set(allowed_secondary.keys()))))
                sendLog(
                    'assignor',
                    '%s is not an allowed secondary' %
                    (', '.join(set(secondary) -
                               set(allowed_secondary.keys()))),
                    level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'],
                                       'SecondaryLocation', [])

        blocks = wfh.getBlockWhiteList()
        rwl = wfh.getRunWhiteList()
        if rwl:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set(blocks +
                                  getDatasetBlocks(dataset, runs=rwl)))
        lwl = wfh.getLumiWhiteList()
        if lwl:
            ## augment with lumi white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(dataset, lumis=lwl)))

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False  #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog(
                    'assignor',
                    "Overiding partial copy assignment to %.2f fraction" %
                    do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))

        do_partial = options.good_enough if options.partial else do_partial

        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass

            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items()
                if frac > 98.
            ]

            if secondary_aaa:
                if not one_secondary_locations:
                    sec_availability = getDatasetBlocksFraction(url, sec)
                    if sec_availability >= 1. and options.go:
                        ## there is at least one copy of each block on disk. We should go ahead and let it go.
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is available %s times on disk, and usable"
                            % (sec, sec_availability))
                    else:
                        ## not even a copy on disk anywhere !!!!
                        sites_allowed = []  ## will block the assignment
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is nowhere on disk" % sec)
                #just continue without checking
                continue

            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [
                site for site in sites_allowed
                if SI.CE_to_SE(site) in one_secondary_locations
            ]

        wfh.sendLog(
            'assignor', "From/after secondary requirement, now Allowed%s" %
            sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy(
            sites_allowed
        )  ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default

        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed],
                only_blocks=blocks)
            if primary_aaa:
                available_fractions[prim] = getDatasetBlocksFraction(
                    url, prim, only_blocks=blocks)

            sites_all_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in [
                    psite for (psite, (there, frac)) in presence.items()
                    if there
                ]
            ]
            if primary_aaa:
                sites_all_data = list(
                    set([
                        SI.SE_to_CE(psite)
                        for (psite, (there, frac)) in presence.items() if there
                    ]))
            sites_with_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in
                [psite for (psite, frac) in presence.items() if frac[1] > 90.]
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if SI.CE_to_SE(site) in presence.keys()
            ]
            if primary_aaa:
                sites_with_any_data = list(
                    set([SI.SE_to_CE(psite) for psite in presence.keys()]))

            wfh.sendLog(
                'assignor', "Holding the data but not allowed %s" % sorted(
                    list(
                        set([
                            se_site for se_site in presence.keys()
                            if not SI.SE_to_CE(se_site) in sites_allowed
                        ]))))
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in
                    list((set(secondary_locations) & set(primary_locations)) -
                         set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in list(
                        set(primary_locations) -
                        set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog(
                'assignor', "We could be running in addition at %s" %
                sorted(opportunistic_sites))
            if any(
                [osite in SI.sites_not_ready
                 for osite in opportunistic_sites]):
                wfh.sendLog(
                    'assignor', "One of the usable site is in downtime %s" % ([
                        osite for osite in opportunistic_sites
                        if osite in SI.sites_not_ready
                    ]))
                down_time = True
                ## should this be send back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                    wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[
                wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(
                1, copies_wanted -
                less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',
                    "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_all_data:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
                available_fractions = {}
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready
                                         for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints", sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowin the endpoints"
                n_stalled += 1
                continue

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog(
                'assignor',
                "The workflow can run at %s under low pressure currently" %
                (','.join(allowed_and_low)))
            copies_wanted = max(1., copies_wanted - 1.)

        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            not_even_once = not all([
                available >= 1. for available in available_fractions.values()
            ])
            above_good = all([
                available >= do_partial
                for available in available_fractions.values()
            ])
            wfh.sendLog(
                'assignor',
                "The input dataset is not available %s times, only %s" %
                (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog(
                    'assignor',
                    "sending back to considered because of site downtime, instead of waiting"
                )
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog(
                    'assignor',
                    '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'
                    % (wfo.name),
                    level='delay')
                n_stalled += 1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (
                        do_partial and above_good):
                    wfh.sendLog(
                        'assignor',
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append(wfo.name)
                    open('cannot_assign.json',
                         'w').write(json.dumps(known, indent=2))

                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled += 1
                    continue

        if not len(sites_allowed):
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading secondary through xrootd at %s" %
                sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ##parse options entered in command line if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl[
                        'events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl[
                        'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        if isHEPCloudReady(url) and wfh.isGoodForNERSC():
            parameters['Team'] = 'hepcloud'
            parameters['SiteWhitelist'] = ['T3_US_NERSC']
            if primary:
                parameters['TrustSitelists'] = True
            if secondary:
                parameters['TrustPUSitelists'] = True
            sendEmail("sending work to hepcloud",
                      "pleasse check on %s" % wfh.request['RequestName'],
                      destination=['*****@*****.**'])

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))

        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
Exemplo n.º 8
0
def mappor(url, options=None):

    up = componentInfo(soft=['mcm', 'wtc', 'jira'])

    ## define regionality site => fallback allowed. feed on an ssb metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)

    UC = unifiedConfiguration()
    over_rides = []
    use_T0 = ('T0_CH_CERN' in UC.get("site_for_overflow"))
    if options.t0: use_T0 = True
    if use_T0: over_rides.append('T0_CH_CERN')

    use_HLT = ('T2_CH_CERN_HLT' in UC.get("site_for_overflow"))
    if options.hlt: use_HLT = True
    if use_HLT: over_rides.append('T2_CH_CERN_HLT')

    use_CSCS = ('T0_CH_CSCS_HPC' in UC.get("site_for_overflow"))
    if options.cscs: use_CSCS = True
    if use_CSCS: over_rides.append('T0_CH_CSCS_HPC')

    SI = global_SI(over_rides)
    #print sorted(SI.all_sites)
    #print sorted(SI.sites_T0s)

    CI = campaignInfo()

    #sites_to_consider = SI.all_sites
    sites_to_consider = SI.sites_ready
    for site in sites_to_consider:
        region = site.split('_')[1]
        if not region in [
                'US',
                'DE',
                'IT',
                'FR',
                'CH',
                'ES',
                'UK',
                'RU'  ### latest addition
        ]:
            continue
        regions[region] = [region]

    def site_in_depletion(s):
        return True
        if s in SI.sites_pressure:
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                print s, m, r, "lacking pressure"
                return True
            else:
                print s, m, r, "pressure"
                pass

        return False

    for site in sites_to_consider:
        region = site.split('_')[1]
        ## fallback to the region, to site with on-going low pressure
        within_region = [
            fb for fb in sites_to_consider
            if any([('_%s_' %
                     (reg) in fb and fb != site and site_in_depletion(fb))
                    for reg in regions[region]])
        ]
        #print site,region, within_region
        mapping[site] = within_region

    for site in sites_to_consider:
        if site.split('_')[1] == 'US':  ## to all site in the US
            ## add NERSC
            mapping[site].append('T3_US_NERSC')
            mapping[site].append('T3_US_SDSC')
            mapping[site].append('T3_US_TACC')
            mapping[site].append('T3_US_PSC')
            ## add OSG
            mapping[site].append('T3_US_OSG')
            #mapping[site].append('T3_US_Colorado')
            pass

    if use_HLT:
        mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')

    if use_T0:
        ## who can read from T0
        mapping['T2_CH_CERN'].append('T0_CH_CERN')
        mapping['T1_IT_CNAF'].append('T0_CH_CERN')
        mapping['T1_FR_CCIN2P3'].append('T0_CH_CERN')
        mapping['T1_DE_KIT'].append('T0_CH_CERN')

    if use_CSCS:
        ## analog config to T0:
        mapping['T2_CH_CERN'].append('T0_CH_CSCS_HPC')
        mapping['T1_IT_CNAF'].append('T0_CH_CSCS_HPC')
        mapping['T1_FR_CCIN2P3'].append('T0_CH_CSCS_HPC')
        mapping['T1_DE_KIT'].append('T0_CH_CSCS_HPC')

    ## temptatively
    mapping['T0_CH_CERN'].append('T2_CH_CERN')

    ## all europ can read from CERN
    for reg in ['IT', 'DE', 'UK', 'FR', 'BE', 'ES']:
        mapping['T2_CH_CERN'].extend(
            [fb for fb in sites_to_consider if '_%s_' % reg in fb])
        pass

    ## all europ T1 among each others
    europ_t1 = [
        site for site in sites_to_consider if site.startswith('T1')
        and any([reg in site for reg in ['IT', 'DE', 'UK', 'FR', 'ES', 'RU']])
    ]
    #print europ_t1
    for one in europ_t1:
        for two in europ_t1:
            if one == two: continue
            mapping[one].append(two)
            pass
        ## all EU T1 can read from T0
        mapping['T0_CH_CERN'].append(one)

    mapping['T0_CH_CERN'].append('T1_US_FNAL')
    #mapping['T1_IT_CNAF'].append( 'T1_US_FNAL' )
    #mapping['T1_IT_CNAF'].extend( [site for site in SI.sites_ready if '_US_' in site] ) ## all US can read from CNAF
    mapping['T1_IT_CNAF'].append('T2_CH_CERN')
    mapping['T1_DE_KIT'].append('T2_CH_CERN')
    mapping['T2_CH_CERN'].append('T1_IT_CNAF')
    mapping['T2_CH_CERN'].append('T1_US_FNAL')
    mapping['T2_CH_CERN'].append('T3_CH_CERN_HelixNebula')
    mapping['T2_CH_CERN'].append('T3_CH_CERN_HelixNebula_REHA')

    for site in sites_to_consider:
        if '_US_' in site:
            mapping[site].append('T2_CH_CERN')
    ## make them appear as OK to use
    force_sites = []

    ## overflow CERN to underutilized T1s
    upcoming = json.loads(eosRead('%s/GQ.json' % monitor_dir))
    for possible in SI.sites_T1s:
        if not possible in upcoming:
            mapping['T2_CH_CERN'].append(possible)
            pass

    take_site_out = UC.get('site_out_of_overflow')

    for site, fallbacks in mapping.items():
        mapping[site] = list(set(fallbacks))

    ### mapping is a dictionnary where
    # key can read from site in values.
    ### reverserd mapping is a dictionnary where
    # key can be read by site in values.
    ## create the reverse mapping for the condor module
    for site, fallbacks in mapping.items():
        if site in take_site_out:
            print "taking", site, "out of overflow source by unified configuration"
            mapping.pop(site)
            continue
        for fb in fallbacks:
            if fb == site:
                ## remove self
                mapping[site].remove(fb)
                continue
            if fb in take_site_out:
                ## remove those to be removed
                print "taking", fb, "out of overflow destination by unified configuration"
                mapping[site].remove(fb)
                continue
            if not site in reversed_mapping[fb]:
                reversed_mapping[fb].append(site)

    ## write it out and bail
    cache = cacheInfo()
    cache.store('overflow_mapping', mapping)
    cache.store('overflow_reverse_mapping', reversed_mapping)
    return
Exemplo n.º 9
0
def spawn_harvesting(url, wfi , in_full):
    #SI = siteInfo()
    SI = global_SI()

    all_OK = {}
    requests = []
    outputs = wfi.request['OutputDatasets'] 
    if ('EnableHarvesting' in wfi.request and wfi.request['EnableHarvesting']) or ('DQMConfigCacheID' in wfi.request and wfi.request['DQMConfigCacheID']):
        if not 'MergedLFNBase' in wfi.request:
            print "f****d up"
            sendEmail('screwed up wl cache','%s wl cache is bad'%(wfi.request['RequestName']))
            all_OK['fake'] = False
            return all_OK,requests

        wfi = workflowInfo(url, wfi.request['RequestName'])
        dqms = [out for out in outputs if '/DQM' in out]
        if not all([in_full[dqm_input] for dqm_input in dqms]):
            wfi.sendLog('closor',"will not be able to assign the harvesting: holding up")
            for dqm_input in dqms:
                all_OK[dqm_input] = False
                ## raise the subscription to high priority
                sites = set(wfi.request['NonCustodialSites'])
                for site in sites:
                    res = updateSubscription(url, site, dqm_input, priority='high')
                    print "increased priority",res
                return all_OK,requests

        for dqm_input in dqms:
            ## handle it properly
            harvesting_schema = {
                'Requestor': os.getenv('USER'),
                'RequestType' : 'DQMHarvest',
                'Group' : 'DATAOPS'
                }
            copy_over = [
                'AcquisitionEra',
                'ProcessingString',
                'DQMUploadUrl',
                'CMSSWVersion',
                'CouchDBName',
                'CouchWorkloadDBName',
                'CouchURL',
                'DbsUrl',
                'inputMode',
                'DQMConfigCacheID',
                'OpenRunningTimeout',
                'ScramArch',
                'CMSSWVersion',
                'Campaign',
                'Memory', #dummy
                'SizePerEvent', #dummy
                'GlobalTag', #dummy
                ]
            for item in copy_over:
                if item in wfi.request:
                    harvesting_schema[item] = copy.deepcopy(wfi.request[item])
                else:
                    print item,"is not in initial schema"

            harvesting_schema['InputDataset'] = dqm_input
            harvesting_schema['TimePerEvent'] = 1
            harvesting_schema['PrepID'] = 'Harvest-'+wfi.request['PrepID']
            if len(wfi.request['RequestString'])>60:
                wfi.request['RequestString']= wfi.request['RequestString'][:60]
                print "truncating request string",wfi.request['RequestString']
                
            harvesting_schema['RequestString'] = 'HARVEST-'+wfi.request['RequestString']
            harvesting_schema['DQMHarvestUnit'] = 'byRun'
            harvesting_schema['ConfigCacheUrl'] = harvesting_schema['CouchURL'] ## uhm, how stupid is that ?
            harvesting_schema['RequestPriority'] = wfi.request['RequestPriority']*10

            harvest_request = reqMgrClient.submitWorkflow(url, harvesting_schema)
            if not harvest_request:
                print "Error in making harvesting for",wfi.request['RequestName']
                print "schema"
                print json.dumps( harvesting_schema, indent = 2)
                harvest_request = reqMgrClient.submitWorkflow(url, harvesting_schema)
                if not harvest_request:
                    print "Error twice in harvesting for",wfi.request['RequestName']
                    print "schema"
                    print json.dumps( harvesting_schema, indent = 2)

            if harvest_request:
                requests.append( harvest_request )
                ## should we protect for setting approved ? no, it's notified below, assignment will fail, likely
                data = reqMgrClient.setWorkflowApproved(url, harvest_request)
                print "created",harvest_request,"for harvesting of",dqm_input
                wfi.sendLog('closor',"created %s for harvesting of %s"%( harvest_request, dqm_input))
                ## assign it directly
                team = wfi.request['Teams'][0]
                parameters={
                    'SiteWhitelist' : [SI.SE_to_CE(se) for se in wfi.request['NonCustodialSites']],
                    'AcquisitionEra' : wfi.acquisitionEra(),
                    'ProcessingString' : wfi.processingString(),
                    'MergedLFNBase' : wfi.request['MergedLFNBase'], 
                    'ProcessingVersion' : wfi.request['ProcessingVersion'],
                    'execute' : True
                    }
                if in_full[dqm_input]:
                    print "using full copy at",in_full[dqm_input]
                    parameters['SiteWhitelist'] = [SI.SE_to_CE(se) for se in in_full[dqm_input]]
                else:
                    print "cannot do anything if not having a full copy somewhere"
                    
                    all_OK[dqm_input]=False
                    continue

                result = reqMgrClient.assignWorkflow(url, harvest_request, team, parameters)
                if not result:
                    #sendEmail('harvesting request created','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch'])
                    wfi.sendLog('closor','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName']))
                    sendLog('closor','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName']), level='critical')
                else:
                    #sendEmail('harvesting request assigned','%s was created at announcement of %s in %s, and assigned'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch']) 
                    wfi.sendLog('closor','%s was created at announcement of %s in %s, and assigned'%(harvest_request, dqm_input, wfi.request['RequestName']))

            else:
                #print "could not make the harvesting for",wfo.name,"not announcing"
                wfi.sendLog('closor',"could not make the harvesting request")
                sendLog('closor',"could not make the harvesting request for %s"% wfi.request['RequestName'], level='critical')
                all_OK[dqm_input]=False                    
    return (all_OK, requests)
Exemplo n.º 10
0
def parse_one(url, wfn, options=None):
    def time_point(label="", sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "[showError] Time check (%s) point at : %s" % (label, nows)
        print "[showError] Since start: %s [s]" % (now - time_point.start)
        if sub_lap:
            print "[showError] Sub Lap : %s [s]" % (now - time_point.sub_lap)
            time_point.sub_lap = now
        else:
            print "[showError] Lap : %s [s]" % (now - time_point.lap)
            time_point.lap = now
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(
        time.gmtime())

    task_error_site_count = {}
    one_explanation = defaultdict(set)
    per_task_explanation = defaultdict(set)

    if wfn in [
            'vlimant_task_EXO-RunIISummer15wmLHEGS-04800__v1_T_170906_141738_1357'
    ]:
        return task_error_site_count, one_explanation

    time_point("Starting with %s" % wfn)
    threads = []

    SI = global_SI()
    UC = unifiedConfiguration()
    wfi = workflowInfo(url, wfn)
    time_point("wfi", sub_lap=True)
    where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo()
    time_point("acdcinfo", sub_lap=True)
    all_blocks, needed_blocks_loc, files_in_blocks, files_and_loc_notin_dbs = wfi.getRecoveryBlocks(
    )
    time_point("inputs", sub_lap=True)

    ancestor = workflowInfo(url, wfn)
    lhe, prim, _, sec = ancestor.getIO()
    high_order_acdc = 0
    while ancestor.request['RequestType'] == 'Resubmission':
        ancestor = workflowInfo(url, ancestor.request['OriginalRequestName'])
        lhe, prim, _, sec = ancestor.getIO()
        high_order_acdc += 1

    no_input = (not lhe) and len(prim) == 0 and len(sec) == 0

    cache = options.cache
    print "cache timeout", cache

    err = wfi.getWMErrors(cache=cache)
    time_point("wmerrors", sub_lap=True)
    stat = wfi.getWMStats(cache=cache)
    time_point("wmstats", sub_lap=True)
    #adcd = wfi.getRecoveryDoc()

    total_by_code_dash = defaultdict(int)
    total_by_site_dash = defaultdict(int)
    r_dashb = defaultdict(lambda: defaultdict(int))
    dash_board_h = 1
    if False:
        ## NB get the since from when the wf has started, not a fixed value
        ## no dashboard until we get a better api
        #dashb = wfi.getFullPicture(since=dash_board_h,cache=cache)
        dashb = {}
        #print json.dumps( dashb , indent=2)
        for site, sinfo in dashb.items():
            for s_code, counts in sinfo.items():
                d_statuses = ['submitted', 'pending', 'app-unknown', 'done']
                total_by_code_dash[str(s_code)] += counts.get('submitted', 0)
                total_by_site_dash[site] += counts.get('submitted', 0)
                r_dashb[str(s_code)][site] += counts.get('submitted', 0)

        print json.dumps(total_by_code_dash, indent=2)
        print json.dumps(total_by_site_dash, indent=2)

    time_point("Got most input")

    status_per_task = defaultdict(lambda: defaultdict(int))

    if not 'AgentJobInfo' in stat:
        stat['AgentJobInfo'] = {}
        #print "bad countent ?"
        #print json.dumps(  stat,  indent=2)

    for agent in stat['AgentJobInfo']:
        for task in stat['AgentJobInfo'][agent]['tasks']:
            if not 'status' in stat['AgentJobInfo'][agent]['tasks'][task]:
                continue
            for status in stat['AgentJobInfo'][agent]['tasks'][task]['status']:
                info = stat['AgentJobInfo'][agent]['tasks'][task]['status'][
                    status]
                #print status,stat['AgentJobInfo'][agent]['tasks'][task]['status'][status]
                if type(info) == dict:
                    status_per_task[task][status] += sum(
                        stat['AgentJobInfo'][agent]['tasks'][task]['status']
                        [status].values())
                else:
                    status_per_task[task][status] += stat['AgentJobInfo'][
                        agent]['tasks'][task]['status'][status]

    #print json.dumps( status_per_task, indent=2)
    db_total_per_site = defaultdict(int)
    db_total_per_code = defaultdict(int)
    ## cannot do that since there is no task count in dashboard and we have to take away the submitted
    #for site in dashb:
    #    for error in dashb[site]:
    #        db_total_per_site[site] += dashb[site][error]
    #        db_total_per_code[code] += dashb[site][error]

    print "ACDC Information"
    print "\t where to re-run"
    print json.dumps(where_to_run, indent=2)
    print "\t Missing events"
    print json.dumps(missing_to_run, indent=2)
    print "\t Missing events per site"
    print json.dumps(missing_to_run_at, indent=2)

    if not where_to_run and not missing_to_run and not missing_to_run_at:
        print "showError is unable to run"
        #return task_error_site_count, one_explanation
        pass

    do_JL = not options.no_JL
    do_CL = not options.no_CL
    do_all_error_code = options.all_errors
    if high_order_acdc >= 1:
        print high_order_acdc, "order request, pulling down all logs"
        do_all_error_code = True
    if wfi.isRelval():
        print "getting all codes for relval"
        do_all_error_code = True

    tasks = sorted(set(err.keys() + missing_to_run.keys()))

    if not tasks:
        print "no task to look at"
        #return task_error_site_count

    html = "<html> <center><h1>%s, Updated on %s (GMT)" % (
        wfn, time.asctime(time.gmtime()))

    html += '</center>'
    html += '<a href=https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s>dts</a>, ' % (
        wfn)
    html += '<a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s>ac</a>, ' % (
        wfi.request['PrepID'])
    html += '<a href=https://cms-gwmsmon.cern.ch/prodview/%s>Job Progress</a>, ' % (
        wfn)
    r_type = wfi.request.get('OriginalRequestType',
                             wfi.request.get('RequestType', 'NaT'))
    if r_type in ['ReReco']:
        html += '<a href=../datalumi/lumi.%s.html>Lumisection Summary</a>, ' % wfi.request[
            'PrepID']
    html += '<a href="https://its.cern.ch/jira/issues/?jql=text~%s AND project = CMSCOMPPR" target="_blank">jira</a>' % (
        wfi.request['PrepID'])
    html += '<hr>'
    html += '<a href=#IO>I/O</a>, <a href=#ERROR>Errors</a>, <a href=#BLOCK>blocks</a>, <a href=#FILE>files</a>, <a href=#CODES>Error codes</a><br>'
    html += '<hr>'

    time_point("Header writen")

    html += '<a name=IO></a>'
    if prim:
        html += 'Reads in primary<br>'
        rwl = wfi.getRunWhiteList()
        lwl = wfi.getLumiWhiteList()
        for dataset in prim:
            html += '<b>%s </b>(events/lumi ~%d)' % (
                dataset, getDatasetEventsPerLumi(dataset))
            blocks = getDatasetBlocks(dataset, runs=rwl) if rwl else None
            blocks = getDatasetBlocks(dataset, lumis=lwl) if lwl else None
            available = getDatasetBlocksFraction(url,
                                                 dataset,
                                                 only_blocks=blocks)
            html += '<br><br>Available %.2f (>1 more than one copy, <1 not in full on disk)<br>' % available
            html += '<ul>'
            presence = getDatasetPresence(url, dataset, only_blocks=blocks)

            for site in sorted(presence.keys()):
                html += '<li>%s : %.2f %%' % (site, presence[site][1])
            html += '</ul><br>'

    if sec:
        html += 'Reads in secondary<br>'
        for dataset in sec:
            presence = getDatasetPresence(url, dataset)
            html += '<b>%s</b><ul>' % dataset
            for site in sorted(presence.keys()):
                html += '<li>%s : %.2f %%' % (site, presence[site][1])
            html += '</ul>'

    outs = sorted(wfi.request['OutputDatasets'])
    if outs:
        html += 'Produces<br>'
        for dataset in outs:
            presence = getDatasetPresence(url, dataset)
            html += '<b>%s </b>(events/lumi ~ %d)<ul>' % (
                dataset, getDatasetEventsPerLumi(dataset))
            for site in sorted(presence.keys()):
                html += '<li>%s : %.2f %%' % (site, presence[site][1])
            html += '</ul>'

    time_point("Input checked")

    html += """
<hr><br>
<a name=ERROR></a>
<ul>
<li> <b><i>dashboard numbers over %d days</b></i>
<li> &uarr; %% with respect to total number of error in the code
<li> &rarr; %% with respect to total number of error at the site
</ul>
""" % (dash_board_h)

    html += '<br>'

    n_expose_base = options.expose  # if options else UC.get('n_error_exposed')
    print "getting", n_expose_base, "logs by default"
    if tasks:
        min_rank = min([task.count('/') for task in tasks])
    for task in tasks:
        n_expose = n_expose_base

        expose_archive_code = dict([(str(code), defaultdict(lambda: n_expose))
                                    for code in UC.get('expose_archive_code')])
        expose_condor_code = dict([(str(code), defaultdict(lambda: n_expose))
                                   for code in UC.get('expose_condor_code')])

        #print task
        task_rank = task.count('/')
        task_short = task.split('/')[-1]
        total_per_site = defaultdict(int)
        time_point("Starting with task %s" % task_short, sub_lap=True)

        notreported = 'NotReported'

        total_count = defaultdict(int)
        error_site_count = defaultdict(lambda: defaultdict(int))

        all_not_reported = set()
        for agent in stat['AgentJobInfo']:
            for site in stat['AgentJobInfo'][agent]['tasks'].get(task, {}).get(
                    'skipped', {}):
                info = stat['AgentJobInfo'][agent]['tasks'][task]['skipped'][
                    site]
                #print info
                all_not_reported.add(site)
                ce = SI.SE_to_CE(site)
                error_site_count[notreported][ce] += info.get(
                    'skippedFiles', 0)
                total_count[notreported] += info.get('skippedFiles', 0)

            for site in stat['AgentJobInfo'][agent]['tasks'].get(task, {}).get(
                    'sites', {}):

                info = stat['AgentJobInfo'][agent]['tasks'][task]['sites'][
                    site]
                for s in ['success', 'failure', 'cooloff', 'submitted']:
                    if not s in info: continue
                    data = info[s]
                    if type(data) == dict:
                        total_per_site[site] += sum(data.values())
                    else:
                        total_per_site[site] += data

        #is the task relevant to recover (discard log, cleanup)
        if any([v in task.lower() for v in ['logcol', 'cleanup']]): continue

        #total_count= defaultdict(int)
        #error_site_count = defaultdict( lambda : defaultdict(int))
        if not task in err:
            print task, "has not reported error"
            err[task] = {}
        #print err[task].keys()

        for exittype in err[task]:
            #print "\t",err[task][exittype].keys()
            for errorcode_s in err[task][exittype]:
                if errorcode_s == '0': continue
                #print "\t\t",err[task][exittype][errorcode_s].keys()
                for site in err[task][exittype][errorcode_s]:
                    ce = SI.SE_to_CE(site)
                    count = err[task][exittype][errorcode_s][site][
                        'errorCount']
                    total_count[errorcode_s] += count
                    #error_site_count[errorcode_s][site] += count
                    error_site_count[errorcode_s][ce] += count

        ## show the total
        all_sites = set()
        all_codes = set()
        for code in error_site_count:
            for site in error_site_count[code]:
                all_sites.add(site)
                if code != '0':
                    all_codes.add(code)

        s_per_code = defaultdict(int)
        for site in all_sites:
            for code in sorted(all_codes):
                s_per_code[code] += error_site_count[code][site]

        expose_top_N = UC.get('expose_top_N')
        count_top_N = min(
            sorted(s_per_code.values(),
                   reverse=True)[:expose_top_N]) if s_per_code else -1

        for exittype in err[task]:
            #print "\t",err[task][exittype].keys()
            for errorcode_s in err[task][exittype]:
                if errorcode_s == '0': continue
                #print "\t\t",err[task][exittype][errorcode_s].keys()
                force_code = (count_top_N > 0
                              and s_per_code[errorcode_s] >= count_top_N)
                if force_code: print "will expose", errorcode_s, "anyways"
                for site in err[task][exittype][errorcode_s]:
                    ce = SI.SE_to_CE(site)
                    count = err[task][exittype][errorcode_s][site][
                        'errorCount']
                    ###total_count[errorcode_s] += count
                    #error_site_count[errorcode_s][site] += count
                    ###error_site_count[errorcode_s][ce] += count
                    for sample in err[task][exittype][errorcode_s][site][
                            'samples']:
                        #print sample.keys()
                        for step in sample['errors']:
                            for report in sample['errors'][step]:
                                if report['type'] == 'CMSExeption': continue
                                #if int(report['exitCode']) == int(errorcode_s):
                                one_explanation[errorcode_s].add(
                                    "%s (Exit code: %s) \n%s" %
                                    (report['type'], report['exitCode'],
                                     report['details']))
                                per_task_explanation[
                                    "%s:%s" % (task_short, errorcode_s)].add(
                                        "%s (Exit code: %s) \n%s" %
                                        (report['type'], report['exitCode'],
                                         report['details']))
                                #one_explanation[errorcode_s].add( report['details'] )
                                #else:
                                #one_explanation[
                        agent = sample['agent_name']
                        wmbs = sample['wmbsid']
                        workflow = sample['workflow']
                        if force_code:
                            if not errorcode_s in expose_condor_code:
                                expose_condor_code[errorcode_s] = defaultdict(
                                    lambda: n_expose)
                            if not errorcode_s in expose_archive_code:
                                expose_archive_code[errorcode_s] = defaultdict(
                                    lambda: n_expose)

                        if do_CL and ((errorcode_s in expose_condor_code and
                                       expose_condor_code[errorcode_s][agent])
                                      ) and 'cern' in agent:
                            if errorcode_s in expose_condor_code:
                                expose_condor_code[errorcode_s][agent] -= 1
                            print errorcode_s, agent, "error count", expose_condor_code.get(
                                errorcode_s, {}).get(agent, 0)

                            threads.append(
                                AgentBuster(agent=agent,
                                            workflow=workflow,
                                            wmbs=wmbs,
                                            errorcode_s=errorcode_s,
                                            base_eos_dir=base_eos_dir,
                                            monitor_eos_dir=monitor_eos_dir,
                                            task_short=task_short))

                        for out in sample['output']:
                            #print out
                            if out['type'] == 'logArchive':
                                if do_JL and (
                                    (errorcode_s in expose_archive_code and
                                     expose_archive_code[errorcode_s][agent] >
                                     0)):
                                    if errorcode_s in expose_archive_code:
                                        expose_archive_code[errorcode_s][
                                            agent] -= 1
                                    print errorcode_s, agent, "error count", expose_archive_code.get(
                                        errorcode_s, {}).get(agent, 0)

                                    threads.append(
                                        XRDBuster(
                                            out_lfn=out['lfn'],
                                            monitor_eos_dir=monitor_eos_dir,
                                            wfn=wfn,
                                            errorcode_s=errorcode_s,
                                            task_short=task_short,
                                            from_eos=(
                                                not options.not_from_eos
                                            ),  # if options else True),
                                        ))

        #print task
        #print json.dumps( total_count, indent=2)
        #print json.dumps( explanations , indent=2)
        all_sites = set()
        all_codes = set()
        for code in error_site_count:
            for site in error_site_count[code]:
                all_sites.add(site)
                if code != '0':
                    all_codes.add(code)

        ## parse the dashboard data
        for site in total_by_site_dash:
            ## no. cannot discriminate by task in dashboard...
            #all_sites.add( site )
            pass

        ## parse the acdc data
        #notreported='NotReported'
        #all_missing_stats = set()
        #for site in missing_to_run_at[task] if task in missing_to_run_at else []:
        #    if not missing_to_run_at[task][site]: continue
        #    ce = SI.SE_to_CE( site )
        #    #all_sites.add( ce )
        #    all_missing_stats.add( ce )

        #all_missing_stats = all_missing_stats &set(SI.all_sites)
        #all_not_reported = all_missing_stats - all_sites
        #print task
        #print "site with no report",sorted(all_not_reported)
        #print sorted(all_sites)
        #print sorted(all_missing_stats)
        #all_sites = all_missing_stats | all_sites

        #all_sites = all_sites & set(SI.all_sites)

        no_error = len(all_not_reported) != 0

        if not no_error and notreported in all_codes:
            all_codes.remove(notreported)
        missing_events = missing_to_run[task] if task in missing_to_run else 0
        feff = wfi.getFilterEfficiency(task.split('/')[-1])
        html += "<a name=%s>" % task.split('/')[-1]
        html += "<b>%s</b>" % task.split('/')[-1]
        if missing_events:
            if feff != 1.:
                html += ' is missing %s events in input and <b>about %s events in output</b>' % (
                    "{:,}".format(missing_events), "{:,}".format(
                        int(missing_events * feff)))
            else:
                html += ' is missing <b>%s events in I/O</b>' % (
                    "{:,}".format(missing_events))

            html += ' <a href="https://cmsweb.cern.ch/couchdb/acdcserver/_design/ACDC/_view/byCollectionName?key=%%22%s%%22&include_docs=true&reduce=false" target=_blank>AC/DC</a>' % (
                wfn)
            if no_error:
                html += "<br><b><font color=red> and has UNreported error</font></b>"

        html += "<br><table border=1><thead><tr><th>Sites/Errors</th>"

        #for site in all_sites:
        #    html+='<th>%s</th>'%site
        for code in sorted(all_codes):
            #html+='<th><a href="#%s">%s</a>'%(code,code)
            html += '<th><a href="#%s:%s">%s</a>' % (task_short, code, code)
            if (str(code) in expose_archive_code
                    or do_all_error_code):  # and n_expose_base:
                html += ' <a href=%s/joblogs/%s/%s/%s>, JobLog</a>' % (
                    unified_url_eos, wfn, code, task_short)
            if (str(code) in expose_condor_code
                    or do_all_error_code):  # and n_expose_base:
                html += ' <a href=%s/condorlogs/%s/%s/%s>, CondorLog</a>' % (
                    unified_url_eos, wfn, code, task_short)
            html += '</th>'

        html += '<th>Total jobs</th><th>Site Ready</th>'
        html += '</tr></thead>\n'

        html += '<tr><td>Total</td>'
        for code in sorted(all_codes):
            html += '<td bgcolor=orange width=100>%d' % (s_per_code[code])
            if code in total_by_code_dash:
                html += ' (<b><i>%d</i></b>)' % total_by_code_dash[code]
            html += '</td>'

        ulist = '<ul>'
        grand = 0
        for status in sorted(status_per_task[task].keys()):
            ulist += '<li> %s %d' % (status, status_per_task[task][status])
            grand += status_per_task[task][status]
        ulist += '<li><b> Total %d </b>' % grand
        ulist += '</ul>'
        #html += '<td bgcolor=orange> %.2f%% </td>'% (100.*(float(sum(s_per_code.values()))/sum(total_per_site.values())) if sum(total_per_site.values()) else 0.)
        html += '<td bgcolor=orange> &rarr; %.2f%% &larr; </td>' % (
            100. * (float(sum(s_per_code.values())) / grand) if grand else 0.)
        html += '<td bgcolor=orange> %s </td>' % ulist

        html += '</tr>'

        def palette(frac):
            _range = {
                0.0: 'green',
                0.5: 'green',
                0.6: 'darkgreen',
                0.7: 'orange',
                0.8: 'salmon',
                0.9: 'red'
            }
            which = [k for k in _range.keys() if k <= frac]
            if which:
                there = max(which)
            else:
                there = max(_range.keys())
            return _range[there]

        for site in sorted(all_sites):
            site_in = 'Yes'
            color = 'bgcolor=lightblue'
            if not site in SI.sites_ready:
                color = 'bgcolor=indianred'
                site_in = '<b>No</b>'
                if task in missing_to_run_at and missing_to_run_at[task][
                        SI.CE_to_SE(site)] == 0 or min_rank == task_rank:
                    color = 'bgcolor=aquamarine'
                    site_in = '<b>No</b> but fine'

            if not no_error:
                site_in += " (%s events)" % ("{:,}".format(
                    missing_to_run_at[task][SI.CE_to_SE(site)]) if task
                                             in missing_to_run_at else '--')
            html += '<tr><td %s>%s</td>' % (color, site)
            for code in sorted(all_codes):
                if code == notreported:
                    html += '<td %s width=200>%s events </td>' % (
                        color, "{:,}".format(
                            missing_to_run_at[task][SI.CE_to_SE(site)]))
                else:
                    if error_site_count[code][site]:
                        er_frac = float(
                            error_site_count[code][site]
                        ) / s_per_code[code] if s_per_code[code] else 0.
                        si_frac = float(
                            error_site_count[code][site]) / total_per_site[
                                site] if total_per_site[site] else 0.
                        html += '<td %s width=200>%d' % (
                            color, error_site_count[code][site])
                        if code in r_dashb and site in r_dashb[code]:
                            html += ' (<b><i>%d</i></b>)' % (
                                r_dashb[code][site])

                        html += ', <font color=%s>&uarr; %.1f%%</font>, <font color=%s>&rarr; %.1f%%</font></td>' % (
                            palette(er_frac), 100. * er_frac, palette(si_frac),
                            100. * si_frac)
                    else:
                        html += '<td %s>0</td>' % color
            html += '<td bgcolor=orange>%d</td>' % total_per_site[site]
            html += '<td %s>%s</td>' % (color, site_in)
            html += '</tr>\n'
        html += '</table><br>'
        task_error_site_count[task] = error_site_count

    ## run all retrieval
    run_threads = ThreadHandler(
        threads=threads,
        n_threads=options.log_threads,  # if options else 5,
        sleepy=10,
        timeout=UC.get('retrieve_errors_timeout'),
        verbose=True)
    run_threads.start()

    html += '<hr><br>'
    html += '<a name=BLOCK></a>'
    html += "<b>Blocks (%d/%d) needed for recovery</b><br>" % (
        len(needed_blocks_loc), len(all_blocks))
    for block in sorted(needed_blocks_loc.keys()):
        html += '%s <b>@ %s</b><br>' % (block, ','.join(
            sorted(needed_blocks_loc[block])))

    html += '<a name=FILE></a>'
    html += "<br><b>Files in no block</b><br>"
    rthreads = []
    check_files = [f for f in files_and_loc_notin_dbs.keys() if '/store' in f]
    random.shuffle(check_files)
    check_files = check_files[:100]
    check_files = []  ## disable it completely
    by_f = {}
    if check_files:
        for f in check_files:
            rthreads.append(ReadBuster(file=f))
        print "checking on existence of", len(rthreads), "files"
        run_rthreads = ThreadHandler(threads=rthreads,
                                     n_threads=20,
                                     timeout=10)
        run_rthreads.start()
        while run_rthreads.is_alive():
            time.sleep(10)

        for t in run_rthreads.threads:
            by_f[t.file] = t.readable
            #print "checked",t.file,t.readable

    for f in sorted(files_and_loc_notin_dbs.keys()):
        readable = by_f.get(f, -1)
        if readable == -1:
            fs = '%s' % f
        elif readable == 0:
            fs = '<font color=light green>%s</font>' % f
            #print f,"is readable"
        else:
            fs = '<font color=red>%s</font>' % f
            #print f,"is not readable"

        html += '%s <b>@</b> %s<br>' % (fs, ','.join(
            sorted(files_and_loc_notin_dbs[f])))
        #html +='%s <b>@</b> %s<br>'%(f , ','.join(sorted(files_and_loc_notin_dbs[f])) )

    html += '<hr><br>'
    html += '<a name=CODES></a>'
    html += '<table border=1>'
    for code in per_task_explanation:
        html += '<tr><td><a name="%s">%s</a><br><a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/JobExitCodes>code twiki</a></td><td>%s</td></tr>' % (
            code, code, '<br><br>'.join(per_task_explanation[code]).replace(
                '\n', '<br>'))
    #for code in one_explanation:
    #    html +='<tr><td><a name="%s">%s</a></td><td>%s</td></tr>'% ( code, code, '<br><br>'.join(one_explanation[code]).replace('\n','<br>' ))

    html += '</table>'
    html += ('<br>' * 30)
    html += '</html>'
    time_point("Report finished")
    wfi.sendLog('error', html, show=False)
    fn = '%s' % wfn

    time_point("error send to ES")
    open('%s/report/%s' % (monitor_dir, fn), 'w').write(html)
    open('%s/report/%s' % (monitor_eos_dir, fn), 'w').write(html)

    time_point("Finished with showError")

    ## then wait for the retrivals to complete
    ping = 0
    while run_threads.is_alive():
        ping += 1
        if ping % 100:
            time_point("waiting for sub-threads to finish")
        time.sleep(6)

    time_point("Finished with retrieval threads")

    return task_error_site_count, one_explanation