Exemplo n.º 1
0
def transitiontime(wf, status):
    """Return the 'UpdateTime' of the last transition of ``wf`` into ``status``.

    ``wf`` is indexed with 'RequestTransition', which must yield an iterable
    of mappings carrying 'Status' and 'UpdateTime' keys. The entries are
    taken in the order they are stored; the last matching one wins.

    Returns None when no transition into ``status`` is recorded.
    """
    ## build a real list rather than a filter() result: on Python 3 filter()
    ## is a lazy iterator that is always truthy and cannot be indexed
    logs = [change for change in wf['RequestTransition']
            if change["Status"] == status]
    if logs:
        return logs[-1]['UpdateTime']
    return None

delays={'assignment-approved' : (7,14),
        'new':(7,14),
        'completed':(14,21),
        'closed-out':(14,21),
        }

warnings=defaultdict(set)
for checkin,(warn,timeout) in delays.items():
    wfs = getWorkflows(url, checkin, user=None, details=True)
    for wf in wfs:
        if not 'backfill' in wf['RequestName'].lower(): continue
        transition = transitiontime(wf,checkin)
        if transition and (now - transition)>(timeout*24*60*60):
            ## that can go away
            print wf['RequestName'],"is old enough to be removed",wf['RequestStatus']
            reqMgrClient.invalidateWorkflow(url, wf['RequestName'], current_status=wf['RequestStatus'])
        elif transition and (now - transition)>(warn*24*60*60):
            ## warn requester
            print wf['RequestName'],"is old enough to be removed",wf['RequestStatus']
            warnings[wf['Requestor']].add( wf['RequestName'] )

for who in warnings:
    sendEmail('Old Backfill in the system','The following backfill should be removed or moved to rejected/announced\n\n%s'%('\n'.join(sorted(warnings[who]))), destination=[who+'@cern.ch'])
Exemplo n.º 2
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock() and not options.manual: return
    mlock = moduleLock()
    if mlock() and not options.manual: return
    if not componentInfo().check() and not options.manual: return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = siteInfo()
    SI = global_SI()
    ###NLI = newLockInfo()
    ###if not NLI.free() and not options.go: return
    LI = lockInfo()
    #if not LI.free() and not options.go and not options.manual: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    aaa_mapping = json.loads(eosRead('%s/equalizor.json' %
                                     monitor_pub_dir))['mapping']
    all_stuck = set()
    all_stuck.update(
        json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)))

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ##order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True),
                       key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]

        def rank(wfn):
            return cache.index(wfn) if wfn in cache else 0

        wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True)
        print "10 first", [wfo.name for wfo in wfos[:10]]
        print "10 last", [wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle(wfos)

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue

        if not options.manual and 'rucio' in (wfo.name).lower(): continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"

        wfh.sendLog('assignor',
                    "%s to be assigned %s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed,
         sites_not_allowed) = wfh.getSiteWhiteList()

        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('assignor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('assignor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))

        blocks = wfh.getBlocks()
        if blocks:
            wfh.sendLog(
                'assignor',
                "Needs {} blocks in input {}".format(len(blocks),
                                                     '\n'.join(blocks)))
        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters and primary:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']

        wfh.sendLog(
            'assignor',
            "Initial values for primary_AAA=%s and secondary_AAA=%s" %
            (primary_aaa, secondary_aaa))

        if primary_aaa:
            if "T2_CH_CERN_HLT" in sites_allowed:
                sites_allowed.remove("T2_CH_CERN_HLT")
            if "T2_CH_CERN_HLT" not in sites_not_allowed:
                sites_not_allowed.append("T2_CH_CERN_HLT")

        ## keep track of this, after secondary input location restriction : that's how you want to operate it
        initial_sites_allowed = copy.deepcopy(sites_allowed)

        set_lfn = '/store/mc'  ## by default

        for prim in list(primary):
            set_lfn = getLFNbase(prim)
            ## if they are requested for processing, they should bbe all closed already
            # FIXME: remove this closeAllBlocks
            #closeAllBlocks(url, prim, blocks)

        ## should be 2 but for the time-being let's lower it to get things going
        _copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        # TODO Alan on 1/april/2020: keep the AAA functionality
        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_allowed:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_allowed)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if isStoreResults:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog(
                'assignor',
                "The workflow can run at %s under low pressure currently" %
                (','.join(allowed_and_low)))

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1t2_only = [
            ce for ce in sites_allowed
            if [ce.startswith('T1') or ce.startswith('T2')]
        ]
        if t1t2_only:
            # try to pick from T1T2 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])]
            # then pick any otherwise
        else:
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        print "available=", SI.disk[sites_out[0]]
        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'SiteBlacklist': sites_not_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading secondary through xrootd at %s" %
                sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ##parse options entered in command line if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    # FIXME: decide which of the lines below needs to remain...
                    eventsPerJobEstimated = spl[
                        'events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl[
                        'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        #if isHEPCloudReady(url) and wfh.isGoodForNERSC():
        #    parameters['Team'] = 'hepcloud'
        #    parameters['SiteWhitelist'] = ['T3_US_NERSC']
        #    if primary:
        #        parameters['TrustSitelists'] = True
        #    if secondary:
        #        parameters['TrustPUSitelists'] = True
        #    sendEmail("sending work to hepcloud","pleasse check on %s"% wfh.request['RequestName'], destination=['*****@*****.**'])

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))
        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                if wfh.producePremix() and (not wfh.isRelval()):
                    title = "Heavy workflow assigned to {}".format(
                        parameters['SiteWhitelist'])
                    body = "Workflow name: {}".format(
                        wfh.request['RequestName'])
                    body += "\nOutput dataset(s): {}".format(
                        wfh.request['OutputDatasets'])
                    body += "\nAssigned to: {}".format(
                        parameters['SiteWhitelist'])
                    sendEmail(
                        title,
                        body,
                        destination=[
                            '*****@*****.**'
                        ])

                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
Exemplo n.º 3
0
def injector(url, options, specific):
    mlock = moduleLock()
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    UC = unifiedConfiguration()

    transform_keywords = UC.get('convert_to_stepchain')

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    for user in UC.get("user_rereco"):
        workflows.extend(
            getWorkflows(url,
                         status=options.wmstatus,
                         user=user,
                         rtype="ReReco"))
    for user in (options.user_relval.split(',')
                 if options.user_relval else UC.get("user_relval")):
        workflows.extend(
            getWorkflows(url,
                         status=options.wmstatus,
                         user=user,
                         rtype="TaskChain"))
    for user in (options.user_storeresults.split(',') if
                 options.user_storeresults else UC.get("user_storeresults")):
        workflows.extend(
            getWorkflows(url,
                         status=options.wmstatus,
                         user=user,
                         rtype="StoreResults"))

    print len(workflows), "in line"
    cannot_inject = set()
    to_convert = set()
    status_cache = defaultdict(str)

    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if specific and not specific in wf: continue

        exists = session.query(Workflow).filter(Workflow.name == wf).first()
        if not exists:
            wfi = workflowInfo(url, wf)
            ## check first that there isn't related here with something valid
            can_add = True
            ## first try at finding a match
            familly = session.query(Workflow).filter(
                Workflow.name.contains(wfi.request['PrepID'])).all()
            if not familly:
                pids = wfi.getPrepIDs()
                req_familly = []
                for pid in pids:
                    req_familly.extend(getWorkflowById(url, pid, details=True))

                familly = []
                print len(req_familly), "members"
                for req_member in req_familly:
                    #print "member",req_member['RequestName']
                    owfi = workflowInfo(url,
                                        req_member['RequestName'],
                                        request=req_member)
                    other_pids = owfi.getPrepIDs()
                    if set(pids) == set(other_pids):
                        ## this is a real match
                        familly.extend(
                            session.query(Workflow).filter(
                                Workflow.name ==
                                req_member['RequestName']).all())

            for lwfo in familly:
                if lwfo:
                    ## we have it already
                    if not lwfo.status in [
                            'forget', 'trouble', 'forget-unlock',
                            'forget-out-unlock'
                    ]:
                        wfi.sendLog(
                            'injector', "Should not put %s because of %s %s" %
                            (wf, lwfo.name, lwfo.status))
                        sendLog('injector',
                                "Should not put %s because of %s %s" %
                                (wf, lwfo.name, lwfo.status),
                                level='critical')
                        print "Should not put", wf, "because of", lwfo.name, lwfo.status
                        cannot_inject.add(wf)
                        can_add = False
            ## add a check on validity of input datasets
            _, prim, par, sec = wfi.getIO()
            for d in list(prim) + list(par) + list(sec):
                if not d in status_cache:
                    status_cache[d] = getDatasetStatus(d)
                if status_cache[d] != 'VALID':
                    wfi.sendLog(
                        'injector', "One of the input is not VALID. %s : %s" %
                        (d, status_cache[d]))
                    sendLog('injector',
                            "One of the input of %s is not VALID. %s : %s" %
                            (wf, d, status_cache[d]),
                            level='critical')
                    can_add = False
                ## check for any file in phedex, to verify existence
                _, ph_files, _, _ = getDatasetFiles(url, d)
                if not ph_files and not ('StoreResults'
                                         == wfi.request.setdefault(
                                             'RequestType', None)):
                    wfi.sendLog(
                        'injector',
                        "One of the input has no file in phedex: %s" % d)
                    sendLog('injector',
                            "One of the input has no file in phedex: %s" % d,
                            level='critical')
                    can_add = False

            ### ban some workflow that you don't like anymore
            #outputs = wfi.request['OutputDatasets']

            if not can_add: continue

            ## temporary hack to transform specific taskchain into stepchains
            #good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = transform_keywords)
            good_for_stepchain = wfi.isGoodToConvertToStepChain(keywords=None)

            ## match keywords and technical constraints
            #if (not options.no_convert) and good_for_stepchain and not wfi.isRelval():
            #    to_convert.add( wf )
            #    wfi.sendLog('injector','Transforming %s TaskChain into StepChain'%wf)
            #    #sendEmail('convertion to stepchain','Transforming %s TaskChain into StepChain'%wf)

            wfi.sendLog('injector', "considering %s" % wf)

            new_wf = Workflow(name=wf,
                              status=options.setstatus,
                              wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()
            time.sleep(0.5)
        else:
            #print "already have",wf
            pass

    if cannot_inject:
        #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)))
        sendLog(
            'injector',
            'These workflow cannot be added in because of duplicates \n\n %s' %
            ('\n'.join(cannot_inject)),
            level='warning')

    for wf in to_convert:
        os.system(
            './Unified/rejector.py --clone --to_step --comments \"Transform to StepChain\" %s'
            % wf)

    ## passing a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):
        invalidator(url)

    no_replacement = set()

    #print "getting all transfers"
    #all_transfers=session.query(Transfer).all()
    #print "go!"

    ## pick up replacements
    for wf in session.query(Workflow).filter(
            Workflow.status == 'trouble').all():
        print wf.name
        if specific and not specific in wf.name: continue
        print wf.name
        wfi = workflowInfo(url, wf.name)
        wl = wfi.request  #getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl['PrepID'])
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url, member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType'] == 'Resubmission': continue
                if fwl['RequestStatus'] in ['None', None, 'new']: continue
                if fwl['RequestStatus'] in [
                        'rejected', 'rejected-archived', 'aborted',
                        'aborted-archived'
                ]:
                    continue
            true_familly.append(fwl)

        if len(true_familly) == 0:
            #sendLog('injector','%s had no replacement'%wf.name, level='critical')
            if wfi.isRelval():
                #wfi.sendLog('injector','the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this.')
                wfi.sendLog(
                    'injector',
                    'the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this. Setting forget'
                )
                wf.status = 'forget'
                session.commit()
            else:
                wfi.sendLog(
                    'injector',
                    'the workflow was found in trouble with no replacement')
                no_replacement.add(wf.name)
            continue
        else:
            wfi.sendLog(
                'injector',
                'the workflow was found in trouble and has a replacement')

        print wf.name, "has", len(familly), "familly members"
        print wf.name, "has", len(true_familly), "true familly members"

        ##we cannot have more than one of them !!! pick the last one
        if len(true_familly) > 1:
            #sendEmail('multiple wf','please take a look at injector for %s'%wf.name)
            sendLog('injector',
                    'Multiple wf in line, will take the last one for %s \n%s' %
                    (wf.name, ', '.join(fwl['RequestName']
                                        for fwl in true_familly)),
                    level='critical')

        for fwl in true_familly[-1:]:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(
                Workflow.name == member).first()
            if not new_wf:
                sendLog('injector',
                        "putting %s as replacement of %s" % (member, wf.name))
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow(name=member,
                                  status=status,
                                  wm_status=fwl['RequestStatus'])
                wf.status = 'forget'
                session.add(new_wf)
            else:
                if new_wf.status == 'forget': continue
                sendLog(
                    'injector',
                    "getting %s as replacement of %s" % (new_wf.name, wf.name))
                wf.status = 'forget'

            for tr in session.query(TransferImp).filter(
                    TransferImp.workflow_id == wf.id).all():
                ## get all transfer working for the old workflow
                existing = session.query(TransferImp).filter(
                    TransferImp.phedexid == tr.phedexid).filter(
                        TransferImp.workflow_id == new_wf.id).all()
                tr.active = False  ## disable the old one
                if not existing:
                    ## create the transfer object for the new dependency
                    tri = TransferImp(phedexid=tr.phedexid, workflow=new_wf)
                    session.add(tri)
                session.commit()

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
    if no_replacement:
        #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement)))
        sendLog('injector',
                'workflow with no replacement\n%s \n are dangling there' %
                ('\n'.join(no_replacement)),
                level='critical')
Exemplo n.º 4
0
def equalizor(url , specific = None, options=None):
    up = componentInfo(mcm=False, soft=['mcm']) 
    if not up.check(): return 

    if not specific:
        workflows = getWorkflows(url, status='running-closed', details=True)
        workflows.extend(getWorkflows(url, status='running-open', details=True))

    ## start from scratch
    modifications = defaultdict(dict)
    ## define regionality site => fallback allowed. feed on an ssb metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)
    SI = siteInfo()
    for site in SI.sites_ready:
        region = site.split('_')[1]
        if not region in ['US','DE','IT']: continue
        regions[region] = [region] 

    def site_in_depletion(s):
        return True
        if s in SI.sites_pressure:
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                print s,m,r,"lacking pressure"
                return True
            else:
                print s,m,r,"pressure"
                pass
                
        return False

    for site in SI.sites_ready:
        region = site.split('_')[1]
        ## fallback to the region, to site with on-going low pressure
        mapping[site] = [fb for fb in SI.sites_ready if any([('_%s_'%(reg) in fb and fb!=site and site_in_depletion(fb))for reg in regions[region]]) ]
    

    use_T0 = False
    if options.augment : use_T0 = True

    use_HLT = False
    if options.augment : use_HLT=True

    if use_HLT:
        mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')
    #mapping['T2_IT_Legnaro'].append('T1_IT_CNAF')
    for reg in ['IT','DE','UK']:
        mapping['T2_CH_CERN'].extend([fb for fb in SI.sites_ready if '_%s_'%reg in fb])

    for site,fallbacks in mapping.items():
        for fb in fallbacks:
            reversed_mapping[fb].append(site)

    ## this is the fallback mapping
    print json.dumps( mapping, indent=2)
    #print json.dumps( reversed_mapping, indent=2)

    altered_tasks = set()

    def running_idle( wfi , task_name):
        gmon = wfi.getGlideMon()
        #print gmon
        if not gmon: return (0,0)
        if not task_name in gmon: return (0,0)
        return (gmon[task_name]['Running'], gmon[task_name]['Idle'])

    def needs_action( wfi, task, min_idled = 100, pressure = 0.2):
        task_name = task.pathName.split('/')[-1]
        running, idled = running_idle( wfi, task_name)
        go = True
        if not idled and not running : 
            go = False
        if idled < 100: 
            go = False
        if (not running and idled) or (running and (idled / float(running) > pressure)):
            go = True
        else:
            go = False
        return go, task_name, running, idled

    def getcampaign( task ):
        taskname = task.pathName.split('/')[-1]
        if hasattr( task, 'prepID'):
            return task.prepID.split('-')[1]
        elif taskname.count('-')>=1:
            return taskname.split('-')[1]
        else:
            return None

    def close( interface ):
        open('%s/equalizor.json.new'%monitor_dir,'w').write( json.dumps( interface, indent=2))
        os.system('mv %s/equalizor.json.new %s/equalizor.json'%(monitor_dir,monitor_dir))
        os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json'%(monitor_dir,monitor_dir,time.mktime(time.gmtime())))

    interface = {
        'reversed_mapping' : reversed_mapping,
        'modifications' : {}
        }
    if options.augment or options.remove:
        interface['modifications'] = json.loads( open('%s/equalizor.json'%monitor_dir).read())['modifications']

    if options.remove:
        if specific in interface['modifications']:
            print "poping",specific
            interface['modifications'].pop(specific)
            close( interface )
        return 


    PU_locations = {}
    PU_overflow = {
        #'RunIISpring15PrePremix' : { 
        #    'sites' : ["T1_US_FNAL", "T1_DE_KIT" , "T1_IT_CNAF", "T1_RU_JINR" ,"T2_CH_CERN"],
        #    'max' : 20000,
        #    'pending' : 0
        #    },
        'RunIIFall15DR76' : {
            'sites':['T1_ES_PIC','T2_US_Purdue','T2_UK_SGrid_RALPP','T2_BE_IIHE','T2_DE_DESY','T2_IT_Legnaro','T2_US_Caltech','T1_DE_KIT',
                     'T2_UK_London_Brunel','T2_IT_Pisa',
                     'T1_US_FNAL',
                     'T2_IT_Rome','T2_US_Florida','T1_IT_CNAF','T1_RU_JINR','T2_UK_London_IC','T2_US_Nebraska','T2_FR_CCIN2P3','T2_US_UCSD','T2_ES_CIEMAT',
                     'T1_FR_CCIN2P3','T2_US_Wisconsin','T2_US_MIT','T2_DE_RWTH',
                     'T1_UK_RAL','T2_US_Vanderbilt','T2_CH_CERN'],
            'max': 20000,
            'pending' : 0},
        'RunIISpring16DR80' : {
            'sites':['T1_ES_PIC','T2_US_Purdue','T2_UK_SGrid_RALPP','T2_BE_IIHE','T2_DE_DESY','T2_IT_Legnaro','T2_US_Caltech','T1_DE_KIT',
                     'T2_UK_London_Brunel','T2_IT_Pisa',
                     'T1_US_FNAL',
                     'T2_IT_Rome','T2_US_Florida','T1_IT_CNAF','T1_RU_JINR','T2_UK_London_IC','T2_US_Nebraska','T2_FR_CCIN2P3','T2_US_UCSD','T2_ES_CIEMAT',
                     'T1_FR_CCIN2P3','T2_US_Wisconsin','T2_US_MIT','T2_DE_RWTH',
                     'T1_UK_RAL','T2_US_Vanderbilt','T2_CH_CERN'],
            'max': 20000,
            'pending' : 0,
            'force' : True},
        'RunIISpring15DR74' : {
            'sites' : ['T1_ES_PIC','T1_DE_KIT','T1_US_FNAL','T1_IT_CNAF','T1_RU_JINR','T1_FR_CCIN2P3','T1_UK_RAL','T2_CH_CERN'],
            'max' : 20000,
            'pending' : 0}
        }
    
    set_to = SI.sites_AAA
    LHE_overflow = {
        'RunIIWinter15GS' : set_to,
        'RunIISummer15GS' : set_to,
        'Summer12' : set_to,
        'Summer11Leg' : set_to
        #'RunIIFall15MiniAODv2' : set_to,
        }

    pending_HLT = 0
    max_HLT = 60000
    pending_T0 = 0
    max_T0 = 60000
    try:
        gmon = json.loads(os.popen('curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT').read())
        pending_HLT += gmon["Running"]
        pending_HLT += gmon["MatchingIdle"]
    except:
        pass

    t0_special = [
        'vlimant_BPH-RunIISummer15GS-00030_00212_v0__160129_135314_9755',
        'pdmvserv_TSG-RunIISummer15GS-00044_00240_v0__160210_121223_8582'
        ]
    no_routing = [ 
        #'vlimant_BPH-RunIISummer15GS-00030_00212_v0__160129_135314_9755',
        #'pdmvserv_TOP-RunIIWinter15GS-00074_00187_v0__160207_162312_1992',
                   ]

    stay_within_site_whitelist = False
    specific_task=None
    if specific and ":" in specific:
        specific,specific_task = specific.split(':')

    if specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'away').all()
        
    random.shuffle( wfs )
    for wfo in wfs:
        if wfo.name in no_routing and not options.augment:
            continue

        if specific and not specific in wfo.name: 
            continue
        if specific:
            wfi = workflowInfo(url, wfo.name)
        else:
            cached = filter(lambda d : d['RequestName']==wfo.name, workflows)
            if not cached : continue
            wfi = workflowInfo(url, wfo.name, request = cached[0])
        
        ## only running should get re-routed
        if not wfi.request['RequestStatus'] in ['running-open','running-closed'] and not specific: continue

        tasks_and_campaigns = []
        for task in wfi.getWorkTasks():
            tasks_and_campaigns.append( (task, getcampaign(task) ) )
        
        _,_,_,sec = wfi.getIO()

        ## check needs override
        needs_overide = False
        if not needs_overide and  options.augment: needs_overide=True

        def overide_from_agent( wfi, needs_overide):
            bad_agents = []#'http://cmssrv219.fnal.gov:5984']
            if not bad_agents: return needs_overide
            if needs_overide: return True
            agents = wfi.getAgents()

            wqss = ['Running','Acquired']
            if any([agent in agents.get(wqs,{}).keys() for wqs,agent in itertools.product( wqss, bad_agents)]):
                print "overriding the need for bad agent"
                needs_overide = True
            return needs_overide

        ## now parse this for action
        for i_task,(task,campaign) in enumerate(tasks_and_campaigns):
            if options.augment:
                print task.pathName
                print campaign

            ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step
            if campaign in LHE_overflow:
                if task.taskType in ['Processing']:
                    needs, task_name, running, idled = needs_action(wfi, task)
                    needs_overide = overide_from_agent( wfi, needs_overide)
                    extend_to = copy.deepcopy( LHE_overflow[campaign] )
                    if stay_within_site_whitelist:
                        extend_to = list(set(extend_to) & set(wfi.request['SiteWhitelist'])) ## restrict to stupid-site-whitelist

                    if extend_to and needs or needs_overide:
                        print "\t",task_name,"of",wfo.name,"running",running,"and pending",idled,"taking action : ReplaceSiteWhitelist"
                        modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : copy.deepcopy( LHE_overflow[campaign] ) ,"Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        #print json.dumps( modifications[wfo.name][task.pathName]['ReplaceSiteWhitelist']
                        altered_tasks.add( task.pathName )
                    else:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled


            ### overflow the 76 digi-reco to the site holding the pileup
            if campaign in PU_overflow:
                force = PU_overflow[campaign]['force'] if 'force' in PU_overflow[campaign] else False
                secondary_locations = set(SI.sites_ready)
                for s in sec:
                    if not s in PU_locations:
                        presence = getDatasetPresence( url, s)
                        #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
                        one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
                        PU_locations[s] = one_secondary_locations
                    print "secondary is at",sorted(PU_locations[s])
                    secondary_locations = set(PU_locations[s]) & secondary_locations
                    
                ## we should add all sites that hold the secondary input if any
                secondary_locations = list(set(PU_overflow[campaign]['sites']) & set( SI.sites_ready ))
                if any([task.pathName.endswith(finish) for finish in ['_0','StepOneProc','Production']]) :
                    needs, task_name, running, idled = needs_action(wfi, task)
                    ## removing the ones in the site whitelist already since they encode the primary input location
                    if stay_within_site_whitelist:
                        original_site_in_use = set(wfi.request['SiteWhitelist'])
                    else:
                        original_site_in_use = set(secondary_locations)
                    ## remove the sites that have already running jobs
                    gmon = wfi.getGlideMon()
                    if gmon and task_name in gmon and 'Sites' in gmon[task_name]:
                        site_in_use = set(gmon[task_name]['Sites'])
                        ## that determines where you want to run in addition
                        #augment_by = list((set(secondary_locations)- site_in_use))
                        augment_by = list((set(secondary_locations)- site_in_use) & original_site_in_use) ## restrict to stupid-site-whitelist
                    else:
                        augment_by = list(original_site_in_use)

                    needs_overide = overide_from_agent( wfi, needs_overide)
                    if augment_by and (needs or needs_overide or force) and PU_overflow[campaign]['pending'] < PU_overflow[campaign]['max']:
                        PU_overflow[campaign]['pending'] += idled
                        print "raising overflow to",PU_overflow[campaign]['pending'],"for",PU_overflow[campaign]['max']
                        ## the step with an input ought to be the digi part : make this one go anywhere
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : augment_by , "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        altered_tasks.add( task.pathName )
                        print "\t",task_name,"of",wfo.name,"running",running,"and pending",idled,"taking action : AddWhitelist"
                        #print json.dumps( augment_by, indent=2 )
                    else:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled

            ### overflow the skims back to multi-core 
            if campaign in ['Run2015D','Run2015C_25ns'] and task.taskType =='Skim':
                original_swl = wfi.request['SiteWhitelist']
                needs, task_name, running, idled = needs_action(wfi, task)
                if (needs or needs_overide):
                    modifications[wfo.name][task.pathName] = { 'AddWhitelist' : original_swl, 
                                                               "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                    altered_tasks.add( task.pathName )
                    print "\t",task_name,"of",wfo.name,"running",running,"and pending",idled,"taking action : AddWhitelist"


            if options.augment:
                print sorted(wfi.request['SiteWhitelist']),i_task,use_HLT
            ### add the HLT at partner of CERN
            if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task==0 and use_HLT:
                needs, task_name, running, idled = needs_action(wfi, task)
                if options.augment: needs=True
                needs = True
                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide) and pending_HLT < max_HLT:
                    pending_HLT += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T2_CH_CERN_HLT" )
                        print "\t",wfo.name,"adding addHLT up to",pending_HLT,"for",max_HLT
                        print task.pathName
                    ## this Replace does not work at all for HLT
                    #elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" )
                        #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T2_CH_CERN_HLT"],
                                                                   "Priority" : wfi.request['RequestPriority'],
                                                                   "Running" : running,
                                                                   "Pending" : idled}
                        print "\t",wfo.name,"adding HLT up to",pending_HLT,"for",max_HLT
                        print task.pathName

            if i_task==0 and not sec and use_T0:
                needs, task_name, running, idled = needs_action(wfi, task)
                
                if options.augment: needs=True
                #needs = True
                #if not (wfo.name in t0_special) and not options.augment: needs = False
                if not wfi.request['RequestType'] in ['MonteCarlo','MonteCarloFromGEN'] and not options.augment: needs = False
                
                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide):
                    pending_T0 += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T0_CH_CERN" )
                        print "\t",wfo.name,"adding addT0 up to",pending_T0,"for",max_T0
                        print task.pathName
                    elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T0_CH_CERN" )
                        print "\t",wfo.name,"adding replace T0 up to",pending_T0,"for",max_T0
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T0_CH_CERN"],
                                                                   "Priority" : wfi.request['RequestPriority'],
                                                                   "Running" : running,
                                                                   "Pending" : idled}
                        print "\t",wfo.name,"adding T0 up to",pending_T0,"for",max_T0
                        print task.pathName


    interface['modifications'].update( modifications )


    ## temporary core managing
    interface['cores']={'T2_CH_CERN_HLT': {'min':4,'max':16}, 'default': {'min':1, 'max':4}}
    #interface['max_cores']={'T2_CH_CERN_HLT': 16, 'default': 4}
    #interface['min_cores']={'T2_CH_CERN_HLT': 4, 'default': 1}
    #interface['resize_subtasks'] = 'RunIISpring16DR80'
    interface['resizes'] = ['RunIISpring16DR80','NotACampaign']

    ## close and save
    close( interface )
Exemplo n.º 5
0
from collections import defaultdict
import time
import json
import sys
import random
from assignSession import *


## script set-up : configuration, optional command line selector, and the
## containers filled by the rest of the script
UC = unifiedConfiguration()
## optional first command line argument ; stays None when absent
spec=None
if len(sys.argv) >1:
    spec = sys.argv[1]

url = 'cmsweb.cern.ch'

## gather, with details, the requests in the acquired and running statuses
wfs = getWorkflows(url, 'acquired', details=True)
wfs.extend( getWorkflows(url, 'running-open', details=True) )
wfs.extend( getWorkflows(url, 'running-closed', details=True) )

## two-level containers : a missing key creates an empty inner dictionary,
## whose missing entries default to 0 / an empty set
jobs_for = defaultdict(lambda : defaultdict(int))
wf_for = defaultdict(lambda : defaultdict(set))
agent_for = defaultdict(lambda : defaultdict(set))
s_block_locations = {}
## same two-level layout, with a list as inner default
block_locations = defaultdict(lambda : defaultdict(list))
wfs_no_location_in_GQ = defaultdict(list)
si = siteInfo()  
#bad_blocks = defaultdict( set )
unprocessable = set()

not_runable_acdc=set()
agents_down = defaultdict(set)
Exemplo n.º 6
0
## add an addHoc list of things to lock. empyting this list would result in unlocking later
addHocLocks = json.loads(eosRead('%s/addhoc_lock.json' % base_eos_dir))

time_point("Starting addhoc")

for item in addHocLocks:
    ds = item.split('#')[0]
    LI.lock(ds, reason='addhoc lock')
    newly_locking.add(ds)

time_point("Starting reversed statuses check")

for status in statuses:
    print time.asctime(time.gmtime()), "CEST, fetching", status
    time_point("checking %s" % status, sub_lap=True)
    wfls = getWorkflows(url, status=status, details=True)
    print len(wfls), "in", status
    for wl in wfls:
        wfi = workflowInfo(url, wl['RequestName'], request=wl, spec=False)
        (_, primaries, _, secondaries) = wfi.getIO()
        outputs = wfi.request['OutputDatasets']
        ## unknonw to the system
        known = session.query(Workflow).filter(
            Workflow.name == wl['RequestName']).all()
        if not known:
            print wl['RequestName'], "is unknown to unified, relocking all I/O"
            for dataset in list(primaries) + list(secondaries) + outputs:
                print "\t", dataset
                also_locking_from_reqmgr.add(dataset)
            continue
Exemplo n.º 7
0
## script set-up : build the list of workflows to go through, selected by the
## status given as first command line argument
url = reqmgr_url

## stop right away (exit code 0) when the component check fails
up = componentInfo(mcm=False, soft=['mcm'])
if not up.check(): sys.exit(0)

## NOTE(review): no guard on the argument count ; a missing argument raises IndexError
status = sys.argv[1]
## 0 means no limit on the number of workflows
max_wf = 0

print "Picked status",status

wfs = []
if status == 'wmagent':
    ## special keyword : take from the request manager everything in one of these statuses
    register=['assigned','acquired','running-open','running-closed','force-complete','completed','closed-out']
    for r in register:
        wfs.extend( getWorkflows(url, r) )

elif status.endswith('*'):
    ## trailing wildcard : names from the local database whose status starts with the given prefix
    wfs.extend([wfo.name for wfo in  session.query(Workflow).filter(Workflow.status.startswith(status[:-1])).all() ])
else:
    ## exact match on the status in the local database
    wfs.extend([wfo.name for wfo in  session.query(Workflow).filter(Workflow.status==status).all() ])



## the truncation happens before the shuffle : the first max_wf entries are kept
if max_wf: wfs = wfs[:max_wf]

random.shuffle( wfs )
all_blocks_at_sites = defaultdict(set)

#done = json.loads(open('myblock_done.json').read())
done = {}
Exemplo n.º 8
0
def injector(url, options, specific):
    """Register new request manager workflows locally and hand 'trouble' workflows over to their replacement.

    First part : every workflow found in status options.wmstatus that is not yet
    in the local database is added with status options.setstatus, unless a
    related workflow is still active locally or one of its input datasets is
    not VALID.

    Second part : for every local workflow in status 'trouble', the requests
    sharing its PrepID are searched for a replacement ; the last candidate is
    registered if needed, the old workflow is set to 'forget', and the
    transfers of the old workflow are re-attached to the new one.

    Parameters:
      url      : request manager host, handed over to the helper calls.
      options  : object read for wmstatus, user, setstatus, invalidate and replace.
      specific : optional name fragment ; when set, only the workflows whose
                 name contains it are treated.

    Returns None. Problems are reported through sendLog / wfi.sendLog.
    """

    use_mcm = True
    up = componentInfo( mcm = use_mcm, soft=['mcm'] )
    if not up.check(): return
    ## mcm is a soft dependency : carry on with whatever its check reported
    use_mcm = up.status['mcm']

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    workflows.extend( getWorkflows(url, status=options.wmstatus, user='******', rtype="ReReco")) ## regardless of users, pick up all ReReco on the table

    print len(workflows),"in line"
    cannot_inject = set()
    ## dataset name => dataset status, so that each dataset is looked up only once
    status_cache = defaultdict(str)
    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if specific and not specific in wf: continue
        exists = session.query(Workflow).filter(Workflow.name == wf ).first()
        if not exists:
            wfi = workflowInfo(url, wf)
            #wl = getWorkLoad(url, wf)
            ## check first that there isn't related here with something valid
            can_add = True
            ## first try at finding a match
            #            print wfi.request
            familly = session.query(Workflow).filter(Workflow.name.contains(wfi.request['PrepID'])).all()
            if not familly:
                ## second try : go through the requests sharing one of the prepids, and keep the local
                ## entries of those having exactly the same set of prepids
                #req_familly = getWorkflowById( url, wl['PrepID'])
                #familly = [session.query(Workflow).filter(Workflow.name == member).first() for member in req_familly]
                pids = wfi.getPrepIDs()
                req_familly = []
                for pid in pids:
                    req_familly.extend( getWorkflowById( url, pid, details=True) )
                    
                familly = []
                print len(req_familly),"members"
                for req_member in req_familly:
                    #print "member",req_member['RequestName']
                    owfi = workflowInfo(url, req_member['RequestName'], request=req_member)
                    other_pids = owfi.getPrepIDs()
                    if set(pids) == set(other_pids):
                        ## this is a real match
                        familly.extend( session.query(Workflow).filter(Workflow.name == req_member['RequestName']).all() )

            for lwfo in familly:
                if lwfo:
                    ## we have it already
                    ## any relative outside of the forget/trouble statuses blocks the injection
                    if not lwfo.status in ['forget','trouble','forget-unlock','forget-out-unlock']:
                        sendLog('injector',"Should not put %s because of %s %s"%( wf, lwfo.name,lwfo.status ))
                        print "Should not put",wf,"because of",lwfo.name,lwfo.status
                        cannot_inject.add( wf )
                        can_add = False
            ## add a check on validity of input datasets
            _,prim,par,sec = wfi.getIO()
            for d in list(prim)+list(par)+list(sec):
                if not d in status_cache:
                    status_cache[d] = getDatasetStatus(d)
                if status_cache[d] != 'VALID':
                    wfi.sendLog('injector',"One of the input is not VALID. %s : %s"%( d, status_cache[d]))
                    sendLog('injector',"One of the input of %s is not VALID. %s : %s"%( wf, d, status_cache[d]))
                    can_add = False
            if not can_add: continue
            wfi.sendLog('injector',"considering %s"%wf)

            new_wf = Workflow( name = wf , status = options.setstatus, wm_status = options.wmstatus) 
            session.add( new_wf )
            session.commit()
            ## short pause after each insertion
            time.sleep(0.5)
        else:
            #print "already have",wf
            pass
    
    if cannot_inject:
        #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)))
        sendLog('injector','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)), level='warning')

    ## passing a round of invalidation of what needs to be invalidated
    ## "or True" makes options.invalidate irrelevant : the invalidation runs whenever mcm is usable
    if use_mcm and (options.invalidate or True):
        invalidator(url)

    no_replacement = set()

    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        print wf.name
        if specific and not specific in wf.name: continue
        print wf.name
        wfi = workflowInfo(url, wf.name )
        wl = wfi.request #getWorkLoad(url, wf.name)
        familly = getWorkflowById( url, wl['PrepID'] )
        ## the requests of the same prepid that qualify as a replacement
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url , member)
            if options.replace:
                ## an explicit replacement was requested : take that one only, whatever its state
                if member != options.replace: continue
            else:
                ## otherwise skip older requests, resubmissions, and requests in a non usable status
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType']=='Resubmission': continue
                if fwl['RequestStatus'] in ['None',None,'new']: continue
                if fwl['RequestStatus'] in ['rejected','rejected-archived','aborted','aborted-archived']: continue
            true_familly.append( fwl )

        if len(true_familly)==0:
            #sendLog('injector','%s had no replacement'%wf.name, level='critical')
            wfi.sendLog('injector','the workflow was found in trouble with no replacement')
            no_replacement.add( wf.name )
            continue
        else:
            wfi.sendLog('injector','the workflow was found in trouble and has a replacement')
                    
        print wf.name,"has",len(familly),"familly members"
        print wf.name,"has",len(true_familly),"true familly members"

        ##we cannot have more than one of them !!! pick the last one
        if len(true_familly)>1:
            #sendEmail('multiple wf','please take a look at injector for %s'%wf.name)
            sendLog('injector','Multiple wf in line, will take the last one for %s \n%s'%( wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical')

        ## the slice holds the last candidate only ; written as a loop so that "continue" can skip it
        for fwl in true_familly[-1:]:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(Workflow.name == member).first()
            if not new_wf:
                sendLog('injector',"putting %s as replacement of %s"%( member, wf.name))
                ## 'considered' if the replacement is still assignment-approved, 'away' otherwise
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow( name = member, status = status, wm_status = fwl['RequestStatus'])
                wf.status = 'forget'
                session.add( new_wf ) 
            else:
                ## a replacement that is itself forgotten is of no use
                if new_wf.status == 'forget': continue
                sendLog('injector',"getting %s as replacement of %s"%( new_wf.name, wf.name ))
                wf.status = 'forget'

            ## re-attach to the replacement every transfer that was serving the old workflow
            for tr in session.query(Transfer).all():
                if wf.id in tr.workflows_id:
                    ## work on a copy and assign it back, rather than editing the stored list in place
                    sw = copy.deepcopy(tr.workflows_id)
                    sw.remove( wf.id)
                    sw.append(new_wf.id)
                    tr.workflows_id = sw
                    print tr.phedexid,"got",new_wf.name
                    if new_wf.status != 'away':
                        print "\t setting it considered"
                        new_wf.status = 'considered'
                    if tr.phedexid<0: ## set it back to positive
                        tr.phedexid = -tr.phedexid
                    session.commit()
                        

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
    if no_replacement:
        #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement)))
        sendLog('injector','workflow with no replacement, %s \n are dangling there'% ( '\n'.join(no_replacement)), level='critical')
Exemplo n.º 9
0
def injector(url, options, specific):

    ## passing a round of invalidation of what needs to be invalidated
    if options.invalidate:
        invalidator(url)

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    existing = [wf.name for wf in session.query(Workflow).all()]
    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if wf not in existing:
            print "putting", wf
            new_wf = Workflow(name=wf,
                              status=options.setstatus,
                              wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()

    existing = [wf.name for wf in session.query(Workflow).all()]

    ## pick up replacements
    for wf in session.query(Workflow).filter(
            Workflow.status == 'trouble').all():
        if specific and wf.name != specific:
            continue
        print wf.name
        wl = getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl['PrepID'])
        if len(familly) == 1:
            print wf.name, "ERROR has no replacement"
            continue
        print wf.name, "has", len(familly), "familly members"
        for member in familly:
            if member != wf.name:
                fwl = getWorkLoad(url, member)
                if options.replace:
                    if member != options.replace: continue
                else:
                    if fwl['RequestDate'] < wl['RequestDate']: continue
                    if fwl['RequestType'] == 'Resubmission': continue
                    if fwl['RequestStatus'] in ['None', None]: continue

                new_wf = session.query(Workflow).filter(
                    Workflow.name == member).first()
                if not new_wf:
                    print "putting", member
                    status = 'away'
                    if fwl['RequestStatus'] in ['assignment-approved']:
                        status = 'considered'
                    new_wf = Workflow(name=member,
                                      status=status,
                                      wm_status=fwl['RequestStatus'])
                    wf.status = 'forget'
                    session.add(new_wf)
                    session.commit()
                else:
                    if new_wf.status == 'forget': continue
                    print "getting", new_wf.name, "as replacement of", wf.name

                for tr in session.query(Transfer).all():
                    if wf.id in tr.workflows_id:
                        sw = copy.deepcopy(tr.workflows_id)
                        sw.remove(wf.id)
                        sw.append(new_wf.id)
                        tr.workflows_id = sw
                        print tr.phedexid, "got", new_wf.name
                        if new_wf.status != 'away':
                            new_wf.status = 'staging'
                        session.commit()

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
Exemplo n.º 10
0
def equalizor(url , specific = None, options=None):
    up = componentInfo(mcm=False, soft=['mcm']) 
    if not up.check(): return 

    if not specific:
        workflows = getWorkflows(url, status='running-closed', details=True)
        workflows.extend(getWorkflows(url, status='running-open', details=True))

    ## start from scratch
    modifications = defaultdict(dict)
    ## define regionality site => fallback allowed. feed on an ssb metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()
    for site in SI.sites_ready:
        region = site.split('_')[1]
        if not region in ['US'
                          ,'DE','IT','FR',
                          'ES',
                          'UK' ### latest addition
                          ]: continue
        regions[region] = [region] 

    def site_in_depletion(s):
        """Tell whether site s qualifies as a fallback destination.

        Every site is accepted, whatever s is. A lookup of s in
        SI.sites_pressure used to follow the unconditional return; it could
        never run, so only the effective behaviour is kept here.
        """
        return True

    for site in SI.sites_ready:
        region = site.split('_')[1]
        ## fallback to the region, to site with on-going low pressure
        mapping[site] = [fb for fb in SI.sites_ready if any([('_%s_'%(reg) in fb and fb!=site and site_in_depletion(fb))for reg in regions[region]]) ]
    

    for site in SI.sites_ready:
        if site.split('_')[1] == 'US': ## to all site in the US
            ## add NERSC 
            mapping[site].append('T3_US_NERSC')
            ## add OSG            
            mapping[site].append('T3_US_OSG')
            pass
    #mapping['T2_IT_Rome'].append('T3_US_OSG')
    #mapping['T1_US_FNAL'].append('T3_US_NERSC')
    
    use_T0 = ('T0_CH_CERN' in UC.get("site_for_overflow"))
    if options.t0: use_T0 = True
    #if options.augment : use_T0 = True

    use_HLT = ('T2_CH_CERN_HLT' in UC.get("site_for_overflow"))
    if options.hlt: use_HLT = True
    #if options.augment : use_HLT=True

    if use_HLT:
        mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')

    if use_T0:
        mapping['T2_CH_CERN'].append('T0_CH_CERN')
        mapping['T1_IT_CNAF'].append('T0_CH_CERN')
        mapping['T1_FR_CCIN2P3'].append('T0_CH_CERN')
        mapping['T1_DE_KIT'].append('T0_CH_CERN')
        ## temptatively
        #mapping['T0_CH_CERN'].append( 'T2_CH_CERN' )

    ## all europ can read from CERN
    for reg in ['IT','DE','UK','FR','BE','ES']:
        mapping['T2_CH_CERN'].extend([fb for fb in SI.sites_ready if '_%s_'%reg in fb])
        pass

    ## all europ T1 among each others
    europ_t1 = [site for site in SI.sites_ready if site.startswith('T1') and any([reg in site for reg in ['IT','DE','UK','FR','ES']])]
    print europ_t1
    for one in europ_t1:
        for two in europ_t1:
            if one==two: continue
            mapping[one].append(two)
            pass

    ## fnal can read from cnaf ?
    #mapping['T1_IT_CNAF'].append( 'T1_US_FNAL' )
    mapping['T1_IT_CNAF'].extend( [site for site in SI.sites_ready if '_US_' in site] ) ## all US can read from CNAF
    mapping['T1_IT_CNAF'].append( 'T2_CH_CERN' )
    mapping['T1_DE_KIT'].append( 'T2_CH_CERN' )
    mapping['T2_CH_CERN'].append( 'T1_IT_CNAF' )
    mapping['T2_CH_CERN'].append( 'T1_US_FNAL' )
    #mapping['T2_UK_London_IC'].append( 'T2_CH_CERN' )
    #mapping['T1_UK_RAL'].append( 'T2_BE_IIHE' )
    mapping['T2_UK_London_IC'].append( 'T2_BE_IIHE' )
    mapping['T2_UK_London_IC'].append( 'T2_FR_CCIN2P3' )
    for site in SI.sites_ready:
        if '_US_' in site:
            mapping[site].append('T2_CH_CERN')
    ## make them appear as OK to use
    force_sites = []

    ## overflow CERN to underutilized T1s
    upcoming = json.loads( open('%s/GQ.json'%monitor_dir).read())
    for possible in SI.sites_T1s:
        if not possible in upcoming:
            mapping['T2_CH_CERN'].append(possible)
            pass

    ## remove add-hoc sites from overflow mapping
    prevent_sites = []#'T2_US_Purdue']
    for prevent in prevent_sites:
        if prevent in mapping: mapping.pop( prevent )
    for src in mapping:
        for prevent in prevent_sites:
            if prevent in mapping[src]:
                mapping[src].remove( prevent )

    ## create the reverse mapping for the condor module
    for site,fallbacks in mapping.items():
        for fb in fallbacks:
            if not site in reversed_mapping[fb]:
                reversed_mapping[fb].append(site)

    ## this is the fallback mapping
    print "Direct mapping : site => overflow"
    print json.dumps( mapping, indent=2)
    print "Reverse mapping : dest <= from origin"
    print json.dumps( reversed_mapping, indent=2)

    altered_tasks = set()

    def running_idle( wfi , task_name):
        """Return the ('Running', 'Idle') values that wfi.getGlideMon() holds for task_name.

        (0,0) is returned when getGlideMon() gives nothing or has no entry
        for task_name.
        """
        gmon = wfi.getGlideMon()
        if gmon and task_name in gmon:
            counts = gmon[task_name]
            return (counts['Running'], counts['Idle'])
        return (0,0)

    def needs_action( wfi, task, min_idled = 100):
        """Decide whether a task has enough idle work to deserve re-routing.

        wfi       : workflow object handed to running_idle.
        task      : object with a pathName; its last path component is the task name.
        min_idled : minimum number of idle jobs below which nothing is done.

        Returns (go, task_name, running, idled). go is True when at least
        min_idled jobs are idle and either nothing is running or the
        idle/running ratio exceeds needs_action.pressure (an attribute that
        has to be set on this function before it is called).

        The threshold used to be overwritten by the final pressure test and
        min_idled was never read; both are honoured now.
        """
        task_name = task.pathName.split('/')[-1]
        running, idled = running_idle( wfi, task_name)
        if idled < min_idled:
            ## not enough queued work to bother
            go = False
        elif not running:
            ## nothing is running: any idle job is a reason to act
            go = bool(idled)
        else:
            go = (idled / float(running) > needs_action.pressure)
        return go, task_name, running, idled
    needs_action.pressure = UC.get('overflow_pressure')

    def getPerf( task , stats_to_go = 200):
        task = task.split('/')[1]+'/'+task.split('/')[-1]
        try:
            u = 'http://cms-gwmsmon.cern.ch/prodview/json/history/memoryusage720/%s'%task
            print u
            perf_data = json.loads(os.popen('curl -s --retry 5 %s'%u).read())
        except Exception as e:
            print str(e)
            return (None,None)
        buckets = filter(lambda i:i['key']!=0,perf_data['aggregations']["2"]['buckets'])
        buckets.sort( key = lambda i:i['key'])
        s=0
        for bucket in buckets:
            s+= bucket['doc_count']
            bucket['cum'] = s
        
        s_m = sum( bucket['key']*bucket['doc_count'] for bucket in buckets)
        w_m = sum( bucket['doc_count'] for bucket in buckets)
        m_m = max( bucket['key'] for bucket in buckets) if buckets else None
        
        #90% percentile calculation
        percentile_m = int(0.90 * w_m)
        p_m = 0
        s=0
        for bucket in buckets:
            p_m = bucket['key']
            if bucket['cum'] > percentile_m:
                break

        p_m *= 1.1
        print "percentile mem",p_m
        max_count_m = None
        max_count = 0
        for bucket in buckets:
            if bucket['doc_count'] > max_count:
                max_count_m = bucket['key']
                max_count = bucket['doc_count']

        if max_count_m:
            max_count_m *= 1.1
        print "max count mem",max_count_m

        b_m = None
        if w_m > stats_to_go:
            if p_m:
                b_m = int(p_m)
            else:
                b_m = int(m_m) ## this is very bad if there are just a couple of outliers
                b_m = int((s_m / float(w_m)) * 1.2)
        else:
            print "not enough stats for memory",w_m

        try:
            perf_data = json.loads(os.popen('curl -s --retry 5 http://cms-gwmsmon.cern.ch/prodview/json/history/runtime720/%s'%task).read())
        except Exception as e:
            print str(e)
            return (b_m,None)

        buckets = filter(lambda i:i['key']!=0,perf_data['aggregations']["2"]['buckets'])
        buckets.sort( key = lambda i:i['key'])
        s=0
        for bucket in buckets:
            s+= bucket['doc_count']
            bucket['cum'] = s
        
        s_t = sum( bucket['key']*bucket['doc_count'] for bucket in buckets)
        w_t = sum( bucket['doc_count'] for bucket in buckets)
        m_t = max( bucket['key'] for bucket in buckets) if buckets else None
        
        percentile_t = int(0.90 * w_t)
        p_t = 0
        for bucket in buckets:
            p_t = bucket['key']
            if bucket['cum'] > percentile_t:
                break

        p_t *= 1.1
        print "percentile time",p_t

        max_count_t = None
        max_count = 0
        for bucket in buckets:
            if bucket['doc_count'] > max_count:
                max_count_t = bucket['key']
                max_count = bucket['doc_count']

        if max_count_t:
            max_count_t *= 1.1
        print "max count time",max_count_t

        b_t = None
        if w_t > stats_to_go:
            b_t = m_t
        else:
            print "not enough stats for time",w_t

        return (b_m,b_t)
        
    def getcampaign( task ):
        """Return the second '-'-separated field of the task's prepID, or of its task name.

        The prepID attribute is used when the task has one; otherwise the last
        component of task.pathName is used, provided it contains a '-'.
        None is returned when neither applies or when anything raises along
        the way (the error is printed).
        """
        try:
            leaf = task.pathName.split('/')[-1]
            if hasattr( task, 'prepID'):
                label = task.prepID
            elif '-' in leaf:
                label = leaf
            else:
                return None
            return label.split('-')[1]
        except Exception as e :
            print("Inconsistent prepid very likely")
            print(str(e))
            return None
    def close( interface ):
        """Publish interface as equalizor.json in monitor_pub_dir and keep a time-stamped copy.

        The json is first written to equalizor.json.new, then moved over
        equalizor.json, then copied to logs/equalizor/ under monitor_dir.
        """
        ## the with-block guarantees the content is flushed and the handle closed before the move
        with open('%s/equalizor.json.new'%monitor_pub_dir,'w') as staged:
            staged.write( json.dumps( interface, indent=2))
        os.system('mv %s/equalizor.json.new %s/equalizor.json'%(monitor_pub_dir,monitor_pub_dir))
        os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json'%(monitor_pub_dir,monitor_dir,time.mktime(time.gmtime())))

    interface = {
        'mapping' : mapping,
        'reversed_mapping' : reversed_mapping,
        'modifications' : {},
        'time' : {},
        'memory' : {}
        }
    if options.augment or options.remove:
        interface['modifications'] = json.loads( open('%s/equalizor.json'%monitor_pub_dir).read())['modifications']
        interface['memory'] = json.loads( open('%s/equalizor.json'%monitor_pub_dir).read())['memory']
        interface['time'] = json.loads( open('%s/equalizor.json'%monitor_pub_dir).read())['time']
        
    if options.remove:
        if specific in interface['modifications']:
            print "poping",specific
            interface['modifications'].pop(specific)
            close( interface )
        return 


    PU_locations = {}
    PU_overflow = {}
    PRIM_overflow = {}
    PREMIX_overflow = {}
    LHE_overflow = {}
    tune_performance = []

    pending_HLT = 0
    max_HLT = 60000
    pending_T0 = 0
    max_T0 = 60000
    try:
        gmon = json.loads(os.popen('curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT').read())
        pending_HLT += gmon["Running"]
        pending_HLT += gmon["MatchingIdle"]
    except:
        pass

    restricting_to_ready = ['pdmvserv_HIG-RunIISummer15wmLHEGS-00420_00157_v0__160909_001612_2018',
                            'pdmvserv_HIG-RunIISummer15wmLHEGS-00418_00157_v0__160909_001621_321',
                            'pdmvserv_HIG-RunIISummer15wmLHEGS-00419_00157_v0__160909_001621_2641'
                            ]
    
    remove_from = {
        #'cerminar_Run2016B-v1-BTagCSV-23Sep2016_8020_160923_163224_2174' : ['T2_CH_CERN_HLT']
        }

    add_to = {
        #'pdmvserv_EXO-RunIISpring16MiniAODv2-05060_00552_v0__161001_151813_7925' : ['T3_US_OSG'],
        #'cerminar_Run2016C-v2-SingleElectron-23Sep2016_8020_160923_182146_3498' : ['T3_US_NERSC'],
        #'cerminar_Run2016C-v2-Tau-23Sep2016_8020_160923_182336_5649' : ['T3_US_NERSC'],
        }
    

    stay_within_site_whitelist = False
    specific_task=None
    if specific and ":" in specific:
        specific,specific_task = specific.split(':')

    if specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'away').all()
        
    performance = {}
    resizing = {}
    no_routing = [
        ]
    random.shuffle( wfs )
    for wfo in wfs:
        if not wfo.status in ['away']: continue
        if wfo.name in no_routing and not options.augment:
            continue

        if specific and not specific in wfo.name: 
            continue
        if specific:
            wfi = workflowInfo(url, wfo.name)
        else:
            cached = filter(lambda d : d['RequestName']==wfo.name, workflows)
            if not cached : continue
            wfi = workflowInfo(url, wfo.name, request = cached[0])
        
        ## only running-* should get re-routed, unless done by hand
        if not wfi.request['RequestStatus'] in ['running-open','running-closed'] and not specific: continue

        is_chain = (wfi.request['RequestType'] in ['TaskChain','StepChain'])
        tasks_and_campaigns = []
        for task in wfi.getWorkTasks():
            tasks_and_campaigns.append( (task, getcampaign(task) ) )
        
        
        lhe,prim,_,sec,sites_allowed = wfi.getSiteWhiteList()#getIO()
        ncores = wfi.getMulticore()
        memory_allowed = SI.sitesByMemory( float(wfi.request['Memory']) , maxCore=ncores)

        if not lhe and not prim and not sec:
            ## no input at all: go for OSG!!!
            add_to[wfo.name] = ['T3_US_OSG']

        ## check needs override
        needs_overide = False
        if not needs_overide and  options.augment: needs_overide=True

        def overide_from_agent( wfi, needs_overide):
            """Turn needs_overide on when the workflow sits on one of the listed agents.

            With an empty agent list needs_overide is handed back untouched.
            Otherwise True is returned if needs_overide is already set, or if
            any listed agent appears under 'Running' or 'Acquired' in
            wfi.getAgents(); needs_overide is returned as is in the remaining case.
            """
            ## agents to react to, e.g. 'http://cmssrv219.fnal.gov:5984' ; an empty list switches the rule off
            flagged_agents = []
            if not flagged_agents:
                return needs_overide
            if needs_overide:
                return True

            by_state = wfi.getAgents()
            for state,agent in itertools.product( ['Running','Acquired'], flagged_agents):
                if agent in by_state.get(state,{}).keys():
                    print("overriding the need for bad agent")
                    return True
            return needs_overide

        ## now parse this for action
        for i_task,(task,campaign) in enumerate(tasks_and_campaigns):
            if options.augment:
                print task.pathName
                print campaign

            resize = CI.get(campaign,'resize',{})
            
            if resize and not is_chain:
                resizing[task.pathName] = resize

            tune = CI.get(campaign,'tune',options.tune)
            if tune and not campaign in tune_performance:
                tune_performance.append( campaign )

            overflow = CI.get(campaign,'overflow',{})
            if overflow:
                if "PRIM" in overflow and not campaign in PRIM_overflow:
                    PRIM_overflow[campaign] = copy.deepcopy(overflow['PRIM'])
                    print "adding",campaign,"to PRIM overflow"
                if "PREMIX" in overflow and not campaign in PREMIX_overflow:
                    PREMIX_overflow[campaign] = copy.deepcopy(overflow['PREMIX'])
                    print "adding",campaign,"to PREMIX overflow"
                if "PU" in overflow and not campaign in PU_overflow:
                    PU_overflow[campaign] = copy.deepcopy(overflow['PU'])
                    print "adding",campaign,"to PU overflow rules"
                if "LHE" in overflow and not campaign in LHE_overflow:
                    site_list = overflow['LHE'].get('site_list',"")
                    if site_list:
                        if type(site_list)==list:
                            LHE_overflow[campaign] = site_list
                        else:
                            print site_list
                            if hasattr(SI,site_list):
                                LHE_overflow[campaign] = copy.deepcopy( getattr(SI,site_list) )
                            else:
                                LHE_overflow[campaign] = site_list.split(',')
                    print "adding",campaign,"to light input overflow rules",LHE_overflow[campaign]

            ### setup the resizing

            ### get the task performance, for further massaging.
            if campaign in tune_performance or options.tune:
                print "performance",task.taskType,task.pathName
                if task.taskType in ['Processing','Production']:
                    set_memory,set_time = getPerf( task.pathName )
                    #print "Performance %s GB %s min"%( set_memory,set_time)
                    wfi.sendLog('equalizor','Performance tuning to %s GB %s min for %s'%( set_memory,set_time,task.pathName.split('/')[-1] ))
                    ## get values from gmwsmon
                    # massage the values : 95% percentile
                    performance[task.pathName] = {}
                    if set_memory:
                        performance[task.pathName]['memory']=min(set_memory,15000) ## max to 15GB
                    if set_time:
                        performance[task.pathName]['time'] = min(set_time, 1440) ## max to 24H
            
            ## rule to remove from the site whitelist site that do not look ready for unified (local banning)
            if wfo.name in restricting_to_ready:
                if task.taskType in ['Production']:
                    new_list = list(set(SI.sites_ready)&set(wfi.request['SiteWhitelist']))
                    modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : new_list }

            if campaign in PREMIX_overflow:
                ## figure out secondary location and neighbors
                ## figure out primary presence and neighbors
                ## do the intersection and add if in need.
                needs, task_name, running, idled = needs_action(wfi, task)
                #needs = True

                ## trick to be removed once all wf are passed through the agent patch
                assigned_log = filter(lambda change : change["Status"] in ["assigned","acquired"],wfi.request['RequestTransition'])
                if assigned_log:
                    then = assigned_log[0]['UpdateTime']
                    if then < 1479481842:
                        print "assigned too early"
                        needs = False
                    else:
                        print "assigned later enough"
                else:
                    needs = False


                if is_chain and task.pathName.endswith('_1'):
                    print i_task,"in chain prevents overflowing"
                    needs = False

                if task.taskType in ['Processing','Production'] and needs:
                    secondary_locations = set(SI.sites_ready + force_sites)
                    for s in sec:
                        if not s in PU_locations:
                            presence = getDatasetPresence( url, s)
                            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
                            PU_locations[s] = one_secondary_locations
                        print "secondary is at",sorted(PU_locations[s])
                        secondary_locations = set([SI.SE_to_CE(site) for site in PU_locations[s]]) & secondary_locations
                    aaa_sec_grid = set(secondary_locations)
                    for site in sorted(aaa_sec_grid):
                        aaa_sec_grid.update( mapping.get(site, []) )
                    
                    if len(prim):    
                        dataset = list(prim)[0]
                        all_blocks,blocks = wfi.getActiveBlocks()
                        count_all = sum([len(v) for k,v in all_blocks.items()])
                        presence = getDatasetPresence(url, dataset, only_blocks=blocks )
                        aaa_prim_grid = set([SI.SE_to_CE(site) for site in presence.keys()])
                        for site in sorted(aaa_prim_grid):
                            aaa_prim_grid.update( mapping.get(site, []) )

                        print sorted(aaa_prim_grid),"around primary location",sorted(presence.keys())
                        print sorted(aaa_sec_grid),"aroudn secondary location",sorted(secondary_locations)
                        ## intersect
                        aaa_grid = aaa_sec_grid & aaa_prim_grid
                    else:
                        print "premix overflow from a taskchain"
                        ### hack hack hack
                        #modifications[wfo.name][task.pathName]= {"ReplaceSiteWhitelist" : ['T2_CH_CERN','T1_US_FNAL']}
                        aaa_grid = set(wfi.request['SiteWhitelist'])

                    banned_until_you_find_a_way_to_do_this = ['T3_US_OSG']
                    aaa_grid  = filter(lambda s : not s in banned_until_you_find_a_way_to_do_this, aaa_grid)
                    if aaa_grid:
                        wfi.sendLog('equalizor','Extending site whitelist to %s'%sorted(aaa_grid))
                        modifications[wfo.name][task.pathName]= {"AddWhitelist" : sorted(aaa_grid)}

            ## rule to overflow jobs on the primary input
            if campaign in PRIM_overflow:
                if task.taskType in ['Processing','Production']:
                    if not wfi.request['TrustSitelists']:
                        ###xrootd is OFF
                        dataset = list(prim)[0]
                        all_blocks,blocks = wfi.getActiveBlocks()
                        count_all = sum([len(v) for k,v in all_blocks.items()])
                        
                        presence = getDatasetPresence(url, dataset, only_blocks=blocks )
                        in_full = [SI.SE_to_CE(site) for site,(there,_) in presence.items() if there]
                        aaa_grid= set()
                        aaa_grid_in_full = set(in_full)
                        for site in sorted(aaa_grid_in_full):
                            aaa_grid_in_full.update( mapping.get(site, []) )
                        ## just add the neighbors to the existing whitelist. we could do more with block classAd
                        for site in wfi.request['SiteWhitelist']:
                            aaa_grid.update( mapping.get(site, []) )
                        aaa_grid = aaa_grid & set(sites_allowed + ['T3_US_NERSC']) ## and restrict to site that would be allowed at all (mcore, mem)
                        aaa_grid_in_full = aaa_grid_in_full & set(sites_allowed + ['T3_US_NERSC']) ## and restrict to site that would be allowed at all (mcore, mem)
                        gmon = wfi.getGlideMon()
                        needs, task_name, running, idled = needs_action(wfi, task)
                        print needs,running,idled
                        site_in_use = set(gmon[task_name]['Sites']) if gmon and task_name in gmon and 'Sites' in gmon[task_name] else set()
                        print dataset,"at",sorted(in_full),len(blocks),"/",count_all
                        print "running at",sorted(site_in_use)
                        print "set for",sorted(wfi.request['SiteWhitelist'])
                        print "around current whitelist" ,sorted(aaa_grid)
                        print "around where the data is now in full", sorted(aaa_grid_in_full)

                        if needs and not (site_in_use & set(in_full)) and aaa_grid_in_full:
                            print "we could be going for replace at that point"
                            wfi.sendLog('equalizor','Replaceing site whitelie to %s dynamically'% sorted(aaa_grid_in_full))
                            modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : sorted( aaa_grid_in_full) }
                        else:
                            if aaa_grid:
                                print wfo.name
                                wfi.sendLog('equalizor','Adding in site white list %s dynamically'% sorted(aaa_grid) )
                                if wfo.name in modifications and task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                                    modifications[wfo.name][task.pathName]["AddWhitelist"].extend(sorted(aaa_grid))
                                else:
                                    modifications[wfo.name][task.pathName] = { "AddWhitelist" : sorted(aaa_grid) }
                    else:
                        ## the request is already is in xrootd mode (either too generous, or just about right with neighbors of full data)
                        dataset = list(prim)[0]
                        all_blocks,blocks = wfi.getActiveBlocks()
                        count_all = sum([len(v) for k,v in all_blocks.items()])
                        fraction_left = float(len(blocks))/ count_all
                        #if fraction_left< 0.5:                            print '\n'.join( blocks )
                        presence = getDatasetPresence(url, dataset, only_blocks=blocks )
                        ## in full is really the only place we can go to safely, since we have no job-data matching
                        in_full = [SI.SE_to_CE(site) for site,(there,_) in presence.items() if there]
                        gmon = wfi.getGlideMon()
                        needs, task_name, running, idled = needs_action(wfi, task)
                        site_in_use = set(gmon[task_name]['Sites']) if gmon and task_name in gmon and 'Sites' in gmon[task_name] else set()
                        print needs,running,idled

                        aaa_grid = set(in_full)
                        for site in list(aaa_grid):
                            aaa_grid.update( mapping.get(site, []) )

                        new_ones = set(in_full) - set(wfi.request['SiteWhitelist']) ## symptomatic of data have been repositionned
                        common = set(in_full) & set(wfi.request['SiteWhitelist'])
                        extra_shit = set(wfi.request['SiteWhitelist']) - aaa_grid ## symptomatic of too generous site-whitelist

                        aaa_grid = aaa_grid & set(sites_allowed+ ['T3_US_NERSC']) ## restrict to site that would be allowed at all (mcore, mem)
                        new_grid = aaa_grid - set(wfi.request['SiteWhitelist'])
                        print dataset,"is in full ",len(blocks),"/",count_all," at",in_full
                        print '\n'.join( sorted(blocks) )
                        print "running at",site_in_use
                        print "in common of the site whitelist",sorted(common)
                        print "site now also hosting the data",sorted(new_ones)
                        print "site in whitelist with no data",sorted(extra_shit)## with no data and not within aaa reach
                        if new_ones:
                            ## we will be add sites 
                            if needs and aaa_grid:
                                print wfo.name,"would replace for",sorted(aaa_grid)
                                print "but no thanks"
                                wfi.sendLog('equalizor','Changing the site whitelist to %s dynamically'%(sorted(aaa_grid)))
                                modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : sorted(aaa_grid) }
                            elif new_grid:
                                print wfo.name,"would complement up to",sorted(aaa_grid)
                                wfi.sendLog('equalizor','Adding site white list to %s dynamically'% sorted(new_grid) )
                                modifications[wfo.name][task.pathName] = { "AddWhitelist" : sorted(new_grid) }
                                
                        elif len(extra_shit)>5:
                            if aaa_grid:
                                print wfo.name,"would be restricting down to",sorted(aaa_grid),"because of",sorted(extra_shit)
                                wfi.sendLog('equalizor','Restricting the white list to %s dynamically'% sorted(aaa_grid) )
                                modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : sorted(aaa_grid) }    
                        else:
                            print wfo.name,"don't do anything"                            



            if wfo.name in remove_from and task.taskType in ['Processing','Production']:
                remove = remove_from[wfo.name]
                restrict_to = set(wfi.request['SiteWhitelist'])
                intersection= set(remove)&set(restrict_to)
                if intersection:
                    print intersection,"is indeed in the original whitelist"
                    restrict_to = restrict_to - set(remove)
                    modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : sorted(restrict_to) }

            if wfo.name in add_to:
                if task.taskType in ['Production','Processing']:
                    augment_to = add_to[wfo.name]
                    print "adding",sorted(augment_to),"to",wfo.name
                    if wfo.name in modifications and task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]['AddWhitelist'].extend( augment_to )
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : augment_to }

            ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step
            if campaign in LHE_overflow:
                #if not is_chain and task.taskType in ['Processing']:
                if task.taskType in ['Processing']:
                    needs, task_name, running, idled = needs_action(wfi, task)
                    needs_overide = overide_from_agent( wfi, needs_overide)
                    extend_to = list(set(copy.deepcopy( LHE_overflow[campaign] )))
                    if stay_within_site_whitelist:
                        extend_to = list(set(extend_to) & set(wfi.request['SiteWhitelist'])) ## restrict to stupid-site-whitelist
                    extend_to = list(set(extend_to) & set(SI.sites_ready + force_sites))

                    if is_chain:
                        print "further restricting to initially allowed sites"
                        ## restrict to initial allowed sites
                        extend_to = list(set(extend_to) & set(sites_allowed))

                    if not extend_to: 
                        print "Nowhere to extend to"
                        continue
                    if extend_to and needs or needs_overide:
                        
                        modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : extend_to ,"Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : ReplaceSiteWhitelist \n %s'%( task_name,
                                                                                                                                      wfo.name,
                                                                                                                                      running,
                                                                                                                                      idled ,
                                                                                                                                      json.dumps( sorted(modifications[wfo.name][task.pathName]['ReplaceSiteWhitelist']))))

                        altered_tasks.add( task.pathName )
                    else:
                        wfi.sendLog('equalizor','%s of %s is running %d and pending %d'%( task_name, wfo.name, running, idled))
                        


            ### overflow the 76 digi-reco to the site holding the pileup
            if campaign in PU_overflow:
                force = PU_overflow[campaign]['force'] if 'force' in PU_overflow[campaign] else False
                secondary_locations = set(SI.sites_ready + force_sites)
                for s in sec:
                    if not s in PU_locations:
                        presence = getDatasetPresence( url, s)
                        #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
                        one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
                        PU_locations[s] = one_secondary_locations
                    print "secondary is at",sorted(PU_locations[s])
                    secondary_locations = set([SI.SE_to_CE(site) for site in PU_locations[s]]) & secondary_locations
                    
                ## we should add all sites that hold the secondary input if any
                ### given that we have the secondary location available, it is not necessary to use the add-hoc list
                ##secondary_locations = list(set(PU_overflow[campaign]['sites']) & set( SI.sites_ready ))
                ## intersect with the sites that are allowed from the request requirement
                secondary_locations = secondary_locations & set(memory_allowed)

                if any([task.pathName.endswith(finish) for finish in ['_0','StepOneProc','Production']]) :
                    needs, task_name, running, idled = needs_action(wfi, task)
                    ## removing the ones in the site whitelist already since they encode the primary input location
                    if stay_within_site_whitelist:
                        original_site_in_use = set(wfi.request['SiteWhitelist'] & set(secondary_locations))
                    else:
                        original_site_in_use = set(secondary_locations)

                    mode = 'AddWhitelist'
                    if not prim and i_task==0:
                        print "because there isn't any input, one should be able to just replace the sitewhitelist instead of adding, with the restriction of not reaching every possible sites"
                        mode='ReplaceSiteWhitelist'

                    ## remove the sites that have already running jobs
                    gmon = wfi.getGlideMon()
                    if gmon and task_name in gmon and 'Sites' in gmon[task_name] and mode=='AddWhitelist':
                        site_in_use = set(gmon[task_name]['Sites'])
                        site_in_use = set([]) ## at this time I cannot find a reason to apply such limitation
                        print "removing",sorted(site_in_use)
                        ## that determines where you want to run in addition
                        augment_by = list((set(secondary_locations)- site_in_use) & original_site_in_use)
                    else:
                        print "no existing running site"
                        augment_by = list(original_site_in_use)

                    if not augment_by: print "Nowhere to extend to"

                    needs_overide = overide_from_agent( wfi, needs_overide)
                    if augment_by and (needs or needs_overide or force) and PU_overflow[campaign]['pending'] < PU_overflow[campaign]['max']:
                        PU_overflow[campaign]['pending'] += idled
                        print "raising overflow to",PU_overflow[campaign]['pending'],"for",PU_overflow[campaign]['max']
                        ## the step with an input ought to be the digi part : make this one go anywhere
                        modifications[wfo.name][task.pathName] = { mode : augment_by , "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        altered_tasks.add( task.pathName )
                        wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : %s \n %s'%( task_name, wfo.name,
                                                                                                                    running, idled,
                                                                                                                    mode,
                                                                                                                    json.dumps( sorted(augment_by), indent=2 )))
                    else:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled

            ### overflow the skims back to multi-core 
            if campaign in ['Run2015D','Run2015C_25ns'] and task.taskType =='Skim':
                original_swl = wfi.request['SiteWhitelist']
                needs, task_name, running, idled = needs_action(wfi, task)
                if (needs or needs_overide):
                    modifications[wfo.name][task.pathName] = { 'AddWhitelist' : original_swl, 
                                                               "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                    altered_tasks.add( task.pathName )
                    wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s'%( task_name, wfo.name,
                                                                                                                              running, idled,
                                                                                                                          json.dumps( sorted(original_swl), indent=2 )))


            if options.augment:
                #print "uhm ....",sorted(wfi.request['SiteWhitelist']),i_task,use_HLT
                pass

            ### this is a hack when we need to kick gensim out of everything
            if campaign in [
                #'RunIIWinter15GS',
                #'RunIISummer15GS',
                #'RunIISummer15wmLHEGS',
                #'Summer12',
                ] and task.taskType in ['Production'] and is_chain:
                #what are the site you want to take out. What are the jobs in whitelist, make the diff and replace
                t1s = set([site for site in SI.all_sites if site.startswith('T1')])
                ust2s = set([site for site in SI.all_sites if site.startswith('T2_US')])
                #ust2s = set([site for site in SI.sites_mcore_ready if site.startswith('T2_US')])
                allmcores = set(SI.sites_mcore_ready)
                #set_for = set(wfi.request['SiteWhitelist']) - t1s
                #set_for = set(wfi.request['SiteWhitelist']) - t1s - ust2s
                #set_for = set(wfi.request['SiteWhitelist']) - allmcores
                set_for = set(wfi.request['SiteWhitelist']) & t1s
                print wfo.name,"going for",set_for
                print task.pathName
                if set_for:
                    modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : sorted(set_for) }
                



            ### add the HLT at partner of CERN
            if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task in [0,1] and use_HLT and not wfi.request['TrustSitelists']:
                needs, task_name, running, idled = needs_action(wfi, task)
                if options.augment: needs=True
                needs = True
                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide) and pending_HLT < max_HLT:
                    pending_HLT += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T2_CH_CERN_HLT" )
                        wfi.sendLog('equalizor','also adding the HLT in whitelist of %s to %d for %d'%( task.pathName, pending_HLT, max_HLT))

                    ## this Replace does not work at all for HLT
                    elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" )
                        #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT
                        print "already having a site replacement, not adding the HLT for now"
                        pass
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T2_CH_CERN_HLT"],
                                                                   "Priority" : wfi.request['RequestPriority'],
                                                                   "Running" : running,
                                                                   "Pending" : idled}
                        wfi.sendLog('equalizor','adding the HLT in whitelist of %s to %d for %d'%( task.pathName, pending_HLT, max_HLT))

            if i_task==0 and not sec and use_T0:
                needs, task_name, running, idled = needs_action(wfi, task)
                
                if options.augment: needs=True
                #needs = True
                good_type = wfi.request['RequestType'] in ['MonteCarlo','MonteCarloFromGEN'] 
                read_lhe = ((not 'LheInputFiles' in wfi.request) or bool(wfi.request['LheInputFiles']))
                good_type &= not read_lhe
                if not good_type and not options.augment: needs = False
                
                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide):
                    pending_T0 += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        if not "T0_CH_CERN" in modifications[wfo.name][task.pathName]["AddWhitelist"]:
                            modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T0_CH_CERN" )
                            wfi,sendLog('equalizor','adding the T0 for %s to %d for %d'%( task.pathName, pending_T0, max_T0))
                    elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        if not "T0_CH_CERN" in modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"]:
                            modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T0_CH_CERN" )
                            wfi,sendLog('equalizor','adding the T0 to replacement for %s to %d for %d'%( task.pathName, pending_T0, max_T0))
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T0_CH_CERN"],
                                                                   "Priority" : wfi.request['RequestPriority'],
                                                                   "Running" : running,
                                                                   "Pending" : idled}
                        wfi,sendLog('equalizor','adding the T0 for %s to %d for %d'%( task.pathName, pending_T0, max_T0))


    interface['modifications'].update( modifications )



    ###  manage the number of core and job resizing
    #interface['cores']={'T2_CH_CERN_HLT': {'min':4,'max':16}, 'default': {'min':1, 'max':4}}
    #interface['resizes'] = ['RunIISpring16DR80']
    interface['resizing'] = resizing

    ### manage the modification of the memory and target time
    max_N_mem = 2
    max_N_time = 4
    ## discretize the memory to 10 at most values
    mems = set([o['memory'] for t,o in performance.items() if 'memory' in o])
    times = set([o['time'] for t,o in performance.items() if 'time' in o])
    if len(mems)>max_N_mem:
        mem_step = int((max(mems) - min(mems))/ float(max_N_mem))
        print "rebinning memory"
        for t in performance:
            if not 'memory' in performance[t]: continue
            (m,r) = divmod(performance[t]['memory'], mem_step)
            performance[t]['memory'] = (m+1)*mem_step
    if len(times)>max_N_time:
        print "rebinning memory"
        time_step = int((max(times) - min(times))/float(max_N_time))
        for t in performance:
            if not 'time' in performance[t]: continue
            (m,r) = divmod(performance[t]['time'], time_step)
            performance[t]['time'] = (m+1)*time_step

    new_times = defaultdict(list)
    new_memories = defaultdict(list)

    for t,o in performance.items():
        if 'time' in o:
            new_times[str(o['time'])].append( t )
        if 'memory' in o:
            new_memories[str(o['memory'])].append( t )

    interface['time'].update( new_times )
    interface['memory'].update( new_memories )

    ## close and save
    close( interface )
Exemplo n.º 11
0
def injector(url, options, specific):

    use_mcm = True
    up = componentInfo( mcm = use_mcm, soft=['mcm'] )
    if not up.check(): return
    use_mcm = up.status['mcm']

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    workflows.extend( getWorkflows(url, status=options.wmstatus, user='******', rtype="ReReco")) ## regardless of users, pick up all ReReco on the table

    existing = [wf.name for wf in session.query(Workflow).all()]
    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if wf not in existing:
            print "putting",wf
            new_wf = Workflow( name = wf , status = options.setstatus, wm_status = options.wmstatus) 
            session.add( new_wf )
            session.commit()
            time.sleep(1)


    existing = [wf.name for wf in session.query(Workflow).all()]

    ## passing a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):
        invalidator(url)


    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        if specific and wf.name != specific:
            continue
        print wf.name
        wl = getWorkLoad(url, wf.name)
        familly = getWorkflowById( url, wl['PrepID'] )
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url , member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType']=='Resubmission': continue
                if fwl['RequestStatus'] in ['None',None]: continue
            true_familly.append( fwl )

        if len(true_familly)==0:
            print wf.name,"ERROR has no replacement"
            known = []
            try:
                known = json.loads(open('no_replacement.json').read())
            except:
                pass
            if not wf.name in known:
                sendEmail('workflow in %s with no replacement'%(wl['RequestStatus']),'%s is dangling there'%(wf.name))
                known.append( wf.name )
                open('no_replacement.json','w').write( json.dumps( known, indent=2 ))
            continue
        print wf.name,"has",len(familly),"familly members"
        print wf.name,"has",len(true_familly),"true familly members"

        for fwl in true_familly:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(Workflow.name == member).first()
            if not new_wf:
                print "putting",member,"as replacement of",wf.name
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow( name = member, status = status, wm_status = fwl['RequestStatus'])
                wf.status = 'forget'
                session.add( new_wf ) 
            else:
                if new_wf.status == 'forget': continue
                print "getting",new_wf.name,"as replacement of",wf.name
                wf.status = 'forget'

            for tr in session.query(Transfer).all():
                if wf.id in tr.workflows_id:
                    sw = copy.deepcopy(tr.workflows_id)
                    sw.remove( wf.id)
                    sw.append(new_wf.id)
                    tr.workflows_id = sw
                    print tr.phedexid,"got",new_wf.name
                    if new_wf.status != 'away':
                        print "\t setting it considered"
                        new_wf.status = 'considered'
                    if tr.phedexid<0: ## set it back to positive
                        tr.phedexid = -tr.phedexid
                    session.commit()
                        

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
Exemplo n.º 12
0
def equalizor(url , specific = None):

    if not specific:
        workflows = getWorkflows(url, status='running-closed', details=True)
        workflows.extend(getWorkflows(url, status='running-open', details=True))

    ## start from scratch
    modifications = defaultdict(dict)
    ## define regionality site => fallback allowed. feed on an ssb metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)
    SI = siteInfo()
    for site in SI.sites_ready:
        region = site.split('_')[1]
        if not region in ['US','IT']: continue
        regions[region] = [region] 

    def site_in_depletion(s):
        return True
        if s in SI.sites_pressure:
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                print s,m,r,"lacking pressure"
                return True
            else:
                print s,m,r,"pressure"
                pass
                
        return False

    for site in SI.sites_ready:
        region = site.split('_')[1]
        ## fallback to the region, to site with on-going low pressure
        mapping[site] = [fb for fb in SI.sites_ready if any([('_%s_'%(reg) in fb and fb!=site and site_in_depletion(fb))for reg in regions[region]]) ]
    
    mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')


    for site,fallbacks in mapping.items():
        for fb in fallbacks:
            reversed_mapping[fb].append(site)

    ## this is the fallback mapping
    print json.dumps( mapping, indent=2)
    #print json.dumps( reversed_mapping, indent=2)

    altered_tasks = set()

    def running_idle( wfi , task_name):
        gmon = wfi.getGlideMon()
        if not gmon: return (0,0)
        if not task_name in gmon: return (0,0)
        return (gmon[task_name]['Running'], gmon[task_name]['Idle'])

    def needs_action( wfi, task):
        task_name = task.pathName.split('/')[-1]
        running, idled = running_idle( wfi, task_name)
        if not idled and not running : return False, task_name, running, idled
        if idled < 100: return False, task_name, running, idled
        if (not running and idled) or (idled / float(running) > 0.2):
            return True, task_name, running, idled
        else:
            return False, task_name, running, idled

    def getcampaign( task ):
        taskname = task.pathName.split('/')[-1]
        if hasattr( task, 'prepID'):
            return task.prepID.split('-')[1]
        elif taskname.count('-')>=1:
            return taskname.split('-')[1]
        else:
            return None

    for wfo  in session.query(Workflow).filter(Workflow.status == 'away').all():
        if specific and not specific in wfo.name: 
            continue
        if specific:
            wfi = workflowInfo(url, wfo.name)
        else:
            cached = filter(lambda d : d['RequestName']==wfo.name, workflows)
            if not cached : continue
            wfi = workflowInfo(url, wfo.name, request = cached[0])
        
        tasks_and_campaigns = []
        for task in wfi.getWorkTasks():
            tasks_and_campaigns.append( (task, getcampaign(task) ) )

        ## now parse this for action
        for i_task,(task,campaign) in enumerate(tasks_and_campaigns):
            #print task.pathName
            #print campaign
            if campaign in [ 'RunIIWinter15wmLHE', 'RunIISummer15GS'] and wfi.request['RequestType'] in ['TaskChain']:
                if task.taskType == 'Processing':
                    needs, task_name, running, idled = needs_action(wfi, task)
                    if needs:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled,"taking action : ReplaceSiteWhitelist"
                        set_to = wfi.request['SiteWhitelist']
                        modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : set_to ,"Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        altered_tasks.add( task.pathName )
                    else:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled
            if campaign == 'RunIIFall15DR76':
                ## we should add all sites that hold the secondary input if any
                secondary_locations = ['T1_ES_PIC','T2_US_Purdue','T2_UK_SGrid_RALPP','T2_BE_IIHE','T2_DE_DESY','T2_IT_Legnaro','T2_US_Caltech','T1_DE_KIT','T2_UK_London_Brunel','T2_IT_Pisa','T1_US_FNAL','T2_IT_Rome','T2_US_Florida','T1_IT_CNAF','T1_RU_JINR','T2_UK_London_IC','T2_US_Nebraska','T2_FR_CCIN2P3','T2_US_UCSD','T2_ES_CIEMAT','T1_FR_CCIN2P3','T2_US_Wisconsin','T2_US_MIT','T2_DE_RWTH','T1_UK_RAL','T2_US_Vanderbilt','T2_CH_CERN']
                ## should discover the above from secondary location (remember to cache this)
                #(lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList()

                ## removing the ones in the site whitelist already since they encode the primary input location
                augment_by = list(set(secondary_locations)- set(wfi.request['SiteWhitelist']))
                if task.pathName.endswith('_0'):
                    needs, task_name, running, idled = needs_action(wfi, task)
                    if needs:
                        ## the step with an input ought to be the digi part : make this one go anywhere
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : augment_by , "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        altered_tasks.add( task.pathName )
                        print task_name,"of",wfo.name,"running",running,"and pending",idled,"taking action : AddWhitelist"
                    else:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled
            if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task==0:
                if random.random()<0.005:
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T2_CH_CERN_HLT" )
                        print wfo.name,"adding HLT"
                    elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" )
                        print wfo.name,"adding HLT"
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T2_CH_CERN_HLT"], "Priority" : wfi.request['RequestPriority']}
                        print wfo.name,"adding HLT"


    interface = {
        'reversed_mapping' : reversed_mapping,
        'modifications' : {}
        }
    if options.augment:
        interface['modifications'] = json.loads( open('/afs/cern.ch/user/c/cmst2/www/unified/equalizor.json').read())['modifications']
    interface['modifications'].update( modifications )
    open('/afs/cern.ch/user/c/cmst2/www/unified/equalizor.json.new','w').write( json.dumps( interface, indent=2))
    os.system('mv /afs/cern.ch/user/c/cmst2/www/unified/equalizor.json.new /afs/cern.ch/user/c/cmst2/www/unified/equalizor.json')
    os.system('cp /afs/cern.ch/user/c/cmst2/www/unified/equalizor.json /afs/cern.ch/user/c/cmst2/www/unified/logs/equalizor/equalizor.%s.json'%(time.mktime(time.gmtime())))
    #open('/afs/cern.ch/user/c/cmst2/www/unified/logs/equalizor/equalizor.%s.json'%(time.gmtime()),'w').write( json.dumps( altered_tasks , indent=2))
    sendEmail("Altering the job whitelist","The following tasks had condor rule set for overflow \n%s"%("\n".join( altered_tasks )))
Exemplo n.º 13
0
def transferor(url, specific=None, talk=True, options=None):
    if userLock('transferor'): return

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    print "counting all being handled..."
    being_handled = len(
        session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('stag')).all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance')).all())
    max_to_handle = options.maxworkflows
    allowed_to_handle = max(0, max_to_handle - being_handled)
    wf_buffer = 5
    if allowed_to_handle <= wf_buffer:  ## buffer for having several wf per transfer
        print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer"
    else:
        print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer"
    print "... done"

    all_transfers = defaultdict(list)
    workflow_dependencies = defaultdict(
        set)  ## list of wf.id per input dataset
    wfs_and_wfh = []
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(
            Workflow.status == 'considered').all():
        if specific and not specific in wfo.name: continue
        cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append((wfo,
                                workflowInfo(url,
                                             wfo.name,
                                             spec=False,
                                             request=cache_r[0])))
        else:
            wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False)))
    print "... done"

    input_sizes = {}
    ## list the size of those in transfer already
    in_transfer_priority = 0
    min_transfer_priority = 100000000
    print "getting all wf in staging ..."
    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfh = workflowInfo(url, wfo.name, spec=False)
        (_, primary, _, _) = wfh.getIO()
        for prim in primary:
            input_sizes[prim] = dss.get(prim)
        in_transfer_priority = max(in_transfer_priority,
                                   int(wfh.request['RequestPriority']))
        min_transfer_priority = min(min_transfer_priority,
                                    int(wfh.request['RequestPriority']))
    print "... done"
    print "Max priority in transfer already", in_transfer_priority
    print "Min priority in transfer already", min_transfer_priority
    in_transfer_already = sum(input_sizes.values())

    #sort by priority higher first
    wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[
        'RequestPriority']), int(j[1].request['RequestPriority'])),
                     reverse=True)

    ## list the size of all inputs
    print "getting all input sizes ..."
    for (wfo, wfh) in wfs_and_wfh:
        (_, primary, _, _) = wfh.getIO()
        for prim in primary:
            input_sizes[prim] = dss.get(prim)
    print "... done"

    grand_total = sum(input_sizes.values())
    to_transfer = grand_total - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered" % in_transfer_already
    print "%15.4f GB is the current requested transfer load" % to_transfer
    print "%15.4f GB is the global transfer limit" % grand_transfer_limit
    print "%15.4f GB is the available limit" % transfer_limit

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer = 0  ## so that we can count'em
    passing_along = 0
    transfer_sizes = {}
    went_over_budget = False
    for (wfo, wfh) in wfs_and_wfh:
        print wfh.request['RequestPriority']
        print wfo.name, "to be transfered"
        #wfh = workflowInfo( url, wfo.name)

        (_, primary, _, _) = wfh.getIO()
        this_load = sum([input_sizes[prim] for prim in primary])
        if (this_load
                and (sum(transfer_sizes.values()) + this_load > transfer_limit
                     or went_over_budget)):
            if went_over_budget:
                print "Transfer has gone over bubget."
            else:
                print "Transfer will go over bubget."
            print "%15.4f GB this load" % this_load
            print "%15.4f GB already this round" % sum(transfer_sizes.values())
            print "%15.4f GB is the available limit" % transfer_limit
            went_over_budget = True
            if int(
                    wfh.request['RequestPriority']
            ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                print "Higher priority sample", wfh.request[
                    'RequestPriority'], ">=", in_transfer_priority, "go-on over budget"
            else:
                if not options.go:
                    print min_transfer_priority, "minimum priority", wfh.request[
                        'RequestPriority'], "<", in_transfer_priority, "stop"
                    continue

        ## throtlle by campaign go
        if not CI.go(wfh.request['Campaign']):
            print "No go for", wfh.request['Campaign']
            if not options.go: continue

        ## check if the batch is announced
        announced = False
        is_real = False
        for b in mcm.getA('batches', query='contains=%s' % wfo.name):
            is_real = True
            if b['status'] == 'announced':
                announced = True
                break

        if not announced:
            print wfo.name, "does not look announced."  # skipping?, rejecting?, reporting?"

        if not is_real:
            print wfo.name, "does not appear to be genuine."
            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(
            time.strptime('.'.join(map(str, wfh.request['RequestDate'])),
                          "%Y.%m.%d.%H.%M.%S")) / (60. * 60.)
        now = time.mktime(time.gmtime()) / (60. * 60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced:
                print "It is too soon to start transfer: %3.2fH remaining" % (
                    now - injection_time)
                continue

        passing_along += 1
        if passing_along >= allowed_to_handle:
            if int(
                    wfh.request['RequestPriority']
            ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                print "Higher priority sample", wfh.request[
                    'RequestPriority'], ">=", in_transfer_priority, "go-on over", max_to_handle
            else:
                print "Not allowed to pass more than", max_to_handle, "at a time. Currently", being_handled, "handled, and adding", passing_along
                break

        (lheinput, primary, parent, secondary) = wfh.getIO()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')
        else:
            sites_allowed = getSiteWhiteList(
                (lheinput, primary, parent, secondary))

        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(
                wfh.request['Campaign'])['SiteWhitelist']

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']

        can_go = True
        staging = False
        if primary:
            if talk:
                print wfo.name, 'reads', ', '.join(primary), 'in primary'
            ## chope the primary dataset
            for prim in primary:
                max_priority[prim] = max(max_priority[prim],
                                         int(wfh.request['RequestPriority']))
                sites_really_allowed = [
                    site for site in sites_allowed if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]
                print "Sites allowed minus the vetoed transfer"
                print sites_really_allowed
                copies_needed = int(
                    0.35 * len(sites_really_allowed)
                ) + 1  ## should just go for a fixed number based if the white list grows that big
                print "Would make", copies_needed, "copies"
                if options.maxcopy > 0:
                    copies_needed = min(options.maxcopy, copies_needed)

                ## remove the sites that do not want transfers
                print "need", copies_needed
                workflow_dependencies[prim].add(wfo.id)
                presence = getDatasetPresence(url, prim)
                prim_location = [
                    site for site, pres in presence.items() if pres[0] == True
                ]
                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at", len(
                        prim_location), "sites"
                    continue
                # reduce the number of copies required by existing full copies
                copies_needed = max(0, copies_needed - len(prim_location))
                print "now need", copies_needed
                subscriptions = listSubscriptions(url, prim)
                prim_destination = list(
                    set([
                        site
                        for (site, (tid, decision)) in subscriptions.items()
                        if decision and not any([
                            site.endswith(veto)
                            for veto in ['MSS', 'Export', 'Buffer']
                        ])
                    ]))
                ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place
                prim_destination = [
                    site for site in prim_destination
                    if not site in prim_location
                ]
                ## add transfer dependencies
                latching_on_transfers = list(
                    set([
                        tid
                        for (site, (tid, decision)) in subscriptions.items()
                        if decision and site in prim_destination and not any([
                            site.endswith(veto)
                            for veto in ['MSS', 'Export', 'Buffer']
                        ])
                    ]))
                print latching_on_transfers
                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(
                        Transfer.phedexid == latching).first()
                    if not tfo:
                        tfo = Transfer(phedexid=latching)
                        tfo.workflows_id = []
                        session.add(tfo)

                    if not wfo.id in tfo.workflows_id:
                        print "adding", wfo.id, "to", tfo.id, "with phedexid", latching
                        l = copy.deepcopy(tfo.workflows_id)
                        l.append(wfo.id)
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush(
                        )  ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                copies_needed = max(0, copies_needed - len(prim_destination))
                print "then need", copies_needed
                if copies_needed == 0:
                    print "The output is either fully in place or getting in full somewhere with", latching_on_transfers
                    can_go = True
                    continue
                prim_to_distribute = [
                    site for site in sites_allowed if not any(
                        [osite.startswith(site) for osite in prim_location])
                ]
                prim_to_distribute = [
                    site for site in prim_to_distribute if not any(
                        [osite.startswith(site) for osite in prim_destination])
                ]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [
                    site for site in prim_to_distribute if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]
                if len(
                        prim_to_distribute
                ) > 0:  ## maybe that a parameter we can play with to limit the
                    if not options or options.chop:
                        spreading = distributeToSites(getDatasetChops(prim),
                                                      prim_to_distribute,
                                                      n_copies=copies_needed,
                                                      weights=SI.cpu_pledges)
                    else:
                        spreading = {}
                        for site in prim_to_distribute:
                            spreading[site] = [prim]
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    for (site, items) in spreading.items():
                        all_transfers[site].extend(items)

        if secondary:
            if talk:
                print wfo.name, 'reads', ', '.join(secondary), 'in secondary'
            for sec in secondary:
                workflow_dependencies[sec].add(wfo.id)
                presence = getDatasetPresence(url, sec)
                sec_location = [
                    site for site, pres in presence.items() if pres[1] > 90.
                ]  ## more than 90% of the minbias at sites
                subscriptions = listSubscriptions(url, sec)
                sec_destination = [site for site in subscriptions]
                sec_to_distribute = [
                    site for site in sites_allowed if
                    not any([osite.startswith(site) for osite in sec_location])
                ]
                sec_to_distribute = [
                    site for site in sec_to_distribute if not any(
                        [osite.startswith(site) for osite in sec_destination])
                ]
                sec_to_distribute = [
                    site for site in sec_to_distribute if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]
                if len(sec_to_distribute) > 0:
                    for site in sec_to_distribute:
                        all_transfers[site].append(sec)
                        can_go = False

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                print wfo.name, "latches on existing transfers, and nothing else"
                wfo.status = 'staging'
            else:
                print wfo.name, "should just be assigned NOW to", sites_allowed
                wfo.status = 'staged'
            print "setting status to", wfo.status
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                print wfo.name, "latches on existing transfers"
                if not options.test:
                    wfo.status = 'staging'
                    print "setting status to", wfo.status
                    session.commit()
            print wfo.name, "needs a transfer"
            needs_transfer += 1

    #print json.dumps(all_transfers)
    fake_id = -1
    wf_id_in_prestaging = set()

    for (site, items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))
        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        if site in SI.sites_veto_transfer:
            print site, "does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]

        if execute:
            print "Making a replica to", site, "(CE)", site_se, "(SE) for"
        else:
            print "Would make a replica to", site, "(CE)", site_se, "(SE) for"

        print "\t", len(blocks), "blocks"
        ## remove blocks if full dataset is send out
        blocks = [
            block for block in blocks if not block.split('#')[0] in datasets
        ]
        print "\t", len(blocks), "needed blocks for", list(
            set([block.split('#')[0] for block in blocks]))
        print "\t", len(datasets), "datasets"
        print "\t", datasets
        items_to_transfer = blocks + datasets

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y', 'yes', 'go']:
                continue

        if execute:
            result = makeReplicaRequest(url,
                                        site_se,
                                        items_to_transfer,
                                        'prestaging',
                                        priority='normal')
            ## make use of max_priority dataset:priority to set the subscriptions priority
            """
            ## does not function
            once = True
            for item in items_to_transfer:
                bds = item.split('#')[0]
                if max_priority[bds] >= 90000:
                    if once:
                        w=10
                        print "waiting",w,"s before raising priority"
                        time.sleep(w)
                        once=False
                    ## raise it to high priority
                    print item,"subscription priority raised to high at",site_se
                    #print "This does not work yet properly it seems"
                    print updateSubscription(url, site_se, item, priority='high')
            """
        else:
            #result= {'phedex':{'request_created' : [{'id' : fake_id}]}}
            result = {'phedex': {'request_created': []}}
            fake_id -= 1

        if not result:
            print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(
                Transfer.phedexid == phedexid).first()
            print phedexid, "transfer created"
            if not new_transfer:
                new_transfer = Transfer(phedexid=phedexid)
                session.add(new_transfer)
            new_transfer.workflows_id = set()
            for transfering in list(
                    set(map(lambda it: it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update(
                    workflow_dependencies[transfering])
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status != 'staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting", tr_wf.name, "to staging"
        session.commit()
Exemplo n.º 14
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = json.loads(
        open('%s/dataset_endpoints.json' % monitor_dir).read())
    aaa_mapping = json.loads(
        open('%s/equalizor.json' % monitor_pub_dir).read())['mapping']

    all_stuck = set()
    all_stuck.update(
        json.loads(open('%s/stuck_transfers.json' % monitor_pub_dir).read()))
    all_stuck.update(getAllStuckDataset())

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ##order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True),
                       key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]

        def rank(wfn):
            return cache.index(wfn) if wfn in cache else 0

        wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True)
        print "10 first", [wfo.name for wfo in wfos[:10]]
        print "10 last", [wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle(wfos)

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"
        if options.partial:
            options_text += ", partial option is ON"
            options_text += ", good fraction is %.2f" % options.good_enough

        wfh.sendLog('assignor',
                    "%s to be assigned%s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                wfh.sendLog(
                    'assignor', '%s is not an allowed secondary' %
                    (', '.join(set(secondary) -
                               set(allowed_secondary.keys()))))
                sendLog(
                    'assignor',
                    '%s is not an allowed secondary' %
                    (', '.join(set(secondary) -
                               set(allowed_secondary.keys()))),
                    level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'],
                                       'SecondaryLocation', [])

        blocks = wfh.getBlockWhiteList()
        rwl = wfh.getRunWhiteList()
        if rwl:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set(blocks +
                                  getDatasetBlocks(dataset, runs=rwl)))
        lwl = wfh.getLumiWhiteList()
        if lwl:
            ## augment with lumi white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(dataset, lumis=lwl)))

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False  #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog(
                    'assignor',
                    "Overiding partial copy assignment to %.2f fraction" %
                    do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))

        do_partial = options.good_enough if options.partial else do_partial

        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass

            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items()
                if frac > 98.
            ]

            if secondary_aaa:
                if not one_secondary_locations:
                    sec_availability = getDatasetBlocksFraction(url, sec)
                    if sec_availability >= 1. and options.go:
                        ## there is at least one copy of each block on disk. We should go ahead and let it go.
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is available %s times on disk, and usable"
                            % (sec, sec_availability))
                    else:
                        ## not even a copy on disk anywhere !!!!
                        sites_allowed = []  ## will block the assignment
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is nowhere on disk" % sec)
                #just continue without checking
                continue

            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [
                site for site in sites_allowed
                if SI.CE_to_SE(site) in one_secondary_locations
            ]

        wfh.sendLog(
            'assignor', "From/after secondary requirement, now Allowed%s" %
            sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy(
            sites_allowed
        )  ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default

        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed],
                only_blocks=blocks)
            if primary_aaa:
                available_fractions[prim] = getDatasetBlocksFraction(
                    url, prim, only_blocks=blocks)

            sites_all_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in [
                    psite for (psite, (there, frac)) in presence.items()
                    if there
                ]
            ]
            if primary_aaa:
                sites_all_data = list(
                    set([
                        SI.SE_to_CE(psite)
                        for (psite, (there, frac)) in presence.items() if there
                    ]))
            sites_with_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in
                [psite for (psite, frac) in presence.items() if frac[1] > 90.]
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if SI.CE_to_SE(site) in presence.keys()
            ]
            if primary_aaa:
                sites_with_any_data = list(
                    set([SI.SE_to_CE(psite) for psite in presence.keys()]))

            wfh.sendLog(
                'assignor', "Holding the data but not allowed %s" % sorted(
                    list(
                        set([
                            se_site for se_site in presence.keys()
                            if not SI.SE_to_CE(se_site) in sites_allowed
                        ]))))
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in
                    list((set(secondary_locations) & set(primary_locations)) -
                         set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in list(
                        set(primary_locations) -
                        set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog(
                'assignor', "We could be running in addition at %s" %
                sorted(opportunistic_sites))
            if any(
                [osite in SI.sites_not_ready
                 for osite in opportunistic_sites]):
                wfh.sendLog(
                    'assignor', "One of the usable site is in downtime %s" % ([
                        osite for osite in opportunistic_sites
                        if osite in SI.sites_not_ready
                    ]))
                down_time = True
                ## should this be send back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                    wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[
                wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(
                1, copies_wanted -
                less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',
                    "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_all_data:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
                available_fractions = {}
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready
                                         for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints", sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowin the endpoints"
                n_stalled += 1
                continue

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog(
                'assignor',
                "The workflow can run at %s under low pressure currently" %
                (','.join(allowed_and_low)))
            copies_wanted = max(1., copies_wanted - 1.)

        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            not_even_once = not all([
                available >= 1. for available in available_fractions.values()
            ])
            above_good = all([
                available >= do_partial
                for available in available_fractions.values()
            ])
            wfh.sendLog(
                'assignor',
                "The input dataset is not available %s times, only %s" %
                (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog(
                    'assignor',
                    "sending back to considered because of site downtime, instead of waiting"
                )
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog(
                    'assignor',
                    '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'
                    % (wfo.name),
                    level='delay')
                n_stalled += 1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (
                        do_partial and above_good):
                    wfh.sendLog(
                        'assignor',
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append(wfo.name)
                    open('cannot_assign.json',
                         'w').write(json.dumps(known, indent=2))

                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled += 1
                    continue

        if not len(sites_allowed):
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading secondary through xrootd at %s" %
                sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            """Copy command-line assignment options into `parameters`.

            For every key listed in reqMgrClient.assignWorkflow.keys the
            attribute of the same name is read from `options`. Values that
            are None are ignored; any other value overwrites the entry of
            `parameters` under that key. A plain string containing a comma
            is split on commas into a list, with empty items dropped.

            `parameters` is modified in place and nothing is returned.
            Nothing happens when `options` is falsy.
            """
            if not options:
                return
            for key in reqMgrClient.assignWorkflow.keys:
                value = getattr(options, key)
                if value is None:
                    continue
                if isinstance(value, str) and ',' in value:
                    # build a real list (not a lazy filter object) so that the
                    # parameters stay JSON-serializable on any Python version
                    parameters[key] = [
                        item for item in value.split(',') if item
                    ]
                else:
                    parameters[key] = value

        def pick_campaign(assign_parameters, parameters):
            """Overlay the campaign specific assignment parameters.

            The optional 'parameters' entry of `assign_parameters` is merged
            into `parameters` in place, overwriting keys already present.
            Without such an entry `parameters` is left untouched.
            """
            campaign_overrides = assign_parameters.get('parameters', {})
            parameters.update(campaign_overrides)

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl[
                        'events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl[
                        'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        if isHEPCloudReady(url) and wfh.isGoodForNERSC():
            parameters['Team'] = 'hepcloud'
            parameters['SiteWhitelist'] = ['T3_US_NERSC']
            if primary:
                parameters['TrustSitelists'] = True
            if secondary:
                parameters['TrustPUSitelists'] = True
            sendEmail("sending work to hepcloud",
                      "pleasse check on %s" % wfh.request['RequestName'],
                      destination=['*****@*****.**'])

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))

        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
Exemplo n.º 15
0
#!/usr/bin/env python
from utils import workflowInfo, getWorkflows, sendEmail, componentInfo, monitor_dir, reqmgr_url, newLockInfo
from assignSession import *
import reqMgrClient
import os
import sys
import json

url = reqmgr_url

#nl = newLockInfo()
#nl.lock('/Neutrino_E-10_gun/RunIISpring15PrePremix-AVE_25_BX_25ns_76X_mcRun2_asymptotic_v12-v3/GEN-SIM-DIGI-RAW')
#nl.lock('/DYJetsToLL_M-50_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8/RunIISummer15GS-MCRUN2_71_V1_ext1-v2/GEN-SIM')

## all dqmharvest completed to announced right away
wfs = getWorkflows(url, 'completed', user=None, rtype='DQMHarvest')
for wf in wfs:
    print "closing out", wf
    reqMgrClient.closeOutWorkflow(url, wf)
wfs = getWorkflows(url, 'closed-out', user=None, rtype='DQMHarvest')
for wf in wfs:
    print "announcing", wf
    reqMgrClient.announceWorkflow(url, wf)

#os.system('Unified/equalizor.py -a pdmvserv_task_HIG-RunIIFall15DR76-01039__v1_T_160120_002705_9423')
#os.system('Unified/equalizor.py -a pdmvserv_SMP-Summer12DR53X-00027_00440_v0__160224_044437_5031')

up = componentInfo(mcm=False, soft=['mcm'])
if not up.check():
    sys.exit(1)
Exemplo n.º 16
0
from utils import getDatasetBlockAndSite, siteInfo, getWorkflows, workflowInfo, monitor_dir
from collections import defaultdict
import time
import json
import sys
# optional substring filter on the workflow name, given as first argument
spec = sys.argv[1] if len(sys.argv) > 1 else None

url = 'cmsweb.cern.ch'

# detailed records of every workflow that is acquired or running
wfs = getWorkflows(url, 'acquired', details=True)
for running_status in ('running-open', 'running-closed'):
    wfs.extend(getWorkflows(url, running_status, details=True))


def _two_level(leaf_factory):
    """Return a defaultdict whose missing keys map to defaultdict(leaf_factory)."""
    return defaultdict(lambda: defaultdict(leaf_factory))


# module-level accumulators, all empty at this point
jobs_for = _two_level(int)
wf_for = _two_level(set)
agent_for = _two_level(set)
s_block_locations = {}
block_locations = _two_level(list)
wfs_no_location_in_GQ = defaultdict(list)
si = siteInfo()
#bad_blocks = defaultdict( set )
unprocessable = set()

for wf in wfs:
    # honour the optional name filter
    if spec and spec not in wf['RequestName']:
        continue

    # reuse the record already fetched instead of querying it again
    wfi = workflowInfo(url, wf['RequestName'], request=wf)
    sitewhitelist = wfi.request['SiteWhitelist']
    wqs = wfi.getWorkQueue()
Exemplo n.º 17
0
def assignor(url ,specific = None, talk=True, options=None):
    if userLock(): return
    mlock = moduleLock()
    if mlock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go: return

    n_assigned = 0
    n_stalled = 0

    wfos=[]
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered','staging'])
    if specific:
        fetch_from.extend(['considered-tried'])


    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from",fetch_from

    for status in fetch_from:
        print "getting wf in",status
        wfos.extend(session.query(Workflow).filter(Workflow.status==status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read())
    aaa_mapping = json.loads(open('%s/equalizor.json'%monitor_pub_dir).read())['mapping']
    all_stuck = set()
    all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_pub_dir).read() ))
    all_stuck.update( getAllStuckDataset()) 

    max_per_round = UC.get('max_per_round').get('assignor',None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ##order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]
        def rank( wfn ):
            """Index of `wfn` in the priority-sorted `cache`; 0 when it is absent."""
            try:
                return cache.index( wfn )
            except ValueError:
                ## unknown workflows rank like the first cache entry
                return 0

        wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True)
        print "10 first",[wfo.name for wfo in wfos[:10]]
        print "10 last",[wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle( wfos )



    for wfo in wfos:
        
        if options.limit and (n_stalled+n_assigned)>options.limit:
            break

        if max_per_round and (n_stalled+n_assigned)>max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo( url, wfo.name)

        if wfh.request['RequestStatus'] in ['rejected','aborted','aborted-completed','aborted-archived','rejected-archived'] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled+=1
            continue


        if options.priority and int(wfh.request['RequestPriority']) < options.priority:
            continue

        options_text=""
        if options.early: options_text+=", early option is ON"
        if options.partial: 
            options_text+=", partial option is ON"
            options_text+=", good fraction is %.2f"%options.good_enough
        


        wfh.sendLog('assignor',"%s to be assigned%s"%(wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled+=1
            wfh.sendLog('assignor','There is no output at all')
            sendLog('assignor','Workflow %s has no output at all'%( wfo.name), level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',"%s are stuck input"%(','.join( is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update( CI.campaigns[campaign] )

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update( CI.campaigns[campaign]['secondaries'] )
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]:
                banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers))
                if banned_tier:
                    no_go=True
                    wfh.sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)))
                    sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)), level='critical')

        if secondary and check_secondary:
            if (set(secondary)&set(allowed_secondary.keys())!=set(secondary)):
                wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))))
                sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))), level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:# and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update( allowed_secondary[sec] )

        if no_go:
            n_stalled+=1
            ## make a very loud noise if >100k priority stalled
            continue


            
        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] !='assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name,wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version=wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor',"cannot decide on version number")
                n_stalled+=1
                wfo.status = 'trouble'
                session.commit()
                continue


        original_sites_allowed = copy.deepcopy( sites_allowed )
        wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', [])

        blocks = wfh.getBlockWhiteList()
        rwl = wfh.getRunWhiteList()
        if rwl:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, runs=rwl ) ))
        lwl = wfh.getLumiWhiteList()
        if lwl:
            ## augment with lumi white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, lumis=lwl)))

        wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed))
        secondary_locations=None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns:
            assign_parameters.update( CI.campaigns[wfh.request['Campaign']] )

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog('assignor',"Overiding partial copy assignment to %.2f fraction"% do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))
            
        do_partial = options.good_enough if options.partial else do_partial


        for sec in list(secondary):
            if override_sec_location: 
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass

            presence = getDatasetPresence( url, sec )
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]

            if secondary_aaa:
                if not one_secondary_locations:
                    sec_availability = getDatasetBlocksFraction( url, sec )
                    if sec_availability >=1. and options.go:
                        ## there is at least one copy of each block on disk. We should go ahead and let it go.
                        wfh.sendLog('assignor',"The secondary %s is available %s times on disk, and usable"%( sec, sec_availability))
                    else:
                        ## not even a copy on disk anywhere !!!!
                        sites_allowed = [] ## will block the assignment
                        wfh.sendLog('assignor',"The secondary %s is nowhere on disk"% sec)
                #just continue without checking
                continue

            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations==None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]
            
        wfh.sendLog('assignor',"Intersecting with secondary requirement, now allowed %s"%sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy( sites_allowed )
        sites_with_data = copy.deepcopy( sites_allowed )
        sites_with_any_data = copy.deepcopy( sites_allowed )
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc' ## by default

        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor",dataset_endpoints[prim]
                endpoints.update( dataset_endpoints[prim] )
            set_lfn = getLFNbase( prim )
            ## if they are requested for processing, they should bbe all closed already
            closeAllBlocks(url, prim, blocks)
            presence = getDatasetPresence( url, prim , only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] =  getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks)
            if primary_aaa:
                available_fractions[prim] =  getDatasetBlocksFraction(url, prim, only_blocks = blocks)

            sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]]
            if primary_aaa:
                sites_all_data = set()
                for (psite,(there,frac)) in presence.items():
                    if there:
                        sites_all_data.update( SI.SE_to_CEs(psite) )
                sites_all_data = list(sites_all_data)
                #sites_all_data = list(set([SI.SE_to_CE(psite) for (psite,(there,frac)) in presence.items() if there]))
            sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            if primary_aaa:
                sites_with_any_data = set()
                for psite in presence.keys():
                    sites_with_any_data.update( SI.SE_to_CEs(psite) )
                sites_with_any_data = list(sites_with_any_data)
                #sites_with_any_data = list(set([SI.SE_to_CE(psite) for psite in presence.keys()]))

            holding_but_not_allowed = set()
            for se_site in presence.keys():
                if not (set(SI.SE_to_CEs(se_site)) & set(sites_allowed)):
                    holding_but_not_allowed.add( se_site )
            #wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))))
            wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted( holding_but_not_allowed ))
            if primary_locations==None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys() ))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites=[]
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            elif primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            else:
                opportunistic_sites = []
            wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites))
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite for osite in opportunistic_sites if osite in SI.sites_not_ready]))
                down_time = True
                ## should this be send back to considered ?
                

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted,cpuh = wfh.getNCopies()
        wfh.sendLog('assignor',"we need %s CPUh"%cpuh)
        if cpuh>max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical')
            wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)
        
        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor',"Allowed sites :%s"% sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_all_data:
                wfh.sendLog('assignor',"Overiding the primary on AAA setting to Off")
                primary_aaa=False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update( aaa_mapping.get(site,[]) )
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog('assignor',"Selected to read primary through xrootd %s"%sorted(sites_allowed))
                
        isStoreResults = ( 'StoreResults' == wfh.request.setdefault('RequestType',None) )

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled+= 1
                wfh.sendLog('assignor',"Cannot assign StoreResults request because MergedLFN is missing")
                sendLog('assignor','Cannot assign StoreResults request because MergedLFN is missing', level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and 
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist'] 
                else: 
                    wfh.sendLog('assignor',"Cannot assign StoreResults request because SiteWhitelist is missing")
                    sendLog('assignor','Cannot assign StoreResults request because SiteWhitelist is missing', level='critical')
                    n_stalled += 1
                    continue
                available_fractions = {}
            wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints",sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowin the endpoints"
                n_stalled+=1
                continue
            
            
        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue


        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog('assignor',"The workflow can run at %s under low pressure currently"%( ','.join( allowed_and_low )))
            copies_wanted = max(1., copies_wanted-1.)


        if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available>=1. for available in available_fractions.values()])
            above_good = all([available >= do_partial for available in available_fractions.values()])
            wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting")
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay')
                n_stalled+=1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good):
                    wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                
                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor',"setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is",wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled+=1
                    continue

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor',"cannot be assign with no matched sites")
                sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
            n_stalled+=1
            continue


        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
            
            
        wfh.sendLog('assignor',"Placing the output on %s"%sites_out)
        parameters={
            'SiteWhitelist' : sites_allowed,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : set_lfn,
            'ProcessingVersion' : version,
            }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog('assignor',"Reading primary through xrootd at %s"%sorted(sites_allowed))            

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog('assignor',"Reading secondary through xrootd at %s"%sorted(sites_allowed))            

        ## plain assignment here
        team='production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team


        if lheinput:
            ## throttle reading LHE article 
            wfh.sendLog('assignor', 'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            """Copy the options given on the command line into ``parameters``.

            For every key listed in ``reqMgrClient.assignWorkflow.keys`` the
            attribute of the same name is read from ``options``; values that
            are ``None`` are skipped. A string value containing a comma is
            split on commas and stored as a list of its non-empty items, any
            other value is stored unchanged.

            ``parameters`` is modified in place; nothing is returned. A falsy
            ``options`` leaves ``parameters`` untouched.
            """
            if not options:
                return
            for key in reqMgrClient.assignWorkflow.keys:
                value = getattr(options, key)
                if value is None:
                    continue
                if isinstance(value, str) and ',' in value:
                    ## build a real list (not a lazy filter object) so that the
                    ## parameters stay serializable whatever the interpreter
                    parameters[key] = [item for item in value.split(',') if item]
                else:
                    parameters[key] = value

        def pick_campaign( assign_parameters, parameters):
            """Merge the campaign specific assignment parameters into ``parameters``.

            The mapping stored under the 'parameters' key of
            ``assign_parameters`` (an empty one when the key is absent) is
            applied on top of ``parameters``, in place.
            """
            campaign_overrides = assign_parameters.get('parameters', {})
            parameters.update( campaign_overrides )

        if options.force_options:
            pick_campaign( assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign( assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog('assignor','Holding on to the change in splitting %s'%( '\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor','Change of splitting is on hold')                
            n_stalled+=1
            continue            

        if split_check==None or split_check==False:
            n_stalled+=1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, 
                                              wfo.name,
                                              split_check)
            wfh.sendLog('assignor','Applying the change in splitting %s'%( '\n\n'.join([str(i) for i in split_check])))

        split_check = True ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents/(reqJobs*1.4))
                lumisPerJob = int(eventsPerJob/eventsPerLumi)
                if lumisPerJob==0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical')
                    wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical')
                        wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical')
                        wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.")

        if isHEPCloudReady(url) and wfh.isGoodForNERSC():
            parameters['Team'] = 'hepcloud'
            parameters['SiteWhitelist'] = ['T3_US_NERSC']
            if primary:
                parameters['TrustSitelists'] = True
            if secondary:
                parameters['TrustPUSitelists'] = True
            sendEmail("sending work to hepcloud","pleasse check on %s"% wfh.request['RequestName'], destination=['*****@*****.**'])
        
        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites',[])))
        
        result = reqMgrClient.assignWorkflow(url, wfo.name, None, parameters) ## team is not relevant anymore here


        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned+=1
                wfh.sendLog('assignor',"Properly assigned\n%s"%(json.dumps( parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo( url, wfo.name)
                    (_,prim,_,sec) = new_wfi.getIO()
                    for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock( secure, reason = 'assigning')

                except Exception as e:
                    print "fail in locking output"
                    
                    print str(e)
                    sendEmail("failed locking of output",str(e))


            else:
                wfh.sendLog('assignor',"Failed to assign %s.\n%s \n Please check the logs"%(wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',"Failed to assign %s.\n%s \n Please check the logs"%(wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical')
                print "ERROR could not assign",wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',"%s workflows cannot be assigned. Please take a look"%(n_stalled), level='critical')
Exemplo n.º 18
0
def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock() and not options.go:  return

    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    def time_point(label="",sub_lap=False):
        """Print a timing checkpoint and move the lap markers forward.

        label   : free text included in the first printed line.
        sub_lap : when True, print the time elapsed since the previous call
                  (whatever its kind) and move only the sub-lap marker;
                  when False, print the time elapsed since the previous
                  full lap and move both the lap and sub-lap markers.

        The function reads ``time_point.start``, ``time_point.lap`` and
        ``time_point.sub_lap``; these attributes have to be set on the
        function before the first call.
        """
        ## read the clock only once, so that the printed date and the number
        ## used for the elapsed times describe the same instant
        gmt = time.gmtime()
        now = time.mktime(gmt)
        nows = time.asctime(gmt)

        ## a single parenthesized argument prints the same text as the
        ## print statement used in the rest of the file
        print("Time check (%s) point at : %s"%(label, nows))
        print("Since start: %s [s]"% ( now - time_point.start))
        if sub_lap:
            print("Sub Lap : %s [s]"% ( now - time_point.sub_lap ))
            time_point.sub_lap = now
        else:
            print("Lap : %s [s]"% ( now - time_point.lap ))
            time_point.lap = now
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime())
    
    runnings = session.query(Workflow).filter(Workflow.status == 'away').all()
    standings = session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()

    ## intersect with what is actually in completed status in request manager now
    all_completed = set(getWorkflows(url, 'completed' ))

    wfs=[]

    if options.strict:
        ## the one which were running and now have completed
        print "strict option is on: checking workflows that freshly completed"
        wfs.extend( filter(lambda wfo: wfo.name in all_completed , runnings))
    if options.update:
        print "update option is on: checking workflows that have not completed yet"
        wfs.extend( filter(lambda wfo: not wfo.name in all_completed , runnings))

    if options.clear:
        print "clear option is on: checking workflows that are ready to toggle closed-out"
        wfs.extend( filter(lambda wfo: 'custodial' in wfo.status, standings))
    if options.review:
        print "review option is on: checking the workflows that needed intervention"
        wfs.extend( filter(lambda wfo: not 'custodial' in wfo.status, standings))

    ## what is left out are the wf which were running and ended up aborted/failed/...

    

    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False) if use_mcm else None

    def get_campaign(output, wfi):
        """Return the campaign to associate with ``output`` for workflow ``wfi``.

        output : dataset name; the candidate "era" is the text before the first
                 '-' in its third '/'-separated field.
        wfi    : workflow information object; its ``request`` mapping and its
                 ``isRelval()`` method are used.

        For a relval workflow the 'Campaign' entry of the request is returned
        (None when the request has none). Otherwise the era taken from the
        output name is returned, falling back to the request campaign when
        the era cannot be extracted or is empty.
        """
        ## this should be a perfect matching of output->task->campaign
        wf_campaign = wfi.request['Campaign'] if 'Campaign' in wfi.request else None

        try:
            era = output.split('/')[2].split('-')[0]
        except Exception:
            ## best effort: any malformed output name simply gives no era, but
            ## interpreter exits and keyboard interrupts are not swallowed
            era = None

        if wfi.isRelval():
            return wf_campaign
        return era if era else wf_campaign

    ## retrieve bypass and onhold configuration
    bypasses = []
    forcings = []
    overrides = getForceCompletes()
    holdings = []

    
    actors = UC.get('allowed_bypass')

    for bypassor,email in actors:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            extending = json.loads(open(holding_file).read())
            print bypassor,"is holding",extending
            holdings.extend( extending )
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in actors:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        forcings = mcm.get('/restapi/requests/forcecomplete')
        #if forcings:
        #    sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings)))


    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    in_manual = 0

    ## now you have a record of what file was invalidated globally from TT
    TMDB_invalid = dataCache.get('file_invalidation') 
    #try:
    #    TMDB_invalid = set([row[3] for row in csv.reader( os.popen('curl -s "https://docs.google.com/spreadsheets/d/11fFsDOTLTtRcI4Q3gXw0GNj4ZS8IoXMoQDC3CbOo_2o/export?format=csv"'))])
    #    TMDB_invalid = map(lambda e : e.split(':')[-1], TMDB_invalid)
    #    print len(TMDB_invalid),"globally invalidated files"
    #except Exception as e:
    #    print "TMDB not fetched"
    #    print str(e)
    #    TMDB_invalid = []


    print len(wfs),"to consider, pausing for",sleep_time
    max_per_round = UC.get('max_per_round').get('checkor',None)
    if options.limit: max_per_round=options.limit
    if max_per_round and not spec: wfs = wfs[:max_per_round]



    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        
        time.sleep( sleep_time )
        
        time_point("Starting with %s"% wfo.name)

        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False

        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
        pids = wfi.getPrepIDs()
        force_by_mcm = False
        force_by_user = False
        for force in forcings:
            if force in pids:
                wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force))
                bypass_checks = True
                force_by_mcm = True
                break
        for user in overrides:
            for force in overrides[user]:
                if force in wfo.name:
                    wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user))
                    bypass_checks = True
                    force_by_user = True
                    break
        
        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        to_ddm_tier = copy.deepcopy(UC.get('tiers_to_DDM'))
        campaigns = {} ## this mapping of campaign per output dataset assumes era==campaing, which is not true for relval
        expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] )
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c 
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        forced_already=False
        acdc_bads = []
        true_familly = []
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['PrepID'] != wfi.request['PrepID'] : continue
            #if 'OriginalRequestName' in member and (not 'ACDC' in member['OriginalRequestName']) and member['OriginalRequestName'] != wfo.name: continue
            if member['RequestStatus'] == None: continue

            if not set(member['OutputDatasets']).issubset( set(expected_outputs)):
                if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']:
                    ##this is not good at all
                    wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] )
                    #sendLog('checkor','inconsistent ACDC %s'%member['RequestName'], level='critical')
                    acdc_bads.append( member['RequestName'] )
                    is_closing = False
                    assistance_tags.add('manual')
                continue

            true_familly.append( member['RequestName'] )
            #try:
            #    parse_one(url, member['RequestName'])
            #except:
            #    print "Could not make error report for",member['RequestName']

            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
                if (force_by_mcm or force_by_user) and not forced_already:
                    wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name)
                    wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False)
                    forceComplete(url, wfi)
                    forced_already=True
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')
        if acdc_bads:
            #sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) ))
            sendLog('checkor','For %s, ACDC %s is inconsistent, preventing from closing or will create a mess.'%( wfo.name, ','.join(acdc_bads) ), level='critical')

        time_point("checked workflow familly", sub_lap=True)


        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = wfi.request['Task1']['RequestNumEvents']
            for i in range(1,20):
                if 'Task%d'%i in wfi.request:
                    ## NOTE: this is wrong in absolute terms (presumably: the filter efficiency of every task gets multiplied in)
                    if 'FilterEfficiency' in wfi.request['Task%d'%i]:
                        event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency'])
            event_expected = int(event_expected)

        fractions_pass = {}
        events_per_lumi = {}

        over_100_pass = False
        (lhe,prim,_,_) = wfi.getIO()
        if lhe or prim: over_100_pass = False

        time_point("execpted statistics", sub_lap=True)

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            events_per_lumi[output] = event_count/float(lumi_count) if lumi_count else 100
                
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )

            default_pass = UC.get('default_fraction_pass')
            fractions_pass[output] = default_pass
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                if type(CI.campaigns[c]['fractionpass']) == dict:
                    tier = output.split('/')[-1]
                    priority = str(wfi.request['RequestPriority'])
                    ## defined per tier
                    fractions_pass[output] = CI.campaigns[c]['fractionpass'].get('all', default_pass)
                    if tier in CI.campaigns[c]['fractionpass']:
                        fractions_pass[output] = CI.campaigns[c]['fractionpass'][tier]
                    if priority in CI.campaigns[c]['fractionpass']:
                        fractions_pass[output] = CI.campaigns[c]['fractionpass'][priority]
                else:
                    fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            possible_recoveries = wfi.getRecoveryDoc()
            if possible_recoveries == []:
                wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))
                sendLog('checkor','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name, level='critical')
                #sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)#,destination=['*****@*****.**'])
                ## NOTE: this comment used to read "do not bypass for now, until Alan understands why we are losing ACDC docs", yet the checks ARE bypassed here
                bypass_checks = True
            else:
                wfi.sendLog('checkor','%s is not completed  \n%s \n%s'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))

            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]):
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        time_point("checked output size", sub_lap=True)

        ## correct lumi < 300 event per lumi
        #for output in wfi.request['OutputDatasets']:
        #events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi','ReReco']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        time_point("checked dataset presence", sub_lap=True)

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        time_point("checked custodiality", sub_lap=True)

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )

        time_point("checked phedex count", sub_lap=True)


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs
        size_worht_going_to_ddm = sum([getDatasetSize(out)/1023. for out in out_worth_checking if out.split('/')[-1] in to_ddm_tier ]) ## size in TBs of all outputs
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            group = None
            if campaign in CI.campaigns and 'phedex_group' in CI.campaigns[campaign]:
                group = CI.campaigns[campaign]['phedex_group']
                print "using group",group,"for replica"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            tape_size_limit = options.tape_size_limit if options.tape_size_limit else UC.get("tape_size_limit")
                
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous to assume only one parent dataset
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWOADSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)


            if custodial and size_worht_going_to_ddm > tape_size_limit:
                print wfi.sendLog('checkor',"The total output size (%s TB) is too large for the limit set (%s TB)"%( size_worth_checking, tape_size_limit))
                custodial = None

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"

                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            wfi.sendLog('checkor','Using %s as a tape destination for %s'%(custodial, output))
                            custodials[custodial].append( output )
                            if group: custodials[custodial][-1]+='@%s'%group
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        time_point("determined tape location", sub_lap=True)

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        
        time_point("dbs file count", sub_lap=True)

        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            mismatch_notice = wfo.name+" has a dbs,phedex mismatch\n"
            mismatch_notice += "in dbs\n"+json.dumps(dbs_presence, indent=2) +"\n"
            mismatch_notice += "invalide in dbs\n"+json.dumps(dbs_invalid, indent=2) +"\n"
            mismatch_notice += "in phedex\n"+json.dumps(phedex_presence, indent=2) +"\n"

            wfi.sendLog('checkor',mismatch_notice)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex),
                                                                                          "\n".join( missing_phedex )))
                        were_invalidated = sorted(set(missing_phedex) & set(TMDB_invalid ))
                        if were_invalidated:
                            wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated),
                                                                                                  "\n".join(were_invalidated)))
                            sendLog('checkor',"These %d files were invalidated globally\n%s\nand are invalidated in dbs"%(len(were_invalidated),
                                                                                                                          "\n".join(were_invalidated)), level='critical')
                            dbs3Client.setFileStatus( were_invalidated, newstatus=0 )
                                
                    if missing_dbs:
                        wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs),
                                    "\n".join( missing_dbs )))
                        were_invalidated = sorted(set(missing_dbs) & set(TMDB_invalid ))
                        if were_invalidated:
                            wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated),
                                                                                                  "\n".join(were_invalidated)))
            #if not bypass_checks:
            ## I don't think we can by pass this
            is_closing = False
        
        time_point("checked file count", sub_lap=True)

        fraction_invalid = 0.20
        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignoreinvalid:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## someone needs to go and take a look at this
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        files_per_rl = {}
        for output in wfi.request['OutputDatasets']:
            duplications[output] = "skiped"
            files_per_rl[output] = "skiped"

        time_point("checked invalidation", sub_lap=True)

        if (is_closing or bypass_checks) and (not options.ignoreduplicates):
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True)
                except:
                    try:
                        duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True)
                    except Exception as e:
                        wfi.sendLog('checkor','Not possible to check on duplicate lumi count on %s'%(output))
                        sendLog('checkor','Not possible to check on duplicate lumi count on %s\n%s'%(output,str(e)),level='critical')
                        is_closing=False

            if is_closing and any(duplications.values()) and not options.ignoreduplicates:
                duplicate_notice = ""
                duplicate_notice += "%s has duplicates\n"%wfo.name
                duplicate_notice += json.dumps( duplications,indent=2)
                duplicate_notice += '\n'
                duplicate_notice += json.dumps( files_per_rl, indent=2)
                wfi.sendLog('checkor',duplicate_notice)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 


        time_point("checked duplicates", sub_lap=True)

        time_point("done with %s"%wfo.name)

        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            #rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['percentage'] = math.floor(percent_completions[output]*10000)/100.## round down
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))
            rec['familly'] = true_familly
            now = time.gmtime()
            rec['timestamp'] = time.mktime(now)
            rec['updated'] = time.asctime(now)+' (GMT)'

        ## make the lumi summary 
        if wfi.request['RequestType'] == 'ReReco':
            try:
                os.system('python Unified/lumi_summary.py %s 1 > /dev/null'%(wfi.request['PrepID']))
                os.system('python Unified/lumi_plot.py %s > /dev/null'%(wfi.request['PrepID']))
                wfi.sendLog('checkor','Lumi summary available at %s/datalumi/lumi.%s.html'%(unified_url,wfi.request['PrepID']))
            except Exception as e:
                print str(e)
        ## make the error report
        
    
        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            wfi.sendLog('checkor',"setting %s closed-out"% wfo.name)
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and force_by_mcm:
                        ## shoot large on all prepids, on closing the wf
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            if not 'custodial' in assistance_tags or wfi.isRelval():
                ## do only the report for those
                for member in acdc+acdc_inactive+[wfo.name]:
                    try:
                        parse_one(url, member)
                    except:
                        print "Could not make error report for",member

            ## full known list
            #recovering # has active ACDC
            ##OUT #recovered #had inactive ACDC
            #recovery #not over the pass bar
            #over100 # over 100%
            #biglumi # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear
            #filemismatch # there is a dbs/phedex mismatch
            #duplicates #a lumi section is there twice

            ## manual is not added yet, and should be so by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## should not set -recovery to anything that had ACDC already
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')
                in_manual += 1
            if 'recovery' in assistance_tags and 'manual' in assistance_tags:
                ## this is likely because something bad is happening, so leave it to manual
                assistance_tags = assistance_tags - set(['recovery'])
                assistance_tags.add('manual')
                in_manual += 1

            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                ###detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                #detailslink = 'https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s'%(wfo.name)
                ###perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                perflink = '%s/report/%s'%(unified_url,wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    wfi.sendLog('checkor','setting %s to %s'%(wfo.name, wfo.status))
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec and in_manual!=0:
        sendEmail("fresh assistance status available","Fresh status are available at %s/assistance.html"%unified_url,destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        items_at = defaultdict(set)
        for i in custodials[site]:
            item, group = i.split('@') if '@' in i else (i,'DataOps')
            items_at[group].add( item )
        for group,items in items_at.items():
            print ','.join(items),'=>',site,'@',group
            if not options.test:
                result = makeReplicaRequest(url, site, sorted(items) ,"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) , group=group)
                print result

    print "File Invalidation"
    print invalidations
Exemplo n.º 19
0
def transferor(url ,specific = None, talk=True, options=None):
    if userLock():   return
    if duplicateLock():  return

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    LI = lockInfo()
    NLI = newLockInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    print "counting all being handled..."
    being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all())
    being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0,max_to_handle - being_handled)
    allowed_to_transfer = max(0,max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer
        print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer"
    else:
        print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer"
    else:
        print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer"

    print "... done"

    all_transfers=defaultdict(list)
    needing_locks=defaultdict(list)
    workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset
    wfs_and_wfh=[]
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(Workflow.status=='considered').all():
        print "\t",wfo.name
        if specific and not specific in wfo.name: continue
        cache_r =filter(lambda d:d['RequestName']==wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) )
        else:
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) )
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = {}
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority=0
    min_transfer_priority=100000000
    print "getting all wf in staging ..."
    for wfo in session.query(Workflow).filter(Workflow.status=='staging').all():
        wfh = workflowInfo( url, wfo.name, spec=False)
        (lheinput,primary,parent,secondary) = wfh.getIO()
        sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1 
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:  
            input_sizes[prim] = dss.get( prim )
            print "\t",wfo.name,"needs",input_sizes[prim],"GB"
        in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority']))
        min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority']))

    print "... done"
    print "Max priority in transfer already",in_transfer_priority
    print "Min priority in transfer already",min_transfer_priority
    print "transfers per sites"
    print json.dumps( transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())
    # shuffle first by name
    random.shuffle( wfs_and_wfh )
    #sort by priority higher first
    wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True)
    

    ## list the size of all inputs
    print "getting all input sizes ..."
    for (wfo,wfh) in wfs_and_wfh:
        (_,primary,_,_) = wfh.getIO()
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            input_sizes[prim] = dss.get( prim )
    print "... done"

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already )
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer )


    grand_total =  sum(input_sizes.values()) 
    to_transfer = grand_total  - in_transfer_already
    grand_transfer_limit = options.maxtransfer 
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered"%in_transfer_already
    print "%15.4f GB is the current requested transfer load"%to_transfer
    print "%15.4f GB is the global transfer limit"%grand_transfer_limit
    print "%15.4f GB is the available limit"%transfer_limit


    max_staging_per_site = options.maxstagingpersite
                    
    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer=0 ## so that we can count'em
    passing_along = 0
    transfer_sizes={}
    went_over_budget=False
    destination_cache = {}
    for (wfo,wfh) in wfs_and_wfh:
        print wfh.request['RequestPriority']
        print wfo.name,"to be transfered"
        #wfh = workflowInfo( url, wfo.name)

        (_,primary,_,_) = wfh.getIO()
        this_load=sum([input_sizes[prim] for prim in primary])
        if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ):
            if went_over_budget:
                print "Transfer has gone over bubget."
            else:
                print "Transfer will go over bubget."
            print "%15.4f GB this load"%this_load
            print "%15.4f GB already this round"%sum(transfer_sizes.values())
            print "%15.4f GB is the available limit"%transfer_limit
            went_over_budget=True
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget"
            else:
                if not options.go: 
                    print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop"
                    continue


        ## throttle by campaign go
        if not CI.go( wfh.request['Campaign'] ):
            print "No go for",wfh.request['Campaign']
            if not options.go: 
                sendEmail("no go for managing","No go for "+wfh.request['Campaign'])
                continue

        ## check if the batch is announced

        def check_mcm(wfn):
            """Ask McM which batches mention the workflow named `wfn`.

            `mcm` is the McMClient created at the top of the enclosing function.

            Returns a tuple (announced, is_real):
              announced -- True when one of the returned batches has status 'announced'
              is_real   -- True when the name does not start with 'pdmvserv', or when
                           McM returned at least one batch containing the workflow
            """
            announced = False
            ## a workflow not injected under the pdmvserv account is taken as genuine right away
            is_real = not wfn.startswith('pdmvserv')
            ## the query is attempted twice; only the second failure is reported
            for last_chance in (False, True):
                try:
                    ## query on the argument itself, not on the loop variable of the caller
                    for b in mcm.getA('batches', query='contains=%s' % wfn):
                        is_real = True
                        if b['status'] == 'announced':
                            announced = True
                            break
                except Exception:
                    ## best effort: keep whatever was learned so far, but let
                    ## KeyboardInterrupt/SystemExit go through
                    if last_chance:
                        ## single-argument print(...) outputs the same text under python 2
                        print("could not get mcm batch announcement, assuming not real")
                else:
                    ## the query went through, no need for the second attempt
                    break
            return announced, is_real

        if not use_mcm:
            announced,is_real = False,True
        else:
            announced,is_real = check_mcm( wfo.name )

        if not announced:
            print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?"
            
        if not is_real:
            print wfo.name,"does not appear to be genuine."
            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
        now = time.mktime(time.gmtime()) / (60.*60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced: 
                print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time)
                continue


        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                ## higher priority, and not only this priority being transfered
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle
            else:
                print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along
                if not options.go: break

        if this_load and needs_transfer >= allowed_to_transfer:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                ## higher priority, and not only this priority being transfered
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_transfer
            else:
                print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"transfering, and adding",needs_transfer
                if not options.go: continue


        (lheinput,primary,parent,secondary) = wfh.getIO()
        for dataset in list(primary)+list(parent)+list(secondary):
            ## lock everything flat
            NLI.lock( dataset )

        if options and options.tosites:
            sites_allowed = options.tosites.split(',')
        else:
            sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )

        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist']

        if 'SiteBlacklist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfh.request['Campaign'])['SiteBlacklist']))

        ## reduce right away to sites in case of memory limitation
        memory_allowed = SI.sitesByMemory( wfh.request['Memory'] )
        if memory_allowed!=None:
            print "sites allowing", wfh.request['Memory'],"are",memory_allowed
            sites_allowed = list(set(sites_allowed) & set(memory_allowed))

        if not sites_allowed:
            print wfo.name,"has no possible sites to run at"
            print "available for",wfh.request['Memory'],"are",memory_allowed
            sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            continue

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## should make the block selection here
            pass

        if 'LumiList' in wfh.request and wfh.request['LumiList']:
            ## same, we could be doing the white list here too
            pass


        if blocks:
            print "Reading",len(blocks),"in whitelist"

        can_go = True
        staging=False
        allowed=True
        if primary:
            
            copies_needed_from_CPUh,CPUh = wfh.getNCopies()

            if talk:
                print wfo.name,'reads',', '.join(primary),'in primary'
            ## chop the primary dataset
            for prim in primary:
                max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority']))
                sites_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                print "Sites allowed minus the vetoed transfer"
                print sorted(sites_allowed)

                copies_needed_from_site = int(0.35*len(sites_allowed))+1 ## should just go for a fixed number based if the white list grows that big
                print "Would make",copies_needed_from_site,"copies from site white list"
                copies_needed = copies_needed_from_site

                print "Would make",copies_needed_from_CPUh,"from cpu requirement",CPUh
                copies_needed = copies_needed_from_CPUh

                if options.maxcopy>0:
                    ## stop maxing things out ??
                    #copies_needed = min(options.maxcopy,copies_needed)
                    #print "Maxed to",copies_needed
                    if copies_needed_from_CPUh > options.maxcopy:
                        sendEmail('An example of more than three copies','for %s it could have been beneficial to make %s copies'%( wfo.name, copies_needed_from_CPUh))

                
                if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign,copies_needed_from_site)
                    print "Maxed to",copies_needed,"by campaign configuration",wfh.request['Campaign']

                ## remove the sites that do not want transfers                
                workflow_dependencies[prim].add( wfo.id )

                #####################################
                ###### JR 3/8/15 #### deprecating this
                """
                presence = getDatasetPresence( url, prim , within_sites = [SI.CE_to_SE(site) for site in sites_allowed])
                prim_location = [site for site,pres in presence.items() if pres[0]==True]
                prim_parts = [site for site,pres in presence.items() if pres[0]==False]
                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at",len(prim_location),"sites"
                    continue
                # reduce the number of copies required by existing full copies
                copies_needed = max(0,copies_needed - len(prim_location))
                print "now need",copies_needed
                subscriptions = listSubscriptions( url , prim , sites_allowed )
                prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                ## remove the subscription where the dataset is in parts at
                #prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']]) and not site in prim_parts]))
                ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place
                prim_destination = [site for site in prim_destination if not site in prim_location]
                ## add transfer dependencies
                latching_on_transfers =  list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                print latching_on_transfers
                """
                ###### JR 3/8/15 #### deprecating this
                #####################################


                ### new ways of making the whole thing
                destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks )
                #destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='DataOps')
                #anaops_destinations,anaops_all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='AnalysisOps' )
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1]
                ## the rest is places it is going to be
                prim_destination = [site for site in destinations.keys() if not site in prim_location]
                ## need to take out the transfer veto
                prim_destination = [site for site in prim_destination if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                for dsite in prim_destination:
                    needing_locks[dsite].append( prim )

                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at",len(prim_location),"sites",prim_location
                    continue
                copies_needed = max(0,copies_needed - len(prim_location))
                print "now need",copies_needed
                
                copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names]

                latching_on_transfers = set()
                [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                if any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]):
                    ## means there is openings let me go
                    print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute]
                    for site in sites_allowed:
                        #increment accross the board, regardless of real destination: could be changed
                        transfers_per_sites[site] += 1
                else:
                    if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                        print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over transfer slots available"
                    else:
                        print "Not allowed to transfer more than",max_staging_per_site," per site at a time. Going overboard for",[site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site]
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first()
                    if not tfo:
                        tfo = Transfer( phedexid = latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                            
                    if not wfo.id in tfo.workflows_id:
                        print "adding",wfo.id,"to",tfo.id,"with phedexid",latching
                        l = copy.deepcopy( tfo.workflows_id )
                        l.append( wfo.id )
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0,copies_needed - min(copies_being_made))
                print "then need",copies_needed
                if copies_needed == 0:
                    print "The output is either fully in place or getting in full somewhere with",latching_on_transfers
                    can_go = True
                    continue

                if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the 
                    if not options or options.chop:
                        chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks)
                        spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes)
                        transfer_sizes[prim] = sum(sizes)
                    else:
                        spreading = {} 
                        for site in prim_to_distribute: 
                            if blocks:
                                spreading[site]=blocks
                            else:
                                spreading[site]=[prim]
                        transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified
                    can_go = False
                    print "selected CE destinations",spreading.keys()
                    for (site,items) in spreading.items():
                        all_transfers[site].extend( items )

        if not allowed:
            print "Not allowed to move on with",wfo.name
            continue


        if secondary:
            if talk:
                print wfo.name,'reads',', '.join(secondary),'in secondary'
            for sec in secondary:
                workflow_dependencies[sec].add( wfo.id )

                if False:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist
                        destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])
                    destinations = destination_cache[sec]
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9]
                    sec_location = [site for (site,info) in destinations.items() if info['completion']>=95]
                    sec_destination = [site for site in destinations.keys() if not site in sec_location]
                else:
                    ## old style
                    presence = getDatasetPresence( url, sec )
                    sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions( url ,sec )
                    sec_destination = [site for site in subscriptions] 

                for site in sec_location:
                    needing_locks[site].append( sec )
                for site in sec_destination:
                    needing_locks[site].append( sec )

                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if len( sec_to_distribute )>0:
                    sec_size = dss.get( sec )
                    for site in sec_to_distribute:
                        site_se =SI.CE_to_SE(site)
                        if (SI.disk[site_se]*1024.) > sec_size:
                            all_transfers[site].append( sec )
                            can_go = False
                        else:
                            print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size
                            #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                print wfo.name,"latches on existing transfers, and nothing else"
                wfo.status = 'staging'
                needs_transfer+=1
            else:
                print wfo.name,"should just be assigned NOW to",sites_allowed
                wfo.status = 'staged'
            passing_along+=1
            print "setting status to",wfo.status
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                print wfo.name,"latches on existing transfers"
                if not options.test:
                    wfo.status = 'staging'
                    print "setting status to",wfo.status
                    session.commit()
            print wfo.name,"needs a transfer"
            needs_transfer+=1
            passing_along+=1

    print "accumulated locks of dataset in place"
    print json.dumps(needing_locks, indent=2)
    for site,items in needing_locks.items():
        for item in items:
            LI.lock( item, SI.CE_to_SE(site), 'usable input')
        
    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id=-1
    wf_id_in_prestaging=set()

    for (site,items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        if site in SI.sites_veto_transfer: 
            print site,"does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]

        if execute:
            print "Making a replica to",site,"(CE)",site_se,"(SE) for"
        else:
            print "Would make a replica to",site,"(CE)",site_se,"(SE) for"

        print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        print "\t",len(datasets),"datasets"
        print "\t",datasets
        items_to_transfer = blocks + datasets

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y','yes','go']:
                continue

        if execute:
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal')
            ## make use of max_priority dataset:priority to set the subscriptions priority
            """
            ## does not function
            once = True
            for item in items_to_transfer:
                bds = item.split('#')[0]
                if max_priority[bds] >= 90000:
                    if once:
                        w=10
                        print "waiting",w,"s before raising priority"
                        time.sleep(w)
                        once=False
                    ## raise it to high priority
                    print item,"subscription priority raised to high at",site_se
                    #print "This does not work yet properly it seems"
                    print updateSubscription(url, site_se, item, priority='high')
            """
            #for item in list(set([it.split('#')[0] for it in items_to_transfer])):
            for item in items_to_transfer:
                LI.lock( item, site_se, 'pre-staging')
        else:
            #result= {'phedex':{'request_created' : [{'id' : fake_id}]}}
            result= {'phedex':{'request_created' : []}}
            fake_id-=1



        if not result:
            print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first()
            print phedexid,"transfer created"
            if not new_transfer:
                new_transfer = Transfer( phedexid = phedexid)
                session.add( new_transfer )                
            new_transfer.workflows_id = set()
            for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update( workflow_dependencies[transfering] )
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status!='staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting",tr_wf.name,"to staging"
        session.commit()
Exemplo n.º 20
0
def transferor(url, specific=None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    NLI = newLockInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    #allowed_secondary = UC.get('')
    print "counting all being handled..."
    being_handled = len(
        session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('stag')).all())
    being_transfered = len(
        session.query(Workflow).filter(Workflow.status == 'staging').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance-')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0, max_to_handle - being_handled)
    allowed_to_transfer = max(0, max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle <= wf_buffer:  ## buffer for having several wf per transfer
        print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer"
    else:
        print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer"
    else:
        print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer"

    print "... done"

    all_transfers = defaultdict(list)
    workflow_dependencies = defaultdict(
        set)  ## list of wf.id per input dataset
    wfs_and_wfh = []
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(
            Workflow.status.startswith('considered')).all():
        print "\t", wfo.name
        if specific and not specific in wfo.name: continue
        cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append((wfo,
                                workflowInfo(url,
                                             wfo.name,
                                             spec=False,
                                             request=cache_r[0])))
        else:
            wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False)))
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = {}
    ignored_input_sizes = {}
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority = None
    min_transfer_priority = None
    print "getting all wf in staging ..."
    stucks = json.loads(open('%s/stuck_transfers.json' % monitor_dir).read())

    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfh = workflowInfo(url, wfo.name, spec=False)
        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        for site in sites_allowed:  ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            ds_s = dss.get(prim)
            if prim in stucks:
                sendLog('transferor',
                        "%s appears stuck, so not counting it %s [GB]" %
                        (prim, ds_s),
                        wfi=wfh)
                ignored_input_sizes[prim] = ds_s
            else:
                input_sizes[prim] = ds_s
                sendLog('transferor',
                        "%s needs %s [GB]" % (wfo.name, ds_s),
                        wfi=wfh)
        if in_transfer_priority == None:
            in_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            in_transfer_priority = max(in_transfer_priority,
                                       int(wfh.request['RequestPriority']))
        if min_transfer_priority == None:
            min_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            min_transfer_priority = min(min_transfer_priority,
                                        int(wfh.request['RequestPriority']))

    if min_transfer_priority == None or in_transfer_priority == None:
        print "nothing is lining up for transfer"
        sendEmail("no request in staging", "no request in staging")
        return
        pass

    try:
        print "Ignored input sizes"
        ignored_values = list(ignored_input_sizes.items())
        ignored_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, ignored_values))
        print "Considered input sizes"
        considered_values = list(input_sizes.items())
        considered_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, considered_values))
    except Exception as e:
        print "trying to print the summary of input size"
        print str(e)

    print "... done"
    print "Max priority in transfer already", in_transfer_priority
    print "Min priority in transfer already", min_transfer_priority
    print "transfers per sites"
    print json.dumps(transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())

    ## list the size of all inputs
    primary_input_per_workflow_gb = defaultdict(float)
    print "getting all input sizes ..."
    for (wfo, wfh) in wfs_and_wfh:
        (_, primary, _, _) = wfh.getIO()
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            ## do not count it if it appears stalled !
            prim_size = dss.get(prim)
            input_sizes[prim] = prim_size
            primary_input_per_workflow_gb[wfo.name] += prim_size
    print "... done"

    # shuffle first by name
    random.shuffle(wfs_and_wfh)

    # Sort smallest transfers first; allows us to transfer as many as possible workflows.
    def prio_and_size(i, j):
        """cmp-style comparator for two (wfo, wfh) pairs.

        The pairs are ordered by the integer value of
        wfh.request['RequestPriority']. When both priorities are equal, the
        pair whose workflow has the larger entry in
        primary_input_per_workflow_gb (truncated to int, 0 when absent)
        compares as the smaller one.

        Returns the usual negative / zero / positive value of cmp().
        """
        prio_i = int(i[1].request['RequestPriority'])
        prio_j = int(j[1].request['RequestPriority'])
        if prio_i != prio_j:
            ## priorities differ : they alone decide
            return cmp(prio_i, prio_j)

        ## same priority : compare the sizes, deliberately swapped (j against i)
        size_j = int(primary_input_per_workflow_gb.get(j[0].name, 0))
        size_i = int(primary_input_per_workflow_gb.get(i[0].name, 0))
        return cmp(size_j, size_i)

    #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True)
    #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) ))
    #sort by priority higher first
    wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[
        'RequestPriority']), int(j[1].request['RequestPriority'])),
                     reverse=True)

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer" % (
        cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load" % (
        cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer" % (
        st_in_transfer_already)
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % (
        st_to_transfer)

    grand_total = sum(input_sizes.values())
    to_transfer = grand_total - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB

    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered" % in_transfer_already
    print "%15.4f GB is the current requested transfer load" % to_transfer
    print "%15.4f GB is the global transfer limit" % grand_transfer_limit
    print "%15.4f GB is the available limit" % transfer_limit

    max_staging_per_site = options.maxstagingpersite

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer = 0  ## so that we can count'em
    passing_along = 0
    transfer_sizes = {}
    went_over_budget = False
    destination_cache = {}
    no_goes = set()

    max_per_round = UC.get('max_per_round').get('transferor', None)
    if max_per_round and not spec:
        wfs_and_wfh = wfs_and_wfh[:max_per_round]

    for (wfo, wfh) in wfs_and_wfh:
        print wfo.name, "to be transfered with priority", wfh.request[
            'RequestPriority']

        if wfh.request['RequestStatus'] != 'assignment-approved':
            if wfh.request['RequestStatus'] in [
                    'aborted', 'rejected', 'rejected-archived',
                    'aborted-archived'
            ]:
                wfo.status = 'trouble'  ## so that we look or a replacement
            else:
                wfo.status = 'away'
            wfh.sendLog(
                'transferor', '%s in status %s, setting %s' %
                (wfo.name, wfh.request['RequestStatus'], wfo.status))
            continue

        (_, primary, _, _) = wfh.getIO()
        this_load = sum([input_sizes[prim] for prim in primary])
        no_budget = False
        if (this_load
                and (sum(transfer_sizes.values()) + this_load > transfer_limit
                     or went_over_budget)):
            if went_over_budget:
                wfh.sendLog('transferor', "Transfer has gone over bubget.")
            else:
                wfh.sendLog('transferor', "Transfer will go over bubget.")
            wfh.sendLog(
                'transferor',
                "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"
                % (this_load, sum(transfer_sizes.values()), transfer_limit))
            #if sum(transfer_sizes.values()) > transfer_limit:
            went_over_budget = True
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(
                        wfh.request['RequestPriority']
                ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over budget" %
                        (wfh.request['RequestPriority'], in_transfer_priority))
                else:
                    if not options.go:
                        wfh.sendLog(
                            'transferor',
                            "%s minimum priority %s < %s : stop" %
                            (min_transfer_priority,
                             wfh.request['RequestPriority'],
                             in_transfer_priority))
                        no_budget = True

        ## throtlle by campaign go
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True
            no_goes.add(wfo.name)

        allowed_secondary = set()
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                allowed_secondary.update(CI.campaigns[campaign]['secondaries'])
        if secondary:
            if (secondary and allowed_secondary) and (
                    set(secondary) & allowed_secondary != set(secondary)):
                wfh.sendLog(
                    'assignor', '%s is not an allowed secondary' %
                    (', '.join(set(secondary) - allowed_secondary)))
                no_go = True

        if no_go:
            continue
        ## check if the batch is announced

        def check_mcm(wfn):
            """Ask McM which batches mention the workflow named `wfn`.

            Returns a tuple (announced, is_real):
              announced -- True if one of the returned batches has status
                           'announced'.
              is_real   -- True if `wfn` does not start with 'pdmvserv', or
                           if McM returned at least one batch for it.

            The query is attempted twice. If both attempts raise, a message
            is printed and whatever was established so far is returned.
            """
            announced = False
            ## a name not starting with pdmvserv is taken as real without any batch
            is_real = not wfn.startswith('pdmvserv')
            ## query on the argument, not on the enclosing loop variable
            query = 'contains=%s' % wfn
            for _attempt in range(2):
                try:
                    for b in mcm.getA('batches', query=query):
                        is_real = True
                        if b['status'] == 'announced':
                            announced = True
                            break
                except Exception:
                    ## Exception only, so that Ctrl-C and sys.exit still go through
                    continue
                else:
                    return announced, is_real
            ## both attempts failed
            print("could not get mcm batch announcement, assuming not real")
            return announced, is_real

        if not use_mcm:
            announced, is_real = False, True
        else:
            if wfh.request['RequestType'] in ['ReReco']:
                announced, is_real = True, True
            else:
                announced, is_real = check_mcm(wfo.name)

        if not announced:
            wfh.sendLog('transferor', "does not look announced.")

        if not is_real:
            wfh.sendLog('transferor', "does not appear to be genuine.")

            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(
            time.strptime('.'.join(map(str, wfh.request['RequestDate'])),
                          "%Y.%m.%d.%H.%M.%S")) / (60. * 60.)
        now = time.mktime(time.gmtime()) / (60. * 60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced:
                wfh.sendLog(
                    'transferor',
                    "It is too soon to start transfer: %3.2fH remaining" %
                    (now - injection_time))
                continue

        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_handle))
                else:
                    wfh.sendLog(
                        'transferor',
                        " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"
                        % (max_to_handle, being_handled, passing_along))
                    if not options.go:
                        ## should not allow to jump that fence
                        break

        if this_load and needs_transfer >= allowed_to_transfer:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_transfer))
                else:
                    wfh.sendLog(
                        'transferor',
                        "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"
                        % (max_to_transfer, being_transfered, needs_transfer))
                    if not options.go:
                        no_budget = True

        if no_budget:
            continue

        ## the site white list considers site, campaign, memory and core information
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')

        for dataset in list(primary) + list(parent) + list(secondary):
            ## lock everything flat
            NLI.lock(dataset)

        if not sites_allowed:
            wfh.sendLog('transferor', "not possible site to run at")
            #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            sendLog('transferor',
                    "%s has no possible sites to run at" % (wfo.name),
                    level='critical')
            continue

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(
                        dataset, runs=wfh.request['RunWhitelist'])))
        if 'LumiList' in wfh.request and wfh.request['LumiList']:
            ## augment with the lumi white list
            blocks = list(
                set(blocks +
                    getDatasetBlocks(dataset, lumis=wfh.request['LumiList'])))

        if blocks:
            print "Reading", len(blocks), "in block whitelist"

        can_go = True
        staging = False
        allowed = True
        primary_destinations = set()
        if primary:

            copies_needed_from_CPUh, CPUh = wfh.getNCopies()

            if talk:
                print wfo.name, 'reads', ', '.join(primary), 'in primary'
            ## chope the primary dataset
            for prim in primary:
                ## keep track of what needs what
                workflow_dependencies[prim].add(wfo.id)

                max_priority[prim] = max(max_priority[prim],
                                         int(wfh.request['RequestPriority']))

                wfh.sendLog(
                    'transferor', "Would make %s  from cpu requirement %s" %
                    (copies_needed_from_CPUh, CPUh))
                copies_needed = copies_needed_from_CPUh

                if 'Campaign' in wfh.request and wfh.request[
                        'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                            wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[
                        wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign,
                                        copies_needed)

                    wfh.sendLog(
                        'transferor',
                        "Maxed to %s by campaign configuration %s" %
                        (copies_needed, wfh.request['Campaign']))

                ### new ways of making the whole thing
                destinations, all_block_names = getDatasetDestinations(
                    url,
                    prim,
                    within_sites=[SI.CE_to_SE(site) for site in sites_allowed],
                    only_blocks=blocks)
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [
                    site for (site, info) in destinations.items()
                    if info['completion'] == 100 and info['data_fraction'] == 1
                ]
                ## the rest is places it is going to be
                prim_destination = [
                    site for site in destinations.keys()
                    if not site in prim_location
                ]

                if len(prim_location) >= copies_needed:
                    wfh.sendLog(
                        'transferor',
                        "The input is all fully in place at %s sites %s" %
                        (len(prim_location), sorted(prim_location)))
                    continue
                copies_needed = max(0, copies_needed - len(prim_location))
                wfh.sendLog(
                    'transferor',
                    "not counting existing copies ; now need %s" %
                    copies_needed)
                copies_being_made = [
                    sum([
                        info['blocks'].keys().count(block)
                        for site, info in destinations.items()
                        if site in prim_destination
                    ]) for block in all_block_names
                ]

                latching_on_transfers = set()
                [
                    latching_on_transfers.update(info['blocks'].values())
                    for site, info in destinations.items()
                    if site in prim_destination
                ]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [
                    site for site in sites_allowed
                    if not SI.CE_to_SE(site) in prim_location
                ]
                prim_to_distribute = [
                    site for site in prim_to_distribute
                    if not SI.CE_to_SE(site) in prim_destination
                ]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [
                    site for site in prim_to_distribute if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]

                wfh.sendLog(
                    'transferor',
                    "Could be going to: %s" % sorted(prim_to_distribute))
                if not prim_to_distribute or any([
                        transfers_per_sites[site] < max_staging_per_site
                        for site in prim_to_distribute
                ]):
                    ## means there is openings let me go
                    print "There are transfer slots available:", [
                        (site, transfers_per_sites[site])
                        for site in prim_to_distribute
                    ]
                    #for site in sites_allowed:
                    #    #increment accross the board, regardless of real destination: could be changed
                    #    transfers_per_sites[site] += 1
                else:
                    if int(
                            wfh.request['RequestPriority']
                    ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                        wfh.sendLog(
                            'transferor',
                            "Higher priority sample %s >= %s go-on over transfer slots available"
                            % (wfh.request['RequestPriority'],
                               in_transfer_priority))
                    else:
                        wfh.sendLog(
                            'transferor',
                            "Not allowed to transfer more than %s per site at a time. Going overboard for %s"
                            % (max_staging_per_site,
                               sorted([
                                   site for site in prim_to_distribute
                                   if transfers_per_sites[site] >=
                                   max_staging_per_site
                               ])))
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(
                        Transfer.phedexid == int(latching)).first()
                    if not tfo:
                        tfo = session.query(Transfer).filter(
                            Transfer.phedexid == -int(latching)).first()

                    if not tfo:
                        tfo = Transfer(phedexid=latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                    else:
                        tfo.phedexid = latching  ## make it positive ever

                    if not wfo.id in tfo.workflows_id:
                        print "adding", wfo.id, "to", tfo.id, "with phedexid", latching
                        l = copy.deepcopy(tfo.workflows_id)
                        l.append(wfo.id)
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush(
                        )  ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0, copies_needed - min(copies_being_made))
                wfh.sendLog(
                    'transferor',
                    "Not counting the copies being made ; then need %s" %
                    copies_needed)
                if copies_needed == 0:
                    wfh.sendLog(
                        'transferor',
                        "The output is either fully in place or getting in full somewhere with %s"
                        % latching_on_transfers)
                    can_go = True
                    continue
                elif len(prim_to_distribute) == 0:
                    wfh.sendLog(
                        'transferor',
                        "We are going to need extra copies, but no destinations seems available"
                    )
                    prim_to_distribute = [
                        site for site in sites_allowed
                        if not SI.CE_to_SE(site) in prim_location
                    ]
                    prim_to_distribute = [
                        site for site in prim_to_distribute if not any([
                            osite.startswith(site)
                            for osite in SI.sites_veto_transfer
                        ])
                    ]

                if len(
                        prim_to_distribute
                ) > 0:  ## maybe that a parameter we can play with to limit the
                    if not options or options.chop:
                        ### hard include the tape disk andpoint ?
                        #tapes = [site for site in  getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')]
                        chops, sizes = getDatasetChops(
                            prim,
                            chop_threshold=options.chopsize,
                            only_blocks=blocks)
                        spreading = distributeToSites(chops,
                                                      prim_to_distribute,
                                                      n_copies=copies_needed,
                                                      weights=SI.cpu_pledges,
                                                      sizes=sizes)
                        transfer_sizes[prim] = sum(sizes)
                        if not spreading:
                            sendLog(
                                'transferor',
                                'cannot send %s to any site, it cannot fit anywhere'
                                % prim,
                                level='critical')
                            wfh.sendLog(
                                'transferor',
                                "cannot send to any site. %s cannot seem to fit anywhere"
                                % (prim))
                            staging = False
                            can_go = False

                    else:
                        spreading = {}
                        for site in prim_to_distribute:
                            if blocks:
                                spreading[site] = blocks
                            else:
                                spreading[site] = [prim]
                        transfer_sizes[prim] = input_sizes[
                            prim]  ## this is approximate if blocks are specified
                    can_go = False
                    wfh.sendLog(
                        'transferor', "selected CE destinations %s" %
                        (sorted(spreading.keys())))
                    for (site, items) in spreading.items():
                        all_transfers[site].extend(items)
                        transfers_per_sites[site] += 1
                        primary_destinations.add(site)
        if not allowed:
            wfh.sendLog('transferor', "Not allowed to move on with")
            continue

        if secondary:

            override_sec_destination = []
            if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]:
                override_sec_destination = CI.campaigns[
                    wfh.request['Campaign']]['SecondaryLocation']

            print wfo.name, 'reads', ', '.join(secondary), 'in secondary'
            for sec in secondary:

                workflow_dependencies[sec].add(wfo.id)

                if True:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist
                        destination_cache[sec], _ = getDatasetDestinations(
                            url, sec)  ## NO SITE WHITE LIST ADDED
                        #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])

                    ## limit to the site whitelist NOW
                    se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
                    destinations = dict([
                        (k, v) for (k, v) in destination_cache[sec].items()
                        if site in se_allowed
                    ])
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [
                        destinations.pop(site)
                        for (site, info) in destinations.items()
                        if info['data_fraction'] < 0.9
                    ]
                    sec_location = [
                        site for (site, info) in destinations.items()
                        if info['completion'] >= 95
                    ]
                    sec_destination = [
                        site for site in destinations.keys()
                        if not site in sec_location
                    ]
                else:
                    ## old style
                    presence = getDatasetPresence(url, sec)
                    sec_location = [
                        site for site, pres in presence.items()
                        if pres[1] > 90.
                    ]  ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions(url, sec)
                    sec_destination = [site for site in subscriptions]

                sec_to_distribute = [
                    site for site in sites_allowed if
                    not any([osite.startswith(site) for osite in sec_location])
                ]
                sec_to_distribute = [
                    site for site in sec_to_distribute if not any(
                        [osite.startswith(site) for osite in sec_destination])
                ]
                sec_to_distribute = [
                    site for site in sec_to_distribute if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]
                if override_sec_destination:
                    ## intersect with where we want the PU to be
                    not_needed_anymore = list(
                        set(sec_to_distribute) - set(override_sec_destination))
                    #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sendLog(
                        'transferor',
                        "the dataset %s could be removed from %s" %
                        (sec, not_needed_anymore))
                    sec_to_distribute = list(
                        set(sec_to_distribute) & set(override_sec_destination))

                if len(sec_to_distribute) > 0:
                    print "secondary could go to", sorted(sec_to_distribute)
                    sec_size = dss.get(sec)
                    for site in sec_to_distribute:
                        site_se = SI.CE_to_SE(site)
                        if (SI.disk[site_se] * 1024.) > sec_size:
                            all_transfers[site].append(sec)
                            can_go = False
                        else:
                            print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[
                                site_se] * 1024, "GB need", sec_size
                            if primary_destinations and site in primary_destinations:
                                #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                                sendLog('transferor',
                                        '%s is too big (%s) for %s (%s)' %
                                        (sec, sec_size, site_se,
                                         SI.disk[site_se] * 1024),
                                        level='critical')
                else:
                    print "the secondary input does not have to be send to site"

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                wfh.sendLog(
                    'transferor',
                    "latches on existing transfers, and nothing else, settin staging"
                )
                wfo.status = 'staging'
                needs_transfer += 1
            else:
                wfh.sendLog(
                    'transferor', "should just be assigned now to %s" %
                    sorted(sites_allowed))
                wfo.status = 'staged'
            passing_along += 1
            wfh.sendLog('transferor', "setting status to %s" % wfo.status)
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                wfh.sendLog('transferor', "latches on existing transfers")
                if not options.test:
                    wfo.status = 'staging'
                    wfh.sendLog('transferor',
                                "setting status to %s" % wfo.status)
                    session.commit()
            wfh.sendLog('transferor', "needs a transfer")
            needs_transfer += 1
            passing_along += 1

    if no_goes:
        #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes ))
        sendLog('transferor',
                "No go for \n" + "\n".join(no_goes),
                level='critical')

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id = -1
    wf_id_in_prestaging = set()

    for (site, items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        if site in SI.sites_veto_transfer:
            print site, "does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        block_datasets = list(set([it.split('#')[0] for it in blocks]))
        datasets = [it for it in items_to_transfer if not '#' in it]

        details_text = "Making a replica to %s (CE) %s (SE) for" % (site,
                                                                    site_se)

        #print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [
            block for block in blocks if not block.split('#')[0] in datasets
        ]
        #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        #print "\t",len(datasets),"datasets"
        #print "\t",datasets
        details_text += '\n\t%d blocks' % len(blocks)
        details_text += '\n\t%d needed blocks for %s' % (
            len(blocks),
            sorted(list(set([block.split('#')[0] for block in blocks]))))
        details_text += '\n\t%d datasets' % len(datasets)
        details_text += '\n\t%s' % sorted(datasets)

        items_to_transfer = blocks + datasets

        if execute:
            sendLog('transferor', details_text)
        else:
            print "Would make a replica to", site, "(CE)", site_se, "(SE) for"
            print details_text

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y', 'yes', 'go']:
                continue

        if execute:
            priority = 'normal'
            cds = [
                ds for ds in datasets + block_datasets if ds in max_priority
            ]
            if cds and False:  ## I don't think this is working. subscription should be updated on the fly and regularly for raising the priority if needed
                ## decide on an overall priority : that's a bit too large though
                if any([max_priority[ds] >= 90000 for ds in cds]):
                    priority = 'high'
                elif all([max_priority[ds] < 80000 for ds in cds]):
                    priority = 'low'

            result = makeReplicaRequest(url,
                                        site_se,
                                        items_to_transfer,
                                        'prestaging',
                                        priority=priority)
        else:
            result = {'phedex': {'request_created': []}}
            fake_id -= 1

        if not result:
            print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(
                Transfer.phedexid == int(phedexid)).first()
            if not new_transfer:
                new_transfer = session.query(Transfer).filter(
                    Transfer.phedexid == -int(phedexid)).first()
            print phedexid, "transfer created"
            if not new_transfer:
                new_transfer = Transfer(phedexid=phedexid)
                session.add(new_transfer)
            else:
                new_transfer.phedexid = phedexid  ## make it positive again

            new_transfer.workflows_id = set()
            for transfering in list(
                    set(map(lambda it: it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update(
                    workflow_dependencies[transfering])
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status != 'staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting", tr_wf.name, "to staging"
        session.commit()
Exemplo n.º 21
0
def equalizor(url, specific=None, options=None):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    if not specific:
        workflows = getWorkflows(url, status='running-closed', details=True)
        workflows.extend(getWorkflows(url, status='running-open',
                                      details=True))

    ## start from scratch
    modifications = defaultdict(dict)
    ## define regionality site => fallback allowed. feed on an ssb metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)
    SI = siteInfo()
    for site in SI.sites_ready:
        region = site.split('_')[1]
        if not region in ['US', 'DE', 'IT']: continue
        regions[region] = [region]

    def site_in_depletion(s):
        """Tell whether site `s` qualifies as a fallback when building `mapping`.

        As written, every site qualifies: the unconditional `return True`
        right below short-circuits the function, so the pressure-based check
        that follows is never executed.
        """
        # NOTE(review): everything after this return is unreachable; presumably
        # it is kept so the pressure check can be switched back on by removing
        # this line -- confirm before deleting the dead branch.
        return True
        if s in SI.sites_pressure:
            ## each entry unpacks to three values; only the first two are compared
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                ## first value below the second : reported as "lacking pressure"
                print s, m, r, "lacking pressure"
                return True
            else:
                print s, m, r, "pressure"
                pass

        ## site without a pressure entry, or not lacking pressure
        return False

    for site in SI.sites_ready:
        region = site.split('_')[1]
        ## fallback to the region, to site with on-going low pressure
        mapping[site] = [
            fb for fb in SI.sites_ready
            if any([('_%s_' %
                     (reg) in fb and fb != site and site_in_depletion(fb))
                    for reg in regions[region]])
        ]

    use_T0 = False
    if options.augment: use_T0 = True

    use_HLT = False
    if options.augment: use_HLT = True

    if use_HLT:
        mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')
    #mapping['T2_IT_Legnaro'].append('T1_IT_CNAF')
    for reg in ['IT', 'DE', 'UK']:
        mapping['T2_CH_CERN'].extend(
            [fb for fb in SI.sites_ready if '_%s_' % reg in fb])

    for site, fallbacks in mapping.items():
        for fb in fallbacks:
            reversed_mapping[fb].append(site)

    ## this is the fallback mapping
    print json.dumps(mapping, indent=2)
    #print json.dumps( reversed_mapping, indent=2)

    altered_tasks = set()

    def running_idle(wfi, task_name):
        """Return the (Running, Idle) counters of `task_name`.

        The counters are read from the dictionary returned by
        `wfi.getGlideMon()`. When that dictionary is empty/None, or holds no
        entry for `task_name`, (0, 0) is returned instead.
        """
        monitoring = wfi.getGlideMon()
        if monitoring and task_name in monitoring:
            counters = monitoring[task_name]
            return (counters['Running'], counters['Idle'])
        ## no monitoring information at all, or nothing for this task
        return (0, 0)

    def needs_action(wfi, task, min_idled=100, pressure=0.2):
        """Decide whether `task` has enough waiting jobs to deserve re-routing.

        Parameters:
          wfi       -- workflow object handed over to `running_idle`
          task      -- object with a `pathName`; its last '/'-separated field
                       is used as the task name
          min_idled -- minimum number of idle jobs below which no action is
                       taken
          pressure  -- idle/running ratio that has to be exceeded when jobs
                       are already running

        Returns the tuple (go, task_name, running, idled), `go` being a bool.
        """
        task_name = task.pathName.split('/')[-1]
        running, idled = running_idle(wfi, task_name)

        if idled < min_idled:
            ## not enough idle jobs to be worth acting on. This threshold used
            ## to be hard-coded to 100 and its outcome was then overwritten by
            ## the pressure test, so it never had any effect.
            go = False
        elif running:
            ## jobs are running : act only if the idle backlog outweighs them
            go = idled / float(running) > pressure
        else:
            ## nothing running at all : act as soon as something is idle
            go = bool(idled)
        return go, task_name, running, idled

    def getcampaign(task):
        """Return the campaign of `task`, or None when it cannot be extracted.

        The campaign is taken as the second '-'-separated field of
        `task.prepID` when that attribute is usable, otherwise of the last
        '/'-separated field of `task.pathName`.

        A prepID that is missing, None, empty or without any '-' no longer
        raises; the task name is tried instead.
        """
        taskname = task.pathName.split('/')[-1]
        ## prepID first, task name as fallback
        for label in (getattr(task, 'prepID', None), taskname):
            if not label:
                continue
            fields = label.split('-')
            if len(fields) > 1:
                return fields[1]
        return None

    def close(interface):
        """Publish `interface` as equalizor.json and archive a timestamped copy.

        The JSON dump is first written to `equalizor.json.new` under
        `monitor_dir`, then moved over `equalizor.json`, and finally copied to
        `logs/equalizor/equalizor.<timestamp>.json`.

        Raises IOError/OSError when the new file cannot be written or moved
        into place. A failure of the archive copy is only reported, since the
        published file is already up to date at that point.
        """
        import shutil  ## local import : only this helper needs it

        published = '%s/equalizor.json' % monitor_dir
        staged = '%s/equalizor.json.new' % monitor_dir

        ## close the file explicitly so that the content is fully flushed
        ## before it replaces the published one
        with open(staged, 'w') as handle:
            handle.write(json.dumps(interface, indent=2))
        ## same directory : no shell needed to move the file into place
        os.rename(staged, published)

        ## NOTE(review): mktime() interprets its argument as local time while
        ## gmtime() is UTC ; kept as is to preserve the archive naming.
        archived = '%s/logs/equalizor/equalizor.%s.json' % (
            monitor_dir, time.mktime(time.gmtime()))
        try:
            shutil.copy(published, archived)
        except (IOError, OSError) as err:
            print("could not archive %s to %s : %s" % (published, archived,
                                                       err))

    interface = {'reversed_mapping': reversed_mapping, 'modifications': {}}
    if options.augment or options.remove:
        interface['modifications'] = json.loads(
            open('%s/equalizor.json' % monitor_dir).read())['modifications']

    if options.remove:
        if specific in interface['modifications']:
            print "poping", specific
            interface['modifications'].pop(specific)
            close(interface)
        return

    PU_locations = {}
    PU_overflow = {
        #'RunIISpring15PrePremix' : {
        #    'sites' : ["T1_US_FNAL", "T1_DE_KIT" , "T1_IT_CNAF", "T1_RU_JINR" ,"T2_CH_CERN"],
        #    'max' : 20000,
        #    'pending' : 0
        #    },
        'RunIIFall15DR76': {
            'sites': [
                'T1_ES_PIC', 'T2_US_Purdue', 'T2_UK_SGrid_RALPP', 'T2_BE_IIHE',
                'T2_DE_DESY', 'T2_IT_Legnaro', 'T2_US_Caltech', 'T1_DE_KIT',
                'T2_UK_London_Brunel', 'T2_IT_Pisa', 'T1_US_FNAL',
                'T2_IT_Rome', 'T2_US_Florida', 'T1_IT_CNAF', 'T1_RU_JINR',
                'T2_UK_London_IC', 'T2_US_Nebraska', 'T2_FR_CCIN2P3',
                'T2_US_UCSD', 'T2_ES_CIEMAT', 'T1_FR_CCIN2P3',
                'T2_US_Wisconsin', 'T2_US_MIT', 'T2_DE_RWTH', 'T1_UK_RAL',
                'T2_US_Vanderbilt', 'T2_CH_CERN'
            ],
            'max':
            20000,
            'pending':
            0
        },
        'RunIISpring16DR80': {
            'sites': [
                'T1_ES_PIC', 'T2_US_Purdue', 'T2_UK_SGrid_RALPP', 'T2_BE_IIHE',
                'T2_DE_DESY', 'T2_IT_Legnaro', 'T2_US_Caltech', 'T1_DE_KIT',
                'T2_UK_London_Brunel', 'T2_IT_Pisa', 'T1_US_FNAL',
                'T2_IT_Rome', 'T2_US_Florida', 'T1_IT_CNAF', 'T1_RU_JINR',
                'T2_UK_London_IC', 'T2_US_Nebraska', 'T2_FR_CCIN2P3',
                'T2_US_UCSD', 'T2_ES_CIEMAT', 'T1_FR_CCIN2P3',
                'T2_US_Wisconsin', 'T2_US_MIT', 'T2_DE_RWTH', 'T1_UK_RAL',
                'T2_US_Vanderbilt', 'T2_CH_CERN'
            ],
            'max':
            20000,
            'pending':
            0,
            'force':
            True
        },
        'RunIISpring15DR74': {
            'sites': [
                'T1_ES_PIC', 'T1_DE_KIT', 'T1_US_FNAL', 'T1_IT_CNAF',
                'T1_RU_JINR', 'T1_FR_CCIN2P3', 'T1_UK_RAL', 'T2_CH_CERN'
            ],
            'max':
            20000,
            'pending':
            0
        }
    }

    set_to = SI.sites_AAA
    LHE_overflow = {
        'RunIIWinter15GS': set_to,
        'RunIISummer15GS': set_to,
        'Summer12': set_to,
        'Summer11Leg': set_to
        #'RunIIFall15MiniAODv2' : set_to,
    }

    pending_HLT = 0
    max_HLT = 60000
    pending_T0 = 0
    max_T0 = 60000
    try:
        gmon = json.loads(
            os.popen(
                'curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT'
            ).read())
        pending_HLT += gmon["Running"]
        pending_HLT += gmon["MatchingIdle"]
    except:
        pass

    t0_special = [
        'vlimant_BPH-RunIISummer15GS-00030_00212_v0__160129_135314_9755',
        'pdmvserv_TSG-RunIISummer15GS-00044_00240_v0__160210_121223_8582'
    ]
    no_routing = [
        #'vlimant_BPH-RunIISummer15GS-00030_00212_v0__160129_135314_9755',
        #'pdmvserv_TOP-RunIIWinter15GS-00074_00187_v0__160207_162312_1992',
    ]

    stay_within_site_whitelist = False
    specific_task = None
    if specific and ":" in specific:
        specific, specific_task = specific.split(':')

    if specific:
        wfs = session.query(Workflow).filter(
            Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'away').all()

    random.shuffle(wfs)
    for wfo in wfs:
        if wfo.name in no_routing and not options.augment:
            continue

        if specific and not specific in wfo.name:
            continue
        if specific:
            wfi = workflowInfo(url, wfo.name)
        else:
            cached = filter(lambda d: d['RequestName'] == wfo.name, workflows)
            if not cached: continue
            wfi = workflowInfo(url, wfo.name, request=cached[0])

        ## only running should get re-routed
        if not wfi.request['RequestStatus'] in [
                'running-open', 'running-closed'
        ] and not specific:
            continue

        tasks_and_campaigns = []
        for task in wfi.getWorkTasks():
            tasks_and_campaigns.append((task, getcampaign(task)))

        _, _, _, sec = wfi.getIO()

        ## check needs override
        needs_overide = False
        if not needs_overide and options.augment: needs_overide = True

        def overide_from_agent(wfi, needs_overide):
            """Return the override flag, raised when a listed bad agent is involved.

            `needs_overide` is returned unchanged when `bad_agents` is empty
            (which is the case as written, so `wfi` is not even queried) and
            True when it was already set. Otherwise the flag is raised if any
            entry of `bad_agents` is a key of the 'Running' or 'Acquired'
            dictionaries of `wfi.getAgents()`.
            """
            ## hand-edited list of agents to route around ; empty disables the check
            bad_agents = []  #'http://cmssrv219.fnal.gov:5984']
            if not bad_agents: return needs_overide
            if needs_overide: return True
            agents = wfi.getAgents()

            ## states of `agents` that are scanned for a bad agent
            wqss = ['Running', 'Acquired']
            if any([
                    agent in agents.get(wqs, {}).keys()
                    for wqs, agent in itertools.product(wqss, bad_agents)
            ]):
                print "overriding the need for bad agent"
                needs_overide = True
            return needs_overide

        ## now parse this for action
        for i_task, (task, campaign) in enumerate(tasks_and_campaigns):
            if options.augment:
                print task.pathName
                print campaign

            ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step
            if campaign in LHE_overflow:
                if task.taskType in ['Processing']:
                    needs, task_name, running, idled = needs_action(wfi, task)
                    needs_overide = overide_from_agent(wfi, needs_overide)
                    extend_to = copy.deepcopy(LHE_overflow[campaign])
                    if stay_within_site_whitelist:
                        extend_to = list(
                            set(extend_to) & set(wfi.request['SiteWhitelist'])
                        )  ## restrict to stupid-site-whitelist

                    if extend_to and needs or needs_overide:
                        print "\t", task_name, "of", wfo.name, "running", running, "and pending", idled, "taking action : ReplaceSiteWhitelist"
                        modifications[wfo.name][task.pathName] = {
                            "ReplaceSiteWhitelist":
                            copy.deepcopy(LHE_overflow[campaign]),
                            "Running":
                            running,
                            "Pending":
                            idled,
                            "Priority":
                            wfi.request['RequestPriority']
                        }
                        #print json.dumps( modifications[wfo.name][task.pathName]['ReplaceSiteWhitelist']
                        altered_tasks.add(task.pathName)
                    else:
                        print task_name, "of", wfo.name, "running", running, "and pending", idled

            ### overflow the 76 digi-reco to the site holding the pileup
            if campaign in PU_overflow:
                force = PU_overflow[campaign][
                    'force'] if 'force' in PU_overflow[campaign] else False
                secondary_locations = set(SI.sites_ready)
                for s in sec:
                    if not s in PU_locations:
                        presence = getDatasetPresence(url, s)
                        #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
                        one_secondary_locations = [
                            site for (site, (there, frac)) in presence.items()
                            if frac > 98.
                        ]
                        PU_locations[s] = one_secondary_locations
                    print "secondary is at", sorted(PU_locations[s])
                    secondary_locations = set(
                        PU_locations[s]) & secondary_locations

                ## we should add all sites that hold the secondary input if any
                secondary_locations = list(
                    set(PU_overflow[campaign]['sites']) & set(SI.sites_ready))
                if any([
                        task.pathName.endswith(finish)
                        for finish in ['_0', 'StepOneProc', 'Production']
                ]):
                    needs, task_name, running, idled = needs_action(wfi, task)
                    ## removing the ones in the site whitelist already since they encode the primary input location
                    if stay_within_site_whitelist:
                        original_site_in_use = set(
                            wfi.request['SiteWhitelist'])
                    else:
                        original_site_in_use = set(secondary_locations)
                    ## remove the sites that have already running jobs
                    gmon = wfi.getGlideMon()
                    if gmon and task_name in gmon and 'Sites' in gmon[
                            task_name]:
                        site_in_use = set(gmon[task_name]['Sites'])
                        ## that determines where you want to run in addition
                        #augment_by = list((set(secondary_locations)- site_in_use))
                        augment_by = list(
                            (set(secondary_locations) - site_in_use)
                            & original_site_in_use
                        )  ## restrict to stupid-site-whitelist
                    else:
                        augment_by = list(original_site_in_use)

                    needs_overide = overide_from_agent(wfi, needs_overide)
                    if augment_by and (
                            needs or needs_overide
                            or force) and PU_overflow[campaign][
                                'pending'] < PU_overflow[campaign]['max']:
                        PU_overflow[campaign]['pending'] += idled
                        print "raising overflow to", PU_overflow[campaign][
                            'pending'], "for", PU_overflow[campaign]['max']
                        ## the step with an input ought to be the digi part : make this one go anywhere
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": augment_by,
                            "Running": running,
                            "Pending": idled,
                            "Priority": wfi.request['RequestPriority']
                        }
                        altered_tasks.add(task.pathName)
                        print "\t", task_name, "of", wfo.name, "running", running, "and pending", idled, "taking action : AddWhitelist"
                        #print json.dumps( augment_by, indent=2 )
                    else:
                        print task_name, "of", wfo.name, "running", running, "and pending", idled

            ### overflow the skims back to multi-core
            if campaign in ['Run2015D', 'Run2015C_25ns'
                            ] and task.taskType == 'Skim':
                original_swl = wfi.request['SiteWhitelist']
                needs, task_name, running, idled = needs_action(wfi, task)
                if (needs or needs_overide):
                    modifications[wfo.name][task.pathName] = {
                        'AddWhitelist': original_swl,
                        "Running": running,
                        "Pending": idled,
                        "Priority": wfi.request['RequestPriority']
                    }
                    altered_tasks.add(task.pathName)
                    print "\t", task_name, "of", wfo.name, "running", running, "and pending", idled, "taking action : AddWhitelist"

            if options.augment:
                print sorted(wfi.request['SiteWhitelist']), i_task, use_HLT
            ### add the HLT at partner of CERN
            if 'T2_CH_CERN' in wfi.request[
                    'SiteWhitelist'] and i_task == 0 and use_HLT:
                needs, task_name, running, idled = needs_action(wfi, task)
                if options.augment: needs = True
                needs = True
                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide) and pending_HLT < max_HLT:
                    pending_HLT += idled
                    if task.pathName in modifications[
                            wfo.name] and 'AddWhitelist' in modifications[
                                wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName][
                            "AddWhitelist"].append("T2_CH_CERN_HLT")
                        print "\t", wfo.name, "adding addHLT up to", pending_HLT, "for", max_HLT
                        print task.pathName
                    ## this Replace does not work at all for HLT
                    #elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                    #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" )
                    #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT
                    else:
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": ["T2_CH_CERN_HLT"],
                            "Priority": wfi.request['RequestPriority'],
                            "Running": running,
                            "Pending": idled
                        }
                        print "\t", wfo.name, "adding HLT up to", pending_HLT, "for", max_HLT
                        print task.pathName

            if i_task == 0 and not sec and use_T0:
                needs, task_name, running, idled = needs_action(wfi, task)

                if options.augment: needs = True
                #needs = True
                #if not (wfo.name in t0_special) and not options.augment: needs = False
                if not wfi.request['RequestType'] in [
                        'MonteCarlo', 'MonteCarloFromGEN'
                ] and not options.augment:
                    needs = False

                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide):
                    pending_T0 += idled
                    if task.pathName in modifications[
                            wfo.name] and 'AddWhitelist' in modifications[
                                wfo.name][task.pathName]:
                        modifications[wfo.name][
                            task.pathName]["AddWhitelist"].append("T0_CH_CERN")
                        print "\t", wfo.name, "adding addT0 up to", pending_T0, "for", max_T0
                        print task.pathName
                    elif task.pathName in modifications[
                            wfo.
                            name] and 'ReplaceSiteWhitelist' in modifications[
                                wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName][
                            "ReplaceSiteWhitelist"].append("T0_CH_CERN")
                        print "\t", wfo.name, "adding replace T0 up to", pending_T0, "for", max_T0
                    else:
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": ["T0_CH_CERN"],
                            "Priority": wfi.request['RequestPriority'],
                            "Running": running,
                            "Pending": idled
                        }
                        print "\t", wfo.name, "adding T0 up to", pending_T0, "for", max_T0
                        print task.pathName

    interface['modifications'].update(modifications)

    ## temporary core managing
    interface['cores'] = {
        'T2_CH_CERN_HLT': {
            'min': 4,
            'max': 16
        },
        'default': {
            'min': 1,
            'max': 4
        }
    }
    #interface['max_cores']={'T2_CH_CERN_HLT': 16, 'default': 4}
    #interface['min_cores']={'T2_CH_CERN_HLT': 4, 'default': 1}
    #interface['resize_subtasks'] = 'RunIISpring16DR80'
    interface['resizes'] = ['RunIISpring16DR80', 'NotACampaign']

    ## close and save
    close(interface)
Exemplo n.º 22
0
def closor(url, specific=None, options=None):
    if userLock(): return
    mlock = moduleLock()
    if mlock() and not options.manual: return
    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    BI = batchInfo()
    CloseI = closeoutInfo()

    all_late_files = []

    jump_the_line = options.announce if options else False
    if jump_the_line:
        print "announce option is on. Checking on things on-going ready to be announced"
        wfs = session.query(Workflow).filter(
            Workflow.status.contains('announce')).filter(
                sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        print "regular option. Checking on things done and to be announced"
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    if specific:
        wfs = [wfo for wfo in wfs if specific in wfo.name]
    wfs_n = [w.name for w in wfs]

    print "unique names?"
    print len(set(wfs_n)) == len(wfs_n)

    held = set()

    print len(wfs), "closing"
    random.shuffle(wfs)
    max_per_round = UC.get('max_per_round').get('closor', None)
    if options.limit: max_per_round = options.limit

    if max_per_round:
        ## order them by priority
        all_closedout = sorted(getWorkflows(url, 'closed-out', details=True),
                               key=lambda r: r['RequestPriority'])
        all_closedout = [r['RequestName'] for r in all_closedout]

        def rank(wfn):
            return all_closedout.index(wfn) if wfn in all_closedout else 0

        wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True)
        wfs = wfs[:max_per_round]

    batch_go = {}
    batch_warnings = defaultdict(set)
    batch_extreme_warnings = defaultdict(set)
    batch_goodness = UC.get("batch_goodness")

    closers = []

    print len(wfs), "closing"
    th_start = time.mktime(time.gmtime())

    for iwfo, wfo in enumerate(wfs):
        if specific and not specific in wfo.name: continue
        if not options.manual and (
                'cmsunified_task_HIG-RunIIFall17wmLHEGS-05036__v1_T_200712_005621_4159'
                .lower() in (wfo.name).lower() or
                'pdmvserv_task_HIG-RunIISummer16NanoAODv7-03979__v1_T_200915_013748_1986'
                .lower() in (wfo.name).lower()):
            continue
        closers.append(
            CloseBuster(
                wfo=wfo,
                url=url,
                CI=CI,
                UC=UC,
                jump_the_line=jump_the_line,
                batch_goodness=batch_goodness,
                batch_go=batch_go,
                #stats = stats,
                batch_warnings=batch_warnings,
                batch_extreme_warnings=batch_extreme_warnings,
                all_late_files=all_late_files,
                held=held,
            ))

    run_threads = ThreadHandler(threads=closers,
                                n_threads=options.threads,
                                sleepy=10,
                                timeout=None,
                                verbose=True,
                                label='closor')

    run_threads.start()

    ## waiting on all to complete
    while run_threads.is_alive():
        #print "Waiting on closing threads",time.asctime(time.gmtime())
        time.sleep(5)

    JC = JIRAClient() if up.status.get('jira', False) else None
    print len(
        run_threads.threads), "finished thread to gather information from"
    failed_threads = 0
    for to in run_threads.threads:
        if to.failed:
            failed_threads += 1
            continue
        if to.outs:
            for outO in to.outs:
                out = outO.datasetname
                odb = session.query(Output).filter(
                    Output.datasetname == out).first()
                if not odb:
                    print "adding an output object", out
                    session.add(outO)
                else:
                    odb.date = outO.date

        if to.to_status:
            to.wfo.status = to.to_status
            if JC and to.to_status == "done" and to.wfi:
                jiras = JC.find({"prepid": to.wfi.request['PrepID']})
                for jira in jiras:
                    JC.close(jira.key)

        if to.to_wm_status:
            to.wfo.wm_status = to.to_wm_status
        if to.closing:
            CloseI.pop(to.wfo.name)

        session.commit()

    th_stop = time.mktime(time.gmtime())

    if wfs:
        time_spend_per_workflow = (th_stop - th_start) / float(len(wfs))
        print "Average time spend per workflow is", time_spend_per_workflow

    if float(failed_threads / run_threads.n_threads) > 0:
        sendLog('checkor',
                '%d/%d threads have failed, better check this out' %
                (failed_threads, run_threads.n_threads),
                level='critical')
        sendEmail(
            'checkor', '%d/%d threads have failed, better check this out' %
            (failed_threads, run_threads.n_threads))

    days_late = 0.
    retries_late = 10

    really_late_files = [
        info for info in all_late_files if info['retries'] >= retries_late
    ]
    really_late_files = [
        info for info in really_late_files
        if info['delay'] / (60 * 60 * 24.) >= days_late
    ]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (
            len(really_late_files), days_late, retries_late,
            json.dumps(really_late_files, indent=2))
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor', subject)
        print subject
        open('%s/stuck_files.json' % monitor_dir,
             'w').write(json.dumps(really_late_files, indent=2))

    if held:
        sendLog('closor',
                "the workflows below are held up \n%s" %
                ("\n".join(sorted(held))),
                level='critical')

    for bname, go in batch_go.items():
        if go:
            subject = "Release Validation Samples Batch %s" % bname
            issues = ""
            #if batch_warnings[ bname ]:
            #    issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness
            #    issues+="\n".join( sorted( batch_warnings[ bname ] ))
            #    issues+="\n\n"
            if batch_extreme_warnings[bname]:
                subject = "Low Statistics for %s" % bname
                issues = "The following datasets have outstanding completion (<50%%) issues:\n\n"
                issues += "\n".join(sorted(batch_extreme_warnings[bname]))
                issues += "\n\n"
            elif batch_warnings[bname]:
                issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness
                issues += "\n".join(sorted(batch_warnings[bname]))
                issues += "\n\n"
            text = ""
            text += "Dear all,\n\n"
            text += "A batch of release validation workflows has finished.\n\n"
            text += "Batch ID:\n\n"
            text += "%s\n\n" % (bname)
            text += "Detail of the workflows\n\n"
            text += "https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s\n\n" % (
                bname)
            text += "%s\n\n" % (issues)
            text += "This is an automated message.\n\n"
            text += ""
            to = ['*****@*****.**']
            sendEmail(subject, text, destination=to)
            ## just announced ; take it out now.
            BI.pop(bname)
            deleteCampaignConfig(bname)

    if os.path.isfile('.closor_stop'):
        print "The loop on workflows was shortened"
        sendEmail('closor',
                  'Closor loop was shortened artificially using .closor_stop')
        os.system('rm -f .closor_stop')
Exemplo n.º 23
0
def closor(url, specific=None, options=None):
    if userLock(): return
    mlock  = moduleLock()
    if mlock(): return
    up = componentInfo(soft=['mcm','wtc'])
    if not up.check(): return


    UC = unifiedConfiguration()
    CI = campaignInfo()
    BI = batchInfo()
    CloseI = closeoutInfo()

    all_late_files = []

    jump_the_line = options.announce if options else False
    if jump_the_line:
        print "announce option is on. Checking on things on-going ready to be announced"
        wfs = session.query(Workflow).filter(Workflow.status.contains('announce')).filter(sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        print "regular option. Checking on things done and to be announced"
        wfs = session.query(Workflow).filter(Workflow.status=='close').all()

    if specific:
        wfs = [wfo for wfo in wfs if specific in wfo.name]
    wfs_n = [w.name for w in wfs]

    print "unique names?"
    print len(set(wfs_n)) == len(wfs_n)
    
    held = set()

    print len(wfs),"closing"
    random.shuffle( wfs )    
    max_per_round = UC.get('max_per_round').get('closor',None)
    if options.limit: max_per_round = options.limit

    if max_per_round: 
        ## order them by priority
        all_closedout = sorted(getWorkflows(url, 'closed-out', details=True), key = lambda r : r['RequestPriority'])
        all_closedout = [r['RequestName'] for r in all_closedout]
        def rank( wfn ):
            return all_closedout.index( wfn ) if wfn in all_closedout else 0

        wfs = sorted( wfs, key = lambda wfo : rank( wfo.name ),reverse=True)
        wfs = wfs[:max_per_round]

    batch_go = {}
    batch_warnings = defaultdict(set)
    batch_goodness = UC.get("batch_goodness")

    closers = []

    print len(wfs),"closing"
    th_start = time.mktime(time.gmtime())

    for iwfo,wfo in enumerate(wfs):
        if specific and not specific in wfo.name: continue

        closers.append( CloseBuster(
            wfo = wfo,
            url = url,
            CI = CI,
            UC = UC,
            jump_the_line = jump_the_line,
            batch_goodness = batch_goodness,
            batch_go = batch_go,
            #stats = stats,
            batch_warnings = batch_warnings,
            all_late_files = all_late_files,
            held = held,
            ))

    
    run_threads = ThreadHandler( threads = closers,
                                 n_threads = options.threads,
                                 sleepy = 10,
                                 timeout = None,
                                 verbose = True,
                                 label = 'closor')

    run_threads.start()


    ## waiting on all to complete
    while run_threads.is_alive():
        #print "Waiting on closing threads",time.asctime(time.gmtime())
        time.sleep(5)

    JC = JIRAClient() if up.status.get('jira',False) else None
    print len(run_threads.threads),"finished thread to gather information from"
    failed_threads = 0
    for to in run_threads.threads:
        if to.failed:
            failed_threads += 1
            continue
        if to.outs:
            for outO in to.outs:
                out = outO.datasetname
                odb = session.query(Output).filter(Output.datasetname==out).first()
                if not odb:
                    print "adding an output object",out
                    session.add( outO )
                else:
                    odb.date = outO.date
                
        if to.to_status:
            to.wfo.status = to.to_status
            if JC and to.to_status == "done" and to.wfi:
                jiras = JC.find({"prepid" : to.wfi.request['PrepID']})
                for jira in jiras:
                    JC.close(jira.key)

        if to.to_wm_status:
            to.wfo.wm_status = to.to_wm_status
        if to.closing:
            CloseI.pop( to.wfo.name )

        session.commit()

    th_stop = time.mktime(time.gmtime())

    if wfs:
        time_spend_per_workflow = (th_stop-th_start) / float(len(wfs))
        print "Average time spend per workflow is", time_spend_per_workflow

    if float(failed_threads/run_threads.n_threads) > 0:
        sendLog('checkor','%d/%d threads have failed, better check this out'% (failed_threads, run_threads.n_threads), level='critical')
        sendEmail('checkor','%d/%d threads have failed, better check this out'% (failed_threads,run_threads.n_threads))

    days_late = 0.
    retries_late = 10

    really_late_files = [info for info in all_late_files if info['retries']>=retries_late]
    really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) )
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor',subject)
        print subject
        open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2))

    if held:
        sendLog('closor',"the workflows below are held up \n%s"%("\n".join( sorted(held) )), level='critical')

    for bname,go in batch_go.items():
        if go:
            subject = "Release Validation Samples Batch %s"% bname
            issues=""
            if batch_warnings[ bname ]:
                issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness
                issues+="\n".join( sorted( batch_warnings[ bname ] ))
                issues+="\n\n"
            text = """
Dear all,

a batch of release validation workflows has finished.

Batch ID:

%s

Detail of the workflows

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

%s 
This is an automated message.
"""%( bname, 
      bname,
      issues)
            to = ['*****@*****.**']
            sendEmail(subject, text, destination=to )
            ## just announced ; take it out now.
            BI.pop( bname )


    if os.path.isfile('.closor_stop'):
        print "The loop on workflows was shortened"
        sendEmail('closor','Closor loop was shortened artificially using .closor_stop')
        os.system('rm -f .closor_stop')
Exemplo n.º 24
0
def injector(url, options, specific):

    ## passing a round of invalidation of what needs to be invalidated
    if options.invalidate:
        invalidator(url)

    workflows = getWorkflows(url, status=options.wmstatus,user=options.user)
    existing = [wf.name for wf in session.query(Workflow).all()]
    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if wf not in existing:
            print "putting",wf
            new_wf = Workflow( name = wf , status = options.setstatus, wm_status = options.wmstatus) 
            session.add( new_wf )
            session.commit()


    existing = [wf.name for wf in session.query(Workflow).all()]


    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        if specific and wf.name != specific:
            continue
        print wf.name
        wl = getWorkLoad(url, wf.name)
        familly = getWorkflowById( url, wl['PrepID'] )
        if len(familly)==1:
            print wf.name,"ERROR has no replacement"
            continue
        print wf.name,"has",len(familly),"familly members"
        for member in familly:
            if member != wf.name:
                fwl = getWorkLoad(url , member)
                if options.replace:
                    if member != options.replace: continue
                else:
                    if fwl['RequestDate'] < wl['RequestDate']: continue
                    if fwl['RequestType']=='Resubmission': continue
                    if fwl['RequestStatus'] in ['None',None]: continue

                new_wf = session.query(Workflow).filter(Workflow.name == member).first()
                if not new_wf:
                    print "putting",member
                    status = 'away'
                    if fwl['RequestStatus'] in ['assignment-approved']:
                        status = 'considered'
                    new_wf = Workflow( name = member, status = status, wm_status = fwl['RequestStatus'])
                    wf.status = 'forget'
                    session.add( new_wf ) 
                    session.commit()
                else:
                    if new_wf.status == 'forget': continue
                    print "getting",new_wf.name,"as replacement of",wf.name


                for tr in session.query(Transfer).all():
                    if wf.id in tr.workflows_id:
                        sw = copy.deepcopy(tr.workflows_id)
                        sw.remove( wf.id)
                        sw.append(new_wf.id)
                        tr.workflows_id = sw
                        print tr.phedexid,"got",new_wf.name
                        if new_wf.status != 'away':
                            new_wf.status = 'staging'
                        session.commit()
                        

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
Exemplo n.º 25
0
def htmlor(caller=""):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    try:
        boost = json.loads(open('%s/equalizor.json' %
                                monitor_dir).read())['modifications']
    except:
        boost = {}
    cache = getWorkflows(reqmgr_url, 'assignment-approved', details=True)
    cache.extend(getWorkflows(reqmgr_url, 'acquired', details=True))
    cache.extend(getWorkflows(reqmgr_url, 'running-open', details=True))
    cache.extend(getWorkflows(reqmgr_url, 'running-closed', details=True))

    def getWL(wfn):
        cached = filter(lambda d: d['RequestName'] == wfn, cache)
        if cached:
            wl = cached[0]
        else:
            wl = getWorkLoad(reqmgr_url, wfn)
        return wl

    def wfl(wf,
            view=False,
            p=False,
            ms=False,
            within=False,
            ongoing=False,
            status=False,
            update=False):
        wfn = wf.name
        wfs = wf.wm_status
        wl = None
        pid = None
        wl_pid = None
        pids = filter(lambda seg: seg.count('-') == 2, wf.name.split('_'))
        if len(pids):
            pids = pids[:1]
            pid = pids[0]

        if not pids:
            wl = getWL(wf.name)
            pids = getPrepIDs(wl)
            pid = pids[0]

        wl_pid = pid
        if 'task' in wf.name:
            wl_pid = 'task_' + pid

        text = ', '.join([
            #wfn,
            #'<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a> '%(wfn,wfn),
            #'<table><tr><td>%s</td></tr></table>'%(wfn),
            #'<span>%s</span>'%(wfn),
            "%s " % wfn,
            '(%s) <br>' % wfs
        ])
        text += ', '.join([
            '<a href="https://%s/reqmgr2/fetch?rid=%s" target="_blank">dts</a>'
            % (reqmgr_url, wfn),
            '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts-req1</a>'
            % wfn,
            #TOFIX '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>'%wfn,
            '<a href="https://%s/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>'
            % (reqmgr_url, wfn),
            '<a href="https://%s/reqmgr2/data/request?name=%s" target="_blank">req</a>'
            % (reqmgr_url, wfn),
            #'<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>'%wfn,
            #TOFIX '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>'%wfn,
            '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>'
            % wfn,
            '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>'
            % wfn,
            '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>'
            % pid,
            '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank">pv</a>'
            % wfn,
            #deprecated '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>'%wfn,
            '<a href="closeout.html#%s" target="_blank">clo</a>' % wfn,
            '<a href="statuses.html#%s" target="_blank">st</a>' % wfn,
            '<a href="https://%s/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>'
            % (reqmgr_url, wfn)
        ])
        if within and (not view or wfs == 'completed'):
            wl = getWL(wfn)
            dataset = None
            if 'InputDataset' in wl:
                dataset = wl['InputDataset']
            if 'Task1' in wl and 'InputDataset' in wl['Task1']:
                dataset = wl['Task1']['InputDataset']

            if dataset:
                text += ', '.join([
                    '',
                    '<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>'
                    % dataset,
                    '<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>'
                    % dataset,
                    '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>'
                    % dataset,
                    '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>'
                    % dataset,
                ])

        if p:
            cached = filter(lambda d: d['RequestName'] == wfn, cache)
            if cached:
                wl = cached[0]
            else:
                wl = getWorkLoad('cmsweb.cern.ch', wfn)
            text += ', (%s)' % (wl['RequestPriority'])
            pass

        if pid:
            if ms:
                mcm_s = json.loads(
                    os.popen(
                        'curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure'
                        % pid).read())[pid]
                text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>' % (
                    pid, mcm_s)
            else:
                text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>' % (
                    pid)
                text += ', <a href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>' % (
                    wl_pid)

        if status:
            if wf.status.startswith('assistance'):
                text += ', <a href="assistance.html#%s" target="_blank">assist</a>' % wfn
            text += ' : %s ' % (wf.status)

        if view and wfs != 'acquired':
            text += '<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>' % (
                wfn.replace('_', '/'), wfn.replace('_', '/'))
        if ongoing:
            text += '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a>' % (
                wfn, wfn)

        if ongoing:
            date1 = time.strftime(
                '%Y-%m-%d+%H:%M',
                time.gmtime(time.mktime(time.gmtime()) - (15 * 24 * 60 * 60)))
            date2 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime())
            text += '<a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#table=Jobs&date1=%s&date2=%s&sortby=site&task=wmagent_%s">dashb</a>' % (
                date1, date2, wfn)

        if ongoing and wfn in boost:
            for task in boost[wfn]:
                overflow = boost[wfn][task].get('ReplaceSiteWhitelist', None)
                if not overflow:
                    overflow = boost[wfn][task].get('AddWhitelist', None)
                if overflow:
                    text += ',boost (<a href=equalizor.json>%d</a>)' % len(
                        overflow)

        #text+="<hr>"
        return text

    def phl(phid):
        text = ', '.join([
            str(phid),
            '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>'
            % phid,
            '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>'
            % phid,
        ])
        return text

    def ol(out):
        return '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>' % (
            out, out)

    def lap(comment):

        l = time.mktime(time.gmtime())
        spend = l - lap.start
        lap.start = l
        print "Spend %d [s] for %s" % (spend, comment)

    lap.start = time.mktime(time.gmtime())

    ## start to write it
    #html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w')
    html_doc = open('%s/index.html.new' % monitor_dir, 'w')
    print "Updating the status page ..."

    UC = unifiedConfiguration()

    if not caller:
        try:
            #caller = sys._getframe(1).f_code.co_name
            caller = sys.argv[0].split('/')[-1].replace('.py', '')
            print "caller is"
            print caller
        except Exception as es:
            caller = 'none found'
            print "not getting frame"
            print str(es)

    html_doc.write("""
<html>
<head>
<META HTTP-EQUIV="refresh" CONTENT="900">
<script type="text/javascript">
 function showhide(id) {
    var e = document.getElementById(id);
    e.style.display = (e.style.display == 'block') ? 'none' : 'block';
 }
</script>
</head>
<body>

Last update on %s(CET), %s(GMT)
<br>
<a href=logs/ target=_blank>logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/ target=_blank>prod mon</a> <a href=https://%s/wmstats/index.html target=_blank>wmstats</a> <a href=http://t3serv001.mit.edu/~cmsprod/IntelROCCS/Detox/SitesInfo.txt target=_blank>detox</a> <a href=locked.html>space</a> <a href=logs/subscribor/last.log target=_blank>blocks</a> <a href=logs/agents/last.log>agents</a>
<br>
<a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a> <a href=data.html>json interfaces</a> <a href=logs/addHoc/last.log>add-hoc op</a> created from <b>%s <a href=logs/last_running>last running</a></b> <object height=20 type="text/html" data="logs/last_running"><p>backup content</p></object>
<br><br>

""" % (time.asctime(time.localtime()), time.asctime(
        time.gmtime()), reqmgr_url, caller))

    text = ""
    count = 0
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(
            Workflow.status.startswith('considered')).all():
        wl = getWL(wf.name)
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])] += 1
        #print wf.name
        text += "<li> %s </li> \n" % wfl(wf, p=True)
        count += 1
    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(
            count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    html_doc.write("""
Worflow next to handle (%d) <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank"> batches</a> <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('considered')">[Click to show/hide]</a>
<br>
<div id="considered" style="display:none;">
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('considered_bywf')">[Click to show/hide]</a><div id="considered_bywf" style="display:none;">
 <ul>
 %s
 </ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('considered_bycamp')">[Click to show/hide]</a><div id="considered_bycamp" style="display:none;">
 <ul>
 %s
 </ul></div>
</ul>
</div>
""" % (count, count, text, len(count_by_campaign), text_by_c))

    lap('done with considered')
    text = ""
    count = 0
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wl = getWL(wf.name)
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])] += 1
        text += "<li> %s </li> \n" % wfl(wf, within=True)
        count += 1

    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(
            count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    html_doc.write("""
Worflow waiting in staging (%d) <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staging')">[Click to show/hide]</a>
<br>
<div id="staging" style="display:none;">
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('staging_bywf')">[Click to show/hide]</a><div id="staging_bywf" style="display:none;">                                                                                                                                                                       
 <ul>            
 %s
 </ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('staging_bycamp')">[Click to show/hide]</a><div id="staging_bycamp" style="display:none;">                                                                                                                                                                  
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>                                                                                                                                                                                                                                                                                                               
</ul>      
</div>
""" % (count, count, text, len(count_by_campaign), text_by_c))

    lap('done with staging')

    text = "<ul>"
    count = 0
    transfer_per_wf = defaultdict(list)
    for ts in session.query(Transfer).filter(Transfer.phedexid > 0).all():
        hide = True
        t_count = 0
        stext = ""
        for pid in ts.workflows_id:
            w = session.query(Workflow).get(pid)
            hide &= (w.status != 'staging')
            if w.status in ['considered', 'staging', 'staged']:
                stext += "<li> %s </li>\n" % (wfl(w, status=True))
                transfer_per_wf[w].append(ts.phedexid)
                t_count += 1
        stext = '<li> %s serves %d workflows<br><a href="javascript:showhide(\'%s\')">[show/hide]</a> <div id="%s" style="display:none;"><ul>\n' % (
            phl(ts.phedexid), t_count, ts.phedexid, ts.phedexid) + stext

        stext += "</ul></li>\n"
        if hide:
            #text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid)
            pass
        else:
            count += 1
            text += stext
    text += "</ul>"

    text_bywf = "<ul>"
    for wf in transfer_per_wf:
        text_bywf += "<li> %s </li>" % (wfl(wf, within=True))
        text_bywf += '<a href=javascript:showhide("transfer_%s")>[Click to show/hide] %d transfers</a>' % (
            wf.name, len(transfer_per_wf[wf]))
        text_bywf += '<div id="transfer_%s" style="display:none;">' % wf.name
        text_bywf += "<ul>"
        for pid in sorted(transfer_per_wf[wf]):
            text_bywf += "<li> %s </li>" % (phl(pid))
        text_bywf += "</ul></div><hr>"
    text_bywf += '</ul>'

    html_doc.write("""
Transfer on-going (%d) <a href=http://cmstransferteam.web.cern.ch/cmstransferteam/ target=_blank>dashboard</a> <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('transfer')">[Click to show/hide]</a>
<br>
<div id="transfer" style="display:none;">
 <ul>
  <li> By Workflow
    <a href="javascript:showhide('transfer_bywf')">[Click to show/hide]</a>
    <div id="transfer_bywf" style="display:none;">
%s
    </div>
  </li>
  <li> By transfer request
    <a href="javascript:showhide('transfer_byreq')">[Click to show/hide]</a>
    <div id="transfer_byreq" style="display:none;"> 
%s
    </div>
  </li>
 </ul>
</div>
""" % (count, text_bywf, text))

    lap('done with transfers')

    text = ""
    count = 0
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(
            Workflow.status == 'staged').all():
        wl = getWL(wf.name)
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])] += 1
        text += "<li> %s </li> \n" % wfl(wf, p=True)
        count += 1
    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(
            count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    html_doc.write(
        """Worflow ready for assigning (%d) <a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staged')">[Click to show/hide]</a>
<br>
<div id="staged" style="display:none;">
<br>
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('staged_bywf')">[Click to show/hide]</a><div id="staged_bywf" style="display:none;">                                                                                                                                                                             
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>                                                                                                                                                                                                                                                                                                               
<li> By campaigns (%d) </li><a href="javascript:showhide('staged_bycamp')">[Click to show/hide]</a><div id="staged_bycamp" style="display:none;">                                                                                                                                                                        
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>
</ul>
</div>
""" % (count, count, text, len(count_by_campaign), text_by_c))

    lap('done with staged')

    lines = []
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status == 'away').all():
        wl = getWL(wf.name)
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])] += 1
        lines.append("<li> %s <hr></li>" % wfl(wf, view=True, ongoing=True))
    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/campaign.php?campaign=%s>mon</a> <a href=https://cms-pdmv.cern.ch/pmp/historical?r=%s target=_blank>pmp</a> " % (
            c, sum(count_by_campaign[c].values()), c, c)
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    lines.sort()
    html_doc.write("""
Worflow on-going (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a> <a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a> <a href=http://cms-gwmsmon.cern.ch/prodview target=_blank>queues</a> <a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a> <a href=logs/equalizor/last.log target=_blank>equ</a> <a href=logs/completor/last.log target=_blank>comp</a>
<a href="javascript:showhide('away')">[Click to show/hide]</a>
<br>
<div id="away" style="display:none;">
<ul> 
<li>By workflow (%d) </li>
<a href="javascript:showhide('away_bywf')">[Click to show/hide]</a><div id="away_bywf" style="display:none;">
<ul>
%s
</ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('away_bycamp')">[Click to show/hide]</a><div id="away_bycamp" style="display:none;">
<ul>
%s
</ul></div>
</ul>
</div>
""" % (len(lines), len(lines), '\n'.join(lines), len(count_by_campaign),
       text_by_c))

    lap('done with away')

    text = ""
    count = 0
    #for wf in session.query(Workflow).filter(Workflow.status == 'assistance-custodial').all():
    for wf in session.query(Workflow).filter(
            Workflow.status.startswith('assistance')).filter(
                Workflow.status.contains('custodial')).all():
        text += "<li> %s </li> \n" % wfl(
            wf, view=True, update=True, status=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""Worflow that are closing (%d)
<a href=closeout.html target=_blank>closeout</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('closing')">[Click to show/hide]</a>
<br>
<div id="closing" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    lap('done with closing')

    assistance_by_type = defaultdict(list)
    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status.startswith('assistance-')).all():
        assistance_by_type[wf.status].append(wf)
        count += 1
    for assistance_type in assistance_by_type:
        text += "<li> %s (%d) <a href=\"javascript:showhide('%s')\">[Click to show/hide]</a><br><div id=\"%s\" style=\"display:none;\"><ul>" % (
            assistance_type,
            len(assistance_by_type[assistance_type]),
            assistance_type,
            assistance_type,
        )
        for wf in assistance_by_type[assistance_type]:
            text += "<li> %s <hr></li> \n" % wfl(
                wf, view=True, within=True, status=True, update=True)
        text += "</ul></div></li>\n"
    html_doc.write("""Worflow which need assistance (%d)
<a href=assistance.html target=_blank>assistance</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/recoveror/last.log target=_blank>postlog</a>
<a href="javascript:showhide('assistance')">[Click to show/hide]</a>
<br>
<div id="assistance" style="display:none;">
<br>
<ul>
%s
</ul>
</div>
""" % (count, text))

    lap('done with assistance')

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'close').all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""Worflow ready to close (%d)
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('close')">[Click to show/hide]</a>
<br>
<div id="close" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    lap('done with annoucing')

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status == 'trouble').all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """Worflow with issue (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a>
<a href="javascript:showhide('trouble')">[Click to show/hide]</a>
<br>
<div id="trouble" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    lap('done with trouble')

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status == 'forget').all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Worflow to forget (%d) <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/lockor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('forget')">[Click to show/hide]</a>
<br>
<div id="forget" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    lap('done with forget')

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'done').all():
        text += "<li> %s </li> \n" % wfl(wf)  #,ms=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Worflow through (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/lockor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('done')">[Click to show/hide]</a>
<br>
<div id="done" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    lap('done with done')

    wfs = session.query(Workflow).filter(
        Workflow.status.endswith('-unlock')).all()
    html_doc.write(
        " Workflows unlocked : %s <a href=logs/lockor/last.log target=_blank>log</a><br>"
        % (len(wfs)))
    lap('done with unlocked')

    text = ""
    lines_thisweek = []
    lines_lastweek = []
    now = time.mktime(time.gmtime())
    this_week = int(time.strftime("%W", time.gmtime()))
    start_time_two_weeks_ago = time.mktime(
        time.gmtime(now - (20 * 24 * 60 * 60)))  # 20
    last_week = int(time.strftime("%W", time.gmtime(now - (7 * 24 * 60 * 60))))

    all_locks = json.loads(open('%s/globallocks.json' % monitor_dir).read())
    waiting_custodial = json.loads(
        open('%s/waiting_custodial.json' % monitor_dir).read())
    all_pending_approval_custodial = dict([
        (k, item) for k, item in waiting_custodial.items() if 'nodes' in item
        and not any([node['decided'] for node in item['nodes'].values()])
    ])
    n_pending_approval = len(all_pending_approval_custodial)
    #n_pending_approval = len([item for item in waiting_custodial.values() if 'nodes' in item and not any([node['decided'] for node in item['nodes'].values() ])])
    missing_approval_custodial = json.loads(
        open('%s/missing_approval_custodial.json' % monitor_dir).read())

    stuck_custudial = json.loads(
        open('%s/stuck_custodial.json' % monitor_dir).read())
    lagging_custudial = json.loads(
        open('%s/lagging_custodial.json' % monitor_dir).read())
    if len(stuck_custudial):
        stuck_string = ', <font color=red>%d appear to be <a href=stuck_custodial.json>stuck</a></font>' % len(
            stuck_custudial)
    else:
        stuck_string = ''
    if len(missing_approval_custodial):
        long_approve_string = ', <font color=red>%d more than %d days</font>' % (
            len(missing_approval_custodial), UC.get('transfer_timeout'))
    else:
        long_approve_string = ''

    output_within_two_weeks = session.query(Output).filter(
        Output.date >= start_time_two_weeks_ago).all()
    waiting_custodial_string = ""
    waiting_custodial_strings = []
    for ds in waiting_custodial:
        out = None
        ## lots of it will be within two weeks
        of = filter(lambda odb: odb.datasetname == ds, output_within_two_weeks)
        if of:
            out = of[0]
        else:
            out = session.query(Output).filter(
                Output.datasetname == ds).first()
        if out:
            info = waiting_custodial[out.datasetname]
            action = 'going'
            if out.datasetname in all_pending_approval_custodial:
                action = '<font color=red>pending</font>'
            try:
                size = str(info['size'])
            except:
                size = "x"

            destination = ",".join(info['nodes'].keys())
            if not destination:
                destination = '<font color=red>NO SITE</font>'

            a_waiting_custodial_string = "<li>on week %s : %s %s</li>" % (
                time.strftime("%W (%x %X)", time.gmtime(
                    out.date)), ol(out.datasetname),
                ' %s [GB] %s to %s on %s (<a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/requestlist?dataset=%s&node=T*MSS">%d missing</a>)'
                % (size, action, destination,
                   time.asctime(time.gmtime(
                       info['checked'])), out.datasetname, info['nmissing']))
            waiting_custodial_strings.append(
                (out.date, a_waiting_custodial_string))

        waiting_custodial_strings.sort(key=lambda i: i[0])
        waiting_custodial_string = "\n".join(
            [i[1] for i in waiting_custodial_strings])
    #start_time_two_weeks_ago = time.mktime(time.strptime("15-0-%d"%(this_week-2), "%y-%w-%W"))
    for out in output_within_two_weeks:
        if not out.workflow:
            print "This is a problem with", out.datasetname
            continue
        if out.workflow.status in [
                'done-unlock', 'done', 'clean', 'clean-out', 'clean-unlock'
        ]:
            custodial = ''
            if out.datasetname in waiting_custodial:
                info = waiting_custodial[out.datasetname]
                try:
                    try:
                        size = str(info['size'])
                    except:
                        size = "x"
                    destination = ",".join(info['nodes'].keys())
                    if not destination:
                        destination = '<font color=red>NO SITE</font>'
                    action = 'going'
                    if out.datasetname in all_pending_approval_custodial:
                        action = '<font color=red>pending</font>'

                    custodial = ' %s [GB] %s to %s on %s (<a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/requestlist?dataset=%s&node=T*MSS">%d missing</a>)' % (
                        size, action, destination,
                        time.asctime(time.gmtime(info['checked'])),
                        out.datasetname, info['nmissing'])
                except Exception as e:
                    #print info
                    #print str(e)
                    pass
            elif out.datasetname in all_locks:
                custodial = '<font color=green>LOCKED</font>'
            out_week = int(time.strftime("%W", time.gmtime(out.date)))
            ##only show current week, and the previous.
            if last_week == out_week:
                lines_lastweek.append(
                    "<li>on week %s : %s %s</li>" %
                    (time.strftime("%W (%x %X)", time.gmtime(
                        out.date)), ol(out.datasetname), custodial))
            if this_week == out_week:

                lines_thisweek.append(
                    "<li>on week %s : %s %s</li>" %
                    (time.strftime("%W (%x %X)", time.gmtime(
                        out.date)), ol(out.datasetname), custodial))
    lines_thisweek.sort()
    lines_lastweek.sort()

    html_doc.write(
        """Output produced (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a>
<a href="javascript:showhide('output')">[Click to show/hide]</a>
<br>
<div id="output" style="display:none;">
<br>
<ul>
<li> %d waiting to go to tape</li>
<ul>
<li> %d waiting for tape approval%s</li>
<li> %d are not completed after %d days%s</li>
<li> Full list (%d) <a href="javascript:showhide('waiting-custodial')">[Click to show/hide]</a>
<div id="waiting-custodial" style="display:none;">
<ul>
%s
</ul>
</div>
</li>
</ul>
<li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul>
%s
</ul></div>
<li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul>
%s
</ul></div></div>
""" % (len(lines_lastweek) + len(lines_thisweek),
       len(waiting_custodial), n_pending_approval, long_approve_string,
       len(lagging_custudial), UC.get('transfer_timeout'), stuck_string,
       len(waiting_custodial), waiting_custodial_string, len(lines_lastweek),
       '\n'.join(lines_lastweek), len(lines_thisweek),
       '\n'.join(lines_thisweek)))

    lap('done with output')

    html_doc.write("""Job installed
<a href="javascript:showhide('acron')">[Click to show/hide]</a>
<br>
<div id="acron" style="display:none;">
<br>
<pre>
%s
</pre>
""" % (os.popen('acrontab -l | grep Unified | grep -v \#').read()))

    per_module = defaultdict(list)
    for t in filter(
            None,
            os.popen('cat %s/logs/*/*.time' % monitor_dir).read().split('\n')):
        module_name, run_time, spend = t.split(':')
        ## then do what you want with it !
        if 'cleanor' in module_name: continue

        per_module[module_name].append(int(spend))

    def display_time(sec):
        """Format a duration in seconds as '<h> [h] <m> [m] <s> [s]'.

        Leading fields that are zero are omitted: hours are shown only when
        non-zero, minutes only when hours or minutes are non-zero.  The
        seconds field is always present, so a zero duration is rendered as
        '0 [s]' instead of an empty string (which left a blank in the
        "last ..., avg ..." line of the status page).

        sec -- duration in seconds; may be an int or a float (the average
               run time is passed as a float), fractions are truncated by
               the '%d' formatting.
        """
        minutes, seconds = divmod(sec, 60)
        hours, minutes = divmod(minutes, 60)

        parts = []
        if hours:
            parts.append("%d [h]" % hours)
        if hours or minutes:
            parts.append("%d [m]" % minutes)
        # always report seconds so that a null duration is still visible
        parts.append("%d [s]" % seconds)

        return " ".join(parts)

    html_doc.write("Module running time<ul>\n")
    for m, spends in per_module.items():
        avg = sum(spends) / float(len(spends))
        lasttime = spends[-1]
        html_doc.write("<li>%s : last %s, avg %s</li>\n" %
                       (m, display_time(lasttime), display_time(avg)))
    html_doc.write("</ul>")

    html_doc.write(
        "Last running <pre>%s</pre><br>" %
        (os.popen("tac %s/logs/running | head -5" % monitor_dir).read()))

    html_doc.write("Order in cycle <pre>%s</pre><br>" % ('\n'.join(
        map(
            lambda l: l.split('/')[-1].replace('.py', ''),
            filter(
                lambda l: not l.startswith('#') and 'Unified' in l and 'py' in
                l.split('/')[-1],
                open('%s/WmAgentScripts/cycle.sh' %
                     base_dir).read().split('\n'))))))

    html_doc.write("</div>\n")
    lap('done with jobs')

    text = ""
    count = 0
    for (c, info) in campaignInfo().campaigns.items():
        #if 'go' in info and info['go']:
        text += "<li>%s <br> <pre>%s</pre>  </li>" % (
            c, json.dumps(info, indent=2))
        count += 1

    html_doc.write("""Campaign configuration
<a href="javascript:showhide('campaign')">[Click to show/hide]</a>
<br>
<div id="campaign" style="display:none;">
<br>
<ul>
%s
</ul></div>
""" % (text))

    text = ""
    count = 0
    n_column = 4
    SI = siteInfo()
    date1 = time.strftime(
        '%Y-%m-%d+%H:%M',
        time.gmtime(time.mktime(time.gmtime()) -
                    (15 * 24 * 60 * 60)))  ## 15 days
    date2 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime())
    for t in SI.types():
        text += "<li>%s<table border=1>" % t
        c = 0
        for site in getattr(SI, t):
            cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else 'N/A'
            disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE(
                site) in SI.disk else 'N/A'
            if c == 0:
                text += "<tr>"
            if not disk:
                ht_disk = '<font color=red>Disk available: %s</font>' % disk
            else:
                ht_disk = 'Disk available: %s' % disk

            text += '<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a><br><a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#user=&refresh=0&table=Jobs&p=1&records=25&activemenu=1&site=%s&submissiontool=wmagent&check=submitted&sortby=activity&scale=linear&bars=20&data1=%s&date2=%s">dashb</a><br>CPU pledge: %s<br>%s</td>' % (
                site, site, site, site, site, date1, date2, cpu, ht_disk)
            if c == n_column:
                c = 0
            else:
                c += 1
        text += "</table></li>"

    text += "<li> Sites in auto-approved transfer<ul>"
    for site in sorted(SI.sites_auto_approve):
        text += "<li>%s" % site
    text += "</ul></li>"

    text += "<li> Sites with vetoe transfer<ul>"
    for site in sorted(SI.sites_veto_transfer):
        text += "<li>%s" % site
    text += "</ul></li>"

    text += "<li> Sites banned from production<ul>"
    for site in sorted(SI.sites_banned):
        text += "<li>%s" % site
    text += "</ul></li>"

    text += "<li> Approximate Free Tape<ul>"
    for mss in SI.storage:
        waiting = 0
        try:
            waiting = float(
                os.popen(
                    "grep '%s is pending . Created since' %s/logs/lockor/last.log  -B 3 | grep size | awk '{ sum+=$6 ; print sum }' | tail -1"
                    % (mss, monitor_dir)).readline())
        except Exception as e:
            print str(e)

        oldest = ""
        os.system(
            'grep pending %s/logs/lockor/last.log | sort -u > %s/logs/pending.log'
            % (monitor_dir, monitor_dir))
        try:
            oldest = os.popen(
                "grep '%s is pending . Created since ' %s/logs/lockor/last.log | sort | awk '{print $10, $11, $12, $13, $14 }' | head -1"
                % (mss, monitor_dir)).readline()
        except Exception as e:
            print str(e)
        waiting /= 1024.
        text += "<li>%s : %d [TB]. Waiting for approval %d [TB] since %s </li>" % (
            mss, SI.storage[mss], waiting, oldest)
    text += "</ul></li>"

    lap('done with sites')

    open('%s/siteInfo.json' % monitor_dir, 'w').write(
        json.dumps(dict([(t, getattr(SI, t)) for t in SI.types()]), indent=2))

    lap('done with sites json')

    chart_data = defaultdict(list)
    for site in SI.quota:
        chart_data[site].append("""
var data_%s = google.visualization.arrayToDataTable([ 
['Overall', 'Space in TB'],
//['Quota' , %s],
['Locked' , %s],
['Free' , %s]
]);
""" % (
            site,
            SI.quota[site],
            SI.locked[site],
            SI.disk[site],
        ))
        chart_data[site].append("""
var chart_%s = new google.visualization.PieChart(document.getElementById('donutchart_%s'));
chart_%s.draw(data_%s, {title: '%s %s [TB]', pieHole:0.4, slices:{0:{color:'red'},1:{color:'green'}}});
""" % (site, site, site, site, site, SI.quota[site]))
        chart_data[site].append("""
<div id="donutchart_%s" style="height: 200px;width: 300px"></div>
""" % (site))

    ## make the locked/available donut chart
    donut_html = open('%s/locked.html' % monitor_dir, 'w')
    tables = "\n".join([info[0] for site, info in chart_data.items()])
    draws = "\n".join([info[1] for site, info in chart_data.items()])
    divs = "\n".join([info[2] for site, info in chart_data.items()])

    divs_table = "<table border=0>"
    for c, site in enumerate(sorted(chart_data.keys())):
        if c % 5 == 0:
            divs_table += "<tr>"
        divs_table += "<td>%s</td>" % (chart_data[site][2])
    divs_table += "</table>"

    donut_html.write("""
<html>
  <head>
    <script type="text/javascript" src="https://www.google.com/jsapi"></script>
    <script type="text/javascript">
      google.load("visualization", "1", {packages:["corechart"]});
      google.setOnLoadCallback(drawChart);
      function drawChart() {
%s

%s
      }
    </script>
  </head>
  <body>
%s
  </body>
</html>
""" % (tables, draws, divs_table))
    donut_html.close()

    html_doc.write("""Site configuration
<a href="javascript:showhide('site')">[Click to show/hide]</a>
<br>
<div id="site" style="display:none;">
<br>
<ul>
%s
</ul></div>
""" % (text))

    lap('done with space')

    text = ""
    for param in UC.configs:
        text += "<li>%s</li><ul>\n" % param
        for sub in sorted(UC.configs[param].keys()):
            text += "<li> %s : %s </li>\n" % (sub, UC.configs[param][sub])
        text += '</ul>\n'

    html_doc.write("""Unified configuration
<a href="javascript:showhide('config')">[Click to show/hide]</a>
<br>
<div id="config" style="display:none;">
<br>
<ul>
%s
</ul></div>                                                                                                                                                                                                                                                                                                                
""" % (text))

    lap('done with configuration')

    print "... done with status page."
    html_doc.write("""
</body>
</html>
""")

    html_doc.close()
    ## and put the file in place
    os.system('mv %s/index.html.new %s/index.html' %
              (monitor_dir, monitor_dir))

    statuses = json.loads(open('%s/statusmon.json' % monitor_dir).read())
    s_count = defaultdict(int)
    now = time.mktime(time.gmtime())
    for wf in session.query(Workflow).all():
        s_count[wf.status] += 1
    statuses[now] = dict(s_count)
    ## remove old entries
    for t in statuses.keys():
        if (now - float(t)) > 7 * 24 * 60 * 60:
            statuses.pop(t)
    open('%s/statusmon.json' % monitor_dir,
         'w').write(json.dumps(statuses, indent=2))

    html_doc = open('%s/statuses.html' % monitor_dir, 'w')
    html_doc.write(
        """                                                                                                                                                                                                                                                                                                      <html>        
<table border=1>
<thead>
<tr>
<th> workflow </th><th> status </th><th> wm status</th>
</tr>
</thead>
""")
    wfs = {}
    for wfo in session.query(Workflow).all():
        ## pass all that is unlocked and considered it gone
        wfs[wfo.name] = (wfo.status, wfo.wm_status)

    open('%s/statuses.json' % monitor_dir, 'w').write(json.dumps(wfs))
    for wfn in sorted(wfs.keys()):
        ## pass all that is unlocked and considered it gone
        if 'unlock' in wfs[wfn][0]: continue
        html_doc.write(
            '<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n' %
            (wfn, wfn, wfs[wfn][0], wfs[wfn][1]))
    html_doc.write("</table>")
    html_doc.write("<br>" * 100)
    html_doc.write("end of page</html>")
    html_doc.close()
Exemplo n.º 26
0
def equalizor(url, specific=None, options=None):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    if not specific:
        workflows = getWorkflows(url, status='running-closed', details=True)
        workflows.extend(getWorkflows(url, status='running-open',
                                      details=True))

    ## start from scratch
    modifications = defaultdict(dict)
    ## define regionality site => fallback allowed. feed on an ssb metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()
    for site in SI.sites_ready:
        region = site.split('_')[1]
        if not region in ['US', 'DE', 'IT']: continue
        regions[region] = [region]

    def site_in_depletion(s):
        return True
        if s in SI.sites_pressure:
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                print s, m, r, "lacking pressure"
                return True
            else:
                print s, m, r, "pressure"
                pass

        return False

    for site in SI.sites_ready:
        region = site.split('_')[1]
        ## fallback to the region, to site with on-going low pressure
        mapping[site] = [
            fb for fb in SI.sites_ready
            if any([('_%s_' %
                     (reg) in fb and fb != site and site_in_depletion(fb))
                    for reg in regions[region]])
        ]

    use_T0 = ('T0_CH_CERN' in UC.get("site_for_overflow"))
    if options.t0: use_T0 = True
    #if options.augment : use_T0 = True

    use_HLT = ('T2_CH_CERN_HLT' in UC.get("site_for_overflow"))
    if options.hlt: use_HLT = True
    #if options.augment : use_HLT=True

    if use_HLT:
        mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')

    if use_T0:
        mapping['T2_CH_CERN'].append('T0_CH_CERN')
        #mapping['T1_FR_CCIN2P3'].append('T0_CH_CERN')

    #mapping['T2_IT_Legnaro'].append('T1_IT_CNAF')
    for reg in ['IT', 'DE', 'UK']:
        mapping['T2_CH_CERN'].extend(
            [fb for fb in SI.sites_ready if '_%s_' % reg in fb])

    ## make them appear as OK to use
    force_sites = []

    ## overflow CERN to underutilized T1s
    upcoming = json.loads(open('%s/GQ.json' % monitor_dir).read())
    for possible in SI.sites_T1s:
        if not possible in upcoming:
            mapping['T2_CH_CERN'].append(possible)

    ## remove add-hoc sites from overflow mapping
    prevent_sites = ['T2_US_Purdue']
    for prevent in prevent_sites:
        if prevent in mapping: mapping.pop(prevent)
    for src in mapping:
        for prevent in prevent_sites:
            if prevent in mapping[src]:
                mapping[src].remove(prevent)

    ## create the reverse mapping for the condor module
    for site, fallbacks in mapping.items():
        for fb in fallbacks:
            reversed_mapping[fb].append(site)

    ## this is the fallback mapping
    print "Direct mapping : site => overflow"
    print json.dumps(mapping, indent=2)
    print "Reverse mapping : dest <= from origin"
    print json.dumps(reversed_mapping, indent=2)

    altered_tasks = set()

    def running_idle(wfi, task_name):
        gmon = wfi.getGlideMon()
        #print gmon
        if not gmon: return (0, 0)
        if not task_name in gmon: return (0, 0)
        return (gmon[task_name]['Running'], gmon[task_name]['Idle'])

    def needs_action(wfi, task, min_idled=100, pressure=0.2):
        task_name = task.pathName.split('/')[-1]
        running, idled = running_idle(wfi, task_name)
        go = True
        if not idled and not running:
            go = False
        if idled < 100:
            go = False
        if (not running and idled) or (running and
                                       (idled / float(running) > pressure)):
            go = True
        else:
            go = False
        return go, task_name, running, idled

    def getPerf(task):
        task = task.split('/')[1] + '/' + task.split('/')[-1]
        try:
            u = 'http://cms-gwmsmon.cern.ch/prodview/json/history/memoryusage720/%s' % task
            print u
            perf_data = json.loads(os.popen('curl -s --retry 5 %s' % u).read())
        except Exception as e:
            print str(e)
            return (None, None)
        buckets = perf_data['aggregations']["2"]['buckets']
        s_m = sum(bucket['key'] * bucket['doc_count'] for bucket in buckets)
        w_m = sum(bucket['doc_count'] for bucket in buckets)
        m_m = max(bucket['key'] for bucket in buckets) if buckets else None

        b_m = None
        if w_m > 100:
            b_m = m_m

        try:
            perf_data = json.loads(
                os.popen(
                    'curl -s --retry 5 http://cms-gwmsmon.cern.ch/prodview/json/history/runtime720/%s'
                    % task).read())
        except Exception as e:
            print str(e)
            return (b_m, None)

        buckets = perf_data['aggregations']["2"]['buckets']
        s_t = sum(bucket['key'] * bucket['doc_count'] for bucket in buckets)
        w_t = sum(bucket['doc_count'] for bucket in buckets)
        m_t = max(bucket['key'] for bucket in buckets) if buckets else None

        b_t = None
        if w_t > 100:
            b_t = m_t

        return (b_m, b_t)

    def getcampaign(task):
        taskname = task.pathName.split('/')[-1]
        if hasattr(task, 'prepID'):
            return task.prepID.split('-')[1]
        elif taskname.count('-') >= 1:
            return taskname.split('-')[1]
        else:
            return None

    def close(interface):
        open('%s/equalizor.json.new' % monitor_dir,
             'w').write(json.dumps(interface, indent=2))
        os.system('mv %s/equalizor.json.new %s/equalizor.json' %
                  (monitor_dir, monitor_dir))
        os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json' %
                  (monitor_dir, monitor_dir, time.mktime(time.gmtime())))

    interface = {'reversed_mapping': reversed_mapping, 'modifications': {}}
    if options.augment or options.remove:
        interface['modifications'] = json.loads(
            open('%s/equalizor.json' % monitor_dir).read())['modifications']

    if options.remove:
        if specific in interface['modifications']:
            print "poping", specific
            interface['modifications'].pop(specific)
            close(interface)
        return

    PU_locations = {}
    PU_overflow = {}
    LHE_overflow = {}
    tune_performance = []

    pending_HLT = 0
    max_HLT = 60000
    pending_T0 = 0
    max_T0 = 60000
    try:
        gmon = json.loads(
            os.popen(
                'curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT'
            ).read())
        pending_HLT += gmon["Running"]
        pending_HLT += gmon["MatchingIdle"]
    except:
        pass

    stay_within_site_whitelist = False
    specific_task = None
    if specific and ":" in specific:
        specific, specific_task = specific.split(':')

    if specific:
        wfs = session.query(Workflow).filter(
            Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'away').all()

    performance = {}
    no_routing = []
    random.shuffle(wfs)
    for wfo in wfs:
        if wfo.name in no_routing and not options.augment:
            continue

        if specific and not specific in wfo.name:
            continue
        if specific:
            wfi = workflowInfo(url, wfo.name)
        else:
            cached = filter(lambda d: d['RequestName'] == wfo.name, workflows)
            if not cached: continue
            wfi = workflowInfo(url, wfo.name, request=cached[0])

        ## only running should get re-routed
        if not wfi.request['RequestStatus'] in [
                'running-open', 'running-closed'
        ] and not specific:
            continue

        tasks_and_campaigns = []
        for task in wfi.getWorkTasks():
            tasks_and_campaigns.append((task, getcampaign(task)))

        _, _, _, sec = wfi.getIO()

        ## check needs override
        needs_overide = False
        if not needs_overide and options.augment: needs_overide = True

        def overide_from_agent(wfi, needs_overide):
            bad_agents = []  #'http://cmssrv219.fnal.gov:5984']
            if not bad_agents: return needs_overide
            if needs_overide: return True
            agents = wfi.getAgents()

            wqss = ['Running', 'Acquired']
            if any([
                    agent in agents.get(wqs, {}).keys()
                    for wqs, agent in itertools.product(wqss, bad_agents)
            ]):
                print "overriding the need for bad agent"
                needs_overide = True
            return needs_overide

        ## now parse this for action
        for i_task, (task, campaign) in enumerate(tasks_and_campaigns):
            if options.augment:
                print task.pathName
                print campaign

            tune = CI.get(campaign, 'tune', options.tune)
            if tune and not campaign in tune_performance:
                tune_performance.append(campaign)

            overflow = CI.get(campaign, 'overflow', {})
            if overflow:
                if "PU" in overflow and not campaign in PU_overflow:
                    PU_overflow[campaign] = copy.deepcopy(overflow['PU'])
                    print "adding", campaign, "to PU overflow rules"
                if "LHE" in overflow and not campaign in LHE_overflow:
                    print "adding", campaign, "to light input overflow rules"
                    site_list = overflow['LHE']['site_list']
                    LHE_overflow[campaign] = copy.deepcopy(
                        getattr(SI, site_list))

            ### get the task performance, for further massaging.
            if campaign in tune_performance or options.tune:
                print "performance", task.taskType, task.pathName
                if task.taskType in ['Processing', 'Production']:
                    set_memory, set_time = getPerf(task.pathName)
                    #print "Performance %s GB %s min"%( set_memory,set_time)
                    wfi.sendLog(
                        'equalizor', 'Performance tuning to %s GB %s min' %
                        (set_memory, set_time))
                    ## get values from gmwsmon
                    # massage the values : 95% percentile
                    performance[task.pathName] = {}
                    if set_memory:
                        performance[task.pathName]['memory'] = set_memory
                    if set_time and False:
                        performance[task.pathName]['time'] = set_time

            ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step
            if campaign in LHE_overflow:
                if task.taskType in ['Processing']:
                    needs, task_name, running, idled = needs_action(wfi, task)
                    needs_overide = overide_from_agent(wfi, needs_overide)
                    extend_to = list(set(copy.deepcopy(
                        LHE_overflow[campaign])))
                    if stay_within_site_whitelist:
                        extend_to = list(
                            set(extend_to) & set(wfi.request['SiteWhitelist'])
                        )  ## restrict to stupid-site-whitelist
                    extend_to = list(
                        set(extend_to) & set(SI.sites_ready + force_sites))

                    if extend_to and needs or needs_overide:

                        modifications[wfo.name][task.pathName] = {
                            "ReplaceSiteWhitelist": extend_to,
                            "Running": running,
                            "Pending": idled,
                            "Priority": wfi.request['RequestPriority']
                        }
                        wfi.sendLog(
                            'equalizor',
                            '%s of %s is running %d and pending %d, taking action : ReplaceSiteWhitelist \n %s'
                            %
                            (task_name, wfo.name, running, idled,
                             json.dumps(
                                 sorted(modifications[wfo.name][task.pathName]
                                        ['ReplaceSiteWhitelist']))))

                        altered_tasks.add(task.pathName)
                    else:
                        wfi.sendLog(
                            'equalizor',
                            '%s of %s is running %d and pending %d' %
                            (task_name, wfo.name, running, idled))

            ### overflow the 76 digi-reco to the site holding the pileup
            if campaign in PU_overflow:
                force = PU_overflow[campaign][
                    'force'] if 'force' in PU_overflow[campaign] else False
                secondary_locations = set(SI.sites_ready + force_sites)
                for s in sec:
                    if not s in PU_locations:
                        presence = getDatasetPresence(url, s)
                        #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
                        one_secondary_locations = [
                            site for (site, (there, frac)) in presence.items()
                            if frac > 98.
                        ]
                        PU_locations[s] = one_secondary_locations
                    print "secondary is at", sorted(PU_locations[s])
                    secondary_locations = set(
                        [SI.SE_to_CE(site)
                         for site in PU_locations[s]]) & secondary_locations

                ## we should add all sites that hold the secondary input if any
                ### given that we have the secondary location available, it is not necessary to use the add-hoc list
                ##secondary_locations = list(set(PU_overflow[campaign]['sites']) & set( SI.sites_ready ))

                if any([
                        task.pathName.endswith(finish)
                        for finish in ['_0', 'StepOneProc', 'Production']
                ]):
                    needs, task_name, running, idled = needs_action(wfi, task)
                    ## removing the ones in the site whitelist already since they encode the primary input location
                    if stay_within_site_whitelist:
                        original_site_in_use = set(
                            wfi.request['SiteWhitelist'])
                    else:
                        original_site_in_use = set(secondary_locations)
                    ## remove the sites that have already running jobs
                    gmon = wfi.getGlideMon()
                    if gmon and task_name in gmon and 'Sites' in gmon[
                            task_name]:
                        site_in_use = set(gmon[task_name]['Sites'])
                        print "removing", sorted(site_in_use)
                        ## that determines where you want to run in addition
                        augment_by = list((set(secondary_locations) -
                                           site_in_use) & original_site_in_use)
                    else:
                        print "no existing running site"
                        augment_by = list(original_site_in_use)

                    needs_overide = overide_from_agent(wfi, needs_overide)
                    if augment_by and (
                            needs or needs_overide
                            or force) and PU_overflow[campaign][
                                'pending'] < PU_overflow[campaign]['max']:
                        PU_overflow[campaign]['pending'] += idled
                        print "raising overflow to", PU_overflow[campaign][
                            'pending'], "for", PU_overflow[campaign]['max']
                        ## the step with an input ought to be the digi part : make this one go anywhere
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": augment_by,
                            "Running": running,
                            "Pending": idled,
                            "Priority": wfi.request['RequestPriority']
                        }
                        altered_tasks.add(task.pathName)
                        wfi.sendLog(
                            'equalizor',
                            '%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s'
                            % (task_name, wfo.name, running, idled,
                               json.dumps(sorted(augment_by), indent=2)))
                    else:
                        print task_name, "of", wfo.name, "running", running, "and pending", idled

            ### overflow the skims back to multi-core
            if campaign in ['Run2015D', 'Run2015C_25ns'
                            ] and task.taskType == 'Skim':
                original_swl = wfi.request['SiteWhitelist']
                needs, task_name, running, idled = needs_action(wfi, task)
                if (needs or needs_overide):
                    modifications[wfo.name][task.pathName] = {
                        'AddWhitelist': original_swl,
                        "Running": running,
                        "Pending": idled,
                        "Priority": wfi.request['RequestPriority']
                    }
                    altered_tasks.add(task.pathName)
                    wfi.sendLog(
                        'equalizor',
                        '%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s'
                        % (task_name, wfo.name, running, idled,
                           json.dumps(sorted(original_swl), indent=2)))

            if options.augment:
                print sorted(wfi.request['SiteWhitelist']), i_task, use_HLT

            ### add the HLT at partner of CERN
            if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task in [
                    0, 1
            ] and use_HLT:
                needs, task_name, running, idled = needs_action(wfi, task)
                if options.augment: needs = True
                needs = True
                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide) and pending_HLT < max_HLT:
                    pending_HLT += idled
                    if task.pathName in modifications[
                            wfo.name] and 'AddWhitelist' in modifications[
                                wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName][
                            "AddWhitelist"].append("T2_CH_CERN_HLT")
                        print "\t", wfo.name, "adding addHLT up to", pending_HLT, "for", max_HLT
                        print task.pathName
                    ## this Replace does not work at all for HLT
                    #elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                    #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" )
                    #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT
                    else:
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": ["T2_CH_CERN_HLT"],
                            "Priority": wfi.request['RequestPriority'],
                            "Running": running,
                            "Pending": idled
                        }
                        wfi.sendLog(
                            'equalizor',
                            'adding the HLT in whitelist of %s to %d for %d' %
                            (task.pathName, pending_HLT, max_HLT))

            if i_task == 0 and not sec and use_T0:
                needs, task_name, running, idled = needs_action(wfi, task)

                if options.augment: needs = True
                #needs = True
                good_type = wfi.request['RequestType'] in [
                    'MonteCarlo', 'MonteCarloFromGEN'
                ]
                read_lhe = ((not 'LheInputFiles' in wfi.request)
                            or bool(wfi.request['LheInputFiles']))
                good_type &= not read_lhe
                if not good_type and not options.augment: needs = False

                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide):
                    pending_T0 += idled
                    if task.pathName in modifications[
                            wfo.name] and 'AddWhitelist' in modifications[
                                wfo.name][task.pathName]:
                        if not "T0_CH_CERN" in modifications[wfo.name][
                                task.pathName]["AddWhitelist"]:
                            modifications[wfo.name][task.pathName][
                                "AddWhitelist"].append("T0_CH_CERN")
                            wfi, sendLog(
                                'equalizor',
                                'adding the T0 for %s to %d for %d' %
                                (task.pathName, pending_T0, max_T0))
                    elif task.pathName in modifications[
                            wfo.
                            name] and 'ReplaceSiteWhitelist' in modifications[
                                wfo.name][task.pathName]:
                        if not "T0_CH_CERN" in modifications[wfo.name][
                                task.pathName]["ReplaceSiteWhitelist"]:
                            modifications[wfo.name][task.pathName][
                                "ReplaceSiteWhitelist"].append("T0_CH_CERN")
                            wfi, sendLog(
                                'equalizor',
                                'adding the T0 to replacement for %s to %d for %d'
                                % (task.pathName, pending_T0, max_T0))
                    else:
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": ["T0_CH_CERN"],
                            "Priority": wfi.request['RequestPriority'],
                            "Running": running,
                            "Pending": idled
                        }
                        wfi, sendLog(
                            'equalizor', 'adding the T0 for %s to %d for %d' %
                            (task.pathName, pending_T0, max_T0))

    interface['modifications'].update(modifications)

    ###  manage the number of core and job resizing
    interface['cores'] = {
        'T2_CH_CERN_HLT': {
            'min': 4,
            'max': 16
        },
        'default': {
            'min': 1,
            'max': 4
        }
    }
    interface['resizes'] = ['RunIISpring16DR80']

    ### manage the modification of the memory and target time
    interface['time'] = defaultdict(list)
    interface['memory'] = defaultdict(list)

    max_N_mem = 10
    max_N_time = 10
    ## discretize the memory to 10 at most values
    mems = set([o['memory'] for t, o in performance.items() if 'memory' in o])
    times = set([o['time'] for t, o in performance.items() if 'time' in o])
    if len(mems) > max_N_mem:
        mem_step = int((max(mems) - min(mems)) / float(max_N_mem))
        for t in performance:
            if not 'memory' in performance[t]: continue
            (m, r) = divmod(performance[t]['memory'], mem_step)
            performance[t]['memory'] = (m + 1) * mem_step
    if len(times) > max_N_time:
        time_step = int((max(times) - min(times)) / float(max_N_time))
        for t in performance:
            if not 'time' in performance[t]: continue
            (m, r) = divmod(performance[t]['time'], time_step)
            performance[t]['time'] = (m + 1) * time_step

    for t, o in performance.items():
        if 'time' in o:
            interface['time'][str(o['time'])].append(t)
        if 'memory' in o:
            interface['memory'][str(o['memory'])].append(t)

    ## close and save
    close(interface)
Exemplo n.º 27
0
def equalizor(url , specific = None, options=None):
    up = componentInfo(mcm=False, soft=['mcm']) 
    if not up.check(): return 

    if not specific:
        workflows = getWorkflows(url, status='running-closed', details=True)
        workflows.extend(getWorkflows(url, status='running-open', details=True))

    ## start from scratch
    modifications = defaultdict(dict)
    ## define regionality site => fallback allowed. feed on an ssb metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()
    for site in SI.sites_ready:
        region = site.split('_')[1]
        if not region in ['US','DE','IT']: continue
        regions[region] = [region] 

    def site_in_depletion(s):
        return True
        if s in SI.sites_pressure:
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                print s,m,r,"lacking pressure"
                return True
            else:
                print s,m,r,"pressure"
                pass
                
        return False

    for site in SI.sites_ready:
        region = site.split('_')[1]
        ## fallback to the region, to site with on-going low pressure
        mapping[site] = [fb for fb in SI.sites_ready if any([('_%s_'%(reg) in fb and fb!=site and site_in_depletion(fb))for reg in regions[region]]) ]
    

    use_T0 = ('T0_CH_CERN' in UC.get("site_for_overflow"))
    if options.t0: use_T0 = True
    #if options.augment : use_T0 = True

    use_HLT = ('T2_CH_CERN_HLT' in UC.get("site_for_overflow"))
    if options.hlt: use_HLT = True
    #if options.augment : use_HLT=True

    if use_HLT:
        mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')

    if use_T0:
        mapping['T2_CH_CERN'].append('T0_CH_CERN')
        #mapping['T1_FR_CCIN2P3'].append('T0_CH_CERN')

    #mapping['T2_IT_Legnaro'].append('T1_IT_CNAF')
    for reg in ['IT','DE','UK']:
        mapping['T2_CH_CERN'].extend([fb for fb in SI.sites_ready if '_%s_'%reg in fb])


    ## make them appear as OK to use
    force_sites = []

    ## overflow CERN to underutilized T1s
    upcoming = json.loads( open('%s/GQ.json'%monitor_dir).read())
    for possible in SI.sites_T1s:
        if not possible in upcoming:
            mapping['T2_CH_CERN'].append(possible)

    ## remove add-hoc sites from overflow mapping
    prevent_sites = ['T2_US_Purdue']
    for prevent in prevent_sites:
        if prevent in mapping: mapping.pop( prevent )
    for src in mapping:
        for prevent in prevent_sites:
            if prevent in mapping[src]:
                mapping[src].remove( prevent )

    ## create the reverse mapping for the condor module
    for site,fallbacks in mapping.items():
        for fb in fallbacks:
            reversed_mapping[fb].append(site)

    ## this is the fallback mapping
    print "Direct mapping : site => overflow"
    print json.dumps( mapping, indent=2)
    print "Reverse mapping : dest <= from origin"
    print json.dumps( reversed_mapping, indent=2)

    altered_tasks = set()

    def running_idle( wfi , task_name):
        gmon = wfi.getGlideMon()
        #print gmon
        if not gmon: return (0,0)
        if not task_name in gmon: return (0,0)
        return (gmon[task_name]['Running'], gmon[task_name]['Idle'])

    def needs_action( wfi, task, min_idled = 100, pressure = 0.2):
        task_name = task.pathName.split('/')[-1]
        running, idled = running_idle( wfi, task_name)
        go = True
        if not idled and not running : 
            go = False
        if idled < 100: 
            go = False
        if (not running and idled) or (running and (idled / float(running) > pressure)):
            go = True
        else:
            go = False
        return go, task_name, running, idled

    def getPerf( task ):
        task = task.split('/')[1]+'/'+task.split('/')[-1]
        try:
            u = 'http://cms-gwmsmon.cern.ch/prodview/json/history/memoryusage720/%s'%task
            print u
            perf_data = json.loads(os.popen('curl -s --retry 5 %s'%u).read())
        except Exception as e:
            print str(e)
            return (None,None)
        buckets = perf_data['aggregations']["2"]['buckets']
        s_m = sum( bucket['key']*bucket['doc_count'] for bucket in buckets)
        w_m = sum( bucket['doc_count'] for bucket in buckets)
        m_m = max( bucket['key'] for bucket in buckets) if buckets else None
        
        b_m = None
        if w_m > 100:
            b_m = m_m

        try:
            perf_data = json.loads(os.popen('curl -s --retry 5 http://cms-gwmsmon.cern.ch/prodview/json/history/runtime720/%s'%task).read())
        except Exception as e:
            print str(e)
            return (b_m,None)

        buckets = perf_data['aggregations']["2"]['buckets']
        s_t = sum( bucket['key']*bucket['doc_count'] for bucket in buckets)
        w_t = sum( bucket['doc_count'] for bucket in buckets)
        m_t = max( bucket['key'] for bucket in buckets) if buckets else None
        
        b_t = None
        if w_t > 100:
            b_t = m_t

        return (b_m,b_t)
        
    def getcampaign( task ):
        taskname = task.pathName.split('/')[-1]
        if hasattr( task, 'prepID'):
            return task.prepID.split('-')[1]
        elif taskname.count('-')>=1:
            return taskname.split('-')[1]
        else:
            return None

    def close( interface ):
        open('%s/equalizor.json.new'%monitor_dir,'w').write( json.dumps( interface, indent=2))
        os.system('mv %s/equalizor.json.new %s/equalizor.json'%(monitor_dir,monitor_dir))
        os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json'%(monitor_dir,monitor_dir,time.mktime(time.gmtime())))

    interface = {
        'reversed_mapping' : reversed_mapping,
        'modifications' : {}
        }
    if options.augment or options.remove:
        interface['modifications'] = json.loads( open('%s/equalizor.json'%monitor_dir).read())['modifications']

    if options.remove:
        if specific in interface['modifications']:
            print "poping",specific
            interface['modifications'].pop(specific)
            close( interface )
        return 


    PU_locations = {}
    PU_overflow = {}
    LHE_overflow = {}
    tune_performance = []

    pending_HLT = 0
    max_HLT = 60000
    pending_T0 = 0
    max_T0 = 60000
    try:
        gmon = json.loads(os.popen('curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT').read())
        pending_HLT += gmon["Running"]
        pending_HLT += gmon["MatchingIdle"]
    except:
        pass


    stay_within_site_whitelist = False
    specific_task=None
    if specific and ":" in specific:
        specific,specific_task = specific.split(':')

    if specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'away').all()
        
    performance = {}
    no_routing = [
        ]
    random.shuffle( wfs )
    for wfo in wfs:
        if wfo.name in no_routing and not options.augment:
            continue

        if specific and not specific in wfo.name: 
            continue
        if specific:
            wfi = workflowInfo(url, wfo.name)
        else:
            cached = filter(lambda d : d['RequestName']==wfo.name, workflows)
            if not cached : continue
            wfi = workflowInfo(url, wfo.name, request = cached[0])
        
        ## only running should get re-routed
        if not wfi.request['RequestStatus'] in ['running-open','running-closed'] and not specific: continue

        tasks_and_campaigns = []
        for task in wfi.getWorkTasks():
            tasks_and_campaigns.append( (task, getcampaign(task) ) )
        
        _,_,_,sec = wfi.getIO()

        ## check needs override
        needs_overide = False
        if not needs_overide and  options.augment: needs_overide=True

        def overide_from_agent( wfi, needs_overide):
            bad_agents = []#'http://cmssrv219.fnal.gov:5984']
            if not bad_agents: return needs_overide
            if needs_overide: return True
            agents = wfi.getAgents()

            wqss = ['Running','Acquired']
            if any([agent in agents.get(wqs,{}).keys() for wqs,agent in itertools.product( wqss, bad_agents)]):
                print "overriding the need for bad agent"
                needs_overide = True
            return needs_overide

        ## now parse this for action
        for i_task,(task,campaign) in enumerate(tasks_and_campaigns):
            if options.augment:
                print task.pathName
                print campaign
    
            
            tune = CI.get(campaign,'tune',options.tune)
            if tune and not campaign in tune_performance:
                tune_performance.append( campaign )

            overflow = CI.get(campaign,'overflow',{})
            if overflow:
                if "PU" in overflow and not campaign in PU_overflow:
                    PU_overflow[campaign] = copy.deepcopy(overflow['PU'])
                    print "adding",campaign,"to PU overflow rules"
                if "LHE" in overflow and not campaign in LHE_overflow:
                    print "adding",campaign,"to light input overflow rules"
                    site_list = overflow['LHE']['site_list']
                    LHE_overflow[campaign] = copy.deepcopy( getattr(SI,site_list) )
                    

            ### get the task performance, for further massaging.
            if campaign in tune_performance or options.tune:
                print "performance",task.taskType,task.pathName
                if task.taskType in ['Processing','Production']:
                    set_memory,set_time = getPerf( task.pathName )
                    #print "Performance %s GB %s min"%( set_memory,set_time)
                    wfi.sendLog('equalizor','Performance tuning to %s GB %s min'%( set_memory,set_time))
                    ## get values from gmwsmon
                    # massage the values : 95% percentile
                    performance[task.pathName] = {}
                    if set_memory:
                        performance[task.pathName]['memory']=set_memory
                    if set_time and False:
                        performance[task.pathName]['time'] = set_time
            
            ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step
            if campaign in LHE_overflow:
                if task.taskType in ['Processing']:
                    needs, task_name, running, idled = needs_action(wfi, task)
                    needs_overide = overide_from_agent( wfi, needs_overide)
                    extend_to = list(set(copy.deepcopy( LHE_overflow[campaign] )))
                    if stay_within_site_whitelist:
                        extend_to = list(set(extend_to) & set(wfi.request['SiteWhitelist'])) ## restrict to stupid-site-whitelist
                    extend_to = list(set(extend_to) & set(SI.sites_ready + force_sites))

                    if extend_to and needs or needs_overide:

                        modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : extend_to ,"Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : ReplaceSiteWhitelist \n %s'%( task_name,
                                                                                                                                      wfo.name,
                                                                                                                                      running,
                                                                                                                                      idled ,
                                                                                                                                      json.dumps( sorted(modifications[wfo.name][task.pathName]['ReplaceSiteWhitelist']))))

                        altered_tasks.add( task.pathName )
                    else:
                        wfi.sendLog('equalizor','%s of %s is running %d and pending %d'%( task_name, wfo.name, running, idled))
                        


            ### overflow the 76 digi-reco to the site holding the pileup
            if campaign in PU_overflow:
                force = PU_overflow[campaign]['force'] if 'force' in PU_overflow[campaign] else False
                secondary_locations = set(SI.sites_ready + force_sites)
                for s in sec:
                    if not s in PU_locations:
                        presence = getDatasetPresence( url, s)
                        #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
                        one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
                        PU_locations[s] = one_secondary_locations
                    print "secondary is at",sorted(PU_locations[s])
                    secondary_locations = set([SI.SE_to_CE(site) for site in PU_locations[s]]) & secondary_locations
                    
                ## we should add all sites that hold the secondary input if any
                ### given that we have the secondary location available, it is not necessary to use the add-hoc list
                ##secondary_locations = list(set(PU_overflow[campaign]['sites']) & set( SI.sites_ready ))

                if any([task.pathName.endswith(finish) for finish in ['_0','StepOneProc','Production']]) :
                    needs, task_name, running, idled = needs_action(wfi, task)
                    ## removing the ones in the site whitelist already since they encode the primary input location
                    if stay_within_site_whitelist:
                        original_site_in_use = set(wfi.request['SiteWhitelist'])
                    else:
                        original_site_in_use = set(secondary_locations)
                    ## remove the sites that have already running jobs
                    gmon = wfi.getGlideMon()
                    if gmon and task_name in gmon and 'Sites' in gmon[task_name]:
                        site_in_use = set(gmon[task_name]['Sites'])
                        print "removing",sorted(site_in_use)
                        ## that determines where you want to run in addition
                        augment_by = list((set(secondary_locations)- site_in_use) & original_site_in_use)
                    else:
                        print "no existing running site"
                        augment_by = list(original_site_in_use)

                    needs_overide = overide_from_agent( wfi, needs_overide)
                    if augment_by and (needs or needs_overide or force) and PU_overflow[campaign]['pending'] < PU_overflow[campaign]['max']:
                        PU_overflow[campaign]['pending'] += idled
                        print "raising overflow to",PU_overflow[campaign]['pending'],"for",PU_overflow[campaign]['max']
                        ## the step with an input ought to be the digi part : make this one go anywhere
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : augment_by , "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        altered_tasks.add( task.pathName )
                        wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s'%( task_name, wfo.name,
                                                                                                                              running, idled,
                                                                                                                              json.dumps( sorted(augment_by), indent=2 )))
                    else:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled

            ### overflow the skims back to multi-core 
            if campaign in ['Run2015D','Run2015C_25ns'] and task.taskType =='Skim':
                original_swl = wfi.request['SiteWhitelist']
                needs, task_name, running, idled = needs_action(wfi, task)
                if (needs or needs_overide):
                    modifications[wfo.name][task.pathName] = { 'AddWhitelist' : original_swl, 
                                                               "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                    altered_tasks.add( task.pathName )
                    wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s'%( task_name, wfo.name,
                                                                                                                              running, idled,
                                                                                                                          json.dumps( sorted(original_swl), indent=2 )))


            if options.augment:
                print sorted(wfi.request['SiteWhitelist']),i_task,use_HLT

            ### add the HLT at partner of CERN
            if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task in [0,1] and use_HLT:
                needs, task_name, running, idled = needs_action(wfi, task)
                if options.augment: needs=True
                needs = True
                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide) and pending_HLT < max_HLT:
                    pending_HLT += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T2_CH_CERN_HLT" )
                        print "\t",wfo.name,"adding addHLT up to",pending_HLT,"for",max_HLT
                        print task.pathName
                    ## this Replace does not work at all for HLT
                    #elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" )
                        #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T2_CH_CERN_HLT"],
                                                                   "Priority" : wfi.request['RequestPriority'],
                                                                   "Running" : running,
                                                                   "Pending" : idled}
                        wfi.sendLog('equalizor','adding the HLT in whitelist of %s to %d for %d'%( task.pathName, pending_HLT, max_HLT))

            if i_task==0 and not sec and use_T0:
                needs, task_name, running, idled = needs_action(wfi, task)
                
                if options.augment: needs=True
                #needs = True
                good_type = wfi.request['RequestType'] in ['MonteCarlo','MonteCarloFromGEN'] 
                read_lhe = ((not 'LheInputFiles' in wfi.request) or bool(wfi.request['LheInputFiles']))
                good_type &= not read_lhe
                if not good_type and not options.augment: needs = False
                
                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide):
                    pending_T0 += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        if not "T0_CH_CERN" in modifications[wfo.name][task.pathName]["AddWhitelist"]:
                            modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T0_CH_CERN" )
                            wfi,sendLog('equalizor','adding the T0 for %s to %d for %d'%( task.pathName, pending_T0, max_T0))
                    elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        if not "T0_CH_CERN" in modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"]:
                            modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T0_CH_CERN" )
                            wfi,sendLog('equalizor','adding the T0 to replacement for %s to %d for %d'%( task.pathName, pending_T0, max_T0))
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T0_CH_CERN"],
                                                                   "Priority" : wfi.request['RequestPriority'],
                                                                   "Running" : running,
                                                                   "Pending" : idled}
                        wfi,sendLog('equalizor','adding the T0 for %s to %d for %d'%( task.pathName, pending_T0, max_T0))


    interface['modifications'].update( modifications )



    ###  manage the number of core and job resizing
    interface['cores']={'T2_CH_CERN_HLT': {'min':4,'max':16}, 'default': {'min':1, 'max':4}}
    interface['resizes'] = ['RunIISpring16DR80']

    ### manage the modification of the memory and target time
    interface['time'] = defaultdict(list)
    interface['memory'] = defaultdict(list)

    max_N_mem = 10
    max_N_time = 10
    ## discretize the memory to 10 at most values
    mems = set([o['memory'] for t,o in performance.items() if 'memory' in o])
    times = set([o['time'] for t,o in performance.items() if 'time' in o])
    if len(mems)>max_N_mem:
        mem_step = int((max(mems) - min(mems))/ float(max_N_mem))
        for t in performance:
            if not 'memory' in performance[t]: continue
            (m,r) = divmod(performance[t]['memory'], mem_step)
            performance[t]['memory'] = (m+1)*mem_step
    if len(times)>max_N_time:
        time_step = int((max(times) - min(times))/float(max_N_time))
        for t in performance:
            if not 'time' in performance[t]: continue
            (m,r) = divmod(performance[t]['time'], time_step)
            performance[t]['time'] = (m+1)*time_step

    for t,o in performance.items():
        if 'time' in o:
            interface['time'][str(o['time'])] .append( t )
        if 'memory' in o:
            interface['memory'][str(o['memory'])].append( t )

    ## close and save
    close( interface )
Exemplo n.º 28
0
def injector(url, options, specific):
    """Register newly seen workflows and hand "trouble" workflows over to a replacement.

    The function works in three passes:

    1. Every name returned by ``getWorkflows`` for ``options.wmstatus`` /
       ``options.user`` that has no ``Workflow`` row yet is inserted with
       status ``options.setstatus``.
    2. When the "mcm" component is reported up, ``invalidator(url)`` is run.
    3. For each ``Workflow`` in status "trouble" (optionally restricted to
       the one named ``specific``), the workflows sharing its ``PrepID`` are
       screened for replacements.  If none is found, an e-mail is sent once
       per workflow (names already reported are kept in
       ``no_replacement.json`` in the current directory).  Otherwise the
       troubled workflow is set to "forget", the replacement is registered
       if needed, and every ``Transfer`` referencing the old workflow id is
       re-pointed to the replacement.

    Args:
        url: request-manager location, passed through to the helper calls.
        options: option holder; the attributes read here are ``wmstatus``,
            ``user``, ``setstatus``, ``invalidate`` and ``replace``.
        specific: if truthy, only the troubled workflow with exactly this
            name is handled in the replacement pass.

    Returns:
        None.  Returns early, doing nothing, if the component check fails.
    """
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=["mcm"])
    if not up.check():
        return
    # "mcm" is declared soft above, so the check can pass while mcm is down;
    # remember its actual state for the invalidation pass below.
    use_mcm = up.status["mcm"]

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    # A set gives constant-time membership tests; it is also kept up to date
    # below so that a name repeated in `workflows` is inserted only once.
    existing = set(known_wf.name for known_wf in session.query(Workflow).all())
    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if wf in existing:
            continue
        print("putting %s" % wf)
        new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus)
        session.add(new_wf)
        session.commit()
        existing.add(wf)
        time.sleep(1)

    ## passing a round of invalidation of what needs to be invalidated
    # NOTE: the `or True` makes this run whenever mcm is up, whatever the
    # value of options.invalidate; kept as in the original.
    if use_mcm and (options.invalidate or True):
        invalidator(url)

    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == "trouble").all():
        if specific and wf.name != specific:
            continue
        print(wf.name)
        wl = getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl["PrepID"])
        true_familly = []
        for member in familly:
            if member == wf.name:
                continue
            if options.replace:
                # An explicit replacement was requested: skip everything
                # else before paying for the workload fetch.
                if member != options.replace:
                    continue
                fwl = getWorkLoad(url, member)
            else:
                fwl = getWorkLoad(url, member)
                # only later, non-resubmission requests with a real status
                # qualify as a replacement
                if fwl["RequestDate"] < wl["RequestDate"]:
                    continue
                if fwl["RequestType"] == "Resubmission":
                    continue
                if fwl["RequestStatus"] in ["None", None]:
                    continue
            true_familly.append(fwl)

        if len(true_familly) == 0:
            print("%s ERROR has no replacement" % wf.name)
            known = []
            try:
                with open("no_replacement.json") as known_file:
                    known = json.loads(known_file.read())
            except (IOError, ValueError):
                # missing or unparsable bookkeeping file: start from scratch
                pass
            if wf.name not in known:
                # notify only once per dangling workflow
                sendEmail(
                    "workflow in %s with no replacement" % (wl["RequestStatus"]), "%s is dangling there" % (wf.name)
                )
                known.append(wf.name)
                with open("no_replacement.json", "w") as known_file:
                    known_file.write(json.dumps(known, indent=2))
            continue
        print("%s has %s familly members" % (wf.name, len(familly)))
        print("%s has %s true familly members" % (wf.name, len(true_familly)))

        for fwl in true_familly:
            member = fwl["RequestName"]
            new_wf = session.query(Workflow).filter(Workflow.name == member).first()
            if not new_wf:
                print("putting %s as replacement of %s" % (member, wf.name))
                status = "away"
                if fwl["RequestStatus"] in ["assignment-approved"]:
                    status = "considered"
                new_wf = Workflow(name=member, status=status, wm_status=fwl["RequestStatus"])
                wf.status = "forget"
                session.add(new_wf)
            else:
                if new_wf.status == "forget":
                    continue
                print("getting %s as replacement of %s" % (new_wf.name, wf.name))
                wf.status = "forget"

            # re-point every transfer of the troubled workflow to its replacement
            for tr in session.query(Transfer).all():
                if wf.id in tr.workflows_id:
                    # work on a copy and re-assign the attribute rather than
                    # mutating the stored list in place
                    sw = copy.deepcopy(tr.workflows_id)
                    sw.remove(wf.id)
                    sw.append(new_wf.id)
                    tr.workflows_id = sw
                    print("%s got %s" % (tr.phedexid, new_wf.name))
                    if new_wf.status != "away":
                        print("\t setting it considered")
                        new_wf.status = "considered"
                    session.commit()

        ## don't do that automatically
        # wf.status = 'forget'
        session.commit()
Exemplo n.º 29
0
def closor(url, specific=None, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()

    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

    jump_the_line = options.announce if options else False
    if jump_the_line:
        print "announce option is on. Checking on things on-going ready to be announced"
        wfs = session.query(Workflow).filter(
            Workflow.status.contains('announce')).filter(
                sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        print "regular option. Checking on things done and to be announced"
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    wfs_n = [w.name for w in wfs]
    print "unique names?"
    print len(set(wfs_n)) == len(wfs_n)

    held = set()

    print len(wfs), "closing"
    random.shuffle(wfs)
    max_per_round = UC.get('max_per_round').get('closor', None)
    if options.limit: max_per_round = options.limit

    if max_per_round:
        ## order them by priority
        all_closedout = sorted(getWorkflows(url, 'closed-out', details=True),
                               key=lambda r: r['RequestPriority'])
        all_closedout = [r['RequestName'] for r in all_closedout]

        def rank(wfn):
            return all_closedout.index(wfn) if wfn in all_closedout else 0

        wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True)
        wfs = wfs[:max_per_round]

    batch_go = {}
    batch_warnings = defaultdict(set)
    batch_goodness = UC.get("batch_goodness")

    for iwfo, wfo in enumerate(wfs):

        if specific and not specific in wfo.name: continue

        print "Progress [%d/%d]" % (iwfo, len(wfs))
        ## what is the expected #lumis
        wfi = workflowInfo(url, wfo.name)
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.isRelval():
            has_batch_go = False
            batch_name = wfi.getCampaign()
            if not batch_name in batch_go:
                ## do the esimatation whethere this can be announced : only once per batch
                in_batches = getWorkflowByCampaign(url,
                                                   batch_name,
                                                   details=True)
                batch_go[batch_name] = all(
                    map(
                        lambda s: not s in [
                            'completed', 'running-open', 'running-closed',
                            'acquired', 'assigned', 'assignment-approved'
                        ], [r['RequestStatus'] for r in in_batches]))
            ## already verified
            has_batch_go = batch_go[batch_name]
            if not has_batch_go:
                wfi.sendLog(
                    'closor',
                    'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close'
                    % (batch_name, batch_name))
                continue

        if wfi.request['RequestStatus'] in ['announced', 'normal-archived'
                                            ] and not options.force:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog(
                'closor',
                '%s is announced already : %s' % (wfo.name, wfo.wm_status))
        session.commit()

        if jump_the_line:
            wfi.sendLog('closor', 'Announcing while completing')

        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name, "has not been assigned yet, or the database is corrupted"
        elif wfi.request['TotalInputLumis'] == 0:
            print wfo.name, "is corrupted with 0 expected lumis"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda: False)
        stats = defaultdict(int)
        #print outputs
        if len(outputs):
            print wfo.name, wfi.request['RequestStatus']
        for out in outputs:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(
                Output.datasetname == out).first()
            if not odb:
                print "adding an output object", out
                odb = Output(datasetname=out)
                odb.workflow = wfo
                session.add(odb)
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()
            fraction = lumi_count / float(expected_lumis) * 100.

            completion_line = "%60s %d/%d = %3.2f%%" % (
                out, lumi_count, expected_lumis, fraction)
            wfi.sendLog('closor', "\t%s" % completion_line)
            if wfi.isRelval() and fraction < batch_goodness:
                batch_warnings[wfi.getCampaign()].add(completion_line)
            stats[out] = lumi_count
            all_OK[out] = True

        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence(url, out)
            where = [site for site, info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out, "is in full at", ",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:

                going_to = wfi.request['NonCustodialSites'] + wfi.request[
                    'CustodialSites']
                wfi.sendLog(
                    'closor', "%s is not in full anywhere. send to %s" %
                    (out, ",".join(sorted(going_to))))
                at_destination = dict([(k, v) for (k, v) in presence.items()
                                       if k in going_to])
                else_where = dict([(k, v) for (k, v) in presence.items()
                                   if not k in going_to])
                print json.dumps(at_destination)
                print json.dumps(else_where, indent=2)
                ## do the full stuck transfer study, missing files and shit !
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to=there)
                    for l in late_info:
                        l.update({"workflow": wfo.name, "dataset": out})
                    all_late_files.extend(late_info)
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

        ## verify if we have to do harvesting

        if not options.no_harvest and not jump_the_line:
            (OK, requests) = spawn_harvesting(url, wfi, in_full)
            all_OK.update(OK)

        ## only that status can let me go into announced
        if all(all_OK.values()) and (
            (wfi.request['RequestStatus'] in ['closed-out']) or options.force
                or jump_the_line):
            print wfo.name, "to be announced"
            results = []
            if not results:
                for out in outputs:
                    if out in stats and not stats[out]:
                        continue
                    _, dsn, process_string, tier = out.split('/')

                    if all_OK[out]:
                        results.append(setDatasetStatus(out, 'VALID'))
                    if all_OK[out] and wfi.isRelval():
                        ## make the specific relval rules and the replicas
                        ## figure the destination(s) out
                        destinations = set()
                        if tier != "RECO" and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        if tier == "GEN-SIM":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-DIGI-RAW":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-RECO":
                            destinations.add('T1_US_FNAL_Disk')

                        if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')

                        if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')

                        if destinations:
                            wfi.sendLog(
                                'closor', '%s to go to %s' %
                                (out, ', '.join(sorted(destinations))))

                        ## call to makereplicarequest under relval => done
                        for site in destinations:
                            result = makeReplicaRequest(
                                url,
                                site, [out],
                                'Copy for release validation consumption',
                                priority='normal',
                                approve=True,
                                mail=False,
                                group='RelVal')
                            try:
                                request_id = result['phedex'][
                                    'request_created'][0]['id']
                                results.append(True)
                            except:
                                results.append('Failed relval transfer')

                    elif all_OK[out]:

                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request[
                                    'Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[
                                campaign] and tier in CI.campaigns[campaign][
                                    'toDDM']:
                            to_DDM = True

                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM") + UC.get(
                                "tiers_to_DDM"):
                            print "tier", tier, "neither TO or NO DDM for", out
                            results.append('Not recognitized tier %s' % tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog(
                                'closor',
                                "could not recognize %s for injecting in DDM" %
                                out,
                                level='critical')
                            continue

                        n_copies = 1
                        destinations = []
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[
                                campaign]:
                            ddm_instructions = CI.campaigns[campaign][
                                'DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier, indication in ddm_instructions.items(
                                ):
                                    if ddmtier == tier or ddmtier in [
                                            '*', 'all'
                                    ]:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']

                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination=" + ",".join(
                                destinations)
                        group_spec = ""  ## not used yet
                        ### should make this a campaign configuration
                        ## inject to DDM when necessary
                        if to_DDM:
                            print "Sending", out, " to DDM"
                            status = pass_to_dynamo(
                                [out],
                                N=n_copies,
                                sites=destinations if destinations else None,
                                group=group_spec if group_spec else None)
                            results.append(status)
                            if status == True:
                                wfi.sendLog(
                                    'closor',
                                    '%s is send to dynamo in %s copies %s %s' %
                                    (out, n_copies, sorted(destinations),
                                     group_spec))
                            else:
                                sendLog('closor',
                                        "could not add " + out +
                                        " to dynamo pool. check closor logs.",
                                        level='critical')
                                wfi.sendLog(
                                    'closor', "could not add " + out +
                                    " to dynamo pool. check closor logs.")
                    else:
                        print wfo.name, "no stats for announcing", out
                        results.append('No Stats')

                if all(
                        map(lambda result: result in ['None', None, True],
                            results)):
                    if not jump_the_line:
                        ## only announce if all previous are fine
                        res = reqMgrClient.announceWorkflowCascade(
                            url, wfo.name)
                        if not res in ['None', None]:
                            ## check the status again, it might well have toggled
                            wl_bis = workflowInfo(url, wfo.name)
                            wfo.wm_status = wl_bis.request['RequestStatus']
                            session.commit()
                            if wl_bis.request['RequestStatus'] in [
                                    'announced', 'normal-archived'
                            ]:
                                res = None
                            else:
                                ## retry ?
                                res = reqMgrClient.announceWorkflowCascade(
                                    url, wfo.name)

                        results.append(res)

            #print results
            if all(map(lambda result: result in ['None', None, True],
                       results)):
                if jump_the_line:
                    if not 'announced' in wfo.status:
                        wfo.status = wfo.status.replace(
                            'announce', 'announced')
                else:
                    wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor', "workflow outputs are announced")
            else:
                wfi.sendLog(
                    'closor', "Error with %s to be announced \n%s" %
                    (wfo.name, json.dumps(results)))

        elif wfi.request['RequestStatus'] in [
                'failed', 'aborted', 'aborted-archived', 'rejected',
                'rejected-archived', 'aborted-completed'
        ]:
            if wfi.isRelval():
                wfo.status = 'forget'
                wfo.wm_status = wfi.request['RequestStatus']
                wfi.sendLog(
                    'closor',
                    "%s is %s, but will not be set in trouble to find a replacement."
                    % (wfo.name, wfo.wm_status))
            else:
                wfo.status = 'trouble'
                wfo.wm_status = wfi.request['RequestStatus']
            session.commit()
        else:
            print wfo.name, "not good for announcing:", wfi.request[
                'RequestStatus']
            wfi.sendLog('closor', "cannot be announced")
            held.add(wfo.name)

    days_late = 0.
    retries_late = 10

    really_late_files = [
        info for info in all_late_files if info['retries'] >= retries_late
    ]
    really_late_files = [
        info for info in really_late_files
        if info['delay'] / (60 * 60 * 24.) >= days_late
    ]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (
            len(really_late_files), days_late, retries_late,
            json.dumps(really_late_files, indent=2))
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor', subject)
        print subject
        open('%s/stuck_files.json' % monitor_dir,
             'w').write(json.dumps(really_late_files, indent=2))

    if held:
        sendLog('closor',
                "the workflows below are held up \n%s" %
                ("\n".join(sorted(held))),
                level='critical')

    #batches = json.loads(open('batches.json').read())
    for bname, go in batch_go.items():
        if go:
            subject = "Release Validation Samples Batch %s" % bname
            issues = ""
            if batch_warnings[bname]:
                issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness
                issues += "\n".join(sorted(batch_warnings[bname]))
                issues += "\n\n"
            text = """
Dear all,

a batch of release validation workflows has finished.

Batch ID:

%s

Detail of the workflows

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

%s 
This is an automated message.
""" % (bname, bname, issues)
            to = ['*****@*****.**']
            sendEmail(subject, text, destination=to)
Exemplo n.º 30
0
from assignSession import *
import reqMgrClient
import os
import sys
import json
import time
import random

UC = unifiedConfiguration()

## get all acquired and push one to stepchain so that we can acquire it on nersc
N_for_cloud = isHEPCloudReady(reqmgr_url)
if N_for_cloud:
    print "HEP cloud is ready"
    wfs = getWorkflows(reqmgr_url, 'acquired', details=True)
    for wf in wfs:
        if N_for_cloud <= 0: break
        wfi = workflowInfo(reqmgr_url, wf['RequestName'], request=wf)
        print "testing", wf['RequestName']
        if wfi.isGoodToConvertToStepChain() and wfi.isGoodForNERSC(
                no_step=True) and N_for_cloud:
            print "good to convert to step so that we get something for hepcloud on next round", wf[
                'RequestName']
            os.system(
                'Unified/rejector.py --to_step --clone --comment "convert to step for hepcloud" %s'
                % wf['RequestName'])
            ## just do that once and be done with it
            N_for_cloud -= 1

## send something to T0
Exemplo n.º 31
0
def batchor( url ):
    UC = unifiedConfiguration()
    ## get all workflows in assignment-approved with SubRequestType = relval
    all_wfs = []
    for user in UC.get("user_relval"):
        all_wfs = getWorkflows(url, 'assignment-approved', details=True, user=user, rtype='TaskChain')

    wfs = filter( lambda r :r['SubRequestType'] == 'RelVal' if 'SubRequestType' in r else False, all_wfs)
    ## need a special treatment for those
    hi_wfs = filter( lambda r :r['SubRequestType'] == 'HIRelVal' if 'SubRequestType' in r else False, all_wfs)

    by_campaign = defaultdict(set)
    by_hi_campaign = defaultdict(set)
    for wf in wfs:
        print "Relval:",wf['RequestName'], wf['Campaign']
        by_campaign[wf['Campaign']].add( wf['RequestName'] )
    for wf in hi_wfs:
        print "HI Relval:",wf['RequestName'], wf['Campaign']
        by_hi_campaign[wf['Campaign']].add( wf['RequestName'] )
        
    default_setup = {
        "go" :True,
        "parameters" : {
            "SiteWhitelist": [ "T1_US_FNAL" ],
            "MergedLFNBase": "/store/relval",
            "Team" : "relval",
            "NonCustodialGroup" : "RelVal"
            },
        "custodial" : "T1_US_FNAL_MSS",
        "phedex_group" : "RelVal",
        "lumisize" : -1,
        "fractionpass" : 0.0,
        "maxcopies" : 1
        }
    default_hi_setup = copy.deepcopy( default_setup )

    add_on = {}
    batches = json.loads( open('batches.json').read() )
    for campaign in by_campaign:
        ## get a bunch of information
        setup  = copy.deepcopy( default_setup )
        add_on[campaign] = setup
        sendLog('batchor','Adding the relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)),level='critical')
        if not campaign in batches: batches[campaign] = []
        batches[campaign] = list(set(list(copy.deepcopy( by_campaign[campaign] )) + batches[campaign] ))
    for campaign in by_hi_campaign:
        ## get a bunch of information
        setup  = copy.deepcopy( default_hi_setup )
        hi_site = random.choice(["T1_DE_KIT","T1_FR_CCIN2P3"])
        setup["parameters"]["SiteWhitelist"]=[ hi_site ]

        add_on[campaign] = setup
        sendLog('batchor','Adding the HI relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)),level='critical')
        if not campaign in batches: batches[campaign] = []
        batches[campaign] = list(set(list(copy.deepcopy( by_hi_campaign[campaign] )) + batches[campaign] ))
        
    
    open('batches.json','w').write( json.dumps( batches , indent=2 ) )

    ## open the campaign configuration 
    campaigns = json.loads( open('campaigns.relval.json').read() )


    ## protect for overwriting ??
    for new_campaign in list(set(add_on.keys())-set(campaigns.keys())):
        ## this is new, and can be announced as such
        print new_campaign,"is new stuff"
        workflows = by_campaign[new_campaign]
        requester = list(set([wf.split('_')[0] for wf in workflows]))
        subject = "Request of RelVal samples batch %s"% new_campaign
        text="""Dear all, 
A new batch of relval workflows was requested.

Batch ID:

%s

Requestor:

%s

Details of the workflows:

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

This is an automated message"""%( new_campaign, 
                                  ', '.join(requester),
                                  new_campaign,
                                  #'\n'.join( sorted(workflows) ) 
                                  )


        print subject
        print text
        to = ['*****@*****.**']
        sendEmail(subject, text, destination=to)
        sendLog('batchor',text, level='critical')

    ## merge all anyways
    campaigns.update( add_on )

    ## write it out for posterity
    open('campaigns.json.updated','w').write(json.dumps( campaigns , indent=2))

    ## read back
    rread = json.loads(open('campaigns.json.updated').read())

    os.system('mv campaigns.json.updated campaigns.relval.json')
Exemplo n.º 32
0
from collections import defaultdict
from utils import monitor_pub_dir, workflowInfo, getWorkflows

register = [
    #'assigned','acquired',
    'running-open',
    'running-closed',
    'force-complete',
    'completed',
    'closed-out'
]
wfs = []
url = 'cmsweb.cern.ch'

for r in register:
    wfs.extend(getWorkflows(url, r, details=True))
    print len(wfs), "after collecting", r

lfns = defaultdict(set)
for wf in wfs:
    if 'OutputModulesLFNBases' not in wf:
        print wf['RequestName']
    for base in wf['OutputModulesLFNBases']:
        lfns[base].add(wf['RequestName'])

now = time.gmtime()
content = {
    "timestamp": time.mktime(now),
    "date": time.asctime(now),
    "protected": sorted(lfns.keys())
}
Exemplo n.º 33
0
def htmlor():
    cache = getWorkflows('cmsweb.cern.ch','assignment-approved', details=True)
    def wfl(wf,view=False,p=False,ms=False,within=False,ongoing=False,status=False,update=False):
        wfn = wf.name
        wfs = wf.wm_status
        pid = None
        pids=filter(lambda seg: seg.count('-')==2, wf.name.split('_'))
        if len(pids):
            pid=pids[0]
        text=', '.join([
                #wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a>'%(wfn,wfn),
                '(%s) <br>'%wfs])
        text+=', '.join([
                '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts</a>'%wfn,
                '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>'%wfn,
                '<a href="https://cmsweb.cern.ch/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>'%wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>'%wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>'%wfn,
                '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>'%wfn,
                '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>'%wfn,
                '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>'%pid,
                '<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank">pv</a>'%wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>'%wfn,
                '<a href="closeout.html#%s" target="_blank">clo</a>'%wfn,
                '<a href="statuses.html#%s" target="_blank">st</a>'%wfn,
                '<a href="https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>'%wfn
                ])
        if within and (not view or wfs=='completed'):
            cached = filter(lambda d : d['RequestName']==wfn, cache)
            if cached:
                wl = cached[0]
            else:
                wl = getWorkLoad('cmsweb.cern.ch',wfn)
            if 'InputDataset' in wl:
                dataset = wl['InputDataset']
                text+=', '.join(['',
                                 '<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>'%dataset,
                                 '<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>'%dataset,
                                 '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>'%dataset,
                                 '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>'%dataset,
                                 ])

        if p:
            cached = filter(lambda d : d['RequestName']==wfn, cache)
            if cached:
                wl = cached[0]
            else:
                wl = getWorkLoad('cmsweb.cern.ch',wfn)
            text+=', (%s)'%(wl['RequestPriority'])
            pass

        if pid:
            if ms:
                mcm_s = json.loads(os.popen('curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure'%pid).read())[pid]
                text+=', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>'%(pid,mcm_s)
            else:
                text+=', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>'%(pid)
                text+=', <a href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>'%(pid)
                
        if status:
            if wf.status.startswith('assistance'):
                text+=', <a href="assistance.html#%s" target="_blank">assist</a>'%wfn
            text+=' : %s '%(wf.status)


        if view and wfs!='acquired':
            text+='<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>'%(wfn.replace('_','/'),wfn.replace('_','/'))
        if ongoing:
            text+='<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a>'%(wfn,wfn)
        text+="<hr>"
        return text


    def phl(phid):
        text=', '.join([
                str(phid),
                '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>'%phid,
                '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>'%phid,
                ])
        return text
            

    def ol(out):
        return '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>'%(out,out)

    
    ## start to write it
    #html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w')
    html_doc = open('/afs/cern.ch/user/c/cmst2/www/unified/index.html','w')
    print "Updating the status page ..." 
    html_doc.write("""
<html>
<head>
<META HTTP-EQUIV="refresh" CONTENT="900">
<script type="text/javascript">
 function showhide(id) {
    var e = document.getElementById(id);
    e.style.display = (e.style.display == 'block') ? 'none' : 'block';
 }
</script>
</head>
<body>

Last update on %s(CET), %s(GMT), <a href=logs/ target=_blank> logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a> <br><br>

""" %(time.asctime(time.localtime()),
      time.asctime(time.gmtime())))

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='considered').all():
        text+="<li> %s </li> \n"%wfl(wf,p=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Worlfow next to handle <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank"> batches</a> (%d) <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('considered')">[Click to show/hide]</a>
<br>
<div id="considered" style="display:none;">
<ul>
"""%count)
    html_doc.write(text)

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='staging').all():
        text+="<li> %s </li> \n"%wfl(wf,within=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Worlfow waiting in staging (%d) <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staging')">[Click to show/hide]</a>
<br>
<div id="staging" style="display:none;">
<ul>
"""%count)
    html_doc.write(text)

    text=""
    count=0
    for ts in session.query(Transfer).all():
        stext='<li> %s serves </li><a href="javascript:showhide(\'%s\')">[show/hide]</a> <div id="%s" style="display:none;"><ul>'%( phl(ts.phedexid), ts.phedexid, ts.phedexid )
        hide = True
        for pid in ts.workflows_id:
            w = session.query(Workflow).get(pid)
            hide &= (w.status != 'staging' )
            stext+="<li> %s </li>\n"%( wfl(w,status=True))
        stext+="</ul></div>\n"
        if hide:
            #text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid)
            pass
        else:
            count+=1
            text+=stext
    text+="</ul></div>"
    html_doc.write("""
Transfer on-going (%d) <a href=https://transferteam.web.cern.ch/transferteam/dashboard/ target=_blank>dashboard</a> <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('transfer')">[Click to show/hide]</a>
<br>
<div id="transfer" style="display:none;">
<br>
<ul>"""%count)
    html_doc.write(text)



    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='staged').all():
        text+="<li> %s </li> \n"%wfl(wf,p=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worlfow ready for assigning (%d) <a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staged')">[Click to show/hide]</a>
<br>
<div id="staged" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    lines=[]
    for wf in session.query(Workflow).filter(Workflow.status=='away').all():
        lines.append("<li> %s </li>"%wfl(wf,view=True,ongoing=True))
    lines.sort()
    html_doc.write("""
Worlfow on-going (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a> <a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a> <a href=http://hcc-briantest.unl.edu/prodview target=_blank>queues</a> <a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('away')">[Click to show/hide]</a>
<br>
<div id="away" style="display:none;">
<br>
<ul>
%s
</ul>
</div>
"""%(len(lines),'\n'.join(lines)))

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status == 'assistance').all():
        text+="<li> %s </li> \n"%wfl(wf,view=True,update=True,status=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worlfow that are closing (%d)
<a href=closeout.html target=_blank>closeout</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('closing')">[Click to show/hide]</a>
<br>
<div id="closing" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all():
        text+="<li> %s </li> \n"%wfl(wf,view=True,within=True,status=True,update=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worlfow which need assistance (%d)
<a href=assistance.html target=_blank>assistance</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('assistance')">[Click to show/hide]</a>
<br>
<div id="assistance" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)
    
    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status == 'close').all():
        text+="<li> %s </li> \n"%wfl(wf)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worlfow ready to close (%d)
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('close')">[Click to show/hide]</a>
<br>
<div id="close" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='trouble').all():
        text+="<li> %s </li> \n"%wfl(wf)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worlfow with issue (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a>
<a href="javascript:showhide('trouble')">[Click to show/hide]</a>
<br>
<div id="trouble" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)



    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='forget').all():
        text+="<li> %s </li> \n"%wfl(wf)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Worlfow to forget (%d)
<a href="javascript:showhide('forget')">[Click to show/hide]</a>
<br>
<div id="forget" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='done').all():
        text+="<li> %s </li> \n"%wfl(wf)#,ms=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Worlfow through (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/cleanor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('done')">[Click to show/hide]</a>
<br>
<div id="done" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='clean').all():
        text+="<li> %s </li> \n"%wfl(wf)#,ms=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Worlfow clean for input (%d) <a href=logs/cleanor/last.log target=_blank>log</a>
<a href="javascript:showhide('clean')">[Click to show/hide]</a>
<br>
<div id="clean" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)


    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status.endswith('-out')).all():
        text+="<li> %s </li> \n"%wfl(wf,status=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Worlfow clean for output (%d) <a href=logs/outcleanor/last.log target=_blank>log</a>
<a href="javascript:showhide('cleanout')">[Click to show/hide]</a>
<br>
<div id="cleanout" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)






    text=""
    lines_thisweek=[]
    lines_lastweek=[]
    now = time.mktime(time.gmtime())
    this_week = int(time.strftime("%W",time.gmtime()))
    for out in session.query(Output).all():
        if not out.workflow: 
            print "This is a problem with",out.datasetname
            continue
        if  out.workflow.status in ['done','clean']:
            out_week = int(time.strftime("%W",time.gmtime(out.date)))
            ##only show current week, and the previous.
            if (this_week-out_week)==1:
                lines_lastweek.append("<li>on week %s : %s </li>"%(
                        time.strftime("%W (%x %X)",time.gmtime(out.date)),
                        ol(out.datasetname),
                        )
                             )
            if (this_week-out_week)==0:
                lines_thisweek.append("<li>on week %s : %s </li>"%(
                        time.strftime("%W (%x %X)",time.gmtime(out.date)),
                        ol(out.datasetname),
                        )
                             )
    lines_thisweek.sort()
    lines_lastweek.sort()

    html_doc.write("""Output produced <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a> (%d)
<a href="javascript:showhide('output')">[Click to show/hide]</a>
<br>
<div id="output" style="display:none;">
<br>
<ul>
<li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul>
%s
</ul></div>
<li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul>
%s
</ul></div></div>
"""%( len(lines_lastweek)+len(lines_thisweek),
      len(lines_lastweek),
     '\n'.join(lines_lastweek),
      len(lines_thisweek),
     '\n'.join(lines_thisweek))
                   )

    html_doc.write("""Job installed
<a href="javascript:showhide('acron')">[Click to show/hide]</a>
<br>
<div id="acron" style="display:none;">
<br>
<pre>
%s
</pre></div>
"""%(os.popen('acrontab -l | grep Unified').read()))

    text=""
    count=0
    for (c,info) in campaignInfo().campaigns.items():
        #if 'go' in info and info['go']:
        text+="<li>%s <br> <pre>%s</pre>  </li>"%( c, json.dumps( info, indent=2))
        count+=1

    html_doc.write("""Campaign configuration
<a href="javascript:showhide('campaign')">[Click to show/hide]</a>
<br>
<div id="campaign" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""%(text))

    text=""
    count=0
    n_column = 4
    SI = siteInfo()
    for t in SI.types():
        #text+="<li>%s<ul>"%t
        #for site in getattr(SI,t):
        #    text+="<li><a href=http://hcc-briantest.unl.edu/prodview/%s>%s<a/> </li>"%( site, site)
        #    text+='<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a>'%(site,site)
        #text+="</ul></li>"
        
        text+="<li>%s<table border=1>"%t
        c=0
        for site in getattr(SI,t):
            cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else 'N/A'
            disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE(site) in SI.disk else 'N/A'
            if c==0:
                text+="<tr>"
            text+='<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a><br>CPU pledge: %s<br>Disk available: %s</td>'%(site,site,site,site,cpu,disk)
            if c==n_column:
                c=0
            else:
                c+=1
        text+="</table></li>"

    html_doc.write("""Site configuration
<a href="javascript:showhide('site')">[Click to show/hide]</a>
<br>
<div id="site" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""%(text))

    print "... done with status page."
    html_doc.write("""
</body>
</html>
""")

    html_doc.close()

    html_doc = open('/afs/cern.ch/user/c/cmst2/www/unified/statuses.html','w')
    html_doc.write("""                                                                                                                                                                                                                                                                                                      <html>        
<table border=1>
<thead>
<tr>
<th> workflow </th><th> status </th><th> wm status</th>
</tr>
</thead>
""")
    wfs = {}
    for wfo in session.query(Workflow).all():
        wfs[wfo.name] = (wfo.status,wfo.wm_status)
    open('/afs/cern.ch/user/c/cmst2/www/unified/statuses.json','w').write(json.dumps( wfs ))
    for wfn in sorted(wfs.keys()):
        html_doc.write('<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n'%( wfn, wfn, wfs[wfn][0],  wfs[wfn][1]))
    html_doc.write("</table>")
    html_doc.write("<br>"*100)
    html_doc.write("end of page</html>")
    html_doc.close()
Exemplo n.º 34
0
def batchor( url ):
    """Group newly approved RelVal workflows into per-campaign batches.

    Fetches the 'assignment-approved' TaskChain workflows of every user listed
    under "user_relval" in the unified configuration and keeps those whose
    SubRequestType is 'RelVal' or 'HIRelVal'. For each campaign that is not yet
    listed in batches.json:
      - an assignment setup is built (the defaults, augmented through the
        "relval_routing" configuration for RelVal, or one of two T1 sites picked
        at random for HIRelVal),
      - the campaign PrepIDs are recorded in batches.json,
      - campaigns that are new to campaigns.relval.json are announced by e-mail.
    Campaigns of campaigns.relval.json that have no workflow left in an active
    status are dropped, the new setups are merged in and the file is rewritten.

    A RelVal campaign whose white-listed sites are all not ready is left out of
    this pass (it is neither batched nor configured) so that it is picked up
    again on a later run.

    url: request manager host, handed to getWorkflows / getWorkflowByCampaign.
    All JSON files are read from / written to the current working directory.
    """
    UC = unifiedConfiguration()
    SI = global_SI()

    ## statuses in which a workflow keeps its batch configuration alive
    active_statuses = ['completed','running-open','running-closed','acquired','assigned','assignment-approved']

    def read_json( path ):
        ## load a JSON document, releasing the file handle right away
        with open(path) as json_file:
            return json.loads( json_file.read() )

    def write_json( path, content ):
        ## dump content as indented JSON; leaving the with-block closes (and flushes) the file
        with open(path,'w') as json_file:
            json_file.write( json.dumps( content , indent=2 ) )

    ## get all workflows in assignment-approved with SubRequestType = relval
    all_wfs = []
    for user in UC.get("user_relval"):
        all_wfs.extend( getWorkflows(url, 'assignment-approved', details=True, user=user, rtype='TaskChain') )

    def prepids_by_campaign( sub_type, label ):
        ## campaign -> set of PrepID for the workflows of the given SubRequestType
        grouped = defaultdict(set)
        for wf in all_wfs:
            if not ('SubRequestType' in wf and wf['SubRequestType'] == sub_type): continue
            print("%s %s %s" % (label, wf['RequestName'], wf['Campaign']))
            grouped[wf['Campaign']].add( wf['PrepID'] )
        return grouped

    by_campaign = prepids_by_campaign( 'RelVal', "Relval:" )
    ## need a special treatment for those
    by_hi_campaign = prepids_by_campaign( 'HIRelVal', "HI Relval:" )

    default_setup = {
        "go" :True,
        "parameters" : {
            "SiteWhitelist": [ "T1_US_FNAL" ],
            "MergedLFNBase": "/store/relval",
            "Team" : "relval",
            "NonCustodialGroup" : "RelVal"
            },
        "custodial" : "T1_US_FNAL_MSS",
        "custodial_override" : ["DQMIO"],
        "phedex_group" : "RelVal",
        "lumisize" : -1,
        "fractionpass" : 0.0,
        "maxcopies" : 1
        }
    default_hi_setup = copy.deepcopy( default_setup )

    add_on = {}
    batches = read_json('batches.json')
    relval_routing = UC.get('relval_routing')

    def pick_one_site( p ):
        ## modify the parameters on the spot to have only one site
        ## returns False (leaving p untouched) when several sites are white-listed
        ## but none of them is ready: random.choice would raise on the empty list
        if not ("parameters" in p and "SiteWhitelist" in p["parameters"]): return True
        whitelist = p["parameters"]["SiteWhitelist"]
        if len(whitelist)<=1: return True
        choose_from = list(set(whitelist) & set(SI.sites_ready))
        if not choose_from: return False
        picked = random.choice( choose_from )
        print("picked %s from %s" % (picked, choose_from))
        p["parameters"]["SiteWhitelist"] = [picked]
        return True

    def register( campaign, setup, prepids, what ):
        ## remember the setup, notify, and open the batch with the campaign prepids
        add_on[campaign] = setup
        sendLog('batchor','Adding the %s campaigns %s with parameters \n%s'%( what, campaign, json.dumps( setup, indent=2)),level='critical')
        ## the campaign is known not to be in batches yet, so there is nothing to merge with
        batches[campaign] = list( prepids )

    for campaign in by_campaign:
        if campaign in batches: continue
        ## get a bunch of information
        setup  = copy.deepcopy( default_setup )

        for key in relval_routing:
            if key in campaign:
                ## augment with the routing information
                augment_with = relval_routing[key]
                print("Modifying the batch configuration because of keyword %s" % key)
                print("with %s" % (augment_with,))
                setup = deep_update( setup, augment_with )
        #if 'cc7' in campaign: setup["parameters"]["SiteWhitelist"] = ["T2_US_Nebraska"]
        if not pick_one_site( setup ):
            ## do not abort the whole pass: leave the campaign for a later run
            msg = 'Not creating the relval batch %s yet: none of the white-listed sites %s is ready'%( campaign, sorted(setup["parameters"]["SiteWhitelist"]))
            print(msg)
            sendLog('batchor', msg, level='critical')
            continue
        register( campaign, setup, by_campaign[campaign], 'relval' )

    for campaign in by_hi_campaign:
        if campaign in batches: continue
        ## get a bunch of information
        setup  = copy.deepcopy( default_hi_setup )
        ## a single site is set here, so there is nothing left to narrow down
        hi_site = random.choice(["T1_DE_KIT","T1_FR_CCIN2P3"])
        setup["parameters"]["SiteWhitelist"]=[ hi_site ]
        #setup["parameters"]["SiteWhitelist"]=["T1_DE_KIT","T1_FR_CCIN2P3"]
        register( campaign, setup, by_hi_campaign[campaign], 'HI relval' )

    write_json('batches.json', batches)

    ## open the campaign configuration 
    campaigns = read_json('campaigns.relval.json')

    ## protect for overwriting ??
    for new_campaign in list(set(add_on.keys())-set(campaigns.keys())):
        ## this is new, and can be announced as such
        print("%s is new stuff" % new_campaign)
        subject = "Request of RelVal samples batch %s"% new_campaign
        text="""Dear all, 
A new batch of relval workflows was requested.

Batch ID:

%s

Details of the workflows:

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

This is an automated message"""%( new_campaign, 
                                  new_campaign,
                                  )

        print(subject)
        print(text)
        to = ['*****@*****.**']
        sendEmail(subject, text, destination=to)
        sendLog('batchor',text, level='critical')

    ## go through all existing campaigns and remove the ones not in use anymore ?
    ## iterate over a snapshot of the keys since entries are removed on the way
    for old_campaign in list(campaigns.keys()):
        all_in_batch = getWorkflowByCampaign(url, old_campaign, details=True)
        ## check all statuses ; NOTE(review): an empty answer also counts as done - confirm this is wanted
        is_batch_done = all( wf['RequestStatus'] not in active_statuses for wf in all_in_batch )
        if is_batch_done:
            #print "batch",old_campaign,"can be closed or removed if necessary"
            #campaigns[old_campaign]['go'] = False ## disable
            campaigns.pop( old_campaign ) ## or just drop it all together ?
            print("batch %s  configuration was removed" % old_campaign)

    ## merge all anyways
    campaigns.update( add_on )

    ## write it out for posterity
    write_json('campaigns.json.updated', campaigns)

    ## read back : this raises on a corrupted file, before the live configuration is replaced
    read_json('campaigns.json.updated')

    ## same move as before, but a failure now raises instead of being ignored
    os.rename('campaigns.json.updated', 'campaigns.relval.json')
Exemplo n.º 35
0
def htmlor( caller = ""):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return 
        
    try:
        boost = json.loads(open('%s/equalizor.json'%monitor_dir).read())['modifications']
    except:
        boost = {}
    cache = getWorkflows(reqmgr_url,'assignment-approved', details=True)
    cache.extend( getWorkflows(reqmgr_url,'acquired', details=True) )
    cache.extend( getWorkflows(reqmgr_url,'running-open', details=True) )
    cache.extend( getWorkflows(reqmgr_url,'running-closed', details=True) )
    def getWL( wfn ):
        """Return the request dictionary of workflow wfn.

        The entries pre-fetched in cache are searched by 'RequestName' first;
        the first match is returned. Without a match the workload is fetched
        through getWorkLoad on reqmgr_url.
        """
        hits = [entry for entry in cache if entry['RequestName']==wfn]
        return hits[0] if hits else getWorkLoad(reqmgr_url,wfn)

    def wfl(wf,view=False,p=False,ms=False,within=False,ongoing=False,status=False,update=False):
        """Build the HTML snippet describing one workflow on the status page.

        wf must expose .name, .wm_status and (when status is set) .status.
        The keyword flags switch optional parts of the snippet on:
          view    : append the pdmv growth plot, unless wm_status is 'acquired'
          p       : append the request priority
          ms      : fetch the McM status with curl and show it in the mcm link
          within  : append the input dataset links (only when view is off or
                    wm_status is 'completed', and an input dataset is found)
          ongoing : append the prodview graph, a dashboard link covering the
                    last 15 days and the equalizor boost counts, if any
          status  : append the workflow status, preceded by a link to
                    assistance.html when it starts with 'assistance'
          update  : not used in this function
        Returns the assembled HTML as a string.
        """
        wfn = wf.name
        wfs = wf.wm_status
        wl = None
        pid = None
        wl_pid = None
        ## name segments (split on '_') holding exactly two '-' are taken as prepid candidates
        pids=filter(lambda seg: seg.count('-')==2, wf.name.split('_'))
        if len(pids):
            ## only the first candidate is kept
            pids = pids[:1]
            pid=pids[0]
            
        if not pids:
            ## nothing usable in the name: ask the workload instead
            ## NOTE(review): pids[0] raises if getPrepIDs returns an empty list - confirm it cannot
            wl = getWL( wf.name )
            pids = getPrepIDs( wl )
            pid = pids[0]

        ## wl_pid is the prepid used for the 'ac' link ; prefixed with 'task_' when the name contains 'task'
        wl_pid = pid
        if 'task' in wf.name:
            wl_pid = 'task_'+pid

        
        ## header : workflow name and wm status
        text=', '.join([
                #wfn,
                #'<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a> '%(wfn,wfn),
                #'<table><tr><td>%s</td></tr></table>'%(wfn),
                #'<span>%s</span>'%(wfn),
                "%s "%wfn,
                '(%s) <br>'%wfs])
        ## links that are always present
        text+=', '.join([
                '<a href="https://%s/reqmgr2/fetch?rid=%s" target="_blank">dts</a>'%(reqmgr_url,wfn),
                '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts-req1</a>'%wfn,
                #TOFIX '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>'%wfn,
                '<a href="https://%s/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>'%(reqmgr_url,wfn),
                '<a href="https://%s/reqmgr2/data/request?name=%s" target="_blank">req</a>'%(reqmgr_url,wfn),
                #'<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>'%wfn,
                #TOFIX '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>'%wfn,
                '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>'%wfn,
                '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>'%wfn,
                '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>'%pid,
                '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank">pv</a>'%wfn,
                #deprecated '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>'%wfn,
                '<a href="closeout.html#%s" target="_blank">clo</a>'%wfn,
                '<a href="statuses.html#%s" target="_blank">st</a>'%wfn,
                '<a href="https://%s/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>'%(reqmgr_url,wfn)
                ])
        ## input dataset links
        if within and (not view or wfs=='completed'):
            wl = getWL( wfn )
            dataset =None
            if 'InputDataset' in wl:
                dataset = wl['InputDataset']                
            ## the input of the first task takes precedence over the top level one
            if 'Task1' in wl and 'InputDataset' in wl['Task1']:
                dataset = wl['Task1']['InputDataset']

            if dataset:
                ## the leading '' makes the join start with the ', ' separator
                text+=', '.join(['',
                                 '<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>'%dataset,
                                 '<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>'%dataset,
                                 '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>'%dataset,
                                 '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>'%dataset,
                                 ])

        ## request priority
        if p:
            ## NOTE(review): same cache lookup as getWL, but the fallback uses the hard-coded
            ## 'cmsweb.cern.ch' host where getWL uses reqmgr_url - confirm whether this is intended
            cached = filter(lambda d : d['RequestName']==wfn, cache)
            if cached:
                wl = cached[0]
            else:
                wl = getWorkLoad('cmsweb.cern.ch',wfn)
            text+=', (%s)'%(wl['RequestPriority'])
            pass

        ## McM links
        if pid:
            if ms:
                ## NOTE(review): pid is interpolated unquoted into a shell command, and the curl
                ## call skips certificate verification (--insecure) - confirm pid is trusted
                mcm_s = json.loads(os.popen('curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure'%pid).read())[pid]
                text+=', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>'%(pid,mcm_s)
            else:
                ## the 'ac' link is only added on this branch
                text+=', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>'%(pid)
                text+=', <a href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>'%(wl_pid)
                
        ## unified status
        if status:
            if wf.status.startswith('assistance'):
                text+=', <a href="assistance.html#%s" target="_blank">assist</a>'%wfn
            text+=' : %s '%(wf.status)

        ## growth plot ; the plot path is the workflow name with '_' replaced by '/'
        if view and wfs!='acquired':
            text+='<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>'%(wfn.replace('_','/'),wfn.replace('_','/'))
        ## prodview daily graph
        if ongoing:
            text+='<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a>'%(wfn,wfn)

        ## dashboard link ; date1 is 15 days before date2
        if ongoing:
            date1 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime(time.mktime(time.gmtime())-(15*24*60*60)) )
            date2 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime())
            text+='<a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#table=Jobs&date1=%s&date2=%s&sortby=site&task=wmagent_%s">dashb</a>'%( date1, date2, wfn )

        ## one 'boost' entry per task carrying a non-empty ReplaceSiteWhitelist, or else AddWhitelist
        if ongoing and wfn in boost:
            for task in boost[wfn]:
                overflow = boost[wfn][task].get('ReplaceSiteWhitelist',None)
                if not overflow:
                    overflow = boost[wfn][task].get('AddWhitelist',None)
                if overflow:
                    text+=',boost (<a href=equalizor.json>%d</a>)'%len(overflow)

        #text+="<hr>"
        return text


    def phl(phid):
        """Return the HTML snippet for the PhEDEx request phid: the id itself,
        then a 'vw' link to the request view and a 'sub' link to its subscriptions."""
        view_link = '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>'%phid
        subscription_link = '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>'%phid
        return ', '.join( (str(phid), view_link, subscription_link) )
            

    def ol(out):
        """Return an HTML anchor pointing to the DAS query for out, labelled with out."""
        anchor = '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>'
        return anchor%(out,out)


    def lap( comment ):
        """Print the number of seconds spent since the previous call, labelled with comment.

        The reference time lives on the function attribute lap.start; it is
        initialised right after this definition and reset on every call.
        """
        ## time.time() is the epoch clock itself ; mktime(gmtime()) fed a UTC
        ## struct to a local-time conversion and dropped the sub-second part
        l = time.time()
        spend = l-lap.start
        lap.start =l 
        print("Spend %d [s] for %s"%( spend, comment ))
    ## must use the same clock as lap() for the first interval to be meaningful
    lap.start = time.time()

    ## start to write it
    #html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w')
    html_doc = open('%s/index.html.new'%monitor_dir,'w')
    print "Updating the status page ..." 

    UC = unifiedConfiguration()

    if not caller:
        try:
            #caller = sys._getframe(1).f_code.co_name
            caller = sys.argv[0].split('/')[-1].replace('.py','')
            print "caller is"
            print caller
        except Exception as es:
            caller = 'none found'
            print "not getting frame"
            print str(es)

    html_doc.write("""
<html>
<head>
<META HTTP-EQUIV="refresh" CONTENT="900">
<script type="text/javascript">
 function showhide(id) {
    var e = document.getElementById(id);
    e.style.display = (e.style.display == 'block') ? 'none' : 'block';
 }
</script>
</head>
<body>

Last update on %s(CET), %s(GMT)
<br>
<a href=logs/ target=_blank>logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/ target=_blank>prod mon</a> <a href=https://%s/wmstats/index.html target=_blank>wmstats</a> <a href=http://t3serv001.mit.edu/~cmsprod/IntelROCCS/Detox/SitesInfo.txt target=_blank>detox</a> <a href=locked.html>space</a> <a href=logs/subscribor/last.log target=_blank>blocks</a> <a href=logs/agents/last.log>agents</a>
<br>
<a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a> <a href=data.html>json interfaces</a> <a href=logs/addHoc/last.log>add-hoc op</a> created from <b>%s <a href=logs/last_running>last running</a></b> <object height=20 type="text/html" data="logs/last_running"><p>backup content</p></object>
<br><br>

""" %(time.asctime(time.localtime()),
      time.asctime(time.gmtime()),
      reqmgr_url,
      caller
      )
                   )
        
    text=""
    count=0
    count_by_campaign=defaultdict(lambda : defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status.startswith('considered')).all():
        wl = getWL( wf.name )
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])]+=1
        #print wf.name
        text+="<li> %s </li> \n"%wfl(wf,p=True)
        count+=1
    text_by_c=""
    for c in count_by_campaign:
        text_by_c+="<li> %s (%d) : "%( c, sum(count_by_campaign[c].values()) )
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c+="%d (%d), "%(p,count_by_campaign[c][p])
        text_by_c+="</li>"

    html_doc.write("""
Worflow next to handle (%d) <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank"> batches</a> <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('considered')">[Click to show/hide]</a>
<br>
<div id="considered" style="display:none;">
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('considered_bywf')">[Click to show/hide]</a><div id="considered_bywf" style="display:none;">
 <ul>
 %s
 </ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('considered_bycamp')">[Click to show/hide]</a><div id="considered_bycamp" style="display:none;">
 <ul>
 %s
 </ul></div>
</ul>
</div>
"""%(count,
     count, text,
     len(count_by_campaign), text_by_c))
                   
    lap( 'done with considered' )
    text=""
    count=0
    count_by_campaign=defaultdict(lambda : defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status=='staging').all():
        wl = getWL( wf.name )
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])]+=1
        text+="<li> %s </li> \n"%wfl(wf,within=True)
        count+=1

    text_by_c=""
    for c in count_by_campaign:
        text_by_c+="<li> %s (%d) : "%( c, sum(count_by_campaign[c].values()) )
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c+="%d (%d), "%(p,count_by_campaign[c][p])
        text_by_c+="</li>"


    html_doc.write("""
Worflow waiting in staging (%d) <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staging')">[Click to show/hide]</a>
<br>
<div id="staging" style="display:none;">
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('staging_bywf')">[Click to show/hide]</a><div id="staging_bywf" style="display:none;">                                                                                                                                                                       
 <ul>            
 %s
 </ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('staging_bycamp')">[Click to show/hide]</a><div id="staging_bycamp" style="display:none;">                                                                                                                                                                  
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>                                                                                                                                                                                                                                                                                                               
</ul>      
</div>
"""%(count, 
     count, text,
     len(count_by_campaign), text_by_c))

    lap ( 'done with staging' )

    text="<ul>"
    count=0
    transfer_per_wf = defaultdict(list)
    for ts in session.query(Transfer).filter(Transfer.phedexid>0).all():
        hide = True
        t_count = 0
        stext=""
        for pid in ts.workflows_id:
            w = session.query(Workflow).get(pid)
            hide &= (w.status != 'staging' )
            if w.status in ['considered','staging','staged']:
                stext += "<li> %s </li>\n"%( wfl(w,status=True))
                transfer_per_wf[w].append( ts.phedexid )
                t_count +=1
        stext = '<li> %s serves %d workflows<br><a href="javascript:showhide(\'%s\')">[show/hide]</a> <div id="%s" style="display:none;"><ul>\n'%( phl(ts.phedexid), t_count, ts.phedexid, ts.phedexid ) + stext
        
        stext+="</ul></li>\n"
        if hide:
            #text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid)
            pass
        else:
            count+=1
            text+=stext
    text+="</ul>"
    
    text_bywf="<ul>"
    for wf in transfer_per_wf:
        text_bywf += "<li> %s </li>"%(wfl(wf,within=True))
        text_bywf += '<a href=javascript:showhide("transfer_%s")>[Click to show/hide] %d transfers</a>'% (wf.name, len(transfer_per_wf[wf]))
        text_bywf += '<div id="transfer_%s" style="display:none;">'% wf.name
        text_bywf += "<ul>"
        for pid in sorted(transfer_per_wf[wf]):
            text_bywf += "<li> %s </li>"%(phl(pid))
        text_bywf += "</ul></div><hr>"
    text_bywf += '</ul>'

    html_doc.write("""
Transfer on-going (%d) <a href=http://cmstransferteam.web.cern.ch/cmstransferteam/ target=_blank>dashboard</a> <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('transfer')">[Click to show/hide]</a>
<br>
<div id="transfer" style="display:none;">
 <ul>
  <li> By Workflow
    <a href="javascript:showhide('transfer_bywf')">[Click to show/hide]</a>
    <div id="transfer_bywf" style="display:none;">
%s
    </div>
  </li>
  <li> By transfer request
    <a href="javascript:showhide('transfer_byreq')">[Click to show/hide]</a>
    <div id="transfer_byreq" style="display:none;"> 
%s
    </div>
  </li>
 </ul>
</div>
"""%(count,
     text_bywf,
     text))

    lap( 'done with transfers' )

    text=""
    count=0
    count_by_campaign=defaultdict(lambda : defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status=='staged').all():
        wl = getWL( wf.name )
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])]+=1
        text+="<li> %s </li> \n"%wfl(wf,p=True)
        count+=1
    text_by_c=""
    for c in count_by_campaign:
        text_by_c+="<li> %s (%d) : "%( c, sum(count_by_campaign[c].values()) )
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c+="%d (%d), "%(p,count_by_campaign[c][p])
        text_by_c+="</li>"

    html_doc.write("""Worflow ready for assigning (%d) <a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staged')">[Click to show/hide]</a>
<br>
<div id="staged" style="display:none;">
<br>
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('staged_bywf')">[Click to show/hide]</a><div id="staged_bywf" style="display:none;">                                                                                                                                                                             
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>                                                                                                                                                                                                                                                                                                               
<li> By campaigns (%d) </li><a href="javascript:showhide('staged_bycamp')">[Click to show/hide]</a><div id="staged_bycamp" style="display:none;">                                                                                                                                                                        
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>
</ul>
</div>
"""%(count, 
     count, text,
     len(count_by_campaign), text_by_c))

    lap( 'done with staged' )
    
    lines=[]
    count_by_campaign=defaultdict(lambda : defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status=='away').all():
        wl = getWL( wf.name )
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])]+=1
        lines.append("<li> %s <hr></li>"%wfl(wf,view=True,ongoing=True))
    text_by_c=""
    for c in count_by_campaign:
        text_by_c+="<li> %s (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/campaign.php?campaign=%s>mon</a> <a href=https://cms-pdmv.cern.ch/pmp/historical?r=%s target=_blank>pmp</a> "%( c, sum(count_by_campaign[c].values()),c,c )
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c+="%d (%d), "%(p,count_by_campaign[c][p])
        text_by_c+="</li>"

    lines.sort()
    html_doc.write("""
Worflow on-going (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a> <a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a> <a href=http://cms-gwmsmon.cern.ch/prodview target=_blank>queues</a> <a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a> <a href=logs/equalizor/last.log target=_blank>equ</a> <a href=logs/completor/last.log target=_blank>comp</a>
<a href="javascript:showhide('away')">[Click to show/hide]</a>
<br>
<div id="away" style="display:none;">
<ul> 
<li>By workflow (%d) </li>
<a href="javascript:showhide('away_bywf')">[Click to show/hide]</a><div id="away_bywf" style="display:none;">
<ul>
%s
</ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('away_bycamp')">[Click to show/hide]</a><div id="away_bycamp" style="display:none;">
<ul>
%s
</ul></div>
</ul>
</div>
"""%(len(lines),
     len(lines),
     '\n'.join(lines),
     len(count_by_campaign),
     text_by_c
     ))


    lap ( 'done with away' )

    text=""
    count=0
    #for wf in session.query(Workflow).filter(Workflow.status == 'assistance-custodial').all():
    for wf in session.query(Workflow).filter(Workflow.status.startswith('assistance')).filter(Workflow.status.contains('custodial')).all():
        text+="<li> %s </li> \n"%wfl(wf,view=True,update=True,status=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worflow that are closing (%d)
<a href=closeout.html target=_blank>closeout</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('closing')">[Click to show/hide]</a>
<br>
<div id="closing" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    lap ( 'done with closing' )

    assistance_by_type = defaultdict(list)
    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all():
        assistance_by_type[wf.status].append( wf )
        count+=1
    for assistance_type in assistance_by_type:
        text += "<li> %s (%d) <a href=\"javascript:showhide('%s')\">[Click to show/hide]</a><br><div id=\"%s\" style=\"display:none;\"><ul>"%( assistance_type,
                                                                                                                                               len(assistance_by_type[assistance_type]),
                                                                                                                                               assistance_type,
                                                                                                                                               assistance_type,
                                                                                                                                               )
        for wf in assistance_by_type[assistance_type]:
            text+="<li> %s <hr></li> \n"%wfl(wf,view=True,within=True,status=True,update=True)
        text += "</ul></div></li>\n"
    html_doc.write("""Worflow which need assistance (%d)
<a href=assistance.html target=_blank>assistance</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/recoveror/last.log target=_blank>postlog</a>
<a href="javascript:showhide('assistance')">[Click to show/hide]</a>
<br>
<div id="assistance" style="display:none;">
<br>
<ul>
%s
</ul>
</div>
"""%(count, text))
    
    lap ( 'done with assistance' )

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status == 'close').all():
        text+="<li> %s </li> \n"%wfl(wf)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worflow ready to close (%d)
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('close')">[Click to show/hide]</a>
<br>
<div id="close" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    lap ( 'done with annoucing' )

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='trouble').all():
        text+="<li> %s </li> \n"%wfl(wf)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worflow with issue (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a>
<a href="javascript:showhide('trouble')">[Click to show/hide]</a>
<br>
<div id="trouble" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    lap ( 'done with trouble' )

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='forget').all():
        text+="<li> %s </li> \n"%wfl(wf)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Worflow to forget (%d) <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/lockor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('forget')">[Click to show/hide]</a>
<br>
<div id="forget" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    lap ( 'done with forget' )

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='done').all():
        text+="<li> %s </li> \n"%wfl(wf)#,ms=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Worflow through (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/lockor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('done')">[Click to show/hide]</a>
<br>
<div id="done" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    lap ( 'done with done' )


    wfs = session.query(Workflow).filter(Workflow.status.endswith('-unlock')).all()
    html_doc.write(" Workflows unlocked : %s <a href=logs/lockor/last.log target=_blank>log</a><br>"%(len(wfs)))
    lap ( 'done with unlocked' )



    text=""
    lines_thisweek=[]
    lines_lastweek=[]
    now = time.mktime(time.gmtime())
    this_week = int(time.strftime("%W",time.gmtime()))
    start_time_two_weeks_ago = time.mktime(time.gmtime(now - (20*24*60*60))) # 20
    last_week =  int(time.strftime("%W",time.gmtime(now - ( 7*24*60*60))))

    all_locks = json.loads(open('%s/globallocks.json'%monitor_dir).read())    
    waiting_custodial = json.loads(open('%s/waiting_custodial.json'%monitor_dir).read())
    all_pending_approval_custodial = dict([(k,item) for k,item in waiting_custodial.items() if 'nodes' in item and not any([node['decided'] for node in item['nodes'].values()]) ])
    n_pending_approval = len( all_pending_approval_custodial )
    #n_pending_approval = len([item for item in waiting_custodial.values() if 'nodes' in item and not any([node['decided'] for node in item['nodes'].values() ])])
    missing_approval_custodial = json.loads(open('%s/missing_approval_custodial.json'%monitor_dir).read())

    stuck_custudial = json.loads(open('%s/stuck_custodial.json'%monitor_dir).read())
    lagging_custudial = json.loads(open('%s/lagging_custodial.json'%monitor_dir).read())
    if len(stuck_custudial):
        stuck_string = ', <font color=red>%d appear to be <a href=stuck_custodial.json>stuck</a></font>'% len(stuck_custudial)
    else:
        stuck_string = ''
    if len(missing_approval_custodial):
        long_approve_string = ', <font color=red>%d more than %d days</font>'%( len(missing_approval_custodial), UC.get('transfer_timeout'))
    else:
        long_approve_string = ''
    

    output_within_two_weeks=session.query(Output).filter(Output.date>=start_time_two_weeks_ago).all()
    waiting_custodial_string=""
    waiting_custodial_strings=[]
    for ds in waiting_custodial:
        out = None
        ## lots of it will be within two weeks
        of = filter(lambda odb: odb.datasetname == ds, output_within_two_weeks)
        if of:
            out = of[0]
        else:
            out = session.query(Output).filter(Output.datasetname == ds).first()
        if out:
            info = waiting_custodial[out.datasetname]
            action = 'going'
            if out.datasetname in all_pending_approval_custodial:
                action = '<font color=red>pending</font>'
            try:
                size = str(info['size'])
            except:
                size = "x"

            destination = ",".join(info['nodes'].keys())
            if not destination:
                destination ='<font color=red>NO SITE</font>'

            a_waiting_custodial_string = "<li>on week %s : %s %s</li>"%(
                time.strftime("%W (%x %X)",time.gmtime(out.date)),
                ol(out.datasetname),
                ' %s [GB] %s to %s on %s (<a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/requestlist?dataset=%s&node=T*MSS">%d missing</a>)'%( size, action, destination, time.asctime(time.gmtime(info['checked'])), out.datasetname, info['nmissing'])
                )
            waiting_custodial_strings.append( (out.date, a_waiting_custodial_string) )

        waiting_custodial_strings.sort( key = lambda i:i[0] )
        waiting_custodial_string="\n".join( [i[1] for i in waiting_custodial_strings] )
    #start_time_two_weeks_ago = time.mktime(time.strptime("15-0-%d"%(this_week-2), "%y-%w-%W"))
    for out in output_within_two_weeks:
        if not out.workflow: 
            print "This is a problem with",out.datasetname
            continue
        if  out.workflow.status in ['done-unlock','done','clean','clean-out','clean-unlock']:
            custodial=''
            if out.datasetname in waiting_custodial:
                info = waiting_custodial[out.datasetname]
                try:
                    try:
                        size = str(info['size'])
                    except:
                        size = "x"
                    destination = ",".join(info['nodes'].keys())
                    if not destination:
                        destination ='<font color=red>NO SITE</font>'
                    action = 'going'
                    if out.datasetname in all_pending_approval_custodial:
                        action = '<font color=red>pending</font>'

                    
                    custodial=' %s [GB] %s to %s on %s (<a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/requestlist?dataset=%s&node=T*MSS">%d missing</a>)'%( size, action, destination, time.asctime(time.gmtime(info['checked'])), out.datasetname, info['nmissing'])
                except Exception as e:
                    #print info
                    #print str(e)
                    pass
            elif out.datasetname in all_locks:
                custodial='<font color=green>LOCKED</font>'
            out_week = int(time.strftime("%W",time.gmtime(out.date)))
            ##only show current week, and the previous.
            if last_week==out_week:
                lines_lastweek.append("<li>on week %s : %s %s</li>"%(
                        time.strftime("%W (%x %X)",time.gmtime(out.date)),
                        ol(out.datasetname),
                        custodial
                        )
                             )
            if this_week==out_week:

                lines_thisweek.append("<li>on week %s : %s %s</li>"%(
                        time.strftime("%W (%x %X)",time.gmtime(out.date)),
                        ol(out.datasetname),
                        custodial
                        )
                             )
    lines_thisweek.sort()
    lines_lastweek.sort()

    html_doc.write("""Output produced (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a>
<a href="javascript:showhide('output')">[Click to show/hide]</a>
<br>
<div id="output" style="display:none;">
<br>
<ul>
<li> %d waiting to go to tape</li>
<ul>
<li> %d waiting for tape approval%s</li>
<li> %d are not completed after %d days%s</li>
<li> Full list (%d) <a href="javascript:showhide('waiting-custodial')">[Click to show/hide]</a>
<div id="waiting-custodial" style="display:none;">
<ul>
%s
</ul>
</div>
</li>
</ul>
<li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul>
%s
</ul></div>
<li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul>
%s
</ul></div></div>
"""%( len(lines_lastweek)+len(lines_thisweek),
      len(waiting_custodial),
      n_pending_approval,long_approve_string,
      len(lagging_custudial),UC.get('transfer_timeout'),stuck_string,
      len(waiting_custodial),waiting_custodial_string,
      len(lines_lastweek),
     '\n'.join(lines_lastweek),
      len(lines_thisweek),
     '\n'.join(lines_thisweek))
                   )

    lap ( 'done with output' )


    html_doc.write("""Job installed
<a href="javascript:showhide('acron')">[Click to show/hide]</a>
<br>
<div id="acron" style="display:none;">
<br>
<pre>
%s
</pre>
"""%(os.popen('acrontab -l | grep Unified | grep -v \#').read()))


    per_module = defaultdict(list)
    for t in filter(None,os.popen('cat %s/logs/*/*.time'%monitor_dir).read().split('\n')):
        module_name,run_time,spend = t.split(':')
        ## then do what you want with it !
        if 'cleanor' in module_name: continue
        
        per_module[module_name].append( int(spend) )

    def display_time( sec ):
        """Format a duration in seconds as '<H> [h] <M> [m] <S> [s]'.

        Leading units that are zero are omitted (no hours field below one
        hour, no minutes field below one minute).  The seconds field is
        always present, so a zero duration reads '0 [s]' instead of an
        empty string.  `sec` may be an int or a float; '%d' truncates any
        fractional part of each field.
        """
        minutes, seconds = divmod(sec, 60)
        hours, minutes = divmod(minutes, 60)

        fields = []
        if hours:
            fields.append("%d [h]" % hours)
        # once hours are shown, minutes are shown too, even when zero
        if hours or minutes:
            fields.append("%d [m]" % minutes)
        # always report seconds so the caller never gets a blank value
        fields.append("%d [s]" % seconds)

        return " ".join(fields)

    html_doc.write("Module running time<ul>\n")
    for m,spends in per_module.items():
        avg = sum(spends)/float(len(spends))
        lasttime =  spends[-1]
        html_doc.write("<li>%s : last %s, avg %s</li>\n"%( m, display_time(lasttime), display_time(avg)))
    html_doc.write("</ul>")

    html_doc.write("Last running <pre>%s</pre><br>"%( os.popen("tac %s/logs/running | head -5"%monitor_dir).read() ))


    html_doc.write("Order in cycle <pre>%s</pre><br>"%( '\n'.join(map(lambda l : l.split('/')[-1].replace('.py',''), filter(lambda l : not l.startswith('#') and 'Unified' in l and 'py' in l.split('/')[-1], open('%s/WmAgentScripts/cycle.sh'%base_dir).read().split('\n')))) ))


    html_doc.write("</div>\n")
    lap ( 'done with jobs' )


    text=""
    count=0
    for (c,info) in campaignInfo().campaigns.items():
        #if 'go' in info and info['go']:
        text+="<li>%s <br> <pre>%s</pre>  </li>"%( c, json.dumps( info, indent=2))
        count+=1

    html_doc.write("""Campaign configuration
<a href="javascript:showhide('campaign')">[Click to show/hide]</a>
<br>
<div id="campaign" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""%(text))

    text=""
    count=0
    n_column = 4
    SI = siteInfo()
    date1 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime(time.mktime(time.gmtime())-(15*24*60*60)) ) ## 15 days
    date2 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime())
    for t in SI.types():
        text+="<li>%s<table border=1>"%t
        c=0
        for site in getattr(SI,t):
            cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else 'N/A'
            disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE(site) in SI.disk else 'N/A'
            if c==0:
                text+="<tr>"
            if not disk:
                ht_disk = '<font color=red>Disk available: %s</font>'%disk
            else:
                ht_disk = 'Disk available: %s'%disk

            text+='<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a><br><a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#user=&refresh=0&table=Jobs&p=1&records=25&activemenu=1&site=%s&submissiontool=wmagent&check=submitted&sortby=activity&scale=linear&bars=20&data1=%s&date2=%s">dashb</a><br>CPU pledge: %s<br>%s</td>'%(site,site,site,site,site,date1,date2,cpu,ht_disk)
            if c==n_column:
                c=0
            else:
                c+=1
        text+="</table></li>"

    text += "<li> Sites in auto-approved transfer<ul>"
    for site in sorted(SI.sites_auto_approve):
        text+="<li>%s"% site
    text += "</ul></li>"

    text += "<li> Sites with vetoe transfer<ul>"
    for site in sorted(SI.sites_veto_transfer):
        text+="<li>%s"% site
    text += "</ul></li>"

    text += "<li> Sites banned from production<ul>"
    for site in sorted(SI.sites_banned):
        text+="<li>%s"% site
    text += "</ul></li>"

    text += "<li> Approximate Free Tape<ul>"
    for mss in SI.storage:
        waiting = 0
        try:
            waiting = float(os.popen("grep '%s is pending . Created since' %s/logs/lockor/last.log  -B 3 | grep size | awk '{ sum+=$6 ; print sum }' | tail -1" % (mss,monitor_dir)).readline())
        except Exception as e:
            print str(e)

        oldest = ""
        os.system('grep pending %s/logs/lockor/last.log | sort -u > %s/logs/pending.log'%(monitor_dir,monitor_dir))
        try:
            oldest = os.popen("grep '%s is pending . Created since ' %s/logs/lockor/last.log | sort | awk '{print $10, $11, $12, $13, $14 }' | head -1"% (mss,monitor_dir)).readline()
        except Exception as e:
            print str(e)
        waiting /= 1024.
        text+="<li>%s : %d [TB]. Waiting for approval %d [TB] since %s </li>"%(mss, SI.storage[mss], waiting, oldest)
    text += "</ul></li>"

    lap ( 'done with sites' )

    open('%s/siteInfo.json'%monitor_dir,'w').write(json.dumps(dict([(t,getattr(SI,t)) for t in SI.types()]),indent=2))

    lap ( 'done with sites json' )

    chart_data = defaultdict(list)
    for site in SI.quota:
        chart_data[site].append("""
var data_%s = google.visualization.arrayToDataTable([ 
['Overall', 'Space in TB'],
//['Quota' , %s],
['Locked' , %s],
['Free' , %s]
]);
"""%( site,
      SI.quota[site], SI.locked[site], SI.disk[site],
      ))
        chart_data[site].append("""
var chart_%s = new google.visualization.PieChart(document.getElementById('donutchart_%s'));
chart_%s.draw(data_%s, {title: '%s %s [TB]', pieHole:0.4, slices:{0:{color:'red'},1:{color:'green'}}});
"""%(site,site,
     site,site,
     site,SI.quota[site]))
        chart_data[site].append("""
<div id="donutchart_%s" style="height: 200px;width: 300px"></div>
"""%(site))

        
    ## make the locked/available donut chart
    donut_html = open('%s/locked.html'%monitor_dir,'w')
    tables = "\n".join([info[0] for site,info in chart_data.items()])
    draws = "\n".join([info[1] for site,info in chart_data.items()])
    divs = "\n".join([info[2] for site,info in chart_data.items()])

    
    divs_table="<table border=0>"
    for c,site in enumerate(sorted(chart_data.keys())):
        if c%5==0:
            divs_table += "<tr>"
        divs_table += "<td>%s</td>"%(chart_data[site][2])
    divs_table += "</table>"

    donut_html.write("""
<html>
  <head>
    <script type="text/javascript" src="https://www.google.com/jsapi"></script>
    <script type="text/javascript">
      google.load("visualization", "1", {packages:["corechart"]});
      google.setOnLoadCallback(drawChart);
      function drawChart() {
%s

%s
      }
    </script>
  </head>
  <body>
%s
  </body>
</html>
"""%( tables,draws,divs_table   )
                     )
    donut_html.close()

    html_doc.write("""Site configuration
<a href="javascript:showhide('site')">[Click to show/hide]</a>
<br>
<div id="site" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""%(text))

    lap ( 'done with space' )


    text = ""
    for param in UC.configs:
        text +="<li>%s</li><ul>\n"% param
        for sub in sorted(UC.configs[param].keys()):
            text +="<li> %s : %s </li>\n"%( sub, UC.configs[param][sub] )
        text += '</ul>\n'
        
    html_doc.write("""Unified configuration
<a href="javascript:showhide('config')">[Click to show/hide]</a>
<br>
<div id="config" style="display:none;">
<br>
<ul>
%s
</ul></div>                                                                                                                                                                                                                                                                                                                
"""%(text))

    lap ( 'done with configuration' )


    print "... done with status page."
    html_doc.write("""
</body>
</html>
""")

    html_doc.close()
    ## and put the file in place
    os.system('mv %s/index.html.new %s/index.html'%(monitor_dir,monitor_dir))

        
    statuses = json.loads(open('%s/statusmon.json'%monitor_dir).read())
    s_count = defaultdict(int)
    now = time.mktime(time.gmtime())
    for wf in session.query(Workflow).all():
        s_count[wf.status]+=1
    statuses[now] = dict( s_count )
    ## remove old entries
    for t in statuses.keys():
        if (now-float(t)) > 7*24*60*60:
            statuses.pop(t)
    open('%s/statusmon.json'%monitor_dir,'w').write( json.dumps( statuses , indent=2))

    html_doc = open('%s/statuses.html'%monitor_dir,'w')
    html_doc.write("""                                                                                                                                                                                                                                                                                                      <html>        
<table border=1>
<thead>
<tr>
<th> workflow </th><th> status </th><th> wm status</th>
</tr>
</thead>
""")
    wfs = {}
    for wfo in session.query(Workflow).all():
        ## pass all that is unlocked and considered it gone
        wfs[wfo.name] = (wfo.status,wfo.wm_status)

    open('%s/statuses.json'%monitor_dir,'w').write(json.dumps( wfs ))
    for wfn in sorted(wfs.keys()):
        ## pass all that is unlocked and considered it gone
        if 'unlock' in wfs[wfn][0]: continue
        html_doc.write('<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n'%( wfn, wfn, wfs[wfn][0],  wfs[wfn][1]))
    html_doc.write("</table>")
    html_doc.write("<br>"*100)
    html_doc.write("end of page</html>")
    html_doc.close()
Exemplo n.º 36
0
# Script fragment: builds the set of workflow names that may still have logs
# worth keeping, then scans the report/joblogs/condorlogs directories for
# entries that match none of them.
#os.system('Unified/assignor.py RunIISummer16MiniAODv2')
#os.system('Unified/assignor.py --from_status staging RunIISummer16DR80Premix')
#os.system('Unified/assignor.py --from_status staging RunIISummer16DR80-')

# Stop right away (exit code 0) when the component check does not pass.
up = componentInfo(mcm=False, soft=['mcm'])                                 
if not up.check(): sys.exit(0)

url = reqmgr_url

# Names of workflows recorded in the local DB with a status starting with
# 'away' or 'assistance'.
may_have_one=set()
may_have_one.update([wfo.name for wfo in session.query(Workflow).filter(Workflow.status.startswith('away')).all()])
may_have_one.update([wfo.name for wfo in session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()])

# Detailed request records for everything currently running or completed.
wfs = []
wfs.extend( getWorkflows(url, 'running-open', details=True))
wfs.extend( getWorkflows(url, 'running-closed', details=True))
wfs.extend( getWorkflows(url, 'completed', details=True))

# For each tracked request, also add whatever getWorkflowById returns for its
# PrepID -- presumably the names of the other workflows sharing that PrepID;
# TODO confirm against getWorkflowById.
may_have_one_too = set()
for wf in wfs:
    if wf['RequestName'] in may_have_one:
        #print wf['RequestName'],"and familly"
        may_have_one_too.update( getWorkflowById(url, wf['PrepID']) )
        
may_have_one.update( may_have_one_too )

# Walk every entry listed under <monitor_dir>/<logtype>/ and pick out those
# whose path contains none of the collected names (plain substring match).
for logtype in ['report','joblogs','condorlogs']:
    for d in filter(None,os.popen('ls -d %s/%s/*'%(monitor_dir,logtype)).read().split('\n')):
        if not any([m in d for m in may_have_one]):
            ## that can be removed
            # NOTE(review): the body of this branch is not present in this excerpt.
Exemplo n.º 37
0
def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock():  return


    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    wfs=[]
    if options.new:
        ## get all in running and check

        ## you want to intersect with what is completed !
        if options.strict:
            completed_wfi = getWorkflows(url, status='completed')
            for wfo in session.query(Workflow).filter(Workflow.status == 'away').all():
                if wfo.name in completed_wfi:
                    wfs.append( wfo )
                else:
                    print wfo.name,"is not completed"
                    sendLog('checkor','%s is not completed'%( wfo.name))
        else:
            wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )

    if options.current:
        ## recheck those already there, probably to just pass them along
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )

    if options.old:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = global_SI
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        """Return the campaign name associated with an output dataset.

        The campaign is read from the dataset name itself: the third
        '/'-separated field, up to its first '-'.  When the name cannot be
        parsed that way, the 'Campaign' entry of wfi.request is used if
        present.  Returns None when neither source yields a value.
        """
        try:
            return output.split('/')[2].split('-')[0]
        except (AttributeError, IndexError, TypeError):
            # Only parsing failures fall through to the request-level value;
            # KeyboardInterrupt/SystemExit are no longer swallowed here.
            if 'Campaign' in wfi.request:
                return wfi.request['Campaign']
        return None

    ## retrieve bypass and onhold configuration
    bypasses = []
    holdings = []
    #try:
    #    already_notified = json.loads(open('already_notifified.json').read())
    #except:
    #    print "no record of already notified workflow. starting fresh"
    #    already_notified = []

    for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            holdings.extend( json.loads(open(holding_file).read()))
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        bypasses.extend( mcm_force )

    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    print len(wfs),"to consider, pausing for",sleep_time

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        time.sleep( sleep_time )
        
        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False
        pids = wfi.getPrepIDs()
        bypass_by_mcm = False
        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
            if bypass in pids:
                wfi.sendLog('checkor',"we can bypass checks on %s because of prepid %s "%( wfo.name, bypass))
                bypass_checks = True
                bypass_by_mcm = True
                break
        
        #if not CI.go( wfi.request['Campaign'] ) and not bypass_checks:
        #    print "No go for",wfo.name
        #    wfi.sendLog('checkor',"No go for %s"%wfi.request['Campaign'])
        #    continue


        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        campaigns = {}
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
            elif member['RequestStatus']==None:
                print member['RequestName'],"is not real"
                pass
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')

        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = int(wfi.request['Task1']['RequestNumEvents'])

        fractions_pass = {}
        over_100_pass = False
        (lhe,prim,_,_) = wfi.getIO()
        if lhe or prim: over_100_pass = False

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )
            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )

            fractions_pass[output] = 0.95
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            print wfo.name,"is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]):
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]
            #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWOADSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"
                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            custodials[custodial].append( output )
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        fraction_invalid = 0.01
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        print "These %d files are missing in phedex"%(len(missing_phedex))
                        print "\n".join( missing_phedex )
                    if missing_dbs:
                        print "These %d files are missing in dbs"%(len(missing_dbs))
                        print "\n".join( missing_dbs )

            #if not bypass_checks:
            ## I don't think we can by pass this
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing or bypass_checks:
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and bypass_by_mcm:
                        ## shoot large on all prepids
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            ## full known list
            #recovering # has active ACDC
            ##OUT #recovered #had inactive ACDC
            #recovery #not over the pass bar
            #over100 # over 100%
            #biglumi # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear
            #filemismatch # there is a dbs/phedex mismatch
            #duplicates #a lumi section is there twice

            ## manual is not added yet, and should be so by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## should not set -recovery to anything that add ACDC already
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')


            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    print "setting",wfo.name,"to",wfo.status
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec:
        #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
Exemplo n.º 38
0
def injector(url, options, specific):
    """Register newly requested workflows and line up replacements for troubled ones.

    The function works in three stages:

    1. Collect workflow names with getWorkflows() for the request status
       `options.wmstatus` (default user, then the ReReco, TaskChain and
       StoreResults requestors) and add a Workflow row, with status
       `options.setstatus`, for each one that the session does not know yet.
       A workflow is not added when a related Workflow row has a status
       other than forget/trouble/forget-unlock/forget-out-unlock, when one
       of its input datasets is not 'VALID', or when an input has no file in
       phedex (the last check is waived for RequestType 'StoreResults').
    2. Call rejector.py for every workflow selected for StepChain conversion
       and, when the 'mcm' component is up, call invalidator(url).
    3. For every Workflow row in status 'trouble', look among the workflows
       sharing its PrepID for a replacement; the last acceptable one takes
       over (the troubled row is set to 'forget' and its TransferImp rows
       are deactivated and re-created for the replacement).

    Args:
        url: passed through unchanged to the helper calls (getWorkflows,
            workflowInfo, getWorkflowById, getDatasetFiles, ...).
        options: object exposing wmstatus, user, user_relval,
            user_storeresults, no_convert, setstatus, invalidate and replace.
        specific: when truthy, only workflows whose name contains this
            value are handled.

    Returns:
        None. The function returns early, doing nothing, when the module
        lock call is truthy or when the component check fails.
    """
    mlock = moduleLock()
    if mlock():
        return

    components = componentInfo(soft=['mcm', 'wtc', 'jira'])
    if not components.check():
        return
    use_mcm = components.status['mcm']

    UC = unifiedConfiguration()
    transform_keywords = UC.get('convert_to_stepchain')

    def requestors(from_command_line, config_key):
        ## a comma separated list given through the options wins over the configuration
        if from_command_line:
            return from_command_line.split(',')
        return UC.get(config_key)

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    for requestor in UC.get("user_rereco"):
        workflows.extend(getWorkflows(url, status=options.wmstatus, user=requestor, rtype="ReReco"))
    for requestor in requestors(options.user_relval, "user_relval"):
        workflows.extend(getWorkflows(url, status=options.wmstatus, user=requestor, rtype="TaskChain"))
    for requestor in requestors(options.user_storeresults, "user_storeresults"):
        workflows.extend(getWorkflows(url, status=options.wmstatus, user=requestor, rtype="StoreResults"))

    print("%d in line" % len(workflows))
    cannot_inject = set()
    to_convert = set()
    status_cache = defaultdict(str)
    ## a relative sitting in one of these statuses does not prevent the injection
    harmless_statuses = ['forget', 'trouble', 'forget-unlock', 'forget-out-unlock']

    ## browse the collected requests and insert those we do not have yet
    for wf_name in workflows:
        if specific and specific not in wf_name:
            continue
        if session.query(Workflow).filter(Workflow.name == wf_name).first():
            ## already have it
            continue

        wfi = workflowInfo(url, wf_name)
        can_add = True

        ## first try at finding relatives: rows whose name contains the PrepID
        relatives = session.query(Workflow).filter(Workflow.name.contains(wfi.request['PrepID'])).all()
        if not relatives:
            ## second try: go through every request sharing one of the prepids
            pids = wfi.getPrepIDs()
            remote_relatives = []
            for pid in pids:
                remote_relatives.extend(getWorkflowById(url, pid, details=True))

            print("%d members" % len(remote_relatives))
            relatives = []
            for remote in remote_relatives:
                remote_name = remote['RequestName']
                remote_pids = workflowInfo(url, remote_name, request=remote).getPrepIDs()
                if set(pids) != set(remote_pids):
                    continue
                ## same set of prepids: this is a real match
                relatives.extend(session.query(Workflow).filter(Workflow.name == remote_name).all())

        for known in relatives:
            if not known or known.status in harmless_statuses:
                continue
            ## a relative is already being handled
            refusal = "Should not put %s because of %s %s" % (wf_name, known.name, known.status)
            wfi.sendLog('injector', refusal)
            sendLog('injector', refusal, level='critical')
            print(refusal)
            cannot_inject.add(wf_name)
            can_add = False

        ## validity of the input datasets
        _, prim, par, sec = wfi.getIO()
        for dataset in list(prim) + list(par) + list(sec):
            if dataset not in status_cache:
                status_cache[dataset] = getDatasetStatus(dataset)
            dataset_status = status_cache[dataset]
            if dataset_status != 'VALID':
                wfi.sendLog('injector', "One of the input is not VALID. %s : %s" % (dataset, dataset_status))
                sendLog('injector', "One of the input of %s is not VALID. %s : %s" % (wf_name, dataset, dataset_status), level='critical')
                can_add = False

            ## check for any file in phedex, to verify existence
            _, ph_files, _, _ = getDatasetFiles(url, dataset)
            if not ph_files:
                ## the request type is only looked at (and defaulted) when no file was found
                if wfi.request.setdefault('RequestType', None) != 'StoreResults':
                    missing = "One of the input has no file in phedex: %s" % dataset
                    wfi.sendLog('injector', missing)
                    sendLog('injector', missing, level='critical')
                    can_add = False

        if not can_add:
            continue

        ## temporary hack to transform specific taskchain into stepchains
        good_for_stepchain = wfi.isGoodToConvertToStepChain(keywords=transform_keywords)
        if not options.no_convert and good_for_stepchain and not wfi.isRelval():
            to_convert.add(wf_name)
            notice = 'Transforming %s TaskChain into StepChain' % wf_name
            wfi.sendLog('injector', notice)
            sendEmail('convertion to stepchain', notice)

        wfi.sendLog('injector', "considering %s" % wf_name)
        session.add(Workflow(name=wf_name, status=options.setstatus, wm_status=options.wmstatus))
        session.commit()
        time.sleep(0.5)

    if cannot_inject:
        sendLog('injector', 'These workflow cannot be added in because of duplicates \n\n %s' % ('\n'.join(cannot_inject)), level='critical')

    for wf_name in to_convert:
        os.system('./Unified/rejector.py --clone --to_step --comments \"Transform to StepChain\" %s' % wf_name)

    ## passing a round of invalidation of what needs to be invalidated
    ## NOTE(review): "or True" makes options.invalidate irrelevant here - confirm this is intended
    if use_mcm and (options.invalidate or True):
        invalidator(url)

    no_replacement = set()
    ## request statuses that disqualify a relative from acting as replacement
    unusable_statuses = ['None', None, 'new', 'rejected', 'rejected-archived', 'aborted', 'aborted-archived']

    ## pick up replacements
    for troubled in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        print(troubled.name)
        if specific and specific not in troubled.name:
            continue
        print(troubled.name)
        wfi = workflowInfo(url, troubled.name)
        wl = wfi.request
        familly = getWorkflowById(url, wl['PrepID'])
        candidates = []
        for member in familly:
            if member == troubled.name:
                continue
            workload = getWorkLoad(url, member)
            if options.replace:
                ## an explicit replacement was requested: nothing else qualifies
                if member != options.replace:
                    continue
            elif (workload['RequestDate'] < wl['RequestDate']
                  or workload['RequestType'] == 'Resubmission'
                  or workload['RequestStatus'] in unusable_statuses):
                continue
            candidates.append(workload)

        if not candidates:
            if wfi.isRelval():
                wfi.sendLog('injector', 'the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this. Setting forget')
                troubled.status = 'forget'
                session.commit()
            else:
                wfi.sendLog('injector', 'the workflow was found in trouble with no replacement')
                no_replacement.add(troubled.name)
            continue

        wfi.sendLog('injector', 'the workflow was found in trouble and has a replacement')

        print("%s has %d familly members" % (troubled.name, len(familly)))
        print("%s has %d true familly members" % (troubled.name, len(candidates)))

        ## we cannot have more than one of them !!! pick the last one
        if len(candidates) > 1:
            candidate_names = [candidate['RequestName'] for candidate in candidates]
            sendLog('injector', 'Multiple wf in line, will take the last one for %s \n%s' % (troubled.name, ', '.join(candidate_names)), level='critical')

        chosen = candidates[-1]
        member = chosen['RequestName']
        replacement = session.query(Workflow).filter(Workflow.name == member).first()
        relink_transfers = True
        if not replacement:
            sendLog('injector', "putting %s as replacement of %s" % (member, troubled.name))
            if chosen['RequestStatus'] in ['assignment-approved']:
                local_status = 'considered'
            else:
                local_status = 'away'
            replacement = Workflow(name=member, status=local_status, wm_status=chosen['RequestStatus'])
            troubled.status = 'forget'
            session.add(replacement)
        elif replacement.status == 'forget':
            ## the replacement itself was dropped: leave the troubled workflow as it is
            relink_transfers = False
        else:
            sendLog('injector', "getting %s as replacement of %s" % (replacement.name, troubled.name))
            troubled.status = 'forget'

        if relink_transfers:
            ## get all transfer working for the old workflow
            for old_link in session.query(TransferImp).filter(TransferImp.workflow_id == troubled.id).all():
                existing = session.query(TransferImp).filter(TransferImp.phedexid == old_link.phedexid).filter(TransferImp.workflow_id == replacement.id).all()
                ## disable the old one
                old_link.active = False
                if not existing:
                    ## create the transfer object for the new dependency
                    session.add(TransferImp(phedexid=old_link.phedexid, workflow=replacement))
                session.commit()

        session.commit()

    if no_replacement:
        sendLog('injector', 'workflow with no replacement\n%s \n are dangling there' % ('\n'.join(no_replacement)), level='critical')
Exemplo n.º 39
0
def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock():  return


    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    wfs=[]
    if options.new:
        ## get all in running and check

        ## you want to intersect with what is completed !
        if options.strict:
            completed_wfi = getWorkflows(url, status='completed')
            for wfo in session.query(Workflow).filter(Workflow.status == 'away').all():
                if wfo.name in completed_wfi:
                    wfs.append( wfo )
                else:
                    print wfo.name,"is not completed"
                    sendLog('checkor','%s is not completed'%( wfo.name))
        else:
            wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )

    if options.current:
        ## recheck those already there, probably to just pass them along
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )

    if options.old:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    ## retrieve bypass and onhold configuration
    bypasses = []
    forcings = []
    overrides = getForceCompletes()
    holdings = []

    for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('prozober','*****@*****.**')]:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            holdings.extend( json.loads(open(holding_file).read()))
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        forcings = mcm.get('/restapi/requests/forcecomplete')
        if forcings:
            sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings)))


    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    print len(wfs),"to consider, pausing for",sleep_time
    max_per_round = UC.get('max_per_round').get('checkor',None)
    if max_per_round and not spec: wfs = wfs[:max_per_round]

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        time.sleep( sleep_time )
        
        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False

        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
        pids = wfi.getPrepIDs()
        force_by_mcm = False
        force_by_user = False
        for force in forcings:
            if force in pids:
                wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force))
                bypass_checks = True
                force_by_mcm = True
                break
        for user in overrides:
            for force in overrides[user]:
                if force in wfo.name:
                    wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user))
                    bypass_checks = True
                    force_by_user = True
                    break
        
        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        campaigns = {}
        expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] )
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        forced_already=False
        acdc_bads = []
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if 'OriginalRequestName' in member and member['OriginalRequestName'] != wfo.name: continue
            if member['RequestStatus'] == None: continue
            if not set(member['OutputDatasets']).issubset( set(expected_outputs)):
                if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']:
                    ##this is not good at all
                    wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] )
                    acdc_bads.append( member['RequestName'] )
                    is_closing = False
                    assistance_tags.add('manual')
                continue
            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
                if (force_by_mcm or force_by_user) and not forced_already:
                    wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name)
                    wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False)
                    forceComplete(url, wfi)
                    forced_already=True
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')
        if acdc_bads:
            sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) ))

        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = wfi.request['Task1']['RequestNumEvents']
            for i in range(1,20):
                if 'Task%d'%i in wfi.request:
                    ## this is wrong ibsolute
                    if 'FilterEfficiency' in wfi.request['Task%d'%i]:
                        event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency'])
            event_expected = int(event_expected)

        fractions_pass = {}
        over_100_pass = False
        (lhe,prim,_,_) = wfi.getIO()
        if lhe or prim: over_100_pass = False

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )

            fractions_pass[output] = 0.95
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            possible_recoveries = wfi.getRecoveryDoc()
            if possible_recoveries == []:
                wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))
                sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)
                bypass_checks = True
            else:
                wfi.sendLog('checkor','%s is not completed  \n%s \n%s'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))

            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]):
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]
            #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWOADSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"
                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            custodials[custodial].append( output )
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        fraction_invalid = 0.01
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex),
                                    "\n".join( missing_phedex )))
                    if missing_dbs:
                        wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs),
                                    "\n".join( missing_dbs )))

            #if not bypass_checks:
            ## I don't think we can by pass this
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing or bypass_checks:
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))
            now = time.gmtime()
            rec['timestamp'] = time.mktime(now)
            rec['updated'] = time.asctime(now)+' (GMT)'

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and force_by_mcm:
                        ## shoot large on all prepids, on closing the wf
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            ## full known list
            #recovering # has active ACDC
            ##OUT #recovered #had inactive ACDC
            #recovery #not over the pass bar
            #over100 # over 100%
            #biglumi # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear
            #filemismatch # there is a dbs/phedex mismatch
            #duplicates #a lumi section is there twice

            ## manual is not added yet, and should be so by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## should not set -recovery to anything that add ACDC already
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')


            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    print "setting",wfo.name,"to",wfo.status
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec:
        #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
Exemplo n.º 40
0
def transferor(url, specific=None, talk=True, options=None):
    if userLock(): return
    mlock = moduleLock()
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc', 'jira'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    #NLI = newLockInfo()
    #if not NLI.free(): return
    LI = lockInfo()
    if not LI.free(): return

    mcm = McMClient(dev=False)
    dss = DSS()

    #allowed_secondary = UC.get('')
    print "counting all being handled..."
    being_handled = len(
        session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('stag')).all())
    being_transfered = len(
        session.query(Workflow).filter(Workflow.status == 'staging').all())
    #being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance-')).filter(
                ~Workflow.status.contains('custodial')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0, max_to_handle - being_handled)
    allowed_to_transfer = max(0, max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle <= wf_buffer:  ## buffer for having several wf per transfer
        print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer"
    else:
        print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer"
    else:
        print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer"

    print "... done"

    all_transfers = defaultdict(list)
    workflow_dependencies = defaultdict(
        set)  ## list of wf.id per input dataset
    wfs_and_wfh = []
    max_per_round = UC.get('max_per_round').get('transferor', None)

    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    all_to_include = session.query(Workflow).filter(
        Workflow.status.startswith('considered')).all()
    if len(cache) > 2000:
        max_to_include = max_per_round
        random.shuffle(cache)  ## randomize first by wf name
        cache = sorted(cache, key=lambda r: r['RequestPriority'],
                       reverse=True)  ## order by prio
        highest = [r['RequestName'] for r in cache[:max_to_include]]
        all_to_include = [wfo for wfo in all_to_include if wfo.name in highest]
        print "limiting what to consider to", max_to_include, "because there is too much stuff going on. Got", len(
            all_to_include)

    for wfo in all_to_include:
        print "\t", wfo.name
        if specific and not specific in wfo.name: continue
        cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append((wfo,
                                workflowInfo(url,
                                             wfo.name,
                                             spec=False,
                                             request=cache_r[0])))
        else:
            wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False)))
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = defaultdict(float)
    ignored_input_sizes = defaultdict(float)
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority = None
    min_transfer_priority = None
    print "getting all wf in staging ..."
    #stucks = json.loads(open('%s/stuck_transfers.json'%monitor_pub_dir).read())
    stucks = json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir))

    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfh = workflowInfo(url, wfo.name, spec=False)
        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        print wfo.name, "staging"
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        for site in sites_allowed:  ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        blocks = wfh.getBlocks()
        for prim in primary:
            ds_s = dss.get(prim, blocks=blocks)
            if prim in stucks:
                wfh.sendLog(
                    'transferor',
                    "%s appears stuck, so not counting it %s [GB]" %
                    (prim, ds_s))
                ignored_input_sizes[prim] = max(ds_s,
                                                ignored_input_sizes[prim])
            else:
                input_sizes[prim] = max(ds_s, input_sizes[prim])
                wfh.sendLog('transferor',
                            "%s needs %s [GB]" % (wfo.name, ds_s))
        if in_transfer_priority == None:
            in_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            in_transfer_priority = max(in_transfer_priority,
                                       int(wfh.request['RequestPriority']))
        if min_transfer_priority == None:
            min_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            min_transfer_priority = min(min_transfer_priority,
                                        int(wfh.request['RequestPriority']))

    try:
        print "Ignored input sizes"
        ignored_values = list(ignored_input_sizes.items())
        ignored_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, ignored_values))
        print "Considered input sizes"
        considered_values = list(input_sizes.items())
        considered_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, considered_values))
    except Exception as e:
        print "trying to print the summary of input size"
        print str(e)

    print "... done"
    print "Max priority in transfer already", in_transfer_priority
    print "Min priority in transfer already", min_transfer_priority
    print "transfers per sites"
    print json.dumps(transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())

    ## list the size of all inputs
    primary_input_per_workflow_gb = defaultdict(float)
    print "getting all input sizes ..."
    input_blocks = {}
    for (wfo, wfh) in wfs_and_wfh:
        (_, primary, _, _) = wfh.getIO()
        blocks = wfh.getBlocks()
        input_blocks[wfo.name] = blocks
        for prim in primary:
            ## do not count it if it appears stalled !
            prim_size = dss.get(prim, blocks=blocks)
            input_sizes[prim] = max(prim_size, input_sizes[prim])
            primary_input_per_workflow_gb[wfo.name] += prim_size
    print "... done"

    # shuffle first by name
    random.shuffle(wfs_and_wfh)

    # Sort smallest transfers first; allows us to transfer as many as possible workflows.
    def prio_and_size(i, j):
        """Old-style ``cmp`` comparator for two ``(wfo, wfh)`` pairs.

        The pairs are compared on ``int(wfh.request['RequestPriority'])``.
        When both priorities are equal, the comparison falls back to the
        integer-truncated entries of ``primary_input_per_workflow_gb``
        (0 when the workflow name has no entry), with the operands swapped:
        the pair with the larger entry compares as the smaller one, so a
        reversed sort puts the smaller input first within a priority.

        Returns the result of ``cmp``: negative, zero or positive.
        """
        left_prio = int(i[1].request['RequestPriority'])
        right_prio = int(j[1].request['RequestPriority'])
        if left_prio != right_prio:
            return cmp(left_prio, right_prio)
        # same priority: operands deliberately swapped to invert the size order
        right_gb = int(primary_input_per_workflow_gb.get(j[0].name, 0))
        left_gb = int(primary_input_per_workflow_gb.get(i[0].name, 0))
        return cmp(right_gb, left_gb)

    #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True)
    #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) ))
    #sort by priority higher first
    wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[
        'RequestPriority']), int(j[1].request['RequestPriority'])),
                     reverse=True)

    if min_transfer_priority == None or in_transfer_priority == None:
        print "nothing is lining up for transfer"
        sendLog(
            "transferor",
            "No request in staging, using first request to set priority limit")
        if len(wfs_and_wfh):
            min_transfer_priority = wfs_and_wfh[0][1].request[
                'RequestPriority']
            in_transfer_priority = wfs_and_wfh[0][1].request['RequestPriority']
        else:
            return

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer" % (
        cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load" % (
        cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer" % (
        st_in_transfer_already)
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % (
        st_to_transfer)

    grand_total = sum(input_sizes.values())
    to_transfer = grand_total - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB

    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered" % in_transfer_already
    print "%15.4f GB is the current requested transfer load" % to_transfer
    print "%15.4f GB is the global transfer limit" % grand_transfer_limit
    print "%15.4f GB is the available limit" % transfer_limit

    max_staging_per_site = options.maxstagingpersite

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer = 0  ## so that we can count'em
    passing_along = 0
    transfer_sizes = defaultdict(float)
    went_over_budget = False
    destination_cache = {}
    no_goes = set()

    if max_per_round and not spec:
        wfs_and_wfh = wfs_and_wfh[:max_per_round]

    for (wfo, wfh) in wfs_and_wfh:
        print wfo.name, "to be transfered with priority", wfh.request[
            'RequestPriority']

        if wfh.request['RequestStatus'] != 'assignment-approved':
            if wfh.request['RequestStatus'] in [
                    'aborted', 'rejected', 'rejected-archived',
                    'aborted-archived'
            ]:
                if wfh.isRelval():
                    wfo.status = 'forget'
                else:
                    wfo.status = 'trouble'  ## so that we look or a replacement
            else:
                wfo.status = 'away'
            wfh.sendLog(
                'transferor', '%s in status %s, setting %s' %
                (wfo.name, wfh.request['RequestStatus'], wfo.status))
            continue

        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        blocks = input_blocks.get(wfo.name, wfh.getBlocks())
        if blocks:
            print "Reading only", len(blocks), "blocks in input"
        this_load = sum([dss.get(prim, blocks=blocks) for prim in primary])
        no_budget = False
        if (this_load
                and (sum(transfer_sizes.values()) + this_load > transfer_limit
                     or went_over_budget)):
            if went_over_budget:
                wfh.sendLog('transferor', "Transfer has gone over bubget.")
            else:
                wfh.sendLog('transferor', "Transfer will go over bubget.")
            wfh.sendLog(
                'transferor',
                "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"
                % (this_load, sum(transfer_sizes.values()), transfer_limit))
            #if sum(transfer_sizes.values()) > transfer_limit:
            went_over_budget = True
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(
                        wfh.request['RequestPriority']
                ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over budget" %
                        (wfh.request['RequestPriority'], in_transfer_priority))
                else:
                    if not options.go:
                        wfh.sendLog(
                            'transferor',
                            "%s minimum priority %s < %s : stop" %
                            (min_transfer_priority,
                             wfh.request['RequestPriority'],
                             in_transfer_priority))
                        no_budget = True

        ## throtlle by campaign go
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True
            no_goes.add(wfo.name)

        allowed_secondary = {}
        overide_parameters = {}
        check_secondary = (not wfh.isRelval())
        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                overide_parameters.update(CI.campaigns[campaign])
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'transferor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('transferor',
                            'These data tiers %s are not allowed in %s' %
                            (','.join(banned_tier), wfo.name),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('transferor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('transferor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            for sec in secondary:
                if sec in allowed_secondary:
                    overide_parameters.update(allowed_secondary[sec])

        if 'SiteWhitelist' in overide_parameters:
            sites_allowed = list(
                set(sites_allowed) & set(overide_parameters['SiteWhitelist']))
            wfh.sendLog(
                'transferor',
                'Intersecting with the overriding whitelist parameters, allowed sites become {}'
                .format(sites_allowed))

        if no_go:
            continue

        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_handle))
                else:
                    wfh.sendLog(
                        'transferor',
                        " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"
                        % (max_to_handle, being_handled, passing_along))
                    if not options.go:
                        ## should not allow to jump that fence
                        break

        if this_load and needs_transfer >= allowed_to_transfer:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_transfer))
                else:
                    wfh.sendLog(
                        'transferor',
                        "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"
                        % (max_to_transfer, being_transfered, needs_transfer))
                    if not options.go:
                        no_budget = True

        if no_budget:
            continue
        #    break ## try this for a while to make things faster

        ## the site white list considers site, campaign, memory and core information
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')

        for dataset in list(primary) + list(parent) + list(secondary):
            LI.lock(dataset, reason='staging')

        if not sites_allowed:
            wfh.sendLog('transferor', "not possible site to run at")
            #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            sendLog('transferor',
                    "%s has no possible sites to run at" % (wfo.name),
                    level='critical')
            continue

        can_go = True
        staging = False
        allowed = True
        primary_destinations = set()
        if primary:

            copies_needed_from_CPUh, CPUh = wfh.getNCopies()

            if talk:
                print wfo.name, 'reads', ', '.join(primary), 'in primary'
            ## chope the primary dataset
            for prim in primary:
                ## keep track of what needs what
                workflow_dependencies[prim].add(wfo.id)

                max_priority[prim] = max(max_priority[prim],
                                         int(wfh.request['RequestPriority']))

                wfh.sendLog(
                    'transferor', "Would make %s  from cpu requirement %s" %
                    (copies_needed_from_CPUh, CPUh))
                copies_needed = copies_needed_from_CPUh

                if 'Campaign' in wfh.request and wfh.request[
                        'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                            wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[
                        wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign,
                                        copies_needed)

                    wfh.sendLog(
                        'transferor',
                        "Maxed to %s by campaign configuration %s" %
                        (copies_needed, wfh.request['Campaign']))

                if blocks:
                    print "limiting to blocks", "\n".join(sorted(blocks))
                ### new ways of making the whole thing
                destinations, all_block_names = getDatasetDestinations(
                    url,
                    prim,
                    within_sites=[SI.CE_to_SE(site) for site in sites_allowed],
                    only_blocks=blocks)
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [
                    site for (site, info) in destinations.items()
                    if info['completion'] == 100 and info['data_fraction'] == 1
                ]
                ## the rest is places it is going to be
                #prim_destination = [site for site in destinations.keys() if not site in prim_location]
                prim_destination = [
                    site for (site, info) in destinations.items()
                    if info['data_fraction'] == 1 and info['completion'] != 100
                ]
                ## veto the site with no current disk space, for things that are not relval
                prim_destination = [
                    site for site in prim_destination
                    if (SI.disk[site] or wfh.isRelval())
                ]

                if len(prim_location) >= copies_needed:
                    wfh.sendLog(
                        'transferor',
                        "The input is all fully in place at %s sites %s" %
                        (len(prim_location), sorted(prim_location)))
                    continue
                copies_needed = max(0, copies_needed - len(prim_location))
                wfh.sendLog(
                    'transferor',
                    "Counting existing copies ; now need %s" % copies_needed)
                copies_being_made = [
                    sum([
                        info['blocks'].keys().count(block)
                        for site, info in destinations.items()
                        if site in prim_destination
                    ]) for block in all_block_names
                ]

                latching_on_transfers = set()
                [
                    latching_on_transfers.update(info['blocks'].values())
                    for site, info in destinations.items()
                    if site in prim_destination
                ]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [
                    site for site in sites_allowed
                    if not SI.CE_to_SE(site) in prim_location
                ]
                prim_to_distribute = [
                    site for site in prim_to_distribute
                    if not SI.CE_to_SE(site) in prim_destination
                ]
                ## take out the ones that cannot receive transfers
                potential_destinations = len(prim_to_distribute)
                #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer]
                prim_to_distribute = [
                    site for site in prim_to_distribute
                    if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                ]

                ## do we want to restrict transfers if the amount of site in vetoe are too large ?

                wfh.sendLog(
                    'transferor',
                    "Could be going to: %s" % sorted(prim_to_distribute))
                if not prim_to_distribute or any([
                        transfers_per_sites[site] < max_staging_per_site
                        for site in prim_to_distribute
                ]):
                    ## means there is openings let me go
                    print "There are transfer slots available:", [
                        (site, transfers_per_sites[site])
                        for site in prim_to_distribute
                    ]
                else:
                    if int(
                            wfh.request['RequestPriority']
                    ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                        wfh.sendLog(
                            'transferor',
                            "Higher priority sample %s >= %s go-on over transfer slots available"
                            % (wfh.request['RequestPriority'],
                               in_transfer_priority))
                    else:
                        wfh.sendLog(
                            'transferor',
                            "Not allowed to transfer more than %s per site at a time. Going overboard for %s"
                            % (max_staging_per_site,
                               sorted([
                                   site for site in prim_to_distribute
                                   if transfers_per_sites[site] >=
                                   max_staging_per_site
                               ])))
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:

                    existings = session.query(TransferImp).filter(
                        TransferImp.phedexid == int(latching)).filter(
                            TransferImp.workflow_id == wfo.id).all()
                    if not existings:
                        tri = TransferImp(phedexid=int(latching), workflow=wfo)
                        print "adding", wfo.id, "with phedexid", latching
                        session.add(tri)
                    else:
                        for existing in existings:
                            existing.active = True

                    session.flush()

                    can_go = False
                    transfer_sizes[prim] = max(this_load, transfer_sizes[prim])
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0, copies_needed - min(copies_being_made))
                wfh.sendLog(
                    'transferor',
                    "Counting the copies being made ; then need %s" %
                    copies_needed)
                if copies_needed == 0:
                    wfh.sendLog(
                        'transferor',
                        "The input is either fully in place or getting in full somewhere with %s"
                        % latching_on_transfers)
                    can_go = True
                    continue
                elif len(prim_to_distribute) == 0:
                    wfh.sendLog(
                        'transferor',
                        "We are going to need extra copies of %s, but no destinations seems available"
                        % (prim))
                    sendLog(
                        'transferor',
                        "We are going to need extra copies of %s, but no destinations seems available"
                        % (prim),
                        level='critical')

                    print json.dumps(prim_to_distribute, indent=2)
                    print json.dumps(prim_location, indent=2)
                    print json.dumps(prim_destination, indent=2)

                    prim_to_distribute = [
                        site for site in sites_allowed
                        if not SI.CE_to_SE(site) in prim_location
                    ]
                    #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer ]
                    prim_to_distribute = [
                        site for site in prim_to_distribute
                        if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                    ]

                    print "changed to"
                    print json.dumps(prim_to_distribute, indent=2)

                if len(
                        prim_to_distribute
                ) > 0:  ## maybe that a parameter we can play with to limit the
                    if not options or options.chop:
                        ### hard include the tape disk andpoint ?
                        #tapes = [site for site in  getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')]
                        chops, sizes = getDatasetChops(
                            prim,
                            chop_threshold=options.chopsize,
                            only_blocks=blocks)
                        spreading = distributeToSites(chops,
                                                      prim_to_distribute,
                                                      n_copies=copies_needed,
                                                      weights=SI.cpu_pledges,
                                                      sizes=sizes)
                        ## prune the blocks/destination that are already in the making, so that subscription don't overlap
                        for site in spreading:
                            for block in list(spreading[site]):
                                if site in destinations and block in destinations[
                                        site]['blocks'].keys():
                                    ## prune it
                                    spreading[site].remove(block)

                        transfer_sizes[prim] = sum(sizes)
                        if not spreading:
                            sendLog(
                                'transferor',
                                'cannot send %s to any site, it cannot fit anywhere'
                                % prim,
                                level='critical')
                            wfh.sendLog(
                                'transferor',
                                "cannot send to any site. %s cannot seem to fit anywhere"
                                % (prim))
                            staging = False
                            can_go = False

                    else:
                        spreading = {}
                        for site in prim_to_distribute:
                            if blocks:
                                spreading[site] = blocks
                            else:
                                spreading[site] = [prim]
                        transfer_sizes[prim] = max(this_load,
                                                   transfer_sizes[prim])
                    can_go = False
                    wfh.sendLog(
                        'transferor', "selected CE destinations %s" %
                        (sorted(spreading.keys())))
                    for (site, items) in spreading.items():
                        all_transfers[site].extend(items)
                        transfers_per_sites[site] += 1
                        primary_destinations.add(site)
                else:
                    can_go = False
                    allowed = False

        if not allowed:
            wfh.sendLog('transferor', "Not allowed to move on with")
            continue

        if secondary:

            override_sec_destination = []
            if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]:
                override_sec_destination = CI.campaigns[
                    wfh.request['Campaign']]['SecondaryLocation']
            if 'SecondaryLocation' in overide_parameters:
                override_sec_destination = overide_parameters[
                    'SecondaryLocation']
            print wfo.name, 'reads', ', '.join(secondary), 'in secondary'
            for sec in secondary:

                workflow_dependencies[sec].add(wfo.id)

                if True:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbaric, and does not show the correct picture workflow by workflow with different whitelists
                        destination_cache[sec], _ = getDatasetDestinations(
                            url, sec)  ## NO SITE WHITE LIST ADDED
                        #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])

                    ## limit to the site whitelist NOW
                    se_allowed = set(
                        [SI.CE_to_SE(site) for site in sites_allowed])
                    destinations = dict([
                        (k, v) for (k, v) in destination_cache[sec].items()
                        if k in se_allowed
                    ])
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [
                        destinations.pop(site)
                        for (site, info) in destinations.items()
                        if info['data_fraction'] < 0.9
                    ]
                    print sec, json.dumps(destinations, indent=2)
                    sec_location = [
                        site for (site, info) in destinations.items()
                        if info['completion'] >= 95
                    ]
                    sec_destination = [
                        site for site in destinations.keys()
                        if not site in sec_location
                    ]  ## this is in SE
                else:
                    ## old style
                    presence = getDatasetPresence(url, sec)
                    sec_location = [
                        site for site, pres in presence.items()
                        if pres[1] > 90.
                    ]  ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions(url, sec)
                    sec_destination = [site for site in subscriptions]

                ## how to make unified understand that it has to wait for the secondary if the sec_destination and

                #sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [
                    site for site in sites_allowed
                    if not SI.CE_to_SE(site) in sec_location
                ]
                #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [
                    site for site in sec_to_distribute
                    if not SI.CE_to_SE(site) in sec_destination
                ]
                presitespace_sec_to_distribute = copy.deepcopy(
                    sec_to_distribute)
                #sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                #sec_to_distribute = [site for site in sec_to_distribute if not  SI.CE_to_SE(site) in SI.sites_veto_transfer]
                sec_to_distribute = [
                    site for site in sec_to_distribute
                    if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                ]
                ## at this point you have a problem
                if len(sec_to_distribute) == 0 and len(
                        presitespace_sec_to_distribute):
                    sendLog(
                        'transferor',
                        '%s is getting no possible destinations because of lack of space. To be decided what to do in general'
                        % (sec),
                        level='critical')

                if override_sec_destination:
                    ## intersect with where we want the PU to be
                    not_needed_anymore = list(
                        set(sec_to_distribute) - set(override_sec_destination))
                    #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sendLog(
                        'transferor',
                        "the dataset %s could be removed from %s" %
                        (sec, not_needed_anymore))
                    sec_to_distribute = list(
                        set(sec_to_distribute) & set(override_sec_destination))

                if len(sec_to_distribute) > 0:
                    print "secondary could go to", sorted(sec_to_distribute)
                    sec_size = dss.get(sec)
                    for site in sec_to_distribute:
                        site_se = SI.CE_to_SE(site)
                        if (SI.disk[site_se] *
                                1024.) > sec_size or wfh.isRelval():
                            wfh.sendLog('transferor',
                                        'Sending %s to %s' % (sec, site))
                            all_transfers[site].append(sec)
                            can_go = False
                        else:
                            print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[
                                site_se] * 1024, "GB need", sec_size
                            if primary_destinations and site in primary_destinations:
                                #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                                sendLog(
                                    'transferor',
                                    '%s is too big (%s) for %s (%s). %s will not be able to run there.'
                                    % (sec, sec_size, site_se,
                                       SI.disk[site_se] * 1024, wfo.name),
                                    level='critical')
                                wfh.sendLog(
                                    'transferor',
                                    '%s is too big (%s) for %s (%s). will not be able to run there.'
                                    % (sec, sec_size, site_se,
                                       SI.disk[site_se] * 1024))
                else:
                    ## this is bas overall
                    print "the secondary input does not have to be send to site"

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                wfh.sendLog(
                    'transferor',
                    "latches on existing transfers, and nothing else, settin staging"
                )
                wfo.status = 'staging'
                needs_transfer += 1
            else:
                wfh.sendLog(
                    'transferor', "should just be assigned now to %s" %
                    sorted(sites_allowed))
                wfo.status = 'staged'
            passing_along += 1
            wfh.sendLog('transferor',
                        "setting %s status to %s" % (wfo.name, wfo.status))
            #session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                wfh.sendLog('transferor', "latches on existing transfers")
                if not options.test:
                    wfo.status = 'staging'
                    wfh.sendLog(
                        'transferor',
                        "setting %s status to %s" % (wfo.name, wfo.status))
                    #session.commit()
            wfh.sendLog('transferor', "needs a transfer")
            needs_transfer += 1
            passing_along += 1

    if no_goes:
        #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes ))
        sendLog('transferor',
                "No go for \n" + "\n".join(sorted(no_goes)),
                level='critical')

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id = -1
    wf_id_in_prestaging = set()

    for (site, items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        #if site in SI.sites_veto_transfer:
        #    print site,"does not want transfers"
        #    continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        block_datasets = list(set([it.split('#')[0] for it in blocks]))
        datasets = [it for it in items_to_transfer if not '#' in it]

        details_text = "Making a replica to %s (CE) %s (SE) for" % (site,
                                                                    site_se)

        #print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [
            block for block in blocks if not block.split('#')[0] in datasets
        ]
        #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        #print "\t",len(datasets),"datasets"
        #print "\t",datasets
        details_text += '\n\t%d blocks' % len(blocks)
        details_text += '\n\t%d needed blocks for %s' % (
            len(blocks),
            sorted(list(set([block.split('#')[0] for block in blocks]))))
        details_text += '\n\t%d datasets' % len(datasets)
        details_text += '\n\t%s' % sorted(datasets)

        items_to_transfer = blocks + datasets

        if execute:
            sendLog('transferor', details_text)
        else:
            print "Would make a replica to", site, "(CE)", site_se, "(SE) for"
            print details_text

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y', 'yes', 'go']:
                continue
        transfered_items = defaultdict(set)
        if execute:
            priority = 'normal'
            cds = [
                ds for ds in set(datasets + block_datasets)
                if ds in max_priority
            ]
            ## bucketize the transfers by priority of workflows
            prioritized_items = defaultdict(set)
            for item in items_to_transfer:
                d = item.split('#')[0]
                p = max_priority.get(d, 80000)
                q = 'normal'
                if p > 100000:
                    q = 'reserved'
                elif p < 70000:
                    q = 'low'
                prioritized_items[q].add(item)

            for priority, items in prioritized_items.items():
                result = makeReplicaRequest(url,
                                            site_se,
                                            list(items),
                                            'prestaging',
                                            priority=priority,
                                            approve=True)
                if result:
                    these_transfers = [
                        o['id'] for o in result['phedex']['request_created']
                    ]
                    #phedexids.extend( these_transfers )
                    for ph in these_transfers:
                        transfered_items[ph].update(items)
                else:
                    sendLog(
                        'transferor',
                        'Could not make a replica request for items %s to site %s'
                        % (items, site_se),
                        level='critical')

            #result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority, approve=True)
            #phedexids = [o['id'] for o in result['phedex']['request_created']]:
        #else:
        #    #result= {'phedex':{'request_created' : []}}
        #    phedexids = []
        #    fake_id-=1

        if not transfered_items:
            sendLog(
                'transferor',
                'Could not make a replica request for items %s to site %s' %
                (items_to_transfer, site),
                level='critical')
            continue
        for phedexid, items in transfered_items.items():
            print phedexid, "transfer created"
            for transfering in list(
                    set(map(lambda it: it.split('#')[0], items))):
                for wfid in workflow_dependencies[transfering]:
                    new_transfer = session.query(TransferImp).filter(
                        TransferImp.phedexid == int(phedexid)).filter(
                            TransferImp.workflow_id == wfid).first()
                    if not new_transfer:
                        new_transfer = TransferImp(
                            phedexid=phedexid,
                            workflow=session.query(Workflow).get(wfid))
                        session.add(new_transfer)
                    else:
                        new_transfer.active = True

                    wf_id_in_prestaging.add(wfid)
            #session.commit()

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status != 'staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting", tr_wf.name, "to staging"
        #session.commit()

    ## one big session commit at the end that everything went fine
    session.commit()
Exemplo n.º 41
0
def htmlor(caller=""):
    cache = getWorkflows("cmsweb.cern.ch", "assignment-approved", details=True)
    cache.extend(getWorkflows("cmsweb.cern.ch", "running-open", details=True))
    cache.extend(getWorkflows("cmsweb.cern.ch", "running-closed", details=True))

    def getWL(wfn):
        """Return the workload description for the request named *wfn*.

        The prefetched ``cache`` list is searched first (matching on the
        "RequestName" key); only when no entry matches is the workload
        fetched individually through ``getWorkLoad``.
        """
        matches = [entry for entry in cache if entry["RequestName"] == wfn]
        if not matches:
            # not part of the bulk queries done up front: ask for it directly
            return getWorkLoad("cmsweb.cern.ch", wfn)
        return matches[0]

    def wfl(wf, view=False, p=False, ms=False, within=False, ongoing=False, status=False, update=False):
        """Build the HTML fragment (a row of links) describing one workflow.

        Parameters:
            wf: object exposing ``name``, ``wm_status`` and ``status``.
            view: append the pdmv growth plot (skipped when ``wm_status`` is
                "acquired"); also restricts the *within* links to workflows
                whose ``wm_status`` is "completed".
            p: append the workload "RequestPriority".
            ms: query McM for the request status and show it in the mcm link.
            within: append links about the input dataset, when one is found.
            ongoing: append the prodview graph and the dashboard link covering
                the last 15 days.
            status: append ``wf.status`` (plus an assistance link when it
                starts with "assistance").
            update: not used by this function; kept so existing callers that
                pass it keep working.

        Returns:
            The HTML text, always terminated by ``<hr>``.
        """
        wfn = wf.name
        wfs = wf.wm_status
        wl = None
        pid = None

        # A name segment holding exactly two dashes is taken as the prep id;
        # otherwise fall back to the prep ids recorded in the workload.
        pids = [seg for seg in wfn.split("_") if seg.count("-") == 2]
        if pids:
            pid = pids[0]
        else:
            wl = getWL(wfn)
            pids = getPrepIDs(wl)
            pid = pids[0]

        text = ", ".join(
            [
                '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a>' % (wfn, wfn),
                "(%s) <br>" % wfs,
            ]
        )
        text += ", ".join(
            [
                '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts</a>' % wfn,
                '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>' % wfn,
                '<a href="https://cmsweb.cern.ch/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>' % wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>' % wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>' % wfn,
                '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>' % wfn,
                '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>' % wfn,
                '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>'
                % pid,
                '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank">pv</a>' % wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>'
                % wfn,
                '<a href="closeout.html#%s" target="_blank">clo</a>' % wfn,
                '<a href="statuses.html#%s" target="_blank">st</a>' % wfn,
                '<a href="https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>'
                % wfn,
            ]
        )

        if within and (not view or wfs == "completed"):
            # reuse the workload when the prep id lookup already loaded it
            if wl is None:
                wl = getWL(wfn)
            dataset = None
            if "InputDataset" in wl:
                dataset = wl["InputDataset"]
            # a Task1 input dataset takes precedence over the top-level one
            if "Task1" in wl and "InputDataset" in wl["Task1"]:
                dataset = wl["Task1"]["InputDataset"]

            if dataset:
                text += ", ".join(
                    [
                        "",
                        "<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>" % dataset,
                        "<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>"
                        % dataset,
                        "<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>"
                        % dataset,
                        "<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>"
                        % dataset,
                    ]
                )

        if p:
            # go through the shared lookup helper instead of duplicating it
            if wl is None:
                wl = getWL(wfn)
            text += ", (%s)" % (wl["RequestPriority"])

        if pid:
            if ms:
                # NOTE(review): pid is interpolated into a shell command and
                # the call disables certificate checks (--insecure); confirm
                # that workflow names can be trusted here.
                mcm_s = json.loads(
                    os.popen(
                        "curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure" % pid
                    ).read()
                )[pid]
                text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>' % (
                    pid,
                    mcm_s,
                )
            else:
                text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>' % (pid)
                text += (
                    ', <a href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>'
                    % (pid)
                )

        if status:
            if wf.status.startswith("assistance"):
                text += ', <a href="assistance.html#%s" target="_blank">assist</a>' % wfn
            text += " : %s " % (wf.status)

        if view and wfs != "acquired":
            growth = wfn.replace("_", "/")
            text += (
                '<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>'
                % (growth, growth)
            )

        if ongoing:
            text += (
                '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a>'
                % (wfn, wfn)
            )
            # Both ends of the window are expressed in UTC. time.time() is
            # already an epoch value; feeding gmtime() through mktime() would
            # reinterpret UTC as local time and shift the start of the window.
            date1 = time.strftime("%Y-%m-%d+%H:%M", time.gmtime(time.time() - (15 * 24 * 60 * 60)))
            date2 = time.strftime("%Y-%m-%d+%H:%M", time.gmtime())
            text += (
                '<a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#table=Jobs&date1=%s&date2=%s&sortby=site&task=wmagent_%s">dashb</a>'
                % (date1, date2, wfn)
            )

        text += "<hr>"
        return text

    def phl(phid):
        """Return the HTML text for a PhEDEx request id.

        The result is the id itself followed by a link to the request view
        ("vw") and a link to its subscriptions ("sub"), comma separated.
        """
        label = str(phid)
        view_link = (
            '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>' % phid
        )
        subscription_link = (
            '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>'
            % phid
        )
        return ", ".join((label, view_link, subscription_link))

    def ol(out):
        """Return an HTML link to the DAS query for *out*, labelled with *out* itself."""
        das_link = '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>'
        return das_link % (out, out)

    def lap(comment):
        """Print how many seconds passed since the previous lap, then restart the clock.

        The reference timestamp lives on the function object as ``lap.start``;
        it has to be set once before the first call.
        """
        now = time.mktime(time.gmtime())
        elapsed = now - lap.start
        # the next lap is measured from this point
        lap.start = now
        print("Spend %d [s] for %s" % (elapsed, comment))

    lap.start = time.mktime(time.gmtime())

    ## start to write it
    # html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w')
    html_doc = open("/afs/cern.ch/user/c/cmst2/www/unified/index.html", "w")
    print "Updating the status page ..."

    if not caller:
        try:
            # caller = sys._getframe(1).f_code.co_name
            caller = sys.argv[0].split("/")[-1].replace(".py", "")
            print "caller is"
            print caller
        except Exception as es:
            caller = "none found"
            print "not getting frame"
            print str(es)

    html_doc.write(
        """
<html>
<head>
<META HTTP-EQUIV="refresh" CONTENT="900">
<script type="text/javascript">
 function showhide(id) {
    var e = document.getElementById(id);
    e.style.display = (e.style.display == 'block') ? 'none' : 'block';
 }
</script>
</head>
<body>

Last update on %s(CET), %s(GMT), <a href=logs/ target=_blank>logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/ target=_blank>prod mon</a> <a href=https://cmsweb.cern.ch/wmstats/index.html target=_blank>wmstats</a> <a href=http://t3serv001.mit.edu/~cmsprod/IntelROCCS/Detox/SitesInfo.txt target=_blank>detox</a> <a href=locked.html>space</a> <a href=logs/subscribor/last.log target=_blank>blocks</a> <a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a> <a href=logs/addHoc/last.log>add-hoc op</a> created from <b>%s <a href=logs/last_running>last running</a></b><br><br>

"""
        % (time.asctime(time.localtime()), time.asctime(time.gmtime()), caller)
    )

    text = ""
    count = 0
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status == "considered").all():
        wl = getWL(wf.name)
        count_by_campaign[wl["Campaign"]][int(wl["RequestPriority"])] += 1
        text += "<li> %s </li> \n" % wfl(wf, p=True)
        count += 1
    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    html_doc.write(
        """
Worflow next to handle (%d) <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank"> batches</a> <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('considered')">[Click to show/hide]</a>
<br>
<div id="considered" style="display:none;">
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('considered_bywf')">[Click to show/hide]</a><div id="considered_bywf" style="display:none;">
 <ul>
 %s
 </ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('considered_bycamp')">[Click to show/hide]</a><div id="considered_bycamp" style="display:none;">
 <ul>
 %s
 </ul></div>
</ul>
</div>
"""
        % (count, count, text, len(count_by_campaign), text_by_c)
    )

    lap("done with considered")
    text = ""
    count = 0
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status == "staging").all():
        wl = getWL(wf.name)
        count_by_campaign[wl["Campaign"]][int(wl["RequestPriority"])] += 1
        text += "<li> %s </li> \n" % wfl(wf, within=True)
        count += 1

    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    html_doc.write(
        """
Worflow waiting in staging (%d) <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staging')">[Click to show/hide]</a>
<br>
<div id="staging" style="display:none;">
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('staging_bywf')">[Click to show/hide]</a><div id="staging_bywf" style="display:none;">                                                                                                                                                                       
 <ul>            
 %s
 </ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('staging_bycamp')">[Click to show/hide]</a><div id="staging_bycamp" style="display:none;">                                                                                                                                                                  
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>                                                                                                                                                                                                                                                                                                               
</ul>      
</div>
"""
        % (count, count, text, len(count_by_campaign), text_by_c)
    )

    lap("done with staging")

    text = ""
    count = 0
    for ts in session.query(Transfer).all():
        stext = (
            '<li> %s serves </li><a href="javascript:showhide(\'%s\')">[show/hide] relevant workflows</a> <div id="%s" style="display:none;"><ul>'
            % (phl(ts.phedexid), ts.phedexid, ts.phedexid)
        )
        hide = True
        for pid in ts.workflows_id:
            w = session.query(Workflow).get(pid)
            hide &= w.status != "staging"
            if w.status in ["considered", "staging", "staged"]:
                stext += "<li> %s </li>\n" % (wfl(w, status=True))
        stext += "</ul></div>\n"
        if hide:
            # text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid)
            pass
        else:
            count += 1
            text += stext
    text += "</ul></div>"
    html_doc.write(
        """
Transfer on-going (%d) <a href=https://transferteam.web.cern.ch/transferteam/dashboard/ target=_blank>dashboard</a> <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('transfer')">[Click to show/hide]</a>
<br>
<div id="transfer" style="display:none;">
<br>
<ul>"""
        % count
    )
    html_doc.write(text)

    lap("done with transfers")

    text = ""
    count = 0
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status == "staged").all():
        wl = getWL(wf.name)
        count_by_campaign[wl["Campaign"]][int(wl["RequestPriority"])] += 1
        text += "<li> %s </li> \n" % wfl(wf, p=True)
        count += 1
    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    html_doc.write(
        """Worflow ready for assigning (%d) <a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staged')">[Click to show/hide]</a>
<br>
<div id="staged" style="display:none;">
<br>
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('staged_bywf')">[Click to show/hide]</a><div id="staged_bywf" style="display:none;">                                                                                                                                                                             
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>                                                                                                                                                                                                                                                                                                               
<li> By campaigns (%d) </li><a href="javascript:showhide('staged_bycamp')">[Click to show/hide]</a><div id="staged_bycamp" style="display:none;">                                                                                                                                                                        
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>
</ul>
</div>
"""
        % (count, count, text, len(count_by_campaign), text_by_c)
    )

    lap("done with staged")

    lines = []
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status == "away").all():
        wl = getWL(wf.name)
        count_by_campaign[wl["Campaign"]][int(wl["RequestPriority"])] += 1
        lines.append("<li> %s </li>" % wfl(wf, view=True, ongoing=True))
    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    lines.sort()
    html_doc.write(
        """
Worflow on-going (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a> <a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a> <a href=http://cms-gwmsmon.cern.ch/prodview target=_blank>queues</a> <a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('away')">[Click to show/hide]</a>
<br>
<div id="away" style="display:none;">
<ul> 
<li>By workflow (%d) </li>
<a href="javascript:showhide('away_bywf')">[Click to show/hide]</a><div id="away_bywf" style="display:none;">
<ul>
%s
</ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('away_bycamp')">[Click to show/hide]</a><div id="away_bycamp" style="display:none;">
<ul>
%s
</ul></div>
</ul>
</div>
"""
        % (len(lines), len(lines), "\n".join(lines), len(count_by_campaign), text_by_c)
    )

    lap("done with away")

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == "assistance").all():
        text += "<li> %s </li> \n" % wfl(wf, view=True, update=True, status=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """Worflow that are closing (%d)
<a href=closeout.html target=_blank>closeout</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('closing')">[Click to show/hide]</a>
<br>
<div id="closing" style="display:none;">
<br>
<ul>
"""
        % count
    )
    html_doc.write(text)

    lap("done with closing")

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status.startswith("assistance-")).all():
        text += "<li> %s </li> \n" % wfl(wf, view=True, within=True, status=True, update=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """Worflow which need assistance (%d)
<a href=assistance.html target=_blank>assistance</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/recoveror/last.log target=_blank>postlog</a>
<a href="javascript:showhide('assistance')">[Click to show/hide]</a>
<br>
<div id="assistance" style="display:none;">
<br>
<ul>
"""
        % count
    )
    html_doc.write(text)

    lap("done with assistance")

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == "close").all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """Worflow ready to close (%d)
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('close')">[Click to show/hide]</a>
<br>
<div id="close" style="display:none;">
<br>
<ul>
"""
        % count
    )
    html_doc.write(text)

    lap("done with annoucing")

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == "trouble").all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """Worflow with issue (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a>
<a href="javascript:showhide('trouble')">[Click to show/hide]</a>
<br>
<div id="trouble" style="display:none;">
<br>
<ul>
"""
        % count
    )
    html_doc.write(text)

    lap("done with trouble")

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == "forget").all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """
Worflow to forget (%d) <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/outcleanor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('forget')">[Click to show/hide]</a>
<br>
<div id="forget" style="display:none;">
<br>
<ul>
"""
        % count
    )
    html_doc.write(text)

    lap("done with forget")

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == "done").all():
        text += "<li> %s </li> \n" % wfl(wf)  # ,ms=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """
Worflow through (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/cleanor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('done')">[Click to show/hide]</a>
<br>
<div id="done" style="display:none;">
<br>
<ul>
"""
        % count
    )
    html_doc.write(text)

    lap("done with done")

    wfs = session.query(Workflow).filter(Workflow.status.endswith("-unlock")).all()
    html_doc.write(" Workflows unlocked : %s <br>" % (len(wfs)))
    lap("done with unlocked")

    text = ""
    lines_thisweek = []
    lines_lastweek = []
    now = time.mktime(time.gmtime())
    this_week = int(time.strftime("%W", time.gmtime()))
    start_time_two_weeks_ago = time.mktime(time.strptime("15-0-%d" % (this_week - 2), "%y-%w-%W"))
    for out in session.query(Output).filter(Output.date >= start_time_two_weeks_ago).all():
        if not out.workflow:
            print "This is a problem with", out.datasetname
            continue
        if out.workflow.status in ["done", "clean", "clean-out", "clean-unlock"]:
            out_week = int(time.strftime("%W", time.gmtime(out.date)))
            ##only show current week, and the previous.
            if (this_week - out_week) == 1:
                lines_lastweek.append(
                    "<li>on week %s : %s </li>"
                    % (time.strftime("%W (%x %X)", time.gmtime(out.date)), ol(out.datasetname))
                )
            if (this_week - out_week) == 0:
                lines_thisweek.append(
                    "<li>on week %s : %s </li>"
                    % (time.strftime("%W (%x %X)", time.gmtime(out.date)), ol(out.datasetname))
                )
    lines_thisweek.sort()
    lines_lastweek.sort()

    html_doc.write(
        """Output produced (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a>
<a href="javascript:showhide('output')">[Click to show/hide]</a>
<br>
<div id="output" style="display:none;">
<br>
<ul>
<li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul>
%s
</ul></div>
<li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul>
%s
</ul></div></div>
"""
        % (
            len(lines_lastweek) + len(lines_thisweek),
            len(lines_lastweek),
            "\n".join(lines_lastweek),
            len(lines_thisweek),
            "\n".join(lines_thisweek),
        )
    )

    lap("done with output")

    html_doc.write(
        """Job installed
<a href="javascript:showhide('acron')">[Click to show/hide]</a>
<br>
<div id="acron" style="display:none;">
<br>
<pre>
%s
</pre>
"""
        % (os.popen("acrontab -l | grep Unified | grep -v \#").read())
    )

    per_module = defaultdict(list)
    for t in filter(None, os.popen("cat /afs/cern.ch/user/c/cmst2/www/unified/logs/*/*.time").read().split("\n")):
        module_name, run_time, spend = t.split(":")
        ## then do what you want with it !
        per_module[module_name].append(int(spend))

    html_doc.write("Module running time<ul>\n")
    for m, spends in per_module.items():
        html_doc.write("<li>%s : last %d [s], avg %d [s]</li>\n" % (m, spends[-1], sum(spends) / float(len(spends))))
    html_doc.write("</ul>")

    html_doc.write(
        "Last running <pre>%s</pre>"
        % (os.popen("tac /afs/cern.ch/user/c/cmst2/www/unified/logs/running | head -5").read())
    )
    html_doc.write("</div>\n")
    lap("done with jobs")

    text = ""
    count = 0
    for (c, info) in campaignInfo().campaigns.items():
        # if 'go' in info and info['go']:
        text += "<li>%s <br> <pre>%s</pre>  </li>" % (c, json.dumps(info, indent=2))
        count += 1

    html_doc.write(
        """Campaign configuration
<a href="javascript:showhide('campaign')">[Click to show/hide]</a>
<br>
<div id="campaign" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""
        % (text)
    )

    text = ""
    count = 0
    n_column = 4
    SI = siteInfo()
    for t in SI.types():
        text += "<li>%s<table border=1>" % t
        c = 0
        for site in getattr(SI, t):
            cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else "N/A"
            disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE(site) in SI.disk else "N/A"
            if c == 0:
                text += "<tr>"
            text += (
                '<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a><br>CPU pledge: %s<br>Disk available: %s</td>'
                % (site, site, site, site, cpu, disk)
            )
            if c == n_column:
                c = 0
            else:
                c += 1
        text += "</table></li>"

    lap("done with campaigns")

    open("/afs/cern.ch/user/c/cmst2/www/unified/siteInfo.json", "w").write(
        json.dumps(dict([(t, getattr(SI, t)) for t in SI.types()]), indent=2)
    )

    lap("done with sites json")

    chart_data = defaultdict(list)
    for site in SI.quota:
        chart_data[site].append(
            """
var data_%s = google.visualization.arrayToDataTable([ 
['Overall', 'Space in TB'],
//['Quota' , %s],
['Locked' , %s],
['Free' , %s]
]);
"""
            % (site, SI.quota[site], SI.locked[site], SI.disk[site])
        )
        chart_data[site].append(
            """
var chart_%s = new google.visualization.PieChart(document.getElementById('donutchart_%s'));
chart_%s.draw(data_%s, {title: '%s %s [TB]', pieHole:0.4, slices:{0:{color:'red'},1:{color:'green'}}});
"""
            % (site, site, site, site, site, SI.quota[site])
        )
        chart_data[site].append(
            """
<div id="donutchart_%s" style="height: 200px;"></div>
"""
            % (site)
        )

    ## make the locked/available donut chart
    donut_html = open("/afs/cern.ch/user/c/cmst2/www/unified/locked.html", "w")
    tables = "\n".join([info[0] for site, info in chart_data.items()])
    draws = "\n".join([info[1] for site, info in chart_data.items()])
    divs = "\n".join([info[2] for site, info in chart_data.items()])

    divs_table = "<table border=0>"
    for c, site in enumerate(sorted(chart_data.keys())):
        if c % 6 == 0:
            divs_table += "<tr>"
        divs_table += "<td>%s</td>" % (chart_data[site][2])
    divs_table += "</table>"

    donut_html.write(
        """
<html>
  <head>
    <script type="text/javascript" src="https://www.google.com/jsapi"></script>
    <script type="text/javascript">
      google.load("visualization", "1", {packages:["corechart"]});
      google.setOnLoadCallback(drawChart);
      function drawChart() {
%s

%s
      }
    </script>
  </head>
  <body>
%s
  </body>
</html>
"""
        % (tables, draws, divs_table)
    )
    donut_html.close()

    html_doc.write(
        """Site configuration
<a href="javascript:showhide('site')">[Click to show/hide]</a>
<br>
<div id="site" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""
        % (text)
    )

    lap("done with space")

    UC = unifiedConfiguration()
    text = ""
    for param in UC.configs:
        text += "<li>%s</li><ul>\n" % param
        for sub in sorted(UC.configs[param].keys()):
            text += "<li> %s : %s </li>\n" % (sub, UC.configs[param][sub])
        text += "</ul>\n"

    html_doc.write(
        """Unified configuration
<a href="javascript:showhide('config')">[Click to show/hide]</a>
<br>
<div id="config" style="display:none;">
<br>
<ul>
%s
</ul></div>                                                                                                                                                                                                                                                                                                                
"""
        % (text)
    )

    lap("done with configuration")

    print "... done with status page."
    html_doc.write(
        """
</body>
</html>
"""
    )

    html_doc.close()

    html_doc = open("/afs/cern.ch/user/c/cmst2/www/unified/statuses.html", "w")
    html_doc.write(
        """                                                                                                                                                                                                                                                                                                      <html>        
<table border=1>
<thead>
<tr>
<th> workflow </th><th> status </th><th> wm status</th>
</tr>
</thead>
"""
    )
    wfs = {}
    for wfo in session.query(Workflow).all():
        wfs[wfo.name] = (wfo.status, wfo.wm_status)
    open("/afs/cern.ch/user/c/cmst2/www/unified/statuses.json", "w").write(json.dumps(wfs))
    for wfn in sorted(wfs.keys()):
        html_doc.write(
            '<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n' % (wfn, wfn, wfs[wfn][0], wfs[wfn][1])
        )
    html_doc.write("</table>")
    html_doc.write("<br>" * 100)
    html_doc.write("end of page</html>")
    html_doc.close()
Exemplo n.º 42
0
def transferor(url ,specific = None, talk=True, options=None):
    if userLock():   return
    if duplicateLock():  return

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    NLI = newLockInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    #allowed_secondary = UC.get('')
    print "counting all being handled..."
    being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all())
    being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0,max_to_handle - being_handled)
    allowed_to_transfer = max(0,max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer
        print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer"
    else:
        print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer"
    else:
        print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer"

    print "... done"

    all_transfers=defaultdict(list)
    workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset
    wfs_and_wfh=[]
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(Workflow.status.startswith('considered')).all():
        print "\t",wfo.name
        if specific and not specific in wfo.name: continue
        cache_r =filter(lambda d:d['RequestName']==wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) )
        else:
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) )
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = {}
    ignored_input_sizes = {}
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority=None
    min_transfer_priority=None
    print "getting all wf in staging ..."
    stucks = json.loads(open('%s/stuck_transfers.json'%monitor_dir).read())
    
    for wfo in session.query(Workflow).filter(Workflow.status=='staging').all():
        wfh = workflowInfo( url, wfo.name, spec=False)
        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList()
        for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1 
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:  
            ds_s = dss.get( prim )
            if prim in stucks: 
                sendLog('transferor', "%s appears stuck, so not counting it %s [GB]"%( prim, ds_s), wfi=wfh)
                ignored_input_sizes[prim] = ds_s
            else:
                input_sizes[prim] = ds_s
                sendLog('transferor', "%s needs %s [GB]"%( wfo.name, ds_s), wfi=wfh)
        if in_transfer_priority==None:
            in_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority']))
        if min_transfer_priority==None:
            min_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority']))

    if min_transfer_priority==None or in_transfer_priority ==None:
        print "nothing is lining up for transfer"
        sendEmail("no request in staging","no request in staging")
        return 
        pass

    try:
        print "Ignored input sizes"
        ignored_values = list(ignored_input_sizes.items())
        ignored_values.sort( key = lambda i : i[1] )
        print "\n".join( map(str, ignored_values ) )
        print "Considered input sizes"
        considered_values = list(input_sizes.items())
        considered_values.sort( key = lambda i : i[1] )
        print "\n".join( map(str, considered_values) )
    except Exception as e:
        print "trying to print the summary of input size"
        print str(e)

    print "... done"
    print "Max priority in transfer already",in_transfer_priority
    print "Min priority in transfer already",min_transfer_priority
    print "transfers per sites"
    print json.dumps( transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())

    ## list the size of all inputs
    primary_input_per_workflow_gb = defaultdict(float)
    print "getting all input sizes ..."
    for (wfo,wfh) in wfs_and_wfh:
        (_,primary,_,_) = wfh.getIO()
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            ## do not count it if it appears stalled !
            prim_size = dss.get( prim )
            input_sizes[prim] = prim_size
            primary_input_per_workflow_gb[wfo.name] += prim_size
    print "... done"

    # shuffle first by name
    random.shuffle( wfs_and_wfh )
    # Sort smallest transfers first; allows us to transfer as many as possible workflows.
    def prio_and_size( i, j):
        """cmp-style comparator over (workflow object, workflow info) pairs.

        Pairs are compared on int(RequestPriority) first. When the priorities are
        equal, the primary input sizes (looked up by workflow name in
        primary_input_per_workflow_gb, 0 when absent, truncated to int) are
        compared with the arguments swapped, i.e. the pair with the larger input
        compares as the smaller element.
        """
        prio_i = int(i[1].request['RequestPriority'])
        prio_j = int(j[1].request['RequestPriority'])
        if prio_i != prio_j:
            return cmp(prio_i, prio_j)
        ## same priority : fall back on the input size, j against i on purpose
        size_j = int(primary_input_per_workflow_gb.get(j[0].name, 0))
        size_i = int(primary_input_per_workflow_gb.get(i[0].name, 0))
        return cmp(size_j, size_i)

    #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True)
    #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) ))
    #sort by priority higher first
    wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True)

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already )
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer )


    grand_total =  sum(input_sizes.values()) 
    to_transfer = grand_total  - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB
    
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered"%in_transfer_already
    print "%15.4f GB is the current requested transfer load"%to_transfer
    print "%15.4f GB is the global transfer limit"%grand_transfer_limit
    print "%15.4f GB is the available limit"%transfer_limit


    max_staging_per_site = options.maxstagingpersite
                    
    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer=0 ## so that we can count'em
    passing_along = 0
    transfer_sizes={}
    went_over_budget=False
    destination_cache = {}
    no_goes = set()

    max_per_round = UC.get('max_per_round').get('transferor',None)
    if max_per_round and not spec:
        wfs_and_wfh = wfs_and_wfh[:max_per_round]
    
    for (wfo,wfh) in wfs_and_wfh:
        print wfo.name,"to be transfered with priority",wfh.request['RequestPriority']

        if wfh.request['RequestStatus']!='assignment-approved':
            if wfh.request['RequestStatus'] in ['aborted','rejected','rejected-archived','aborted-archived']:
                wfo.status = 'trouble' ## so that we look or a replacement
            else:
                wfo.status = 'away'
            wfh.sendLog('transferor', '%s in status %s, setting %s'%( wfo.name,wfh.request['RequestStatus'],wfo.status))
            continue

        (_,primary,_,_) = wfh.getIO()
        this_load=sum([input_sizes[prim] for prim in primary])
        no_budget = False
        if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ):
            if went_over_budget:
                wfh.sendLog('transferor', "Transfer has gone over bubget.")
            else:
                wfh.sendLog('transferor', "Transfer will go over bubget.")
            wfh.sendLog('transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"%(this_load, sum(transfer_sizes.values()), transfer_limit))
            #if sum(transfer_sizes.values()) > transfer_limit:
            went_over_budget = True
            if in_transfer_priority!=None and min_transfer_priority!=None:
                if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                    wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over budget"%( wfh.request['RequestPriority'], in_transfer_priority))
                else:
                    if not options.go: 
                        wfh.sendLog('transferor',"%s minimum priority %s < %s : stop"%( min_transfer_priority,wfh.request['RequestPriority'],in_transfer_priority))
                        no_budget = True

        ## throtlle by campaign go
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True
            no_goes.add( wfo.name )
            
        allowed_secondary = set()
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                allowed_secondary.update( CI.campaigns[campaign]['secondaries'] )
        if secondary:
            if (secondary and allowed_secondary) and (set(secondary)&allowed_secondary!=set(secondary)):
                wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary)))
                no_go = True

        if no_go:
            continue
        ## check if the batch is announced

        def check_mcm(wfn):
            announced=False
            is_real=False
            if not wfn.startswith('pdmvserv'):
                is_real = True
            try:
                for b in mcm.getA('batches',query='contains=%s'% wfo.name):
                    is_real = True
                    if b['status']=='announced': 
                        announced=True 
                        break
            except:
                try:
                    for b in mcm.getA('batches',query='contains=%s'% wfo.name):
                        is_real = True
                        if b['status']=='announced': 
                            announced=True 
                            break
                except:
                    print "could not get mcm batch announcement, assuming not real"
            return announced,is_real

        if not use_mcm:
            announced,is_real = False,True
        else:
            if wfh.request['RequestType'] in ['ReReco']:
                announced,is_real = True,True
            else:
                announced,is_real = check_mcm( wfo.name )

        if not announced:
            wfh.sendLog('transferor', "does not look announced.")

            
        if not is_real:
            wfh.sendLog('transferor', "does not appear to be genuine.")

            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
        now = time.mktime(time.gmtime()) / (60.*60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced: 
                wfh.sendLog('transferor', "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time))
                continue


        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if in_transfer_priority!=None and min_transfer_priority!=None:
                if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%( wfh.request['RequestPriority'], in_transfer_priority, max_to_handle))
                else:
                    wfh.sendLog('transferor'," Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"%( max_to_handle, being_handled, passing_along))
                    if not options.go: 
                        ## should not allow to jump that fence
                        break

        if this_load and needs_transfer >= allowed_to_transfer:
            if in_transfer_priority!=None and min_transfer_priority!=None:
                if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%(wfh.request['RequestPriority'], in_transfer_priority,max_to_transfer))
                else:
                    wfh.sendLog('transferor',"Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"%( max_to_transfer, being_transfered, needs_transfer))
                    if not options.go: 
                        no_budget = True


        if no_budget:
            continue

        ## the site white list considers site, campaign, memory and core information
        (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')


        for dataset in list(primary)+list(parent)+list(secondary):
            ## lock everything flat
            NLI.lock( dataset )

        if not sites_allowed:
            wfh.sendLog('transferor',"not possible site to run at")
            #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            sendLog('transferor',"%s has no possible sites to run at"%( wfo.name ),level='critical')
            continue

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) ))
        if 'LumiList' in wfh.request and wfh.request['LumiList']:
            ## augment with the lumi white list
            blocks = list(set( blocks + getDatasetBlocks( dataset, lumis= wfh.request['LumiList'] ) ))

        if blocks:
            print "Reading",len(blocks),"in block whitelist"

        can_go = True
        staging=False
        allowed=True
        primary_destinations = set()
        if primary:
            
            copies_needed_from_CPUh,CPUh = wfh.getNCopies()

            if talk:
                print wfo.name,'reads',', '.join(primary),'in primary'
            ## chope the primary dataset 
            for prim in primary:
                ## keep track of what needs what
                workflow_dependencies[prim].add( wfo.id )

                max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority']))

                wfh.sendLog('transferor',"Would make %s  from cpu requirement %s"%( copies_needed_from_CPUh, CPUh))
                copies_needed = copies_needed_from_CPUh

                if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign, copies_needed)
                    
                    wfh.sendLog('transferor',"Maxed to %s by campaign configuration %s"%( copies_needed, wfh.request['Campaign']))


                ### new ways of making the whole thing
                destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks )
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1]
                ## the rest is places it is going to be
                prim_destination = [site for site in destinations.keys() if not site in prim_location]


                if len(prim_location) >= copies_needed:
                    wfh.sendLog('transferor',"The input is all fully in place at %s sites %s"%( len(prim_location), sorted(prim_location)))
                    continue
                copies_needed = max(0,copies_needed - len(prim_location))
                wfh.sendLog('transferor',"not counting existing copies ; now need %s"% copies_needed)
                copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names]

                latching_on_transfers = set()
                [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                wfh.sendLog('transferor',"Could be going to: %s"% sorted( prim_to_distribute))
                if not prim_to_distribute or any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]):
                    ## means there is openings let me go
                    print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute]
                    #for site in sites_allowed:
                    #    #increment accross the board, regardless of real destination: could be changed
                    #    transfers_per_sites[site] += 1
                else:
                    if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                        wfh.sendLog('transferor', "Higher priority sample %s >= %s go-on over transfer slots available"%(wfh.request['RequestPriority'], in_transfer_priority))
                    else:
                        wfh.sendLog('transferor',"Not allowed to transfer more than %s per site at a time. Going overboard for %s"%( max_staging_per_site, sorted([site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site])))
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(Transfer.phedexid == int(latching)).first()
                    if not tfo:
                        tfo = session.query(Transfer).filter(Transfer.phedexid == -int(latching)).first()
                        
                    if not tfo:
                        tfo = Transfer( phedexid = latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                    else:
                        tfo.phedexid = latching ## make it positive ever

                    if not wfo.id in tfo.workflows_id:
                        print "adding",wfo.id,"to",tfo.id,"with phedexid",latching
                        l = copy.deepcopy( tfo.workflows_id )
                        l.append( wfo.id )
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0,copies_needed - min(copies_being_made))
                wfh.sendLog('transferor', "Not counting the copies being made ; then need %s"% copies_needed)                    
                if copies_needed == 0:
                    wfh.sendLog('transferor', "The output is either fully in place or getting in full somewhere with %s"% latching_on_transfers)
                    can_go = True
                    continue
                elif len(prim_to_distribute)==0:
                    wfh.sendLog('transferor', "We are going to need extra copies, but no destinations seems available")
                    prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                    prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the 
                    if not options or options.chop:
                        ### hard include the tape disk andpoint ?
                        #tapes = [site for site in  getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')]
                        chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks)
                        spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes)
                        transfer_sizes[prim] = sum(sizes)
                        if not spreading:
                            sendLog('transferor','cannot send %s to any site, it cannot fit anywhere'% prim, level='critical')
                            wfh.sendLog('transferor', "cannot send to any site. %s cannot seem to fit anywhere"%(prim))
                            staging=False
                            can_go = False
                    
                    else:
                        spreading = {} 
                        for site in prim_to_distribute: 
                            if blocks:
                                spreading[site]=blocks
                            else:
                                spreading[site]=[prim]
                        transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified
                    can_go = False
                    wfh.sendLog('transferor', "selected CE destinations %s"%(sorted( spreading.keys())))
                    for (site,items) in spreading.items():
                        all_transfers[site].extend( items )
                        transfers_per_sites[site] += 1
                        primary_destinations.add( site ) 
        if not allowed:
            wfh.sendLog('transferor', "Not allowed to move on with")
            continue


        if secondary:

            override_sec_destination = []
            if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]:
                override_sec_destination  = CI.campaigns[wfh.request['Campaign']]['SecondaryLocation']

            print wfo.name,'reads',', '.join(secondary),'in secondary'
            for sec in secondary:

                workflow_dependencies[sec].add( wfo.id )

                if True:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist
                        destination_cache[sec],_ = getDatasetDestinations(url, sec) ## NO SITE WHITE LIST ADDED
                        #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])

                    ## limit to the site whitelist NOW
                    se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
                    destinations = dict([(k,v) for (k,v) in destination_cache[sec].items() if site in se_allowed])
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9]
                    sec_location = [site for (site,info) in destinations.items() if info['completion']>=95]
                    sec_destination = [site for site in destinations.keys() if not site in sec_location]
                else:
                    ## old style
                    presence = getDatasetPresence( url, sec )
                    sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions( url ,sec )
                    sec_destination = [site for site in subscriptions] 


                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if override_sec_destination:
                    ## intersect with where we want the PU to be
                    not_needed_anymore = list(set(sec_to_distribute) - set(override_sec_destination))
                    #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sendLog('transferor', "the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sec_to_distribute = list(set(sec_to_distribute) & set(override_sec_destination))

                if len( sec_to_distribute )>0:
                    print "secondary could go to",sorted(sec_to_distribute)
                    sec_size = dss.get( sec )
                    for site in sec_to_distribute:
                        site_se =SI.CE_to_SE(site)
                        if (SI.disk[site_se]*1024.) > sec_size:
                            all_transfers[site].append( sec )
                            can_go = False
                        else:
                            print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size
                            if primary_destinations and site in primary_destinations:
                                #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                                sendLog('transferor', '%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024), level='critical')
                else:
                    print "the secondary input does not have to be send to site"

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                wfh.sendLog('transferor', "latches on existing transfers, and nothing else, settin staging")
                wfo.status = 'staging'
                needs_transfer+=1
            else:
                wfh.sendLog('transferor', "should just be assigned now to %s"%sorted(sites_allowed))
                wfo.status = 'staged'
            passing_along+=1
            wfh.sendLog('transferor', "setting status to %s"%wfo.status)
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                wfh.sendLog('transferor', "latches on existing transfers")
                if not options.test:
                    wfo.status = 'staging'
                    wfh.sendLog('transferor', "setting status to %s"%wfo.status)
                    session.commit()
            wfh.sendLog('transferor',"needs a transfer")
            needs_transfer+=1
            passing_along+=1

    if no_goes:
        #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes ))
        sendLog('transferor', "No go for \n"+"\n".join( no_goes ), level='critical')

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id=-1
    wf_id_in_prestaging=set()

    for (site,items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        if site in SI.sites_veto_transfer: 
            print site,"does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        block_datasets = list(set([it.split('#')[0] for it in blocks]))
        datasets = [it for it in items_to_transfer if not '#' in it]

        details_text = "Making a replica to %s (CE) %s (SE) for"%( site, site_se)
        

        #print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        #print "\t",len(datasets),"datasets"
        #print "\t",datasets
        details_text += '\n\t%d blocks'%len(blocks)
        details_text += '\n\t%d needed blocks for %s'%( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks]))))
        details_text += '\n\t%d datasets'% len(datasets)
        details_text += '\n\t%s'%sorted(datasets)
        
        items_to_transfer = blocks + datasets

        if execute:
            sendLog('transferor', details_text)
        else:
            print "Would make a replica to",site,"(CE)",site_se,"(SE) for"
            print details_text

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y','yes','go']:
                continue

        if execute:
            priority = 'normal'
            cds = [ds for ds in datasets+block_datasets if ds in max_priority]
            if cds and False: ## I don't think this is working. subscription should be updated on the fly and regularly for raising the priority if needed
                ## decide on an overall priority : that's a bit too large though
                if any([max_priority[ds]>=90000 for ds in cds]):
                    priority = 'high'
                elif all([max_priority[ds]<80000 for ds in cds]):
                    priority = 'low'
                
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority)
        else:
            result= {'phedex':{'request_created' : []}}
            fake_id-=1



        if not result:
            print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(Transfer.phedexid == int(phedexid)).first()
            if not new_transfer:
                new_transfer = session.query(Transfer).filter(Transfer.phedexid == -int(phedexid)).first()
            print phedexid,"transfer created"
            if not new_transfer:
                new_transfer = Transfer( phedexid = phedexid)
                session.add( new_transfer )                
            else:
                new_transfer.phedexid = phedexid ## make it positive again

            new_transfer.workflows_id = set()
            for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update( workflow_dependencies[transfering] )
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status!='staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting",tr_wf.name,"to staging"
        session.commit()
Exemplo n.º 43
0
from utils import workflowInfo, getWorkflows, sendEmail, componentInfo, monitor_dir, reqmgr_url, newLockInfo
from assignSession import *
import reqMgrClient
import os
import sys
import json

url = reqmgr_url

#nl = newLockInfo()
#nl.lock('/Neutrino_E-10_gun/RunIISpring15PrePremix-AVE_25_BX_25ns_76X_mcRun2_asymptotic_v12-v3/GEN-SIM-DIGI-RAW')
#nl.lock('/DYJetsToLL_M-50_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8/RunIISummer15GS-MCRUN2_71_V1_ext1-v2/GEN-SIM')


## all dqmharvest completed to announced right away
wfs = getWorkflows(url, 'completed', user=None, rtype='DQMHarvest')
for wf in wfs: 
    print "closing out",wf
    reqMgrClient.closeOutWorkflow(url, wf)
wfs = getWorkflows(url, 'closed-out', user=None, rtype='DQMHarvest')
for wf in wfs: 
    print "announcing",wf
    reqMgrClient.announceWorkflow(url, wf)


#os.system('Unified/equalizor.py -a pdmvserv_task_HIG-RunIIFall15DR76-01039__v1_T_160120_002705_9423')
#os.system('Unified/equalizor.py -a pdmvserv_SMP-Summer12DR53X-00027_00440_v0__160224_044437_5031')

up = componentInfo(mcm=False, soft=['mcm'])                                 
if not up.check():  
    sys.exit(1)     
Exemplo n.º 44
0
## those that are already in lock
already_locked = set(json.loads(open('%s/globallocks.json'%monitor_dir).read()))
if not already_locked:
    old = json.loads(open('datalocks.json').read())
    for site,locks in old.items():
        if type(locks) == float: continue
        for item,info in locks.items():
            if info['lock']==False: continue
            already_locked.add( item.split('#')[0] )
    print "found",len(already_locked),"old locks"

newly_locking = set()
## you want to take them in reverse order to make sure none go through a transition while you run this 
for status in reversed(statuses):
    wfls = getWorkflows(url , status = status,details=True)
    print len(wfls),"in",status
    for wl in wfls:
        ## unknonw to the system
        known = session.query(Workflow).filter(Workflow.name==wl['RequestName']).all()
        if not known: 
            #print wl['RequestName'],"is unknown, this is bad news" ## no it is not
            continue

        if status == 'assignment-approved':
            if all([wfo.status == 'considered' for wfo in known]):
                ## skip those only assignment-approved / considered
                continue

        wfi = workflowInfo( url,  wl['RequestName'], request = wl ,spec=False)
        (_,primaries,_,secondaries) = wfi.getIO()
Exemplo n.º 45
0
def transferor(url ,specific = None, talk=True, options=None):
    if userLock('transferor'):   return

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    print "counting all being handled..."
    being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance')).all())
    max_to_handle = options.maxworkflows
    allowed_to_handle = max(0,max_to_handle - being_handled)
    wf_buffer = 5
    if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer
        print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer"
    else:
        print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer"
    print "... done"

    all_transfers=defaultdict(list)
    workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset
    wfs_and_wfh=[]
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(Workflow.status=='considered').all():
        if specific and not specific in wfo.name: continue
        cache_r =filter(lambda d:d['RequestName']==wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) )
        else:
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) )
    print "... done"

    input_sizes = {}
    ## list the size of those in transfer already
    in_transfer_priority=0
    min_transfer_priority=100000000
    print "getting all wf in staging ..."
    for wfo in session.query(Workflow).filter(Workflow.status=='staging').all():
        wfh = workflowInfo( url, wfo.name, spec=False)
        (_,primary,_,_) = wfh.getIO()
        for prim in primary: 
            input_sizes[prim] = dss.get( prim )
        in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority']))
        min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority']))
    print "... done"
    print "Max priority in transfer already",in_transfer_priority
    print "Min priority in transfer already",min_transfer_priority
    in_transfer_already = sum(input_sizes.values())


    #sort by priority higher first
    wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True)


    ## list the size of all inputs
    print "getting all input sizes ..."
    for (wfo,wfh) in wfs_and_wfh:
        (_,primary,_,_) = wfh.getIO()
        for prim in primary:
            input_sizes[prim] = dss.get( prim )
    print "... done"

    grand_total =  sum(input_sizes.values()) 
    to_transfer = grand_total  - in_transfer_already
    grand_transfer_limit = options.maxtransfer 
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered"%in_transfer_already
    print "%15.4f GB is the current requested transfer load"%to_transfer
    print "%15.4f GB is the global transfer limit"%grand_transfer_limit
    print "%15.4f GB is the available limit"%transfer_limit

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer=0 ## so that we can count'em
    passing_along = 0
    transfer_sizes={}
    went_over_budget=False
    for (wfo,wfh) in wfs_and_wfh:
        print wfh.request['RequestPriority']
        print wfo.name,"to be transfered"
        #wfh = workflowInfo( url, wfo.name)

        (_,primary,_,_) = wfh.getIO()
        this_load=sum([input_sizes[prim] for prim in primary])
        if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ):
            if went_over_budget:
                print "Transfer has gone over bubget."
            else:
                print "Transfer will go over bubget."
            print "%15.4f GB this load"%this_load
            print "%15.4f GB already this round"%sum(transfer_sizes.values())
            print "%15.4f GB is the available limit"%transfer_limit
            went_over_budget=True
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget"
            else:
                if not options.go: 
                    print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop"
                    continue


        ## throtlle by campaign go
        if not CI.go( wfh.request['Campaign'] ):
            print "No go for",wfh.request['Campaign']
            if not options.go: continue

        ## check if the batch is announced
        announced=False
        is_real=False
        for b in mcm.getA('batches',query='contains=%s'% wfo.name):
            is_real = True
            if b['status']=='announced': 
                announced=True 
                break

        if not announced:
            print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?"
            
        if not is_real:
            print wfo.name,"does not appear to be genuine."
            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
        now = time.mktime(time.gmtime()) / (60.*60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced: 
                print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time)
                continue


        passing_along += 1
        if passing_along >= allowed_to_handle:
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle
            else:
                print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along
                break

        (lheinput,primary,parent,secondary) = wfh.getIO()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')
        else:
            sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )

        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist']

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']

        can_go = True
        staging=False
        if primary:
            if talk:
                print wfo.name,'reads',', '.join(primary),'in primary'
            ## chope the primary dataset 
            for prim in primary:
                max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority']))
                sites_really_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                print "Sites allowed minus the vetoed transfer"
                print sites_really_allowed
                copies_needed = int(0.35*len(sites_really_allowed))+1 ## should just go for a fixed number based if the white list grows that big
                print "Would make",copies_needed,"copies"
                if options.maxcopy>0:
                    copies_needed = min(options.maxcopy,copies_needed)

                ## remove the sites that do not want transfers                
                print "need",copies_needed
                workflow_dependencies[prim].add( wfo.id )
                presence = getDatasetPresence( url, prim )
                prim_location = [site for site,pres in presence.items() if pres[0]==True]
                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at",len(prim_location),"sites"
                    continue
                # reduce the number of copies required by existing full copies
                copies_needed = max(0,copies_needed - len(prim_location))
                print "now need",copies_needed
                subscriptions = listSubscriptions( url , prim )
                prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place
                prim_destination = [site for site in prim_destination if not site in prim_location]
                ## add transfer dependencies
                latching_on_transfers =  list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                print latching_on_transfers
                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first()
                    if not tfo:
                        tfo = Transfer( phedexid = latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                            
                    if not wfo.id in tfo.workflows_id:
                        print "adding",wfo.id,"to",tfo.id,"with phedexid",latching
                        l = copy.deepcopy( tfo.workflows_id )
                        l.append( wfo.id )
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                copies_needed = max(0,copies_needed - len(prim_destination))
                print "then need",copies_needed
                if copies_needed == 0:
                    print "The output is either fully in place or getting in full somewhere with",latching_on_transfers
                    can_go = True
                    continue
                prim_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in prim_location])]
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in prim_destination])]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the 
                    if not options or options.chop:
                        spreading = distributeToSites( getDatasetChops(prim), prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges)
                    else:
                        spreading = {} 
                        for site in prim_to_distribute: spreading[site]=[prim]
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    for (site,items) in spreading.items():
                        all_transfers[site].extend( items )




        if secondary:
            if talk:
                print wfo.name,'reads',', '.join(secondary),'in secondary'
            for sec in secondary:
                workflow_dependencies[sec].add( wfo.id )
                presence = getDatasetPresence( url, sec )
                sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites
                subscriptions = listSubscriptions( url ,sec )
                sec_destination = [site for site in subscriptions] 
                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if len( sec_to_distribute )>0:
                    for site in sec_to_distribute:
                        all_transfers[site].append( sec )
                        can_go = False
        
        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                print wfo.name,"latches on existing transfers, and nothing else"
                wfo.status = 'staging'
            else:
                print wfo.name,"should just be assigned NOW to",sites_allowed
                wfo.status = 'staged'
            print "setting status to",wfo.status
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                print wfo.name,"latches on existing transfers"
                if not options.test:
                    wfo.status = 'staging'
                    print "setting status to",wfo.status
                    session.commit()
            print wfo.name,"needs a transfer"
            needs_transfer+=1

    #print json.dumps(all_transfers)
    fake_id=-1
    wf_id_in_prestaging=set()

    for (site,items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))
        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        if site in SI.sites_veto_transfer: 
            print site,"does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]

        if execute:
            print "Making a replica to",site,"(CE)",site_se,"(SE) for"
        else:
            print "Would make a replica to",site,"(CE)",site_se,"(SE) for"

        print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        print "\t",len(datasets),"datasets"
        print "\t",datasets
        items_to_transfer = blocks + datasets

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y','yes','go']:
                continue

        if execute:
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal')
            ## make use of max_priority dataset:priority to set the subscriptions priority
            """
            ## does not function
            once = True
            for item in items_to_transfer:
                bds = item.split('#')[0]
                if max_priority[bds] >= 90000:
                    if once:
                        w=10
                        print "waiting",w,"s before raising priority"
                        time.sleep(w)
                        once=False
                    ## raise it to high priority
                    print item,"subscription priority raised to high at",site_se
                    #print "This does not work yet properly it seems"
                    print updateSubscription(url, site_se, item, priority='high')
            """
        else:
            #result= {'phedex':{'request_created' : [{'id' : fake_id}]}}
            result= {'phedex':{'request_created' : []}}
            fake_id-=1



        if not result:
            print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first()
            print phedexid,"transfer created"
            if not new_transfer:
                new_transfer = Transfer( phedexid = phedexid)
                session.add( new_transfer )                
            new_transfer.workflows_id = set()
            for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update( workflow_dependencies[transfering] )
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status!='staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting",tr_wf.name,"to staging"
        session.commit()
Exemplo n.º 46
0
def injector(url, options, specific):

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    workflows.extend(
        getWorkflows(url,
                     status=options.wmstatus,
                     user='******',
                     rtype="ReReco")
    )  ## regardless of users, pick up all ReReco on the table

    print len(workflows), "in line"
    cannot_inject = set()
    ## browse for assignment-approved requests, browsed for ours, insert the diff
    for wf in workflows:
        if specific and not specific in wf: continue
        exists = session.query(Workflow).filter(Workflow.name == wf).first()
        if not exists:
            wfi = workflowInfo(url, wf)
            #wl = getWorkLoad(url, wf)
            ## check first that there isn't related here with something valid
            can_add = True
            ## first try at finding a match
            #            print wfi.request
            familly = session.query(Workflow).filter(
                Workflow.name.contains(wfi.request['PrepID'])).all()
            if not familly:
                #req_familly = getWorkflowById( url, wl['PrepID'])
                #familly = [session.query(Workflow).filter(Workflow.name == member).first() for member in req_familly]
                pids = wfi.getPrepIDs()
                req_familly = []
                for pid in pids:
                    req_familly.extend(getWorkflowById(url, pid, details=True))

                familly = []
                print len(req_familly), "members"
                for req_member in req_familly:
                    #print "member",req_member['RequestName']
                    owfi = workflowInfo(url,
                                        req_member['RequestName'],
                                        request=req_member)
                    other_pids = owfi.getPrepIDs()
                    if set(pids) == set(other_pids):
                        ## this is a real match
                        familly.extend(
                            session.query(Workflow).filter(
                                Workflow.name ==
                                req_member['RequestName']).all())

            for lwfo in familly:
                if lwfo:
                    ## we have it already
                    if not lwfo.status in [
                            'forget', 'trouble', 'forget-unlock',
                            'forget-out-unlock'
                    ]:
                        sendLog(
                            'injector', "Should not put %s because of %s %s" %
                            (wf, lwfo.name, lwfo.status))
                        print "Should not put", wf, "because of", lwfo.name, lwfo.status
                        cannot_inject.add(wf)
                        can_add = False
            if not can_add: continue
            wfi.sendLog('injector', "considering %s" % wf)
            new_wf = Workflow(name=wf,
                              status=options.setstatus,
                              wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()
            time.sleep(0.5)
        else:
            #print "already have",wf
            pass

    if cannot_inject:
        #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)))
        sendLog(
            'injector',
            'These workflow cannot be added in because of duplicates \n\n %s' %
            ('\n'.join(cannot_inject)),
            level='warning')

    ## passing a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):
        invalidator(url)

    no_replacement = set()

    ## pick up replacements
    for wf in session.query(Workflow).filter(
            Workflow.status == 'trouble').all():
        print wf.name
        if specific and not specific in wf.name: continue
        print wf.name
        wfi = workflowInfo(url, wf.name)
        wl = wfi.request  #getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl['PrepID'])
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url, member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType'] == 'Resubmission': continue
                if fwl['RequestStatus'] in ['None', None, 'new']: continue
                if fwl['RequestStatus'] in [
                        'rejected', 'rejected-archived', 'aborted',
                        'aborted-archived'
                ]:
                    continue
            true_familly.append(fwl)

        if len(true_familly) == 0:
            #sendLog('injector','%s had no replacement'%wf.name, level='critical')
            wfi.sendLog(
                'injector',
                'the workflow was found in trouble with no replacement')
            no_replacement.add(wf.name)
            continue
        else:
            wfi.sendLog(
                'injector',
                'the workflow was found in trouble and has a replacement')

        print wf.name, "has", len(familly), "familly members"
        print wf.name, "has", len(true_familly), "true familly members"

        ##we cannot have more than one of them !!! pick the last one
        if len(true_familly) > 1:
            #sendEmail('multiple wf','please take a look at injector for %s'%wf.name)
            sendLog('injector',
                    'Multiple wf in line, will take the last one for %s \n%s' %
                    (wf.name, ', '.join(fwl['RequestName']
                                        for fwl in true_familly)),
                    level='critical')

        for fwl in true_familly[-1:]:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(
                Workflow.name == member).first()
            if not new_wf:
                sendLog('injector',
                        "putting %s as replacement of %s" % (member, wf.name))
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow(name=member,
                                  status=status,
                                  wm_status=fwl['RequestStatus'])
                wf.status = 'forget'
                session.add(new_wf)
            else:
                if new_wf.status == 'forget': continue
                sendLog(
                    'injector',
                    "getting %s as replacement of %s" % (new_wf.name, wf.name))
                wf.status = 'forget'

            for tr in session.query(Transfer).all():
                if wf.id in tr.workflows_id:
                    sw = copy.deepcopy(tr.workflows_id)
                    sw.remove(wf.id)
                    sw.append(new_wf.id)
                    tr.workflows_id = sw
                    print tr.phedexid, "got", new_wf.name
                    if new_wf.status != 'away':
                        print "\t setting it considered"
                        new_wf.status = 'considered'
                    if tr.phedexid < 0:  ## set it back to positive
                        tr.phedexid = -tr.phedexid
                    session.commit()

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
    if no_replacement:
        #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement)))
        sendLog('injector',
                'workflow with no replacement, %s \n are dangling there' %
                ('\n'.join(no_replacement)),
                level='critical')
Exemplo n.º 47
0
url = reqmgr_url

up = componentInfo(soft=['mcm','wtc','jira'])
if not up.check(): sys.exit(0)

status = sys.argv[1]
max_wf = 0

print "Picked status",status

wfs = []
if status == 'wmagent':
    register=['assigned','acquired','running-open','running-closed','force-complete','completed','closed-out']
    for r in register:
        wfs.extend( getWorkflows(url, r, details=True) )

elif status.endswith('*'):
    wfs.extend([wfo.name for wfo in  session.query(Workflow).filter(Workflow.status.startswith(status[:-1])).all() ])
else:
    wfs.extend([wfo.name for wfo in  session.query(Workflow).filter(Workflow.status==status).all() ])



if max_wf: wfs = wfs[:max_wf]

random.shuffle( wfs )
all_blocks_at_sites = defaultdict(set)

#done = json.loads(open('myblock_done.json').read())
done = {}
Exemplo n.º 48
0
def batchor(url):
    UC = unifiedConfiguration()
    SI = global_SI()
    CI = campaignInfo()
    BI = batchInfo()
    ## get all workflows in assignment-approved with SubRequestType = relval
    all_wfs = []
    for user in UC.get("user_relval"):
        all_wfs.extend(
            getWorkflows(url,
                         'assignment-approved',
                         details=True,
                         user=user,
                         rtype='TaskChain'))

    wfs = filter(
        lambda r: r['SubRequestType'] == 'RelVal'
        if 'SubRequestType' in r else False, all_wfs)
    ## need a special treatment for those
    hi_wfs = filter(
        lambda r: r['SubRequestType'] == 'HIRelVal'
        if 'SubRequestType' in r else False, all_wfs)

    by_campaign = defaultdict(set)
    by_hi_campaign = defaultdict(set)
    for wf in wfs:
        print "Relval:", wf['RequestName'], wf['Campaign']
        by_campaign[wf['Campaign']].add(wf['PrepID'])

    for wf in hi_wfs:
        print "HI Relval:", wf['RequestName'], wf['Campaign']
        by_hi_campaign[wf['Campaign']].add(wf['PrepID'])

    default_setup = {
        "go": True,
        "parameters": {
            "SiteWhitelist": ["T1_US_FNAL"],
            "MergedLFNBase": "/store/relval",
            "Team": "relval",
            "NonCustodialGroup": "RelVal"
        },
        "custodial_override": "notape",
        "phedex_group": "RelVal",
        "lumisize": -1,
        "fractionpass": 0.0,
        "maxcopies": 1
    }
    default_hi_setup = copy.deepcopy(default_setup)

    add_on = {}
    relval_routing = UC.get('relval_routing')

    def pick_one_site(p):
        ## modify the parameters on the spot to have only one site
        if "parameters" in p and "SiteWhitelist" in p["parameters"] and len(
                p["parameters"]["SiteWhitelist"]) > 1:
            choose_from = list(
                set(p["parameters"]["SiteWhitelist"]) & set(SI.sites_ready))
            picked = random.choice(choose_from)
            print "picked", picked, "from", choose_from
            p["parameters"]["SiteWhitelist"] = [picked]

    batches = BI.all()
    for campaign in by_campaign:
        if campaign in batches: continue
        ## get a bunch of information
        setup = copy.deepcopy(default_setup)

        for key in relval_routing:
            if key in campaign:
                ## augment with the routing information
                augment_with = relval_routing[key]
                print "Modifying the batch configuration because of keyword", key
                print "with", augment_with
                setup = deep_update(setup, augment_with)

        pick_one_site(setup)
        add_on[campaign] = setup
        sendLog('batchor',
                'Adding the relval campaigns %s with parameters \n%s' %
                (campaign, json.dumps(setup, indent=2)),
                level='critical')
        BI.update(campaign, by_campaign[campaign])

    for campaign in by_hi_campaign:
        if campaign in batches: continue
        ## get a bunch of information
        setup = copy.deepcopy(default_hi_setup)
        possible_sites = set(["T1_DE_KIT", "T1_FR_CCIN2P3"])
        hi_site = random.choice(list(possible_sites))
        setup["parameters"]["SiteWhitelist"] = [hi_site]

        pick_one_site(setup)
        add_on[campaign] = setup
        sendLog('batchor',
                'Adding the HI relval campaigns %s with parameters \n%s' %
                (campaign, json.dumps(setup, indent=2)),
                level='critical')
        BI.update(campaign, by_hi_campaign[campaign])

    ## only new campaigns in announcement
    for new_campaign in list(
            set(add_on.keys()) - set(CI.all(c_type='relval'))):
        ## this is new, and can be announced as such
        print new_campaign, "is new stuff"
        subject = "Request of RelVal samples batch %s" % new_campaign
        text = """Dear all, 
A new batch of relval workflows was requested.

Batch ID:

%s

Details of the workflows:

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

This is an automated message""" % (
            new_campaign,
            new_campaign,
        )

        print subject
        print text
        to = ['*****@*****.**']
        sendEmail(subject, text, destination=to)
        sendLog('batchor', text, level='critical')

    ## go through all existing campaigns and remove the ones not in use anymore ?
    for old_campaign in CI.all(c_type='relval'):
        all_in_batch = getWorkflowByCampaign(url, old_campaign, details=True)
        if not all_in_batch: continue
        is_batch_done = all(
            map(
                lambda s: not s in [
                    'completed', 'force-complete', 'running-open',
                    'running-closed', 'acquired', 'assigned',
                    'assignment-approved'
                ], [wf['RequestStatus'] for wf in all_in_batch]))
        ## check all statuses
        if is_batch_done:
            #print "batch",old_campaign,"can be closed or removed if necessary"
            #campaigns[old_campaign]['go'] = False ## disable
            CI.pop(old_campaign)  ## or just drop it all together ?
            BI.pop(old_campaign)
            print "batch", old_campaign, " configuration was removed"

    ## merge all anyways
    CI.update(add_on, c_type='relval')
Exemplo n.º 49
0
def htmlor():
    cache = getWorkflows('cmsweb.cern.ch', 'assignment-approved', details=True)

    def wfl(wf,
            view=False,
            p=False,
            ms=False,
            within=False,
            ongoing=False,
            status=False,
            update=False):
        wfn = wf.name
        wfs = wf.wm_status
        pid = None
        pids = filter(lambda seg: seg.count('-') == 2, wf.name.split('_'))
        if len(pids):
            pid = pids[0]
        text = ', '.join([
            #wfn,
            '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a>'
            % (wfn, wfn),
            '(%s) <br>' % wfs
        ])
        text += ', '.join([
            '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts</a>'
            % wfn,
            '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>'
            % wfn,
            '<a href="https://cmsweb.cern.ch/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>'
            % wfn,
            '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>'
            % wfn,
            '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>'
            % wfn,
            '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>'
            % wfn,
            '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>'
            % wfn,
            '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>'
            % pid,
            '<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank">pv</a>'
            % wfn,
            '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>'
            % wfn,
            '<a href="closeout.html#%s" target="_blank">clo</a>' % wfn,
            '<a href="statuses.html#%s" target="_blank">st</a>' % wfn,
            '<a href="https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>'
            % wfn
        ])
        if within and (not view or wfs == 'completed'):
            cached = filter(lambda d: d['RequestName'] == wfn, cache)
            if cached:
                wl = cached[0]
            else:
                wl = getWorkLoad('cmsweb.cern.ch', wfn)
            if 'InputDataset' in wl:
                dataset = wl['InputDataset']
                text += ', '.join([
                    '',
                    '<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>'
                    % dataset,
                    '<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>'
                    % dataset,
                    '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>'
                    % dataset,
                    '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>'
                    % dataset,
                ])

        if p:
            cached = filter(lambda d: d['RequestName'] == wfn, cache)
            if cached:
                wl = cached[0]
            else:
                wl = getWorkLoad('cmsweb.cern.ch', wfn)
            text += ', (%s)' % (wl['RequestPriority'])
            pass

        if pid:
            if ms:
                mcm_s = json.loads(
                    os.popen(
                        'curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure'
                        % pid).read())[pid]
                text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>' % (
                    pid, mcm_s)
            else:
                text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>' % (
                    pid)
                text += ', <a href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>' % (
                    pid)

        if status:
            if wf.status.startswith('assistance'):
                text += ', <a href="assistance.html#%s" target="_blank">assist</a>' % wfn
            text += ' : %s ' % (wf.status)

        if view and wfs != 'acquired':
            text += '<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>' % (
                wfn.replace('_', '/'), wfn.replace('_', '/'))
        if ongoing:
            text += '<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a>' % (
                wfn, wfn)
        text += "<hr>"
        return text

    def phl(phid):
        text = ', '.join([
            str(phid),
            '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>'
            % phid,
            '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>'
            % phid,
        ])
        return text

    def ol(out):
        return '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>' % (
            out, out)

    ## start to write it
    #html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w')
    html_doc = open('/afs/cern.ch/user/c/cmst2/www/unified/index.html', 'w')
    print "Updating the status page ..."
    html_doc.write("""
<html>
<head>
<META HTTP-EQUIV="refresh" CONTENT="900">
<script type="text/javascript">
 function showhide(id) {
    var e = document.getElementById(id);
    e.style.display = (e.style.display == 'block') ? 'none' : 'block';
 }
</script>
</head>
<body>

Last update on %s(CET), %s(GMT), <a href=logs/ target=_blank> logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a> <br><br>

""" % (time.asctime(time.localtime()), time.asctime(time.gmtime())))

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status == 'considered').all():
        text += "<li> %s </li> \n" % wfl(wf, p=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Worlfow next to handle <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank"> batches</a> (%d) <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('considered')">[Click to show/hide]</a>
<br>
<div id="considered" style="display:none;">
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        text += "<li> %s </li> \n" % wfl(wf, within=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Worlfow waiting in staging (%d) <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staging')">[Click to show/hide]</a>
<br>
<div id="staging" style="display:none;">
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for ts in session.query(Transfer).all():
        stext = '<li> %s serves </li><a href="javascript:showhide(\'%s\')">[show/hide]</a> <div id="%s" style="display:none;"><ul>' % (
            phl(ts.phedexid), ts.phedexid, ts.phedexid)
        hide = True
        for pid in ts.workflows_id:
            w = session.query(Workflow).get(pid)
            hide &= (w.status != 'staging')
            stext += "<li> %s </li>\n" % (wfl(w, status=True))
        stext += "</ul></div>\n"
        if hide:
            #text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid)
            pass
        else:
            count += 1
            text += stext
    text += "</ul></div>"
    html_doc.write("""
Transfer on-going (%d) <a href=https://transferteam.web.cern.ch/transferteam/dashboard/ target=_blank>dashboard</a> <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('transfer')">[Click to show/hide]</a>
<br>
<div id="transfer" style="display:none;">
<br>
<ul>""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status == 'staged').all():
        text += "<li> %s </li> \n" % wfl(wf, p=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """Worlfow ready for assigning (%d) <a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staged')">[Click to show/hide]</a>
<br>
<div id="staged" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    lines = []
    for wf in session.query(Workflow).filter(Workflow.status == 'away').all():
        lines.append("<li> %s </li>" % wfl(wf, view=True, ongoing=True))
    lines.sort()
    html_doc.write("""
Worlfow on-going (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a> <a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a> <a href=http://hcc-briantest.unl.edu/prodview target=_blank>queues</a> <a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('away')">[Click to show/hide]</a>
<br>
<div id="away" style="display:none;">
<br>
<ul>
%s
</ul>
</div>
""" % (len(lines), '\n'.join(lines)))

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status == 'assistance').all():
        text += "<li> %s </li> \n" % wfl(
            wf, view=True, update=True, status=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""Worlfow that are closing (%d)
<a href=closeout.html target=_blank>closeout</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('closing')">[Click to show/hide]</a>
<br>
<div id="closing" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status.startswith('assistance-')).all():
        text += "<li> %s </li> \n" % wfl(
            wf, view=True, within=True, status=True, update=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""Worlfow which need assistance (%d)
<a href=assistance.html target=_blank>assistance</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('assistance')">[Click to show/hide]</a>
<br>
<div id="assistance" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'close').all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""Worlfow ready to close (%d)
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('close')">[Click to show/hide]</a>
<br>
<div id="close" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status == 'trouble').all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """Worlfow with issue (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a>
<a href="javascript:showhide('trouble')">[Click to show/hide]</a>
<br>
<div id="trouble" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status == 'forget').all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Worlfow to forget (%d)
<a href="javascript:showhide('forget')">[Click to show/hide]</a>
<br>
<div id="forget" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'done').all():
        text += "<li> %s </li> \n" % wfl(wf)  #,ms=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Worlfow through (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/cleanor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('done')">[Click to show/hide]</a>
<br>
<div id="done" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'clean').all():
        text += "<li> %s </li> \n" % wfl(wf)  #,ms=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Worlfow clean for input (%d) <a href=logs/cleanor/last.log target=_blank>log</a>
<a href="javascript:showhide('clean')">[Click to show/hide]</a>
<br>
<div id="clean" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status.endswith('-out')).all():
        text += "<li> %s </li> \n" % wfl(wf, status=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Worlfow clean for output (%d) <a href=logs/outcleanor/last.log target=_blank>log</a>
<a href="javascript:showhide('cleanout')">[Click to show/hide]</a>
<br>
<div id="cleanout" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    lines_thisweek = []
    lines_lastweek = []
    now = time.mktime(time.gmtime())
    this_week = int(time.strftime("%W", time.gmtime()))
    for out in session.query(Output).all():
        if not out.workflow:
            print "This is a problem with", out.datasetname
            continue
        if out.workflow.status in ['done', 'clean']:
            out_week = int(time.strftime("%W", time.gmtime(out.date)))
            ##only show current week, and the previous.
            if (this_week - out_week) == 1:
                lines_lastweek.append("<li>on week %s : %s </li>" % (
                    time.strftime("%W (%x %X)", time.gmtime(out.date)),
                    ol(out.datasetname),
                ))
            if (this_week - out_week) == 0:
                lines_thisweek.append("<li>on week %s : %s </li>" % (
                    time.strftime("%W (%x %X)", time.gmtime(out.date)),
                    ol(out.datasetname),
                ))
    lines_thisweek.sort()
    lines_lastweek.sort()

    html_doc.write(
        """Output produced <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a> (%d)
<a href="javascript:showhide('output')">[Click to show/hide]</a>
<br>
<div id="output" style="display:none;">
<br>
<ul>
<li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul>
%s
</ul></div>
<li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul>
%s
</ul></div></div>
""" % (len(lines_lastweek) + len(lines_thisweek), len(lines_lastweek),
       '\n'.join(lines_lastweek), len(lines_thisweek),
       '\n'.join(lines_thisweek)))

    html_doc.write("""Job installed
<a href="javascript:showhide('acron')">[Click to show/hide]</a>
<br>
<div id="acron" style="display:none;">
<br>
<pre>
%s
</pre></div>
""" % (os.popen('acrontab -l | grep Unified').read()))

    text = ""
    count = 0
    for (c, info) in campaignInfo().campaigns.items():
        #if 'go' in info and info['go']:
        text += "<li>%s <br> <pre>%s</pre>  </li>" % (
            c, json.dumps(info, indent=2))
        count += 1

    html_doc.write("""Campaign configuration
<a href="javascript:showhide('campaign')">[Click to show/hide]</a>
<br>
<div id="campaign" style="display:none;">
<br>
<ul>
%s
</ul></div>
""" % (text))

    text = ""
    count = 0
    n_column = 4
    SI = siteInfo()
    for t in SI.types():
        #text+="<li>%s<ul>"%t
        #for site in getattr(SI,t):
        #    text+="<li><a href=http://hcc-briantest.unl.edu/prodview/%s>%s<a/> </li>"%( site, site)
        #    text+='<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a>'%(site,site)
        #text+="</ul></li>"

        text += "<li>%s<table border=1>" % t
        c = 0
        for site in getattr(SI, t):
            cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else 'N/A'
            disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE(
                site) in SI.disk else 'N/A'
            if c == 0:
                text += "<tr>"
            text += '<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a><br>CPU pledge: %s<br>Disk available: %s</td>' % (
                site, site, site, site, cpu, disk)
            if c == n_column:
                c = 0
            else:
                c += 1
        text += "</table></li>"

    html_doc.write("""Site configuration
<a href="javascript:showhide('site')">[Click to show/hide]</a>
<br>
<div id="site" style="display:none;">
<br>
<ul>
%s
</ul></div>
""" % (text))

    print "... done with status page."
    html_doc.write("""
</body>
</html>
""")

    html_doc.close()

    html_doc = open('/afs/cern.ch/user/c/cmst2/www/unified/statuses.html', 'w')
    html_doc.write(
        """                                                                                                                                                                                                                                                                                                      <html>        
<table border=1>
<thead>
<tr>
<th> workflow </th><th> status </th><th> wm status</th>
</tr>
</thead>
""")
    wfs = {}
    for wfo in session.query(Workflow).all():
        wfs[wfo.name] = (wfo.status, wfo.wm_status)
    open('/afs/cern.ch/user/c/cmst2/www/unified/statuses.json',
         'w').write(json.dumps(wfs))
    for wfn in sorted(wfs.keys()):
        html_doc.write(
            '<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n' %
            (wfn, wfn, wfs[wfn][0], wfs[wfn][1]))
    html_doc.write("</table>")
    html_doc.write("<br>" * 100)
    html_doc.write("end of page</html>")
    html_doc.close()