def transitiontime(wf, status):
    logs = filter(lambda change: change["Status"] == status, wf['RequestTransition'])
    if logs:
        return logs[-1]['UpdateTime']
    else:
        return None

delays = {
    'assignment-approved': (7, 14),
    'new': (7, 14),
    'completed': (14, 21),
    'closed-out': (14, 21),
}

warnings = defaultdict(set)
for checkin, (warn, timeout) in delays.items():
    wfs = getWorkflows(url, checkin, user=None, details=True)
    for wf in wfs:
        if not 'backfill' in wf['RequestName'].lower(): continue
        transition = transitiontime(wf, checkin)
        if transition and (now - transition) > (timeout * 24 * 60 * 60):
            ## that one can go away
            print wf['RequestName'], "is old enough to be removed", wf['RequestStatus']
            reqMgrClient.invalidateWorkflow(url, wf['RequestName'], current_status=wf['RequestStatus'])
        elif transition and (now - transition) > (warn * 24 * 60 * 60):
            ## warn the requester
            print wf['RequestName'], "is old enough to be removed", wf['RequestStatus']
            warnings[wf['Requestor']].add(wf['RequestName'])

for who in warnings:
    sendEmail('Old Backfill in the system',
              'The following backfill should be removed or moved to rejected/announced\n\n%s' % ('\n'.join(sorted(warnings[who]))),
              destination=[who + '@cern.ch'])
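## A minimal usage sketch for transitiontime() above; the workflow record and
## timestamps are invented for illustration, not real ReqMgr data.
def _transitiontime_example():
    import time
    example_wf = {
        'RequestName': 'fake_backfill_request_v1',
        'RequestTransition': [
            {'Status': 'new', 'UpdateTime': 1500000000},
            {'Status': 'assignment-approved', 'UpdateTime': 1500600000},
        ],
    }
    approved = transitiontime(example_wf, 'assignment-approved')  ## -> 1500600000, the latest matching transition
    age_days = (time.time() - approved) / (24. * 60. * 60.)
    print example_wf['RequestName'], "was approved", age_days, "days ago"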
def assignor(url, specific=None, talk=True, options=None):
    if userLock() and not options.manual: return
    mlock = moduleLock()
    if mlock() and not options.manual: return
    if not componentInfo().check() and not options.manual: return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = siteInfo()
    SI = global_SI()
    ###NLI = newLockInfo()
    ###if not NLI.free() and not options.go: return
    LI = lockInfo()
    #if not LI.free() and not options.go and not options.manual: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])
    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    aaa_mapping = json.loads(eosRead('%s/equalizor.json' % monitor_pub_dir))['mapping']
    all_stuck = set()
    all_stuck.update(json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)))

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ## order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True),
                       key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]

        def rank(wfn):
            return cache.index(wfn) if wfn in cache else 0

        wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True)
        print "10 first", [wfo.name for wfo in wfos[:10]]
        print "10 last", [wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle(wfos)

    for wfo in wfos:
        if options.limit and (n_stalled + n_assigned) > options.limit:
            break
        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue
            #if not specific in wfo.name: continue

        if not options.manual and 'rucio' in (wfo.name).lower(): continue

        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in ['rejected', 'aborted', 'aborted-completed', 'aborted-archived', 'rejected-archived'] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early:
            options_text += ", early option is ON"

        wfh.sendLog('assignor', "%s to be assigned%s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed, sites_not_allowed) = wfh.getSiteWhiteList()

        output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor', 'Workflow %s has no output at all' % (wfo.name), level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor', "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]:
                banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)))
                    sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)), level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)):
                msg = '%s is not an allowed secondary' % (', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('assignor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(wfh.getPrepIDs()[0])
                sendLog('assignor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))

        blocks = wfh.getBlocks()
        if blocks:
            wfh.sendLog('assignor', "Needs {} blocks in input {}".format(len(blocks), '\n'.join(blocks)))
        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters and primary:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']

        wfh.sendLog('assignor', "Initial values for primary_AAA=%s and secondary_AAA=%s" % (primary_aaa, secondary_aaa))

        if primary_aaa:
            if "T2_CH_CERN_HLT" in sites_allowed:
                sites_allowed.remove("T2_CH_CERN_HLT")
            if "T2_CH_CERN_HLT" not in sites_not_allowed:
                sites_not_allowed.append("T2_CH_CERN_HLT")

        ## keep track of this, after secondary input location restriction : that's how you want to operate it
        initial_sites_allowed = copy.deepcopy(sites_allowed)

        set_lfn = '/store/mc'  ## by default
        for prim in list(primary):
            set_lfn = getLFNbase(prim)

        ## if they are requested for processing, they should all be closed already
        # FIXME: remove this closeAllBlocks
        #closeAllBlocks(url, prim, blocks)

        ## should be 2 but for the time-being let's lower it to get things going
        _copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow', 'that wf %s has a large number of CPUh %s, not assigning, please check the logs' % (wfo.name, cpuh))  #, destination=['*****@*****.**'])
            sendLog('assignor', '%s requires a large number of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical')
            wfh.sendLog('assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to the original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        # TODO Alan on 1/april/2020: keep the AAA functionality
        if primary_aaa:
            ## remove the sites not reachable locally if not holding the data
            if not sites_allowed:
                wfh.sendLog('assignor', "Overriding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_allowed)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog('assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault('RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog('assignor', "Cannot assign StoreResults request because MergedLFN is missing")
                sendLog('assignor', 'Cannot assign StoreResults request because MergedLFN is missing', level='critical')
                continue

        if not primary_aaa:
            if isStoreResults:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhitelist set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog('assignor', "Cannot assign StoreResults request because SiteWhitelist is missing")
                    sendLog('assignor', 'Cannot assign StoreResults request because SiteWhitelist is missing', level='critical')
                    n_stalled += 1
                    continue
            wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed))

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor', "cannot be assigned with no matched sites")
        #        sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical')
        #    n_stalled += 1
        #    continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the allowed sites is under low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog('assignor', "The workflow can run at %s under low pressure currently" % (','.join(allowed_and_low)))

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor', "cannot be assigned with no matched sites")
                sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical')
            n_stalled += 1
            continue

        t1t2_only = [ce for ce in sites_allowed if ce.startswith('T1') or ce.startswith('T2')]
        if t1t2_only:
            # try to pick from T1 and T2 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])]
        else:
            # then pick anything otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
        print "available=", SI.disk[sites_out[0]]
        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)

        parameters = {
            'SiteWhitelist': sites_allowed,
            'SiteBlacklist': sites_not_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog('assignor', "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog('assignor', "Reading secondary through xrootd at %s" % sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'):
            team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading of the LHE articles
            wfh.sendLog('assignor', 'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ## parse options entered in command line if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog('assignor', 'Holding on to the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog('assignor', 'Applying the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
            eventsPerJob = int(numEvents / (reqJobs * 1.4))
            lumisPerJob = int(eventsPerJob / eventsPerLumi)
            if lumisPerJob == 0:
                #sendEmail("issue with event splitting for run-dependent MC", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob))
                sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical')
                wfh.sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob))
                parameters['EventsPerJob'] = eventsPerJob
            else:
                spl = wfh.getSplittings()[0]
                # FIXME: decide which of the lines below needs to remain...
                eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None
                if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                    #sendEmail("setting lumi splitting for run-dependent MC", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob))
                    sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical')
                    wfh.sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob))
                    parameters['LumisPerJob'] = lumisPerJob
                else:
                    #sendEmail("leaving splitting untouched for PU_RD*", "please check on " + wfo.name)
                    sendLog('assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical')
                    wfh.sendLog('assignor', "leaving splitting untouched for PU_RD*, please check.")

        #if isHEPCloudReady(url) and wfh.isGoodForNERSC():
        #    parameters['Team'] = 'hepcloud'
        #    parameters['SiteWhitelist'] = ['T3_US_NERSC']
        #    if primary:
        #        parameters['TrustSitelists'] = True
        #    if secondary:
        #        parameters['TrustPUSitelists'] = True
        #    sendEmail("sending work to hepcloud", "please check on %s" % wfh.request['RequestName'], destination=['*****@*****.**'])

        ## make sure to auto-approve all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites', [])))

        result = reqMgrClient.assignWorkflow(url, wfo.name, None, parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog('assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2)))
                if wfh.producePremix() and (not wfh.isRelval()):
                    title = "Heavy workflow assigned to {}".format(parameters['SiteWhitelist'])
                    body = "Workflow name: {}".format(wfh.request['RequestName'])
                    body += "\nOutput dataset(s): {}".format(wfh.request['OutputDatasets'])
                    body += "\nAssigned to: {}".format(parameters['SiteWhitelist'])
                    sendEmail(title, body, destination=['*****@*****.**'])
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')
                except Exception as e:
                    print "fail in locking output"
                    print str(e)
                    sendEmail("failed locking of output", str(e))
            else:
                wfh.sendLog('assignor', "Failed to assign %s.\n%s \nPlease check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor', "Failed to assign %s.\n%s \nPlease check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass

    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor', "%s workflows cannot be assigned. Please take a look" % (n_stalled), level='critical')
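## A small self-contained sketch of the precedence rule that pick_options() and
## pick_campaign() implement in assignor() above: with force_options the command
## line wins, otherwise campaign-level parameters are applied last and win.
## The parameter names and values here are made up.
def _parameter_precedence_demo(force_options):
    parameters = {'ProcessingVersion': 1}
    from_options = {'Memory': 4000}                   ## what the operator typed
    from_campaign = {'parameters': {'Memory': 2000}}  ## what the campaign configures
    if force_options:
        parameters.update(from_campaign.get('parameters', {}))
        parameters.update(from_options)  ## options applied last : they win
    else:
        parameters.update(from_options)
        parameters.update(from_campaign.get('parameters', {}))  ## campaign wins
    return parameters

## _parameter_precedence_demo(True)  -> {'ProcessingVersion': 1, 'Memory': 4000}
## _parameter_precedence_demo(False) -> {'ProcessingVersion': 1, 'Memory': 2000}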
def injector(url, options, specific):
    mlock = moduleLock()
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    UC = unifiedConfiguration()
    transform_keywords = UC.get('convert_to_stepchain')

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    for user in UC.get("user_rereco"):
        workflows.extend(getWorkflows(url, status=options.wmstatus, user=user, rtype="ReReco"))
    for user in (options.user_relval.split(',') if options.user_relval else UC.get("user_relval")):
        workflows.extend(getWorkflows(url, status=options.wmstatus, user=user, rtype="TaskChain"))
    for user in (options.user_storeresults.split(',') if options.user_storeresults else UC.get("user_storeresults")):
        workflows.extend(getWorkflows(url, status=options.wmstatus, user=user, rtype="StoreResults"))

    print len(workflows), "in line"
    cannot_inject = set()
    to_convert = set()
    status_cache = defaultdict(str)

    ## browse for assignment-approved requests, browse for ours, insert the diff
    for wf in workflows:
        if specific and not specific in wf: continue
        exists = session.query(Workflow).filter(Workflow.name == wf).first()
        if not exists:
            wfi = workflowInfo(url, wf)
            ## check first that there isn't something related and valid in there already
            can_add = True
            ## first try at finding a match
            familly = session.query(Workflow).filter(Workflow.name.contains(wfi.request['PrepID'])).all()
            if not familly:
                pids = wfi.getPrepIDs()
                req_familly = []
                for pid in pids:
                    req_familly.extend(getWorkflowById(url, pid, details=True))
                familly = []
                print len(req_familly), "members"
                for req_member in req_familly:
                    #print "member", req_member['RequestName']
                    owfi = workflowInfo(url, req_member['RequestName'], request=req_member)
                    other_pids = owfi.getPrepIDs()
                    if set(pids) == set(other_pids):
                        ## this is a real match
                        familly.extend(session.query(Workflow).filter(Workflow.name == req_member['RequestName']).all())

            for lwfo in familly:
                if lwfo:
                    ## we have it already
                    if not lwfo.status in ['forget', 'trouble', 'forget-unlock', 'forget-out-unlock']:
                        wfi.sendLog('injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status))
                        sendLog('injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status), level='critical')
                        print "Should not put", wf, "because of", lwfo.name, lwfo.status
                        cannot_inject.add(wf)
                        can_add = False

            ## add a check on validity of input datasets
            _, prim, par, sec = wfi.getIO()
            for d in list(prim) + list(par) + list(sec):
                if not d in status_cache:
                    status_cache[d] = getDatasetStatus(d)
                if status_cache[d] != 'VALID':
                    wfi.sendLog('injector', "One of the input is not VALID. %s : %s" % (d, status_cache[d]))
                    sendLog('injector', "One of the input of %s is not VALID. %s : %s" % (wf, d, status_cache[d]), level='critical')
                    can_add = False
                ## check for any file in phedex, to verify existence
                _, ph_files, _, _ = getDatasetFiles(url, d)
                if not ph_files and not ('StoreResults' == wfi.request.setdefault('RequestType', None)):
                    wfi.sendLog('injector', "One of the input has no file in phedex: %s" % d)
                    sendLog('injector', "One of the input has no file in phedex: %s" % d, level='critical')
                    can_add = False

            ### ban some workflow that you don't like anymore
            #outputs = wfi.request['OutputDatasets']

            if not can_add: continue

            ## temporary hack to transform specific taskchains into stepchains
            #good_for_stepchain = wfi.isGoodToConvertToStepChain(keywords=transform_keywords)
            good_for_stepchain = wfi.isGoodToConvertToStepChain(keywords=None)
            ## match keywords and technical constraints
            #if (not options.no_convert) and good_for_stepchain and not wfi.isRelval():
            #    to_convert.add(wf)
            #    wfi.sendLog('injector', 'Transforming %s TaskChain into StepChain' % wf)
            #    #sendEmail('conversion to stepchain', 'Transforming %s TaskChain into StepChain' % wf)

            wfi.sendLog('injector', "considering %s" % wf)

            new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()
            time.sleep(0.5)
        else:
            #print "already have", wf
            pass

    if cannot_inject:
        #sendEmail('workflow duplicates', 'These workflows cannot be added because of duplicates \n\n %s' % ('\n'.join(cannot_inject)))
        sendLog('injector', 'These workflows cannot be added because of duplicates \n\n %s' % ('\n'.join(cannot_inject)), level='warning')

    for wf in to_convert:
        os.system('./Unified/rejector.py --clone --to_step --comments "Transform to StepChain" %s' % wf)

    ## passing a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):
        invalidator(url)

    no_replacement = set()

    #print "getting all transfers"
    #all_transfers = session.query(Transfer).all()
    #print "go!"

    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        print wf.name
        if specific and not specific in wf.name: continue
        print wf.name
        wfi = workflowInfo(url, wf.name)
        wl = wfi.request  #getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl['PrepID'])
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url, member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType'] == 'Resubmission': continue
                if fwl['RequestStatus'] in ['None', None, 'new']: continue
                if fwl['RequestStatus'] in ['rejected', 'rejected-archived', 'aborted', 'aborted-archived']: continue
            true_familly.append(fwl)

        if len(true_familly) == 0:
            #sendLog('injector', '%s had no replacement' % wf.name, level='critical')
            if wfi.isRelval():
                wfi.sendLog('injector', 'the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this. Setting forget')
                wf.status = 'forget'
                session.commit()
            else:
                wfi.sendLog('injector', 'the workflow was found in trouble with no replacement')
                no_replacement.add(wf.name)
            continue
        else:
            wfi.sendLog('injector', 'the workflow was found in trouble and has a replacement')

        print wf.name, "has", len(familly), "familly members"
        print wf.name, "has", len(true_familly), "true familly members"

        ## we cannot have more than one of them !!! pick the last one
        if len(true_familly) > 1:
            #sendEmail('multiple wf', 'please take a look at injector for %s' % wf.name)
            sendLog('injector', 'Multiple wf in line, will take the last one for %s \n%s' % (wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical')

        for fwl in true_familly[-1:]:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(Workflow.name == member).first()
            if not new_wf:
                sendLog('injector', "putting %s as replacement of %s" % (member, wf.name))
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow(name=member, status=status, wm_status=fwl['RequestStatus'])
                wf.status = 'forget'
                session.add(new_wf)
            else:
                if new_wf.status == 'forget': continue
                sendLog('injector', "getting %s as replacement of %s" % (new_wf.name, wf.name))
                wf.status = 'forget'

            for tr in session.query(TransferImp).filter(TransferImp.workflow_id == wf.id).all():
                ## get all transfers working for the old workflow
                existing = session.query(TransferImp).filter(TransferImp.phedexid == tr.phedexid).filter(TransferImp.workflow_id == new_wf.id).all()
                tr.active = False  ## disable the old one
                if not existing:
                    ## create the transfer object for the new dependency
                    tri = TransferImp(phedexid=tr.phedexid, workflow=new_wf)
                    session.add(tri)
                session.commit()

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()

    if no_replacement:
        #sendEmail('workflow with no replacement', '%s \n are dangling there' % ('\n'.join(no_replacement)))
        sendLog('injector', 'workflows with no replacement\n%s \nare dangling there' % ('\n'.join(no_replacement)), level='critical')
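## Toy illustration of the replacement selection used in injector() above:
## among the PrepID siblings of a workflow in trouble, keep only strictly
## newer, non-Resubmission requests in a live status, and take the last one.
## The dict fields mirror the ReqMgr records used above; records are invented.
def _pick_replacement(troubled, siblings):
    dead = ['rejected', 'rejected-archived', 'aborted', 'aborted-archived']
    true_familly = []
    for fwl in siblings:
        if fwl['RequestName'] == troubled['RequestName']: continue
        if fwl['RequestDate'] < troubled['RequestDate']: continue
        if fwl['RequestType'] == 'Resubmission': continue
        if fwl['RequestStatus'] in ['None', None, 'new'] + dead: continue
        true_familly.append(fwl)
    return true_familly[-1] if true_familly else None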
def equalizor(url, specific=None, options=None):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    if not specific:
        workflows = getWorkflows(url, status='running-closed', details=True)
        workflows.extend(getWorkflows(url, status='running-open', details=True))

    ## start from scratch
    modifications = defaultdict(dict)
    ## define regionality site => fallback allowed. feed on an ssb metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)
    SI = siteInfo()
    for site in SI.sites_ready:
        region = site.split('_')[1]
        if not region in ['US', 'DE', 'IT']: continue
        regions[region] = [region]

    def site_in_depletion(s):
        return True  ## short-circuited : every site is currently considered in depletion
        if s in SI.sites_pressure:
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                print s, m, r, "lacking pressure"
                return True
            else:
                print s, m, r, "pressure"
        return False

    for site in SI.sites_ready:
        region = site.split('_')[1]
        ## fallback to the region, to sites with on-going low pressure
        mapping[site] = [fb for fb in SI.sites_ready if any([('_%s_' % (reg) in fb and fb != site and site_in_depletion(fb)) for reg in regions[region]])]

    use_T0 = False
    if options.augment: use_T0 = True
    use_HLT = False
    if options.augment: use_HLT = True
    if use_HLT:
        mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')

    #mapping['T2_IT_Legnaro'].append('T1_IT_CNAF')
    for reg in ['IT', 'DE', 'UK']:
        mapping['T2_CH_CERN'].extend([fb for fb in SI.sites_ready if '_%s_' % reg in fb])

    for site, fallbacks in mapping.items():
        for fb in fallbacks:
            reversed_mapping[fb].append(site)

    ## this is the fallback mapping
    print json.dumps(mapping, indent=2)
    #print json.dumps(reversed_mapping, indent=2)

    altered_tasks = set()

    def running_idle(wfi, task_name):
        gmon = wfi.getGlideMon()
        #print gmon
        if not gmon: return (0, 0)
        if not task_name in gmon: return (0, 0)
        return (gmon[task_name]['Running'], gmon[task_name]['Idle'])

    def needs_action(wfi, task, min_idled=100, pressure=0.2):
        task_name = task.pathName.split('/')[-1]
        running, idled = running_idle(wfi, task_name)
        go = True
        if not idled and not running:
            go = False
        if idled < 100:
            go = False
        if (not running and idled) or (running and (idled / float(running) > pressure)):
            go = True
        else:
            go = False
        return go, task_name, running, idled

    def getcampaign(task):
        taskname = task.pathName.split('/')[-1]
        if hasattr(task, 'prepID'):
            return task.prepID.split('-')[1]
        elif taskname.count('-') >= 1:
            return taskname.split('-')[1]
        else:
            return None

    def close(interface):
        open('%s/equalizor.json.new' % monitor_dir, 'w').write(json.dumps(interface, indent=2))
        os.system('mv %s/equalizor.json.new %s/equalizor.json' % (monitor_dir, monitor_dir))
        os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json' % (monitor_dir, monitor_dir, time.mktime(time.gmtime())))

    interface = {
        'reversed_mapping': reversed_mapping,
        'modifications': {}
    }
    if options.augment or options.remove:
        interface['modifications'] = json.loads(open('%s/equalizor.json' % monitor_dir).read())['modifications']

    if options.remove:
        if specific in interface['modifications']:
            print "popping", specific
            interface['modifications'].pop(specific)
        close(interface)
        return

    PU_locations = {}
    PU_overflow = {
        #'RunIISpring15PrePremix': {
        #    'sites': ["T1_US_FNAL", "T1_DE_KIT", "T1_IT_CNAF", "T1_RU_JINR", "T2_CH_CERN"],
        #    'max': 20000,
        #    'pending': 0},
        'RunIIFall15DR76': {
            'sites': ['T1_ES_PIC', 'T2_US_Purdue', 'T2_UK_SGrid_RALPP', 'T2_BE_IIHE', 'T2_DE_DESY', 'T2_IT_Legnaro', 'T2_US_Caltech', 'T1_DE_KIT',
                      'T2_UK_London_Brunel', 'T2_IT_Pisa', 'T1_US_FNAL',
                      'T2_IT_Rome', 'T2_US_Florida', 'T1_IT_CNAF', 'T1_RU_JINR', 'T2_UK_London_IC', 'T2_US_Nebraska', 'T2_FR_CCIN2P3', 'T2_US_UCSD', 'T2_ES_CIEMAT',
                      'T1_FR_CCIN2P3', 'T2_US_Wisconsin', 'T2_US_MIT', 'T2_DE_RWTH',
                      'T1_UK_RAL', 'T2_US_Vanderbilt', 'T2_CH_CERN'],
            'max': 20000,
            'pending': 0},
        'RunIISpring16DR80': {
            'sites': ['T1_ES_PIC', 'T2_US_Purdue', 'T2_UK_SGrid_RALPP', 'T2_BE_IIHE', 'T2_DE_DESY', 'T2_IT_Legnaro', 'T2_US_Caltech', 'T1_DE_KIT',
                      'T2_UK_London_Brunel', 'T2_IT_Pisa', 'T1_US_FNAL',
                      'T2_IT_Rome', 'T2_US_Florida', 'T1_IT_CNAF', 'T1_RU_JINR', 'T2_UK_London_IC', 'T2_US_Nebraska', 'T2_FR_CCIN2P3', 'T2_US_UCSD', 'T2_ES_CIEMAT',
                      'T1_FR_CCIN2P3', 'T2_US_Wisconsin', 'T2_US_MIT', 'T2_DE_RWTH',
                      'T1_UK_RAL', 'T2_US_Vanderbilt', 'T2_CH_CERN'],
            'max': 20000,
            'pending': 0,
            'force': True},
        'RunIISpring15DR74': {
            'sites': ['T1_ES_PIC', 'T1_DE_KIT', 'T1_US_FNAL', 'T1_IT_CNAF', 'T1_RU_JINR', 'T1_FR_CCIN2P3', 'T1_UK_RAL', 'T2_CH_CERN'],
            'max': 20000,
            'pending': 0}
    }

    set_to = SI.sites_AAA
    LHE_overflow = {
        'RunIIWinter15GS': set_to,
        'RunIISummer15GS': set_to,
        'Summer12': set_to,
        'Summer11Leg': set_to
        #'RunIIFall15MiniAODv2': set_to,
    }

    pending_HLT = 0
    max_HLT = 60000
    pending_T0 = 0
    max_T0 = 60000
    try:
        gmon = json.loads(os.popen('curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT').read())
        pending_HLT += gmon["Running"]
        pending_HLT += gmon["MatchingIdle"]
    except:
        pass

    t0_special = [
        'vlimant_BPH-RunIISummer15GS-00030_00212_v0__160129_135314_9755',
        'pdmvserv_TSG-RunIISummer15GS-00044_00240_v0__160210_121223_8582'
    ]
    no_routing = [
        #'vlimant_BPH-RunIISummer15GS-00030_00212_v0__160129_135314_9755',
        #'pdmvserv_TOP-RunIIWinter15GS-00074_00187_v0__160207_162312_1992',
    ]

    stay_within_site_whitelist = False
    specific_task = None
    if specific and ":" in specific:
        specific, specific_task = specific.split(':')

    if specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'away').all()

    random.shuffle(wfs)

    for wfo in wfs:
        if wfo.name in no_routing and not options.augment: continue
        if specific and not specific in wfo.name: continue
        if specific:
            wfi = workflowInfo(url, wfo.name)
        else:
            cached = filter(lambda d: d['RequestName'] == wfo.name, workflows)
            if not cached: continue
            wfi = workflowInfo(url, wfo.name, request=cached[0])

        ## only running should get re-routed
        if not wfi.request['RequestStatus'] in ['running-open', 'running-closed'] and not specific: continue

        tasks_and_campaigns = []
        for task in wfi.getWorkTasks():
            tasks_and_campaigns.append((task, getcampaign(task)))

        _, _, _, sec = wfi.getIO()

        ## check needs override
        needs_overide = False
        if not needs_overide and options.augment: needs_overide = True

        def overide_from_agent(wfi, needs_overide):
            bad_agents = []  #'http://cmssrv219.fnal.gov:5984'
            if not bad_agents: return needs_overide
            if needs_overide: return True
            agents = wfi.getAgents()
            wqss = ['Running', 'Acquired']
            if any([agent in agents.get(wqs, {}).keys() for wqs, agent in itertools.product(wqss, bad_agents)]):
                print "overriding the need for bad agent"
                needs_overide = True
            return needs_overide

        ## now parse this for action
        for i_task, (task, campaign) in enumerate(tasks_and_campaigns):
            if options.augment:
                print task.pathName
                print campaign

            ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step
            if campaign in LHE_overflow:
                if task.taskType in ['Processing']:
                    needs, task_name, running, idled = needs_action(wfi, task)
                    needs_overide = overide_from_agent(wfi, needs_overide)
                    extend_to = copy.deepcopy(LHE_overflow[campaign])
                    if stay_within_site_whitelist:
                        extend_to = list(set(extend_to) & set(wfi.request['SiteWhitelist']))  ## restrict to stupid-site-whitelist

                    if (extend_to and needs) or needs_overide:
                        print "\t", task_name, "of", wfo.name, "running", running, "and pending", idled, "taking action : ReplaceSiteWhitelist"
                        modifications[wfo.name][task.pathName] = {
                            "ReplaceSiteWhitelist": copy.deepcopy(LHE_overflow[campaign]),
                            "Running": running,
                            "Pending": idled,
                            "Priority": wfi.request['RequestPriority']}
                        #print json.dumps(modifications[wfo.name][task.pathName]['ReplaceSiteWhitelist'])
                        altered_tasks.add(task.pathName)
                    else:
                        print task_name, "of", wfo.name, "running", running, "and pending", idled

            ### overflow the 76 digi-reco to the sites holding the pileup
            if campaign in PU_overflow:
                force = PU_overflow[campaign]['force'] if 'force' in PU_overflow[campaign] else False
                secondary_locations = set(SI.sites_ready)
                for s in sec:
                    if not s in PU_locations:
                        presence = getDatasetPresence(url, s)
                        #one_secondary_locations = [site for (site, (there, frac)) in presence.items() if there]
                        one_secondary_locations = [site for (site, (there, frac)) in presence.items() if frac > 98.]
                        PU_locations[s] = one_secondary_locations
                    print "secondary is at", sorted(PU_locations[s])
                    secondary_locations = set(PU_locations[s]) & secondary_locations

                ## we should add all sites that hold the secondary input if any
                secondary_locations = list(set(PU_overflow[campaign]['sites']) & set(SI.sites_ready))

                if any([task.pathName.endswith(finish) for finish in ['_0', 'StepOneProc', 'Production']]):
                    needs, task_name, running, idled = needs_action(wfi, task)
                    ## removing the ones in the site whitelist already since they encode the primary input location
                    if stay_within_site_whitelist:
                        original_site_in_use = set(wfi.request['SiteWhitelist'])
                    else:
                        original_site_in_use = set(secondary_locations)

                    ## remove the sites that have already running jobs
                    gmon = wfi.getGlideMon()
                    if gmon and task_name in gmon and 'Sites' in gmon[task_name]:
                        site_in_use = set(gmon[task_name]['Sites'])
                        ## that determines where you want to run in addition
                        #augment_by = list((set(secondary_locations) - site_in_use))
                        augment_by = list((set(secondary_locations) - site_in_use) & original_site_in_use)  ## restrict to stupid-site-whitelist
                    else:
                        augment_by = list(original_site_in_use)

                    needs_overide = overide_from_agent(wfi, needs_overide)
                    if augment_by and (needs or needs_overide or force) and PU_overflow[campaign]['pending'] < PU_overflow[campaign]['max']:
                        PU_overflow[campaign]['pending'] += idled
                        print "raising overflow to", PU_overflow[campaign]['pending'], "for", PU_overflow[campaign]['max']
                        ## the step with an input ought to be the digi part : make this one go anywhere
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": augment_by,
                            "Running": running,
                            "Pending": idled,
                            "Priority": wfi.request['RequestPriority']}
                        altered_tasks.add(task.pathName)
                        print "\t", task_name, "of", wfo.name, "running", running, "and pending", idled, "taking action : AddWhitelist"
                        #print json.dumps(augment_by, indent=2)
                    else:
                        print task_name, "of", wfo.name, "running", running, "and pending", idled

            ### overflow the skims back to multi-core
            if campaign in ['Run2015D', 'Run2015C_25ns'] and task.taskType == 'Skim':
                original_swl = wfi.request['SiteWhitelist']
                needs, task_name, running, idled = needs_action(wfi, task)
                if (needs or needs_overide):
                    modifications[wfo.name][task.pathName] = {
                        'AddWhitelist': original_swl,
                        "Running": running,
                        "Pending": idled,
                        "Priority": wfi.request['RequestPriority']}
                    altered_tasks.add(task.pathName)
                    print "\t", task_name, "of", wfo.name, "running", running, "and pending", idled, "taking action : AddWhitelist"

            if options.augment:
                print sorted(wfi.request['SiteWhitelist']), i_task, use_HLT

            ### add the HLT as partner of CERN
            if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task == 0 and use_HLT:
                needs, task_name, running, idled = needs_action(wfi, task)
                if options.augment: needs = True
                needs = True
                ##needs = random.random() < 0.40 : removed the random, just add up to a limit
                if (needs or needs_overide) and pending_HLT < max_HLT:
                    pending_HLT += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["AddWhitelist"].append("T2_CH_CERN_HLT")
                        print "\t", wfo.name, "adding addHLT up to", pending_HLT, "for", max_HLT
                        print task.pathName
                    ## this Replace does not work at all for HLT
                    #elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                    #    modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append("T2_CH_CERN_HLT")
                    #    print "\t", wfo.name, "adding replace HLT up to", pending_HLT, "for", max_HLT
                    else:
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": ["T2_CH_CERN_HLT"],
                            "Priority": wfi.request['RequestPriority'],
                            "Running": running,
                            "Pending": idled}
                        print "\t", wfo.name, "adding HLT up to", pending_HLT, "for", max_HLT
                        print task.pathName

            if i_task == 0 and not sec and use_T0:
                needs, task_name, running, idled = needs_action(wfi, task)
                if options.augment: needs = True
                #needs = True
                #if not (wfo.name in t0_special) and not options.augment: needs = False
                if not wfi.request['RequestType'] in ['MonteCarlo', 'MonteCarloFromGEN'] and not options.augment:
                    needs = False
                ##needs = random.random() < 0.40 : removed the random, just add up to a limit
                if (needs or needs_overide):
                    pending_T0 += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["AddWhitelist"].append("T0_CH_CERN")
                        print "\t", wfo.name, "adding addT0 up to", pending_T0, "for", max_T0
                        print task.pathName
                    elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append("T0_CH_CERN")
                        print "\t", wfo.name, "adding replace T0 up to", pending_T0, "for", max_T0
                    else:
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": ["T0_CH_CERN"],
                            "Priority": wfi.request['RequestPriority'],
                            "Running": running,
                            "Pending": idled}
                        print "\t", wfo.name, "adding T0 up to", pending_T0, "for", max_T0
                        print task.pathName

    interface['modifications'].update(modifications)

    ## temporary core managing
    interface['cores'] = {'T2_CH_CERN_HLT': {'min': 4, 'max': 16}, 'default': {'min': 1, 'max': 4}}
    #interface['max_cores'] = {'T2_CH_CERN_HLT': 16, 'default': 4}
    #interface['min_cores'] = {'T2_CH_CERN_HLT': 4, 'default': 1}
    #interface['resize_subtasks'] = 'RunIISpring16DR80'
    interface['resizes'] = ['RunIISpring16DR80', 'NotACampaign']

    ## close and save
    close(interface)
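## For reference, a hand-written sketch of the equalizor.json document that
## close() writes above; the site, workflow and task names plus all numbers
## are invented, only the structure reflects the code.
_example_interface = {
    "reversed_mapping": {
        ## fallback site -> sites it can serve
        "T2_DE_DESY": ["T1_DE_KIT"]
    },
    "modifications": {
        "fake_workflow_name_v1": {
            "/fake_workflow_name_v1/StepOneProc": {
                "AddWhitelist": ["T2_CH_CERN_HLT"],
                "Running": 120,
                "Pending": 4500,
                "Priority": 85000
            }
        }
    },
    "cores": {"T2_CH_CERN_HLT": {"min": 4, "max": 16}, "default": {"min": 1, "max": 4}}
}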
from collections import defaultdict
import time
import json
import sys
import random
from assignSession import *

UC = unifiedConfiguration()

spec = None
if len(sys.argv) > 1:
    spec = sys.argv[1]

url = 'cmsweb.cern.ch'

wfs = getWorkflows(url, 'acquired', details=True)
wfs.extend(getWorkflows(url, 'running-open', details=True))
wfs.extend(getWorkflows(url, 'running-closed', details=True))

jobs_for = defaultdict(lambda: defaultdict(int))
wf_for = defaultdict(lambda: defaultdict(set))
agent_for = defaultdict(lambda: defaultdict(set))
s_block_locations = {}
block_locations = defaultdict(lambda: defaultdict(list))
wfs_no_location_in_GQ = defaultdict(list)
si = siteInfo()
#bad_blocks = defaultdict(set)
unprocessable = set()
not_runable_acdc = set()
agents_down = defaultdict(set)
## an ad-hoc list of things to lock; emptying this list results in unlocking later
addHocLocks = json.loads(eosRead('%s/addhoc_lock.json' % base_eos_dir))

time_point("Starting addhoc")
for item in addHocLocks:
    ds = item.split('#')[0]
    LI.lock(ds, reason='addhoc lock')
    newly_locking.add(ds)

time_point("Starting reversed statuses check")
for status in statuses:
    print time.asctime(time.gmtime()), "CEST, fetching", status
    time_point("checking %s" % status, sub_lap=True)
    wfls = getWorkflows(url, status=status, details=True)
    print len(wfls), "in", status
    for wl in wfls:
        wfi = workflowInfo(url, wl['RequestName'], request=wl, spec=False)
        (_, primaries, _, secondaries) = wfi.getIO()
        outputs = wfi.request['OutputDatasets']

        ## unknown to the system
        known = session.query(Workflow).filter(Workflow.name == wl['RequestName']).all()
        if not known:
            print wl['RequestName'], "is unknown to unified, relocking all I/O"
            for dataset in list(primaries) + list(secondaries) + outputs:
                print "\t", dataset
                also_locking_from_reqmgr.add(dataset)
            continue
url = reqmgr_url

up = componentInfo(mcm=False, soft=['mcm'])
if not up.check(): sys.exit(0)

status = sys.argv[1]
max_wf = 0

print "Picked status", status

wfs = []
if status == 'wmagent':
    register = ['assigned', 'acquired', 'running-open', 'running-closed', 'force-complete', 'completed', 'closed-out']
    for r in register:
        wfs.extend(getWorkflows(url, r))
elif status.endswith('*'):
    wfs.extend([wfo.name for wfo in session.query(Workflow).filter(Workflow.status.startswith(status[:-1])).all()])
else:
    wfs.extend([wfo.name for wfo in session.query(Workflow).filter(Workflow.status == status).all()])

if max_wf:
    wfs = wfs[:max_wf]

random.shuffle(wfs)

all_blocks_at_sites = defaultdict(set)

#done = json.loads(open('myblock_done.json').read())
done = {}
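## Plain-python sketch of the status wildcard handling above: a trailing '*'
## selects every unified status sharing the prefix (the sqlalchemy version uses
## Workflow.status.startswith). Status names below are illustrative.
def _match_status(pattern, known_statuses):
    if pattern.endswith('*'):
        return [s for s in known_statuses if s.startswith(pattern[:-1])]
    return [s for s in known_statuses if s == pattern]

## _match_status('assistance*', ['assistance-manual', 'assistance-recovery', 'away'])
## -> ['assistance-manual', 'assistance-recovery']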
def injector(url, options, specific):
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    workflows.extend(getWorkflows(url, status=options.wmstatus, user='******', rtype="ReReco"))  ## regardless of users, pick up all ReReco on the table

    print len(workflows), "in line"
    cannot_inject = set()
    status_cache = defaultdict(str)

    ## browse for assignment-approved requests, browse for ours, insert the diff
    for wf in workflows:
        if specific and not specific in wf: continue
        exists = session.query(Workflow).filter(Workflow.name == wf).first()
        if not exists:
            wfi = workflowInfo(url, wf)
            #wl = getWorkLoad(url, wf)
            ## check first that there isn't something related and valid in there already
            can_add = True
            ## first try at finding a match
            # print wfi.request
            familly = session.query(Workflow).filter(Workflow.name.contains(wfi.request['PrepID'])).all()
            if not familly:
                #req_familly = getWorkflowById(url, wl['PrepID'])
                #familly = [session.query(Workflow).filter(Workflow.name == member).first() for member in req_familly]
                pids = wfi.getPrepIDs()
                req_familly = []
                for pid in pids:
                    req_familly.extend(getWorkflowById(url, pid, details=True))
                familly = []
                print len(req_familly), "members"
                for req_member in req_familly:
                    #print "member", req_member['RequestName']
                    owfi = workflowInfo(url, req_member['RequestName'], request=req_member)
                    other_pids = owfi.getPrepIDs()
                    if set(pids) == set(other_pids):
                        ## this is a real match
                        familly.extend(session.query(Workflow).filter(Workflow.name == req_member['RequestName']).all())

            for lwfo in familly:
                if lwfo:
                    ## we have it already
                    if not lwfo.status in ['forget', 'trouble', 'forget-unlock', 'forget-out-unlock']:
                        sendLog('injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status))
                        print "Should not put", wf, "because of", lwfo.name, lwfo.status
                        cannot_inject.add(wf)
                        can_add = False

            ## add a check on validity of input datasets
            _, prim, par, sec = wfi.getIO()
            for d in list(prim) + list(par) + list(sec):
                if not d in status_cache:
                    status_cache[d] = getDatasetStatus(d)
                if status_cache[d] != 'VALID':
                    wfi.sendLog('injector', "One of the input is not VALID. %s : %s" % (d, status_cache[d]))
                    sendLog('injector', "One of the input of %s is not VALID. %s : %s" % (wf, d, status_cache[d]))
                    can_add = False

            if not can_add: continue

            wfi.sendLog('injector', "considering %s" % wf)

            new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()
            time.sleep(0.5)
        else:
            #print "already have", wf
            pass

    if cannot_inject:
        #sendEmail('workflow duplicates', 'These workflows cannot be added because of duplicates \n\n %s' % ('\n'.join(cannot_inject)))
        sendLog('injector', 'These workflows cannot be added because of duplicates \n\n %s' % ('\n'.join(cannot_inject)), level='warning')

    ## passing a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True):
        invalidator(url)

    no_replacement = set()

    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        print wf.name
        if specific and not specific in wf.name: continue
        print wf.name
        wfi = workflowInfo(url, wf.name)
        wl = wfi.request  #getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl['PrepID'])
        true_familly = []
        for member in familly:
            if member == wf.name: continue
            fwl = getWorkLoad(url, member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType'] == 'Resubmission': continue
                if fwl['RequestStatus'] in ['None', None, 'new']: continue
                if fwl['RequestStatus'] in ['rejected', 'rejected-archived', 'aborted', 'aborted-archived']: continue
            true_familly.append(fwl)

        if len(true_familly) == 0:
            #sendLog('injector', '%s had no replacement' % wf.name, level='critical')
            wfi.sendLog('injector', 'the workflow was found in trouble with no replacement')
            no_replacement.add(wf.name)
            continue
        else:
            wfi.sendLog('injector', 'the workflow was found in trouble and has a replacement')

        print wf.name, "has", len(familly), "familly members"
        print wf.name, "has", len(true_familly), "true familly members"

        ## we cannot have more than one of them !!! pick the last one
        if len(true_familly) > 1:
            #sendEmail('multiple wf', 'please take a look at injector for %s' % wf.name)
            sendLog('injector', 'Multiple wf in line, will take the last one for %s \n%s' % (wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical')

        for fwl in true_familly[-1:]:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(Workflow.name == member).first()
            if not new_wf:
                sendLog('injector', "putting %s as replacement of %s" % (member, wf.name))
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow(name=member, status=status, wm_status=fwl['RequestStatus'])
                wf.status = 'forget'
                session.add(new_wf)
            else:
                if new_wf.status == 'forget': continue
                sendLog('injector', "getting %s as replacement of %s" % (new_wf.name, wf.name))
                wf.status = 'forget'

            for tr in session.query(Transfer).all():
                if wf.id in tr.workflows_id:
                    sw = copy.deepcopy(tr.workflows_id)
                    sw.remove(wf.id)
                    sw.append(new_wf.id)
                    tr.workflows_id = sw
                    print tr.phedexid, "got", new_wf.name
                    if new_wf.status != 'away':
                        print "\t setting it considered"
                        new_wf.status = 'considered'
                    if tr.phedexid < 0:
                        ## set it back to positive
                        tr.phedexid = -tr.phedexid
                    session.commit()

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()

    if no_replacement:
        #sendEmail('workflow with no replacement', '%s \n are dangling there' % ('\n'.join(no_replacement)))
        sendLog('injector', 'workflows with no replacement, %s \nare dangling there' % ('\n'.join(no_replacement)), level='critical')
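## Toy version of the transfer re-pointing loop above: the troubled workflow id
## is swapped for the replacement id on each transfer record, and a negative
## phedexid is set back to positive, as in the code. Values are invented.
def _repoint_transfer(workflows_id, phedexid, old_id, new_id):
    ids = [new_id if i == old_id else i for i in workflows_id]
    return ids, abs(phedexid)

## _repoint_transfer([1, 7], -12345, old_id=7, new_id=42) -> ([1, 42], 12345)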
def injector(url, options, specific):
    ## passing a round of invalidation of what needs to be invalidated
    if options.invalidate:
        invalidator(url)

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    existing = [wf.name for wf in session.query(Workflow).all()]

    ## browse for assignment-approved requests, browse for ours, insert the diff
    for wf in workflows:
        if wf not in existing:
            print "putting", wf
            new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()

    existing = [wf.name for wf in session.query(Workflow).all()]

    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        if specific and wf.name != specific: continue
        print wf.name
        wl = getWorkLoad(url, wf.name)
        familly = getWorkflowById(url, wl['PrepID'])
        if len(familly) == 1:
            print wf.name, "ERROR has no replacement"
            continue
        print wf.name, "has", len(familly), "familly members"
        for member in familly:
            if member != wf.name:
                fwl = getWorkLoad(url, member)
                if options.replace:
                    if member != options.replace: continue
                else:
                    if fwl['RequestDate'] < wl['RequestDate']: continue
                    if fwl['RequestType'] == 'Resubmission': continue
                    if fwl['RequestStatus'] in ['None', None]: continue
                new_wf = session.query(Workflow).filter(Workflow.name == member).first()
                if not new_wf:
                    print "putting", member
                    status = 'away'
                    if fwl['RequestStatus'] in ['assignment-approved']:
                        status = 'considered'
                    new_wf = Workflow(name=member, status=status, wm_status=fwl['RequestStatus'])
                    wf.status = 'forget'
                    session.add(new_wf)
                    session.commit()
                else:
                    if new_wf.status == 'forget': continue
                    print "getting", new_wf.name, "as replacement of", wf.name
                for tr in session.query(Transfer).all():
                    if wf.id in tr.workflows_id:
                        sw = copy.deepcopy(tr.workflows_id)
                        sw.remove(wf.id)
                        sw.append(new_wf.id)
                        tr.workflows_id = sw
                        print tr.phedexid, "got", new_wf.name
                        if new_wf.status != 'away':
                            new_wf.status = 'staging'
                        session.commit()

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
def equalizor(url, specific=None, options=None):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    if not specific:
        workflows = getWorkflows(url, status='running-closed', details=True)
        workflows.extend(getWorkflows(url, status='running-open', details=True))

    ## start from scratch
    modifications = defaultdict(dict)
    ## define regionality site => fallback allowed. feed on an ssb metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()

    for site in SI.sites_ready:
        region = site.split('_')[1]
        if not region in ['US', 'DE', 'IT', 'FR', 'ES', 'UK'  ### latest additions
                          ]: continue
        regions[region] = [region]

    def site_in_depletion(s):
        return True  ## short-circuited : every site is currently considered in depletion
        if s in SI.sites_pressure:
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                print s, m, r, "lacking pressure"
                return True
            else:
                print s, m, r, "pressure"
        return False

    for site in SI.sites_ready:
        region = site.split('_')[1]
        ## fallback to the region, to sites with on-going low pressure
        mapping[site] = [fb for fb in SI.sites_ready if any([('_%s_' % (reg) in fb and fb != site and site_in_depletion(fb)) for reg in regions[region]])]

    for site in SI.sites_ready:
        if site.split('_')[1] == 'US':
            ## to all sites in the US
            ## add NERSC
            mapping[site].append('T3_US_NERSC')
            ## add OSG
            mapping[site].append('T3_US_OSG')

    #mapping['T2_IT_Rome'].append('T3_US_OSG')
    #mapping['T1_US_FNAL'].append('T3_US_NERSC')

    use_T0 = ('T0_CH_CERN' in UC.get("site_for_overflow"))
    if options.t0: use_T0 = True
    #if options.augment: use_T0 = True

    use_HLT = ('T2_CH_CERN_HLT' in UC.get("site_for_overflow"))
    if options.hlt: use_HLT = True
    #if options.augment: use_HLT = True

    if use_HLT:
        mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')

    if use_T0:
        mapping['T2_CH_CERN'].append('T0_CH_CERN')
        mapping['T1_IT_CNAF'].append('T0_CH_CERN')
        mapping['T1_FR_CCIN2P3'].append('T0_CH_CERN')
        mapping['T1_DE_KIT'].append('T0_CH_CERN')
        ## tentatively
        #mapping['T0_CH_CERN'].append('T2_CH_CERN')

    ## all of Europe can read from CERN
    for reg in ['IT', 'DE', 'UK', 'FR', 'BE', 'ES']:
        mapping['T2_CH_CERN'].extend([fb for fb in SI.sites_ready if '_%s_' % reg in fb])

    ## all European T1 among each others
    europ_t1 = [site for site in SI.sites_ready if site.startswith('T1') and any([reg in site for reg in ['IT', 'DE', 'UK', 'FR', 'ES']])]
    print europ_t1
    for one in europ_t1:
        for two in europ_t1:
            if one == two: continue
            mapping[one].append(two)

    ## fnal can read from cnaf ?
    #mapping['T1_IT_CNAF'].append('T1_US_FNAL')
    ## all US can read from CNAF
    mapping['T1_IT_CNAF'].extend([site for site in SI.sites_ready if '_US_' in site])
    mapping['T1_IT_CNAF'].append('T2_CH_CERN')
    mapping['T1_DE_KIT'].append('T2_CH_CERN')
    mapping['T2_CH_CERN'].append('T1_IT_CNAF')
    mapping['T2_CH_CERN'].append('T1_US_FNAL')
    #mapping['T2_UK_London_IC'].append('T2_CH_CERN')
    #mapping['T1_UK_RAL'].append('T2_BE_IIHE')
    mapping['T2_UK_London_IC'].append('T2_BE_IIHE')
    mapping['T2_UK_London_IC'].append('T2_FR_CCIN2P3')
    for site in SI.sites_ready:
        if '_US_' in site:
            mapping[site].append('T2_CH_CERN')

    ## make them appear as OK to use
    force_sites = []

    ## overflow CERN to underutilized T1s
    upcoming = json.loads(open('%s/GQ.json' % monitor_dir).read())
    for possible in SI.sites_T1s:
        if not possible in upcoming:
            mapping['T2_CH_CERN'].append(possible)

    ## remove ad-hoc sites from the overflow mapping
    prevent_sites = []  #'T2_US_Purdue'
    for prevent in prevent_sites:
        if prevent in mapping: mapping.pop(prevent)
    for src in mapping:
        for prevent in prevent_sites:
            if prevent in mapping[src]:
                mapping[src].remove(prevent)

    ## create the reverse mapping for the condor module
    for site, fallbacks in mapping.items():
        for fb in fallbacks:
            if not site in reversed_mapping[fb]:
                reversed_mapping[fb].append(site)

    ## this is the fallback mapping
    print "Direct mapping : site => overflow"
    print json.dumps(mapping, indent=2)
    print "Reverse mapping : dest <= from origin"
    print json.dumps(reversed_mapping, indent=2)

    altered_tasks = set()

    def running_idle(wfi, task_name):
        gmon = wfi.getGlideMon()
        #print gmon
        if not gmon: return (0, 0)
        if not task_name in gmon: return (0, 0)
        return (gmon[task_name]['Running'], gmon[task_name]['Idle'])

    def needs_action(wfi, task, min_idled=100):
        task_name = task.pathName.split('/')[-1]
        running, idled = running_idle(wfi, task_name)
        go = True
        if not idled and not running:
            go = False
        if idled < 100:
            go = False
        if (not running and idled) or (running and (idled / float(running) > needs_action.pressure)):
            go = True
        else:
            go = False
        return go, task_name, running, idled
    needs_action.pressure = UC.get('overflow_pressure')

    def getPerf(task, stats_to_go=200):
        task = task.split('/')[1] + '/' + task.split('/')[-1]
        try:
            u = 'http://cms-gwmsmon.cern.ch/prodview/json/history/memoryusage720/%s' % task
            print u
            perf_data = json.loads(os.popen('curl -s --retry 5 %s' % u).read())
        except Exception as e:
            print str(e)
            return (None, None)
        buckets = filter(lambda i: i['key'] != 0, perf_data['aggregations']["2"]['buckets'])
        buckets.sort(key=lambda i: i['key'])
        s = 0
        for bucket in buckets:
            s += bucket['doc_count']
            bucket['cum'] = s
        s_m = sum(bucket['key'] * bucket['doc_count'] for bucket in buckets)
        w_m = sum(bucket['doc_count'] for bucket in buckets)
        m_m = max(bucket['key'] for bucket in buckets) if buckets else None

        ## 90% percentile calculation
        percentile_m = int(0.90 * w_m)
        p_m = 0
        s = 0
        for bucket in buckets:
            p_m = bucket['key']
            if bucket['cum'] > percentile_m:
                break
        p_m *= 1.1
        print "percentile mem", p_m

        max_count_m = None
        max_count = 0
        for bucket in buckets:
            if bucket['doc_count'] > max_count:
                max_count_m = bucket['key']
                max_count = bucket['doc_count']
        if max_count_m:
            max_count_m *= 1.1
        print "max count mem", max_count_m

        b_m = None
        if w_m > stats_to_go:
            if p_m:
                b_m = int(p_m)
            else:
                b_m = int(m_m)  ## this is very bad if there are just a couple of outliers
                b_m = int((s_m / float(w_m)) * 1.2)
        else:
            print "not enough stats for memory", w_m

        try:
            perf_data = json.loads(os.popen('curl -s --retry 5 http://cms-gwmsmon.cern.ch/prodview/json/history/runtime720/%s' % task).read())
        except Exception as e:
            print str(e)
            return (b_m, None)
        buckets = filter(lambda i: i['key'] != 0, perf_data['aggregations']["2"]['buckets'])
        buckets.sort(key=lambda i: i['key'])
        s = 0
        for bucket in buckets:
            s += bucket['doc_count']
            bucket['cum'] = s
        s_t = sum(bucket['key'] * bucket['doc_count'] for bucket in buckets)
        w_t = sum(bucket['doc_count'] for bucket in buckets)
        m_t = max(bucket['key'] for bucket in buckets) if buckets else None

        percentile_t = int(0.90 * w_t)
        p_t = 0
        for bucket in buckets:
            p_t = bucket['key']
            if bucket['cum'] > percentile_t:
                break
        p_t *= 1.1
        print "percentile time", p_t

        max_count_t = None
        max_count = 0
        for bucket in buckets:
            if bucket['doc_count'] > max_count:
                max_count_t = bucket['key']
                max_count = bucket['doc_count']
        if max_count_t:
            max_count_t *= 1.1
        print "max count time", max_count_t

        b_t = None
        if w_t > stats_to_go:
            b_t = m_t
        else:
            print "not enough stats for time", w_t

        return (b_m, b_t)

    def getcampaign(task):
        try:
            taskname = task.pathName.split('/')[-1]
            if hasattr(task, 'prepID'):
                return task.prepID.split('-')[1]
            elif taskname.count('-') >= 1:
                return taskname.split('-')[1]
            else:
                return None
        except Exception as e:
            print "Inconsistent prepid very likely"
            print str(e)
            return None

    def close(interface):
        open('%s/equalizor.json.new' % monitor_pub_dir, 'w').write(json.dumps(interface, indent=2))
        os.system('mv %s/equalizor.json.new %s/equalizor.json' % (monitor_pub_dir, monitor_pub_dir))
        os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json' % (monitor_pub_dir, monitor_dir, time.mktime(time.gmtime())))

    interface = {
        'mapping': mapping,
        'reversed_mapping': reversed_mapping,
        'modifications': {},
        'time': {},
        'memory': {}
    }
    if options.augment or options.remove:
        interface['modifications'] = json.loads(open('%s/equalizor.json' % monitor_pub_dir).read())['modifications']
        interface['memory'] = json.loads(open('%s/equalizor.json' % monitor_pub_dir).read())['memory']
        interface['time'] = json.loads(open('%s/equalizor.json' % monitor_pub_dir).read())['time']

    if options.remove:
        if specific in interface['modifications']:
            print "popping", specific
            interface['modifications'].pop(specific)
        close(interface)
        return

    PU_locations = {}
    PU_overflow = {}
    PRIM_overflow = {}
    PREMIX_overflow = {}
    LHE_overflow = {}
    tune_performance = []

    pending_HLT = 0
    max_HLT = 60000
    pending_T0 = 0
    max_T0 = 60000
    try:
        gmon = json.loads(os.popen('curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT').read())
        pending_HLT += gmon["Running"]
        pending_HLT += gmon["MatchingIdle"]
    except:
        pass

    restricting_to_ready = [
        'pdmvserv_HIG-RunIISummer15wmLHEGS-00420_00157_v0__160909_001612_2018',
        'pdmvserv_HIG-RunIISummer15wmLHEGS-00418_00157_v0__160909_001621_321',
        'pdmvserv_HIG-RunIISummer15wmLHEGS-00419_00157_v0__160909_001621_2641'
    ]
    remove_from = {
        #'cerminar_Run2016B-v1-BTagCSV-23Sep2016_8020_160923_163224_2174': ['T2_CH_CERN_HLT']
    }
    add_to = {
        #'pdmvserv_EXO-RunIISpring16MiniAODv2-05060_00552_v0__161001_151813_7925': ['T3_US_OSG'],
        #'cerminar_Run2016C-v2-SingleElectron-23Sep2016_8020_160923_182146_3498': ['T3_US_NERSC'],
        #'cerminar_Run2016C-v2-Tau-23Sep2016_8020_160923_182336_5649': ['T3_US_NERSC'],
    }

    stay_within_site_whitelist = False
    specific_task = None
    if specific and ":" in specific:
        specific, specific_task = specific.split(':')

    if specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'away').all()

    performance = {}
    resizing = {}
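    ## Self-contained sketch of the 90th-percentile estimate that getPerf() above
    ## derives from the gwmsmon histogram buckets; the bucket contents are invented.
    def _percentile_from_buckets(buckets, q=0.90):
        buckets = sorted([b for b in buckets if b['key'] != 0], key=lambda b: b['key'])
        total = sum(b['doc_count'] for b in buckets)
        threshold = int(q * total)
        cum = 0
        value = 0
        for b in buckets:
            value = b['key']
            cum += b['doc_count']
            if cum > threshold:
                break
        return value * 1.1  ## same 10% safety margin as getPerf()

    ## _percentile_from_buckets([{'key': 1000, 'doc_count': 50},
    ##                           {'key': 2000, 'doc_count': 40},
    ##                           {'key': 4000, 'doc_count': 10}]) -> 4400.0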
    no_routing = [
        ]
    random.shuffle( wfs )
    for wfo in wfs:
        if not wfo.status in ['away']: continue
        if wfo.name in no_routing and not options.augment: continue
        if specific and not specific in wfo.name: continue
        if specific:
            wfi = workflowInfo(url, wfo.name)
        else:
            cached = filter(lambda d : d['RequestName']==wfo.name, workflows)
            if not cached : continue
            wfi = workflowInfo(url, wfo.name, request = cached[0])
        ## only running-* should get re-routed, unless done by hand
        if not wfi.request['RequestStatus'] in ['running-open','running-closed'] and not specific: continue
        is_chain = (wfi.request['RequestType'] in ['TaskChain','StepChain'])
        tasks_and_campaigns = []
        for task in wfi.getWorkTasks():
            tasks_and_campaigns.append( (task, getcampaign(task) ) )
        lhe,prim,_,sec,sites_allowed = wfi.getSiteWhiteList()#getIO()
        ncores = wfi.getMulticore()
        memory_allowed = SI.sitesByMemory( float(wfi.request['Memory']) , maxCore=ncores)
        if not lhe and not prim and not sec:
            ## no input at all: go for OSG!!!
            add_to[wfo.name] = ['T3_US_OSG']
        ## check needs override
        needs_overide = False
        if not needs_overide and options.augment: needs_overide=True
        def overide_from_agent( wfi, needs_overide):
            bad_agents = []#'http://cmssrv219.fnal.gov:5984']
            if not bad_agents: return needs_overide
            if needs_overide: return True
            agents = wfi.getAgents()
            wqss = ['Running','Acquired']
            if any([agent in agents.get(wqs,{}).keys() for wqs,agent in itertools.product( wqss, bad_agents)]):
                print "overriding the need for bad agent"
                needs_overide = True
            return needs_overide
        ## now parse this for action
        for i_task,(task,campaign) in enumerate(tasks_and_campaigns):
            if options.augment:
                print task.pathName
                print campaign
            resize = CI.get(campaign,'resize',{})
            if resize and not is_chain:
                resizing[task.pathName] = resize
            tune = CI.get(campaign,'tune',options.tune)
            if tune and not campaign in tune_performance:
                tune_performance.append( campaign )
            overflow = CI.get(campaign,'overflow',{})
            if overflow:
                if "PRIM" in overflow and not campaign in PRIM_overflow:
                    PRIM_overflow[campaign] = copy.deepcopy(overflow['PRIM'])
                    print "adding",campaign,"to PRIM overflow"
                if "PREMIX" in overflow and not campaign in PREMIX_overflow:
                    PREMIX_overflow[campaign] = copy.deepcopy(overflow['PREMIX'])
                    print "adding",campaign,"to PREMIX overflow"
                if "PU" in overflow and not campaign in PU_overflow:
                    PU_overflow[campaign] = copy.deepcopy(overflow['PU'])
                    print "adding",campaign,"to PU overflow rules"
                if "LHE" in overflow and not campaign in LHE_overflow:
                    site_list = overflow['LHE'].get('site_list',"")
                    if site_list:
                        if type(site_list)==list:
                            LHE_overflow[campaign] = site_list
                        else:
                            print site_list
                            if hasattr(SI,site_list):
                                LHE_overflow[campaign] = copy.deepcopy( getattr(SI,site_list) )
                            else:
                                LHE_overflow[campaign] = site_list.split(',')
                        print "adding",campaign,"to light input overflow rules",LHE_overflow[campaign]
            ### setup the resizing
            ### get the task performance, for further massaging.
            if campaign in tune_performance or options.tune:
                print "performance",task.taskType,task.pathName
                if task.taskType in ['Processing','Production']:
                    set_memory,set_time = getPerf( task.pathName )
                    #print "Performance %s GB %s min"%( set_memory,set_time)
                    wfi.sendLog('equalizor','Performance tuning to %s GB %s min for %s'%( set_memory,set_time,task.pathName.split('/')[-1] ))
                    ## get values from gwmsmon
                    # massage the values : 90th percentile scaled up by 10%
                    performance[task.pathName] = {}
                    if set_memory:
                        performance[task.pathName]['memory']=min(set_memory,15000) ## max to 15GB
                    if set_time:
                        performance[task.pathName]['time'] = min(set_time, 1440) ## max to 24H
            ## rule to remove from the site whitelist sites that do not look ready for unified (local banning)
            if wfo.name in restricting_to_ready:
                if task.taskType in ['Production']:
                    new_list = list(set(SI.sites_ready)&set(wfi.request['SiteWhitelist']))
                    modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : new_list }
            if campaign in PREMIX_overflow:
                ## figure out secondary location and neighbors
                ## figure out primary presence and neighbors
                ## do the intersection and add if in need.
                needs, task_name, running, idled = needs_action(wfi, task)
                #needs = True
                ## trick to be removed once all wf are passed through the agent patch
                assigned_log = filter(lambda change : change["Status"] in ["assigned","acquired"],wfi.request['RequestTransition'])
                if assigned_log:
                    then = assigned_log[0]['UpdateTime']
                    if then < 1479481842:
                        print "assigned too early"
                        needs = False
                    else:
                        print "assigned late enough"
                else:
                    needs = False
                if is_chain and task.pathName.endswith('_1'):
                    print i_task,"in chain prevents overflowing"
                    needs = False
                if task.taskType in ['Processing','Production'] and needs:
                    secondary_locations = set(SI.sites_ready + force_sites)
                    for s in sec:
                        if not s in PU_locations:
                            presence = getDatasetPresence( url, s)
                            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
                            PU_locations[s] = one_secondary_locations
                        print "secondary is at",sorted(PU_locations[s])
                        secondary_locations = set([SI.SE_to_CE(site) for site in PU_locations[s]]) & secondary_locations
                    aaa_sec_grid = set(secondary_locations)
                    for site in sorted(aaa_sec_grid):
                        aaa_sec_grid.update( mapping.get(site, []) )
                    if len(prim):
                        dataset = list(prim)[0]
                        all_blocks,blocks = wfi.getActiveBlocks()
                        count_all = sum([len(v) for k,v in all_blocks.items()])
                        presence = getDatasetPresence(url, dataset, only_blocks=blocks )
                        aaa_prim_grid = set([SI.SE_to_CE(site) for site in presence.keys()])
                        for site in sorted(aaa_prim_grid):
                            aaa_prim_grid.update( mapping.get(site, []) )
                        print sorted(aaa_prim_grid),"around primary location",sorted(presence.keys())
                        print sorted(aaa_sec_grid),"around secondary location",sorted(secondary_locations)
                        ## intersect
                        aaa_grid = aaa_sec_grid & aaa_prim_grid
                    else:
                        print "premix overflow from a taskchain"
                        ### hack hack hack
                        #modifications[wfo.name][task.pathName]= {"ReplaceSiteWhitelist" : ['T2_CH_CERN','T1_US_FNAL']}
                        aaa_grid = set(wfi.request['SiteWhitelist'])
                    banned_until_you_find_a_way_to_do_this = ['T3_US_OSG']
                    aaa_grid = filter(lambda s : not s in banned_until_you_find_a_way_to_do_this, aaa_grid)
                    if aaa_grid:
                        wfi.sendLog('equalizor','Extending site whitelist to %s'%sorted(aaa_grid))
                        modifications[wfo.name][task.pathName]= {"AddWhitelist" : sorted(aaa_grid)}
            ## rule to overflow jobs on the primary input
            if campaign in PRIM_overflow:
                if task.taskType in ['Processing','Production']:
                    if not wfi.request['TrustSitelists']:
                        ### xrootd is OFF
                        dataset = list(prim)[0]
                        all_blocks,blocks = wfi.getActiveBlocks()
                        count_all = sum([len(v) for k,v in all_blocks.items()])
                        presence = getDatasetPresence(url, dataset, only_blocks=blocks )
                        in_full = [SI.SE_to_CE(site) for site,(there,_) in presence.items() if there]
                        aaa_grid= set()
                        aaa_grid_in_full = set(in_full)
                        for site in sorted(aaa_grid_in_full):
                            aaa_grid_in_full.update( mapping.get(site, []) )
                        ## just add the neighbors to the existing whitelist. we could do more with block classAd
                        for site in wfi.request['SiteWhitelist']:
                            aaa_grid.update( mapping.get(site, []) )
                        aaa_grid = aaa_grid & set(sites_allowed + ['T3_US_NERSC']) ## and restrict to sites that would be allowed at all (mcore, mem)
                        aaa_grid_in_full = aaa_grid_in_full & set(sites_allowed + ['T3_US_NERSC']) ## and restrict to sites that would be allowed at all (mcore, mem)
                        gmon = wfi.getGlideMon()
                        needs, task_name, running, idled = needs_action(wfi, task)
                        print needs,running,idled
                        site_in_use = set(gmon[task_name]['Sites']) if gmon and task_name in gmon and 'Sites' in gmon[task_name] else set()
                        print dataset,"at",sorted(in_full),len(blocks),"/",count_all
                        print "running at",sorted(site_in_use)
                        print "set for",sorted(wfi.request['SiteWhitelist'])
                        print "around current whitelist" ,sorted(aaa_grid)
                        print "around where the data is now in full", sorted(aaa_grid_in_full)
                        if needs and not (site_in_use & set(in_full)) and aaa_grid_in_full:
                            print "we could be going for replace at that point"
                            wfi.sendLog('equalizor','Replacing site whitelist with %s dynamically'% sorted(aaa_grid_in_full))
                            modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : sorted( aaa_grid_in_full) }
                        else:
                            if aaa_grid:
                                print wfo.name
                                wfi.sendLog('equalizor','Adding to site white list %s dynamically'% sorted(aaa_grid) )
                                if wfo.name in modifications and task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                                    modifications[wfo.name][task.pathName]["AddWhitelist"].extend(sorted(aaa_grid))
                                else:
                                    modifications[wfo.name][task.pathName] = { "AddWhitelist" : sorted(aaa_grid) }
                    else:
                        ## the request is already in xrootd mode (either too generous, or just about right with neighbors of full data)
                        dataset = list(prim)[0]
                        all_blocks,blocks = wfi.getActiveBlocks()
                        count_all = sum([len(v) for k,v in all_blocks.items()])
                        fraction_left = float(len(blocks))/ count_all
                        #if fraction_left< 0.5: print '\n'.join( blocks )
                        presence = getDatasetPresence(url, dataset, only_blocks=blocks )
                        ## in full is really the only place we can go to safely, since we have no job-data matching
                        in_full = [SI.SE_to_CE(site) for site,(there,_) in presence.items() if there]
                        gmon = wfi.getGlideMon()
                        needs, task_name, running, idled = needs_action(wfi, task)
                        site_in_use = set(gmon[task_name]['Sites']) if gmon and task_name in gmon and 'Sites' in gmon[task_name] else set()
                        print needs,running,idled
                        aaa_grid = set(in_full)
                        for site in list(aaa_grid):
                            aaa_grid.update( mapping.get(site, []) )
                        new_ones = set(in_full) - set(wfi.request['SiteWhitelist']) ## symptomatic of data having been repositioned
                        common = set(in_full) & set(wfi.request['SiteWhitelist'])
                        extra_shit = set(wfi.request['SiteWhitelist']) - aaa_grid ## symptomatic of a too generous site-whitelist
                        aaa_grid = aaa_grid & set(sites_allowed+ ['T3_US_NERSC']) ## restrict to sites that would be allowed at all (mcore, mem)
                        new_grid = aaa_grid - set(wfi.request['SiteWhitelist'])
                        print dataset,"is in full ",len(blocks),"/",count_all," at",in_full
                        print '\n'.join( sorted(blocks) )
                        print "running at",site_in_use
                        print "in common of the site whitelist",sorted(common)
                        print "sites now also hosting the data",sorted(new_ones)
                        print "sites in whitelist with no data",sorted(extra_shit) ## with no data and not within aaa reach
                        if new_ones:
                            ## we will be adding sites
                            if needs and aaa_grid:
                                print wfo.name,"would replace for",sorted(aaa_grid)
                                print "but no thanks"
                                wfi.sendLog('equalizor','Changing the site whitelist to %s dynamically'%(sorted(aaa_grid)))
                                modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : sorted(aaa_grid) }
                        elif new_grid:
                            print wfo.name,"would complement up to",sorted(aaa_grid)
                            wfi.sendLog('equalizor','Adding to site white list %s dynamically'% sorted(new_grid) )
                            modifications[wfo.name][task.pathName] = { "AddWhitelist" : sorted(new_grid) }
                        elif len(extra_shit)>5:
                            if aaa_grid:
                                print wfo.name,"would be restricting down to",sorted(aaa_grid),"because of",sorted(extra_shit)
                                wfi.sendLog('equalizor','Restricting the white list to %s dynamically'% sorted(aaa_grid) )
                                modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : sorted(aaa_grid) }
                        else:
                            print wfo.name,"don't do anything"
            if wfo.name in remove_from and task.taskType in ['Processing','Production']:
                remove = remove_from[wfo.name]
                restrict_to = set(wfi.request['SiteWhitelist'])
                intersection= set(remove)&set(restrict_to)
                if intersection:
                    print intersection,"is indeed in the original whitelist"
                    restrict_to = restrict_to - set(remove)
                    modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : sorted(restrict_to) }
            if wfo.name in add_to:
                if task.taskType in ['Production','Processing']:
                    augment_to = add_to[wfo.name]
                    print "adding",sorted(augment_to),"to",wfo.name
                    if wfo.name in modifications and task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]['AddWhitelist'].extend( augment_to )
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : augment_to }
            ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step
            if campaign in LHE_overflow:
                #if not is_chain and task.taskType in ['Processing']:
                if task.taskType in ['Processing']:
                    needs, task_name, running, idled = needs_action(wfi, task)
                    needs_overide = overide_from_agent( wfi, needs_overide)
                    extend_to = list(set(copy.deepcopy( LHE_overflow[campaign] )))
                    if stay_within_site_whitelist:
                        extend_to = list(set(extend_to) & set(wfi.request['SiteWhitelist'])) ## restrict to stupid-site-whitelist
                    extend_to = list(set(extend_to) & set(SI.sites_ready + force_sites))
                    if is_chain:
                        print "further restricting to initially allowed sites"
                        ## restrict to initial allowed sites
                        extend_to = list(set(extend_to) & set(sites_allowed))
                    if not extend_to:
                        print "Nowhere to extend to"
                        continue
                    if extend_to and (needs or needs_overide):
                        modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : extend_to ,"Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : ReplaceSiteWhitelist \n %s'%( task_name, wfo.name, running, idled , json.dumps( sorted(modifications[wfo.name][task.pathName]['ReplaceSiteWhitelist']))))
                        altered_tasks.add( task.pathName )
                    else:
                        wfi.sendLog('equalizor','%s of %s is running %d and pending %d'%( task_name, wfo.name, running, idled))
            ### overflow the 76 digi-reco to the site holding the pileup
            if campaign in PU_overflow:
                force = PU_overflow[campaign]['force'] if 'force' in PU_overflow[campaign] else False
                secondary_locations = set(SI.sites_ready + force_sites)
                for s in sec:
                    if not s in PU_locations:
                        presence = getDatasetPresence( url, s)
                        #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
                        one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
                        PU_locations[s] = one_secondary_locations
                    print "secondary is at",sorted(PU_locations[s])
                    secondary_locations = set([SI.SE_to_CE(site) for site in PU_locations[s]]) & secondary_locations
                ## we should add all sites that hold the secondary input if any
                ### given that we have the secondary location available, it is not necessary to use the ad-hoc list
                ##secondary_locations = list(set(PU_overflow[campaign]['sites']) & set( SI.sites_ready ))
                ## intersect with the sites that are allowed from the request requirement
                secondary_locations = secondary_locations & set(memory_allowed)
                if any([task.pathName.endswith(finish) for finish in ['_0','StepOneProc','Production']]) :
                    needs, task_name, running, idled = needs_action(wfi, task)
                    ## removing the ones in the site whitelist already since they encode the primary input location
                    if stay_within_site_whitelist:
                        original_site_in_use = set(wfi.request['SiteWhitelist']) & set(secondary_locations)
                    else:
                        original_site_in_use = set(secondary_locations)
                    mode = 'AddWhitelist'
                    if not prim and i_task==0:
                        print "because there isn't any input, one should be able to just replace the sitewhitelist instead of adding, with the restriction of not reaching every possible site"
                        mode='ReplaceSiteWhitelist'
                    ## remove the sites that have already running jobs
                    gmon = wfi.getGlideMon()
                    if gmon and task_name in gmon and 'Sites' in gmon[task_name] and mode=='AddWhitelist':
                        site_in_use = set(gmon[task_name]['Sites'])
                        site_in_use = set([]) ## at this time I cannot find a reason to apply such limitation
                        print "removing",sorted(site_in_use)
                        ## that determines where you want to run in addition
                        augment_by = list((set(secondary_locations)- site_in_use) & original_site_in_use)
                    else:
                        print "no existing running site"
                        augment_by = list(original_site_in_use)
                    if not augment_by:
                        print "Nowhere to extend to"
                    needs_overide = overide_from_agent( wfi, needs_overide)
                    if augment_by and (needs or needs_overide or force) and PU_overflow[campaign]['pending'] < PU_overflow[campaign]['max']:
                        PU_overflow[campaign]['pending'] += idled
                        print "raising overflow to",PU_overflow[campaign]['pending'],"for",PU_overflow[campaign]['max']
                        ## the step with an input ought to be the digi part : make this one go anywhere
                        modifications[wfo.name][task.pathName] = { mode : augment_by , "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        altered_tasks.add( task.pathName )
                        wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : %s \n %s'%( task_name, wfo.name, running, idled, mode, json.dumps( sorted(augment_by), indent=2 )))
                    else:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled
            ### overflow the skims back to multi-core
            if campaign in ['Run2015D','Run2015C_25ns'] and task.taskType =='Skim':
                original_swl = wfi.request['SiteWhitelist']
                needs, task_name, running, idled = needs_action(wfi, task)
                if (needs or needs_overide):
                    modifications[wfo.name][task.pathName] = { 'AddWhitelist' : original_swl, "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                    altered_tasks.add( task.pathName )
                    wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s'%( task_name, wfo.name, running, idled, json.dumps( sorted(original_swl), indent=2 )))
            if options.augment:
                #print "uhm ....",sorted(wfi.request['SiteWhitelist']),i_task,use_HLT
                pass
            ### this is a hack when we need to kick gensim out of everything
            if campaign in [
                #'RunIIWinter15GS',
                #'RunIISummer15GS',
                #'RunIISummer15wmLHEGS',
                #'Summer12',
                ] and task.taskType in ['Production'] and is_chain:
                ## what are the sites you want to take out. What are the jobs in whitelist, make the diff and replace
                t1s = set([site for site in SI.all_sites if site.startswith('T1')])
                ust2s = set([site for site in SI.all_sites if site.startswith('T2_US')])
                #ust2s = set([site for site in SI.sites_mcore_ready if site.startswith('T2_US')])
                allmcores = set(SI.sites_mcore_ready)
                #set_for = set(wfi.request['SiteWhitelist']) - t1s
                #set_for = set(wfi.request['SiteWhitelist']) - t1s - ust2s
                #set_for = set(wfi.request['SiteWhitelist']) - allmcores
                set_for = set(wfi.request['SiteWhitelist']) & t1s
                print wfo.name,"going for",set_for
                print task.pathName
                if set_for:
                    modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : sorted(set_for) }
            ### add the HLT at partner of CERN
            if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task in [0,1] and use_HLT and not wfi.request['TrustSitelists']:
                needs, task_name, running, idled = needs_action(wfi, task)
                if options.augment: needs=True
                needs = True
                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide) and pending_HLT < max_HLT:
                    pending_HLT += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T2_CH_CERN_HLT" )
                        wfi.sendLog('equalizor','also adding the HLT in whitelist of %s to %d for %d'%( task.pathName, pending_HLT, max_HLT))
                    ## this Replace does not work at all for HLT
                    elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" )
                        #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT
                        print "already having a site replacement, not adding the HLT for now"
                        pass
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T2_CH_CERN_HLT"], "Priority" : wfi.request['RequestPriority'], "Running" : running, "Pending" : idled}
                        wfi.sendLog('equalizor','adding the HLT in whitelist of %s to %d for %d'%( task.pathName, pending_HLT, max_HLT))
            if i_task==0 and not sec and use_T0:
                needs, task_name, running, idled = needs_action(wfi, task)
                if options.augment: needs=True
                #needs = True
                good_type = wfi.request['RequestType'] in ['MonteCarlo','MonteCarloFromGEN']
                read_lhe = ((not 'LheInputFiles' in wfi.request) or bool(wfi.request['LheInputFiles']))
                good_type &= not read_lhe
                if not good_type and not options.augment: needs = False
                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide):
                    pending_T0 += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        if not "T0_CH_CERN" in modifications[wfo.name][task.pathName]["AddWhitelist"]:
                            modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T0_CH_CERN" )
                            wfi.sendLog('equalizor','adding the T0 for %s to %d for %d'%( task.pathName, pending_T0, max_T0))
                    elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        if not "T0_CH_CERN" in modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"]:
                            modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T0_CH_CERN" )
                            wfi.sendLog('equalizor','adding the T0 to replacement for %s to %d for %d'%( task.pathName, pending_T0, max_T0))
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T0_CH_CERN"], "Priority" : wfi.request['RequestPriority'], "Running" : running, "Pending" : idled}
                        wfi.sendLog('equalizor','adding the T0 for %s to %d for %d'%( task.pathName, pending_T0, max_T0))
    interface['modifications'].update( modifications )
    ### manage the number of cores and job resizing
    #interface['cores']={'T2_CH_CERN_HLT': {'min':4,'max':16}, 'default': {'min':1, 'max':4}}
    #interface['resizes'] = ['RunIISpring16DR80']
    interface['resizing'] = resizing
    ### manage the modification of the memory and target time
    max_N_mem = 2
    max_N_time = 4
    ## discretize the memory to at most max_N_mem values
    mems = set([o['memory'] for t,o in performance.items() if 'memory' in o])
    times = set([o['time'] for t,o in performance.items() if 'time' in o])
    if len(mems)>max_N_mem:
        mem_step = int((max(mems) - min(mems))/ float(max_N_mem))
        print "rebinning memory"
        for t in performance:
            if not 'memory' in performance[t]: continue
            (m,r) = divmod(performance[t]['memory'], mem_step)
            performance[t]['memory'] = (m+1)*mem_step
    if len(times)>max_N_time:
        print "rebinning time"
        time_step = int((max(times) - min(times))/float(max_N_time))
        for t in performance:
            if not 'time' in performance[t]: continue
            (m,r) = divmod(performance[t]['time'], time_step)
            performance[t]['time'] = (m+1)*time_step
    new_times = defaultdict(list)
    new_memories = defaultdict(list)
    for t,o in performance.items():
        if 'time' in o: new_times[str(o['time'])].append( t )
        if 'memory' in o: new_memories[str(o['memory'])].append( t )
    interface['time'].update( new_times )
    interface['memory'].update( new_memories )
    ## close and save
    close( interface )
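## illustrative sketch (not part of the original flow): getPerf above derives a
## memory/time target from a cms-gwmsmon histogram by walking the cumulative
## counts up to the 90th percentile and padding the value by 10%. A minimal
## standalone version of that calculation, assuming `buckets` is a list of
## {'key': value, 'doc_count': count} dicts as returned by the aggregation:
def percentile_with_margin( buckets, fraction=0.90, margin=1.1 ):
    ## drop the zero bucket and order by value, as getPerf does
    buckets = sorted([b for b in buckets if b['key']!=0], key=lambda b: b['key'])
    total = sum( b['doc_count'] for b in buckets )
    if not total: return None
    threshold = int(fraction * total)
    cum = 0
    value = 0
    for b in buckets:
        value = b['key']
        cum += b['doc_count']
        if cum > threshold: break
    ## pad the percentile value with a safety margin
    return value * margin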
def injector(url, options, specific):
    use_mcm = True
    up = componentInfo( mcm = use_mcm, soft=['mcm'] )
    if not up.check(): return
    use_mcm = up.status['mcm']
    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    workflows.extend( getWorkflows(url, status=options.wmstatus, user='******', rtype="ReReco")) ## regardless of users, pick up all ReReco on the table
    existing = [wf.name for wf in session.query(Workflow).all()]
    ## browse for assignment-approved requests, compare with ours, and insert the difference
    for wf in workflows:
        if wf not in existing:
            print "putting",wf
            new_wf = Workflow( name = wf , status = options.setstatus, wm_status = options.wmstatus)
            session.add( new_wf )
            session.commit()
            time.sleep(1)
    existing = [wf.name for wf in session.query(Workflow).all()]
    ## pass a round of invalidation of what needs to be invalidated
    if use_mcm and (options.invalidate or True): ## forced on for the time being
        invalidator(url)
    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        if specific and wf.name != specific: continue
        print wf.name
        wl = getWorkLoad(url, wf.name)
        family = getWorkflowById( url, wl['PrepID'] )
        true_family = []
        for member in family:
            if member == wf.name: continue
            fwl = getWorkLoad(url , member)
            if options.replace:
                if member != options.replace: continue
            else:
                if fwl['RequestDate'] < wl['RequestDate']: continue
                if fwl['RequestType']=='Resubmission': continue
                if fwl['RequestStatus'] in ['None',None]: continue
            true_family.append( fwl )
        if len(true_family)==0:
            print wf.name,"ERROR has no replacement"
            known = []
            try:
                known = json.loads(open('no_replacement.json').read())
            except:
                pass
            if not wf.name in known:
                sendEmail('workflow in %s with no replacement'%(wl['RequestStatus']),'%s is dangling there'%(wf.name))
                known.append( wf.name )
                open('no_replacement.json','w').write( json.dumps( known, indent=2 ))
            continue
        print wf.name,"has",len(family),"family members"
        print wf.name,"has",len(true_family),"true family members"
        for fwl in true_family:
            member = fwl['RequestName']
            new_wf = session.query(Workflow).filter(Workflow.name == member).first()
            if not new_wf:
                print "putting",member,"as replacement of",wf.name
                status = 'away'
                if fwl['RequestStatus'] in ['assignment-approved']:
                    status = 'considered'
                new_wf = Workflow( name = member, status = status, wm_status = fwl['RequestStatus'])
                wf.status = 'forget'
                session.add( new_wf )
            else:
                if new_wf.status == 'forget': continue
                print "getting",new_wf.name,"as replacement of",wf.name
                wf.status = 'forget'
            for tr in session.query(Transfer).all():
                if wf.id in tr.workflows_id:
                    sw = copy.deepcopy(tr.workflows_id)
                    sw.remove( wf.id)
                    sw.append(new_wf.id)
                    tr.workflows_id = sw
                    print tr.phedexid,"got",new_wf.name
                    if new_wf.status != 'away':
                        print "\t setting it considered"
                        new_wf.status = 'considered'
                    if tr.phedexid<0: ## set it back to positive
                        tr.phedexid = -tr.phedexid
                    session.commit()
        ## don't do that automatically
        #wf.status = 'forget'
    session.commit()
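## illustrative sketch (not part of the original flow): the replacement logic in
## injector keeps only the PrepID family members that are newer than the broken
## workflow, are not Resubmissions, and have a real status. A condensed,
## hypothetical version of that filter, assuming each workload is a dict with
## 'RequestName', 'RequestDate', 'RequestType' and 'RequestStatus':
def pick_true_family( members, original ):
    true_family = []
    for fwl in members:
        if fwl['RequestName'] == original['RequestName']: continue
        if fwl['RequestDate'] < original['RequestDate']: continue
        if fwl['RequestType'] == 'Resubmission': continue
        if fwl['RequestStatus'] in ['None', None]: continue
        true_family.append( fwl )
    return true_family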
def equalizor(url , specific = None):
    if not specific:
        workflows = getWorkflows(url, status='running-closed', details=True)
        workflows.extend(getWorkflows(url, status='running-open', details=True))
    ## start from scratch
    modifications = defaultdict(dict)
    ## define regionality site => fallback allowed. feed on an ssb metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)
    SI = siteInfo()
    for site in SI.sites_ready:
        region = site.split('_')[1]
        if not region in ['US','IT']: continue
        regions[region] = [region]
    def site_in_depletion(s):
        return True ## forced on for now: the pressure-based check below is short-circuited
        if s in SI.sites_pressure:
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                print s,m,r,"lacking pressure"
                return True
            else:
                print s,m,r,"pressure"
                pass
        return False
    for site in SI.sites_ready:
        region = site.split('_')[1]
        ## fallback to the region, to sites with on-going low pressure
        mapping[site] = [fb for fb in SI.sites_ready if any([('_%s_'%(reg) in fb and fb!=site and site_in_depletion(fb))for reg in regions[region]]) ]
    mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')
    for site,fallbacks in mapping.items():
        for fb in fallbacks:
            reversed_mapping[fb].append(site)
    ## this is the fallback mapping
    print json.dumps( mapping, indent=2)
    #print json.dumps( reversed_mapping, indent=2)
    altered_tasks = set()
    def running_idle( wfi , task_name):
        gmon = wfi.getGlideMon()
        if not gmon: return (0,0)
        if not task_name in gmon: return (0,0)
        return (gmon[task_name]['Running'], gmon[task_name]['Idle'])
    def needs_action( wfi, task):
        task_name = task.pathName.split('/')[-1]
        running, idled = running_idle( wfi, task_name)
        if not idled and not running :
            return False, task_name, running, idled
        if idled < 100:
            return False, task_name, running, idled
        if (not running and idled) or (idled / float(running) > 0.2):
            return True, task_name, running, idled
        else:
            return False, task_name, running, idled
    def getcampaign( task ):
        taskname = task.pathName.split('/')[-1]
        if hasattr( task, 'prepID'):
            return task.prepID.split('-')[1]
        elif taskname.count('-')>=1:
            return taskname.split('-')[1]
        else:
            return None
    for wfo in session.query(Workflow).filter(Workflow.status == 'away').all():
        if specific and not specific in wfo.name: continue
        if specific:
            wfi = workflowInfo(url, wfo.name)
        else:
            cached = filter(lambda d : d['RequestName']==wfo.name, workflows)
            if not cached : continue
            wfi = workflowInfo(url, wfo.name, request = cached[0])
        tasks_and_campaigns = []
        for task in wfi.getWorkTasks():
            tasks_and_campaigns.append( (task, getcampaign(task) ) )
        ## now parse this for action
        for i_task,(task,campaign) in enumerate(tasks_and_campaigns):
            #print task.pathName
            #print campaign
            if campaign in [ 'RunIIWinter15wmLHE', 'RunIISummer15GS'] and wfi.request['RequestType'] in ['TaskChain']:
                if task.taskType == 'Processing':
                    needs, task_name, running, idled = needs_action(wfi, task)
                    if needs:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled,"taking action : ReplaceSiteWhitelist"
                        set_to = wfi.request['SiteWhitelist']
                        modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : set_to ,"Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        altered_tasks.add( task.pathName )
                    else:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled
            if campaign == 'RunIIFall15DR76':
                ## we should add all sites that hold the secondary input if any
                secondary_locations = ['T1_ES_PIC','T2_US_Purdue','T2_UK_SGrid_RALPP','T2_BE_IIHE','T2_DE_DESY','T2_IT_Legnaro','T2_US_Caltech','T1_DE_KIT','T2_UK_London_Brunel','T2_IT_Pisa','T1_US_FNAL','T2_IT_Rome','T2_US_Florida','T1_IT_CNAF','T1_RU_JINR','T2_UK_London_IC','T2_US_Nebraska','T2_FR_CCIN2P3','T2_US_UCSD','T2_ES_CIEMAT','T1_FR_CCIN2P3','T2_US_Wisconsin','T2_US_MIT','T2_DE_RWTH','T1_UK_RAL','T2_US_Vanderbilt','T2_CH_CERN']
                ## should discover the above from secondary location (remember to cache this)
                #(lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList()
                ## removing the ones in the site whitelist already since they encode the primary input location
                augment_by = list(set(secondary_locations)- set(wfi.request['SiteWhitelist']))
                if task.pathName.endswith('_0'):
                    needs, task_name, running, idled = needs_action(wfi, task)
                    if needs:
                        ## the step with an input ought to be the digi part : make this one go anywhere
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : augment_by , "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        altered_tasks.add( task.pathName )
                        print task_name,"of",wfo.name,"running",running,"and pending",idled,"taking action : AddWhitelist"
                    else:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled
            if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task==0:
                if random.random()<0.005:
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T2_CH_CERN_HLT" )
                        print wfo.name,"adding HLT"
                    elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" )
                        print wfo.name,"adding HLT"
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T2_CH_CERN_HLT"], "Priority" : wfi.request['RequestPriority']}
                        print wfo.name,"adding HLT"
    interface = {
        'reversed_mapping' : reversed_mapping,
        'modifications' : {}
        }
    if options.augment:
        interface['modifications'] = json.loads( open('/afs/cern.ch/user/c/cmst2/www/unified/equalizor.json').read())['modifications']
    interface['modifications'].update( modifications )
    open('/afs/cern.ch/user/c/cmst2/www/unified/equalizor.json.new','w').write( json.dumps( interface, indent=2))
    os.system('mv /afs/cern.ch/user/c/cmst2/www/unified/equalizor.json.new /afs/cern.ch/user/c/cmst2/www/unified/equalizor.json')
    os.system('cp /afs/cern.ch/user/c/cmst2/www/unified/equalizor.json /afs/cern.ch/user/c/cmst2/www/unified/logs/equalizor/equalizor.%s.json'%(time.mktime(time.gmtime())))
    #open('/afs/cern.ch/user/c/cmst2/www/unified/logs/equalizor/equalizor.%s.json'%(time.gmtime()),'w').write( json.dumps( altered_tasks , indent=2))
    sendEmail("Altering the job whitelist","The following tasks had condor rule set for overflow \n%s"%("\n".join( altered_tasks )))
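## illustrative sketch (not part of the original flow): both equalizor versions
## publish the fallback mapping together with its inverse, which the condor
## module reads as "destination <= origins". A minimal, self-contained version
## of that inversion:
def reverse_mapping( mapping ):
    from collections import defaultdict
    reversed_mapping = defaultdict(list)
    for site, fallbacks in mapping.items():
        for fb in fallbacks:
            if not site in reversed_mapping[fb]:
                reversed_mapping[fb].append(site)
    return dict(reversed_mapping)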
def transferor(url, specific=None, talk=True, options=None):
    if userLock('transferor'): return
    if options and options.test:
        execute = False
    else:
        execute = True
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)
    dss = DSS()
    print "counting all being handled..."
    being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all())
    being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance')).all())
    max_to_handle = options.maxworkflows
    allowed_to_handle = max(0, max_to_handle - being_handled)
    wf_buffer = 5
    if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer
        print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer"
    else:
        print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer"
    print "... done"
    all_transfers = defaultdict(list)
    workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset
    wfs_and_wfh = []
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter( Workflow.status == 'considered').all():
        if specific and not specific in wfo.name: continue
        cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0])))
        else:
            wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False)))
    print "... done"
    input_sizes = {}
    ## list the size of those in transfer already
    in_transfer_priority = 0
    min_transfer_priority = 100000000
    print "getting all wf in staging ..."
    for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all():
        wfh = workflowInfo(url, wfo.name, spec=False)
        (_, primary, _, _) = wfh.getIO()
        for prim in primary:
            input_sizes[prim] = dss.get(prim)
        in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority']))
        min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority']))
    print "... done"
    print "Max priority in transfer already", in_transfer_priority
    print "Min priority in transfer already", min_transfer_priority
    in_transfer_already = sum(input_sizes.values())
    ## sort by priority, higher first
    wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request['RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True)
    ## list the size of all inputs
    print "getting all input sizes ..."
    for (wfo, wfh) in wfs_and_wfh:
        (_, primary, _, _) = wfh.getIO()
        for prim in primary:
            input_sizes[prim] = dss.get(prim)
    print "... done"
    grand_total = sum(input_sizes.values())
    to_transfer = grand_total - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transferred" % in_transfer_already
    print "%15.4f GB is the current requested transfer load" % to_transfer
    print "%15.4f GB is the global transfer limit" % grand_transfer_limit
    print "%15.4f GB is the available limit" % transfer_limit
    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer = 0 ## so that we can count'em
    passing_along = 0
    transfer_sizes = {}
    went_over_budget = False
    for (wfo, wfh) in wfs_and_wfh:
        print wfh.request['RequestPriority']
        print wfo.name, "to be transferred"
        #wfh = workflowInfo( url, wfo.name)
        (_, primary, _, _) = wfh.getIO()
        this_load = sum([input_sizes[prim] for prim in primary])
        if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)):
            if went_over_budget:
                print "Transfer has gone over budget."
            else:
                print "Transfer will go over budget."
            print "%15.4f GB this load" % this_load
            print "%15.4f GB already this round" % sum(transfer_sizes.values())
            print "%15.4f GB is the available limit" % transfer_limit
            went_over_budget = True
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                print "Higher priority sample", wfh.request['RequestPriority'], ">=", in_transfer_priority, "go-on over budget"
            else:
                if not options.go:
                    print min_transfer_priority, "minimum priority", wfh.request['RequestPriority'], "<", in_transfer_priority, "stop"
                    continue
        ## throttle by campaign go
        if not CI.go(wfh.request['Campaign']):
            print "No go for", wfh.request['Campaign']
            if not options.go:
                continue
        ## check if the batch is announced
        announced = False
        is_real = False
        for b in mcm.getA('batches', query='contains=%s' % wfo.name):
            is_real = True
            if b['status'] == 'announced':
                announced = True
                break
        if not announced:
            print wfo.name, "does not look announced." # skipping?, rejecting?, reporting?"
        if not is_real:
            print wfo.name, "does not appear to be genuine."
            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue
        ## check on a grace period
        injection_time = time.mktime( time.strptime('.'.join(map(str, wfh.request['RequestDate'])), "%Y.%m.%d.%H.%M.%S")) / (60. * 60.)
        now = time.mktime(time.gmtime()) / (60. * 60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced:
                print "It is too soon to start transfer: %3.2fH remaining" % (now - injection_time)
                continue
        passing_along += 1
        if passing_along >= allowed_to_handle:
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                print "Higher priority sample", wfh.request['RequestPriority'], ">=", in_transfer_priority, "go-on over", max_to_handle
            else:
                print "Not allowed to pass more than", max_to_handle, "at a time. Currently", being_handled, "handled, and adding", passing_along
                break
        (lheinput, primary, parent, secondary) = wfh.getIO()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')
        else:
            sites_allowed = getSiteWhiteList( (lheinput, primary, parent, secondary))
        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist']
        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']
        can_go = True
        staging = False
        if primary:
            if talk:
                print wfo.name, 'reads', ', '.join(primary), 'in primary'
            ## chop the primary dataset
            for prim in primary:
                max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority']))
                sites_really_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                print "Sites allowed minus the vetoed transfer"
                print sites_really_allowed
                copies_needed = int( 0.35 * len(sites_really_allowed) ) + 1 ## should just go for a fixed number if the white list grows that big
                print "Would make", copies_needed, "copies"
                if options.maxcopy > 0:
                    copies_needed = min(options.maxcopy, copies_needed)
                ## remove the sites that do not want transfers
                print "need", copies_needed
                workflow_dependencies[prim].add(wfo.id)
                presence = getDatasetPresence(url, prim)
                prim_location = [site for site, pres in presence.items() if pres[0] == True]
                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at", len(prim_location), "sites"
                    continue
                # reduce the number of copies required by existing full copies
                copies_needed = max(0, copies_needed - len(prim_location))
                print "now need", copies_needed
                subscriptions = listSubscriptions(url, prim)
                prim_destination = list(set([site for (site, (tid, decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS', 'Export', 'Buffer']])]))
                ## need to reject from that list the ones with a full copy already: i.e. the transfer corresponds to the copy in place
                prim_destination = [site for site in prim_destination if not site in prim_location]
                ## add transfer dependencies
                latching_on_transfers = list(set([tid for (site, (tid, decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS', 'Export', 'Buffer']])]))
                print latching_on_transfers
                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter( Transfer.phedexid == latching).first()
                    if not tfo:
                        tfo = Transfer(phedexid=latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                    if not wfo.id in tfo.workflows_id:
                        print "adding", wfo.id, "to", tfo.id, "with phedexid", latching
                        l = copy.deepcopy(tfo.workflows_id)
                        l.append(wfo.id)
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer see it in query
                can_go = False
                transfer_sizes[prim] = input_sizes[prim]
                staging = True
                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                copies_needed = max(0, copies_needed - len(prim_destination))
                print "then need", copies_needed
                if copies_needed == 0:
                    print "The output is either fully in place or getting in full somewhere with", latching_on_transfers
                    can_go = True
                    continue
                prim_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in prim_location])]
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in prim_destination])]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if len( prim_to_distribute ) > 0: ## maybe that's a parameter we can play with to limit the
                    if not options or options.chop:
                        spreading = distributeToSites(getDatasetChops(prim), prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges)
                    else:
                        spreading = {}
                        for site in prim_to_distribute:
                            spreading[site] = [prim]
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    for (site, items) in spreading.items():
                        all_transfers[site].extend(items)
        if secondary:
            if talk:
                print wfo.name, 'reads', ', '.join(secondary), 'in secondary'
            for sec in secondary:
                workflow_dependencies[sec].add(wfo.id)
                presence = getDatasetPresence(url, sec)
                sec_location = [site for site, pres in presence.items() if pres[1] > 90.] ## more than 90% of the minbias at sites
                subscriptions = listSubscriptions(url, sec)
                sec_destination = [site for site in subscriptions]
                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if len(sec_to_distribute) > 0:
                    for site in sec_to_distribute:
                        all_transfers[site].append(sec)
                    can_go = False
        ## is it possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                print wfo.name, "latches on existing transfers, and nothing else"
                wfo.status = 'staging'
            else:
                print wfo.name, "should just be assigned NOW to", sites_allowed
                wfo.status = 'staged'
            print "setting status to", wfo.status
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                print wfo.name, "latches on existing transfers"
                if not options.test:
                    wfo.status = 'staging'
                    print "setting status to", wfo.status
                    session.commit()
            print wfo.name, "needs a transfer"
            needs_transfer += 1
    #print json.dumps(all_transfers)
    fake_id = -1
    wf_id_in_prestaging = set()
    for (site, items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))
        ## convert to storage element
        site_se = SI.CE_to_SE(site)
        ## sites that do not want input datasets
        if site in SI.sites_veto_transfer:
            print site, "does not want transfers"
            continue
        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.
        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]
        if execute:
            print "Making a replica to", site, "(CE)", site_se, "(SE) for"
        else:
            print "Would make a replica to", site, "(CE)", site_se, "(SE) for"
        print "\t", len(blocks), "blocks"
        ## remove blocks if the full dataset is sent out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        print "\t", len(blocks), "needed blocks for", list(set([block.split('#')[0] for block in blocks]))
        print "\t", len(datasets), "datasets"
        print "\t", datasets
        items_to_transfer = blocks + datasets
        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y', 'yes', 'go']:
                continue
        if execute:
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal')
            ## make use of max_priority dataset:priority to set the subscription priority
            """
            ## does not function
            once = True
            for item in items_to_transfer:
                bds = item.split('#')[0]
                if max_priority[bds] >= 90000:
                    if once:
                        w=10
                        print "waiting",w,"s before raising priority"
                        time.sleep(w)
                        once=False
                    ## raise it to high priority
                    print item,"subscription priority raised to high at",site_se
                    #print "This does not work yet properly it seems"
                    print updateSubscription(url, site_se, item, priority='high')
            """
        else:
            #result= {'phedex':{'request_created' : [{'id' : fake_id}]}}
            result = {'phedex': {'request_created': []}}
            fake_id -= 1
        if not result:
            print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter( Transfer.phedexid == phedexid).first()
            print phedexid, "transfer created"
            if not new_transfer:
                new_transfer = Transfer(phedexid=phedexid)
                session.add(new_transfer)
            new_transfer.workflows_id = set()
            for transfering in list(set(map(lambda it: it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update( workflow_dependencies[transfering])
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])
    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status != 'staging':
            if execute:
                tr_wf.status = 'staging'
            if talk:
                print "setting", tr_wf.name, "to staging"
        session.commit()
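## illustrative sketch (not part of the original flow): transferor sizes the
## number of replicas as 35% of the usable (non-vetoed) site whitelist plus
## one, caps it with options.maxcopy, then subtracts the full copies already
## in place. A condensed, hypothetical version of that arithmetic:
def copies_to_make( n_allowed_sites, n_full_copies, maxcopy=0 ):
    copies_needed = int( 0.35 * n_allowed_sites ) + 1
    if maxcopy > 0:
        copies_needed = min( maxcopy, copies_needed )
    ## reduce by the full copies already in place, never below zero
    return max(0, copies_needed - n_full_copies)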
def assignor(url, specific=None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return
    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go: return
    n_assigned = 0
    n_stalled = 0
    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])
    if options.early:
        print "Option Early is on"
        fetch_from.extend(['staged'])
    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from
    for status in fetch_from:
        print "getting wf in", status
        wfos.extend( session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)
    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass
    dataset_endpoints = json.loads( open('%s/dataset_endpoints.json' % monitor_dir).read())
    aaa_mapping = json.loads( open('%s/equalizor.json' % monitor_pub_dir).read())['mapping']
    all_stuck = set()
    all_stuck.update( json.loads(open('%s/stuck_transfers.json' % monitor_pub_dir).read()))
    all_stuck.update(getAllStuckDataset())
    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')
    ## order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]
        def rank(wfn):
            return cache.index(wfn) if wfn in cache else 0
        wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True)
        print "10 first", [wfo.name for wfo in wfos[:10]]
        print "10 last", [wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle(wfos)
    for wfo in wfos:
        if options.limit and (n_stalled + n_assigned) > options.limit:
            break
        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break
        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)
        if wfh.request['RequestStatus'] in ['rejected', 'aborted', 'aborted-completed', 'aborted-archived', 'rejected-archived'] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue
        if options.priority and int(wfh.request['RequestPriority']) < options.priority:
            continue
        options_text = ""
        if options.early:
            options_text += ", early option is ON"
        if options.partial:
            options_text += ", partial option is ON"
            options_text += ", good fraction is %.2f" % options.good_enough
        wfh.sendLog('assignor', "%s to be assigned%s" % (wfo.name, options_text))
        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))
        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor', 'Workflow %s has no output at all' % (wfo.name), level='critical')
            continue
        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor', "%s are stuck input" % (','.join(is_stuck)))
        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True
        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update( CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]:
                banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)))
                    sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)), level='critical')
        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)):
                wfh.sendLog('assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - set(allowed_secondary.keys()))))
                sendLog('assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - set(allowed_secondary.keys()))), level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])
        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue
        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']
        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue
        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', [])
        blocks = wfh.getBlockWhiteList()
        rwl = wfh.getRunWhiteList()
        if rwl:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set(blocks + getDatasetBlocks(dataset, runs=rwl)))
        lwl = wfh.getLumiWhiteList()
        if lwl:
            ## augment with lumi white list
            for dataset in primary:
                blocks = list(set(blocks + getDatasetBlocks(dataset, lumis=lwl)))
        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None
        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False #options.good_enough if options.partial else 0
        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])
        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog('assignor', "Overriding partial copy assignment to %.2f fraction" % do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))
        do_partial = options.good_enough if options.partial else do_partial
        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass
            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [site for (site, (there, frac)) in presence.items() if frac > 98.]
            if secondary_aaa:
                if not one_secondary_locations:
                    sec_availability = getDatasetBlocksFraction(url, sec)
                    if sec_availability >= 1. and options.go:
                        ## there is at least one copy of each block on disk. We should go ahead and let it go.
                        wfh.sendLog('assignor', "The secondary %s is available %s times on disk, and usable" % (sec, sec_availability))
                    else:
                        ## not even a copy on disk anywhere !!!!
                        sites_allowed = [] ## will block the assignment
                        wfh.sendLog('assignor', "The secondary %s is nowhere on disk" % sec)
                #just continue without checking
                continue
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to sites with the secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]
            wfh.sendLog('assignor', "From/after secondary requirement, now Allowed %s" % sorted(sites_allowed))
        initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it
        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc' ## by default
        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks)
            if primary_aaa:
                available_fractions[prim] = getDatasetBlocksFraction(url, prim, only_blocks=blocks)
            sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, (there, frac)) in presence.items() if there]]
            if primary_aaa:
                sites_all_data = list(set([SI.SE_to_CE(psite) for (psite, (there, frac)) in presence.items() if there]))
            sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.]]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            if primary_aaa:
                sites_with_any_data = list(set([SI.SE_to_CE(psite) for psite in presence.keys()]))
            wfh.sendLog('assignor', "Holding the data but not allowed %s" % sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))))
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys()))
        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))
        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            elif primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            else:
                opportunistic_sites = []
            wfh.sendLog('assignor', "We could be running in addition at %s" % sorted(opportunistic_sites))
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                wfh.sendLog('assignor', "One of the usable sites is in downtime %s" % ([osite for osite in opportunistic_sites if osite in SI.sites_not_ready]))
                down_time = True
                ## should this be sent back to considered ?
        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog('assignor', '%s requires a large number of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical')
            wfh.sendLog('assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(1, copies_wanted - less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable locally if they do not have the data
            if not sites_all_data:
                wfh.sendLog('assignor', "Overriding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog('assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault('RequestType', None))
        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog('assignor', "Cannot assign StoreResults request because MergedLFN is missing")
                sendLog('assignor', 'Cannot assign StoreResults request because MergedLFN is missing', level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhitelist set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog('assignor', "Cannot assign StoreResults request because SiteWhitelist is missing")
                    sendLog('assignor', 'Cannot assign StoreResults request because SiteWhitelist is missing', level='critical')
                    n_stalled += 1
                    continue
                available_fractions = {}
            wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints", sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowing the endpoints"
                n_stalled += 1
                continue

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assigned with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the sites allowed is under low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog('assignor', "The workflow can run at %s under low pressure currently" % (','.join(allowed_and_low)))
            copies_wanted = max(1., copies_wanted - 1.)

        if available_fractions and not all([available >= copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available >= 1. for available in available_fractions.values()])
            above_good = all([available >= do_partial for available in available_fractions.values()])
            wfh.sendLog('assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog('assignor', "sending back to considered because of site downtime, instead of waiting")
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog('assignor', '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.' % (wfo.name), level='delay')
                n_stalled += 1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good):
                    wfh.sendLog('assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append(wfo.name)
                    open('cannot_assign.json', 'w').write(json.dumps(known, indent=2))
                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled += 1
                    continue

        if not len(sites_allowed):
            if not options.early:
                wfh.sendLog('assignor', "cannot be assigned with no matched sites")
                sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical')
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog('assignor', "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog('assignor', "Reading secondary through xrootd at %s" % sorted(sites_allowed))
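## --- illustration (not part of Unified) -------------------------------------
## Sketch of the availability gate applied above: the workflow proceeds only if
## every primary input reaches the wanted fraction (copies_wanted may have been
## lowered by one when a whitelisted site is under low pressure). Toy numbers.
def sufficiently_available(available_fractions, copies_wanted):
    ## every input dataset must be available at least copies_wanted times
    return all([frac >= copies_wanted for frac in available_fractions.values()])

print sufficiently_available({'/toy/primary/AOD': 0.8}, 1.)  # False -> stall
print sufficiently_available({'/toy/primary/AOD': 1.2}, 1.)  # True  -> assign
## ----------------------------------------------------------------------------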
        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'):
            team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE articles
            wfh.sendLog('assignor', 'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ## parse options entered in command line if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog('assignor', 'Holding on to the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog('assignor', 'Applying the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
            eventsPerJob = int(numEvents / (reqJobs * 1.4))
            lumisPerJob = int(eventsPerJob / eventsPerLumi)
            if lumisPerJob == 0:
                #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical')
                wfh.sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob))
                parameters['EventsPerJob'] = eventsPerJob
            else:
                spl = wfh.getSplittings()[0]
                eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                if eventsPerJobEstimated is None:
                    eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None
                if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                    #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                    sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical')
                    wfh.sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob))
                    parameters['LumisPerJob'] = lumisPerJob
                else:
                    #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                    sendLog('assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical')
                    wfh.sendLog('assignor', "leaving splitting untouched for PU_RD*, please check.")

        if isHEPCloudReady(url) and wfh.isGoodForNERSC():
            parameters['Team'] = 'hepcloud'
            parameters['SiteWhitelist'] = ['T3_US_NERSC']
            if primary:
                parameters['TrustSitelists'] = True
            if secondary:
                parameters['TrustPUSitelists'] = True
            sendEmail("sending work to hepcloud", "please check on %s" % wfh.request['RequestName'], destination=['*****@*****.**'])

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites', [])))
        result = reqMgrClient.assignWorkflow(url, wfo.name, None, parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog('assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')
                except Exception as e:
                    print "fail in locking output"
                    print str(e)
                    sendEmail("failed locking of output", str(e))
            else:
                wfh.sendLog('assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass

    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor', "%s workflows cannot be assigned. Please take a look" % (n_stalled), level='critical')
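## --- illustration (not part of Unified) -------------------------------------
## The precedence rule implemented by pick_options/pick_campaign above, as a
## standalone sketch with toy dictionaries: with force_options the command-line
## options win over the campaign configuration, otherwise campaign wins.
def merge_assignment_parameters(cli_options, campaign_parameters, force_options):
    parameters = {}
    if force_options:
        parameters.update(campaign_parameters)
        parameters.update(cli_options)  # applied last: command line wins
    else:
        parameters.update(cli_options)
        parameters.update(campaign_parameters)  # applied last: campaign wins
    return parameters

print merge_assignment_parameters({'Team': 'cli'}, {'Team': 'campaign'}, True)
# {'Team': 'cli'}
## ----------------------------------------------------------------------------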
#!/usr/bin/env python
from utils import workflowInfo, getWorkflows, sendEmail, componentInfo, monitor_dir, reqmgr_url, newLockInfo
from assignSession import *
import reqMgrClient
import os
import sys
import json

url = reqmgr_url

#nl = newLockInfo()
#nl.lock('/Neutrino_E-10_gun/RunIISpring15PrePremix-AVE_25_BX_25ns_76X_mcRun2_asymptotic_v12-v3/GEN-SIM-DIGI-RAW')
#nl.lock('/DYJetsToLL_M-50_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8/RunIISummer15GS-MCRUN2_71_V1_ext1-v2/GEN-SIM')

## all dqmharvest completed to announced right away
wfs = getWorkflows(url, 'completed', user=None, rtype='DQMHarvest')
for wf in wfs:
    print "closing out", wf
    reqMgrClient.closeOutWorkflow(url, wf)

wfs = getWorkflows(url, 'closed-out', user=None, rtype='DQMHarvest')
for wf in wfs:
    print "announcing", wf
    reqMgrClient.announceWorkflow(url, wf)

#os.system('Unified/equalizor.py -a pdmvserv_task_HIG-RunIIFall15DR76-01039__v1_T_160120_002705_9423')
#os.system('Unified/equalizor.py -a pdmvserv_SMP-Summer12DR53X-00027_00440_v0__160224_044437_5031')

up = componentInfo(mcm=False, soft=['mcm'])
if not up.check():
    sys.exit(1)
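## --- illustration (not part of Unified) -------------------------------------
## The script above advances DQMHarvest workflows in two explicit hops
## (completed -> closed-out -> announced). A hypothetical helper capturing the
## same pattern, with the two reqMgrClient calls passed in as callables:
def advance_dqmharvest(url, getWorkflows, close_out, announce):
    for wf in getWorkflows(url, 'completed', user=None, rtype='DQMHarvest'):
        print "closing out", wf
        close_out(url, wf)
    for wf in getWorkflows(url, 'closed-out', user=None, rtype='DQMHarvest'):
        print "announcing", wf
        announce(url, wf)
## ----------------------------------------------------------------------------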
from utils import getDatasetBlockAndSite, siteInfo, getWorkflows, workflowInfo, monitor_dir
from collections import defaultdict
import time
import json
import sys

spec = None
if len(sys.argv) > 1:
    spec = sys.argv[1]

url = 'cmsweb.cern.ch'
wfs = getWorkflows(url, 'acquired', details=True)
wfs.extend(getWorkflows(url, 'running-open', details=True))
wfs.extend(getWorkflows(url, 'running-closed', details=True))

jobs_for = defaultdict(lambda: defaultdict(int))
wf_for = defaultdict(lambda: defaultdict(set))
agent_for = defaultdict(lambda: defaultdict(set))
s_block_locations = {}
block_locations = defaultdict(lambda: defaultdict(list))
wfs_no_location_in_GQ = defaultdict(list)
si = siteInfo()
#bad_blocks = defaultdict( set )
unprocessable = set()

for wf in wfs:
    if spec and not spec in wf['RequestName']:
        continue
    wfi = workflowInfo(url, wf['RequestName'], request=wf)
    sitewhitelist = wfi.request['SiteWhitelist']
    wqs = wfi.getWorkQueue()
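## --- illustration (not part of Unified) -------------------------------------
## The per-site counters above rely on nested defaultdicts; a stdlib-only
## example of that data structure with made-up site, status and agent names:
from collections import defaultdict

toy_jobs = defaultdict(lambda: defaultdict(int))
toy_jobs['T2_DE_DESY']['Running'] += 10  # no KeyError on first access
toy_agents = defaultdict(lambda: defaultdict(set))
toy_agents['T2_DE_DESY']['Running'].add('vocms0123')
print dict(toy_jobs['T2_DE_DESY']), dict(toy_agents['T2_DE_DESY'])
## ----------------------------------------------------------------------------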
def assignor(url, specific=None, talk=True, options=None):
    if userLock(): return
    mlock = moduleLock()
    if mlock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])
    if options.early:
        print "Option Early is on"
        fetch_from.extend(['staged'])
    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = json.loads(open('%s/dataset_endpoints.json' % monitor_dir).read())
    aaa_mapping = json.loads(open('%s/equalizor.json' % monitor_pub_dir).read())['mapping']
    all_stuck = set()
    all_stuck.update(json.loads(open('%s/stuck_transfers.json' % monitor_pub_dir).read()))
    all_stuck.update(getAllStuckDataset())

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ## order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]

        def rank(wfn):
            return cache.index(wfn) if wfn in cache else 0

        wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True)
        print "10 first", [wfo.name for wfo in wfos[:10]]
        print "10 last", [wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle(wfos)

    for wfo in wfos:
        if options.limit and (n_stalled + n_assigned) > options.limit:
            break
        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break
        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in ['rejected', 'aborted', 'aborted-completed', 'aborted-archived', 'rejected-archived'] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early:
            options_text += ", early option is ON"
        if options.partial:
            options_text += ", partial option is ON"
            options_text += ", good fraction is %.2f" % options.good_enough

        wfh.sendLog('assignor', "%s to be assigned%s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor', 'Workflow %s has no output at all' % (wfo.name), level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor', "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]:
                banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)))
                    sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)), level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)):
                wfh.sendLog('assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - set(allowed_secondary.keys()))))
                sendLog('assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - set(allowed_secondary.keys()))), level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', [])

        blocks = wfh.getBlockWhiteList()
        rwl = wfh.getRunWhiteList()
        if rwl:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set(blocks + getDatasetBlocks(dataset, runs=rwl)))
        lwl = wfh.getLumiWhiteList()
        if lwl:
            ## augment with lumi white list
            for dataset in primary:
                blocks = list(set(blocks + getDatasetBlocks(dataset, lumis=lwl)))

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False  #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']

        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
0 means no print "Could do partial disk copy assignment" if is_stuck or options.partial: do_partial = assign_parameters['partial_copy'] wfh.sendLog('assignor',"Overiding partial copy assignment to %.2f fraction"% do_partial) #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name)) do_partial = options.good_enough if options.partial else do_partial for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" #sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] if secondary_aaa: if not one_secondary_locations: sec_availability = getDatasetBlocksFraction( url, sec ) if sec_availability >=1. and options.go: ## there is at least one copy of each block on disk. We should go ahead and let it go. wfh.sendLog('assignor',"The secondary %s is available %s times on disk, and usable"%( sec, sec_availability)) else: ## not even a copy on disk anywhere !!!! sites_allowed = [] ## will block the assignment wfh.sendLog('assignor',"The secondary %s is nowhere on disk"% sec) #just continue without checking continue #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog('assignor',"Intersecting with secondary requirement, now allowed %s"%sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor",dataset_endpoints[prim] endpoints.update( dataset_endpoints[prim] ) set_lfn = getLFNbase( prim ) ## if they are requested for processing, they should bbe all closed already closeAllBlocks(url, prim, blocks) presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks) if primary_aaa: available_fractions[prim] = getDatasetBlocksFraction(url, prim, only_blocks = blocks) sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] if primary_aaa: sites_all_data = set() for (psite,(there,frac)) in presence.items(): if there: sites_all_data.update( SI.SE_to_CEs(psite) ) sites_all_data = list(sites_all_data) #sites_all_data = list(set([SI.SE_to_CE(psite) for (psite,(there,frac)) in presence.items() if there])) sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite 
for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] if primary_aaa: sites_with_any_data = set() for psite in presence.keys(): sites_with_any_data.update( SI.SE_to_CEs(psite) ) sites_with_any_data = list(sites_with_any_data) #sites_with_any_data = list(set([SI.SE_to_CE(psite) for psite in presence.keys()])) holding_but_not_allowed = set() for se_site in presence.keys(): if not (set(SI.SE_to_CEs(se_site)) & set(sites_allowed)): holding_but_not_allowed.add( se_site ) #wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])))) wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted( holding_but_not_allowed )) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite for osite in opportunistic_sites if osite in SI.sites_not_ready])) down_time = True ## should this be send back to considered ? 
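## --- illustration (not part of Unified) -------------------------------------
## Pure-set sketch of the opportunistic-site computation above: storage
## elements holding both some primary and some secondary piece, minus the SEs
## already whitelisted. Toy SE names; the real code maps back to CEs with
## SI.SE_to_CE before extending the running list.
def opportunistic_ses(primary_ses, secondary_ses, allowed_ses):
    return (set(secondary_ses) & set(primary_ses)) - set(allowed_ses)

print sorted(opportunistic_ses(['SE_A', 'SE_B'], ['SE_B', 'SE_C'], ['SE_A']))
# ['SE_B']
## ----------------------------------------------------------------------------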
        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog('assignor', '%s requires a large number of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical')
            wfh.sendLog('assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(1, copies_wanted - less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable locally if they do not have the data
            if not sites_all_data:
                wfh.sendLog('assignor', "Overriding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog('assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault('RequestType', None))
        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog('assignor', "Cannot assign StoreResults request because MergedLFN is missing")
                sendLog('assignor', 'Cannot assign StoreResults request because MergedLFN is missing', level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog('assignor', "Cannot assign StoreResults request because SiteWhitelist is missing")
                    sendLog('assignor', 'Cannot assign StoreResults request because SiteWhitelist is missing', level='critical')
                    n_stalled += 1
                    continue
                available_fractions = {}
            wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints", sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowing the endpoints"
                n_stalled += 1
                continue

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assigned with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the sites allowed is under low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog('assignor', "The workflow can run at %s under low pressure currently" % (','.join(allowed_and_low)))
            copies_wanted = max(1., copies_wanted - 1.)

        if available_fractions and not all([available >= copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available >= 1. for available in available_fractions.values()])
            above_good = all([available >= do_partial for available in available_fractions.values()])
            wfh.sendLog('assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog('assignor', "sending back to considered because of site downtime, instead of waiting")
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog('assignor', '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.' % (wfo.name), level='delay')
                n_stalled += 1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good):
                    wfh.sendLog('assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append(wfo.name)
                    open('cannot_assign.json', 'w').write(json.dumps(known, indent=2))
                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled += 1
                    continue

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor', "cannot be assigned with no matched sites")
                sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical')
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog('assignor', "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog('assignor', "Reading secondary through xrootd at %s" % sorted(sites_allowed))
at %s"%sorted(sites_allowed)) ## plain assignment here team='production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team parameters['Team'] = team if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 def pick_options(options, parameters): ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if type(v)==str and ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v def pick_campaign( assign_parameters, parameters): ## pick up campaign specific assignment parameters parameters.update( assign_parameters.get('parameters',{}) ) if options.force_options: pick_campaign( assign_parameters, parameters) pick_options(options, parameters) else: ## campaign parameters update last pick_options(options, parameters) pick_campaign( assign_parameters, parameters) if not options.test: parameters['execute'] = True hold_split, split_check = wfh.checkSplitting() if hold_split and not options.go: if split_check: wfh.sendLog('assignor','Holding on to the change in splitting %s'%( '\n\n'.join([str(i) for i in split_check]))) else: wfh.sendLog('assignor','Change of splitting is on hold') n_stalled+=1 continue if split_check==None or split_check==False: n_stalled+=1 continue elif split_check: ## operate all recommended changes reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check) wfh.sendLog('assignor','Applying the change in splitting %s'%( '\n\n'.join([str(i) for i in split_check]))) split_check = True ## bypass completely and use the above # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical') wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical') wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical') wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.") if isHEPCloudReady(url) and wfh.isGoodForNERSC(): parameters['Team'] = 'hepcloud' parameters['SiteWhitelist'] = ['T3_US_NERSC'] 
if primary: parameters['TrustSitelists'] = True if secondary: parameters['TrustPUSitelists'] = True sendEmail("sending work to hepcloud","pleasse check on %s"% wfh.request['RequestName'], destination=['*****@*****.**']) ## make sure to autoapprove all NonCustodialSites parameters['AutoApproveSubscriptionSites'] = list(set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites',[]))) result = reqMgrClient.assignWorkflow(url, wfo.name, None, parameters) ## team is not relevant anymore here # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 wfh.sendLog('assignor',"Properly assigned\n%s"%(json.dumps( parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']: ## lock all outputs LI.lock( secure, reason = 'assigning') except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: wfh.sendLog('assignor',"Failed to assign %s.\n%s \n Please check the logs"%(wfo.name, reqMgrClient.assignWorkflow.errorMessage)) sendLog('assignor',"Failed to assign %s.\n%s \n Please check the logs"%(wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical') print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled)) if n_stalled and not options.go and not options.early: sendLog('assignor',"%s workflows cannot be assigned. Please take a look"%(n_stalled), level='critical')
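## --- illustration (not part of Unified) -------------------------------------
## Arithmetic sketch of the run-dependent MC (PU_RD*) splitting above: aim for
## about reqJobs jobs with a 1.4 safety factor, then convert events/job into
## lumis/job through the average events per lumi of the input. Toy numbers.
def rd_splitting(num_events, events_per_lumi, req_jobs=500):
    events_per_job = int(num_events / (req_jobs * 1.4))
    lumis_per_job = int(events_per_job / events_per_lumi)
    return events_per_job, lumis_per_job

print rd_splitting(100000000, 300.)  # -> (142857, 476)
## ----------------------------------------------------------------------------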
def checkor(url, spec=None, options=None):
    if userLock():
        return
    if duplicateLock() and not options.go:
        return

    fDB = closeoutInfo()
    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check():
        return
    use_mcm = up.status['mcm']

    def time_point(label="", sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())
        print "Time check (%s) point at : %s" % (label, nows)
        print "Since start: %s [s]" % (now - time_point.start)
        if sub_lap:
            print "Sub Lap : %s [s]" % (now - time_point.sub_lap)
            time_point.sub_lap = now
        else:
            print "Lap : %s [s]" % (now - time_point.lap)
            time_point.lap = now
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime())

    runnings = session.query(Workflow).filter(Workflow.status == 'away').all()
    standings = session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()

    ## intersect with what is actually in completed status in request manager now
    all_completed = set(getWorkflows(url, 'completed'))

    wfs = []
    if options.strict:
        ## the ones which were running and have now completed
        print "strict option is on: checking workflows that freshly completed"
        wfs.extend(filter(lambda wfo: wfo.name in all_completed, runnings))
    if options.update:
        print "update option is on: checking workflows that have not completed yet"
        wfs.extend(filter(lambda wfo: not wfo.name in all_completed, runnings))
    if options.clear:
        print "clear option is on: checking workflows that are ready to toggle closed-out"
        wfs.extend(filter(lambda wfo: 'custodial' in wfo.status, standings))
    if options.review:
        print "review option is on: checking the workflows that needed intervention"
        wfs.extend(filter(lambda wfo: not 'custodial' in wfo.status, standings))

    ## what is left out are the wf which were running and ended up aborted/failed/...

    custodials = defaultdict(list)  # sites : dataset list
    transfers = defaultdict(list)  # sites : dataset list
    invalidations = []  # a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False) if use_mcm else None

    def get_campaign(output, wfi):
        ## this should be a perfect matching of output->task->campaign
        campaign = None
        era = None
        wf_campaign = None
        if 'Campaign' in wfi.request:
            wf_campaign = wfi.request['Campaign']
        try:
            era = output.split('/')[2].split('-')[0]
        except:
            era = None
        if wfi.isRelval():
            campaign = wf_campaign
        else:
            campaign = era if era else wf_campaign
        return campaign

    ## retrieve bypass and onhold configuration
    bypasses = []
    forcings = []
    overrides = getForceCompletes()
    holdings = []

    actors = UC.get('allowed_bypass')
    for bypassor, email in actors:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json' % (bypassor[0], bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend(json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor', "cannot get by-passes from %s for %s" % (bypass_file, bypassor))
            sendEmail("malformed by-pass information", "%s is not json readable" % (bypass_file), destination=[email])

        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json' % (bypassor[0], bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            extending = json.loads(open(holding_file).read())
            print bypassor, "is holding", extending
            holdings.extend(extending)
        except:
            sendLog('checkor', "cannot get holdings from %s for %s" % (holding_file, bypassor))
            sendEmail("malformed by-pass information", "%s is not json readable" % (holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider, email in actors:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json' % (rider[0], rider)
        if not os.path.isfile(rider_file):
            print "no file", rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend(json.loads(open(rider_file).read()))
        except:
            sendLog('checkor', "cannot get force complete list from %s" % rider)
            sendEmail("malformed force-complete file", "%s is not json readable" % rider_file, destination=[email])

    if use_mcm:
        forcings = mcm.get('/restapi/requests/forcecomplete')
        #if forcings:
        #    sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings)))

    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5. * 60.
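## --- illustration (not part of Unified) -------------------------------------
## The bypass/onhold/force-complete inputs above are operator-owned JSON files
## on AFS; a minimal tolerant reader in the same spirit (hypothetical helper:
## returns an empty list when the file is absent or not valid JSON):
import json
import os

def read_ops_json(path):
    if not os.path.isfile(path):
        return []
    try:
        return json.loads(open(path).read())
    except ValueError:
        return []

print read_ops_json('/tmp/does_not_exist.json')  # -> []
## ----------------------------------------------------------------------------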
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle(wfs)

    in_manual = 0

    ## now you have a record of what file was invalidated globally from TT
    TMDB_invalid = dataCache.get('file_invalidation')
    #try:
    #    TMDB_invalid = set([row[3] for row in csv.reader( os.popen('curl -s "https://docs.google.com/spreadsheets/d/11fFsDOTLTtRcI4Q3gXw0GNj4ZS8IoXMoQDC3CbOo_2o/export?format=csv"'))])
    #    TMDB_invalid = map(lambda e : e.split(':')[-1], TMDB_invalid)
    #    print len(TMDB_invalid),"globally invalidated files"
    #except Exception as e:
    #    print "TMDB not fetched"
    #    print str(e)
    #    TMDB_invalid = []

    print len(wfs), "to consider, pausing for", sleep_time
    max_per_round = UC.get('max_per_round').get('checkor', None)
    if options.limit:
        max_per_round = options.limit
    if max_per_round and not spec:
        wfs = wfs[:max_per_round]

    for wfo in wfs:
        if spec and not (spec in wfo.name):
            continue
        time.sleep(sleep_time)
        time_point("Starting with %s" % wfo.name)

        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor', "checking on %s %s" % (wfo.name, wfo.status))

        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor', "%s is already %s, setting close" % (wfo.name, wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue
        elif wfo.wm_status in ['failed', 'aborted', 'aborted-archived', 'rejected', 'rejected-archived', 'aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor', "%s is in trouble %s" % (wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned', 'acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor', "%s is not running yet" % wfo.name)
            session.commit()
            continue

        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor', "%s is on hold" % wfo.name)
                continue

        if wfo.wm_status != 'completed':  # and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor', "no need to check on %s in status %s" % (wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor', "setting %s on hold" % wfo.name)
            session.commit()
            continue

        session.commit()
        #sub_assistance = ""  # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:])  # [0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False
        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor', "we can bypass checks on %s because of keyword %s " % (wfo.name, bypass))
                bypass_checks = True
                break

        pids = wfi.getPrepIDs()
        force_by_mcm = False
        force_by_user = False
        for force in forcings:
            if force in pids:
                wfi.sendLog('checkor', "we can bypass checks and force complete %s because of prepid %s " % (wfo.name, force))
                bypass_checks = True
                force_by_mcm = True
                break
        for user in overrides:
            for force in overrides[user]:
                if force in wfo.name:
                    wfi.sendLog('checkor', "we can bypass checks and force complete %s because of keyword %s of user %s" % (wfo.name, force, user))
                    bypass_checks = True
                    force_by_user = True
                    break

        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check'))  # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial'))  # dqm*, reco
        to_ddm_tier = copy.deepcopy(UC.get('tiers_to_DDM'))
        campaigns = {}
        ## this mapping of campaign per output dataset assumes era==campaign, which is not true for relval
        expected_outputs = copy.deepcopy(wfi.request['OutputDatasets'])
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override']))  ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:" + ",".join(sorted(wfi.request['OutputDatasets']))
        wfi.request['OutputDatasets'] = [out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:" + ",".join(sorted(wfi.request['OutputDatasets']))
        check_output_text += "\ntiers out:" + ",".join(sorted(tiers_with_no_check))
        check_output_text += "\ntiers no custodial:" + ",".join(sorted(vetoed_custodial_tier))
        wfi.sendLog('checkor', check_output_text)

        ## anything running on acdc : getting the real prepid is not worth it
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        forced_already = False
        acdc_bads = []
        true_familly = []
        for member in familly:
            if member['RequestType'] != 'Resubmission':
                continue
            if member['RequestName'] == wfo.name:
                continue
            if member['RequestDate'] < wfi.request['RequestDate']:
                continue
            if member['PrepID'] != wfi.request['PrepID']:
                continue
            #if 'OriginalRequestName' in member and (not 'ACDC' in member['OriginalRequestName']) and member['OriginalRequestName'] != wfo.name: continue
            if member['RequestStatus'] == None:
                continue
            if not set(member['OutputDatasets']).issubset(set(expected_outputs)):
                if not member['RequestStatus'] in ['rejected-archived', 'rejected', 'aborted', 'aborted-archived']:
                    ## this is not good at all
                    wfi.sendLog('checkor', 'inconsistent ACDC %s' % member['RequestName'])
                    #sendLog('checkor','inconsistent ACDC %s'%member['RequestName'], level='critical')
                    acdc_bads.append(member['RequestName'])
                    is_closing = False
                    assistance_tags.add('manual')
                continue
            true_familly.append(member['RequestName'])
            #try:
            #    parse_one(url, member['RequestName'])
            #except:
            #    print "Could not make error report for",member['RequestName']
            if member['RequestStatus'] in ['running-open', 'running-closed', 'assigned', 'acquired']:
                print wfo.name, "still has an ACDC running", member['RequestName']
                acdc.append(member['RequestName'])
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
                if (force_by_mcm or force_by_user) and not forced_already:
                    wfi.sendLog('checkor', '%s is being forced completed while recovering' % wfo.name)
                    wfi.notifyRequestor("The workflow %s was force completed" % wfo.name, do_batch=False)
                    forceComplete(url, wfi)
                    forced_already = True
            else:
                acdc_inactive.append(member['RequestName'])
                assistance_tags.add('recovered')
        if acdc_bads:
            #sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) ))
            sendLog('checkor', 'For %s, ACDC %s is inconsistent, preventing from closing or will create a mess.' % (wfo.name, ','.join(acdc_bads)), level='critical')

        time_point("checked workflow family", sub_lap=True)

        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected, lumi_expected = 0, 0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor', "TotalInputEvents is missing from the workload of %s" % wfo.name, level='critical')
        else:
            event_expected, lumi_expected = wfi.request['TotalInputEvents'], wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = wfi.request['Task1']['RequestNumEvents']
            for i in range(1, 20):
                if 'Task%d' % i in wfi.request:
                    ## this is wrong in absolute terms
                    if 'FilterEfficiency' in wfi.request['Task%d' % i]:
                        event_expected *= float(wfi.request['Task%d' % i]['FilterEfficiency'])
            event_expected = int(event_expected)

        fractions_pass = {}
        events_per_lumi = {}

        over_100_pass = False
        (lhe, prim, _, _) = wfi.getIO()
        if lhe or prim:
            over_100_pass = False

        time_point("expected statistics", sub_lap=True)

        for output in wfi.request['OutputDatasets']:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=output)
            events_per_lumi[output] = event_count / float(lumi_count) if lumi_count else 100
            percent_completions[output] = 0.
            if lumi_expected:
                percent_completions[output] = lumi_count / float(lumi_expected)
            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s" % (event_count, event_expected))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float(event_expected))

            default_pass = UC.get('default_fraction_pass')
            fractions_pass[output] = default_pass
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                if type(CI.campaigns[c]['fractionpass']) == dict:
                    tier = output.split('/')[-1]
                    priority = str(wfi.request['RequestPriority'])
                    ## defined per tier
                    fractions_pass[output] = CI.campaigns[c]['fractionpass'].get('all', default_pass)
                    if tier in CI.campaigns[c]['fractionpass']:
                        fractions_pass[output] = CI.campaigns[c]['fractionpass'][tier]
                    if priority in CI.campaigns[c]['fractionpass']:
                        fractions_pass[output] = CI.campaigns[c]['fractionpass'][priority]
                else:
                    fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement" % (fractions_pass[output], output))
            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to", fractions_pass[output], "by command line for", output
            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to", fractions_pass[output], "by dataset key", key

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            possible_recoveries = wfi.getRecoveryDoc()
            if possible_recoveries == []:
                wfi.sendLog('checkor', '%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to announcement' % (wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2)))
                sendLog('checkor', '%s is not completed, but has nothing to be recovered, passing along ?' % wfo.name, level='critical')
                #sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)#,destination=['*****@*****.**'])
                ## do not bypass for now, until Alan understands why we are losing ACDC docs
                bypass_checks = True
            else:
                wfi.sendLog('checkor', '%s is not completed \n%s \n%s' % (wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2)))

            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] > 100 for out in fractions_pass]):
            print wfo.name, "is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        time_point("checked output size", sub_lap=True)

        ## correct lumi < 300 event per lumi
        #for output in wfi.request['OutputDatasets']:
        #    events_per_lumi[output] = getDatasetEventsPerLumi( output )

        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]
            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to", upper_limit, "for", campaign
            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to", upper_limit, "by command line"
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi', 'ReReco']:
                lumi_upper_limit[output] = -1

        if any([(lumi_upper_limit[out] > 0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name, "has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False

        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        time_point("checked dataset presence", sub_lap=True)

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)
            if not custodial_locations[output]:
                custodial_locations[output] = []

        time_point("checked custodiality", sub_lap=True)

        ## presence in phedex
        phedex_presence = {}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output)

        time_point("checked phedex count", sub_lap=True)

        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out) / 1023. for out in out_worth_checking])  ## size in TBs of all outputs
        size_worht_going_to_ddm = sum([getDatasetSize(out) / 1023. for out in out_worth_checking if out.split('/')[-1] in to_ddm_tier])  ## size in TBs of the outputs going to DDM
        if not all(map(lambda sites: len(sites) != 0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name, "does not have all custodial locations"
            print json.dumps(custodial_locations, indent=2)

        ##########
        ## hook for making a custodial replica ?
custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" group = None if campaign in CI.campaigns and 'phedex_group' in CI.campaigns[campaign]: group = CI.campaigns[campaign]['phedex_group'] print "using group",group,"for replica" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') tape_size_limit = options.tape_size_limit if options.tape_size_limit else UC.get("tape_size_limit") _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have a custodial location in the first place. abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWAODSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if custodial and size_worth_going_to_ddm > tape_size_limit: wfi.sendLog('checkor',"The total output size (%s TB) is too large for the limit set (%s TB)"%( size_worth_going_to_ddm, tape_size_limit)) custodial = None if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: wfi.sendLog('checkor','Using %s as a tape destination for %s'%(custodial, output)) custodials[custodial].append( output ) if group: custodials[custodial][-1]+='@%s'%group ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in
phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False time_point("determined tape location", sub_lap=True) ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) time_point("dbs file count", sub_lap=True) if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: mismatch_notice = wfo.name+" has a dbs,phedex mismatch\n" mismatch_notice += "in dbs\n"+json.dumps(dbs_presence, indent=2) +"\n" mismatch_notice += "invalide in dbs\n"+json.dumps(dbs_invalid, indent=2) +"\n" mismatch_notice += "in phedex\n"+json.dumps(phedex_presence, indent=2) +"\n" wfi.sendLog('checkor',mismatch_notice) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex), "\n".join( missing_phedex ))) were_invalidated = sorted(set(missing_phedex) & set(TMDB_invalid )) if were_invalidated: wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated), "\n".join(were_invalidated))) sendLog('checkor',"These %d files were invalidated globally\n%s\nand are invalidated in dbs"%(len(were_invalidated), "\n".join(were_invalidated)), level='critical') dbs3Client.setFileStatus( were_invalidated, newstatus=0 ) if missing_dbs: wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs), "\n".join( missing_dbs ))) were_invalidated = sorted(set(missing_dbs) & set(TMDB_invalid )) if were_invalidated: wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated), "\n".join(were_invalidated))) #if not bypass_checks: ## I don't think we can by pass this is_closing = False time_point("checked file count", sub_lap=True) fraction_invalid = 0.20 if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignoreinvalid: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} files_per_rl = {} for output in wfi.request['OutputDatasets']: duplications[output] = "skiped" files_per_rl[output] = "skiped" time_point("checked invalidation", sub_lap=True) if (is_closing or bypass_checks) and (not options.ignoreduplicates): print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, 
verbose=True) except: try: duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True) except Exception as e: wfi.sendLog('checkor','Not possible to check on duplicate lumi count on %s'%(output)) sendLog('checkor','Not possible to check on duplicate lumi count on %s\n%s'%(output,str(e)),level='critical') is_closing=False if is_closing and any(duplications.values()) and not options.ignoreduplicates: duplicate_notice = "" duplicate_notice += "%s has duplicates\n"%wfo.name duplicate_notice += json.dumps( duplications,indent=2) duplicate_notice += '\n' duplicate_notice += json.dumps( files_per_rl, indent=2) wfi.sendLog('checkor',duplicate_notice) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False time_point("checked duplicates", sub_lap=True) time_point("done with %s"%wfo.name) ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] #rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['percentage'] = math.floor(percent_completions[output]*10000)/100.## round down rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) rec['familly'] = true_familly now = time.gmtime() rec['timestamp'] = time.mktime(now) rec['updated'] = time.asctime(now)+' (GMT)' ## make the lumi summary if wfi.request['RequestType'] == 'ReReco': try: os.system('python Unified/lumi_summary.py %s 1 > /dev/null'%(wfi.request['PrepID'])) os.system('python Unified/lumi_plot.py %s > /dev/null'%(wfi.request['PrepID'])) wfi.sendLog('checkor','Lumi summary available at %s/datalumi/lumi.%s.html'%(unified_url,wfi.request['PrepID'])) except Exception as e: print str(e) ## make the error report ## and move on if is_closing: ## toggle status to closed-out in request manager wfi.sendLog('checkor',"setting %s closed-out"% wfo.name) if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 
'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and force_by_mcm: ## shoot large on all prepids, on closing the wf for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: if not 'custodial' in assistance_tags or wfi.isRelval(): ## do only the report for those for member in acdc+acdc_inactive+[wfo.name]: try: parse_one(url, member) except: print "Could not make error report for",member ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that had ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') in_manual += 1 if 'recovery' in assistance_tags and 'manual' in assistance_tags: ## this is likely because something bad is happening, so leave it to manual assistance_tags = assistance_tags - set(['recovery']) assistance_tags.add('manual') in_manual += 1 ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) ###detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' #detailslink = 'https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s'%(wfo.name) ###perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) perflink = '%s/report/%s'%(unified_url,wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity 
blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: wfi.sendLog('checkor','setting %s to %s'%(wfo.name, wfo.status)) session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec and in_manual!=0: sendEmail("fresh assistance status available","Fresh status are available at %s/assistance.html"%unified_url,destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: items_at = defaultdict(set) for i in custodials[site]: item, group = i.split('@') if '@' in i else (i,'DataOps') items_at[group].add( item ) for group,items in items_at.items(): print ','.join(items),'=>',site,'@',group if not options.test: result = makeReplicaRequest(url, site, sorted(items) ,"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) , group=group) print result print "File Invalidation" print invalidations
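## ----------------------------------------------------------------------
## Illustrative sketch: the custodial bookkeeping above encodes an
## optional phedex group as "dataset@group" and decodes it again before
## placing one replica request per (site, group). The decoding step in
## isolation, with a hypothetical input:
from collections import defaultdict

def group_custodial_items(items, default_group='DataOps'):
    ## map group -> set of datasets, honouring the "dataset@group" convention
    items_at = defaultdict(set)
    for i in items:
        item, group = i.split('@') if '@' in i else (i, default_group)
        items_at[group].add(item)
    return items_at
## e.g. group_custodial_items(['/A/B/RECO', '/C/D/AOD@RelVal'])
## -> {'DataOps': set(['/A/B/RECO']), 'RelVal': set(['/C/D/AOD'])}
## ----------------------------------------------------------------------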
def transferor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() LI = lockInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0,max_to_handle - being_handled) allowed_to_transfer = max(0,max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer" else: print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) needing_locks=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status=='considered').all(): print "\t",wfo.name if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority=0 min_transfer_priority=100000000 print "getting all wf in staging ..." for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) (lheinput,primary,parent,secondary) = wfh.getIO() sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: input_sizes[prim] = dss.get( prim ) print "\t",wfo.name,"needs",input_sizes[prim],"GB" in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... 
done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority print "transfers per sites" print json.dumps( transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) # shuffle first by name random.shuffle( wfs_and_wfh ) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: input_sizes[prim] = dss.get( prim ) print "... done" cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoretical system time in transfer"%( st_in_transfer_already ) print "%15.4f [h] worth of theoretical system time is the current requested transfer load"%( st_to_transfer ) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transferred"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print "%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False destination_cache = {} for (wfo,wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name,"to be transferred" #wfh = workflowInfo( url, wfo.name) (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: print "Transfer has gone over budget." else: print "Transfer will go over budget."
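## ----------------------------------------------------------------------
## Illustrative sketch: the gate just above lets a request ride over the
## global transfer budget only if it outranks everything already staging
## (and is not alone at that priority level). Condensed, stand-alone,
## with sizes in GB:
def may_pass_budget(this_load, staged_so_far, limit, prio, in_prio, min_prio):
    ## within budget, or strictly dominating the in-flight priority floor
    if staged_so_far + this_load <= limit:
        return True
    return prio >= in_prio and min_prio != in_prio
## e.g. may_pass_budget(500., 9800., 10000., 110000, 90000, 85000) -> True
## ----------------------------------------------------------------------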
print "%15.4f GB this load"%this_load print "%15.4f GB already this round"%sum(transfer_sizes.values()) print "%15.4f GB is the available limit"%transfer_limit went_over_budget=True if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget" else: if not options.go: print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop" continue ## throtlle by campaign go if not CI.go( wfh.request['Campaign'] ): print "No go for",wfh.request['Campaign'] if not options.go: sendEmail("no go for managing","No go for "+wfh.request['Campaign']) continue ## check if the batch is announced def check_mcm(wfn): announced=False is_real=False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: print "could not get mcm batch announcement, assuming not real" return announced,is_real if not use_mcm: announced,is_real = False,True else: announced,is_real = check_mcm( wfo.name ) if not announced: print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?" if not is_real: print wfo.name,"does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle else: print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along if not options.go: break if this_load and needs_transfer >= allowed_to_transfer: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_transfer else: print "Not allowed to transfer more than",max_to_transfer,"at a time. 
Currently",being_transfered,"transfering, and adding",needs_transfer if not options.go: continue (lheinput,primary,parent,secondary) = wfh.getIO() for dataset in list(primary)+list(parent)+list(secondary): ## lock everything flat NLI.lock( dataset ) if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] if 'SiteBlacklist' in CI.parameters(wfh.request['Campaign']): sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfh.request['Campaign'])['SiteBlacklist'])) ## reduce right away to sites in case of memory limitation memory_allowed = SI.sitesByMemory( wfh.request['Memory'] ) if memory_allowed!=None: print "sites allowing", wfh.request['Memory'],"are",memory_allowed sites_allowed = list(set(sites_allowed) & set(memory_allowed)) if not sites_allowed: print wfo.name,"has no possible sites to run at" print "available for",wfh.request['Memory'],"are",memory_allowed sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## should make the block selection here pass if 'LumiList' in wfh.request and wfh.request['LumiList']: ## same, we could be doing the white list here too pass if blocks: print "Reading",len(blocks),"in whitelist" can_go = True staging=False allowed=True if primary: copies_needed_from_CPUh,CPUh = wfh.getNCopies() if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) sites_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] print "Sites allowed minus the vetoed transfer" print sorted(sites_allowed) copies_needed_from_site = int(0.35*len(sites_allowed))+1 ## should just go for a fixed number based if the white list grows that big print "Would make",copies_needed_from_site,"copies from site white list" copies_needed = copies_needed_from_site print "Would make",copies_needed_from_CPUh,"from cpu requirement",CPUh copies_needed = copies_needed_from_CPUh if options.maxcopy>0: ## stop maxing things out ?? 
#copies_needed = min(options.maxcopy,copies_needed) #print "Maxed to",copies_needed if copies_needed_from_CPUh > options.maxcopy: sendEmail('An example of more than three copies','for %s it could have been beneficial to make %s copies'%( wfo.name, copies_needed_from_CPUh)) if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign,copies_needed_from_site) print "Maxed to",copies_needed,"by campaign configuration",wfh.request['Campaign'] ## remove the sites that do not want transfers workflow_dependencies[prim].add( wfo.id ) ##################################### ###### JR 3/8/15 #### deprecating this """ presence = getDatasetPresence( url, prim , within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) prim_location = [site for site,pres in presence.items() if pres[0]==True] prim_parts = [site for site,pres in presence.items() if pres[0]==False] if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed subscriptions = listSubscriptions( url , prim , sites_allowed ) prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) ## remove the subscription where the dataset is in parts at #prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']]) and not site in prim_parts])) ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place prim_destination = [site for site in prim_destination if not site in prim_location] ## add transfer dependencies latching_on_transfers = list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) print latching_on_transfers """ ###### JR 3/8/15 #### deprecating this ##################################### ### new ways of making the whole thing destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks ) #destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='DataOps') #anaops_destinations,anaops_all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='AnalysisOps' ) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1] ## the rest is places it is going to be prim_destination = [site for site in destinations.keys() if not site in prim_location] ## need to take out the transfer veto prim_destination = [site for site in prim_destination if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] for dsite in prim_destination: needing_locks[dsite].append( prim ) if len(prim_location) >= copies_needed: print "The output is all 
fully in place at",len(prim_location),"sites",prim_location continue copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names] latching_on_transfers = set() [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]): ## means there is openings let me go print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute] for site in sites_allowed: #increment accross the board, regardless of real destination: could be changed transfers_per_sites[site] += 1 else: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over transfer slots available" else: print "Not allowed to transfer more than",max_staging_per_site," per site at a time. Going overboard for",[site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site] if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? 
#copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0,copies_needed - min(copies_being_made)) print "then need",copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with",latching_on_transfers can_go = True continue if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks) spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site]=blocks else: spreading[site]=[prim] transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified can_go = False print "selected CE destinations",spreading.keys() for (site,items) in spreading.items(): all_transfers[site].extend( items ) if not allowed: print "Not allowed to move on with",wfo.name continue if secondary: if talk: print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) if False: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) destinations = destination_cache[sec] ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9] sec_location = [site for (site,info) in destinations.items() if info['completion']>=95] sec_destination = [site for site in destinations.keys() if not site in sec_location] else: ## old style presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] for site in sec_location: needing_locks[site].append( sec ) for site in sec_destination: needing_locks[site].append( sec ) sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len( sec_to_distribute )>0: sec_size = dss.get( sec ) for site in sec_to_distribute: site_se =SI.CE_to_SE(site) if (SI.disk[site_se]*1024.) 
> sec_size: all_transfers[site].append( sec ) can_go = False else: print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name,"latches on existing transfers, and nothing else" wfo.status = 'staging' needs_transfer+=1 else: print wfo.name,"should just be assigned NOW to",sites_allowed wfo.status = 'staged' passing_along+=1 print "setting status to",wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name,"latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to",wfo.status session.commit() print wfo.name,"needs a transfer" needs_transfer+=1 passing_along+=1 print "accumulated locks of dataset in place" print json.dumps(needing_locks, indent=2) for site,items in needing_locks.items(): for item in items: LI.lock( item, SI.CE_to_SE(site), 'usable input') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to",site,"(CE)",site_se,"(SE) for" else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) print "\t",len(datasets),"datasets" print "\t",datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ #for item in list(set([it.split('#')[0] for it in items_to_transfer])): for item in items_to_transfer: LI.lock( item, site_se, 'pre-staging') else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in 
result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first() print phedexid,"transfer created" if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
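## ----------------------------------------------------------------------
## Illustrative sketch: when a workflow latches onto an existing PhEDEx
## subscription, the code above upserts a Transfer row and re-assigns a
## copied list so the ORM registers the change. The pattern in isolation,
## assuming the session and Transfer model used throughout this file:
import copy

def attach_workflow_to_transfer(session, Transfer, phedexid, wf_id):
    ## find or create the Transfer row for this phedex request
    tfo = session.query(Transfer).filter(Transfer.phedexid == phedexid).first()
    if not tfo:
        tfo = Transfer(phedexid=phedexid)
        tfo.workflows_id = []
        session.add(tfo)
    if wf_id not in tfo.workflows_id:
        l = copy.deepcopy(tfo.workflows_id)
        l.append(wf_id)
        tfo.workflows_id = l  ## re-assign: an in-place append may not be flushed
    return tfo
## ----------------------------------------------------------------------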
def transferor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_transfered = len( session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0, max_to_handle - being_handled) allowed_to_transfer = max(0, max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer" else: print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer" print "... done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter( Workflow.status.startswith('considered')).all(): print "\t", wfo.name if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} ignored_input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority = None min_transfer_priority = None print "getting all wf in staging ..." 
stucks = json.loads(open('%s/stuck_transfers.json' % monitor_dir).read()) for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ds_s = dss.get(prim) if prim in stucks: sendLog('transferor', "%s appears stuck, so not counting it %s [GB]" % (prim, ds_s), wfi=wfh) ignored_input_sizes[prim] = ds_s else: input_sizes[prim] = ds_s sendLog('transferor', "%s needs %s [GB]" % (wfo.name, ds_s), wfi=wfh) if in_transfer_priority == None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None or in_transfer_priority == None: print "nothing is lining up for transfer" sendEmail("no request in staging", "no request in staging") return pass try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort(key=lambda i: i[1]) print "\n".join(map(str, ignored_values)) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort(key=lambda i: i[1]) print "\n".join(map(str, considered_values)) except Exception as e: print "trying to print the summary of input size" print str(e) print "... done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority print "transfers per sites" print json.dumps(transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get(prim) input_sizes[prim] = prim_size primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle(wfs_and_wfh) # Sort smallest transfers first; allows us to transfer as many as possible workflows. 
def prio_and_size(i, j): if int(i[1].request['RequestPriority']) == int( j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0))) else: return cmp(int(i[1].request['RequestPriority']), int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer" % ( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load" % ( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoretical system time in transfer" % ( st_in_transfer_already) print "%15.4f [h] worth of theoretical system time is the current requested transfer load" % ( st_to_transfer) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## a quarter of the free space in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transferred" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB is the available limit" % transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset.
max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = {} went_over_budget = False destination_cache = {} no_goes = set() max_per_round = UC.get('max_per_round').get('transferor', None) if max_per_round and not specific: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo, wfh) in wfs_and_wfh: print wfo.name, "to be transferred with priority", wfh.request[ 'RequestPriority'] if wfh.request['RequestStatus'] != 'assignment-approved': if wfh.request['RequestStatus'] in [ 'aborted', 'rejected', 'rejected-archived', 'aborted-archived' ]: wfo.status = 'trouble' ## so that we look for a replacement else: wfo.status = 'away' wfh.sendLog( 'transferor', '%s in status %s, setting %s' % (wfo.name, wfh.request['RequestStatus'], wfo.status)) continue (_, primary, _, secondary) = wfh.getIO() ## keep secondary: it is checked against allowed_secondary below this_load = sum([input_sizes[prim] for prim in primary]) no_budget = False if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over budget.") else: wfh.sendLog('transferor', "Transfer will go over budget.") wfh.sendLog( 'transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit" % (this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority != None and min_transfer_priority != None: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over budget" % (wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog( 'transferor', "%s minimum priority %s < %s : stop" % (min_transfer_priority, wfh.request['RequestPriority'], in_transfer_priority)) no_budget = True ## throttle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add(wfo.name) allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: allowed_secondary.update(CI.campaigns[campaign]['secondaries']) if secondary: if (secondary and allowed_secondary) and ( set(secondary) & allowed_secondary != set(secondary)): wfh.sendLog( 'transferor', '%s is not an allowed secondary' % (', '.join(set(secondary) - allowed_secondary))) no_go = True if no_go: continue ## check if the batch is announced def check_mcm(wfn): announced = False is_real = False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break except: try: for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break except: print "could not get mcm batch announcement, assuming not real" return announced, is_real if not use_mcm: announced, is_real = False, True else: if wfh.request['RequestType'] in ['ReReco']: announced, is_real = True, True else: announced, is_real = check_mcm(wfo.name) if not announced: wfh.sendLog('transferor', "does not look announced.") if not is_real: wfh.sendLog('transferor', "does not appear to be genuine.") ## prevent any duplication.
if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime( time.strptime('.'.join(map(str, wfh.request['RequestDate'])), "%Y.%m.%d.%H.%M.%S")) / (60. * 60.) now = time.mktime(time.gmtime()) / (60. * 60.) if float(now - injection_time) < 4.: if not options.go and not announced: wfh.sendLog( 'transferor', "It is too soon to start transfer: %3.2fH remaining" % (now - injection_time)) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog( 'transferor', " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s" % (max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_transfer)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s at a time. 
                    Currently %s transferring, and adding %s" % (max_to_transfer, being_transfered, needs_transfer))
            if not options.go:
                no_budget = True
        if no_budget:
            continue

        ## the site white list considers site, campaign, memory and core information
        (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')

        for dataset in list(primary) + list(parent) + list(secondary):
            ## lock everything flat
            NLI.lock(dataset)

        if not sites_allowed:
            wfh.sendLog('transferor', "no possible site to run at")
            #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            sendLog('transferor', "%s has no possible sites to run at" % (wfo.name), level='critical')
            continue

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with the run white list
            for dataset in primary:
                blocks = list(set(blocks + getDatasetBlocks(dataset, runs=wfh.request['RunWhitelist'])))
        if 'LumiList' in wfh.request and wfh.request['LumiList']:
            ## augment with the lumi white list
            blocks = list(set(blocks + getDatasetBlocks(dataset, lumis=wfh.request['LumiList'])))

        if blocks:
            print "Reading", len(blocks), "in block whitelist"

        can_go = True
        staging = False
        allowed = True
        primary_destinations = set()
        if primary:
            copies_needed_from_CPUh, CPUh = wfh.getNCopies()

            if talk:
                print wfo.name, 'reads', ', '.join(primary), 'in primary'
            ## chop the primary dataset
            for prim in primary:
                ## keep track of what needs what
                workflow_dependencies[prim].add(wfo.id)

                max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority']))

                wfh.sendLog('transferor', "Would make %s from cpu requirement %s" % (copies_needed_from_CPUh, CPUh))
                copies_needed = copies_needed_from_CPUh

                if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign, copies_needed)
                    wfh.sendLog('transferor', "Maxed to %s by campaign configuration %s" % (copies_needed, wfh.request['Campaign']))

                ### new ways of making the whole thing
                destinations, all_block_names = getDatasetDestinations(url, prim, within_sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks)
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [site for (site, info) in destinations.items() if info['completion'] == 100 and info['data_fraction'] == 1]
                ## the rest is places it is going to be
                prim_destination = [site for site in destinations.keys() if not site in prim_location]

                if len(prim_location) >= copies_needed:
                    wfh.sendLog('transferor', "The input is all fully in place at %s sites %s" % (len(prim_location), sorted(prim_location)))
                    continue

                copies_needed = max(0, copies_needed - len(prim_location))
                wfh.sendLog('transferor', "not counting existing copies ; now need %s" % copies_needed)

                copies_being_made = [sum([info['blocks'].keys().count(block) for site, info in destinations.items() if site in prim_destination]) for block in all_block_names]

                latching_on_transfers = set()
                [latching_on_transfers.update(info['blocks'].values()) for site, info in destinations.items() if site in prim_destination]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                wfh.sendLog('transferor', "Could be going to: %s" % sorted(prim_to_distribute))

                if not prim_to_distribute or any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]):
                    ## means there are openings, let me go on
                    print "There are transfer slots available:", [(site, transfers_per_sites[site]) for site in prim_to_distribute]
                    #for site in sites_allowed:
                    #    ## increment across the board, regardless of real destination: could be changed
                    #    transfers_per_sites[site] += 1
                else:
                    if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                        wfh.sendLog('transferor', "Higher priority sample %s >= %s go-on over transfer slots available" % (wfh.request['RequestPriority'], in_transfer_priority))
                    else:
                        wfh.sendLog('transferor', "Not allowed to transfer more than %s per site at a time. Going overboard for %s" % (max_staging_per_site, sorted([site for site in prim_to_distribute if transfers_per_sites[site] >= max_staging_per_site])))
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(Transfer.phedexid == int(latching)).first()
                    if not tfo:
                        tfo = session.query(Transfer).filter(Transfer.phedexid == -int(latching)).first()

                    if not tfo:
                        tfo = Transfer(phedexid=latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                    else:
                        tfo.phedexid = latching  ## make it positive again

                    if not wfo.id in tfo.workflows_id:
                        print "adding", wfo.id, "to", tfo.id, "with phedexid", latching
                        l = copy.deepcopy(tfo.workflows_id)
                        l.append(wfo.id)
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush()  ## regardless of commit later on, we need to let the next wf feeding on this transfer see it in the query

                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                ## reduce the number of copies required by the on-going full transfers : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0, copies_needed - min(copies_being_made))
                wfh.sendLog('transferor', "Not counting the copies being made ; then need %s" % copies_needed)
                if copies_needed == 0:
                    wfh.sendLog('transferor', "The input is either fully in place or getting in full somewhere with %s" % latching_on_transfers)
                    can_go = True
                    continue
                elif len(prim_to_distribute) == 0:
                    wfh.sendLog('transferor', "We are going to need extra copies, but no destination seems available")
                    prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                    prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                if len(prim_to_distribute) > 0:  ## maybe that's a parameter we can play with to limit the spread
                    if not options or options.chop:
                        ### hard include the tape disk endpoint ?
                        #tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')]
                        chops, sizes = getDatasetChops(prim, chop_threshold=options.chopsize, only_blocks=blocks)
                        spreading = distributeToSites(chops, prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges, sizes=sizes)
                        transfer_sizes[prim] = sum(sizes)
                        if not spreading:
                            sendLog('transferor', 'cannot send %s to any site, it cannot fit anywhere' % prim, level='critical')
                            wfh.sendLog('transferor', "cannot send to any site. %s cannot seem to fit anywhere" % (prim))
                            staging = False
                            can_go = False
                    else:
                        spreading = {}
                        for site in prim_to_distribute:
                            if blocks:
                                spreading[site] = blocks
                            else:
                                spreading[site] = [prim]
                        transfer_sizes[prim] = input_sizes[prim]  ## this is approximate if blocks are specified
                    can_go = False
                    wfh.sendLog('transferor', "selected CE destinations %s" % (sorted(spreading.keys())))
                    for (site, items) in spreading.items():
                        all_transfers[site].extend(items)
                        transfers_per_sites[site] += 1
                        primary_destinations.add(site)

        if not allowed:
            wfh.sendLog('transferor', "Not allowed to move on")
            continue

        if secondary:
            override_sec_destination = []
            if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]:
                override_sec_destination = CI.campaigns[wfh.request['Campaign']]['SecondaryLocation']

            print wfo.name, 'reads', ', '.join(secondary), 'in secondary'
            for sec in secondary:
                workflow_dependencies[sec].add(wfo.id)

                if True:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbaric, and does not show the correct picture on workflow by workflow with different whitelists
                        destination_cache[sec], _ = getDatasetDestinations(url, sec)  ## NO SITE WHITE LIST ADDED
                        #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])
                    ## limit to the site whitelist NOW
                    se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
                    destinations = dict([(k, v) for (k, v) in destination_cache[sec].items() if k in se_allowed])  ## filter on the key, not a stale loop variable
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [destinations.pop(site) for (site, info) in destinations.items() if info['data_fraction'] < 0.9]
                    sec_location = [site for (site, info) in destinations.items() if info['completion'] >= 95]
                    sec_destination = [site for site in destinations.keys() if not site in sec_location]
                else:
                    ## old style
                    presence = getDatasetPresence(url, sec)
                    sec_location = [site for site, pres in presence.items() if pres[1] > 90.]  ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions(url, sec)
                    sec_destination = [site for site in subscriptions]

                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                if override_sec_destination:
                    ## intersect with where we want the PU to be
                    not_needed_anymore = list(set(sec_to_distribute) - set(override_sec_destination))
                    #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sendLog('transferor', "the dataset %s could be removed from %s" % (sec, not_needed_anymore))
                    sec_to_distribute = list(set(sec_to_distribute) & set(override_sec_destination))

                if len(sec_to_distribute) > 0:
                    print "secondary could go to", sorted(sec_to_distribute)
                    sec_size = dss.get(sec)
                    for site in sec_to_distribute:
                        site_se = SI.CE_to_SE(site)
                        if (SI.disk[site_se] * 1024.) > sec_size:
                            all_transfers[site].append(sec)
                            can_go = False
                        else:
                            print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[site_se] * 1024, "GB, need", sec_size
                            if primary_destinations and site in primary_destinations:
                                #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                                sendLog('transferor', '%s is too big (%s) for %s (%s)' % (sec, sec_size, site_se, SI.disk[site_se] * 1024), level='critical')
                else:
                    print "the secondary input does not have to be sent to any site"

        ## is it possible to do something more ?
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                wfh.sendLog('transferor', "latches on existing transfers, and nothing else, setting staging")
                wfo.status = 'staging'
                needs_transfer += 1
            else:
                wfh.sendLog('transferor', "should just be assigned now to %s" % sorted(sites_allowed))
                wfo.status = 'staged'
            passing_along += 1
            wfh.sendLog('transferor', "setting status to %s" % wfo.status)
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                wfh.sendLog('transferor', "latches on existing transfers")
                if not options.test:
                    wfo.status = 'staging'
                    wfh.sendLog('transferor', "setting status to %s" % wfo.status)
                    session.commit()
            wfh.sendLog('transferor', "needs a transfer")
            needs_transfer += 1
            passing_along += 1

    if no_goes:
        #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes ))
        sendLog('transferor', "No go for \n" + "\n".join(no_goes), level='critical')

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id = -1
    wf_id_in_prestaging = set()

    for (site, items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## skip sites that do not want input datasets
        if site in SI.sites_veto_transfer:
            print site, "does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        block_datasets = list(set([it.split('#')[0] for it in blocks]))
        datasets = [it for it in items_to_transfer if not '#' in it]

        details_text = "Making a replica to %s (CE) %s (SE) for" % (site, site_se)

        #print "\t",len(blocks),"blocks"
        ## remove blocks if the full dataset is sent out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        #print "\t",len(datasets),"datasets"
        #print "\t",datasets
        details_text += '\n\t%d blocks' % len(blocks)
        details_text += '\n\t%d needed blocks for %s' % (len(blocks), sorted(list(set([block.split('#')[0] for block in blocks]))))
        details_text += '\n\t%d datasets' % len(datasets)
        details_text += '\n\t%s' % sorted(datasets)

        items_to_transfer = blocks + datasets

        if execute:
            sendLog('transferor', details_text)
        else:
            print "Would make a replica to", site, "(CE)", site_se, "(SE) for"
            print details_text

        ## operate the transfer
        if options and options.stop:
            ## ask to move on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y', 'yes', 'go']:
                continue

        if execute:
            priority = 'normal'
            cds = [ds for ds in datasets + block_datasets if ds in max_priority]
            if cds and False:
                ## I don't think this is working. the subscription should be updated on the fly and regularly for raising the priority if needed
                ## decide on an overall priority : that's a bit too coarse though
                if any([max_priority[ds] >= 90000 for ds in cds]):
                    priority = 'high'
                elif all([max_priority[ds] < 80000 for ds in cds]):
                    priority = 'low'
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority)
        else:
            result = {'phedex': {'request_created': []}}
            fake_id -= 1

        if not result:
            print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging"
            continue

        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(Transfer.phedexid == int(phedexid)).first()
            if not new_transfer:
                new_transfer = session.query(Transfer).filter(Transfer.phedexid == -int(phedexid)).first()
            print phedexid, "transfer created"
            if not new_transfer:
                new_transfer = Transfer(phedexid=phedexid)
                session.add(new_transfer)
            else:
                new_transfer.phedexid = phedexid  ## make it positive again

            new_transfer.workflows_id = set()
            for transferring in list(set(map(lambda it: it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update(workflow_dependencies[transferring])
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status != 'staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting", tr_wf.name, "to staging"
        session.commit()
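## --------------------------------------------------------------------------
## Illustrative sketch, not called anywhere in the production flow: the
## fan-out above delegates the actual placement to getDatasetChops and
## distributeToSites. The helper below is only a minimal approximation of
## that idea, for documentation purposes; it assumes "chops" is a list of
## block (or dataset) names with matching "sizes", and "weights" a dict of
## relative site capacity in the spirit of SI.cpu_pledges. It is not the
## production algorithm.
def _example_distribute_to_sites(chops, sites, weights, sizes, n_copies=1):
    from collections import defaultdict
    spreading = defaultdict(list)
    if not sites:
        return {}
    ## weighted load already assigned to each site: bigger pledge, more data
    load = dict([(site, 0.) for site in sites])
    for _ in range(n_copies):
        for chop, size in zip(chops, sizes):
            ## a site should not receive the same chop twice
            candidates = [s for s in sites if chop not in spreading[s]]
            if not candidates:
                continue
            site = min(candidates, key=lambda s: load[s] / max(float(weights.get(s, 1)), 1.))
            spreading[site].append(chop)
            load[site] += size
    return dict(spreading)

## usage sketch, with made-up block names and pledges:
#print _example_distribute_to_sites(['/A#b1', '/A#b2', '/A#b3'], ['T1_US_FNAL', 'T2_DE_DESY'], {'T1_US_FNAL': 2, 'T2_DE_DESY': 1}, [100, 50, 10])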
def equalizor(url, specific=None, options=None): up = componentInfo(mcm=False, soft=['mcm']) if not up.check(): return if not specific: workflows = getWorkflows(url, status='running-closed', details=True) workflows.extend(getWorkflows(url, status='running-open', details=True)) ## start from scratch modifications = defaultdict(dict) ## define regionality site => fallback allowed. feed on an ssb metric ?? mapping = defaultdict(list) reversed_mapping = defaultdict(list) regions = defaultdict(list) SI = siteInfo() for site in SI.sites_ready: region = site.split('_')[1] if not region in ['US', 'DE', 'IT']: continue regions[region] = [region] def site_in_depletion(s): return True if s in SI.sites_pressure: (m, r, pressure) = SI.sites_pressure[s] if float(m) < float(r): print s, m, r, "lacking pressure" return True else: print s, m, r, "pressure" pass return False for site in SI.sites_ready: region = site.split('_')[1] ## fallback to the region, to site with on-going low pressure mapping[site] = [ fb for fb in SI.sites_ready if any([('_%s_' % (reg) in fb and fb != site and site_in_depletion(fb)) for reg in regions[region]]) ] use_T0 = False if options.augment: use_T0 = True use_HLT = False if options.augment: use_HLT = True if use_HLT: mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT') #mapping['T2_IT_Legnaro'].append('T1_IT_CNAF') for reg in ['IT', 'DE', 'UK']: mapping['T2_CH_CERN'].extend( [fb for fb in SI.sites_ready if '_%s_' % reg in fb]) for site, fallbacks in mapping.items(): for fb in fallbacks: reversed_mapping[fb].append(site) ## this is the fallback mapping print json.dumps(mapping, indent=2) #print json.dumps( reversed_mapping, indent=2) altered_tasks = set() def running_idle(wfi, task_name): gmon = wfi.getGlideMon() #print gmon if not gmon: return (0, 0) if not task_name in gmon: return (0, 0) return (gmon[task_name]['Running'], gmon[task_name]['Idle']) def needs_action(wfi, task, min_idled=100, pressure=0.2): task_name = task.pathName.split('/')[-1] running, idled = running_idle(wfi, task_name) go = True if not idled and not running: go = False if idled < 100: go = False if (not running and idled) or (running and (idled / float(running) > pressure)): go = True else: go = False return go, task_name, running, idled def getcampaign(task): taskname = task.pathName.split('/')[-1] if hasattr(task, 'prepID'): return task.prepID.split('-')[1] elif taskname.count('-') >= 1: return taskname.split('-')[1] else: return None def close(interface): open('%s/equalizor.json.new' % monitor_dir, 'w').write(json.dumps(interface, indent=2)) os.system('mv %s/equalizor.json.new %s/equalizor.json' % (monitor_dir, monitor_dir)) os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json' % (monitor_dir, monitor_dir, time.mktime(time.gmtime()))) interface = {'reversed_mapping': reversed_mapping, 'modifications': {}} if options.augment or options.remove: interface['modifications'] = json.loads( open('%s/equalizor.json' % monitor_dir).read())['modifications'] if options.remove: if specific in interface['modifications']: print "poping", specific interface['modifications'].pop(specific) close(interface) return PU_locations = {} PU_overflow = { #'RunIISpring15PrePremix' : { # 'sites' : ["T1_US_FNAL", "T1_DE_KIT" , "T1_IT_CNAF", "T1_RU_JINR" ,"T2_CH_CERN"], # 'max' : 20000, # 'pending' : 0 # }, 'RunIIFall15DR76': { 'sites': [ 'T1_ES_PIC', 'T2_US_Purdue', 'T2_UK_SGrid_RALPP', 'T2_BE_IIHE', 'T2_DE_DESY', 'T2_IT_Legnaro', 'T2_US_Caltech', 'T1_DE_KIT', 'T2_UK_London_Brunel', 'T2_IT_Pisa', 'T1_US_FNAL', 
'T2_IT_Rome', 'T2_US_Florida', 'T1_IT_CNAF', 'T1_RU_JINR', 'T2_UK_London_IC', 'T2_US_Nebraska', 'T2_FR_CCIN2P3', 'T2_US_UCSD', 'T2_ES_CIEMAT', 'T1_FR_CCIN2P3', 'T2_US_Wisconsin', 'T2_US_MIT', 'T2_DE_RWTH', 'T1_UK_RAL', 'T2_US_Vanderbilt', 'T2_CH_CERN' ], 'max': 20000, 'pending': 0 }, 'RunIISpring16DR80': { 'sites': [ 'T1_ES_PIC', 'T2_US_Purdue', 'T2_UK_SGrid_RALPP', 'T2_BE_IIHE', 'T2_DE_DESY', 'T2_IT_Legnaro', 'T2_US_Caltech', 'T1_DE_KIT', 'T2_UK_London_Brunel', 'T2_IT_Pisa', 'T1_US_FNAL', 'T2_IT_Rome', 'T2_US_Florida', 'T1_IT_CNAF', 'T1_RU_JINR', 'T2_UK_London_IC', 'T2_US_Nebraska', 'T2_FR_CCIN2P3', 'T2_US_UCSD', 'T2_ES_CIEMAT', 'T1_FR_CCIN2P3', 'T2_US_Wisconsin', 'T2_US_MIT', 'T2_DE_RWTH', 'T1_UK_RAL', 'T2_US_Vanderbilt', 'T2_CH_CERN' ], 'max': 20000, 'pending': 0, 'force': True }, 'RunIISpring15DR74': { 'sites': [ 'T1_ES_PIC', 'T1_DE_KIT', 'T1_US_FNAL', 'T1_IT_CNAF', 'T1_RU_JINR', 'T1_FR_CCIN2P3', 'T1_UK_RAL', 'T2_CH_CERN' ], 'max': 20000, 'pending': 0 } } set_to = SI.sites_AAA LHE_overflow = { 'RunIIWinter15GS': set_to, 'RunIISummer15GS': set_to, 'Summer12': set_to, 'Summer11Leg': set_to #'RunIIFall15MiniAODv2' : set_to, } pending_HLT = 0 max_HLT = 60000 pending_T0 = 0 max_T0 = 60000 try: gmon = json.loads( os.popen( 'curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT' ).read()) pending_HLT += gmon["Running"] pending_HLT += gmon["MatchingIdle"] except: pass t0_special = [ 'vlimant_BPH-RunIISummer15GS-00030_00212_v0__160129_135314_9755', 'pdmvserv_TSG-RunIISummer15GS-00044_00240_v0__160210_121223_8582' ] no_routing = [ #'vlimant_BPH-RunIISummer15GS-00030_00212_v0__160129_135314_9755', #'pdmvserv_TOP-RunIIWinter15GS-00074_00187_v0__160207_162312_1992', ] stay_within_site_whitelist = False specific_task = None if specific and ":" in specific: specific, specific_task = specific.split(':') if specific: wfs = session.query(Workflow).filter( Workflow.name.contains(specific)).all() else: wfs = session.query(Workflow).filter(Workflow.status == 'away').all() random.shuffle(wfs) for wfo in wfs: if wfo.name in no_routing and not options.augment: continue if specific and not specific in wfo.name: continue if specific: wfi = workflowInfo(url, wfo.name) else: cached = filter(lambda d: d['RequestName'] == wfo.name, workflows) if not cached: continue wfi = workflowInfo(url, wfo.name, request=cached[0]) ## only running should get re-routed if not wfi.request['RequestStatus'] in [ 'running-open', 'running-closed' ] and not specific: continue tasks_and_campaigns = [] for task in wfi.getWorkTasks(): tasks_and_campaigns.append((task, getcampaign(task))) _, _, _, sec = wfi.getIO() ## check needs override needs_overide = False if not needs_overide and options.augment: needs_overide = True def overide_from_agent(wfi, needs_overide): bad_agents = [] #'http://cmssrv219.fnal.gov:5984'] if not bad_agents: return needs_overide if needs_overide: return True agents = wfi.getAgents() wqss = ['Running', 'Acquired'] if any([ agent in agents.get(wqs, {}).keys() for wqs, agent in itertools.product(wqss, bad_agents) ]): print "overriding the need for bad agent" needs_overide = True return needs_overide ## now parse this for action for i_task, (task, campaign) in enumerate(tasks_and_campaigns): if options.augment: print task.pathName print campaign ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step if campaign in LHE_overflow: if task.taskType in ['Processing']: needs, task_name, running, idled = needs_action(wfi, task) needs_overide = 
overide_from_agent(wfi, needs_overide) extend_to = copy.deepcopy(LHE_overflow[campaign]) if stay_within_site_whitelist: extend_to = list( set(extend_to) & set(wfi.request['SiteWhitelist']) ) ## restrict to stupid-site-whitelist if extend_to and needs or needs_overide: print "\t", task_name, "of", wfo.name, "running", running, "and pending", idled, "taking action : ReplaceSiteWhitelist" modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist": copy.deepcopy(LHE_overflow[campaign]), "Running": running, "Pending": idled, "Priority": wfi.request['RequestPriority'] } #print json.dumps( modifications[wfo.name][task.pathName]['ReplaceSiteWhitelist'] altered_tasks.add(task.pathName) else: print task_name, "of", wfo.name, "running", running, "and pending", idled ### overflow the 76 digi-reco to the site holding the pileup if campaign in PU_overflow: force = PU_overflow[campaign][ 'force'] if 'force' in PU_overflow[campaign] else False secondary_locations = set(SI.sites_ready) for s in sec: if not s in PU_locations: presence = getDatasetPresence(url, s) #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. ] PU_locations[s] = one_secondary_locations print "secondary is at", sorted(PU_locations[s]) secondary_locations = set( PU_locations[s]) & secondary_locations ## we should add all sites that hold the secondary input if any secondary_locations = list( set(PU_overflow[campaign]['sites']) & set(SI.sites_ready)) if any([ task.pathName.endswith(finish) for finish in ['_0', 'StepOneProc', 'Production'] ]): needs, task_name, running, idled = needs_action(wfi, task) ## removing the ones in the site whitelist already since they encode the primary input location if stay_within_site_whitelist: original_site_in_use = set( wfi.request['SiteWhitelist']) else: original_site_in_use = set(secondary_locations) ## remove the sites that have already running jobs gmon = wfi.getGlideMon() if gmon and task_name in gmon and 'Sites' in gmon[ task_name]: site_in_use = set(gmon[task_name]['Sites']) ## that determines where you want to run in addition #augment_by = list((set(secondary_locations)- site_in_use)) augment_by = list( (set(secondary_locations) - site_in_use) & original_site_in_use ) ## restrict to stupid-site-whitelist else: augment_by = list(original_site_in_use) needs_overide = overide_from_agent(wfi, needs_overide) if augment_by and ( needs or needs_overide or force) and PU_overflow[campaign][ 'pending'] < PU_overflow[campaign]['max']: PU_overflow[campaign]['pending'] += idled print "raising overflow to", PU_overflow[campaign][ 'pending'], "for", PU_overflow[campaign]['max'] ## the step with an input ought to be the digi part : make this one go anywhere modifications[wfo.name][task.pathName] = { "AddWhitelist": augment_by, "Running": running, "Pending": idled, "Priority": wfi.request['RequestPriority'] } altered_tasks.add(task.pathName) print "\t", task_name, "of", wfo.name, "running", running, "and pending", idled, "taking action : AddWhitelist" #print json.dumps( augment_by, indent=2 ) else: print task_name, "of", wfo.name, "running", running, "and pending", idled ### overflow the skims back to multi-core if campaign in ['Run2015D', 'Run2015C_25ns' ] and task.taskType == 'Skim': original_swl = wfi.request['SiteWhitelist'] needs, task_name, running, idled = needs_action(wfi, task) if (needs or needs_overide): modifications[wfo.name][task.pathName] = { 'AddWhitelist': 
original_swl, "Running": running, "Pending": idled, "Priority": wfi.request['RequestPriority'] } altered_tasks.add(task.pathName) print "\t", task_name, "of", wfo.name, "running", running, "and pending", idled, "taking action : AddWhitelist" if options.augment: print sorted(wfi.request['SiteWhitelist']), i_task, use_HLT ### add the HLT at partner of CERN if 'T2_CH_CERN' in wfi.request[ 'SiteWhitelist'] and i_task == 0 and use_HLT: needs, task_name, running, idled = needs_action(wfi, task) if options.augment: needs = True needs = True ##needs = random.random()<0.40 remove the random, just add up to a limit if (needs or needs_overide) and pending_HLT < max_HLT: pending_HLT += idled if task.pathName in modifications[ wfo.name] and 'AddWhitelist' in modifications[ wfo.name][task.pathName]: modifications[wfo.name][task.pathName][ "AddWhitelist"].append("T2_CH_CERN_HLT") print "\t", wfo.name, "adding addHLT up to", pending_HLT, "for", max_HLT print task.pathName ## this Replace does not work at all for HLT #elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]: #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" ) #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT else: modifications[wfo.name][task.pathName] = { "AddWhitelist": ["T2_CH_CERN_HLT"], "Priority": wfi.request['RequestPriority'], "Running": running, "Pending": idled } print "\t", wfo.name, "adding HLT up to", pending_HLT, "for", max_HLT print task.pathName if i_task == 0 and not sec and use_T0: needs, task_name, running, idled = needs_action(wfi, task) if options.augment: needs = True #needs = True #if not (wfo.name in t0_special) and not options.augment: needs = False if not wfi.request['RequestType'] in [ 'MonteCarlo', 'MonteCarloFromGEN' ] and not options.augment: needs = False ##needs = random.random()<0.40 remove the random, just add up to a limit if (needs or needs_overide): pending_T0 += idled if task.pathName in modifications[ wfo.name] and 'AddWhitelist' in modifications[ wfo.name][task.pathName]: modifications[wfo.name][ task.pathName]["AddWhitelist"].append("T0_CH_CERN") print "\t", wfo.name, "adding addT0 up to", pending_T0, "for", max_T0 print task.pathName elif task.pathName in modifications[ wfo. name] and 'ReplaceSiteWhitelist' in modifications[ wfo.name][task.pathName]: modifications[wfo.name][task.pathName][ "ReplaceSiteWhitelist"].append("T0_CH_CERN") print "\t", wfo.name, "adding replace T0 up to", pending_T0, "for", max_T0 else: modifications[wfo.name][task.pathName] = { "AddWhitelist": ["T0_CH_CERN"], "Priority": wfi.request['RequestPriority'], "Running": running, "Pending": idled } print "\t", wfo.name, "adding T0 up to", pending_T0, "for", max_T0 print task.pathName interface['modifications'].update(modifications) ## temporary core managing interface['cores'] = { 'T2_CH_CERN_HLT': { 'min': 4, 'max': 16 }, 'default': { 'min': 1, 'max': 4 } } #interface['max_cores']={'T2_CH_CERN_HLT': 16, 'default': 4} #interface['min_cores']={'T2_CH_CERN_HLT': 4, 'default': 1} #interface['resize_subtasks'] = 'RunIISpring16DR80' interface['resizes'] = ['RunIISpring16DR80', 'NotACampaign'] ## close and save close(interface)
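## --------------------------------------------------------------------------
## Illustrative sketch, not called anywhere: close() above publishes
## equalizor.json by writing a ".new" file first and then mv-ing it over the
## live one, so consumers polling the file never read a half-written json.
## A minimal standalone version of that publish pattern; using os.rename
## instead of shelling out to mv is an assumption here (rename is atomic
## within one POSIX filesystem), not exactly what close() does.
def _example_atomic_publish(interface, destination):
    import json
    import os
    tmp = destination + '.new'
    open(tmp, 'w').write(json.dumps(interface, indent=2))
    os.rename(tmp, destination)  ## atomic swap on the same filesystem

## usage sketch:
#_example_atomic_publish({'reversed_mapping': {}, 'modifications': {}}, '/tmp/equalizor.json')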
def closor(url, specific=None, options=None):
    if userLock(): return
    mlock = moduleLock()
    if mlock() and not options.manual: return
    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    BI = batchInfo()
    CloseI = closeoutInfo()

    all_late_files = []

    jump_the_line = options.announce if options else False
    if jump_the_line:
        print "announce option is on. Checking on things on-going ready to be announced"
        wfs = session.query(Workflow).filter(Workflow.status.contains('announce')).filter(sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        print "regular option. Checking on things done and to be announced"
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    if specific:
        wfs = [wfo for wfo in wfs if specific in wfo.name]
    wfs_n = [w.name for w in wfs]
    print "unique names?"
    print len(set(wfs_n)) == len(wfs_n)

    held = set()

    print len(wfs), "closing"
    random.shuffle(wfs)
    max_per_round = UC.get('max_per_round').get('closor', None)
    if options.limit: max_per_round = options.limit

    if max_per_round:
        ## order them by priority
        all_closedout = sorted(getWorkflows(url, 'closed-out', details=True), key=lambda r: r['RequestPriority'])
        all_closedout = [r['RequestName'] for r in all_closedout]

        def rank(wfn):
            return all_closedout.index(wfn) if wfn in all_closedout else 0

        wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True)
        wfs = wfs[:max_per_round]

    batch_go = {}
    batch_warnings = defaultdict(set)
    batch_extreme_warnings = defaultdict(set)
    batch_goodness = UC.get("batch_goodness")

    closers = []

    print len(wfs), "closing"
    th_start = time.mktime(time.gmtime())

    for iwfo, wfo in enumerate(wfs):
        if specific and not specific in wfo.name: continue
        if not options.manual and ('cmsunified_task_HIG-RunIIFall17wmLHEGS-05036__v1_T_200712_005621_4159'.lower() in (wfo.name).lower() or 'pdmvserv_task_HIG-RunIISummer16NanoAODv7-03979__v1_T_200915_013748_1986'.lower() in (wfo.name).lower()): continue
        closers.append(CloseBuster(
            wfo=wfo,
            url=url,
            CI=CI,
            UC=UC,
            jump_the_line=jump_the_line,
            batch_goodness=batch_goodness,
            batch_go=batch_go,
            #stats = stats,
            batch_warnings=batch_warnings,
            batch_extreme_warnings=batch_extreme_warnings,
            all_late_files=all_late_files,
            held=held,
        ))

    run_threads = ThreadHandler(threads=closers,
                                n_threads=options.threads,
                                sleepy=10,
                                timeout=None,
                                verbose=True,
                                label='closor')
    run_threads.start()

    ## waiting on all to complete
    while run_threads.is_alive():
        #print "Waiting on closing threads",time.asctime(time.gmtime())
        time.sleep(5)

    JC = JIRAClient() if up.status.get('jira', False) else None

    print len(run_threads.threads), "finished threads to gather information from"
    failed_threads = 0
    for to in run_threads.threads:
        if to.failed:
            failed_threads += 1
            continue
        if to.outs:
            for outO in to.outs:
                out = outO.datasetname
                odb = session.query(Output).filter(Output.datasetname == out).first()
                if not odb:
                    print "adding an output object", out
                    session.add(outO)
                else:
                    odb.date = outO.date
        if to.to_status:
            to.wfo.status = to.to_status
            if JC and to.to_status == "done" and to.wfi:
                jiras = JC.find({"prepid": to.wfi.request['PrepID']})
                for jira in jiras:
                    JC.close(jira.key)
        if to.to_wm_status:
            to.wfo.wm_status = to.to_wm_status
        if to.closing:
            CloseI.pop(to.wfo.name)
        session.commit()

    th_stop = time.mktime(time.gmtime())

    if wfs:
        time_spent_per_workflow = (th_stop - th_start) / float(len(wfs))
        print "Average time spent per workflow is", time_spent_per_workflow

    ## cast the numerator to float first: the integer ratio would truncate to zero for any partial failure
    if float(failed_threads) / run_threads.n_threads > 0:
        sendLog('checkor', '%d/%d threads have failed, better check this out' % (failed_threads, run_threads.n_threads), level='critical')
        sendEmail('checkor', '%d/%d threads have failed, better check this out' % (failed_threads, run_threads.n_threads))

    days_late = 0.
    retries_late = 10

    really_late_files = [info for info in all_late_files if info['retries'] >= retries_late]
    really_late_files = [info for info in really_late_files if info['delay'] / (60 * 60 * 24.) >= days_late]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2))
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor', subject)
        print subject
        open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2))

    if held:
        sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(sorted(held))), level='critical')

    for bname, go in batch_go.items():
        if go:
            subject = "Release Validation Samples Batch %s" % bname
            issues = ""
            #if batch_warnings[ bname ]:
            #    issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness
            #    issues+="\n".join( sorted( batch_warnings[ bname ] ))
            #    issues+="\n\n"
            if batch_extreme_warnings[bname]:
                subject = "Low Statistics for %s" % bname
                issues = "The following datasets have outstanding completion (<50%) issues:\n\n"
                issues += "\n".join(sorted(batch_extreme_warnings[bname]))
                issues += "\n\n"
            elif batch_warnings[bname]:
                issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness
                issues += "\n".join(sorted(batch_warnings[bname]))
                issues += "\n\n"
            text = ""
            text += "Dear all,\n\n"
            text += "A batch of release validation workflows has finished.\n\n"
            text += "Batch ID:\n\n"
            text += "%s\n\n" % (bname)
            text += "Detail of the workflows\n\n"
            text += "https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s\n\n" % (bname)
            text += "%s\n\n" % (issues)
            text += "This is an automated message.\n\n"
            to = ['*****@*****.**']
            sendEmail(subject, text, destination=to)
            ## just announced ; take it out now.
            BI.pop(bname)
            deleteCampaignConfig(bname)

    if os.path.isfile('.closor_stop'):
        print "The loop on workflows was shortened"
        sendEmail('closor', 'Closor loop was shortened artificially using .closor_stop')
        os.system('rm -f .closor_stop')
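## --------------------------------------------------------------------------
## Illustrative sketch, not called anywhere: closor above (like assignor)
## orders its local workflow list by the RequestPriority known to ReqMgr,
## by building a priority-sorted name cache and ranking local entries by
## their index in it. A condensed standalone version of that trick; the
## input format (dicts with RequestName/RequestPriority) mirrors what
## getWorkflows(..., details=True) returns.
def _example_rank_by_remote_priority(local_names, remote_details):
    cache = [r['RequestName'] for r in sorted(remote_details, key=lambda r: r['RequestPriority'])]
    def rank(name):
        ## names the remote does not know about rank lowest, as in rank() above
        return cache.index(name) if name in cache else 0
    ## highest priority (largest index in the sorted cache) comes first
    return sorted(local_names, key=rank, reverse=True)

## usage sketch, with made-up names:
#print _example_rank_by_remote_priority(['wf_a', 'wf_b'], [{'RequestName': 'wf_b', 'RequestPriority': 63000}, {'RequestName': 'wf_a', 'RequestPriority': 90000}])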
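## --------------------------------------------------------------------------
## Illustrative note, not called anywhere: the thread-failure check in closor
## is sensitive to python2 integer division; float(a / b) truncates inside
## the parentheses before the cast, so a partial failure rate silently reads
## as 0.0 and the critical alert never fires. Cast the numerator first:
def _example_failure_rate(failed_threads, n_threads):
    wrong = float(failed_threads / n_threads)  ## python2: float(3 / 10) == 0.0
    right = float(failed_threads) / n_threads  ## 3 / 10. == 0.3
    return wrong, right

#print _example_failure_rate(3, 10)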
def injector(url, options, specific):
    ## pass a round of invalidation of what needs to be invalidated
    if options.invalidate:
        invalidator(url)

    workflows = getWorkflows(url, status=options.wmstatus, user=options.user)
    existing = [wf.name for wf in session.query(Workflow).all()]

    ## browse the assignment-approved requests, compare with what we already track, insert the difference
    for wf in workflows:
        if wf not in existing:
            print "putting", wf
            new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus)
            session.add(new_wf)
            session.commit()
    existing = [wf.name for wf in session.query(Workflow).all()]

    ## pick up replacements
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        if specific and wf.name != specific:
            continue
        print wf.name
        wl = getWorkLoad(url, wf.name)
        family = getWorkflowById(url, wl['PrepID'])
        if len(family) == 1:
            print wf.name, "ERROR has no replacement"
            continue
        print wf.name, "has", len(family), "family members"
        for member in family:
            if member != wf.name:
                fwl = getWorkLoad(url, member)
                if options.replace:
                    if member != options.replace: continue
                else:
                    if fwl['RequestDate'] < wl['RequestDate']: continue
                    if fwl['RequestType'] == 'Resubmission': continue
                    if fwl['RequestStatus'] in ['None', None]: continue

                new_wf = session.query(Workflow).filter(Workflow.name == member).first()
                if not new_wf:
                    print "putting", member
                    status = 'away'
                    if fwl['RequestStatus'] in ['assignment-approved']:
                        status = 'considered'
                    new_wf = Workflow(name=member, status=status, wm_status=fwl['RequestStatus'])
                    wf.status = 'forget'
                    session.add(new_wf)
                    session.commit()
                else:
                    if new_wf.status == 'forget': continue
                    print "getting", new_wf.name, "as replacement of", wf.name

                for tr in session.query(Transfer).all():
                    if wf.id in tr.workflows_id:
                        sw = copy.deepcopy(tr.workflows_id)
                        sw.remove(wf.id)
                        sw.append(new_wf.id)
                        tr.workflows_id = sw
                        print tr.phedexid, "got", new_wf.name
                        if new_wf.status != 'away':
                            new_wf.status = 'staging'
                session.commit()

        ## don't do that automatically
        #wf.status = 'forget'
        session.commit()
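## --------------------------------------------------------------------------
## Illustrative sketch, not called anywhere: injector above walks the PrepID
## "family" of a workflow in trouble and adopts members that are newer, are
## not Resubmissions (ACDC), and carry a usable status. A condensed
## standalone version of that selection; the member dicts (RequestName,
## RequestDate, RequestType, RequestStatus) mirror the workloads fetched
## with getWorkLoad.
def _example_pick_replacements(original, family):
    picked = []
    for member in family:
        if member['RequestName'] == original['RequestName']:
            continue
        if member['RequestDate'] < original['RequestDate']:
            continue  ## older than the one we are replacing
        if member['RequestType'] == 'Resubmission':
            continue  ## an ACDC of it, not a clone
        if member['RequestStatus'] in ['None', None]:
            continue  ## no usable status
        picked.append(member['RequestName'])
    return picked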
def htmlor(caller=""): up = componentInfo(mcm=False, soft=['mcm']) if not up.check(): return try: boost = json.loads(open('%s/equalizor.json' % monitor_dir).read())['modifications'] except: boost = {} cache = getWorkflows(reqmgr_url, 'assignment-approved', details=True) cache.extend(getWorkflows(reqmgr_url, 'acquired', details=True)) cache.extend(getWorkflows(reqmgr_url, 'running-open', details=True)) cache.extend(getWorkflows(reqmgr_url, 'running-closed', details=True)) def getWL(wfn): cached = filter(lambda d: d['RequestName'] == wfn, cache) if cached: wl = cached[0] else: wl = getWorkLoad(reqmgr_url, wfn) return wl def wfl(wf, view=False, p=False, ms=False, within=False, ongoing=False, status=False, update=False): wfn = wf.name wfs = wf.wm_status wl = None pid = None wl_pid = None pids = filter(lambda seg: seg.count('-') == 2, wf.name.split('_')) if len(pids): pids = pids[:1] pid = pids[0] if not pids: wl = getWL(wf.name) pids = getPrepIDs(wl) pid = pids[0] wl_pid = pid if 'task' in wf.name: wl_pid = 'task_' + pid text = ', '.join([ #wfn, #'<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a> '%(wfn,wfn), #'<table><tr><td>%s</td></tr></table>'%(wfn), #'<span>%s</span>'%(wfn), "%s " % wfn, '(%s) <br>' % wfs ]) text += ', '.join([ '<a href="https://%s/reqmgr2/fetch?rid=%s" target="_blank">dts</a>' % (reqmgr_url, wfn), '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts-req1</a>' % wfn, #TOFIX '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>'%wfn, '<a href="https://%s/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>' % (reqmgr_url, wfn), '<a href="https://%s/reqmgr2/data/request?name=%s" target="_blank">req</a>' % (reqmgr_url, wfn), #'<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>'%wfn, #TOFIX '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>'%wfn, '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>' % wfn, '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>' % wfn, '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>' % pid, '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank">pv</a>' % wfn, #deprecated '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>'%wfn, '<a href="closeout.html#%s" target="_blank">clo</a>' % wfn, '<a href="statuses.html#%s" target="_blank">st</a>' % wfn, '<a href="https://%s/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>' % (reqmgr_url, wfn) ]) if within and (not view or wfs == 'completed'): wl = getWL(wfn) dataset = None if 'InputDataset' in wl: dataset = wl['InputDataset'] if 'Task1' in wl and 'InputDataset' in wl['Task1']: dataset = wl['Task1']['InputDataset'] if dataset: text += ', '.join([ '', '<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>' % dataset, '<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>' % dataset, '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>' % dataset, '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>' % dataset, ]) if p: cached = filter(lambda d: d['RequestName'] == wfn, cache) if 
cached: wl = cached[0] else: wl = getWorkLoad('cmsweb.cern.ch', wfn) text += ', (%s)' % (wl['RequestPriority']) pass if pid: if ms: mcm_s = json.loads( os.popen( 'curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure' % pid).read())[pid] text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>' % ( pid, mcm_s) else: text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>' % ( pid) text += ', <a href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>' % ( wl_pid) if status: if wf.status.startswith('assistance'): text += ', <a href="assistance.html#%s" target="_blank">assist</a>' % wfn text += ' : %s ' % (wf.status) if view and wfs != 'acquired': text += '<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>' % ( wfn.replace('_', '/'), wfn.replace('_', '/')) if ongoing: text += '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a>' % ( wfn, wfn) if ongoing: date1 = time.strftime( '%Y-%m-%d+%H:%M', time.gmtime(time.mktime(time.gmtime()) - (15 * 24 * 60 * 60))) date2 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime()) text += '<a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#table=Jobs&date1=%s&date2=%s&sortby=site&task=wmagent_%s">dashb</a>' % ( date1, date2, wfn) if ongoing and wfn in boost: for task in boost[wfn]: overflow = boost[wfn][task].get('ReplaceSiteWhitelist', None) if not overflow: overflow = boost[wfn][task].get('AddWhitelist', None) if overflow: text += ',boost (<a href=equalizor.json>%d</a>)' % len( overflow) #text+="<hr>" return text def phl(phid): text = ', '.join([ str(phid), '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>' % phid, '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>' % phid, ]) return text def ol(out): return '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>' % ( out, out) def lap(comment): l = time.mktime(time.gmtime()) spend = l - lap.start lap.start = l print "Spend %d [s] for %s" % (spend, comment) lap.start = time.mktime(time.gmtime()) ## start to write it #html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w') html_doc = open('%s/index.html.new' % monitor_dir, 'w') print "Updating the status page ..." UC = unifiedConfiguration() if not caller: try: #caller = sys._getframe(1).f_code.co_name caller = sys.argv[0].split('/')[-1].replace('.py', '') print "caller is" print caller except Exception as es: caller = 'none found' print "not getting frame" print str(es) html_doc.write(""" <html> <head> <META HTTP-EQUIV="refresh" CONTENT="900"> <script type="text/javascript"> function showhide(id) { var e = document.getElementById(id); e.style.display = (e.style.display == 'block') ? 
'none' : 'block'; } </script> </head> <body> Last update on %s(CET), %s(GMT) <br> <a href=logs/ target=_blank>logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/ target=_blank>prod mon</a> <a href=https://%s/wmstats/index.html target=_blank>wmstats</a> <a href=http://t3serv001.mit.edu/~cmsprod/IntelROCCS/Detox/SitesInfo.txt target=_blank>detox</a> <a href=locked.html>space</a> <a href=logs/subscribor/last.log target=_blank>blocks</a> <a href=logs/agents/last.log>agents</a> <br> <a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a> <a href=data.html>json interfaces</a> <a href=logs/addHoc/last.log>ad-hoc op</a> created from <b>%s <a href=logs/last_running>last running</a></b> <object height=20 type="text/html" data="logs/last_running"><p>backup content</p></object> <br><br> """ % (time.asctime(time.localtime()), time.asctime( time.gmtime()), reqmgr_url, caller)) text = "" count = 0 count_by_campaign = defaultdict(lambda: defaultdict(int)) for wf in session.query(Workflow).filter( Workflow.status.startswith('considered')).all(): wl = getWL(wf.name) count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])] += 1 #print wf.name text += "<li> %s </li> \n" % wfl(wf, p=True) count += 1 text_by_c = "" for c in count_by_campaign: text_by_c += "<li> %s (%d) : " % (c, sum( count_by_campaign[c].values())) for p in sorted(count_by_campaign[c].keys()): text_by_c += "%d (%d), " % (p, count_by_campaign[c][p]) text_by_c += "</li>" html_doc.write(""" Workflows next to handle (%d) <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank"> batches</a> <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a> <a href="javascript:showhide('considered')">[Click to show/hide]</a> <br> <div id="considered" style="display:none;"> <ul> <li> By workflow (%d) </li><a href="javascript:showhide('considered_bywf')">[Click to show/hide]</a><div id="considered_bywf" style="display:none;"> <ul> %s </ul></div> <li> By campaigns (%d) </li><a href="javascript:showhide('considered_bycamp')">[Click to show/hide]</a><div id="considered_bycamp" style="display:none;"> <ul> %s </ul></div> </ul> </div> """ % (count, count, text, len(count_by_campaign), text_by_c)) lap('done with considered') text = "" count = 0 count_by_campaign = defaultdict(lambda: defaultdict(int)) for wf in session.query(Workflow).filter( Workflow.status == 'staging').all(): wl = getWL(wf.name) count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])] += 1 text += "<li> %s </li> \n" % wfl(wf, within=True) count += 1 text_by_c = "" for c in count_by_campaign: text_by_c += "<li> %s (%d) : " % (c, sum( count_by_campaign[c].values())) for p in sorted(count_by_campaign[c].keys()): text_by_c += "%d (%d), " % (p, count_by_campaign[c][p]) text_by_c += "</li>" html_doc.write(""" Workflows waiting in staging (%d) <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a> <a href="javascript:showhide('staging')">[Click to show/hide]</a> <br> <div id="staging" style="display:none;"> <ul> <li> By workflow (%d) </li><a href="javascript:showhide('staging_bywf')">[Click to show/hide]</a><div id="staging_bywf" style="display:none;"> <ul> %s </ul></div> <li> By campaigns (%d) </li><a href="javascript:showhide('staging_bycamp')">[Click to show/hide]</a><div id="staging_bycamp"
style="display:none;"> <ul> %s </ul></div> </ul> </div> """ % (count, count, text, len(count_by_campaign), text_by_c)) lap('done with staging') text = "<ul>" count = 0 transfer_per_wf = defaultdict(list) for ts in session.query(Transfer).filter(Transfer.phedexid > 0).all(): hide = True t_count = 0 stext = "" for pid in ts.workflows_id: w = session.query(Workflow).get(pid) hide &= (w.status != 'staging') if w.status in ['considered', 'staging', 'staged']: stext += "<li> %s </li>\n" % (wfl(w, status=True)) transfer_per_wf[w].append(ts.phedexid) t_count += 1 stext = '<li> %s serves %d workflows<br><a href="javascript:showhide(\'%s\')">[show/hide]</a> <div id="%s" style="display:none;"><ul>\n' % ( phl(ts.phedexid), t_count, ts.phedexid, ts.phedexid) + stext stext += "</ul></li>\n" if hide: #text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid) pass else: count += 1 text += stext text += "</ul>" text_bywf = "<ul>" for wf in transfer_per_wf: text_bywf += "<li> %s </li>" % (wfl(wf, within=True)) text_bywf += '<a href=javascript:showhide("transfer_%s")>[Click to show/hide] %d transfers</a>' % ( wf.name, len(transfer_per_wf[wf])) text_bywf += '<div id="transfer_%s" style="display:none;">' % wf.name text_bywf += "<ul>" for pid in sorted(transfer_per_wf[wf]): text_bywf += "<li> %s </li>" % (phl(pid)) text_bywf += "</ul></div><hr>" text_bywf += '</ul>' html_doc.write(""" Transfer on-going (%d) <a href=http://cmstransferteam.web.cern.ch/cmstransferteam/ target=_blank>dashboard</a> <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a> <a href="javascript:showhide('transfer')">[Click to show/hide]</a> <br> <div id="transfer" style="display:none;"> <ul> <li> By Workflow <a href="javascript:showhide('transfer_bywf')">[Click to show/hide]</a> <div id="transfer_bywf" style="display:none;"> %s </div> </li> <li> By transfer request <a href="javascript:showhide('transfer_byreq')">[Click to show/hide]</a> <div id="transfer_byreq" style="display:none;"> %s </div> </li> </ul> </div> """ % (count, text_bywf, text)) lap('done with transfers') text = "" count = 0 count_by_campaign = defaultdict(lambda: defaultdict(int)) for wf in session.query(Workflow).filter( Workflow.status == 'staged').all(): wl = getWL(wf.name) count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])] += 1 text += "<li> %s </li> \n" % wfl(wf, p=True) count += 1 text_by_c = "" for c in count_by_campaign: text_by_c += "<li> %s (%d) : " % (c, sum( count_by_campaign[c].values())) for p in sorted(count_by_campaign[c].keys()): text_by_c += "%d (%d), " % (p, count_by_campaign[c][p]) text_by_c += "</li>" html_doc.write( """Workflows ready for assigning (%d) <a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a> <a href="javascript:showhide('staged')">[Click to show/hide]</a> <br> <div id="staged" style="display:none;"> <br> <ul> <li> By workflow (%d) </li><a href="javascript:showhide('staged_bywf')">[Click to show/hide]</a><div id="staged_bywf" style="display:none;"> <ul> %s </ul></div> <li> By campaigns (%d) </li><a href="javascript:showhide('staged_bycamp')">[Click to show/hide]</a><div id="staged_bycamp" style="display:none;"> <ul> %s </ul></div> </ul> </div> """ % (count, count, text, len(count_by_campaign), text_by_c)) lap('done with staged') lines = [] count_by_campaign = defaultdict(lambda: defaultdict(int)) for wf in session.query(Workflow).filter(Workflow.status ==
'away').all(): wl = getWL(wf.name) count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])] += 1 lines.append("<li> %s <hr></li>" % wfl(wf, view=True, ongoing=True)) text_by_c = "" for c in count_by_campaign: text_by_c += "<li> %s (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/campaign.php?campaign=%s>mon</a> <a href=https://cms-pdmv.cern.ch/pmp/historical?r=%s target=_blank>pmp</a> " % ( c, sum(count_by_campaign[c].values()), c, c) for p in sorted(count_by_campaign[c].keys()): text_by_c += "%d (%d), " % (p, count_by_campaign[c][p]) text_by_c += "</li>" lines.sort() html_doc.write(""" Workflows on-going (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a> <a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a> <a href=http://cms-gwmsmon.cern.ch/prodview target=_blank>queues</a> <a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a> <a href=logs/equalizor/last.log target=_blank>equ</a> <a href=logs/completor/last.log target=_blank>comp</a> <a href="javascript:showhide('away')">[Click to show/hide]</a> <br> <div id="away" style="display:none;"> <ul> <li>By workflow (%d) </li> <a href="javascript:showhide('away_bywf')">[Click to show/hide]</a><div id="away_bywf" style="display:none;"> <ul> %s </ul></div> <li> By campaigns (%d) </li><a href="javascript:showhide('away_bycamp')">[Click to show/hide]</a><div id="away_bycamp" style="display:none;"> <ul> %s </ul></div> </ul> </div> """ % (len(lines), len(lines), '\n'.join(lines), len(count_by_campaign), text_by_c)) lap('done with away') text = "" count = 0 #for wf in session.query(Workflow).filter(Workflow.status == 'assistance-custodial').all(): for wf in session.query(Workflow).filter( Workflow.status.startswith('assistance')).filter( Workflow.status.contains('custodial')).all(): text += "<li> %s </li> \n" % wfl( wf, view=True, update=True, status=True) count += 1 text += "</ul></div>\n" html_doc.write("""Workflows that are closing (%d) <a href=closeout.html target=_blank>closeout</a> <a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a> <a href="javascript:showhide('closing')">[Click to show/hide]</a> <br> <div id="closing" style="display:none;"> <br> <ul> """ % count) html_doc.write(text) lap('done with closing') assistance_by_type = defaultdict(list) text = "" count = 0 for wf in session.query(Workflow).filter( Workflow.status.startswith('assistance-')).all(): assistance_by_type[wf.status].append(wf) count += 1 for assistance_type in assistance_by_type: text += "<li> %s (%d) <a href=\"javascript:showhide('%s')\">[Click to show/hide]</a><br><div id=\"%s\" style=\"display:none;\"><ul>" % ( assistance_type, len(assistance_by_type[assistance_type]), assistance_type, assistance_type, ) for wf in assistance_by_type[assistance_type]: text += "<li> %s <hr></li> \n" % wfl( wf, view=True, within=True, status=True, update=True) text += "</ul></div></li>\n" html_doc.write("""Workflows which need assistance (%d) <a href=assistance.html target=_blank>assistance</a> <a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/recoveror/last.log target=_blank>postlog</a> <a href="javascript:showhide('assistance')">[Click to show/hide]</a> <br> <div id="assistance" style="display:none;"> <br> <ul> %s </ul> </div> """ % (count, text)) lap('done with assistance') text = "" count = 0 for wf in session.query(Workflow).filter(Workflow.status ==
'close').all(): text += "<li> %s </li> \n" % wfl(wf) count += 1 text += "</ul></div>\n" html_doc.write("""Workflows ready to close (%d) <a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a> <a href="javascript:showhide('close')">[Click to show/hide]</a> <br> <div id="close" style="display:none;"> <br> <ul> """ % count) html_doc.write(text) lap('done with announcing') text = "" count = 0 for wf in session.query(Workflow).filter( Workflow.status == 'trouble').all(): text += "<li> %s </li> \n" % wfl(wf) count += 1 text += "</ul></div>\n" html_doc.write( """Workflows with issues (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a> <a href="javascript:showhide('trouble')">[Click to show/hide]</a> <br> <div id="trouble" style="display:none;"> <br> <ul> """ % count) html_doc.write(text) lap('done with trouble') text = "" count = 0 for wf in session.query(Workflow).filter( Workflow.status == 'forget').all(): text += "<li> %s </li> \n" % wfl(wf) count += 1 text += "</ul></div>\n" html_doc.write(""" Workflows to forget (%d) <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/lockor/last.log target=_blank>postlog</a> <a href="javascript:showhide('forget')">[Click to show/hide]</a> <br> <div id="forget" style="display:none;"> <br> <ul> """ % count) html_doc.write(text) lap('done with forget') text = "" count = 0 for wf in session.query(Workflow).filter(Workflow.status == 'done').all(): text += "<li> %s </li> \n" % wfl(wf) #,ms=True) count += 1 text += "</ul></div>\n" html_doc.write(""" Workflows through (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/lockor/last.log target=_blank>postlog</a> <a href="javascript:showhide('done')">[Click to show/hide]</a> <br> <div id="done" style="display:none;"> <br> <ul> """ % count) html_doc.write(text) lap('done with done') wfs = session.query(Workflow).filter( Workflow.status.endswith('-unlock')).all() html_doc.write( " Workflows unlocked : %s <a href=logs/lockor/last.log target=_blank>log</a><br>" % (len(wfs))) lap('done with unlocked') text = "" lines_thisweek = [] lines_lastweek = [] now = time.mktime(time.gmtime()) this_week = int(time.strftime("%W", time.gmtime())) start_time_two_weeks_ago = time.mktime( time.gmtime(now - (20 * 24 * 60 * 60))) # 20 last_week = int(time.strftime("%W", time.gmtime(now - (7 * 24 * 60 * 60)))) all_locks = json.loads(open('%s/globallocks.json' % monitor_dir).read()) waiting_custodial = json.loads( open('%s/waiting_custodial.json' % monitor_dir).read()) all_pending_approval_custodial = dict([ (k, item) for k, item in waiting_custodial.items() if 'nodes' in item and not any([node['decided'] for node in item['nodes'].values()]) ]) n_pending_approval = len(all_pending_approval_custodial) #n_pending_approval = len([item for item in waiting_custodial.values() if 'nodes' in item and not any([node['decided'] for node in item['nodes'].values() ])]) missing_approval_custodial = json.loads( open('%s/missing_approval_custodial.json' % monitor_dir).read()) stuck_custudial = json.loads( open('%s/stuck_custodial.json' % monitor_dir).read()) lagging_custudial = json.loads( open('%s/lagging_custodial.json' % monitor_dir).read()) if len(stuck_custudial): stuck_string = ', <font color=red>%d appear to be <a href=stuck_custodial.json>stuck</a></font>' % len( stuck_custudial) else: stuck_string = '' if len(missing_approval_custodial): long_approve_string = ', <font color=red>%d more than %d days</font>' % (
len(missing_approval_custodial), UC.get('transfer_timeout')) else: long_approve_string = '' output_within_two_weeks = session.query(Output).filter( Output.date >= start_time_two_weeks_ago).all() waiting_custodial_string = "" waiting_custodial_strings = [] for ds in waiting_custodial: out = None ## lots of it will be within two weeks of = filter(lambda odb: odb.datasetname == ds, output_within_two_weeks) if of: out = of[0] else: out = session.query(Output).filter( Output.datasetname == ds).first() if out: info = waiting_custodial[out.datasetname] action = 'going' if out.datasetname in all_pending_approval_custodial: action = '<font color=red>pending</font>' try: size = str(info['size']) except: size = "x" destination = ",".join(info['nodes'].keys()) if not destination: destination = '<font color=red>NO SITE</font>' a_waiting_custodial_string = "<li>on week %s : %s %s</li>" % ( time.strftime("%W (%x %X)", time.gmtime( out.date)), ol(out.datasetname), ' %s [GB] %s to %s on %s (<a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/requestlist?dataset=%s&node=T*MSS">%d missing</a>)' % (size, action, destination, time.asctime(time.gmtime( info['checked'])), out.datasetname, info['nmissing'])) waiting_custodial_strings.append( (out.date, a_waiting_custodial_string)) waiting_custodial_strings.sort(key=lambda i: i[0]) waiting_custodial_string = "\n".join( [i[1] for i in waiting_custodial_strings]) #start_time_two_weeks_ago = time.mktime(time.strptime("15-0-%d"%(this_week-2), "%y-%w-%W")) for out in output_within_two_weeks: if not out.workflow: print "This is a problem with", out.datasetname continue if out.workflow.status in [ 'done-unlock', 'done', 'clean', 'clean-out', 'clean-unlock' ]: custodial = '' if out.datasetname in waiting_custodial: info = waiting_custodial[out.datasetname] try: try: size = str(info['size']) except: size = "x" destination = ",".join(info['nodes'].keys()) if not destination: destination = '<font color=red>NO SITE</font>' action = 'going' if out.datasetname in all_pending_approval_custodial: action = '<font color=red>pending</font>' custodial = ' %s [GB] %s to %s on %s (<a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/requestlist?dataset=%s&node=T*MSS">%d missing</a>)' % ( size, action, destination, time.asctime(time.gmtime(info['checked'])), out.datasetname, info['nmissing']) except Exception as e: #print info #print str(e) pass elif out.datasetname in all_locks: custodial = '<font color=green>LOCKED</font>' out_week = int(time.strftime("%W", time.gmtime(out.date))) ##only show current week, and the previous. 
if last_week == out_week: lines_lastweek.append( "<li>on week %s : %s %s</li>" % (time.strftime("%W (%x %X)", time.gmtime( out.date)), ol(out.datasetname), custodial)) if this_week == out_week: lines_thisweek.append( "<li>on week %s : %s %s</li>" % (time.strftime("%W (%x %X)", time.gmtime( out.date)), ol(out.datasetname), custodial)) lines_thisweek.sort() lines_lastweek.sort() html_doc.write( """Output produced (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a> <a href="javascript:showhide('output')">[Click to show/hide]</a> <br> <div id="output" style="display:none;"> <br> <ul> <li> %d waiting to go to tape</li> <ul> <li> %d waiting for tape approval%s</li> <li> %d are not completed after %d days%s</li> <li> Full list (%d) <a href="javascript:showhide('waiting-custodial')">[Click to show/hide]</a> <div id="waiting-custodial" style="display:none;"> <ul> %s </ul> </div> </li> </ul> <li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul> %s </ul></div> <li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul> %s </ul></div></div> """ % (len(lines_lastweek) + len(lines_thisweek), len(waiting_custodial), n_pending_approval, long_approve_string, len(lagging_custudial), UC.get('transfer_timeout'), stuck_string, len(waiting_custodial), waiting_custodial_string, len(lines_lastweek), '\n'.join(lines_lastweek), len(lines_thisweek), '\n'.join(lines_thisweek))) lap('done with output') html_doc.write("""Job installed <a href="javascript:showhide('acron')">[Click to show/hide]</a> <br> <div id="acron" style="display:none;"> <br> <pre> %s </pre> """ % (os.popen('acrontab -l | grep Unified | grep -v \#').read())) per_module = defaultdict(list) for t in filter( None, os.popen('cat %s/logs/*/*.time' % monitor_dir).read().split('\n')): module_name, run_time, spend = t.split(':') ## then do what you want with it ! 
if 'cleanor' in module_name: continue per_module[module_name].append(int(spend)) def display_time(sec): m, s = divmod(sec, 60) h, m = divmod(m, 60) dis = "" if h: dis += "%d [h] " % h if h or m: dis += "%d [m] " % m if h or m or s: dis += "%d [s]" % s return dis html_doc.write("Module running time<ul>\n") for m, spends in per_module.items(): avg = sum(spends) / float(len(spends)) lasttime = spends[-1] html_doc.write("<li>%s : last %s, avg %s</li>\n" % (m, display_time(lasttime), display_time(avg))) html_doc.write("</ul>") html_doc.write( "Last running <pre>%s</pre><br>" % (os.popen("tac %s/logs/running | head -5" % monitor_dir).read())) html_doc.write("Order in cycle <pre>%s</pre><br>" % ('\n'.join( map( lambda l: l.split('/')[-1].replace('.py', ''), filter( lambda l: not l.startswith('#') and 'Unified' in l and 'py' in l.split('/')[-1], open('%s/WmAgentScripts/cycle.sh' % base_dir).read().split('\n')))))) html_doc.write("</div>\n") lap('done with jobs') text = "" count = 0 for (c, info) in campaignInfo().campaigns.items(): #if 'go' in info and info['go']: text += "<li>%s <br> <pre>%s</pre> </li>" % ( c, json.dumps(info, indent=2)) count += 1 html_doc.write("""Campaign configuration <a href="javascript:showhide('campaign')">[Click to show/hide]</a> <br> <div id="campaign" style="display:none;"> <br> <ul> %s </ul></div> """ % (text)) text = "" count = 0 n_column = 4 SI = siteInfo() date1 = time.strftime( '%Y-%m-%d+%H:%M', time.gmtime(time.mktime(time.gmtime()) - (15 * 24 * 60 * 60))) ## 15 days date2 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime()) for t in SI.types(): text += "<li>%s<table border=1>" % t c = 0 for site in getattr(SI, t): cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else 'N/A' disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE( site) in SI.disk else 'N/A' if c == 0: text += "<tr>" if not disk: ht_disk = '<font color=red>Disk available: %s</font>' % disk else: ht_disk = 'Disk available: %s' % disk text += '<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a><br><a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#user=&refresh=0&table=Jobs&p=1&records=25&activemenu=1&site=%s&submissiontool=wmagent&check=submitted&sortby=activity&scale=linear&bars=20&date1=%s&date2=%s">dashb</a><br>CPU pledge: %s<br>%s</td>' % ( site, site, site, site, site, date1, date2, cpu, ht_disk) if c == n_column: c = 0 else: c += 1 text += "</table></li>" text += "<li> Sites in auto-approved transfer<ul>" for site in sorted(SI.sites_auto_approve): text += "<li>%s" % site text += "</ul></li>" text += "<li> Sites with veto transfer<ul>" for site in sorted(SI.sites_veto_transfer): text += "<li>%s" % site text += "</ul></li>" text += "<li> Sites banned from production<ul>" for site in sorted(SI.sites_banned): text += "<li>%s" % site text += "</ul></li>" text += "<li> Approximate Free Tape<ul>" for mss in SI.storage: waiting = 0 try: waiting = float( os.popen( "grep '%s is pending . Created since' %s/logs/lockor/last.log -B 3 | grep size | awk '{ sum+=$6 ; print sum }' | tail -1" % (mss, monitor_dir)).readline()) except Exception as e: print str(e) oldest = "" os.system( 'grep pending %s/logs/lockor/last.log | sort -u > %s/logs/pending.log' % (monitor_dir, monitor_dir)) try: oldest = os.popen( "grep '%s is pending . 
Created since ' %s/logs/lockor/last.log | sort | awk '{print $10, $11, $12, $13, $14 }' | head -1" % (mss, monitor_dir)).readline() except Exception as e: print str(e) waiting /= 1024. text += "<li>%s : %d [TB]. Waiting for approval %d [TB] since %s </li>" % ( mss, SI.storage[mss], waiting, oldest) text += "</ul></li>" lap('done with sites') open('%s/siteInfo.json' % monitor_dir, 'w').write( json.dumps(dict([(t, getattr(SI, t)) for t in SI.types()]), indent=2)) lap('done with sites json') chart_data = defaultdict(list) for site in SI.quota: chart_data[site].append(""" var data_%s = google.visualization.arrayToDataTable([ ['Overall', 'Space in TB'], //['Quota' , %s], ['Locked' , %s], ['Free' , %s] ]); """ % ( site, SI.quota[site], SI.locked[site], SI.disk[site], )) chart_data[site].append(""" var chart_%s = new google.visualization.PieChart(document.getElementById('donutchart_%s')); chart_%s.draw(data_%s, {title: '%s %s [TB]', pieHole:0.4, slices:{0:{color:'red'},1:{color:'green'}}}); """ % (site, site, site, site, site, SI.quota[site])) chart_data[site].append(""" <div id="donutchart_%s" style="height: 200px;width: 300px"></div> """ % (site)) ## make the locked/available donut chart donut_html = open('%s/locked.html' % monitor_dir, 'w') tables = "\n".join([info[0] for site, info in chart_data.items()]) draws = "\n".join([info[1] for site, info in chart_data.items()]) divs = "\n".join([info[2] for site, info in chart_data.items()]) divs_table = "<table border=0>" for c, site in enumerate(sorted(chart_data.keys())): if c % 5 == 0: divs_table += "<tr>" divs_table += "<td>%s</td>" % (chart_data[site][2]) divs_table += "</table>" donut_html.write(""" <html> <head> <script type="text/javascript" src="https://www.google.com/jsapi"></script> <script type="text/javascript"> google.load("visualization", "1", {packages:["corechart"]}); google.setOnLoadCallback(drawChart); function drawChart() { %s %s } </script> </head> <body> %s </body> </html> """ % (tables, draws, divs_table)) donut_html.close() html_doc.write("""Site configuration <a href="javascript:showhide('site')">[Click to show/hide]</a> <br> <div id="site" style="display:none;"> <br> <ul> %s </ul></div> """ % (text)) lap('done with space') text = "" for param in UC.configs: text += "<li>%s</li><ul>\n" % param for sub in sorted(UC.configs[param].keys()): text += "<li> %s : %s </li>\n" % (sub, UC.configs[param][sub]) text += '</ul>\n' html_doc.write("""Unified configuration <a href="javascript:showhide('config')">[Click to show/hide]</a> <br> <div id="config" style="display:none;"> <br> <ul> %s </ul></div> """ % (text)) lap('done with configuration') print "... done with status page." 
html_doc.write(""" </body> </html> """) html_doc.close() ## and put the file in place os.system('mv %s/index.html.new %s/index.html' % (monitor_dir, monitor_dir)) statuses = json.loads(open('%s/statusmon.json' % monitor_dir).read()) s_count = defaultdict(int) now = time.mktime(time.gmtime()) for wf in session.query(Workflow).all(): s_count[wf.status] += 1 statuses[now] = dict(s_count) ## remove old entries for t in statuses.keys(): if (now - float(t)) > 7 * 24 * 60 * 60: statuses.pop(t) open('%s/statusmon.json' % monitor_dir, 'w').write(json.dumps(statuses, indent=2)) html_doc = open('%s/statuses.html' % monitor_dir, 'w') html_doc.write( """ <html> <table border=1> <thead> <tr> <th> workflow </th><th> status </th><th> wm status</th> </tr> </thead> """) wfs = {} for wfo in session.query(Workflow).all(): ## pass all that is unlocked and considered it gone wfs[wfo.name] = (wfo.status, wfo.wm_status) open('%s/statuses.json' % monitor_dir, 'w').write(json.dumps(wfs)) for wfn in sorted(wfs.keys()): ## pass all that is unlocked and considered it gone if 'unlock' in wfs[wfn][0]: continue html_doc.write( '<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n' % (wfn, wfn, wfs[wfn][0], wfs[wfn][1])) html_doc.write("</table>") html_doc.write("<br>" * 100) html_doc.write("end of page</html>") html_doc.close()
def equalizor(url, specific=None, options=None): up = componentInfo(mcm=False, soft=['mcm']) if not up.check(): return if not specific: workflows = getWorkflows(url, status='running-closed', details=True) workflows.extend(getWorkflows(url, status='running-open', details=True)) ## start from scratch modifications = defaultdict(dict) ## define regionality site => fallback allowed. feed on an ssb metric ?? mapping = defaultdict(list) reversed_mapping = defaultdict(list) regions = defaultdict(list) SI = siteInfo() CI = campaignInfo() UC = unifiedConfiguration() for site in SI.sites_ready: region = site.split('_')[1] if not region in ['US', 'DE', 'IT']: continue regions[region] = [region] def site_in_depletion(s): return True if s in SI.sites_pressure: (m, r, pressure) = SI.sites_pressure[s] if float(m) < float(r): print s, m, r, "lacking pressure" return True else: print s, m, r, "pressure" pass return False for site in SI.sites_ready: region = site.split('_')[1] ## fallback to the region, to site with on-going low pressure mapping[site] = [ fb for fb in SI.sites_ready if any([('_%s_' % (reg) in fb and fb != site and site_in_depletion(fb)) for reg in regions[region]]) ] use_T0 = ('T0_CH_CERN' in UC.get("site_for_overflow")) if options.t0: use_T0 = True #if options.augment : use_T0 = True use_HLT = ('T2_CH_CERN_HLT' in UC.get("site_for_overflow")) if options.hlt: use_HLT = True #if options.augment : use_HLT=True if use_HLT: mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT') if use_T0: mapping['T2_CH_CERN'].append('T0_CH_CERN') #mapping['T1_FR_CCIN2P3'].append('T0_CH_CERN') #mapping['T2_IT_Legnaro'].append('T1_IT_CNAF') for reg in ['IT', 'DE', 'UK']: mapping['T2_CH_CERN'].extend( [fb for fb in SI.sites_ready if '_%s_' % reg in fb]) ## make them appear as OK to use force_sites = [] ## overflow CERN to underutilized T1s upcoming = json.loads(open('%s/GQ.json' % monitor_dir).read()) for possible in SI.sites_T1s: if not possible in upcoming: mapping['T2_CH_CERN'].append(possible) ## remove add-hoc sites from overflow mapping prevent_sites = ['T2_US_Purdue'] for prevent in prevent_sites: if prevent in mapping: mapping.pop(prevent) for src in mapping: for prevent in prevent_sites: if prevent in mapping[src]: mapping[src].remove(prevent) ## create the reverse mapping for the condor module for site, fallbacks in mapping.items(): for fb in fallbacks: reversed_mapping[fb].append(site) ## this is the fallback mapping print "Direct mapping : site => overflow" print json.dumps(mapping, indent=2) print "Reverse mapping : dest <= from origin" print json.dumps(reversed_mapping, indent=2) altered_tasks = set() def running_idle(wfi, task_name): gmon = wfi.getGlideMon() #print gmon if not gmon: return (0, 0) if not task_name in gmon: return (0, 0) return (gmon[task_name]['Running'], gmon[task_name]['Idle']) def needs_action(wfi, task, min_idled=100, pressure=0.2): task_name = task.pathName.split('/')[-1] running, idled = running_idle(wfi, task_name) go = True if not idled and not running: go = False if idled < 100: go = False if (not running and idled) or (running and (idled / float(running) > pressure)): go = True else: go = False return go, task_name, running, idled def getPerf(task): task = task.split('/')[1] + '/' + task.split('/')[-1] try: u = 'http://cms-gwmsmon.cern.ch/prodview/json/history/memoryusage720/%s' % task print u perf_data = json.loads(os.popen('curl -s --retry 5 %s' % u).read()) except Exception as e: print str(e) return (None, None) buckets = perf_data['aggregations']["2"]['buckets'] s_m = 
sum(bucket['key'] * bucket['doc_count'] for bucket in buckets) w_m = sum(bucket['doc_count'] for bucket in buckets) m_m = max(bucket['key'] for bucket in buckets) if buckets else None b_m = None if w_m > 100: b_m = m_m try: perf_data = json.loads( os.popen( 'curl -s --retry 5 http://cms-gwmsmon.cern.ch/prodview/json/history/runtime720/%s' % task).read()) except Exception as e: print str(e) return (b_m, None) buckets = perf_data['aggregations']["2"]['buckets'] s_t = sum(bucket['key'] * bucket['doc_count'] for bucket in buckets) w_t = sum(bucket['doc_count'] for bucket in buckets) m_t = max(bucket['key'] for bucket in buckets) if buckets else None b_t = None if w_t > 100: b_t = m_t return (b_m, b_t) def getcampaign(task): taskname = task.pathName.split('/')[-1] if hasattr(task, 'prepID'): return task.prepID.split('-')[1] elif taskname.count('-') >= 1: return taskname.split('-')[1] else: return None def close(interface): open('%s/equalizor.json.new' % monitor_dir, 'w').write(json.dumps(interface, indent=2)) os.system('mv %s/equalizor.json.new %s/equalizor.json' % (monitor_dir, monitor_dir)) os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json' % (monitor_dir, monitor_dir, time.mktime(time.gmtime()))) interface = {'reversed_mapping': reversed_mapping, 'modifications': {}} if options.augment or options.remove: interface['modifications'] = json.loads( open('%s/equalizor.json' % monitor_dir).read())['modifications'] if options.remove: if specific in interface['modifications']: print "poping", specific interface['modifications'].pop(specific) close(interface) return PU_locations = {} PU_overflow = {} LHE_overflow = {} tune_performance = [] pending_HLT = 0 max_HLT = 60000 pending_T0 = 0 max_T0 = 60000 try: gmon = json.loads( os.popen( 'curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT' ).read()) pending_HLT += gmon["Running"] pending_HLT += gmon["MatchingIdle"] except: pass stay_within_site_whitelist = False specific_task = None if specific and ":" in specific: specific, specific_task = specific.split(':') if specific: wfs = session.query(Workflow).filter( Workflow.name.contains(specific)).all() else: wfs = session.query(Workflow).filter(Workflow.status == 'away').all() performance = {} no_routing = [] random.shuffle(wfs) for wfo in wfs: if wfo.name in no_routing and not options.augment: continue if specific and not specific in wfo.name: continue if specific: wfi = workflowInfo(url, wfo.name) else: cached = filter(lambda d: d['RequestName'] == wfo.name, workflows) if not cached: continue wfi = workflowInfo(url, wfo.name, request=cached[0]) ## only running should get re-routed if not wfi.request['RequestStatus'] in [ 'running-open', 'running-closed' ] and not specific: continue tasks_and_campaigns = [] for task in wfi.getWorkTasks(): tasks_and_campaigns.append((task, getcampaign(task))) _, _, _, sec = wfi.getIO() ## check needs override needs_overide = False if not needs_overide and options.augment: needs_overide = True def overide_from_agent(wfi, needs_overide): bad_agents = [] #'http://cmssrv219.fnal.gov:5984'] if not bad_agents: return needs_overide if needs_overide: return True agents = wfi.getAgents() wqss = ['Running', 'Acquired'] if any([ agent in agents.get(wqs, {}).keys() for wqs, agent in itertools.product(wqss, bad_agents) ]): print "overriding the need for bad agent" needs_overide = True return needs_overide ## now parse this for action for i_task, (task, campaign) in enumerate(tasks_and_campaigns): if options.augment: print task.pathName print campaign tune 
= CI.get(campaign, 'tune', options.tune) if tune and not campaign in tune_performance: tune_performance.append(campaign) overflow = CI.get(campaign, 'overflow', {}) if overflow: if "PU" in overflow and not campaign in PU_overflow: PU_overflow[campaign] = copy.deepcopy(overflow['PU']) print "adding", campaign, "to PU overflow rules" if "LHE" in overflow and not campaign in LHE_overflow: print "adding", campaign, "to light input overflow rules" site_list = overflow['LHE']['site_list'] LHE_overflow[campaign] = copy.deepcopy( getattr(SI, site_list)) ### get the task performance, for further massaging. if campaign in tune_performance or options.tune: print "performance", task.taskType, task.pathName if task.taskType in ['Processing', 'Production']: set_memory, set_time = getPerf(task.pathName) #print "Performance %s GB %s min"%( set_memory,set_time) wfi.sendLog( 'equalizor', 'Performance tuning to %s GB %s min' % (set_memory, set_time)) ## get values from gmwsmon # massage the values : 95% percentile performance[task.pathName] = {} if set_memory: performance[task.pathName]['memory'] = set_memory if set_time and False: performance[task.pathName]['time'] = set_time ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step if campaign in LHE_overflow: if task.taskType in ['Processing']: needs, task_name, running, idled = needs_action(wfi, task) needs_overide = overide_from_agent(wfi, needs_overide) extend_to = list(set(copy.deepcopy( LHE_overflow[campaign]))) if stay_within_site_whitelist: extend_to = list( set(extend_to) & set(wfi.request['SiteWhitelist']) ) ## restrict to stupid-site-whitelist extend_to = list( set(extend_to) & set(SI.sites_ready + force_sites)) if extend_to and needs or needs_overide: modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist": extend_to, "Running": running, "Pending": idled, "Priority": wfi.request['RequestPriority'] } wfi.sendLog( 'equalizor', '%s of %s is running %d and pending %d, taking action : ReplaceSiteWhitelist \n %s' % (task_name, wfo.name, running, idled, json.dumps( sorted(modifications[wfo.name][task.pathName] ['ReplaceSiteWhitelist'])))) altered_tasks.add(task.pathName) else: wfi.sendLog( 'equalizor', '%s of %s is running %d and pending %d' % (task_name, wfo.name, running, idled)) ### overflow the 76 digi-reco to the site holding the pileup if campaign in PU_overflow: force = PU_overflow[campaign][ 'force'] if 'force' in PU_overflow[campaign] else False secondary_locations = set(SI.sites_ready + force_sites) for s in sec: if not s in PU_locations: presence = getDatasetPresence(url, s) #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. 
] PU_locations[s] = one_secondary_locations print "secondary is at", sorted(PU_locations[s]) secondary_locations = set( [SI.SE_to_CE(site) for site in PU_locations[s]]) & secondary_locations ## we should add all sites that hold the secondary input if any ### given that we have the secondary location available, it is not necessary to use the add-hoc list ##secondary_locations = list(set(PU_overflow[campaign]['sites']) & set( SI.sites_ready )) if any([ task.pathName.endswith(finish) for finish in ['_0', 'StepOneProc', 'Production'] ]): needs, task_name, running, idled = needs_action(wfi, task) ## removing the ones in the site whitelist already since they encode the primary input location if stay_within_site_whitelist: original_site_in_use = set( wfi.request['SiteWhitelist']) else: original_site_in_use = set(secondary_locations) ## remove the sites that have already running jobs gmon = wfi.getGlideMon() if gmon and task_name in gmon and 'Sites' in gmon[ task_name]: site_in_use = set(gmon[task_name]['Sites']) print "removing", sorted(site_in_use) ## that determines where you want to run in addition augment_by = list((set(secondary_locations) - site_in_use) & original_site_in_use) else: print "no existing running site" augment_by = list(original_site_in_use) needs_overide = overide_from_agent(wfi, needs_overide) if augment_by and ( needs or needs_overide or force) and PU_overflow[campaign][ 'pending'] < PU_overflow[campaign]['max']: PU_overflow[campaign]['pending'] += idled print "raising overflow to", PU_overflow[campaign][ 'pending'], "for", PU_overflow[campaign]['max'] ## the step with an input ought to be the digi part : make this one go anywhere modifications[wfo.name][task.pathName] = { "AddWhitelist": augment_by, "Running": running, "Pending": idled, "Priority": wfi.request['RequestPriority'] } altered_tasks.add(task.pathName) wfi.sendLog( 'equalizor', '%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s' % (task_name, wfo.name, running, idled, json.dumps(sorted(augment_by), indent=2))) else: print task_name, "of", wfo.name, "running", running, "and pending", idled ### overflow the skims back to multi-core if campaign in ['Run2015D', 'Run2015C_25ns' ] and task.taskType == 'Skim': original_swl = wfi.request['SiteWhitelist'] needs, task_name, running, idled = needs_action(wfi, task) if (needs or needs_overide): modifications[wfo.name][task.pathName] = { 'AddWhitelist': original_swl, "Running": running, "Pending": idled, "Priority": wfi.request['RequestPriority'] } altered_tasks.add(task.pathName) wfi.sendLog( 'equalizor', '%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s' % (task_name, wfo.name, running, idled, json.dumps(sorted(original_swl), indent=2))) if options.augment: print sorted(wfi.request['SiteWhitelist']), i_task, use_HLT ### add the HLT at partner of CERN if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task in [ 0, 1 ] and use_HLT: needs, task_name, running, idled = needs_action(wfi, task) if options.augment: needs = True needs = True ##needs = random.random()<0.40 remove the random, just add up to a limit if (needs or needs_overide) and pending_HLT < max_HLT: pending_HLT += idled if task.pathName in modifications[ wfo.name] and 'AddWhitelist' in modifications[ wfo.name][task.pathName]: modifications[wfo.name][task.pathName][ "AddWhitelist"].append("T2_CH_CERN_HLT") print "\t", wfo.name, "adding addHLT up to", pending_HLT, "for", max_HLT print task.pathName ## this Replace does not work at all for HLT #elif task.pathName in 
modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]: #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" ) #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT else: modifications[wfo.name][task.pathName] = { "AddWhitelist": ["T2_CH_CERN_HLT"], "Priority": wfi.request['RequestPriority'], "Running": running, "Pending": idled } wfi.sendLog( 'equalizor', 'adding the HLT in whitelist of %s to %d for %d' % (task.pathName, pending_HLT, max_HLT)) if i_task == 0 and not sec and use_T0: needs, task_name, running, idled = needs_action(wfi, task) if options.augment: needs = True #needs = True good_type = wfi.request['RequestType'] in [ 'MonteCarlo', 'MonteCarloFromGEN' ] read_lhe = ((not 'LheInputFiles' in wfi.request) or bool(wfi.request['LheInputFiles'])) good_type &= not read_lhe if not good_type and not options.augment: needs = False ##needs = random.random()<0.40 remove the random, just add up to a limit if (needs or needs_overide): pending_T0 += idled if task.pathName in modifications[ wfo.name] and 'AddWhitelist' in modifications[ wfo.name][task.pathName]: if not "T0_CH_CERN" in modifications[wfo.name][ task.pathName]["AddWhitelist"]: modifications[wfo.name][task.pathName][ "AddWhitelist"].append("T0_CH_CERN") wfi.sendLog( 'equalizor', 'adding the T0 for %s to %d for %d' % (task.pathName, pending_T0, max_T0)) elif task.pathName in modifications[ wfo.name] and 'ReplaceSiteWhitelist' in modifications[ wfo.name][task.pathName]: if not "T0_CH_CERN" in modifications[wfo.name][ task.pathName]["ReplaceSiteWhitelist"]: modifications[wfo.name][task.pathName][ "ReplaceSiteWhitelist"].append("T0_CH_CERN") wfi.sendLog( 'equalizor', 'adding the T0 to replacement for %s to %d for %d' % (task.pathName, pending_T0, max_T0)) else: modifications[wfo.name][task.pathName] = { "AddWhitelist": ["T0_CH_CERN"], "Priority": wfi.request['RequestPriority'], "Running": running, "Pending": idled } wfi.sendLog( 'equalizor', 'adding the T0 for %s to %d for %d' % (task.pathName, pending_T0, max_T0)) interface['modifications'].update(modifications) ### manage the number of core and job resizing interface['cores'] = { 'T2_CH_CERN_HLT': { 'min': 4, 'max': 16 }, 'default': { 'min': 1, 'max': 4 } } interface['resizes'] = ['RunIISpring16DR80'] ### manage the modification of the memory and target time interface['time'] = defaultdict(list) interface['memory'] = defaultdict(list) max_N_mem = 10 max_N_time = 10 ## discretize the memory to 10 at most values mems = set([o['memory'] for t, o in performance.items() if 'memory' in o]) times = set([o['time'] for t, o in performance.items() if 'time' in o]) if len(mems) > max_N_mem: mem_step = int((max(mems) - min(mems)) / float(max_N_mem)) for t in performance: if not 'memory' in performance[t]: continue (m, r) = divmod(performance[t]['memory'], mem_step) performance[t]['memory'] = (m + 1) * mem_step if len(times) > max_N_time: time_step = int((max(times) - min(times)) / float(max_N_time)) for t in performance: if not 'time' in performance[t]: continue (m, r) = divmod(performance[t]['time'], time_step) performance[t]['time'] = (m + 1) * time_step for t, o in performance.items(): if 'time' in o: interface['time'][str(o['time'])].append(t) if 'memory' in o: interface['memory'][str(o['memory'])].append(t) ## close and save close(interface)
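## Illustrative sketch (not part of Unified itself): the tail of equalizor
## above collapses per-task memory/time targets onto at most ~10 distinct
## values, so equalizor.json stays compact for the condor module. The
## standalone function below reproduces that rounding: divmod(v, step)
## gives (m, r) and the original keeps (m + 1) * step, i.e. every value is
## rounded up to the next multiple of the step, never down.
def discretize(values, max_buckets=10):
    distinct = set(values)
    if len(distinct) <= max_buckets:
        return list(values)  ## already few enough distinct values
    step = max(1, int((max(distinct) - min(distinct)) / float(max_buckets)))
    return [(v // step + 1) * step for v in values]

## e.g. discretize(list(range(2000, 8001, 100))) maps 61 distinct memory
## targets onto multiples of 600, leaving about ten grid values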
def injector(url, options, specific): use_mcm = True up = componentInfo(mcm=use_mcm, soft=["mcm"]) if not up.check(): return use_mcm = up.status["mcm"] workflows = getWorkflows(url, status=options.wmstatus, user=options.user) existing = [wf.name for wf in session.query(Workflow).all()] ## browse the assignment-approved requests, compare with the ones already tracked, and insert the difference for wf in workflows: if wf not in existing: print "putting", wf new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus) session.add(new_wf) session.commit() time.sleep(1) existing = [wf.name for wf in session.query(Workflow).all()] ## run a round of invalidation of what needs to be invalidated if use_mcm and (options.invalidate or True): invalidator(url) ## pick up replacements for wf in session.query(Workflow).filter(Workflow.status == "trouble").all(): if specific and wf.name != specific: continue print wf.name wl = getWorkLoad(url, wf.name) family = getWorkflowById(url, wl["PrepID"]) true_family = [] for member in family: if member == wf.name: continue fwl = getWorkLoad(url, member) if options.replace: if member != options.replace: continue else: if fwl["RequestDate"] < wl["RequestDate"]: continue if fwl["RequestType"] == "Resubmission": continue if fwl["RequestStatus"] in ["None", None]: continue true_family.append(fwl) if len(true_family) == 0: print wf.name, "ERROR has no replacement" known = [] try: known = json.loads(open("no_replacement.json").read()) except: pass if not wf.name in known: sendEmail( "workflow in %s with no replacement" % (wl["RequestStatus"]), "%s is dangling there" % (wf.name) ) known.append(wf.name) open("no_replacement.json", "w").write(json.dumps(known, indent=2)) continue print wf.name, "has", len(family), "family members" print wf.name, "has", len(true_family), "true family members" for fwl in true_family: member = fwl["RequestName"] new_wf = session.query(Workflow).filter(Workflow.name == member).first() if not new_wf: print "putting", member, "as replacement of", wf.name status = "away" if fwl["RequestStatus"] in ["assignment-approved"]: status = "considered" new_wf = Workflow(name=member, status=status, wm_status=fwl["RequestStatus"]) wf.status = "forget" session.add(new_wf) else: if new_wf.status == "forget": continue print "getting", new_wf.name, "as replacement of", wf.name wf.status = "forget" for tr in session.query(Transfer).all(): if wf.id in tr.workflows_id: sw = copy.deepcopy(tr.workflows_id) sw.remove(wf.id) sw.append(new_wf.id) tr.workflows_id = sw print tr.phedexid, "got", new_wf.name if new_wf.status != "away": print "\t setting it considered" new_wf.status = "considered" session.commit() ## don't do that automatically # wf.status = 'forget' session.commit()
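## Illustrative sketch (not part of Unified itself): the replacement lookup
## in injector above reduces the PrepID "family" to the members that can
## actually replace a troubled workflow. The predicate below restates those
## cuts on plain ReqMgr-workload-like dicts (field names follow the code
## above; the dicts themselves stand in for getWorkLoad results).
def true_family(troubled, siblings, replace=None):
    kept = []
    for fwl in siblings:
        if fwl['RequestName'] == troubled['RequestName']:
            continue  ## skip the troubled workflow itself
        if replace:
            ## an explicit --replace option overrides the other criteria
            if fwl['RequestName'] != replace:
                continue
        else:
            if fwl['RequestDate'] < troubled['RequestDate']:
                continue  ## older than the troubled request: not a clone of it
            if fwl['RequestType'] == 'Resubmission':
                continue  ## ACDC resubmissions are not full replacements
            if fwl['RequestStatus'] in ['None', None]:
                continue  ## no usable status in ReqMgr
        kept.append(fwl)
    return kept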
def closor(url, specific=None, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') jump_the_line = options.announce if options else False if jump_the_line: print "announce option is on. Checking on things on-going ready to be announced" wfs = session.query(Workflow).filter( Workflow.status.contains('announce')).filter( sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: print "regular option. Checking on things done and to be announced" wfs = session.query(Workflow).filter(Workflow.status == 'close').all() wfs_n = [w.name for w in wfs] print "unique names?" print len(set(wfs_n)) == len(wfs_n) held = set() print len(wfs), "closing" random.shuffle(wfs) max_per_round = UC.get('max_per_round').get('closor', None) if options.limit: max_per_round = options.limit if max_per_round: ## order them by priority all_closedout = sorted(getWorkflows(url, 'closed-out', details=True), key=lambda r: r['RequestPriority']) all_closedout = [r['RequestName'] for r in all_closedout] def rank(wfn): return all_closedout.index(wfn) if wfn in all_closedout else 0 wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True) wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") for iwfo, wfo in enumerate(wfs): if specific and not specific in wfo.name: continue print "Progress [%d/%d]" % (iwfo, len(wfs)) ## what is the expected #lumis wfi = workflowInfo(url, wfo.name) wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## estimate whether this can be announced : only once per batch in_batches = getWorkflowByCampaign(url, batch_name, details=True) batch_go[batch_name] = all( map( lambda s: not s in [ 'completed', 'running-open', 'running-closed', 'acquired', 'assigned', 'assignment-approved' ], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog( 'closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all closed' % (batch_name, batch_name)) continue if wfi.request['RequestStatus'] in ['announced', 'normal-archived' ] and not options.force: ## manually announced ?? 
wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', '%s is announced already : %s' % (wfo.name, wfo.wm_status)) session.commit() if jump_the_line: wfi.sendLog('closor', 'Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name, "has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis'] == 0: print wfo.name, "is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda: False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name, wfi.request['RequestStatus'] for out in outputs: event_count, lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter( Output.datasetname == out).first() if not odb: print "adding an output object", out odb = Output(datasetname=out) odb.workflow = wfo session.add(odb) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() fraction = lumi_count / float(expected_lumis) * 100. completion_line = "%60s %d/%d = %3.2f%%" % ( out, lumi_count, expected_lumis, fraction) wfi.sendLog('closor', "\t%s" % completion_line) if wfi.isRelval() and fraction < batch_goodness: batch_warnings[wfi.getCampaign()].add(completion_line) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence(url, out) where = [site for site, info in presence.items() if info[0]] if where: all_OK[out] = True print out, "is in full at", ",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites'] + wfi.request[ 'CustodialSites'] wfi.sendLog( 'closor', "%s is not in full anywhere. send to %s" % (out, ",".join(sorted(going_to)))) at_destination = dict([(k, v) for (k, v) in presence.items() if k in going_to]) else_where = dict([(k, v) for (k, v) in presence.items() if not k in going_to]) print json.dumps(at_destination) print json.dumps(else_where, indent=2) ## do the full stuck transfer study, missing files and shit ! 
for there in going_to: late_info = findLateFiles(url, out, going_to=there) for l in late_info: l.update({"workflow": wfo.name, "dataset": out}) all_late_files.extend(late_info) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update(OK) ## only that status can let me go into announced if all(all_OK.values()) and ( (wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name, "to be announced" results = [] if not results: for out in outputs: if out in stats and not stats[out]: continue _, dsn, process_string, tier = out.split('/') if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog( 'closor', '%s to go to %s' % (out, ', '.join(sorted(destinations)))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest( url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex'][ 'request_created'][0]['id'] results.append(True) except: results.append('Failed relval transfer') elif all_OK[out]: campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request[ 'Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[ campaign] and tier in CI.campaigns[campaign][ 'toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM") + UC.get( "tiers_to_DDM"): print "tier", tier, "neither TO nor NO DDM for", out results.append('Not recognized tier %s' % tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog( 'closor', "could not recognize %s for injecting in DDM" % out, level='critical') continue n_copies = 1 destinations = [] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[ campaign]: ddm_instructions = CI.campaigns[campaign][ 'DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier, indication in ddm_instructions.items( ): if ddmtier == tier or ddmtier in [ '*', 'all' ]: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination=" + ",".join( destinations) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending", out, " to DDM" 
status = pass_to_dynamo( [out], N=n_copies, sites=destinations if destinations else None, group=group_spec if group_spec else None) results.append(status) if status == True: wfi.sendLog( 'closor', '%s is send to dynamo in %s copies %s %s' % (out, n_copies, sorted(destinations), group_spec)) else: sendLog('closor', "could not add " + out + " to dynamo pool. check closor logs.", level='critical') wfi.sendLog( 'closor', "could not add " + out + " to dynamo pool. check closor logs.") else: print wfo.name, "no stats for announcing", out results.append('No Stats') if all( map(lambda result: result in ['None', None, True], results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade( url, wfo.name) if not res in ['None', None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in [ 'announced', 'normal-archived' ]: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade( url, wfo.name) results.append(res) #print results if all(map(lambda result: result in ['None', None, True], results)): if jump_the_line: if not 'announced' in wfo.status: wfo.status = wfo.status.replace( 'announce', 'announced') else: wfo.status = 'done' session.commit() wfi.sendLog('closor', "workflow outputs are announced") else: wfi.sendLog( 'closor', "Error with %s to be announced \n%s" % (wfo.name, json.dumps(results))) elif wfi.request['RequestStatus'] in [ 'failed', 'aborted', 'aborted-archived', 'rejected', 'rejected-archived', 'aborted-completed' ]: if wfi.isRelval(): wfo.status = 'forget' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', "%s is %s, but will not be set in trouble to find a replacement." % (wfo.name, wfo.wm_status)) else: wfo.status = 'trouble' wfo.wm_status = wfi.request['RequestStatus'] session.commit() else: print wfo.name, "not good for announcing:", wfi.request[ 'RequestStatus'] wfi.sendLog('closor', "cannot be announced") held.add(wfo.name) days_late = 0. retries_late = 10 really_late_files = [ info for info in all_late_files if info['retries'] >= retries_late ] really_late_files = [ info for info in really_late_files if info['delay'] / (60 * 60 * 24.) >= days_late ] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % ( len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2)) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor', subject) print subject open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2)) if held: sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(sorted(held))), level='critical') #batches = json.loads(open('batches.json').read()) for bname, go in batch_go.items(): if go: subject = "Release Validation Samples Batch %s" % bname issues = "" if batch_warnings[bname]: issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness issues += "\n".join(sorted(batch_warnings[bname])) issues += "\n\n" text = """ Dear all, a batch of release validation workflows has finished. Batch ID: %s Detail of the workflows https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s %s This is an automated message. """ % (bname, bname, issues) to = ['*****@*****.**'] sendEmail(subject, text, destination=to)
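## A minimal sketch of how the closor logic above interprets the campaign-level
## 'DDMcopies' setting: an int is a flat copy count, while a dict maps a data
## tier (or the '*'/'all' wildcards) to {'N': copies, 'host': destinations}.
## The helper name resolve_ddm_copies and the sample values are illustrative,
## not part of the production code.
def resolve_ddm_copies(ddm_instructions, tier):
    n_copies = 1        ## same defaults as the inline logic above
    destinations = []
    if type(ddm_instructions) == int:
        n_copies = ddm_instructions
    elif type(ddm_instructions) == dict:
        ## a more fancy configuration: per-tier instructions
        for ddmtier, indication in ddm_instructions.items():
            if ddmtier == tier or ddmtier in ['*', 'all']:
                if 'N' in indication:
                    n_copies = indication['N']
                if 'host' in indication:
                    destinations = indication['host']
    return n_copies, destinations

## a per-tier override beats the one-copy default
print resolve_ddm_copies({'AODSIM': {'N': 2, 'host': ['T1_US_FNAL_Disk']}}, 'AODSIM')
## -> (2, ['T1_US_FNAL_Disk'])
print resolve_ddm_copies(3, 'MINIAODSIM')
## -> (3, [])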
from assignSession import *
import reqMgrClient
import os
import sys
import json
import time
import random

UC = unifiedConfiguration()

## get all acquired and push one to stepchain so that we can acquire it on nersc
N_for_cloud = isHEPCloudReady(reqmgr_url)
if N_for_cloud:
    print "HEP cloud is ready"
    wfs = getWorkflows(reqmgr_url, 'acquired', details=True)
    for wf in wfs:
        if N_for_cloud <= 0:
            break
        wfi = workflowInfo(reqmgr_url, wf['RequestName'], request=wf)
        print "testing", wf['RequestName']
        if wfi.isGoodToConvertToStepChain() and wfi.isGoodForNERSC(no_step=True) and N_for_cloud:
            print "good to convert to step so that we get something for hepcloud on next round", wf['RequestName']
            os.system('Unified/rejector.py --to_step --clone --comment "convert to step for hepcloud" %s' % wf['RequestName'])
            ## just do that once and be done with it
            N_for_cloud -= 1

## send something to T0
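## Sketch of the budgeted-conversion pattern used above, where N_for_cloud acts
## as a per-cycle quota: at most 'budget' candidates are converted, and the loop
## stops as soon as the quota is spent. convert_up_to and the lambda are
## illustrative stand-ins for the rejector.py call.
def convert_up_to(candidates, budget, convert):
    for wf in candidates:
        if budget <= 0:
            break
        if convert(wf):
            budget -= 1
    return budget

left = convert_up_to(['wf_A', 'wf_B', 'wf_C'], 2, lambda wf: True)
print left  ## -> 0 : quota exhausted after two conversions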
def batchor( url ):
    UC = unifiedConfiguration()
    ## get all workflows in assignment-approved with SubRequestType = relval
    all_wfs = []
    for user in UC.get("user_relval"):
        ## extend, not assign: otherwise only the last user's workflows survive the loop
        all_wfs.extend( getWorkflows(url, 'assignment-approved', details=True, user=user, rtype='TaskChain') )
    wfs = filter( lambda r : r['SubRequestType'] == 'RelVal' if 'SubRequestType' in r else False, all_wfs)
    ## need a special treatment for those
    hi_wfs = filter( lambda r : r['SubRequestType'] == 'HIRelVal' if 'SubRequestType' in r else False, all_wfs)

    by_campaign = defaultdict(set)
    by_hi_campaign = defaultdict(set)
    for wf in wfs:
        print "Relval:", wf['RequestName'], wf['Campaign']
        by_campaign[wf['Campaign']].add( wf['RequestName'] )
    for wf in hi_wfs:
        print "HI Relval:", wf['RequestName'], wf['Campaign']
        by_hi_campaign[wf['Campaign']].add( wf['RequestName'] )

    default_setup = {
        "go" : True,
        "parameters" : {
            "SiteWhitelist": [ "T1_US_FNAL" ],
            "MergedLFNBase": "/store/relval",
            "Team" : "relval",
            "NonCustodialGroup" : "RelVal"
        },
        "custodial" : "T1_US_FNAL_MSS",
        "phedex_group" : "RelVal",
        "lumisize" : -1,
        "fractionpass" : 0.0,
        "maxcopies" : 1
    }
    default_hi_setup = copy.deepcopy( default_setup )

    add_on = {}
    batches = json.loads( open('batches.json').read() )
    for campaign in by_campaign:
        ## get a bunch of information
        setup = copy.deepcopy( default_setup )
        add_on[campaign] = setup
        sendLog('batchor','Adding the relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)), level='critical')
        if not campaign in batches:
            batches[campaign] = []
        batches[campaign] = list(set(list(copy.deepcopy( by_campaign[campaign] )) + batches[campaign] ))
    for campaign in by_hi_campaign:
        ## get a bunch of information
        setup = copy.deepcopy( default_hi_setup )
        hi_site = random.choice(["T1_DE_KIT","T1_FR_CCIN2P3"])
        setup["parameters"]["SiteWhitelist"] = [ hi_site ]
        add_on[campaign] = setup
        sendLog('batchor','Adding the HI relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)), level='critical')
        if not campaign in batches:
            batches[campaign] = []
        batches[campaign] = list(set(list(copy.deepcopy( by_hi_campaign[campaign] )) + batches[campaign] ))

    open('batches.json','w').write( json.dumps( batches, indent=2 ) )

    ## open the campaign configuration
    campaigns = json.loads( open('campaigns.relval.json').read() )
    ## protect for overwriting ??
    for new_campaign in list(set(add_on.keys()) - set(campaigns.keys())):
        ## this is new, and can be announced as such
        print new_campaign, "is new stuff"
        workflows = by_campaign[new_campaign]
        requester = list(set([wf.split('_')[0] for wf in workflows]))
        subject = "Request of RelVal samples batch %s" % new_campaign
        text = """Dear all,

A new batch of relval workflows was requested.

Batch ID: %s

Requestor: %s

Details of the workflows:
https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

This is an automated message""" % ( new_campaign, ', '.join(requester), new_campaign,
                                    #'\n'.join( sorted(workflows) )
                                    )
        print subject
        print text
        to = ['*****@*****.**']
        sendEmail(subject, text, destination=to)
        sendLog('batchor', text, level='critical')

    ## merge all anyways
    campaigns.update( add_on )

    ## write it out for posterity
    open('campaigns.json.updated','w').write(json.dumps( campaigns, indent=2))
    ## read back
    rread = json.loads(open('campaigns.json.updated').read())
    os.system('mv campaigns.json.updated campaigns.relval.json')
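## Minimal sketch of the batches.json bookkeeping done in batchor above:
## request names are merged into the per-campaign list with set semantics, so
## a re-run never duplicates entries. merge_batch is an illustrative helper;
## the file layout follows the code above.
import json

def merge_batch(batches, campaign, workflows):
    if not campaign in batches:
        batches[campaign] = []
    batches[campaign] = list(set(list(workflows) + batches[campaign]))
    return batches

batches = {'CMSSW_10_6_0': ['wf_A']}
merge_batch(batches, 'CMSSW_10_6_0', set(['wf_A', 'wf_B']))
print json.dumps(batches, indent=2)  ## 'wf_A' appears only once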
from collections import defaultdict
from utils import monitor_pub_dir, workflowInfo, getWorkflows
import time

register = [
    #'assigned','acquired',
    'running-open',
    'running-closed',
    'force-complete',
    'completed',
    'closed-out'
]

wfs = []
url = 'cmsweb.cern.ch'
for r in register:
    wfs.extend(getWorkflows(url, r, details=True))
    print len(wfs), "after collecting", r

lfns = defaultdict(set)
for wf in wfs:
    if 'OutputModulesLFNBases' not in wf:
        print wf['RequestName']
        ## skip it: iterating over the missing key below would raise a KeyError
        continue
    for base in wf['OutputModulesLFNBases']:
        lfns[base].add(wf['RequestName'])

now = time.gmtime()
content = {
    "timestamp": time.mktime(now),
    "date": time.asctime(now),
    "protected": sorted(lfns.keys())
}
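## The lfns mapping above holds sets, which the json module cannot encode
## directly; the content dict written out only keeps the sorted keys, which is
## JSON-safe. A sketch (illustrative, not in the original) of dumping the full
## base -> workflows mapping as well, converting each set to a sorted list:
import json
from collections import defaultdict

lfns = defaultdict(set)
lfns['/store/mc/FakeCampaign'].update(['wf_A', 'wf_B'])
serializable = dict([(base, sorted(names)) for base, names in lfns.items()])
print json.dumps(serializable, indent=2)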
def htmlor(): cache = getWorkflows('cmsweb.cern.ch','assignment-approved', details=True) def wfl(wf,view=False,p=False,ms=False,within=False,ongoing=False,status=False,update=False): wfn = wf.name wfs = wf.wm_status pid = None pids=filter(lambda seg: seg.count('-')==2, wf.name.split('_')) if len(pids): pid=pids[0] text=', '.join([ #wfn, '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a>'%(wfn,wfn), '(%s) <br>'%wfs]) text+=', '.join([ '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts</a>'%wfn, '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>'%wfn, '<a href="https://cmsweb.cern.ch/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>'%wfn, '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>'%wfn, '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>'%wfn, '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>'%wfn, '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>'%wfn, '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>'%pid, '<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank">pv</a>'%wfn, '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>'%wfn, '<a href="closeout.html#%s" target="_blank">clo</a>'%wfn, '<a href="statuses.html#%s" target="_blank">st</a>'%wfn, '<a href="https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>'%wfn ]) if within and (not view or wfs=='completed'): cached = filter(lambda d : d['RequestName']==wfn, cache) if cached: wl = cached[0] else: wl = getWorkLoad('cmsweb.cern.ch',wfn) if 'InputDataset' in wl: dataset = wl['InputDataset'] text+=', '.join(['', '<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>'%dataset, '<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>'%dataset, '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>'%dataset, '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>'%dataset, ]) if p: cached = filter(lambda d : d['RequestName']==wfn, cache) if cached: wl = cached[0] else: wl = getWorkLoad('cmsweb.cern.ch',wfn) text+=', (%s)'%(wl['RequestPriority']) pass if pid: if ms: mcm_s = json.loads(os.popen('curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure'%pid).read())[pid] text+=', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>'%(pid,mcm_s) else: text+=', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>'%(pid) text+=', <a href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>'%(pid) if status: if wf.status.startswith('assistance'): text+=', <a href="assistance.html#%s" target="_blank">assist</a>'%wfn text+=' : %s '%(wf.status) if view and wfs!='acquired': text+='<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>'%(wfn.replace('_','/'),wfn.replace('_','/')) if ongoing: text+='<a href="http://hcc-briantest.unl.edu/prodview/%s" 
target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a>'%(wfn,wfn) text+="<hr>" return text def phl(phid): text=', '.join([ str(phid), '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>'%phid, '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>'%phid, ]) return text def ol(out): return '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>'%(out,out) ## start to write it #html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w') html_doc = open('/afs/cern.ch/user/c/cmst2/www/unified/index.html','w') print "Updating the status page ..." html_doc.write(""" <html> <head> <META HTTP-EQUIV="refresh" CONTENT="900"> <script type="text/javascript"> function showhide(id) { var e = document.getElementById(id); e.style.display = (e.style.display == 'block') ? 'none' : 'block'; } </script> </head> <body> Last update on %s(CET), %s(GMT), <a href=logs/ target=_blank> logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a> <br><br> """ %(time.asctime(time.localtime()), time.asctime(time.gmtime()))) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status=='considered').all(): text+="<li> %s </li> \n"%wfl(wf,p=True) count+=1 text+="</ul></div>\n" html_doc.write(""" Worlfow next to handle <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank"> batches</a> (%d) <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a> <a href="javascript:showhide('considered')">[Click to show/hide]</a> <br> <div id="considered" style="display:none;"> <ul> """%count) html_doc.write(text) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status=='staging').all(): text+="<li> %s </li> \n"%wfl(wf,within=True) count+=1 text+="</ul></div>\n" html_doc.write(""" Worlfow waiting in staging (%d) <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a> <a href="javascript:showhide('staging')">[Click to show/hide]</a> <br> <div id="staging" style="display:none;"> <ul> """%count) html_doc.write(text) text="" count=0 for ts in session.query(Transfer).all(): stext='<li> %s serves </li><a href="javascript:showhide(\'%s\')">[show/hide]</a> <div id="%s" style="display:none;"><ul>'%( phl(ts.phedexid), ts.phedexid, ts.phedexid ) hide = True for pid in ts.workflows_id: w = session.query(Workflow).get(pid) hide &= (w.status != 'staging' ) stext+="<li> %s </li>\n"%( wfl(w,status=True)) stext+="</ul></div>\n" if hide: #text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid) pass else: count+=1 text+=stext text+="</ul></div>" html_doc.write(""" Transfer on-going (%d) <a href=https://transferteam.web.cern.ch/transferteam/dashboard/ target=_blank>dashboard</a> <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a> <a href="javascript:showhide('transfer')">[Click to show/hide]</a> <br> <div id="transfer" style="display:none;"> <br> <ul>"""%count) html_doc.write(text) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status=='staged').all(): text+="<li> %s </li> \n"%wfl(wf,p=True) count+=1 text+="</ul></div>\n" 
html_doc.write("""Worlfow ready for assigning (%d) <a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a> <a href="javascript:showhide('staged')">[Click to show/hide]</a> <br> <div id="staged" style="display:none;"> <br> <ul> """%count) html_doc.write(text) lines=[] for wf in session.query(Workflow).filter(Workflow.status=='away').all(): lines.append("<li> %s </li>"%wfl(wf,view=True,ongoing=True)) lines.sort() html_doc.write(""" Worlfow on-going (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a> <a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a> <a href=http://hcc-briantest.unl.edu/prodview target=_blank>queues</a> <a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a> <a href="javascript:showhide('away')">[Click to show/hide]</a> <br> <div id="away" style="display:none;"> <br> <ul> %s </ul> </div> """%(len(lines),'\n'.join(lines))) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status == 'assistance').all(): text+="<li> %s </li> \n"%wfl(wf,view=True,update=True,status=True) count+=1 text+="</ul></div>\n" html_doc.write("""Worlfow that are closing (%d) <a href=closeout.html target=_blank>closeout</a> <a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a> <a href="javascript:showhide('closing')">[Click to show/hide]</a> <br> <div id="closing" style="display:none;"> <br> <ul> """%count) html_doc.write(text) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all(): text+="<li> %s </li> \n"%wfl(wf,view=True,within=True,status=True,update=True) count+=1 text+="</ul></div>\n" html_doc.write("""Worlfow which need assistance (%d) <a href=assistance.html target=_blank>assistance</a> <a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a> <a href="javascript:showhide('assistance')">[Click to show/hide]</a> <br> <div id="assistance" style="display:none;"> <br> <ul> """%count) html_doc.write(text) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status == 'close').all(): text+="<li> %s </li> \n"%wfl(wf) count+=1 text+="</ul></div>\n" html_doc.write("""Worlfow ready to close (%d) <a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a> <a href="javascript:showhide('close')">[Click to show/hide]</a> <br> <div id="close" style="display:none;"> <br> <ul> """%count) html_doc.write(text) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status=='trouble').all(): text+="<li> %s </li> \n"%wfl(wf) count+=1 text+="</ul></div>\n" html_doc.write("""Worlfow with issue (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a> <a href="javascript:showhide('trouble')">[Click to show/hide]</a> <br> <div id="trouble" style="display:none;"> <br> <ul> """%count) html_doc.write(text) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status=='forget').all(): text+="<li> %s </li> \n"%wfl(wf) count+=1 text+="</ul></div>\n" html_doc.write(""" Worlfow to forget (%d) <a href="javascript:showhide('forget')">[Click to show/hide]</a> <br> <div id="forget" style="display:none;"> <br> <ul> """%count) html_doc.write(text) text="" count=0 for wf in 
session.query(Workflow).filter(Workflow.status=='done').all(): text+="<li> %s </li> \n"%wfl(wf)#,ms=True) count+=1 text+="</ul></div>\n" html_doc.write(""" Worlfow through (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/cleanor/last.log target=_blank>postlog</a> <a href="javascript:showhide('done')">[Click to show/hide]</a> <br> <div id="done" style="display:none;"> <br> <ul> """%count) html_doc.write(text) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status=='clean').all(): text+="<li> %s </li> \n"%wfl(wf)#,ms=True) count+=1 text+="</ul></div>\n" html_doc.write(""" Worlfow clean for input (%d) <a href=logs/cleanor/last.log target=_blank>log</a> <a href="javascript:showhide('clean')">[Click to show/hide]</a> <br> <div id="clean" style="display:none;"> <br> <ul> """%count) html_doc.write(text) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status.endswith('-out')).all(): text+="<li> %s </li> \n"%wfl(wf,status=True) count+=1 text+="</ul></div>\n" html_doc.write(""" Worlfow clean for output (%d) <a href=logs/outcleanor/last.log target=_blank>log</a> <a href="javascript:showhide('cleanout')">[Click to show/hide]</a> <br> <div id="cleanout" style="display:none;"> <br> <ul> """%count) html_doc.write(text) text="" lines_thisweek=[] lines_lastweek=[] now = time.mktime(time.gmtime()) this_week = int(time.strftime("%W",time.gmtime())) for out in session.query(Output).all(): if not out.workflow: print "This is a problem with",out.datasetname continue if out.workflow.status in ['done','clean']: out_week = int(time.strftime("%W",time.gmtime(out.date))) ##only show current week, and the previous. if (this_week-out_week)==1: lines_lastweek.append("<li>on week %s : %s </li>"%( time.strftime("%W (%x %X)",time.gmtime(out.date)), ol(out.datasetname), ) ) if (this_week-out_week)==0: lines_thisweek.append("<li>on week %s : %s </li>"%( time.strftime("%W (%x %X)",time.gmtime(out.date)), ol(out.datasetname), ) ) lines_thisweek.sort() lines_lastweek.sort() html_doc.write("""Output produced <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a> (%d) <a href="javascript:showhide('output')">[Click to show/hide]</a> <br> <div id="output" style="display:none;"> <br> <ul> <li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul> %s </ul></div> <li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul> %s </ul></div></div> """%( len(lines_lastweek)+len(lines_thisweek), len(lines_lastweek), '\n'.join(lines_lastweek), len(lines_thisweek), '\n'.join(lines_thisweek)) ) html_doc.write("""Job installed <a href="javascript:showhide('acron')">[Click to show/hide]</a> <br> <div id="acron" style="display:none;"> <br> <pre> %s </pre></div> """%(os.popen('acrontab -l | grep Unified').read())) text="" count=0 for (c,info) in campaignInfo().campaigns.items(): #if 'go' in info and info['go']: text+="<li>%s <br> <pre>%s</pre> </li>"%( c, json.dumps( info, indent=2)) count+=1 html_doc.write("""Campaign configuration <a href="javascript:showhide('campaign')">[Click to show/hide]</a> <br> <div id="campaign" style="display:none;"> <br> <ul> %s </ul></div> """%(text)) text="" count=0 n_column = 4 SI = siteInfo() for t in SI.types(): #text+="<li>%s<ul>"%t #for site in getattr(SI,t): # text+="<li><a 
href=http://hcc-briantest.unl.edu/prodview/%s>%s<a/> </li>"%( site, site) # text+='<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a>'%(site,site) #text+="</ul></li>" text+="<li>%s<table border=1>"%t c=0 for site in getattr(SI,t): cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else 'N/A' disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE(site) in SI.disk else 'N/A' if c==0: text+="<tr>" text+='<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a><br>CPU pledge: %s<br>Disk available: %s</td>'%(site,site,site,site,cpu,disk) if c==n_column: c=0 else: c+=1 text+="</table></li>" html_doc.write("""Site configuration <a href="javascript:showhide('site')">[Click to show/hide]</a> <br> <div id="site" style="display:none;"> <br> <ul> %s </ul></div> """%(text)) print "... done with status page." html_doc.write(""" </body> </html> """) html_doc.close() html_doc = open('/afs/cern.ch/user/c/cmst2/www/unified/statuses.html','w') html_doc.write(""" <html> <table border=1> <thead> <tr> <th> workflow </th><th> status </th><th> wm status</th> </tr> </thead> """) wfs = {} for wfo in session.query(Workflow).all(): wfs[wfo.name] = (wfo.status,wfo.wm_status) open('/afs/cern.ch/user/c/cmst2/www/unified/statuses.json','w').write(json.dumps( wfs )) for wfn in sorted(wfs.keys()): html_doc.write('<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n'%( wfn, wfn, wfs[wfn][0], wfs[wfn][1])) html_doc.write("</table>") html_doc.write("<br>"*100) html_doc.write("end of page</html>") html_doc.close()
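## The status page above repeats one collapsible-section pattern throughout:
## a counted header, a showhide() link, and a hidden <div> holding the items.
## A sketch of that pattern factored into a helper (illustrative; the page
## above inlines it each time):
def collapsible(title, div_id, items):
    head = '%s (%d) <a href="javascript:showhide(\'%s\')">[Click to show/hide]</a> <br>' % (
        title, len(items), div_id)
    body = '<div id="%s" style="display:none;"> <ul>\n%s\n</ul></div>\n' % (
        div_id, '\n'.join(['<li> %s </li>' % item for item in items]))
    return head + body

print collapsible('Workflow with issue', 'trouble', ['wf_A', 'wf_B'])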
def batchor( url ): UC = unifiedConfiguration() SI = global_SI() ## get all workflows in assignment-approved with SubRequestType = relval all_wfs = [] for user in UC.get("user_relval"): all_wfs.extend( getWorkflows(url, 'assignment-approved', details=True, user=user, rtype='TaskChain') ) wfs = filter( lambda r :r['SubRequestType'] == 'RelVal' if 'SubRequestType' in r else False, all_wfs) ## need a special treatment for those hi_wfs = filter( lambda r :r['SubRequestType'] == 'HIRelVal' if 'SubRequestType' in r else False, all_wfs) by_campaign = defaultdict(set) by_hi_campaign = defaultdict(set) for wf in wfs: print "Relval:",wf['RequestName'], wf['Campaign'] #by_campaign[wf['Campaign']].add( wf['RequestName'] ) by_campaign[wf['Campaign']].add( wf['PrepID'] ) for wf in hi_wfs: print "HI Relval:",wf['RequestName'], wf['Campaign'] #by_hi_campaign[wf['Campaign']].add( wf['RequestName'] ) by_hi_campaign[wf['Campaign']].add( wf['PrepID'] ) default_setup = { "go" :True, "parameters" : { "SiteWhitelist": [ "T1_US_FNAL" ], "MergedLFNBase": "/store/relval", "Team" : "relval", "NonCustodialGroup" : "RelVal" }, "custodial" : "T1_US_FNAL_MSS", "custodial_override" : ["DQMIO"], "phedex_group" : "RelVal", "lumisize" : -1, "fractionpass" : 0.0, "maxcopies" : 1 } default_hi_setup = copy.deepcopy( default_setup ) add_on = {} batches = json.loads( open('batches.json').read() ) relval_routing = UC.get('relval_routing') def pick_one_site( p): ## modify the parameters on the spot to have only one site if "parameters" in p and "SiteWhitelist" in p["parameters"] and len(p["parameters"]["SiteWhitelist"])>1: choose_from = list(set(p["parameters"]["SiteWhitelist"]) & set(SI.sites_ready)) picked = random.choice( choose_from ) print "picked",picked,"from",choose_from p["parameters"]["SiteWhitelist"] = [picked] for campaign in by_campaign: if campaign in batches: continue ## get a bunch of information setup = copy.deepcopy( default_setup ) for key in relval_routing: if key in campaign: ## augment with the routing information augment_with = relval_routing[key] print "Modifying the batch configuration because of keyword",key print "with",augment_with setup = deep_update( setup, augment_with ) #if 'cc7' in campaign: setup["parameters"]["SiteWhitelist"] = ["T2_US_Nebraska"] pick_one_site( setup ) add_on[campaign] = setup sendLog('batchor','Adding the relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)),level='critical') if not campaign in batches: batches[campaign] = [] batches[campaign] = list(set(list(copy.deepcopy( by_campaign[campaign] )) + batches[campaign] )) for campaign in by_hi_campaign: if campaign in batches: continue ## get a bunch of information setup = copy.deepcopy( default_hi_setup ) hi_site = random.choice(["T1_DE_KIT","T1_FR_CCIN2P3"]) setup["parameters"]["SiteWhitelist"]=[ hi_site ] #setup["parameters"]["SiteWhitelist"]=["T1_DE_KIT","T1_FR_CCIN2P3"] pick_one_site( setup ) add_on[campaign] = setup sendLog('batchor','Adding the HI relval campaigns %s with parameters \n%s'%( campaign, json.dumps( setup, indent=2)),level='critical') if not campaign in batches: batches[campaign] = [] batches[campaign] = list(set(list(copy.deepcopy( by_hi_campaign[campaign] )) + batches[campaign] )) open('batches.json','w').write( json.dumps( batches , indent=2 ) ) ## open the campaign configuration campaigns = json.loads( open('campaigns.relval.json').read() ) ## protect for overwriting ?? 
for new_campaign in list(set(add_on.keys())-set(campaigns.keys())): ## this is new, and can be announced as such print new_campaign,"is new stuff" subject = "Request of RelVal samples batch %s"% new_campaign text="""Dear all, A new batch of relval workflows was requested. Batch ID: %s Details of the workflows: https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s This is an automated message"""%( new_campaign, new_campaign, ) print subject print text to = ['*****@*****.**'] sendEmail(subject, text, destination=to) sendLog('batchor',text, level='critical') ## go through all existing campaigns and remove the ones not in use anymore ? for old_campaign in campaigns.keys(): all_in_batch = getWorkflowByCampaign(url, old_campaign, details=True) is_batch_done = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [wf['RequestStatus']for wf in all_in_batch])) ## check all statuses if is_batch_done: #print "batch",old_campaign,"can be closed or removed if necessary" #campaigns[old_campaign]['go'] = False ## disable campaigns.pop( old_campaign ) ## or just drop it all together ? print "batch",old_campaign," configuration was removed" ## merge all anyways campaigns.update( add_on ) ## write it out for posterity open('campaigns.json.updated','w').write(json.dumps( campaigns , indent=2)) ## read back rread = json.loads(open('campaigns.json.updated').read()) os.system('mv campaigns.json.updated campaigns.relval.json')
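## The batchor above folds relval_routing overrides into the default setup via
## deep_update, which is defined elsewhere in the codebase. A minimal recursive
## dict merge consistent with that usage might look like this (a sketch, not
## the actual implementation):
def deep_update_sketch(base, overrides):
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update_sketch(base[key], value)  ## recurse into nested dicts
        else:
            base[key] = value  ## leaves and new keys overwrite in place
    return base

setup = {"parameters": {"SiteWhitelist": ["T1_US_FNAL"]}, "maxcopies": 1}
print deep_update_sketch(setup, {"parameters": {"SiteWhitelist": ["T2_US_Nebraska"]}})
## -> only the whitelist changes; maxcopies is preserved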
def htmlor( caller = ""): up = componentInfo(mcm=False, soft=['mcm']) if not up.check(): return try: boost = json.loads(open('%s/equalizor.json'%monitor_dir).read())['modifications'] except: boost = {} cache = getWorkflows(reqmgr_url,'assignment-approved', details=True) cache.extend( getWorkflows(reqmgr_url,'acquired', details=True) ) cache.extend( getWorkflows(reqmgr_url,'running-open', details=True) ) cache.extend( getWorkflows(reqmgr_url,'running-closed', details=True) ) def getWL( wfn ): cached = filter(lambda d : d['RequestName']==wfn, cache) if cached: wl = cached[0] else: wl = getWorkLoad(reqmgr_url,wfn) return wl def wfl(wf,view=False,p=False,ms=False,within=False,ongoing=False,status=False,update=False): wfn = wf.name wfs = wf.wm_status wl = None pid = None wl_pid = None pids=filter(lambda seg: seg.count('-')==2, wf.name.split('_')) if len(pids): pids = pids[:1] pid=pids[0] if not pids: wl = getWL( wf.name ) pids = getPrepIDs( wl ) pid = pids[0] wl_pid = pid if 'task' in wf.name: wl_pid = 'task_'+pid text=', '.join([ #wfn, #'<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a> '%(wfn,wfn), #'<table><tr><td>%s</td></tr></table>'%(wfn), #'<span>%s</span>'%(wfn), "%s "%wfn, '(%s) <br>'%wfs]) text+=', '.join([ '<a href="https://%s/reqmgr2/fetch?rid=%s" target="_blank">dts</a>'%(reqmgr_url,wfn), '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts-req1</a>'%wfn, #TOFIX '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>'%wfn, '<a href="https://%s/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>'%(reqmgr_url,wfn), '<a href="https://%s/reqmgr2/data/request?name=%s" target="_blank">req</a>'%(reqmgr_url,wfn), #'<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>'%wfn, #TOFIX '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>'%wfn, '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>'%wfn, '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>'%wfn, '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>'%pid, '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank">pv</a>'%wfn, #deprecated '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>'%wfn, '<a href="closeout.html#%s" target="_blank">clo</a>'%wfn, '<a href="statuses.html#%s" target="_blank">st</a>'%wfn, '<a href="https://%s/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>'%(reqmgr_url,wfn) ]) if within and (not view or wfs=='completed'): wl = getWL( wfn ) dataset =None if 'InputDataset' in wl: dataset = wl['InputDataset'] if 'Task1' in wl and 'InputDataset' in wl['Task1']: dataset = wl['Task1']['InputDataset'] if dataset: text+=', '.join(['', '<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>'%dataset, '<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>'%dataset, '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>'%dataset, '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>'%dataset, ]) if p: cached = filter(lambda d : d['RequestName']==wfn, cache) if cached: wl = cached[0] else: wl = 
getWorkLoad('cmsweb.cern.ch',wfn) text+=', (%s)'%(wl['RequestPriority']) pass if pid: if ms: mcm_s = json.loads(os.popen('curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure'%pid).read())[pid] text+=', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>'%(pid,mcm_s) else: text+=', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>'%(pid) text+=', <a href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>'%(wl_pid) if status: if wf.status.startswith('assistance'): text+=', <a href="assistance.html#%s" target="_blank">assist</a>'%wfn text+=' : %s '%(wf.status) if view and wfs!='acquired': text+='<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>'%(wfn.replace('_','/'),wfn.replace('_','/')) if ongoing: text+='<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a>'%(wfn,wfn) if ongoing: date1 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime(time.mktime(time.gmtime())-(15*24*60*60)) ) date2 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime()) text+='<a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#table=Jobs&date1=%s&date2=%s&sortby=site&task=wmagent_%s">dashb</a>'%( date1, date2, wfn ) if ongoing and wfn in boost: for task in boost[wfn]: overflow = boost[wfn][task].get('ReplaceSiteWhitelist',None) if not overflow: overflow = boost[wfn][task].get('AddWhitelist',None) if overflow: text+=',boost (<a href=equalizor.json>%d</a>)'%len(overflow) #text+="<hr>" return text def phl(phid): text=', '.join([ str(phid), '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>'%phid, '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>'%phid, ]) return text def ol(out): return '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>'%(out,out) def lap( comment ): l = time.mktime(time.gmtime()) spend = l-lap.start lap.start =l print "Spend %d [s] for %s"%( spend, comment ) lap.start = time.mktime(time.gmtime()) ## start to write it #html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w') html_doc = open('%s/index.html.new'%monitor_dir,'w') print "Updating the status page ..." UC = unifiedConfiguration() if not caller: try: #caller = sys._getframe(1).f_code.co_name caller = sys.argv[0].split('/')[-1].replace('.py','') print "caller is" print caller except Exception as es: caller = 'none found' print "not getting frame" print str(es) html_doc.write(""" <html> <head> <META HTTP-EQUIV="refresh" CONTENT="900"> <script type="text/javascript"> function showhide(id) { var e = document.getElementById(id); e.style.display = (e.style.display == 'block') ? 
'none' : 'block'; } </script> </head> <body> Last update on %s(CET), %s(GMT) <br> <a href=logs/ target=_blank>logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/ target=_blank>prod mon</a> <a href=https://%s/wmstats/index.html target=_blank>wmstats</a> <a href=http://t3serv001.mit.edu/~cmsprod/IntelROCCS/Detox/SitesInfo.txt target=_blank>detox</a> <a href=locked.html>space</a> <a href=logs/subscribor/last.log target=_blank>blocks</a> <a href=logs/agents/last.log>agents</a> <br> <a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a> <a href=data.html>json interfaces</a> <a href=logs/addHoc/last.log>add-hoc op</a> created from <b>%s <a href=logs/last_running>last running</a></b> <object height=20 type="text/html" data="logs/last_running"><p>backup content</p></object> <br><br> """ %(time.asctime(time.localtime()), time.asctime(time.gmtime()), reqmgr_url, caller ) ) text="" count=0 count_by_campaign=defaultdict(lambda : defaultdict(int)) for wf in session.query(Workflow).filter(Workflow.status.startswith('considered')).all(): wl = getWL( wf.name ) count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])]+=1 #print wf.name text+="<li> %s </li> \n"%wfl(wf,p=True) count+=1 text_by_c="" for c in count_by_campaign: text_by_c+="<li> %s (%d) : "%( c, sum(count_by_campaign[c].values()) ) for p in sorted(count_by_campaign[c].keys()): text_by_c+="%d (%d), "%(p,count_by_campaign[c][p]) text_by_c+="</li>" html_doc.write(""" Worflow next to handle (%d) <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank"> batches</a> <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a> <a href="javascript:showhide('considered')">[Click to show/hide]</a> <br> <div id="considered" style="display:none;"> <ul> <li> By workflow (%d) </li><a href="javascript:showhide('considered_bywf')">[Click to show/hide]</a><div id="considered_bywf" style="display:none;"> <ul> %s </ul></div> <li> By campaigns (%d) </li><a href="javascript:showhide('considered_bycamp')">[Click to show/hide]</a><div id="considered_bycamp" style="display:none;"> <ul> %s </ul></div> </ul> </div> """%(count, count, text, len(count_by_campaign), text_by_c)) lap( 'done with considered' ) text="" count=0 count_by_campaign=defaultdict(lambda : defaultdict(int)) for wf in session.query(Workflow).filter(Workflow.status=='staging').all(): wl = getWL( wf.name ) count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])]+=1 text+="<li> %s </li> \n"%wfl(wf,within=True) count+=1 text_by_c="" for c in count_by_campaign: text_by_c+="<li> %s (%d) : "%( c, sum(count_by_campaign[c].values()) ) for p in sorted(count_by_campaign[c].keys()): text_by_c+="%d (%d), "%(p,count_by_campaign[c][p]) text_by_c+="</li>" html_doc.write(""" Worflow waiting in staging (%d) <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a> <a href="javascript:showhide('staging')">[Click to show/hide]</a> <br> <div id="staging" style="display:none;"> <ul> <li> By workflow (%d) </li><a href="javascript:showhide('staging_bywf')">[Click to show/hide]</a><div id="staging_bywf" style="display:none;"> <ul> %s </ul></div> <li> By campaigns (%d) </li><a href="javascript:showhide('staging_bycamp')">[Click to show/hide]</a><div id="staging_bycamp" style="display:none;"> <ul> %s </ul></div> </ul> </div> 
"""%(count, count, text, len(count_by_campaign), text_by_c)) lap ( 'done with staging' ) text="<ul>" count=0 transfer_per_wf = defaultdict(list) for ts in session.query(Transfer).filter(Transfer.phedexid>0).all(): hide = True t_count = 0 stext="" for pid in ts.workflows_id: w = session.query(Workflow).get(pid) hide &= (w.status != 'staging' ) if w.status in ['considered','staging','staged']: stext += "<li> %s </li>\n"%( wfl(w,status=True)) transfer_per_wf[w].append( ts.phedexid ) t_count +=1 stext = '<li> %s serves %d workflows<br><a href="javascript:showhide(\'%s\')">[show/hide]</a> <div id="%s" style="display:none;"><ul>\n'%( phl(ts.phedexid), t_count, ts.phedexid, ts.phedexid ) + stext stext+="</ul></li>\n" if hide: #text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid) pass else: count+=1 text+=stext text+="</ul>" text_bywf="<ul>" for wf in transfer_per_wf: text_bywf += "<li> %s </li>"%(wfl(wf,within=True)) text_bywf += '<a href=javascript:showhide("transfer_%s")>[Click to show/hide] %d transfers</a>'% (wf.name, len(transfer_per_wf[wf])) text_bywf += '<div id="transfer_%s" style="display:none;">'% wf.name text_bywf += "<ul>" for pid in sorted(transfer_per_wf[wf]): text_bywf += "<li> %s </li>"%(phl(pid)) text_bywf += "</ul></div><hr>" text_bywf += '</ul>' html_doc.write(""" Transfer on-going (%d) <a href=http://cmstransferteam.web.cern.ch/cmstransferteam/ target=_blank>dashboard</a> <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a> <a href="javascript:showhide('transfer')">[Click to show/hide]</a> <br> <div id="transfer" style="display:none;"> <ul> <li> By Workflow <a href="javascript:showhide('transfer_bywf')">[Click to show/hide]</a> <div id="transfer_bywf" style="display:none;"> %s </div> </li> <li> By transfer request <a href="javascript:showhide('transfer_byreq')">[Click to show/hide]</a> <div id="transfer_byreq" style="display:none;"> %s </div> </li> </ul> </div> """%(count, text_bywf, text)) lap( 'done with transfers' ) text="" count=0 count_by_campaign=defaultdict(lambda : defaultdict(int)) for wf in session.query(Workflow).filter(Workflow.status=='staged').all(): wl = getWL( wf.name ) count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])]+=1 text+="<li> %s </li> \n"%wfl(wf,p=True) count+=1 text_by_c="" for c in count_by_campaign: text_by_c+="<li> %s (%d) : "%( c, sum(count_by_campaign[c].values()) ) for p in sorted(count_by_campaign[c].keys()): text_by_c+="%d (%d), "%(p,count_by_campaign[c][p]) text_by_c+="</li>" html_doc.write("""Worflow ready for assigning (%d) <a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a> <a href="javascript:showhide('staged')">[Click to show/hide]</a> <br> <div id="staged" style="display:none;"> <br> <ul> <li> By workflow (%d) </li><a href="javascript:showhide('staged_bywf')">[Click to show/hide]</a><div id="staged_bywf" style="display:none;"> <ul> %s </ul></div> <li> By campaigns (%d) </li><a href="javascript:showhide('staged_bycamp')">[Click to show/hide]</a><div id="staged_bycamp" style="display:none;"> <ul> %s </ul></div> </ul> </div> """%(count, count, text, len(count_by_campaign), text_by_c)) lap( 'done with staged' ) lines=[] count_by_campaign=defaultdict(lambda : defaultdict(int)) for wf in session.query(Workflow).filter(Workflow.status=='away').all(): wl = getWL( wf.name ) count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])]+=1 lines.append("<li> %s 
<hr></li>"%wfl(wf,view=True,ongoing=True)) text_by_c="" for c in count_by_campaign: text_by_c+="<li> %s (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/campaign.php?campaign=%s>mon</a> <a href=https://cms-pdmv.cern.ch/pmp/historical?r=%s target=_blank>pmp</a> "%( c, sum(count_by_campaign[c].values()),c,c ) for p in sorted(count_by_campaign[c].keys()): text_by_c+="%d (%d), "%(p,count_by_campaign[c][p]) text_by_c+="</li>" lines.sort() html_doc.write(""" Worflow on-going (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a> <a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a> <a href=http://cms-gwmsmon.cern.ch/prodview target=_blank>queues</a> <a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a> <a href=logs/equalizor/last.log target=_blank>equ</a> <a href=logs/completor/last.log target=_blank>comp</a> <a href="javascript:showhide('away')">[Click to show/hide]</a> <br> <div id="away" style="display:none;"> <ul> <li>By workflow (%d) </li> <a href="javascript:showhide('away_bywf')">[Click to show/hide]</a><div id="away_bywf" style="display:none;"> <ul> %s </ul></div> <li> By campaigns (%d) </li><a href="javascript:showhide('away_bycamp')">[Click to show/hide]</a><div id="away_bycamp" style="display:none;"> <ul> %s </ul></div> </ul> </div> """%(len(lines), len(lines), '\n'.join(lines), len(count_by_campaign), text_by_c )) lap ( 'done with away' ) text="" count=0 #for wf in session.query(Workflow).filter(Workflow.status == 'assistance-custodial').all(): for wf in session.query(Workflow).filter(Workflow.status.startswith('assistance')).filter(Workflow.status.contains('custodial')).all(): text+="<li> %s </li> \n"%wfl(wf,view=True,update=True,status=True) count+=1 text+="</ul></div>\n" html_doc.write("""Worflow that are closing (%d) <a href=closeout.html target=_blank>closeout</a> <a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a> <a href="javascript:showhide('closing')">[Click to show/hide]</a> <br> <div id="closing" style="display:none;"> <br> <ul> """%count) html_doc.write(text) lap ( 'done with closing' ) assistance_by_type = defaultdict(list) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all(): assistance_by_type[wf.status].append( wf ) count+=1 for assistance_type in assistance_by_type: text += "<li> %s (%d) <a href=\"javascript:showhide('%s')\">[Click to show/hide]</a><br><div id=\"%s\" style=\"display:none;\"><ul>"%( assistance_type, len(assistance_by_type[assistance_type]), assistance_type, assistance_type, ) for wf in assistance_by_type[assistance_type]: text+="<li> %s <hr></li> \n"%wfl(wf,view=True,within=True,status=True,update=True) text += "</ul></div></li>\n" html_doc.write("""Worflow which need assistance (%d) <a href=assistance.html target=_blank>assistance</a> <a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/recoveror/last.log target=_blank>postlog</a> <a href="javascript:showhide('assistance')">[Click to show/hide]</a> <br> <div id="assistance" style="display:none;"> <br> <ul> %s </ul> </div> """%(count, text)) lap ( 'done with assistance' ) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status == 'close').all(): text+="<li> %s </li> \n"%wfl(wf) count+=1 text+="</ul></div>\n" html_doc.write("""Worflow ready to close (%d) <a href=logs/checkor/last.log target=_blank>log</a> 
<a href=logs/closor/last.log target=_blank>postlog</a> <a href="javascript:showhide('close')">[Click to show/hide]</a> <br> <div id="close" style="display:none;"> <br> <ul> """%count) html_doc.write(text) lap ( 'done with annoucing' ) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status=='trouble').all(): text+="<li> %s </li> \n"%wfl(wf) count+=1 text+="</ul></div>\n" html_doc.write("""Worflow with issue (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a> <a href="javascript:showhide('trouble')">[Click to show/hide]</a> <br> <div id="trouble" style="display:none;"> <br> <ul> """%count) html_doc.write(text) lap ( 'done with trouble' ) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status=='forget').all(): text+="<li> %s </li> \n"%wfl(wf) count+=1 text+="</ul></div>\n" html_doc.write(""" Worflow to forget (%d) <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/lockor/last.log target=_blank>postlog</a> <a href="javascript:showhide('forget')">[Click to show/hide]</a> <br> <div id="forget" style="display:none;"> <br> <ul> """%count) html_doc.write(text) lap ( 'done with forget' ) text="" count=0 for wf in session.query(Workflow).filter(Workflow.status=='done').all(): text+="<li> %s </li> \n"%wfl(wf)#,ms=True) count+=1 text+="</ul></div>\n" html_doc.write(""" Worflow through (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/lockor/last.log target=_blank>postlog</a> <a href="javascript:showhide('done')">[Click to show/hide]</a> <br> <div id="done" style="display:none;"> <br> <ul> """%count) html_doc.write(text) lap ( 'done with done' ) wfs = session.query(Workflow).filter(Workflow.status.endswith('-unlock')).all() html_doc.write(" Workflows unlocked : %s <a href=logs/lockor/last.log target=_blank>log</a><br>"%(len(wfs))) lap ( 'done with unlocked' ) text="" lines_thisweek=[] lines_lastweek=[] now = time.mktime(time.gmtime()) this_week = int(time.strftime("%W",time.gmtime())) start_time_two_weeks_ago = time.mktime(time.gmtime(now - (20*24*60*60))) # 20 last_week = int(time.strftime("%W",time.gmtime(now - ( 7*24*60*60)))) all_locks = json.loads(open('%s/globallocks.json'%monitor_dir).read()) waiting_custodial = json.loads(open('%s/waiting_custodial.json'%monitor_dir).read()) all_pending_approval_custodial = dict([(k,item) for k,item in waiting_custodial.items() if 'nodes' in item and not any([node['decided'] for node in item['nodes'].values()]) ]) n_pending_approval = len( all_pending_approval_custodial ) #n_pending_approval = len([item for item in waiting_custodial.values() if 'nodes' in item and not any([node['decided'] for node in item['nodes'].values() ])]) missing_approval_custodial = json.loads(open('%s/missing_approval_custodial.json'%monitor_dir).read()) stuck_custudial = json.loads(open('%s/stuck_custodial.json'%monitor_dir).read()) lagging_custudial = json.loads(open('%s/lagging_custodial.json'%monitor_dir).read()) if len(stuck_custudial): stuck_string = ', <font color=red>%d appear to be <a href=stuck_custodial.json>stuck</a></font>'% len(stuck_custudial) else: stuck_string = '' if len(missing_approval_custodial): long_approve_string = ', <font color=red>%d more than %d days</font>'%( len(missing_approval_custodial), UC.get('transfer_timeout')) else: long_approve_string = '' output_within_two_weeks=session.query(Output).filter(Output.date>=start_time_two_weeks_ago).all() waiting_custodial_string="" waiting_custodial_strings=[] for ds in waiting_custodial: 
out = None ## lots of it will be within two weeks of = filter(lambda odb: odb.datasetname == ds, output_within_two_weeks) if of: out = of[0] else: out = session.query(Output).filter(Output.datasetname == ds).first() if out: info = waiting_custodial[out.datasetname] action = 'going' if out.datasetname in all_pending_approval_custodial: action = '<font color=red>pending</font>' try: size = str(info['size']) except: size = "x" destination = ",".join(info['nodes'].keys()) if not destination: destination ='<font color=red>NO SITE</font>' a_waiting_custodial_string = "<li>on week %s : %s %s</li>"%( time.strftime("%W (%x %X)",time.gmtime(out.date)), ol(out.datasetname), ' %s [GB] %s to %s on %s (<a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/requestlist?dataset=%s&node=T*MSS">%d missing</a>)'%( size, action, destination, time.asctime(time.gmtime(info['checked'])), out.datasetname, info['nmissing']) ) waiting_custodial_strings.append( (out.date, a_waiting_custodial_string) ) waiting_custodial_strings.sort( key = lambda i:i[0] ) waiting_custodial_string="\n".join( [i[1] for i in waiting_custodial_strings] ) #start_time_two_weeks_ago = time.mktime(time.strptime("15-0-%d"%(this_week-2), "%y-%w-%W")) for out in output_within_two_weeks: if not out.workflow: print "This is a problem with",out.datasetname continue if out.workflow.status in ['done-unlock','done','clean','clean-out','clean-unlock']: custodial='' if out.datasetname in waiting_custodial: info = waiting_custodial[out.datasetname] try: try: size = str(info['size']) except: size = "x" destination = ",".join(info['nodes'].keys()) if not destination: destination ='<font color=red>NO SITE</font>' action = 'going' if out.datasetname in all_pending_approval_custodial: action = '<font color=red>pending</font>' custodial=' %s [GB] %s to %s on %s (<a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/requestlist?dataset=%s&node=T*MSS">%d missing</a>)'%( size, action, destination, time.asctime(time.gmtime(info['checked'])), out.datasetname, info['nmissing']) except Exception as e: #print info #print str(e) pass elif out.datasetname in all_locks: custodial='<font color=green>LOCKED</font>' out_week = int(time.strftime("%W",time.gmtime(out.date))) ##only show current week, and the previous. 
if last_week==out_week: lines_lastweek.append("<li>on week %s : %s %s</li>"%( time.strftime("%W (%x %X)",time.gmtime(out.date)), ol(out.datasetname), custodial ) ) if this_week==out_week: lines_thisweek.append("<li>on week %s : %s %s</li>"%( time.strftime("%W (%x %X)",time.gmtime(out.date)), ol(out.datasetname), custodial ) ) lines_thisweek.sort() lines_lastweek.sort() html_doc.write("""Output produced (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a> <a href="javascript:showhide('output')">[Click to show/hide]</a> <br> <div id="output" style="display:none;"> <br> <ul> <li> %d waiting to go to tape</li> <ul> <li> %d waiting for tape approval%s</li> <li> %d are not completed after %d days%s</li> <li> Full list (%d) <a href="javascript:showhide('waiting-custodial')">[Click to show/hide]</a> <div id="waiting-custodial" style="display:none;"> <ul> %s </ul> </div> </li> </ul> <li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul> %s </ul></div> <li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul> %s </ul></div></div> """%( len(lines_lastweek)+len(lines_thisweek), len(waiting_custodial), n_pending_approval,long_approve_string, len(lagging_custudial),UC.get('transfer_timeout'),stuck_string, len(waiting_custodial),waiting_custodial_string, len(lines_lastweek), '\n'.join(lines_lastweek), len(lines_thisweek), '\n'.join(lines_thisweek)) ) lap ( 'done with output' ) html_doc.write("""Job installed <a href="javascript:showhide('acron')">[Click to show/hide]</a> <br> <div id="acron" style="display:none;"> <br> <pre> %s </pre> """%(os.popen('acrontab -l | grep Unified | grep -v \#').read())) per_module = defaultdict(list) for t in filter(None,os.popen('cat %s/logs/*/*.time'%monitor_dir).read().split('\n')): module_name,run_time,spend = t.split(':') ## then do what you want with it ! 
if 'cleanor' in module_name: continue per_module[module_name].append( int(spend) ) def display_time( sec ): m, s = divmod(sec, 60) h, m = divmod(m, 60) dis="" if h: dis += "%d [h] "%h if h or m: dis += "%d [m] "%m if h or m or s: dis += "%d [s]"%s return dis html_doc.write("Module running time<ul>\n") for m,spends in per_module.items(): avg = sum(spends)/float(len(spends)) lasttime = spends[-1] html_doc.write("<li>%s : last %s, avg %s</li>\n"%( m, display_time(lasttime), display_time(avg))) html_doc.write("</ul>") html_doc.write("Last running <pre>%s</pre><br>"%( os.popen("tac %s/logs/running | head -5"%monitor_dir).read() )) html_doc.write("Order in cycle <pre>%s</pre><br>"%( '\n'.join(map(lambda l : l.split('/')[-1].replace('.py',''), filter(lambda l : not l.startswith('#') and 'Unified' in l and 'py' in l.split('/')[-1], open('%s/WmAgentScripts/cycle.sh'%base_dir).read().split('\n')))) )) html_doc.write("</div>\n") lap ( 'done with jobs' ) text="" count=0 for (c,info) in campaignInfo().campaigns.items(): #if 'go' in info and info['go']: text+="<li>%s <br> <pre>%s</pre> </li>"%( c, json.dumps( info, indent=2)) count+=1 html_doc.write("""Campaign configuration <a href="javascript:showhide('campaign')">[Click to show/hide]</a> <br> <div id="campaign" style="display:none;"> <br> <ul> %s </ul></div> """%(text)) text="" count=0 n_column = 4 SI = siteInfo() date1 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime(time.mktime(time.gmtime())-(15*24*60*60)) ) ## 15 days date2 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime()) for t in SI.types(): text+="<li>%s<table border=1>"%t c=0 for site in getattr(SI,t): cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else 'N/A' disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE(site) in SI.disk else 'N/A' if c==0: text+="<tr>" if not disk: ht_disk = '<font color=red>Disk available: %s</font>'%disk else: ht_disk = 'Disk available: %s'%disk text+='<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a><br><a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#user=&refresh=0&table=Jobs&p=1&records=25&activemenu=1&site=%s&submissiontool=wmagent&check=submitted&sortby=activity&scale=linear&bars=20&data1=%s&date2=%s">dashb</a><br>CPU pledge: %s<br>%s</td>'%(site,site,site,site,site,date1,date2,cpu,ht_disk) if c==n_column: c=0 else: c+=1 text+="</table></li>" text += "<li> Sites in auto-approved transfer<ul>" for site in sorted(SI.sites_auto_approve): text+="<li>%s"% site text += "</ul></li>" text += "<li> Sites with vetoe transfer<ul>" for site in sorted(SI.sites_veto_transfer): text+="<li>%s"% site text += "</ul></li>" text += "<li> Sites banned from production<ul>" for site in sorted(SI.sites_banned): text+="<li>%s"% site text += "</ul></li>" text += "<li> Approximate Free Tape<ul>" for mss in SI.storage: waiting = 0 try: waiting = float(os.popen("grep '%s is pending . Created since' %s/logs/lockor/last.log -B 3 | grep size | awk '{ sum+=$6 ; print sum }' | tail -1" % (mss,monitor_dir)).readline()) except Exception as e: print str(e) oldest = "" os.system('grep pending %s/logs/lockor/last.log | sort -u > %s/logs/pending.log'%(monitor_dir,monitor_dir)) try: oldest = os.popen("grep '%s is pending . 
Created since ' %s/logs/lockor/last.log | sort | awk '{print $10, $11, $12, $13, $14 }' | head -1"% (mss,monitor_dir)).readline() except Exception as e: print str(e) waiting /= 1024. text+="<li>%s : %d [TB]. Waiting for approval %d [TB] since %s </li>"%(mss, SI.storage[mss], waiting, oldest) text += "</ul></li>" lap ( 'done with sites' ) open('%s/siteInfo.json'%monitor_dir,'w').write(json.dumps(dict([(t,getattr(SI,t)) for t in SI.types()]),indent=2)) lap ( 'done with sites json' ) chart_data = defaultdict(list) for site in SI.quota: chart_data[site].append(""" var data_%s = google.visualization.arrayToDataTable([ ['Overall', 'Space in TB'], //['Quota' , %s], ['Locked' , %s], ['Free' , %s] ]); """%( site, SI.quota[site], SI.locked[site], SI.disk[site], )) chart_data[site].append(""" var chart_%s = new google.visualization.PieChart(document.getElementById('donutchart_%s')); chart_%s.draw(data_%s, {title: '%s %s [TB]', pieHole:0.4, slices:{0:{color:'red'},1:{color:'green'}}}); """%(site,site, site,site, site,SI.quota[site])) chart_data[site].append(""" <div id="donutchart_%s" style="height: 200px;width: 300px"></div> """%(site)) ## make the locked/available donut chart donut_html = open('%s/locked.html'%monitor_dir,'w') tables = "\n".join([info[0] for site,info in chart_data.items()]) draws = "\n".join([info[1] for site,info in chart_data.items()]) divs = "\n".join([info[2] for site,info in chart_data.items()]) divs_table="<table border=0>" for c,site in enumerate(sorted(chart_data.keys())): if c%5==0: divs_table += "<tr>" divs_table += "<td>%s</td>"%(chart_data[site][2]) divs_table += "</table>" donut_html.write(""" <html> <head> <script type="text/javascript" src="https://www.google.com/jsapi"></script> <script type="text/javascript"> google.load("visualization", "1", {packages:["corechart"]}); google.setOnLoadCallback(drawChart); function drawChart() { %s %s } </script> </head> <body> %s </body> </html> """%( tables,draws,divs_table ) ) donut_html.close() html_doc.write("""Site configuration <a href="javascript:showhide('site')">[Click to show/hide]</a> <br> <div id="site" style="display:none;"> <br> <ul> %s </ul></div> """%(text)) lap ( 'done with space' ) text = "" for param in UC.configs: text +="<li>%s</li><ul>\n"% param for sub in sorted(UC.configs[param].keys()): text +="<li> %s : %s </li>\n"%( sub, UC.configs[param][sub] ) text += '</ul>\n' html_doc.write("""Unified configuration <a href="javascript:showhide('config')">[Click to show/hide]</a> <br> <div id="config" style="display:none;"> <br> <ul> %s </ul></div> """%(text)) lap ( 'done with configuration' ) print "... done with status page." 
html_doc.write(""" </body> </html> """) html_doc.close() ## and put the file in place os.system('mv %s/index.html.new %s/index.html'%(monitor_dir,monitor_dir)) statuses = json.loads(open('%s/statusmon.json'%monitor_dir).read()) s_count = defaultdict(int) now = time.mktime(time.gmtime()) for wf in session.query(Workflow).all(): s_count[wf.status]+=1 statuses[now] = dict( s_count ) ## remove old entries for t in statuses.keys(): if (now-float(t)) > 7*24*60*60: statuses.pop(t) open('%s/statusmon.json'%monitor_dir,'w').write( json.dumps( statuses , indent=2)) html_doc = open('%s/statuses.html'%monitor_dir,'w') html_doc.write(""" <html> <table border=1> <thead> <tr> <th> workflow </th><th> status </th><th> wm status</th> </tr> </thead> """) wfs = {} for wfo in session.query(Workflow).all(): ## pass all that is unlocked and considered it gone wfs[wfo.name] = (wfo.status,wfo.wm_status) open('%s/statuses.json'%monitor_dir,'w').write(json.dumps( wfs )) for wfn in sorted(wfs.keys()): ## pass all that is unlocked and considered it gone if 'unlock' in wfs[wfn][0]: continue html_doc.write('<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n'%( wfn, wfn, wfs[wfn][0], wfs[wfn][1])) html_doc.write("</table>") html_doc.write("<br>"*100) html_doc.write("end of page</html>") html_doc.close()
#os.system('Unified/assignor.py RunIISummer16MiniAODv2')
#os.system('Unified/assignor.py --from_status staging RunIISummer16DR80Premix')
#os.system('Unified/assignor.py --from_status staging RunIISummer16DR80-')

up = componentInfo(mcm=False, soft=['mcm'])
if not up.check(): sys.exit(0)

url = reqmgr_url

## anything still away or in assistance may still need its logs
may_have_one = set()
may_have_one.update([wfo.name for wfo in session.query(Workflow).filter(Workflow.status.startswith('away')).all()])
may_have_one.update([wfo.name for wfo in session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()])

wfs = []
wfs.extend( getWorkflows(url, 'running-open', details=True))
wfs.extend( getWorkflows(url, 'running-closed', details=True))
wfs.extend( getWorkflows(url, 'completed', details=True))

## extend to the whole family (same PrepID) of anything we keep
may_have_one_too = set()
for wf in wfs:
    if wf['RequestName'] in may_have_one:
        #print wf['RequestName'],"and family"
        may_have_one_too.update( getWorkflowById(url, wf['PrepID']) )
may_have_one.update( may_have_one_too )

for logtype in ['report','joblogs','condorlogs']:
    for d in filter(None,os.popen('ls -d %s/%s/*'%(monitor_dir,logtype)).read().split('\n')):
        if not any([m in d for m in may_have_one]):
            ## that can be removed
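## Hedged sketch of the selection just above: a log directory is a removal
## candidate only when no workflow that may still need its logs appears in the
## directory path. Helper name and signature are illustrative; what is done
## with the candidates is left to the caller, as above.
def removable_log_dirs(monitor_dir, logtypes, live_workflows):
    import os
    candidates = []
    for logtype in logtypes:
        listing = os.popen('ls -d %s/%s/*'%(monitor_dir, logtype)).read()
        for d in filter(None, listing.split('\n')):
            if not any([wfn in d for wfn in live_workflows]):
                candidates.append(d)
    return candidates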
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock(): return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.new: ## get all in running and check ## you want to intersect with what is completed ! if options.strict: completed_wfi = getWorkflows(url, status='completed') for wfo in session.query(Workflow).filter(Workflow.status == 'away').all(): if wfo.name in completed_wfi: wfs.append( wfo ) else: print wfo.name,"is not completed" sendLog('checkor','%s is not completed'%( wfo.name)) else: wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) if options.current: ## recheck those already there, probably to just pass them along wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.old: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = global_SI CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign ## retrieve bypass and onhold configuration bypasses = [] holdings = [] #try: # already_notified = json.loads(open('already_notifified.json').read()) #except: # print "no record of already notified workflow. starting fresh" # already_notified = [] for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: holdings.extend( json.loads(open(holding_file).read())) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: mcm_force = mcm.get('/restapi/requests/forcecomplete') bypasses.extend( mcm_force ) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. 
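## Sketch of the pacing rule computed just below: spread the per-workflow checks
## over roughly total_running_time seconds, clamping the sleep between 0.5s and
## 10s (hypothetical helper, same arithmetic as the inline code):
def pacing_sleep(n_workflows, total_running_time=5.*60.):
    if not n_workflows: return 1
    return min(max(0.5, total_running_time / n_workflows), 10)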
sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) print len(wfs),"to consider, pausing for",sleep_time for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False pids = wfi.getPrepIDs() bypass_by_mcm = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break if bypass in pids: wfi.sendLog('checkor',"we can bypass checks on %s because of prepid %s "%( wfo.name, bypass)) bypass_checks = True bypass_by_mcm = True break #if not CI.go( wfi.request['Campaign'] ) and not bypass_checks: # print "No go for",wfo.name # wfi.sendLog('checkor',"No go for %s"%wfi.request['Campaign']) # continue tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco campaigns = {} for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( 
sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') elif member['RequestStatus']==None: print member['RequestName'],"is not real" pass else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = int(wfi.request['Task1']['RequestNumEvents']) fractions_pass = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) fractions_pass[output] = 0.95 c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): print wfo.name,"is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? 
if not bypass_checks:
    assistance_tags.add('recovery')
    is_closing = False

if over_100_pass and any([percent_completions[out] > 100 for out in fractions_pass]):
    print wfo.name,"is over completed"
    print json.dumps(percent_completions, indent=2)
    if not bypass_checks:
        assistance_tags.add('over100')
        is_closing = False

## correct lumi : < 300 events per lumi
events_per_lumi = {}
for output in wfi.request['OutputDatasets']:
    events_per_lumi[output] = getDatasetEventsPerLumi( output )

lumi_upper_limit = {}
for output in wfi.request['OutputDatasets']:
    upper_limit = 301.
    campaign = campaigns[output]
    #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
    #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
    #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"
    if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
        upper_limit = CI.campaigns[campaign]['lumisize']
        print "overriding the upper lumi size to",upper_limit,"for",campaign
    if options.lumisize:
        upper_limit = options.lumisize
        print "overriding the upper lumi size to",upper_limit,"by command line"
    lumi_upper_limit[output] = upper_limit
    if wfi.request['RequestType'] in ['ReDigi']:
        lumi_upper_limit[output] = -1

if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
    print wfo.name,"has big lumisections"
    print json.dumps(events_per_lumi, indent=2)
    ## hook for rejecting the request ?
    if not bypass_checks:
        assistance_tags.add('biglumi')
        is_closing = False

any_presence = {}
for output in wfi.request['OutputDatasets']:
    any_presence[output] = getDatasetPresence(url, output, vetoes=[])

## custodial copy
custodial_locations = {}
custodial_presences = {}
for output in wfi.request['OutputDatasets']:
    custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
    custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)
    if not custodial_locations[output]:
        custodial_locations[output] = []

## presence in phedex
phedex_presence = {}
for output in wfi.request['OutputDatasets']:
    phedex_presence[output] = phedexClient.getFileCountDataset(url, output )

out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
size_worth_checking = sum([getDatasetSize(out)/1024. for out in out_worth_checking ]) ## size in TBs of all outputs
if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
    print wfo.name,"has not all custodial location"
    print json.dumps(custodial_locations, indent=2)

##########
## hook for making a custodial replica ?
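## The block below picks the tape (custodial) destination by falling through an
## ordered list of candidates: an existing custodial location of a sibling
## output, the campaign configuration, the parent dataset's custodial site, and
## finally a randomly picked SE; each candidate is vetoed when the site cannot
## absorb size_worth_checking TB. Condensed, hypothetical rendering of that chain:
def pick_custodial_site(ordered_candidates, storage, needed_tb):
    ## ordered_candidates: SE names (or None), in decreasing preference
    ## storage: remaining tape space per SE in TB, like SI.storage
    for site in ordered_candidates:
        if site and float(storage.get(site, 0)) >= needed_tb:
            return site
    return None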
custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} 
dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: print "These %d files are missing in phedex"%(len(missing_phedex)) print "\n".join( missing_phedex ) if missing_dbs: print "These %d files are missing in dbs"%(len(missing_dbs)) print "\n".join( missing_dbs ) #if not bypass_checks: ## I don't think we can by pass this is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing or bypass_checks: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? 
## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and bypass_by_mcm: ## shoot large on all prepids for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that add ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') ## that means there is something that needs to be 
done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec: #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, 
list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def injector(url, options, specific): mlock = moduleLock() if mlock(): return use_mcm = True up = componentInfo(soft=['mcm','wtc','jira'] ) if not up.check(): return use_mcm = up.status['mcm'] UC = unifiedConfiguration() transform_keywords = UC.get('convert_to_stepchain') workflows = getWorkflows(url, status=options.wmstatus, user=options.user) for user in UC.get("user_rereco"): workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="ReReco")) for user in (options.user_relval.split(',') if options.user_relval else UC.get("user_relval")) : workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="TaskChain")) for user in (options.user_storeresults.split(',') if options.user_storeresults else UC.get("user_storeresults")) : workflows.extend( getWorkflows(url, status=options.wmstatus, user=user, rtype="StoreResults")) print len(workflows),"in line" cannot_inject = set() to_convert = set() status_cache = defaultdict(str) ## browse for assignment-approved requests, browsed for ours, insert the diff for wf in workflows: if specific and not specific in wf: continue exists = session.query(Workflow).filter(Workflow.name == wf ).first() if not exists: wfi = workflowInfo(url, wf) ## check first that there isn't related here with something valid can_add = True ## first try at finding a match familly = session.query(Workflow).filter(Workflow.name.contains(wfi.request['PrepID'])).all() if not familly: pids = wfi.getPrepIDs() req_familly = [] for pid in pids: req_familly.extend( getWorkflowById( url, pid, details=True) ) familly = [] print len(req_familly),"members" for req_member in req_familly: #print "member",req_member['RequestName'] owfi = workflowInfo(url, req_member['RequestName'], request=req_member) other_pids = owfi.getPrepIDs() if set(pids) == set(other_pids): ## this is a real match familly.extend( session.query(Workflow).filter(Workflow.name == req_member['RequestName']).all() ) for lwfo in familly: if lwfo: ## we have it already if not lwfo.status in ['forget','trouble','forget-unlock','forget-out-unlock']: wfi.sendLog('injector',"Should not put %s because of %s %s"%( wf, lwfo.name,lwfo.status )) sendLog('injector',"Should not put %s because of %s %s"%( wf, lwfo.name,lwfo.status ), level='critical') print "Should not put",wf,"because of",lwfo.name,lwfo.status cannot_inject.add( wf ) can_add = False ## add a check on validity of input datasets _,prim,par,sec = wfi.getIO() for d in list(prim)+list(par)+list(sec): if not d in status_cache: status_cache[d] = getDatasetStatus(d) if status_cache[d] != 'VALID': wfi.sendLog('injector',"One of the input is not VALID. %s : %s"%( d, status_cache[d])) sendLog('injector',"One of the input of %s is not VALID. 
%s : %s"%( wf, d, status_cache[d]), level='critical') can_add = False #else: # ##make sure that all blocks get closed # closeAllBlocks(url, d) ## check for any file in phedex, to verify existence _,ph_files,_,_ = getDatasetFiles(url, d) if not ph_files and not ( 'StoreResults' == wfi.request.setdefault('RequestType',None) ): wfi.sendLog('injector',"One of the input has no file in phedex: %s" % d ) sendLog('injector',"One of the input has no file in phedex: %s"% d, level='critical') can_add = False ### ban some workflow that you don't like anymore #outputs = wfi.request['OutputDatasets'] if not can_add: continue ## temporary hack to transform specific taskchain into stepchains good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = transform_keywords) #good_for_stepchain = wfi.isGoodToConvertToStepChain( keywords = None) ## match keywords and technical constraints if (not options.no_convert) and good_for_stepchain and not wfi.isRelval(): to_convert.add( wf ) wfi.sendLog('injector','Transforming %s TaskChain into StepChain'%wf) sendEmail('convertion to stepchain','Transforming %s TaskChain into StepChain'%wf) wfi.sendLog('injector',"considering %s"%wf) new_wf = Workflow( name = wf , status = options.setstatus, wm_status = options.wmstatus) session.add( new_wf ) session.commit() time.sleep(0.5) else: #print "already have",wf pass if cannot_inject: #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject))) sendLog('injector','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject)), level='critical') for wf in to_convert: os.system('./Unified/rejector.py --clone --to_step --comments \"Transform to StepChain\" %s'% wf) ## passing a round of invalidation of what needs to be invalidated if use_mcm and (options.invalidate or True): invalidator(url) no_replacement = set() #print "getting all transfers" #all_transfers=session.query(Transfer).all() #print "go!" ## pick up replacements for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all(): print wf.name if specific and not specific in wf.name: continue print wf.name wfi = workflowInfo(url, wf.name ) wl = wfi.request #getWorkLoad(url, wf.name) familly = getWorkflowById( url, wl['PrepID'] ) true_familly = [] for member in familly: if member == wf.name: continue fwl = getWorkLoad(url , member) if options.replace: if member != options.replace: continue else: if fwl['RequestDate'] < wl['RequestDate']: continue if fwl['RequestType']=='Resubmission': continue if fwl['RequestStatus'] in ['None',None,'new']: continue if fwl['RequestStatus'] in ['rejected','rejected-archived','aborted','aborted-archived']: continue true_familly.append( fwl ) if len(true_familly)==0: #sendLog('injector','%s had no replacement'%wf.name, level='critical') if wfi.isRelval(): #wfi.sendLog('injector','the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this.') wfi.sendLog('injector','the workflow was found in trouble with no replacement. As a relval, there is no clean way to handle this. Setting forget') wf.status = 'forget' session.commit() else: wfi.sendLog('injector','the workflow was found in trouble with no replacement') no_replacement.add( wf.name ) continue else: wfi.sendLog('injector','the workflow was found in trouble and has a replacement') print wf.name,"has",len(familly),"familly members" print wf.name,"has",len(true_familly),"true familly members" ##we cannot have more than one of them !!! 
pick the last one if len(true_familly)>1: #sendEmail('multiple wf','please take a look at injector for %s'%wf.name) sendLog('injector','Multiple wf in line, will take the last one for %s \n%s'%( wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical') for fwl in true_familly[-1:]: member = fwl['RequestName'] new_wf = session.query(Workflow).filter(Workflow.name == member).first() if not new_wf: sendLog('injector',"putting %s as replacement of %s"%( member, wf.name)) status = 'away' if fwl['RequestStatus'] in ['assignment-approved']: status = 'considered' new_wf = Workflow( name = member, status = status, wm_status = fwl['RequestStatus']) wf.status = 'forget' session.add( new_wf ) else: if new_wf.status == 'forget': continue sendLog('injector',"getting %s as replacement of %s"%( new_wf.name, wf.name )) wf.status = 'forget' for tr in session.query(TransferImp).filter( TransferImp.workflow_id == wf.id).all(): ## get all transfer working for the old workflow existing = session.query(TransferImp).filter( TransferImp.phedexid == tr.phedexid).filter( TransferImp.workflow_id == new_wf.id).all() tr.active = False ## disable the old one if not existing: ## create the transfer object for the new dependency tri = TransferImp( phedexid = tr.phedexid, workflow = new_wf) session.add( tri ) session.commit() ## don't do that automatically #wf.status = 'forget' session.commit() if no_replacement: #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement))) sendLog('injector','workflow with no replacement\n%s \n are dangling there'% ( '\n'.join(no_replacement)), level='critical')
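## Hedged condensation of the replacement-picking filter in injector above:
## among workflows sharing a PrepID, a valid replacement must be newer than the
## troubled one, must not be an ACDC (Resubmission), and must not be dead or
## still 'new'; when several qualify, the last one wins (hypothetical helper):
def pick_replacement(family_workloads, troubled):
    candidates = []
    for fwl in family_workloads:
        if fwl['RequestName'] == troubled['RequestName']: continue
        if fwl['RequestDate'] < troubled['RequestDate']: continue
        if fwl['RequestType'] == 'Resubmission': continue
        if fwl['RequestStatus'] in ['None', None, 'new']: continue
        if fwl['RequestStatus'] in ['rejected','rejected-archived','aborted','aborted-archived']: continue
        candidates.append(fwl)
    return candidates[-1] if candidates else None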
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock(): return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.new: ## get all in running and check ## you want to intersect with what is completed ! if options.strict: completed_wfi = getWorkflows(url, status='completed') for wfo in session.query(Workflow).filter(Workflow.status == 'away').all(): if wfo.name in completed_wfi: wfs.append( wfo ) else: print wfo.name,"is not completed" sendLog('checkor','%s is not completed'%( wfo.name)) else: wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) if options.current: ## recheck those already there, probably to just pass them along wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.old: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign ## retrieve bypass and onhold configuration bypasses = [] forcings = [] overrides = getForceCompletes() holdings = [] for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('prozober','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: holdings.extend( json.loads(open(holding_file).read())) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: forcings = mcm.get('/restapi/requests/forcecomplete') if forcings: sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings))) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. 
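## Sketch of the expected-event correction applied further below: when a request
## chains tasks carrying a FilterEfficiency, the event count expected downstream
## shrinks multiplicatively (hypothetical helper over a request dict):
def expected_events_after_filters(request, base_expected, max_tasks=20):
    expected = float(base_expected)
    for i in range(1, max_tasks):
        task = request.get('Task%d'%i)
        if task and 'FilterEfficiency' in task:
            expected *= float(task['FilterEfficiency'])
    return int(expected)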
sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) print len(wfs),"to consider, pausing for",sleep_time max_per_round = UC.get('max_per_round').get('checkor',None) if max_per_round and not spec: wfs = wfs[:max_per_round] for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break pids = wfi.getPrepIDs() force_by_mcm = False force_by_user = False for force in forcings: if force in pids: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force)) bypass_checks = True force_by_mcm = True break for user in overrides: for force in overrides[user]: if force in wfo.name: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user)) bypass_checks = True force_by_user = True break tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco campaigns = {} expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] ) for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in 
wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] forced_already=False acdc_bads = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if 'OriginalRequestName' in member and member['OriginalRequestName'] != wfo.name: continue if member['RequestStatus'] == None: continue if not set(member['OutputDatasets']).issubset( set(expected_outputs)): if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']: ##this is not good at all wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] ) acdc_bads.append( member['RequestName'] ) is_closing = False assistance_tags.add('manual') continue if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') if (force_by_mcm or force_by_user) and not forced_already: wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name) wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False) forceComplete(url, wfi) forced_already=True else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') if acdc_bads: sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) )) ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = wfi.request['Task1']['RequestNumEvents'] for i in range(1,20): if 'Task%d'%i in wfi.request: ## this is wrong ibsolute if 'FilterEfficiency' in wfi.request['Task%d'%i]: event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency']) event_expected = int(event_expected) fractions_pass = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. 
if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) fractions_pass[output] = 0.95 c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): possible_recoveries = wfi.getRecoveryDoc() if possible_recoveries == []: wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name) bypass_checks = True else: wfi.sendLog('checkor','%s is not completed \n%s \n%s'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) ## hook for creating automatically ACDC ? if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = campaigns[output] #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? 
if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence = {} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1024. for out in out_worth_checking ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"does not have all custodial locations" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous: it assumes a single parent dataset parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place, abort"
#sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWAODSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"does not have all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex), "\n".join( missing_phedex ))) if missing_dbs: wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs), "\n".join( missing_dbs ))) #if not bypass_checks: ## I don't think we can bypass this is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## needs an eye kept on it
assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing or bypass_checks: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: print "it was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) now = time.gmtime() rec['timestamp'] = time.mktime(now) rec['updated'] = time.asctime(now)+' (GMT)' ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to close it out again and assuming it went through" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to close out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and force_by_mcm: ## shoot large on all prepids, on closing the wf for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: ## full known list #recovering # has
active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that has ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') ## that means there is something that needs to be done: acdc, lumi invalidation, custodial, you name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issues in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else:
print "current status is",wfo.status,"not changing to anything" #open('already_notified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec: #sendEmail("fresh assistance status available","Fresh statuses are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def transferor(url, specific=None, talk=True, options=None): if userLock(): return mlock = moduleLock() if mlock(): return use_mcm = True up = componentInfo(soft=['mcm', 'wtc', 'jira']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() #NLI = newLockInfo() #if not NLI.free(): return LI = lockInfo() if not LI.free(): return mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_transfered = len( session.query(Workflow).filter(Workflow.status == 'staging').all()) #being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance-')).filter( ~Workflow.status.contains('custodial')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0, max_to_handle - being_handled) allowed_to_transfer = max(0, max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer" else: print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer" print "... done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] max_per_round = UC.get('max_per_round').get('transferor', None) print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) all_to_include = session.query(Workflow).filter( Workflow.status.startswith('considered')).all() if len(cache) > 2000: max_to_include = max_per_round random.shuffle(cache) ## randomize first by wf name cache = sorted(cache, key=lambda r: r['RequestPriority'], reverse=True) ## order by prio highest = [r['RequestName'] for r in cache[:max_to_include]] all_to_include = [wfo for wfo in all_to_include if wfo.name in highest] print "limiting what to consider to", max_to_include, "because there is too much stuff going on. Got", len( all_to_include) for wfo in all_to_include: print "\t", wfo.name if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" transfers_per_sites = defaultdict(int) input_sizes = defaultdict(float) ignored_input_sizes = defaultdict(float) input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority = None min_transfer_priority = None print "getting all wf in staging ..." 
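## Bookkeeping for what is already staging: the loop below counts one
## staging slot per whitelisted site, adds up the primary input sizes
## (stuck transfers are tallied separately so they do not consume the
## budget), and tracks the min/max RequestPriority of everything
## already in transfer, which later decides whether a new workflow may
## go over budget.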
#stucks = json.loads(open('%s/stuck_transfers.json'%monitor_pub_dir).read()) stucks = json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)) for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) print wfo.name, "staging" (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() blocks = wfh.getBlocks() for prim in primary: ds_s = dss.get(prim, blocks=blocks) if prim in stucks: wfh.sendLog( 'transferor', "%s appears stuck, so not counting it %s [GB]" % (prim, ds_s)) ignored_input_sizes[prim] = max(ds_s, ignored_input_sizes[prim]) else: input_sizes[prim] = max(ds_s, input_sizes[prim]) wfh.sendLog('transferor', "%s needs %s [GB]" % (wfo.name, ds_s)) if in_transfer_priority == None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort(key=lambda i: i[1]) print "\n".join(map(str, ignored_values)) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort(key=lambda i: i[1]) print "\n".join(map(str, considered_values)) except Exception as e: print "trying to print the summary of input size" print str(e) print "... done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority print "transfers per sites" print json.dumps(transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." input_blocks = {} for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() blocks = wfh.getBlocks() input_blocks[wfo.name] = blocks for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get(prim, blocks=blocks) input_sizes[prim] = max(prim_size, input_sizes[prim]) primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle(wfs_and_wfh) # Sort smallest transfers first; allows us to transfer as many as possible workflows. 
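## The helper below is only wired into the commented-out sort call;
## with python2 cmp semantics and reverse=True it orders by priority
## first and, at equal priority, puts the smaller primary input first
## so that more workflows fit under the transfer budget. The live sort
## a few lines down orders by priority alone.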
def prio_and_size(i, j): if int(i[1].request['RequestPriority']) == int( j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0))) else: return cmp(int(i[1].request['RequestPriority']), int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) if min_transfer_priority == None or in_transfer_priority == None: print "nothing is lining up for transfer" sendLog( "transferor", "No request in staging, using first request to set priority limit") if len(wfs_and_wfh): min_transfer_priority = wfs_and_wfh[0][1].request[ 'RequestPriority'] in_transfer_priority = wfs_and_wfh[0][1].request['RequestPriority'] else: return cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer" % ( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load" % ( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoretical system time in transfer" % ( st_in_transfer_already) print "%15.4f [h] worth of theoretical system time is the current requested transfer load" % ( st_to_transfer) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## a quarter of the free space in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transferred" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB is the available limit" % transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset.
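## (max_priority, filled per primary input dataset in the loop below,
## keeps the highest RequestPriority of any workflow reading that
## dataset; it is what the replica requests are later bucketed on.)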
max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = defaultdict(float) went_over_budget = False destination_cache = {} no_goes = set() if max_per_round and not spec: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo, wfh) in wfs_and_wfh: print wfo.name, "to be transferred with priority", wfh.request[ 'RequestPriority'] if wfh.request['RequestStatus'] != 'assignment-approved': if wfh.request['RequestStatus'] in [ 'aborted', 'rejected', 'rejected-archived', 'aborted-archived' ]: if wfh.isRelval(): wfo.status = 'forget' else: wfo.status = 'trouble' ## so that we look for a replacement else: wfo.status = 'away' wfh.sendLog( 'transferor', '%s in status %s, setting %s' % (wfo.name, wfh.request['RequestStatus'], wfo.status)) continue (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() blocks = input_blocks.get(wfo.name, wfh.getBlocks()) if blocks: print "Reading only", len(blocks), "blocks in input" this_load = sum([dss.get(prim, blocks=blocks) for prim in primary]) no_budget = False if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over budget.") else: wfh.sendLog('transferor', "Transfer will go over budget.") wfh.sendLog( 'transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit" % (this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority != None and min_transfer_priority != None: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over budget" % (wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog( 'transferor', "%s minimum priority %s < %s : stop" % (min_transfer_priority, wfh.request['RequestPriority'], in_transfer_priority)) no_budget = True ## throttle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add(wfo.name) allowed_secondary = {} overide_parameters = {} check_secondary = (not wfh.isRelval()) output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: overide_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'transferor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('transferor', 'These data tiers %s are not allowed in %s' % (','.join(banned_tier), wfo.name), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): msg = '%s is not an allowed secondary' % ( ', '.join(set(secondary) - set(allowed_secondary.keys()))) wfh.sendLog('transferor', msg) critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format( wfh.getPrepIDs()[0]) sendLog('transferor', critical_msg, level='critical') if not options.go: no_go =
True for sec in secondary: if sec in allowed_secondary: overide_parameters.update(allowed_secondary[sec]) if 'SiteWhitelist' in overide_parameters: sites_allowed = list( set(sites_allowed) & set(overide_parameters['SiteWhitelist'])) wfh.sendLog( 'transferor', 'Intersecting with the overriding whitelist parameters, allowed sites become {}' .format(sites_allowed)) if no_go: continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog( 'transferor', " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s" % (max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_transfer)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s" % (max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue # break ## try this for a while to make things faster ## the site white list considers site, campaign, memory and core information if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary) + list(parent) + list(secondary): LI.lock(dataset, reason='staging') if not sites_allowed: wfh.sendLog('transferor', "not possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor', "%s has no possible sites to run at" % (wfo.name), level='critical') continue can_go = True staging = False allowed = True primary_destinations = set() if primary: copies_needed_from_CPUh, CPUh = wfh.getNCopies() if talk: print wfo.name, 'reads', ', '.join(primary), 'in primary' ## chope the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add(wfo.id) max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority'])) wfh.sendLog( 'transferor', "Would make %s from cpu requirement %s" % (copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog( 'transferor', "Maxed to %s by campaign configuration %s" % (copies_needed, wfh.request['Campaign'])) if blocks: print "limiting to blocks", "\n".join(sorted(blocks)) ### new ways of making the whole thing destinations, all_block_names = getDatasetDestinations( url, 
prim, within_sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [ site for (site, info) in destinations.items() if info['completion'] == 100 and info['data_fraction'] == 1 ] ## the rest is places it is going to be #prim_destination = [site for site in destinations.keys() if not site in prim_location] prim_destination = [ site for (site, info) in destinations.items() if info['data_fraction'] == 1 and info['completion'] != 100 ] ## veto the site with no current disk space, for things that are not relval prim_destination = [ site for site in prim_destination if (SI.disk[site] or wfh.isRelval()) ] if len(prim_location) >= copies_needed: wfh.sendLog( 'transferor', "The input is all fully in place at %s sites %s" % (len(prim_location), sorted(prim_location))) continue copies_needed = max(0, copies_needed - len(prim_location)) wfh.sendLog( 'transferor', "Counting existing copies ; now need %s" % copies_needed) copies_being_made = [ sum([ info['blocks'].keys().count(block) for site, info in destinations.items() if site in prim_destination ]) for block in all_block_names ] latching_on_transfers = set() [ latching_on_transfers.update(info['blocks'].values()) for site, info in destinations.items() if site in prim_destination ] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] prim_to_distribute = [ site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination ] ## take out the ones that cannot receive transfers potential_destinations = len(prim_to_distribute) #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer] prim_to_distribute = [ site for site in prim_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] ## do we want to restrict transfers if the amount of site in vetoe are too large ? wfh.sendLog( 'transferor', "Could be going to: %s" % sorted(prim_to_distribute)) if not prim_to_distribute or any([ transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute ]): ## means there is openings let me go print "There are transfer slots available:", [ (site, transfers_per_sites[site]) for site in prim_to_distribute ] else: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over transfer slots available" % (wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s per site at a time. 
Going overboard for %s" % (max_staging_per_site, sorted([ site for site in prim_to_distribute if transfers_per_sites[site] >= max_staging_per_site ]))) if not options.go: allowed = False break for latching in latching_on_transfers: existings = session.query(TransferImp).filter( TransferImp.phedexid == int(latching)).filter( TransferImp.workflow_id == wfo.id).all() if not existings: tri = TransferImp(phedexid=int(latching), workflow=wfo) print "adding", wfo.id, "with phedexid", latching session.add(tri) else: for existing in existings: existing.active = True session.flush() can_go = False transfer_sizes[prim] = max(this_load, transfer_sizes[prim]) staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0, copies_needed - min(copies_being_made)) wfh.sendLog( 'transferor', "Counting the copies being made ; then need %s" % copies_needed) if copies_needed == 0: wfh.sendLog( 'transferor', "The input is either fully in place or getting in full somewhere with %s" % latching_on_transfers) can_go = True continue elif len(prim_to_distribute) == 0: wfh.sendLog( 'transferor', "We are going to need extra copies of %s, but no destinations seems available" % (prim)) sendLog( 'transferor', "We are going to need extra copies of %s, but no destinations seems available" % (prim), level='critical') print json.dumps(prim_to_distribute, indent=2) print json.dumps(prim_location, indent=2) print json.dumps(prim_destination, indent=2) prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer ] prim_to_distribute = [ site for site in prim_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] print "changed to" print json.dumps(prim_to_distribute, indent=2) if len( prim_to_distribute ) > 0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk andpoint ? #tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops, sizes = getDatasetChops( prim, chop_threshold=options.chopsize, only_blocks=blocks) spreading = distributeToSites(chops, prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges, sizes=sizes) ## prune the blocks/destination that are already in the making, so that subscription don't overlap for site in spreading: for block in list(spreading[site]): if site in destinations and block in destinations[ site]['blocks'].keys(): ## prune it spreading[site].remove(block) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog( 'transferor', 'cannot send %s to any site, it cannot fit anywhere' % prim, level='critical') wfh.sendLog( 'transferor', "cannot send to any site. 
%s cannot seem to fit anywhere" % (prim)) staging = False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site] = blocks else: spreading[site] = [prim] transfer_sizes[prim] = max(this_load, transfer_sizes[prim]) can_go = False wfh.sendLog( 'transferor', "selected CE destinations %s" % (sorted(spreading.keys()))) for (site, items) in spreading.items(): all_transfers[site].extend(items) transfers_per_sites[site] += 1 primary_destinations.add(site) else: can_go = False allowed = False if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[ wfh.request['Campaign']]['SecondaryLocation'] if 'SecondaryLocation' in overide_parameters: override_sec_destination = overide_parameters[ 'SecondaryLocation'] print wfo.name, 'reads', ', '.join(secondary), 'in secondary' for sec in secondary: workflow_dependencies[sec].add(wfo.id) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec], _ = getDatasetDestinations( url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = set( [SI.CE_to_SE(site) for site in sites_allowed]) destinations = dict([ (k, v) for (k, v) in destination_cache[sec].items() if k in se_allowed ]) ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [ destinations.pop(site) for (site, info) in destinations.items() if info['data_fraction'] < 0.9 ] print sec, json.dumps(destinations, indent=2) sec_location = [ site for (site, info) in destinations.items() if info['completion'] >= 95 ] sec_destination = [ site for site in destinations.keys() if not site in sec_location ] ## this is in SE else: ## old style presence = getDatasetPresence(url, sec) sec_location = [ site for site, pres in presence.items() if pres[1] > 90. ] ## more than 90% of the minbias at sites subscriptions = listSubscriptions(url, sec) sec_destination = [site for site in subscriptions] ## how do we make unified understand that it has to wait for the secondary if sec_destination is non-empty ? #sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in sec_location ] #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [ site for site in sec_to_distribute if not SI.CE_to_SE(site) in sec_destination ] presitespace_sec_to_distribute = copy.deepcopy( sec_to_distribute) #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] #sec_to_distribute = [site for site in sec_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer] sec_to_distribute = [ site for site in sec_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] ## at this point you have a problem if len(sec_to_distribute) == 0 and len( presitespace_sec_to_distribute): sendLog( 'transferor', '%s is getting no possible destinations because of lack of space. To be decided what to do in general' % (sec), level='critical')
if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list( set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog( 'transferor', "the dataset %s could be removed from %s" % (sec, not_needed_anymore)) sec_to_distribute = list( set(sec_to_distribute) & set(override_sec_destination)) if len(sec_to_distribute) > 0: print "secondary could go to", sorted(sec_to_distribute) sec_size = dss.get(sec) for site in sec_to_distribute: site_se = SI.CE_to_SE(site) if (SI.disk[site_se] * 1024.) > sec_size or wfh.isRelval(): wfh.sendLog('transferor', 'Sending %s to %s' % (sec, site)) all_transfers[site].append(sec) can_go = False else: print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[ site_se] * 1024, "GB need", sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog( 'transferor', '%s is too big (%s) for %s (%s). %s will not be able to run there.' % (sec, sec_size, site_se, SI.disk[site_se] * 1024, wfo.name), level='critical') wfh.sendLog( 'transferor', '%s is too big (%s) for %s (%s). will not be able to run there.' % (sec, sec_size, site_se, SI.disk[site_se] * 1024)) else: ## this is bad overall print "the secondary input does not have to be sent to the site" ## is it possible to do something more ? if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog( 'transferor', "latches on existing transfers, and nothing else, setting staging" ) wfo.status = 'staging' needs_transfer += 1 else: wfh.sendLog( 'transferor', "should just be assigned now to %s" % sorted(sites_allowed)) wfo.status = 'staged' passing_along += 1 wfh.sendLog('transferor', "setting %s status to %s" % (wfo.name, wfo.status)) #session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog( 'transferor', "setting %s status to %s" % (wfo.name, wfo.status)) #session.commit() wfh.sendLog('transferor', "needs a transfer") needs_transfer += 1 passing_along += 1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n" + "\n".join(sorted(no_goes)), level='critical') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id = -1 wf_id_in_prestaging = set() for (site, items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## sites that do not want input datasets #if site in SI.sites_veto_transfer: # print site,"does not want transfers" # continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.
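## What follows assembles the per-site replica requests: items are
## de-duplicated, blocks whose full dataset is shipped anyway are
## dropped, and the requests are bucketed into PhEDEx priorities from
## the max workflow priority recorded per dataset.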
## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for" % (site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [ block for block in blocks if not block.split('#')[0] in datasets ] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks' % len(blocks) details_text += '\n\t%d needed blocks for %s' % ( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets' % len(datasets) details_text += '\n\t%s' % sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to", site, "(CE)", site_se, "(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y', 'yes', 'go']: continue transfered_items = defaultdict(set) if execute: priority = 'normal' cds = [ ds for ds in set(datasets + block_datasets) if ds in max_priority ] ## bucketize the transfers by priority of workflows prioritized_items = defaultdict(set) for item in items_to_transfer: d = item.split('#')[0] p = max_priority.get(d, 80000) q = 'normal' if p > 100000: q = 'reserved' elif p < 70000: q = 'low' prioritized_items[q].add(item) for priority, items in prioritized_items.items(): result = makeReplicaRequest(url, site_se, list(items), 'prestaging', priority=priority, approve=True) if result: these_transfers = [ o['id'] for o in result['phedex']['request_created'] ] #phedexids.extend( these_transfers ) for ph in these_transfers: transfered_items[ph].update(items) else: sendLog( 'transferor', 'Could not make a replica request for items %s to site %s' % (items, site_se), level='critical') #result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority, approve=True) #phedexids = [o['id'] for o in result['phedex']['request_created']]: #else: # #result= {'phedex':{'request_created' : []}} # phedexids = [] # fake_id-=1 if not transfered_items: sendLog( 'transferor', 'Could not make a replica request for items %s to site %s' % (items_to_transfer, site), level='critical') continue for phedexid, items in transfered_items.items(): print phedexid, "transfer created" for transfering in list( set(map(lambda it: it.split('#')[0], items))): for wfid in workflow_dependencies[transfering]: new_transfer = session.query(TransferImp).filter( TransferImp.phedexid == int(phedexid)).filter( TransferImp.workflow_id == wfid).first() if not new_transfer: new_transfer = TransferImp( phedexid=phedexid, workflow=session.query(Workflow).get(wfid)) session.add(new_transfer) else: new_transfer.active = True wf_id_in_prestaging.add(wfid) #session.commit() for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status != 'staging': if execute: tr_wf.status = 'staging' if talk: print "setting", tr_wf.name, "to staging" #session.commit() ## one big session commit at the end that everything went fine session.commit()
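## ------------------------------------------------------------------
## Minimal sketch of the priority bucketing used just above when the
## replica requests are issued; the thresholds are the ones hard-coded
## in transferor (80000 is the default when a dataset carries no
## recorded priority):
def _phedex_queue_sketch(dataset, max_priority):
    p = max_priority.get(dataset, 80000)
    if p > 100000:
        return 'reserved'
    elif p < 70000:
        return 'low'
    return 'normal'
## e.g. _phedex_queue_sketch('/A/B/RAW', {'/A/B/RAW': 110000}) -> 'reserved'
## ------------------------------------------------------------------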
def htmlor(caller=""): cache = getWorkflows("cmsweb.cern.ch", "assignment-approved", details=True) cache.extend(getWorkflows("cmsweb.cern.ch", "running-open", details=True)) cache.extend(getWorkflows("cmsweb.cern.ch", "running-closed", details=True)) def getWL(wfn): cached = filter(lambda d: d["RequestName"] == wfn, cache) if cached: wl = cached[0] else: wl = getWorkLoad("cmsweb.cern.ch", wfn) return wl def wfl(wf, view=False, p=False, ms=False, within=False, ongoing=False, status=False, update=False): wfn = wf.name wfs = wf.wm_status wl = None pid = None pids = filter(lambda seg: seg.count("-") == 2, wf.name.split("_")) if len(pids): pids = pids[:1] pid = pids[0] if not pids: wl = getWL(wf.name) pids = getPrepIDs(wl) pid = pids[0] text = ", ".join( [ # wfn, '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a>' % (wfn, wfn), "(%s) <br>" % wfs, ] ) text += ", ".join( [ '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts</a>' % wfn, '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>' % wfn, '<a href="https://cmsweb.cern.ch/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>' % wfn, '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>' % wfn, '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>' % wfn, '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>' % wfn, '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>' % wfn, '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>' % pid, '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank">pv</a>' % wfn, '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>' % wfn, '<a href="closeout.html#%s" target="_blank">clo</a>' % wfn, '<a href="statuses.html#%s" target="_blank">st</a>' % wfn, '<a href="https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>' % wfn, ] ) if within and (not view or wfs == "completed"): wl = getWL(wfn) dataset = None if "InputDataset" in wl: dataset = wl["InputDataset"] if "Task1" in wl and "InputDataset" in wl["Task1"]: dataset = wl["Task1"]["InputDataset"] if dataset: text += ", ".join( [ "", "<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>" % dataset, "<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>" % dataset, "<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>" % dataset, "<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>" % dataset, ] ) if p: cached = filter(lambda d: d["RequestName"] == wfn, cache) if cached: wl = cached[0] else: wl = getWorkLoad("cmsweb.cern.ch", wfn) text += ", (%s)" % (wl["RequestPriority"]) pass if pid: if ms: mcm_s = json.loads( os.popen( "curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure" % pid ).read() )[pid] text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>' % ( pid, mcm_s, ) else: text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>' % (pid) text += ( ', <a 
href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>' % (pid) ) if status: if wf.status.startswith("assistance"): text += ', <a href="assistance.html#%s" target="_blank">assist</a>' % wfn text += " : %s " % (wf.status) if view and wfs != "acquired": text += ( '<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>' % (wfn.replace("_", "/"), wfn.replace("_", "/")) ) if ongoing: text += ( '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a>' % (wfn, wfn) ) if ongoing: date1 = time.strftime("%Y-%m-%d+%H:%M", time.gmtime(time.mktime(time.gmtime()) - (15 * 24 * 60 * 60))) date2 = time.strftime("%Y-%m-%d+%H:%M", time.gmtime()) text += ( '<a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#table=Jobs&date1=%s&date2=%s&sortby=site&task=wmagent_%s">dashb</a>' % (date1, date2, wfn) ) text += "<hr>" return text def phl(phid): text = ", ".join( [ str(phid), '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>' % phid, '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>' % phid, ] ) return text def ol(out): return '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>' % (out, out) def lap(comment): l = time.mktime(time.gmtime()) spend = l - lap.start lap.start = l print "Spend %d [s] for %s" % (spend, comment) lap.start = time.mktime(time.gmtime()) ## start to write it # html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w') html_doc = open("/afs/cern.ch/user/c/cmst2/www/unified/index.html", "w") print "Updating the status page ..." if not caller: try: # caller = sys._getframe(1).f_code.co_name caller = sys.argv[0].split("/")[-1].replace(".py", "") print "caller is" print caller except Exception as es: caller = "none found" print "not getting frame" print str(es) html_doc.write( """ <html> <head> <META HTTP-EQUIV="refresh" CONTENT="900"> <script type="text/javascript"> function showhide(id) { var e = document.getElementById(id); e.style.display = (e.style.display == 'block') ? 
'none' : 'block'; } </script> </head> <body> Last update on %s(CET), %s(GMT), <a href=logs/ target=_blank>logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/ target=_blank>prod mon</a> <a href=https://cmsweb.cern.ch/wmstats/index.html target=_blank>wmstats</a> <a href=http://t3serv001.mit.edu/~cmsprod/IntelROCCS/Detox/SitesInfo.txt target=_blank>detox</a> <a href=locked.html>space</a> <a href=logs/subscribor/last.log target=_blank>blocks</a> <a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a> <a href=logs/addHoc/last.log>add-hoc op</a> created from <b>%s <a href=logs/last_running>last running</a></b><br><br> """ % (time.asctime(time.localtime()), time.asctime(time.gmtime()), caller) ) text = "" count = 0 count_by_campaign = defaultdict(lambda: defaultdict(int)) for wf in session.query(Workflow).filter(Workflow.status == "considered").all(): wl = getWL(wf.name) count_by_campaign[wl["Campaign"]][int(wl["RequestPriority"])] += 1 text += "<li> %s </li> \n" % wfl(wf, p=True) count += 1 text_by_c = "" for c in count_by_campaign: text_by_c += "<li> %s (%d) : " % (c, sum(count_by_campaign[c].values())) for p in sorted(count_by_campaign[c].keys()): text_by_c += "%d (%d), " % (p, count_by_campaign[c][p]) text_by_c += "</li>" html_doc.write( """ Workflows next to handle (%d) <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank"> batches</a> <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a> <a href="javascript:showhide('considered')">[Click to show/hide]</a> <br> <div id="considered" style="display:none;"> <ul> <li> By workflow (%d) </li><a href="javascript:showhide('considered_bywf')">[Click to show/hide]</a><div id="considered_bywf" style="display:none;"> <ul> %s </ul></div> <li> By campaigns (%d) </li><a href="javascript:showhide('considered_bycamp')">[Click to show/hide]</a><div id="considered_bycamp" style="display:none;"> <ul> %s </ul></div> </ul> </div> """ % (count, count, text, len(count_by_campaign), text_by_c) ) lap("done with considered") text = "" count = 0 count_by_campaign = defaultdict(lambda: defaultdict(int)) for wf in session.query(Workflow).filter(Workflow.status == "staging").all(): wl = getWL(wf.name) count_by_campaign[wl["Campaign"]][int(wl["RequestPriority"])] += 1 text += "<li> %s </li> \n" % wfl(wf, within=True) count += 1 text_by_c = "" for c in count_by_campaign: text_by_c += "<li> %s (%d) : " % (c, sum(count_by_campaign[c].values())) for p in sorted(count_by_campaign[c].keys()): text_by_c += "%d (%d), " % (p, count_by_campaign[c][p]) text_by_c += "</li>" html_doc.write( """ Workflows waiting in staging (%d) <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a> <a href="javascript:showhide('staging')">[Click to show/hide]</a> <br> <div id="staging" style="display:none;"> <ul> <li> By workflow (%d) </li><a href="javascript:showhide('staging_bywf')">[Click to show/hide]</a><div id="staging_bywf" style="display:none;"> <ul> %s </ul></div> <li> By campaigns (%d) </li><a href="javascript:showhide('staging_bycamp')">[Click to show/hide]</a><div id="staging_bycamp" style="display:none;"> <ul> %s </ul></div> </ul> </div> """ % (count, count, text, len(count_by_campaign), text_by_c) ) lap("done with staging") text = "" count = 0 for ts in session.query(Transfer).all():
stext = ( '<li> %s serves </li><a href="javascript:showhide(\'%s\')">[show/hide] relevant workflows</a> <div id="%s" style="display:none;"><ul>' % (phl(ts.phedexid), ts.phedexid, ts.phedexid) ) hide = True for pid in ts.workflows_id: w = session.query(Workflow).get(pid) hide &= w.status != "staging" if w.status in ["considered", "staging", "staged"]: stext += "<li> %s </li>\n" % (wfl(w, status=True)) stext += "</ul></div>\n" if hide: # text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid) pass else: count += 1 text += stext text += "</ul></div>" html_doc.write( """ Transfer on-going (%d) <a href=https://transferteam.web.cern.ch/transferteam/dashboard/ target=_blank>dashboard</a> <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a> <a href="javascript:showhide('transfer')">[Click to show/hide]</a> <br> <div id="transfer" style="display:none;"> <br> <ul>""" % count ) html_doc.write(text) lap("done with transfers") text = "" count = 0 count_by_campaign = defaultdict(lambda: defaultdict(int)) for wf in session.query(Workflow).filter(Workflow.status == "staged").all(): wl = getWL(wf.name) count_by_campaign[wl["Campaign"]][int(wl["RequestPriority"])] += 1 text += "<li> %s </li> \n" % wfl(wf, p=True) count += 1 text_by_c = "" for c in count_by_campaign: text_by_c += "<li> %s (%d) : " % (c, sum(count_by_campaign[c].values())) for p in sorted(count_by_campaign[c].keys()): text_by_c += "%d (%d), " % (p, count_by_campaign[c][p]) text_by_c += "</li>" html_doc.write( """Workflows ready for assigning (%d) <a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a> <a href="javascript:showhide('staged')">[Click to show/hide]</a> <br> <div id="staged" style="display:none;"> <br> <ul> <li> By workflow (%d) </li><a href="javascript:showhide('staged_bywf')">[Click to show/hide]</a><div id="staged_bywf" style="display:none;"> <ul> %s </ul></div> <li> By campaigns (%d) </li><a href="javascript:showhide('staged_bycamp')">[Click to show/hide]</a><div id="staged_bycamp" style="display:none;"> <ul> %s </ul></div> </ul> </div> """ % (count, count, text, len(count_by_campaign), text_by_c) ) lap("done with staged") lines = [] count_by_campaign = defaultdict(lambda: defaultdict(int)) for wf in session.query(Workflow).filter(Workflow.status == "away").all(): wl = getWL(wf.name) count_by_campaign[wl["Campaign"]][int(wl["RequestPriority"])] += 1 lines.append("<li> %s </li>" % wfl(wf, view=True, ongoing=True)) text_by_c = "" for c in count_by_campaign: text_by_c += "<li> %s (%d) : " % (c, sum(count_by_campaign[c].values())) for p in sorted(count_by_campaign[c].keys()): text_by_c += "%d (%d), " % (p, count_by_campaign[c][p]) text_by_c += "</li>" lines.sort() html_doc.write( """ Workflows on-going (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a> <a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a> <a href=http://cms-gwmsmon.cern.ch/prodview target=_blank>queues</a> <a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a> <a href="javascript:showhide('away')">[Click to show/hide]</a> <br> <div id="away" style="display:none;"> <ul> <li>By workflow (%d) </li> <a href="javascript:showhide('away_bywf')">[Click to show/hide]</a><div id="away_bywf" style="display:none;"> <ul> %s </ul></div> <li> By campaigns (%d)
</li><a href="javascript:showhide('away_bycamp')">[Click to show/hide]</a><div id="away_bycamp" style="display:none;"> <ul> %s </ul></div> </ul> </div> """ % (len(lines), len(lines), "\n".join(lines), len(count_by_campaign), text_by_c) ) lap("done with away") text = "" count = 0 for wf in session.query(Workflow).filter(Workflow.status == "assistance").all(): text += "<li> %s </li> \n" % wfl(wf, view=True, update=True, status=True) count += 1 text += "</ul></div>\n" html_doc.write( """Workflows that are closing (%d) <a href=closeout.html target=_blank>closeout</a> <a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a> <a href="javascript:showhide('closing')">[Click to show/hide]</a> <br> <div id="closing" style="display:none;"> <br> <ul> """ % count ) html_doc.write(text) lap("done with closing") text = "" count = 0 for wf in session.query(Workflow).filter(Workflow.status.startswith("assistance-")).all(): text += "<li> %s </li> \n" % wfl(wf, view=True, within=True, status=True, update=True) count += 1 text += "</ul></div>\n" html_doc.write( """Workflows which need assistance (%d) <a href=assistance.html target=_blank>assistance</a> <a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/recoveror/last.log target=_blank>postlog</a> <a href="javascript:showhide('assistance')">[Click to show/hide]</a> <br> <div id="assistance" style="display:none;"> <br> <ul> """ % count ) html_doc.write(text) lap("done with assistance") text = "" count = 0 for wf in session.query(Workflow).filter(Workflow.status == "close").all(): text += "<li> %s </li> \n" % wfl(wf) count += 1 text += "</ul></div>\n" html_doc.write( """Workflows ready to close (%d) <a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a> <a href="javascript:showhide('close')">[Click to show/hide]</a> <br> <div id="close" style="display:none;"> <br> <ul> """ % count ) html_doc.write(text) lap("done with announcing") text = "" count = 0 for wf in session.query(Workflow).filter(Workflow.status == "trouble").all(): text += "<li> %s </li> \n" % wfl(wf) count += 1 text += "</ul></div>\n" html_doc.write( """Workflows with issues (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a> <a href="javascript:showhide('trouble')">[Click to show/hide]</a> <br> <div id="trouble" style="display:none;"> <br> <ul> """ % count ) html_doc.write(text) lap("done with trouble") text = "" count = 0 for wf in session.query(Workflow).filter(Workflow.status == "forget").all(): text += "<li> %s </li> \n" % wfl(wf) count += 1 text += "</ul></div>\n" html_doc.write( """ Workflows to forget (%d) <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/outcleanor/last.log target=_blank>postlog</a> <a href="javascript:showhide('forget')">[Click to show/hide]</a> <br> <div id="forget" style="display:none;"> <br> <ul> """ % count ) html_doc.write(text) lap("done with forget") text = "" count = 0 for wf in session.query(Workflow).filter(Workflow.status == "done").all(): text += "<li> %s </li> \n" % wfl(wf) # ,ms=True) count += 1 text += "</ul></div>\n" html_doc.write( """ Workflows through (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/cleanor/last.log target=_blank>postlog</a> <a href="javascript:showhide('done')">[Click to show/hide]</a> <br> <div id="done" style="display:none;"> <br> <ul> """ % count ) html_doc.write(text) lap("done with done") wfs =
session.query(Workflow).filter(Workflow.status.endswith("-unlock")).all() html_doc.write(" Workflows unlocked : %s <br>" % (len(wfs))) lap("done with unlocked") text = "" lines_thisweek = [] lines_lastweek = [] now = time.mktime(time.gmtime()) this_week = int(time.strftime("%W", time.gmtime())) start_time_two_weeks_ago = time.mktime(time.strptime("15-0-%d" % (this_week - 2), "%y-%w-%W")) for out in session.query(Output).filter(Output.date >= start_time_two_weeks_ago).all(): if not out.workflow: print "This is a problem with", out.datasetname continue if out.workflow.status in ["done", "clean", "clean-out", "clean-unlock"]: out_week = int(time.strftime("%W", time.gmtime(out.date))) ##only show current week, and the previous. if (this_week - out_week) == 1: lines_lastweek.append( "<li>on week %s : %s </li>" % (time.strftime("%W (%x %X)", time.gmtime(out.date)), ol(out.datasetname)) ) if (this_week - out_week) == 0: lines_thisweek.append( "<li>on week %s : %s </li>" % (time.strftime("%W (%x %X)", time.gmtime(out.date)), ol(out.datasetname)) ) lines_thisweek.sort() lines_lastweek.sort() html_doc.write( """Output produced (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a> <a href="javascript:showhide('output')">[Click to show/hide]</a> <br> <div id="output" style="display:none;"> <br> <ul> <li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul> %s </ul></div> <li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul> %s </ul></div></div> """ % ( len(lines_lastweek) + len(lines_thisweek), len(lines_lastweek), "\n".join(lines_lastweek), len(lines_thisweek), "\n".join(lines_thisweek), ) ) lap("done with output") html_doc.write( """Job installed <a href="javascript:showhide('acron')">[Click to show/hide]</a> <br> <div id="acron" style="display:none;"> <br> <pre> %s </pre> """ % (os.popen("acrontab -l | grep Unified | grep -v \#").read()) ) per_module = defaultdict(list) for t in filter(None, os.popen("cat /afs/cern.ch/user/c/cmst2/www/unified/logs/*/*.time").read().split("\n")): module_name, run_time, spend = t.split(":") ## then do what you want with it ! 
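## each '*.time' log line is expected to look like 'module:runtime:spend';
## only the integer 'spend' column is kept per module for the stats below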
per_module[module_name].append(int(spend)) html_doc.write("Module running time<ul>\n") for m, spends in per_module.items(): html_doc.write("<li>%s : last %d [s], avg %d [s]</li>\n" % (m, spends[-1], sum(spends) / float(len(spends)))) html_doc.write("</ul>") html_doc.write( "Last running <pre>%s</pre>" % (os.popen("tac /afs/cern.ch/user/c/cmst2/www/unified/logs/running | head -5").read()) ) html_doc.write("</div>\n") lap("done with jobs") text = "" count = 0 for (c, info) in campaignInfo().campaigns.items(): # if 'go' in info and info['go']: text += "<li>%s <br> <pre>%s</pre> </li>" % (c, json.dumps(info, indent=2)) count += 1 html_doc.write( """Campaign configuration <a href="javascript:showhide('campaign')">[Click to show/hide]</a> <br> <div id="campaign" style="display:none;"> <br> <ul> %s </ul></div> """ % (text) ) text = "" count = 0 n_column = 4 SI = siteInfo() for t in SI.types(): text += "<li>%s<table border=1>" % t c = 0 for site in getattr(SI, t): cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else "N/A" disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE(site) in SI.disk else "N/A" if c == 0: text += "<tr>" text += ( '<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a><br>CPU pledge: %s<br>Disk available: %s</td>' % (site, site, site, site, cpu, disk) ) c += 1 if c == n_column: text += "</tr>" c = 0 ## close each row once n_column cells are in text += "</table></li>" lap("done with campaigns") open("/afs/cern.ch/user/c/cmst2/www/unified/siteInfo.json", "w").write( json.dumps(dict([(t, getattr(SI, t)) for t in SI.types()]), indent=2) ) lap("done with sites json") chart_data = defaultdict(list) for site in SI.quota: chart_data[site].append( """ var data_%s = google.visualization.arrayToDataTable([ ['Overall', 'Space in TB'], //['Quota' , %s], ['Locked' , %s], ['Free' , %s] ]); """ % (site, SI.quota[site], SI.locked[site], SI.disk[site]) ) chart_data[site].append( """ var chart_%s = new google.visualization.PieChart(document.getElementById('donutchart_%s')); chart_%s.draw(data_%s, {title: '%s %s [TB]', pieHole:0.4, slices:{0:{color:'red'},1:{color:'green'}}}); """ % (site, site, site, site, site, SI.quota[site]) ) chart_data[site].append( """ <div id="donutchart_%s" style="height: 200px;"></div> """ % (site) ) ## make the locked/available donut chart donut_html = open("/afs/cern.ch/user/c/cmst2/www/unified/locked.html", "w") tables = "\n".join([info[0] for site, info in chart_data.items()]) draws = "\n".join([info[1] for site, info in chart_data.items()]) divs = "\n".join([info[2] for site, info in chart_data.items()]) divs_table = "<table border=0>" for c, site in enumerate(sorted(chart_data.keys())): if c % 6 == 0: divs_table += "<tr>" divs_table += "<td>%s</td>" % (chart_data[site][2]) divs_table += "</table>" donut_html.write( """ <html> <head> <script type="text/javascript" src="https://www.google.com/jsapi"></script> <script type="text/javascript"> google.load("visualization", "1", {packages:["corechart"]}); google.setOnLoadCallback(drawChart); function drawChart() { %s %s } </script> </head> <body> %s </body> </html> """ % (tables, draws, divs_table) ) donut_html.close() html_doc.write( """Site configuration <a href="javascript:showhide('site')">[Click to show/hide]</a> <br> <div id="site" style="display:none;"> <br> <ul> %s </ul></div> """ % (text) ) lap("done with space") UC = unifiedConfiguration() text = "" for param in 
UC.configs: text += "<li>%s</li><ul>\n" % param for sub in sorted(UC.configs[param].keys()): text += "<li> %s : %s </li>\n" % (sub, UC.configs[param][sub]) text += "</ul>\n" html_doc.write( """Unified configuration <a href="javascript:showhide('config')">[Click to show/hide]</a> <br> <div id="config" style="display:none;"> <br> <ul> %s </ul></div> """ % (text) ) lap("done with configuration") print "... done with status page." html_doc.write( """ </body> </html> """ ) html_doc.close() html_doc = open("/afs/cern.ch/user/c/cmst2/www/unified/statuses.html", "w") html_doc.write( """ <html> <table border=1> <thead> <tr> <th> workflow </th><th> status </th><th> wm status</th> </tr> </thead> """ ) wfs = {} for wfo in session.query(Workflow).all(): wfs[wfo.name] = (wfo.status, wfo.wm_status) open("/afs/cern.ch/user/c/cmst2/www/unified/statuses.json", "w").write(json.dumps(wfs)) for wfn in sorted(wfs.keys()): html_doc.write( '<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n' % (wfn, wfn, wfs[wfn][0], wfs[wfn][1]) ) html_doc.write("</table>") html_doc.write("<br>" * 100) html_doc.write("end of page</html>") html_doc.close()
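## Note: the status page above repeats the same pattern for every workflow
## status: query the workflows, emit one <li> per workflow, and wrap the list
## in a show/hide <div>. A minimal sketch of a helper that could factor that
## pattern out; the helper name is hypothetical, while session, Workflow and
## wfl are the objects used above.
def write_collapsible_section(html_doc, div_id, title, status, item_maker):
    wfs = session.query(Workflow).filter(Workflow.status == status).all()
    items = "\n".join(["<li> %s </li>" % item_maker(wf) for wf in wfs])
    html_doc.write("""%s (%d) <a href="javascript:showhide('%s')">[Click to show/hide]</a>
<br> <div id="%s" style="display:none;"> <br> <ul>
%s
</ul></div>
""" % (title, len(wfs), div_id, div_id, items))
## e.g. write_collapsible_section(html_doc, 'trouble', 'Workflows with issues', 'trouble', wfl)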
def transferor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0,max_to_handle - being_handled) allowed_to_transfer = max(0,max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer" else: print being_transfered,"already being transferred",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status.startswith('considered')).all(): print "\t",wfo.name if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} ignored_input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority=None min_transfer_priority=None print "getting all wf in staging ..." 
stucks = json.loads(open('%s/stuck_transfers.json'%monitor_dir).read()) for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ds_s = dss.get( prim ) if prim in stucks: sendLog('transferor', "%s appears stuck, so not counting it %s [GB]"%( prim, ds_s), wfi=wfh) ignored_input_sizes[prim] = ds_s else: input_sizes[prim] = ds_s sendLog('transferor', "%s needs %s [GB]"%( wfo.name, ds_s), wfi=wfh) if in_transfer_priority==None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority==None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority==None or in_transfer_priority ==None: print "nothing is lining up for transfer" sendEmail("no request in staging","no request in staging") return try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort( key = lambda i : i[1] ) print "\n".join( map(str, ignored_values ) ) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort( key = lambda i : i[1] ) print "\n".join( map(str, considered_values) ) except Exception as e: print "failed to print the summary of input sizes" print str(e) print "... done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority print "transfers per sites" print json.dumps( transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get( prim ) input_sizes[prim] = prim_size primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle( wfs_and_wfh ) # Sort smallest transfers first; allows us to transfer as many workflows as possible. 
def prio_and_size( i, j): if int(i[1].request['RequestPriority']) == int(j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0)) ) else: return cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoretical system time in transfer"%( st_in_transfer_already ) print "%15.4f [h] worth of theoretical system time is the current requested transfer load"%( st_to_transfer ) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## a quarter of the free space in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transferred"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print "%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. 
max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False destination_cache = {} no_goes = set() max_per_round = UC.get('max_per_round').get('transferor',None) if max_per_round and not specific: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo,wfh) in wfs_and_wfh: print wfo.name,"to be transferred with priority",wfh.request['RequestPriority'] if wfh.request['RequestStatus']!='assignment-approved': if wfh.request['RequestStatus'] in ['aborted','rejected','rejected-archived','aborted-archived']: wfo.status = 'trouble' ## so that we look for a replacement else: wfo.status = 'away' wfh.sendLog('transferor', '%s in status %s, setting %s'%( wfo.name,wfh.request['RequestStatus'],wfo.status)) continue (_,primary,_,secondary) = wfh.getIO() ## keep secondary for the allowed-secondary check below this_load=sum([input_sizes[prim] for prim in primary]) no_budget = False if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over budget.") else: wfh.sendLog('transferor', "Transfer will go over budget.") wfh.sendLog('transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"%(this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over budget"%( wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog('transferor',"%s minimum priority %s < %s : stop"%( min_transfer_priority,wfh.request['RequestPriority'],in_transfer_priority)) no_budget = True ## throttle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add( wfo.name ) allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) if secondary: if (secondary and allowed_secondary) and (set(secondary)&allowed_secondary!=set(secondary)): wfh.sendLog('transferor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary))) no_go = True if no_go: continue ## check if the batch is announced def check_mcm(wfn): announced=False is_real=False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches',query='contains=%s'% wfn): is_real = True if b['status']=='announced': announced=True break except: try: for b in mcm.getA('batches',query='contains=%s'% wfn): is_real = True if b['status']=='announced': announced=True break except: print "could not get mcm batch announcement, assuming not real" return announced,is_real if not use_mcm: announced,is_real = False,True else: if wfh.request['RequestType'] in ['ReReco']: announced,is_real = True,True else: announced,is_real = check_mcm( wfo.name ) if not announced: wfh.sendLog('transferor', "does not look announced.") if not is_real: wfh.sendLog('transferor', "does not appear to be genuine.") ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) 
if float(now - injection_time) < 4.: if not options.go and not announced: wfh.sendLog('transferor', "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time)) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transferred wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%( wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog('transferor'," Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"%( max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transferred wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%(wfh.request['RequestPriority'], in_transfer_priority,max_to_transfer)) else: wfh.sendLog('transferor',"Not allowed to transfer more than %s at a time. Currently %s transferring, and adding %s"%( max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue ## the site white list considers site, campaign, memory and core information (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList() if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary)+list(parent)+list(secondary): ## lock everything flat NLI.lock( dataset ) if not sites_allowed: wfh.sendLog('transferor',"no possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor',"%s has no possible sites to run at"%( wfo.name ),level='critical') continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) )) if 'LumiList' in wfh.request and wfh.request['LumiList']: ## augment with the lumi white list blocks = list(set( blocks + getDatasetBlocks( dataset, lumis= wfh.request['LumiList'] ) )) if blocks: print "Reading",len(blocks),"in block whitelist" can_go = True staging=False allowed=True primary_destinations = set() if primary: copies_needed_from_CPUh,CPUh = wfh.getNCopies() if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chop the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add( wfo.id ) max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) wfh.sendLog('transferor',"Would make %s from cpu requirement %s"%( copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = 
CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog('transferor',"Maxed to %s by campaign configuration %s"%( copies_needed, wfh.request['Campaign'])) ### new ways of making the whole thing destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks ) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1] ## the rest is places it is going to be prim_destination = [site for site in destinations.keys() if not site in prim_location] if len(prim_location) >= copies_needed: wfh.sendLog('transferor',"The input is all fully in place at %s sites %s"%( len(prim_location), sorted(prim_location))) continue copies_needed = max(0,copies_needed - len(prim_location)) wfh.sendLog('transferor',"not counting existing copies ; now need %s"% copies_needed) copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names] latching_on_transfers = set() [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] wfh.sendLog('transferor',"Could be going to: %s"% sorted( prim_to_distribute)) if not prim_to_distribute or any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]): ## means there are openings, so let it go print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute] #for site in sites_allowed: # #increment across the board, regardless of real destination: could be changed # transfers_per_sites[site] += 1 else: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: wfh.sendLog('transferor', "Higher priority sample %s >= %s go-on over transfer slots available"%(wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog('transferor',"Not allowed to transfer more than %s per site at a time. 
Going overboard for %s"%( max_staging_per_site, sorted([site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site]))) if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == int(latching)).first() if not tfo: tfo = session.query(Transfer).filter(Transfer.phedexid == -int(latching)).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) else: tfo.phedexid = latching ## make it positive again if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0,copies_needed - min(copies_being_made)) wfh.sendLog('transferor', "Not counting the copies being made ; then need %s"% copies_needed) if copies_needed == 0: wfh.sendLog('transferor', "The output is either fully in place or getting in full somewhere with %s"% latching_on_transfers) can_go = True continue elif len(prim_to_distribute)==0: wfh.sendLog('transferor', "We are going to need extra copies, but no destinations seem available") prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk endpoint ? #tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks) spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog('transferor','cannot send %s to any site, it cannot fit anywhere'% prim, level='critical') wfh.sendLog('transferor', "cannot send to any site. 
%s cannot seem to fit anywhere"%(prim)) staging=False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site]=blocks else: spreading[site]=[prim] transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified can_go = False wfh.sendLog('transferor', "selected CE destinations %s"%(sorted( spreading.keys()))) for (site,items) in spreading.items(): all_transfers[site].extend( items ) transfers_per_sites[site] += 1 primary_destinations.add( site ) if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[wfh.request['Campaign']]['SecondaryLocation'] print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec],_ = getDatasetDestinations(url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] destinations = dict([(k,v) for (k,v) in destination_cache[sec].items() if k in se_allowed]) ## filter on the SE key, not a stale loop variable ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9] sec_location = [site for (site,info) in destinations.items() if info['completion']>=95] sec_destination = [site for site in destinations.keys() if not site in sec_location] else: ## old style presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list(set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog('transferor', "the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sec_to_distribute = list(set(sec_to_distribute) & set(override_sec_destination)) if len( sec_to_distribute )>0: print "secondary could go to",sorted(sec_to_distribute) sec_size = dss.get( sec ) for site in sec_to_distribute: site_se =SI.CE_to_SE(site) if (SI.disk[site_se]*1024.) 
> sec_size: all_transfers[site].append( sec ) can_go = False else: print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog('transferor', '%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024), level='critical') else: print "the secondary input does not have to be sent to the site" ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog('transferor', "latches on existing transfers, and nothing else, setting staging") wfo.status = 'staging' needs_transfer+=1 else: wfh.sendLog('transferor', "should just be assigned now to %s"%sorted(sites_allowed)) wfo.status = 'staged' passing_along+=1 wfh.sendLog('transferor', "setting status to %s"%wfo.status) session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog('transferor', "setting status to %s"%wfo.status) session.commit() wfh.sendLog('transferor',"needs a transfer") needs_transfer+=1 passing_along+=1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n"+"\n".join( no_goes ), level='critical') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## sites that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for"%( site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is sent out blocks = [block for block in blocks if not block.split('#')[0] in datasets] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks'%len(blocks) details_text += '\n\t%d needed blocks for %s'%( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets'% len(datasets) details_text += '\n\t%s'%sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: priority = 'normal' cds = [ds for ds in datasets+block_datasets if ds in max_priority] if cds and False: ## I don't think this is working. 
subscription should be updated on the fly and regularly for raising the priority if needed ## decide on an overall priority : that's a bit too large though if any([max_priority[ds]>=90000 for ds in cds]): priority = 'high' elif all([max_priority[ds]<80000 for ds in cds]): priority = 'low' result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority) else: result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == int(phedexid)).first() if not new_transfer: new_transfer = session.query(Transfer).filter(Transfer.phedexid == -int(phedexid)).first() print phedexid,"transfer created" if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) else: new_transfer.phedexid = phedexid ## make it positive again new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
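## Note: the priority ordering in transferor relies on a cmp= comparator,
## which only exists in python2. An equivalent key-based formulation, as a
## sketch: highest RequestPriority first, and within equal priority the
## smallest primary input first, so that more workflows fit in the budget.
def transfer_rank(pair, sizes):
    wfo, wfh = pair
    ## tuples sort lexicographically: priority first, then negative size
    return (int(wfh.request['RequestPriority']), -sizes.get(wfo.name, 0))
## usage sketch, with the dictionary filled inside transferor above:
## wfs_and_wfh.sort(key=lambda p: transfer_rank(p, primary_input_per_workflow_gb), reverse=True)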
from utils import workflowInfo, getWorkflows, sendEmail, componentInfo, monitor_dir, reqmgr_url, newLockInfo from assignSession import * import reqMgrClient import os import sys import json url = reqmgr_url #nl = newLockInfo() #nl.lock('/Neutrino_E-10_gun/RunIISpring15PrePremix-AVE_25_BX_25ns_76X_mcRun2_asymptotic_v12-v3/GEN-SIM-DIGI-RAW') #nl.lock('/DYJetsToLL_M-50_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8/RunIISummer15GS-MCRUN2_71_V1_ext1-v2/GEN-SIM') ## all dqmharvest completed to announced right away wfs = getWorkflows(url, 'completed', user=None, rtype='DQMHarvest') for wf in wfs: print "closing out",wf reqMgrClient.closeOutWorkflow(url, wf) wfs = getWorkflows(url, 'closed-out', user=None, rtype='DQMHarvest') for wf in wfs: print "announcing",wf reqMgrClient.announceWorkflow(url, wf) #os.system('Unified/equalizor.py -a pdmvserv_task_HIG-RunIIFall15DR76-01039__v1_T_160120_002705_9423') #os.system('Unified/equalizor.py -a pdmvserv_SMP-Summer12DR53X-00027_00440_v0__160224_044437_5031') up = componentInfo(mcm=False, soft=['mcm']) if not up.check(): sys.exit(1)
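## Note: the close-out and announce calls above discard their return values.
## Assuming the reqMgrClient calls follow the falsy-on-failure convention used
## elsewhere (e.g. the makeReplicaRequest result check in transferor), a
## sketch of a guarded version; the helper name is hypothetical.
def drive_dqmharvest(url, from_status, action, label):
    failed = []
    for wf in getWorkflows(url, from_status, user=None, rtype='DQMHarvest'):
        print label, wf
        if not action(url, wf):
            failed.append(wf)
    if failed:
        print "these did not go through while", label, ":", failed
## e.g. drive_dqmharvest(url, 'completed', reqMgrClient.closeOutWorkflow, 'closing out')
##      drive_dqmharvest(url, 'closed-out', reqMgrClient.announceWorkflow, 'announcing')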
## those that are already in lock already_locked = set(json.loads(open('%s/globallocks.json'%monitor_dir).read())) if not already_locked: old = json.loads(open('datalocks.json').read()) for site,locks in old.items(): if type(locks) == float: continue for item,info in locks.items(): if info['lock']==False: continue already_locked.add( item.split('#')[0] ) print "found",len(already_locked),"old locks" newly_locking = set() ## you want to take them in reverse order to make sure none go through a transition while you run this for status in reversed(statuses): wfls = getWorkflows(url , status = status,details=True) print len(wfls),"in",status for wl in wfls: ## unknown to the system known = session.query(Workflow).filter(Workflow.name==wl['RequestName']).all() if not known: #print wl['RequestName'],"is unknown, this is bad news" ## no it is not continue if status == 'assignment-approved': if all([wfo.status == 'considered' for wfo in known]): ## skip those only assignment-approved / considered continue wfi = workflowInfo( url, wl['RequestName'], request = wl ,spec=False) (_,primaries,_,secondaries) = wfi.getIO()
def transferor(url ,specific = None, talk=True, options=None): if userLock('transferor'): return if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()) max_to_handle = options.maxworkflows allowed_to_handle = max(0,max_to_handle - being_handled) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status=='considered').all(): if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" input_sizes = {} ## list the size of those in transfer already in_transfer_priority=0 min_transfer_priority=100000000 print "getting all wf in staging ..." for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) (_,primary,_,_) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get( prim ) in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority in_transfer_already = sum(input_sizes.values()) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get( prim ) print "... done" grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transferred"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print "%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit # the max priority value per dataset. 
max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False for (wfo,wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name,"to be transferred" #wfh = workflowInfo( url, wfo.name) (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: print "Transfer has gone over budget." else: print "Transfer will go over budget." print "%15.4f GB this load"%this_load print "%15.4f GB already this round"%sum(transfer_sizes.values()) print "%15.4f GB is the available limit"%transfer_limit went_over_budget=True if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget" else: if not options.go: print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop" continue ## throttle by campaign go if not CI.go( wfh.request['Campaign'] ): print "No go for",wfh.request['Campaign'] if not options.go: continue ## check if the batch is announced announced=False is_real=False for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break if not announced: print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?" if not is_real: print wfo.name,"does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time) continue passing_along += 1 if passing_along >= allowed_to_handle: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle else: print "Not allowed to pass more than",max_to_handle,"at a time. 
Currently",being_handled,"handled, and adding",passing_along break (lheinput,primary,parent,secondary) = wfh.getIO() if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] can_go = True staging=False if primary: if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) sites_really_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] print "Sites allowed minus the vetoed transfer" print sites_really_allowed copies_needed = int(0.35*len(sites_really_allowed))+1 ## should just go for a fixed number based if the white list grows that big print "Would make",copies_needed,"copies" if options.maxcopy>0: copies_needed = min(options.maxcopy,copies_needed) ## remove the sites that do not want transfers print "need",copies_needed workflow_dependencies[prim].add( wfo.id ) presence = getDatasetPresence( url, prim ) prim_location = [site for site,pres in presence.items() if pres[0]==True] if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed subscriptions = listSubscriptions( url , prim ) prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place prim_destination = [site for site in prim_destination if not site in prim_location] ## add transfer dependencies latching_on_transfers = list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) print latching_on_transfers for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? 
copies_needed = max(0,copies_needed - len(prim_destination)) print "then need",copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with",latching_on_transfers can_go = True continue prim_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in prim_location])] prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in prim_destination])] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: spreading = distributeToSites( getDatasetChops(prim), prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges) else: spreading = {} for site in prim_to_distribute: spreading[site]=[prim] can_go = False transfer_sizes[prim] = input_sizes[prim] for (site,items) in spreading.items(): all_transfers[site].extend( items ) if secondary: if talk: print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len( sec_to_distribute )>0: for site in sec_to_distribute: all_transfers[site].append( sec ) can_go = False ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name,"latches on existing transfers, and nothing else" wfo.status = 'staging' else: print wfo.name,"should just be assigned NOW to",sites_allowed wfo.status = 'staged' print "setting status to",wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name,"latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to",wfo.status session.commit() print wfo.name,"needs a transfer" needs_transfer+=1 #print json.dumps(all_transfers) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. 
## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to",site,"(CE)",site_se,"(SE) for" else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print "\t",len(blocks),"blocks" ## remove blocks if full dataset is sent out blocks = [block for block in blocks if not block.split('#')[0] in datasets] print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) print "\t",len(datasets),"datasets" print "\t",datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first() print phedexid,"transfer created" if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
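## Note: both transferor versions encode bookkeeping state in the sign of
## Transfer.phedexid: dry runs store a decrementing negative fake id, and a
## transfer later seen for real is flipped back to positive. A sketch of that
## convention in isolation; the helper name is hypothetical, Transfer and
## session are the ORM objects used above.
def record_transfer(phedexid, wf_id, execute=True):
    ## look the transfer up under both signs before creating it
    tfo = session.query(Transfer).filter(Transfer.phedexid == int(phedexid)).first()
    if not tfo:
        tfo = session.query(Transfer).filter(Transfer.phedexid == -int(phedexid)).first()
    if not tfo:
        tfo = Transfer(phedexid=int(phedexid))
        tfo.workflows_id = []
        session.add(tfo)
    else:
        tfo.phedexid = abs(int(phedexid))  ## make it positive again
    if not wf_id in tfo.workflows_id:
        tfo.workflows_id = list(tfo.workflows_id) + [wf_id]
    if execute:
        session.commit()
    else:
        session.flush()  ## visible to later queries in this run, without committing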
def injector(url, options, specific): use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] workflows = getWorkflows(url, status=options.wmstatus, user=options.user) workflows.extend( getWorkflows(url, status=options.wmstatus, user='******', rtype="ReReco") ) ## regardless of users, pick up all ReReco on the table print len(workflows), "in line" cannot_inject = set() ## browse the assignment-approved requests, compare with ours, insert the diff for wf in workflows: if specific and not specific in wf: continue exists = session.query(Workflow).filter(Workflow.name == wf).first() if not exists: wfi = workflowInfo(url, wf) #wl = getWorkLoad(url, wf) ## check first that there isn't something valid related here can_add = True ## first try at finding a match # print wfi.request familly = session.query(Workflow).filter( Workflow.name.contains(wfi.request['PrepID'])).all() if not familly: #req_familly = getWorkflowById( url, wl['PrepID']) #familly = [session.query(Workflow).filter(Workflow.name == member).first() for member in req_familly] pids = wfi.getPrepIDs() req_familly = [] for pid in pids: req_familly.extend(getWorkflowById(url, pid, details=True)) familly = [] print len(req_familly), "members" for req_member in req_familly: #print "member",req_member['RequestName'] owfi = workflowInfo(url, req_member['RequestName'], request=req_member) other_pids = owfi.getPrepIDs() if set(pids) == set(other_pids): ## this is a real match familly.extend( session.query(Workflow).filter( Workflow.name == req_member['RequestName']).all()) for lwfo in familly: if lwfo: ## we have it already if not lwfo.status in [ 'forget', 'trouble', 'forget-unlock', 'forget-out-unlock' ]: sendLog( 'injector', "Should not put %s because of %s %s" % (wf, lwfo.name, lwfo.status)) print "Should not put", wf, "because of", lwfo.name, lwfo.status cannot_inject.add(wf) can_add = False if not can_add: continue wfi.sendLog('injector', "considering %s" % wf) new_wf = Workflow(name=wf, status=options.setstatus, wm_status=options.wmstatus) session.add(new_wf) session.commit() time.sleep(0.5) else: #print "already have",wf pass if cannot_inject: #sendEmail('workflow duplicates','These workflow cannot be added in because of duplicates \n\n %s'%( '\n'.join(cannot_inject))) sendLog( 'injector', 'These workflows cannot be added because of duplicates \n\n %s' % ('\n'.join(cannot_inject)), level='warning') ## passing a round of invalidation of what needs to be invalidated if use_mcm and (options.invalidate or True): invalidator(url) no_replacement = set() ## pick up replacements for wf in session.query(Workflow).filter( Workflow.status == 'trouble').all(): print wf.name if specific and not specific in wf.name: continue print wf.name wfi = workflowInfo(url, wf.name) wl = wfi.request #getWorkLoad(url, wf.name) familly = getWorkflowById(url, wl['PrepID']) true_familly = [] for member in familly: if member == wf.name: continue fwl = getWorkLoad(url, member) if options.replace: if member != options.replace: continue else: if fwl['RequestDate'] < wl['RequestDate']: continue if fwl['RequestType'] == 'Resubmission': continue if fwl['RequestStatus'] in ['None', None, 'new']: continue if fwl['RequestStatus'] in [ 'rejected', 'rejected-archived', 'aborted', 'aborted-archived' ]: continue true_familly.append(fwl) if len(true_familly) == 0: #sendLog('injector','%s had no replacement'%wf.name, level='critical') wfi.sendLog( 'injector', 'the workflow was found in trouble with no replacement') 
no_replacement.add(wf.name) continue else: wfi.sendLog( 'injector', 'the workflow was found in trouble and has a replacement') print wf.name, "has", len(familly), "familly members" print wf.name, "has", len(true_familly), "true familly members" ##we cannot have more than one of them !!! pick the last one if len(true_familly) > 1: #sendEmail('multiple wf','please take a look at injector for %s'%wf.name) sendLog('injector', 'Multiple wf in line, will take the last one for %s \n%s' % (wf.name, ', '.join(fwl['RequestName'] for fwl in true_familly)), level='critical') for fwl in true_familly[-1:]: member = fwl['RequestName'] new_wf = session.query(Workflow).filter( Workflow.name == member).first() if not new_wf: sendLog('injector', "putting %s as replacement of %s" % (member, wf.name)) status = 'away' if fwl['RequestStatus'] in ['assignment-approved']: status = 'considered' new_wf = Workflow(name=member, status=status, wm_status=fwl['RequestStatus']) wf.status = 'forget' session.add(new_wf) else: if new_wf.status == 'forget': continue sendLog( 'injector', "getting %s as replacement of %s" % (new_wf.name, wf.name)) wf.status = 'forget' for tr in session.query(Transfer).all(): if wf.id in tr.workflows_id: sw = copy.deepcopy(tr.workflows_id) sw.remove(wf.id) sw.append(new_wf.id) tr.workflows_id = sw print tr.phedexid, "got", new_wf.name if new_wf.status != 'away': print "\t setting it considered" new_wf.status = 'considered' if tr.phedexid < 0: ## set it back to positive tr.phedexid = -tr.phedexid session.commit() ## don't do that automatically #wf.status = 'forget' session.commit() if no_replacement: #sendEmail('workflow with no replacement','%s \n are dangling there'%( '\n'.join(no_replacement))) sendLog('injector', 'workflow with no replacement, %s \n are dangling there' % ('\n'.join(no_replacement)), level='critical')
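## Note: the replacement search in injector applies several filters to the
## PrepID family; a compact restatement of the default branch (no
## options.replace) as a sketch, where fwl is a candidate request dict and wl
## the troubled request dict, both as used above.
def is_candidate_replacement(fwl, wl):
    return (fwl['RequestDate'] >= wl['RequestDate']  ## at least as recent
            and fwl['RequestType'] != 'Resubmission'  ## skip resubmissions
            and not fwl['RequestStatus'] in ['None', None, 'new',
                                             'rejected', 'rejected-archived',
                                             'aborted', 'aborted-archived'])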
url = reqmgr_url up = componentInfo(soft=['mcm','wtc','jira']) if not up.check(): sys.exit(0) status = sys.argv[1] max_wf = 0 print "Picked status",status wfs = [] if status == 'wmagent': register=['assigned','acquired','running-open','running-closed','force-complete','completed','closed-out'] for r in register: wfs.extend( [wf['RequestName'] for wf in getWorkflows(url, r, details=True)] ) ## keep names only, like the other branches elif status.endswith('*'): wfs.extend([wfo.name for wfo in session.query(Workflow).filter(Workflow.status.startswith(status[:-1])).all() ]) else: wfs.extend([wfo.name for wfo in session.query(Workflow).filter(Workflow.status==status).all() ]) if max_wf: wfs = wfs[:max_wf] random.shuffle( wfs ) all_blocks_at_sites = defaultdict(set) #done = json.loads(open('myblock_done.json').read()) done = {}
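## Note: a sketch of the status selection above as one reusable helper,
## returning workflow names in every branch; the function name is an
## assumption, url and session are the objects used above.
def select_workflow_names(status):
    if status == 'wmagent':
        register = ['assigned', 'acquired', 'running-open', 'running-closed',
                    'force-complete', 'completed', 'closed-out']
        names = []
        for r in register:
            names.extend([wf['RequestName'] for wf in getWorkflows(url, r, details=True)])
        return names
    query = session.query(Workflow)
    if status.endswith('*'):
        query = query.filter(Workflow.status.startswith(status[:-1]))
    else:
        query = query.filter(Workflow.status == status)
    return [wfo.name for wfo in query.all()]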
def batchor(url): UC = unifiedConfiguration() SI = global_SI() CI = campaignInfo() BI = batchInfo() ## get all workflows in assignment-approved with SubRequestType = relval all_wfs = [] for user in UC.get("user_relval"): all_wfs.extend( getWorkflows(url, 'assignment-approved', details=True, user=user, rtype='TaskChain')) wfs = filter( lambda r: r['SubRequestType'] == 'RelVal' if 'SubRequestType' in r else False, all_wfs) ## need a special treatment for those hi_wfs = filter( lambda r: r['SubRequestType'] == 'HIRelVal' if 'SubRequestType' in r else False, all_wfs) by_campaign = defaultdict(set) by_hi_campaign = defaultdict(set) for wf in wfs: print "Relval:", wf['RequestName'], wf['Campaign'] by_campaign[wf['Campaign']].add(wf['PrepID']) for wf in hi_wfs: print "HI Relval:", wf['RequestName'], wf['Campaign'] by_hi_campaign[wf['Campaign']].add(wf['PrepID']) default_setup = { "go": True, "parameters": { "SiteWhitelist": ["T1_US_FNAL"], "MergedLFNBase": "/store/relval", "Team": "relval", "NonCustodialGroup": "RelVal" }, "custodial_override": "notape", "phedex_group": "RelVal", "lumisize": -1, "fractionpass": 0.0, "maxcopies": 1 } default_hi_setup = copy.deepcopy(default_setup) add_on = {} relval_routing = UC.get('relval_routing') def pick_one_site(p): ## modify the parameters on the spot to have only one site if "parameters" in p and "SiteWhitelist" in p["parameters"] and len( p["parameters"]["SiteWhitelist"]) > 1: choose_from = list( set(p["parameters"]["SiteWhitelist"]) & set(SI.sites_ready)) picked = random.choice(choose_from) print "picked", picked, "from", choose_from p["parameters"]["SiteWhitelist"] = [picked] batches = BI.all() for campaign in by_campaign: if campaign in batches: continue ## get a bunch of information setup = copy.deepcopy(default_setup) for key in relval_routing: if key in campaign: ## augment with the routing information augment_with = relval_routing[key] print "Modifying the batch configuration because of keyword", key print "with", augment_with setup = deep_update(setup, augment_with) pick_one_site(setup) add_on[campaign] = setup sendLog('batchor', 'Adding the relval campaigns %s with parameters \n%s' % (campaign, json.dumps(setup, indent=2)), level='critical') BI.update(campaign, by_campaign[campaign]) for campaign in by_hi_campaign: if campaign in batches: continue ## get a bunch of information setup = copy.deepcopy(default_hi_setup) possible_sites = set(["T1_DE_KIT", "T1_FR_CCIN2P3"]) hi_site = random.choice(list(possible_sites)) setup["parameters"]["SiteWhitelist"] = [hi_site] pick_one_site(setup) add_on[campaign] = setup sendLog('batchor', 'Adding the HI relval campaigns %s with parameters \n%s' % (campaign, json.dumps(setup, indent=2)), level='critical') BI.update(campaign, by_hi_campaign[campaign]) ## only new campaigns in announcement for new_campaign in list( set(add_on.keys()) - set(CI.all(c_type='relval'))): ## this is new, and can be announced as such print new_campaign, "is new stuff" subject = "Request of RelVal samples batch %s" % new_campaign text = """Dear all, A new batch of relval workflows was requested. Batch ID: %s Details of the workflows: https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s This is an automated message""" % ( new_campaign, new_campaign, ) print subject print text to = ['*****@*****.**'] sendEmail(subject, text, destination=to) sendLog('batchor', text, level='critical') ## go through all existing campaigns and remove the ones not in use anymore ? 
    ## go through all existing campaigns and remove the ones not in use anymore ?
    for old_campaign in CI.all(c_type='relval'):
        all_in_batch = getWorkflowByCampaign(url, old_campaign, details=True)
        if not all_in_batch: continue
        ## check all statuses: the batch is done when no workflow is active anymore
        is_batch_done = all(map(lambda s: not s in ['completed', 'force-complete', 'running-open', 'running-closed',
                                                    'acquired', 'assigned', 'assignment-approved'],
                                [wf['RequestStatus'] for wf in all_in_batch]))
        if is_batch_done:
            #print "batch",old_campaign,"can be closed or removed if necessary"
            #campaigns[old_campaign]['go'] = False ## disable
            CI.pop(old_campaign)  ## or just drop it all together ?
            BI.pop(old_campaign)
            print "batch", old_campaign, "configuration was removed"

    ## merge all anyways
    CI.update(add_on, c_type='relval')
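## batchor() above merges the relval_routing fragments into the default setup
## via deep_update. The real helper lives elsewhere in this codebase; this is
## a minimal sketch of such a recursive dictionary merge, for reference only:
def deep_update_sketch(base, extra):
    ## nested dicts are merged key by key; any other value in 'extra'
    ## simply overwrites the corresponding value in 'base'
    for key, value in extra.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update_sketch(base[key], value)
        else:
            base[key] = value
    return base

## e.g. deep_update_sketch({"parameters": {"Team": "relval"}},
##                         {"parameters": {"SiteWhitelist": ["T2_CH_CERN"]}})
##      -> {"parameters": {"Team": "relval", "SiteWhitelist": ["T2_CH_CERN"]}}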
def htmlor():
    cache = getWorkflows('cmsweb.cern.ch', 'assignment-approved', details=True)

    def wfl(wf, view=False, p=False, ms=False, within=False, ongoing=False, status=False, update=False):
        wfn = wf.name
        wfs = wf.wm_status
        pid = None
        pids = filter(lambda seg: seg.count('-') == 2, wf.name.split('_'))
        if len(pids):
            pid = pids[0]
        text = ', '.join([
            #wfn,
            '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a>' % (wfn, wfn),
            '(%s) <br>' % wfs
        ])
        text += ', '.join([
            '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts</a>' % wfn,
            '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>' % wfn,
            '<a href="https://cmsweb.cern.ch/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>' % wfn,
            '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>' % wfn,
            '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>' % wfn,
            '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>' % wfn,
            '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>' % wfn,
            '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>' % pid,
            '<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank">pv</a>' % wfn,
            '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>' % wfn,
            '<a href="closeout.html#%s" target="_blank">clo</a>' % wfn,
            '<a href="statuses.html#%s" target="_blank">st</a>' % wfn,
            '<a href="https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>' % wfn
        ])
        if within and (not view or wfs == 'completed'):
            cached = filter(lambda d: d['RequestName'] == wfn, cache)
            wl = cached[0] if cached else getWorkLoad('cmsweb.cern.ch', wfn)
            if 'InputDataset' in wl:
                dataset = wl['InputDataset']
                text += ', '.join([
                    '',
                    '<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>' % dataset,
                    '<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>' % dataset,
                    '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>' % dataset,
                    '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>' % dataset,
                ])
        if p:
            cached = filter(lambda d: d['RequestName'] == wfn, cache)
            wl = cached[0] if cached else getWorkLoad('cmsweb.cern.ch', wfn)
            text += ', (%s)' % (wl['RequestPriority'])
        if pid:
            if ms:
                mcm_s = json.loads(os.popen('curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure' % pid).read())[pid]
                text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>' % (pid, mcm_s)
            else:
                text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>' % (pid)
                text += ', <a href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>' % (pid)
        if status:
            if wf.status.startswith('assistance'):
                text += ', <a href="assistance.html#%s" target="_blank">assist</a>' % wfn
            text += ' : %s ' % (wf.status)
        if view and wfs != 'acquired':
            text += '<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>' % (wfn.replace('_', '/'), wfn.replace('_', '/'))
        if ongoing:
            text += '<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a>' % (wfn, wfn)
        text += "<hr>"
        return text

    def phl(phid):
        text = ', '.join([
            str(phid),
            '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>' % phid,
            '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>' % phid,
        ])
        return text

    def ol(out):
        return '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>' % (out, out)

    ## start to write it
    #html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w')
    html_doc = open('/afs/cern.ch/user/c/cmst2/www/unified/index.html', 'w')
    print "Updating the status page ..."
    html_doc.write("""
<html>
<head>
<META HTTP-EQUIV="refresh" CONTENT="900">
<script type="text/javascript">
function showhide(id) {
   var e = document.getElementById(id);
   e.style.display = (e.style.display == 'block') ? 'none' : 'block';
}
</script>
</head>
<body>
Last update on %s(CET), %s(GMT), <a href=logs/ target=_blank>logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a>
<br><br>
""" % (time.asctime(time.localtime()), time.asctime(time.gmtime())))

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'considered').all():
        text += "<li> %s </li>\n" % wfl(wf, p=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Workflows next to handle <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank">batches</a> (%d)
<a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('considered')">[Click to show/hide]</a>
<br>
<div id="considered" style="display:none;">
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'staging').all():
        text += "<li> %s </li>\n" % wfl(wf, within=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Workflows waiting in staging (%d)
<a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staging')">[Click to show/hide]</a>
<br>
<div id="staging" style="display:none;">
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for ts in session.query(Transfer).all():
        stext = '<li> %s serves </li><a href="javascript:showhide(\'%s\')">[show/hide]</a> <div id="%s" style="display:none;"><ul>' % (phl(ts.phedexid), ts.phedexid, ts.phedexid)
        hide = True
        for pid in ts.workflows_id:
            w = session.query(Workflow).get(pid)
            hide &= (w.status != 'staging')
            stext += "<li> %s </li>\n" % (wfl(w, status=True))
        stext += "</ul></div>\n"
        if hide:
            #text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid)
            pass
        else:
            count += 1
            text += stext
    text += "</ul></div>"
    html_doc.write("""
Transfers on-going (%d)
<a href=https://transferteam.web.cern.ch/transferteam/dashboard/ target=_blank>dashboard</a>
<a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('transfer')">[Click to show/hide]</a>
<br>
<div id="transfer" style="display:none;">
<br>
<ul>""" % count)
    html_doc.write(text)
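    ## Each status block below repeats the same show/hide scaffolding: a header
    ## line with a count, a javascript:showhide() toggle, and a hidden <div><ul>.
    ## A hypothetical helper that would factor the pattern out (the name and
    ## the 'extra_links' argument are illustrative, not part of this script):
    def write_collapsible_sketch(doc, title, div_id, items, extra_links=''):
        ## one collapsible section per unified status, toggled by showhide() above
        doc.write('%s (%d) %s <a href="javascript:showhide(\'%s\')">[Click to show/hide]</a><br>\n' % (title, len(items), extra_links, div_id))
        doc.write('<div id="%s" style="display:none;">\n<ul>\n' % div_id)
        for item in items:
            doc.write("<li> %s </li>\n" % item)
        doc.write("</ul></div>\n")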
    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'staged').all():
        text += "<li> %s </li>\n" % wfl(wf, p=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Workflows ready for assigning (%d)
<a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staged')">[Click to show/hide]</a>
<br>
<div id="staged" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    lines = []
    for wf in session.query(Workflow).filter(Workflow.status == 'away').all():
        lines.append("<li> %s </li>" % wfl(wf, view=True, ongoing=True))
    lines.sort()
    html_doc.write("""
Workflows on-going (%d)
<a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a>
<a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a>
<a href=http://hcc-briantest.unl.edu/prodview target=_blank>queues</a>
<a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('away')">[Click to show/hide]</a>
<br>
<div id="away" style="display:none;">
<br>
<ul>
%s
</ul>
</div>
""" % (len(lines), '\n'.join(lines)))

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'assistance').all():
        text += "<li> %s </li>\n" % wfl(wf, view=True, update=True, status=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Workflows that are closing (%d)
<a href=closeout.html target=_blank>closeout</a>
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('closing')">[Click to show/hide]</a>
<br>
<div id="closing" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all():
        text += "<li> %s </li>\n" % wfl(wf, view=True, within=True, status=True, update=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Workflows which need assistance (%d)
<a href=assistance.html target=_blank>assistance</a>
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('assistance')">[Click to show/hide]</a>
<br>
<div id="assistance" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'close').all():
        text += "<li> %s </li>\n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Workflows ready to close (%d)
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('close')">[Click to show/hide]</a>
<br>
<div id="close" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'trouble').all():
        text += "<li> %s </li>\n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Workflows with issue (%d)
<a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a>
<a href="javascript:showhide('trouble')">[Click to show/hide]</a>
<br>
<div id="trouble" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'forget').all():
        text += "<li> %s </li>\n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Workflows to forget (%d)
<a href="javascript:showhide('forget')">[Click to show/hide]</a>
<br>
<div id="forget" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'done').all():
        text += "<li> %s </li>\n" % wfl(wf)  #,ms=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Workflows through (%d)
<a href=logs/closor/last.log target=_blank>log</a> <a href=logs/cleanor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('done')">[Click to show/hide]</a>
<br>
<div id="done" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'clean').all():
        text += "<li> %s </li>\n" % wfl(wf)  #,ms=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Workflows clean for input (%d)
<a href=logs/cleanor/last.log target=_blank>log</a>
<a href="javascript:showhide('clean')">[Click to show/hide]</a>
<br>
<div id="clean" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status.endswith('-out')).all():
        text += "<li> %s </li>\n" % wfl(wf, status=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Workflows clean for output (%d)
<a href=logs/outcleanor/last.log target=_blank>log</a>
<a href="javascript:showhide('cleanout')">[Click to show/hide]</a>
<br>
<div id="cleanout" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)
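    ## Caveat for the block below: it compares bare "%W" week numbers, so
    ## around New Year last year's week 52 is not recognized as "one week
    ## ago". A self-contained sketch of a year-aware comparison, for reference
    ## only (not used by this script; it approximates every year as 52 weeks):
    def weeks_ago_sketch(then_ts, now_ts):
        y_now, w_now = [int(time.strftime(f, time.gmtime(now_ts))) for f in ("%Y", "%W")]
        y_then, w_then = [int(time.strftime(f, time.gmtime(then_ts))) for f in ("%Y", "%W")]
        return (y_now - y_then) * 52 + (w_now - w_then)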
    text = ""
    lines_thisweek = []
    lines_lastweek = []
    now = time.mktime(time.gmtime())
    this_week = int(time.strftime("%W", time.gmtime()))
    for out in session.query(Output).all():
        if not out.workflow:
            print "This is a problem with", out.datasetname
            continue
        if out.workflow.status in ['done', 'clean']:
            out_week = int(time.strftime("%W", time.gmtime(out.date)))
            ## only show the current week and the previous one
            if (this_week - out_week) == 1:
                lines_lastweek.append("<li>on week %s : %s </li>" % (time.strftime("%W (%x %X)", time.gmtime(out.date)), ol(out.datasetname)))
            if (this_week - out_week) == 0:
                lines_thisweek.append("<li>on week %s : %s </li>" % (time.strftime("%W (%x %X)", time.gmtime(out.date)), ol(out.datasetname)))
    lines_thisweek.sort()
    lines_lastweek.sort()
    html_doc.write("""
Output produced <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a> (%d)
<a href="javascript:showhide('output')">[Click to show/hide]</a>
<br>
<div id="output" style="display:none;">
<br>
<ul>
<li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul>
%s
</ul></div>
<li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul>
%s
</ul></div></div>
""" % (len(lines_lastweek) + len(lines_thisweek), len(lines_lastweek), '\n'.join(lines_lastweek), len(lines_thisweek), '\n'.join(lines_thisweek)))

    html_doc.write("""
Jobs installed
<a href="javascript:showhide('acron')">[Click to show/hide]</a>
<br>
<div id="acron" style="display:none;">
<br>
<pre>
%s
</pre></div>
""" % (os.popen('acrontab -l | grep Unified').read()))

    text = ""
    count = 0
    for (c, info) in campaignInfo().campaigns.items():
        #if 'go' in info and info['go']:
        text += "<li>%s <br> <pre>%s</pre> </li>" % (c, json.dumps(info, indent=2))
        count += 1
    html_doc.write("""
Campaign configuration
<a href="javascript:showhide('campaign')">[Click to show/hide]</a>
<br>
<div id="campaign" style="display:none;">
<br>
<ul>
%s
</ul></div>
""" % (text))

    text = ""
    count = 0
    n_column = 4
    SI = siteInfo()
    for t in SI.types():
        #text+="<li>%s<ul>"%t
        #for site in getattr(SI,t):
        #    text+="<li><a href=http://hcc-briantest.unl.edu/prodview/%s>%s<a/> </li>"%( site, site)
        #    text+='<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a>'%(site,site)
        #text+="</ul></li>"
        text += "<li>%s<table border=1>" % t
        c = 0
        for site in getattr(SI, t):
            cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else 'N/A'
            disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE(site) in SI.disk else 'N/A'
            if c == 0:
                text += "<tr>"
            text += '<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a><br>CPU pledge: %s<br>Disk available: %s</td>' % (site, site, site, site, cpu, disk)
            if c == n_column:
                c = 0
            else:
                c += 1
        text += "</table></li>"
    html_doc.write("""
Site configuration
<a href="javascript:showhide('site')">[Click to show/hide]</a>
<br>
<div id="site" style="display:none;">
<br>
<ul>
%s
</ul></div>
""" % (text))
    print "... done with status page."
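    ## The site table above wraps rows by hand with the 'c' counter and leaves
    ## <tr> tags to be auto-closed by the browser. A sketch of the same layout
    ## built by chunking instead ('cells' stands for the per-site <td> strings;
    ## illustrative only, not used by this script):
    def site_table_sketch(cells, n_column=4):
        rows = [cells[i:i + n_column] for i in range(0, len(cells), n_column)]
        return ''.join('<tr>%s</tr>' % ''.join(row) for row in rows)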
    html_doc.write("""
</body>
</html>
""")
    html_doc.close()

    html_doc = open('/afs/cern.ch/user/c/cmst2/www/unified/statuses.html', 'w')
    html_doc.write("""
<html>
<table border=1>
<thead>
<tr>
<th> workflow </th><th> status </th><th> wm status</th>
</tr>
</thead>
""")
    wfs = {}
    for wfo in session.query(Workflow).all():
        wfs[wfo.name] = (wfo.status, wfo.wm_status)
    open('/afs/cern.ch/user/c/cmst2/www/unified/statuses.json', 'w').write(json.dumps(wfs))
    for wfn in sorted(wfs.keys()):
        html_doc.write('<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n' % (wfn, wfn, wfs[wfn][0], wfs[wfn][1]))
    html_doc.write("</table>")
    html_doc.write("<br>" * 100)
    html_doc.write("end of page</html>")
    html_doc.close()
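## The statuses.json dump written above maps each workflow name to its
## (unified status, wm status) pair. A minimal read-back illustration, using
## the same path as above (reference only):
def read_statuses_sketch():
    statuses = json.loads(open('/afs/cern.ch/user/c/cmst2/www/unified/statuses.json').read())
    for wfn in sorted(statuses):
        u_status, wm_status = statuses[wfn]
        print wfn, u_status, wm_status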