def closor(url, specific=None): if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() #LI = lockInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') ## manually closed-out workflows should get to close with checkor if specific: wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all() else: wfs = session.query(Workflow).filter(Workflow.status=='close').all() held = set() for wfo in wfs: if specific and not specific in wfo.name: continue ## what is the expected #lumis wfi = workflowInfo(url, wfo.name ) wfo.wm_status = wfi.request['RequestStatus'] if wfi.request['RequestStatus'] in ['announced','normal-archived']: ## manually announced ?? wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,wfo.wm_status)) session.commit() expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name,"has not been assigned yet, or the database is corrupted" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda : False) #print outputs if len(outputs): print wfo.name,wfi.request['RequestStatus'] for out in outputs: event_count,lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter(Output.datasetname==out).first() if not odb: print "adding an output object",out odb = Output( datasetname = out ) odb.workflow = wfo session.add( odb ) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() wfi.sendLog('closor',"\t%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,lumi_count/float(expected_lumis)*100.)) #print wfo.fraction_for_closing, lumi_count, 
expected_lumis #fraction = wfo.fraction_for_closing #fraction = 0.0 #all_OK.append((float(lumi_count) > float(expected_lumis*fraction))) all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence( url, out ) where = [site for site,info in presence.items() if info[0]] if where: all_OK[out] = True print out,"is in full at",",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites'] wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to)))) at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to]) else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to]) print json.dumps( at_destination ) print json.dumps( else_where, indent=2 ) ## do the full stuck transfer study, missing files and shit ! for there in going_to: late_info = findLateFiles(url, out, going_to = there ) for l in late_info: l.update({"workflow":wfo.name,"dataset":out}) all_late_files.extend( late_info ) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update( OK ) ## only that status can let me go into announced if all(all_OK.values()) and wfi.request['RequestStatus'] in ['closed-out']: print wfo.name,"to be announced" results=[]#'dummy'] if not results: for out in outputs: if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) tier = out.split('/')[-1] campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request['Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']: to_DDM = True ## by typical enabling if 
tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"): print "tier",tier,"neither TO or NO DDM for",out results.append('Not recognitized tier %s'%tier) sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) continue n_copies = 2 destinations=[] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]: ddm_instructions = CI.campaigns[campaign]['DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier,indication in ddm_instructions.items(): if ddmtier==tier or ddmtier in ['*','all']: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination="+",".join( destinations ) ## inject to DDM when necessary if to_DDM: #print "Sending",out," to DDM" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec'%(n_copies, out,destination_spec)) print p.read() status = p.close() if status!=None: print "Failed DDM, retrying a second time" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec'%(n_copies, out,destination_spec)) print p.read() status = p.close() if status!=None: sendEmail("failed DDM injection","could not add "+out+" to DDM pool. 
check closor logs.") results.append( status ) if status == None: wfi.sendLog('closor','%s is send to AnalysisOps DDM pool in %s copies %s'%( n_copies, out,destination_spec)) else: print wfo.name,"no stats for announcing",out results.append('No Stats') if all(map(lambda result : result in ['None',None,True],results)): ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade(url, wfo.name) if not res in ['None',None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in ['announced','normal-archived']: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade(url, wfo.name) results.append( res ) #print results if all(map(lambda result : result in ['None',None,True],results)): wfo.status = 'done' session.commit() wfi.sendLog('closor',"workflow is announced") else: print "ERROR with ",wfo.name,"to be announced",json.dumps( results ) else: print wfo.name,"not good for announcing:",wfi.request['RequestStatus'] wfi.sendLog('closor',"cannot be announced") held.add( wfo.name ) days_late = 0. retries_late = 10 really_late_files = [info for info in all_late_files if info['retries']>=retries_late] really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) ) sendEmail('waiting for files to announce', subject) sendLog('closor',subject) print subject open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2)) if held: sendEmail("held from announcing","the workflows below are held up, please check the logs https://cmst2.web.cern.ch/cmst2/unified/logs/closor/last.log \n%s"%("\n".join( held )))
def completor(url, specific):
    """Force-complete long-running workflows.

    Scans workflows in local status 'away' (and 'assistance*', which are
    then skipped below), and for those whose campaign declares a
    'force-complete' fraction:
      * honours explicit force-complete requests coming from the override
        file and from McM,
      * computes per-output completion fractions (min of lumi- and
        event-based), records them as checkpoints in completions.json,
      * once a workflow has been running longer than the campaign timeout
        and is above the force-complete fraction, calls forceComplete()
        on it, bounded by the 'max_force_complete' configuration knob.
    Also dumps a 'longlasting.json' summary of slow workflows.

    url      -- service host passed to the various clients.
    specific -- optional substring filter on workflow names.
    """
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    # downgrade to no-mcm mode if the McM component is not up
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    wfs = []
    wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
    wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## just take it in random order so that not always the same is seen
    random.shuffle( wfs )

    ## by workflow a list of fraction / timestamps
    completions = json.loads( open('%s/completions.json'%monitor_dir).read())

    # per-campaign force-complete fraction and timeout (days), from configuration
    good_fractions = {}
    timeout = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']

    long_lasting = {}

    # explicit force-complete requests: per-user lists of workflow names / prepids
    overrides = getForceCompletes()
    if use_mcm:
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps( good_fractions ,indent=2)
    print json.dumps( overrides, indent=2)
    max_force = UC.get("max_force_complete")

    #wfs_no_location_in_GQ = set()
    #block_locations = defaultdict(lambda : defaultdict(list))
    #wfs_no_location_in_GQ = defaultdict(list)

    set_force_complete = set()
    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at",wfo.name
        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip=False
        # only campaigns with a force-complete fraction are candidates for
        # the automatic handling below; others can still be force-completed
        # by explicit override
        if not any([c in wfo.name for c in good_fractions]): skip=True
        for user,spec in overrides.items():
            if wfi.request['RequestStatus']!='force-complete':
                # match by name substring, exact name, or prepid in either direction
                if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec):
                    sendEmail('force-complete requested','%s is asking for %s to be force complete'%(user,wfo.name))
                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url , wfi )
                    skip=True
                    wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False)
                    wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name))
                    if user == 'mcm' and use_mcm:
                        # clear the fulfilled request on the McM side
                        for pid in wfi.getPrepIDs():
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                    #sendEmail('completor test','not force completing automatically, you have to go back to it')
                    #skip=False
                    break

        if wfo.status.startswith('assistance'): skip = True

        if skip:
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue

        c = wfi.request['Campaign']
        if not c in good_fractions: continue
        good_fraction = good_fractions[c]
        # 2. can never be reached: keeps the "all done" branch effectively off
        ignore_fraction = 2.

        lumi_expected = None
        event_expected = None
        if not 'TotalInputEvents' in wfi.request:
            if 'RequestNumEvents' in wfi.request:
                event_expected = wfi.request['RequestNumEvents']
            else:
                print "truncated, cannot do anything"
                continue
        else:
            lumi_expected = wfi.request['TotalInputLumis']
            event_expected = wfi.request['TotalInputEvents']

        # timestamps in days
        now = time.mktime(time.gmtime()) / (60*60*24.)

        running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            # cannot figure out when the thing started running
            continue
        then = running_log[-1]['UpdateTime'] / (60.*60.*24.)
        delay = now - then ## in days

        # indent the printout by one tab per week of running
        (w,d) = divmod(delay, 7 )
        print "\t"*int(w)+"Running since",delay,"[days] priority=",priority
        monitor_delay = 7
        allowed_delay = 14
        if c in timeout:
            allowed_delay = timeout[c]
        monitor_delay = min(monitor_delay, allowed_delay)
        ### just skip if too early
        if delay <= monitor_delay: continue

        long_lasting[wfo.name] = { "delay" : delay }

        percent_completions = {}
        for output in wfi.request['OutputDatasets']:
            if "/DQM" in output: continue ## that does not count
            if not output in completions: completions[output] = { 'injected' : None, 'checkpoints' : [], 'workflow' : wfo.name}
            ## get completion fraction
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion=0.
            event_completion=0.
            if lumi_expected:
                lumi_completion = lumi_count / float( lumi_expected )
            if event_expected:
                event_completion = event_count / float( event_expected )
            #take the less optimistic
            percent_completions[output] = min( lumi_completion, event_completion )
            completions[output]['checkpoints'].append( (now, event_completion ) )

        if all([percent_completions[out] >= good_fraction for out in percent_completions]):
            wfi.sendLog('completor', "all is above %s \n%s"%( good_fraction, json.dumps( percent_completions, indent=2 ) ))
        else:
            # below threshold: record progress and agent info, then move on
            long_lasting[wfo.name].update({
                    'completion': sum(percent_completions.values()) / len(percent_completions),
                    'completions' : percent_completions
                    })
            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog('completor', "%s not over bound %s\n%s"%(percent_completions.values(), good_fraction, json.dumps( long_lasting[wfo.name]['agents'], indent=2) ))
            continue

        if all([percent_completions[out] >= ignore_fraction for out in percent_completions]):
            print "all is done, just wait a bit"
            continue

        for output in percent_completions:
            completions[output]['injected'] = then

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')
        ran_at = wfi.request['SiteWhitelist']

        wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay))

        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days
        if delay <= allowed_delay: continue
        ## find ACDCs that might be running

        # bounded by max_force per cycle
        if max_force>0:
            forceComplete(url, wfi )
            set_force_complete.add( wfo.name )
            print "going for force-complete of",wfo.name
            wfi.sendLog('completor','going for force completing')
            wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name)
            max_force -=1
        else:
            wfi.sendLog('completor',"too many completion this round, cannot force complete")

        ## do it once only for testing
        #break

    if set_force_complete:
        sendLog('completor','The followings were set force-complete \n%s'%('\n'.join(set_force_complete)))
        #sendEmail('set force-complete', 'The followings were set force-complete \n%s'%('\n'.join(set_force_complete)))

    # persist the checkpoint history and the long-lasting summary
    open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text="These have been running for long"

    open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))

    for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days"% (wf, delay)
        if 'completion' in info:
            text += " %d%%"%( info['completion']*100 )

    #if wfs_no_location_in_GQ:
    #    sendEmail('workflow with no location in GQ',"there won't be able to run anytime soon\n%s"%( '\n'.join(wfs_no_location_in_GQ)))

    #sendEmail("long lasting workflow",text)
    ## you can check the log
    print text
def completor(url, specific):
    """Older variant of the completor: tag long-running workflows.

    Looks at workflows in local status 'away' whose campaign declares a
    'force-complete' fraction, computes per-output completion (min of the
    lumi- and event-based fractions), records checkpoints in
    completions.json, and — for workflows above threshold — emails a
    TAGGING notice for each still-running member of the same PrepID
    family.  The actual force-complete call is commented out
    ("NOT REALLY FORCING").

    url      -- service host passed to the clients.
    specific -- optional substring filter on workflow names.
    """
    CI = campaignInfo()
    wfs = []
    wfs.extend(session.query(Workflow).filter(Workflow.status == "away").all())
    ##wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )
    ## just take it in random order
    random.shuffle(wfs)
    ## by workflow a list of fraction / timestamps
    completions = json.loads(open("completions.json").read())
    # campaigns that declare a force-complete fraction
    good_fractions = {}
    for c in CI.campaigns:
        if "force-complete" in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]["force-complete"]
    print "can force complete on"
    print json.dumps(good_fractions, indent=2)
    for wfo in wfs:
        if specific and not specific in wfo.name: continue
        if not any([c in wfo.name for c in good_fractions]): continue
        print "looking at", wfo.name
        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        if not "Campaign" in wfi.request: continue
        c = wfi.request["Campaign"]
        if not c in good_fractions: continue
        good_fraction = good_fractions[c]
        # 2.0 is unreachable: the "all done" branch is effectively disabled
        ignore_fraction = 2.0
        lumi_expected = None
        event_expected = None
        if not "TotalInputEvents" in wfi.request:
            if "RequestNumEvents" in wfi.request:
                event_expected = wfi.request["RequestNumEvents"]
            else:
                # no expectation available at all: nothing to compare against
                continue
        else:
            lumi_expected = wfi.request["TotalInputLumis"]
            event_expected = wfi.request["TotalInputEvents"]
        # timestamps in days
        now = time.mktime(time.gmtime()) / (60 * 60 * 24.0)
        running_log = filter(
            lambda change: change["Status"] in ["running-open", "running-closed"],
            wfi.request["RequestTransition"]
        )
        if not running_log:
            print "\tHas no running log"
            # cannot figure out when the thing started running
            continue
        then = running_log[-1]["UpdateTime"] / (60.0 * 60.0 * 24.0)
        delay = now - then ## in days
        # indent the printout by one tab per week of running
        (w, d) = divmod(delay, 7)
        print "\t" * int(w) + "Running since", delay, "[days]"
        if delay <= 4: continue
        if delay >= 7:
            sendEmail("long lasting workflow", "%s has been running for %s days" % (wfo.name, delay))
        percent_completions = {}
        for output in wfi.request["OutputDatasets"]:
            if not output in completions:
                completions[output] = {"injected": None, "checkpoints": [], "workflow": wfo.name}
            ## get completion fraction
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion = 0.0
            event_completion = 0.0
            if lumi_expected:
                lumi_completion = lumi_count / float(lumi_expected)
            if event_expected:
                event_completion = event_count / float(event_expected)
            # take the less optimistic
            percent_completions[output] = min(lumi_completion, event_completion)
            completions[output]["checkpoints"].append((now, event_completion))
        if all([percent_completions[out] >= good_fraction for out in percent_completions]):
            print "all is above", good_fraction, "for", wfo.name
            print json.dumps(percent_completions, indent=2)
        else:
            print "\t", percent_completions.values(), "not over bound", good_fraction
            # print json.dumps( percent_completions, indent=2 )
            continue
        if all([percent_completions[out] >= ignore_fraction for out in percent_completions]):
            print "all is done, just wait a bit"
            continue
        for output in percent_completions:
            completions[output]["injected"] = then
        # further check on delays
        cpuh = wfi.getComputingTime(unit="d")
        ran_at = wfi.request["SiteWhitelist"]
        print "Required:", cpuh,
        print "Time spend:", delay
        ## find ACDCs that might be running
        familly = getWorkflowById(url, wfi.request["PrepID"], details=True)
        for member in familly:
            ### if member['RequestName'] == wl['RequestName']: continue ## set himself out
            if member["RequestDate"] < wfi.request["RequestDate"]: continue
            if member["RequestStatus"] in ["None", None]: continue
            ## then set force complete all members
            if member["RequestStatus"] in ["running-opened", "running-closed"]:
                # dry-run: only notify, the real call below is commented out
                print "setting", member["RequestName"], "force-complete"
                print "NOT REALLY FORCING"
                sendEmail(
                    "force completing",
                    "TAGGING %s is worth force completing\n%s" % (member["RequestName"], percent_completions),
                )
                ##reqMgrClient.setWorkflowForceComplete(url, member['RequestName'])
        ## do it once only for testing
        # break
    # persist the checkpoint history
    open("completions.json", "w").write(json.dumps(completions, indent=2))
def closor(url, specific=None):
    """Older variant of the closor: announce workflows in local status 'close'.

    For each candidate: refresh the ReqMgr status, record per-output
    lumi/event counts in the Output table, then — for 'closed-out'
    workflows — set datasets VALID, inject ReDigi (non-DQM) outputs into
    DDM via assignDatasetToSite.py (with one retry), release locks on
    success, and announce the workflow cascade.

    url      -- service host passed to the clients.
    specific -- optional substring filter on workflow names.
    """
    if not componentInfo().check(): return
    CI = campaignInfo()
    LI = lockInfo()
    ## manually closed-out workflows should get to close with checkor
    for wfo in session.query(Workflow).filter(Workflow.status=='close').all():
        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis
        wl = getWorkLoad(url, wfo.name)
        wfo.wm_status = wl['RequestStatus']

        if wl['RequestStatus'] in ['announced','normal-archived']:
            ## manually announced ?? then just mark done locally
            wfo.status = 'done'
            wfo.wm_status = wl['RequestStatus']
            print wfo.name,"is announced already",wfo.wm_status
            session.commit()

        expected_lumis = 1
        if not 'TotalInputLumis' in wl:
            print wfo.name,"has not been assigned yet, or the database is corrupted"
        else:
            expected_lumis = wl['TotalInputLumis']

        ## what are the outputs
        outputs = wl['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = []
        #print outputs
        if len(outputs):
            print wfo.name,wl['RequestStatus']
        for out in outputs:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(Output.datasetname==out).first()
            if not odb:
                print "adding an output object",out
                odb = Output( datasetname = out )
                odb.workflow = wfo
                session.add( odb )
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            # keep the larger of the stored and the freshly computed expectation
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()
            print "\t%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,lumi_count/float(expected_lumis)*100.)
            #print wfo.fraction_for_closing, lumi_count, expected_lumis
            # NOTE: fraction is immediately overwritten with 0.0, so the
            # lumi-count check below always passes as long as lumi_count > 0
            fraction = wfo.fraction_for_closing
            fraction = 0.0
            all_OK.append((float(lumi_count) > float(expected_lumis*fraction)))

        ## only that status can let me go into announced
        if wl['RequestStatus'] in ['closed-out']:
            print wfo.name,"to be announced"
            results=[]#'dummy']
            if not results:
                for (io,out) in enumerate(outputs):
                    if all_OK[io]:
                        results.append(setDatasetStatus(out, 'VALID'))
                        tier = out.split('/')[-1]
                        # default DDM routing: ReDigi outputs except DQM tiers
                        to_DDM = (wl['RequestType'] == 'ReDigi' and not ('DQM' in tier))
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wl and wl['Campaign']:
                                campaign = wl['Campaign']
                        # campaign configuration can also turn DDM routing on
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True
                        ## inject to DDM when necessary
                        passed_to_DDM=True
                        if to_DDM:
                            #print "Sending",out," to DDM"
                            status = subprocess.call(['python','assignDatasetToSite.py','--nCopies=2','--dataset='+out,'--exec'])
                            if status!=0:
                                print "Failed DDM, retrying a second time"
                                status = subprocess.call(['python','assignDatasetToSite.py','--nCopies=2','--dataset='+out,'--exec'])
                                if status!=0:
                                    results.append("Failed DDM for %s"% out)
                                    sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.")
                                    passed_to_DDM=False
                            if passed_to_DDM:
                                ## make a lock release
                                LI.release_everywhere( out, reason = 'global unlock after passing to DDM')
                                pass
                    else:
                        print wfo.name,"no stats for announcing",out
                        results.append('No Stats')
                if all(map(lambda result : result in ['None',None,True],results)):
                    ## only announce if all previous are fine
                    results.append(reqMgrClient.announceWorkflowCascade(url, wfo.name))
            #print results
            if all(map(lambda result : result in ['None',None,True],results)):
                wfo.status = 'done'
                session.commit()
                print wfo.name,"is announced"
            else:
                print "ERROR with ",wfo.name,"to be announced",json.dumps( results )
        else:
            print wfo.name,"not good for announcing:",wl['RequestStatus']
def checkor(url, spec=None, options=None):
    """Run the close-out checks on completed workflows.

    For each workflow in 'away'/'assistance' (or 'assistance-*' when not
    fetching), this applies a sequence of checks, each of which can clear
    `is_closing` and/or append a tag to `sub_assistance`:
      * running ACDC siblings, completion fraction vs. the campaign pass
        fraction, events-per-lumi upper limit, custodial and disk presence,
        DBS/PhEDEx file-count consistency, invalid-file level, and
        (only when everything else passed) run/lumi duplication.
    Workflows that pass everything are closed out in ReqMgr and moved to
    local status 'close'; otherwise they go to 'assistance<tags>'.
    Missing custodial subscriptions are batched and requested at the end.
    Results per dataset are recorded in the falseDB record for display.

    url     -- service host passed to the clients.
    spec    -- optional substring filter on workflow names.
    options -- parsed command-line options; .fetch, .fractionpass,
               .lumisize, .ignorefiles, .ignoreduplicates, .test are read.
    """
    fDB = falseDB()

    wfs=[]
    if options.fetch:
        #workflows = getWorkflows(url, status='completed')
        #for wf in workflows:
        #    wfo = session.query(Workflow).filter(Workflow.name == wf ).first()
        #    if wfo:
        #        if not wfo.status in ['away','assistance']: continue
        #        wfs.append(wfo )
        wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )
    else:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )

    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()

    def get_campaign(output, wfi):
        # campaign from the processing string of the dataset name,
        # falling back to the request's Campaign field
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue

        ## get info
        wfi = workflowInfo(url, wfo.name)

        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            print wfo.name,"is already",wfo.wm_status
            wfo.status = 'close'
            session.commit()
            continue
        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived']:
            ## went into trouble
            wfo.status = 'trouble'
            print wfo.name,"is in trouble",wfo.wm_status
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            print wfo.name,"not running yet"
            session.commit()
            continue

        if wfo.wm_status != 'completed':
            ## for sure move on with closeout check if in completed
            print "no need to check on",wfo.name,"in status",wfo.wm_status
            session.commit()
            continue

        session.commit()
        sub_assistance="" # if that string is filled, there will be need for manual assistance

        is_closing = True
        ## do the closed-out checks one by one

        # tuck out DQMIO/DQM
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not '/DQM' in out]

        ## anything running on acdc
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        for member in familly:
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestStatus'] in ['running-opened','running-closed','assignment-approved','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                #print json.dumps(member,indent=2)
                ## hook for just waiting ...
                is_closing = False

        ## completion check
        percent_completions = {}
        event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']
        fractions_pass = {}
        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.
            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )
            # 95% pass fraction by default; campaign config or the command
            # line can override it
            fractions_pass[output] = 0.95
            c = get_campaign(output, wfi)
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                print "overriding fraction to",fractions_pass[output],"for",output
            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

        if not all([percent_completions[out] > fractions_pass[out] for out in fractions_pass]):
            print wfo.name,"is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            sub_assistance+='-recovery'
            is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )

        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 300.
            campaign = get_campaign(output, wfi)
            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign
            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
            lumi_upper_limit[output] = upper_limit

        if any([ events_per_lumi[out] > lumi_upper_limit[out] for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            sub_assistance+='-biglumi'
            is_closing = False

        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            # 'MSS' in the site name marks a tape endpoint here
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)
            if not custodial_locations[output]:
                custodial_locations[output] = []

        vetoed_custodial_tier = ['MINIAODSIM']
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]):
                    custodial = custodial_locations[output][0]
            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = get_campaign(output, wfi)
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"
                        break
            ## get from the parent
            if not custodial and 'InputDataset' in wfi.request:
                parents_custodial = findCustodialLocation(url, wfi.request['InputDataset'])
                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",wfi.request['InputDataset'],"does not have custodial in the first place. abort"
                    continue

            if not custodial:
                ## pick one at random
                custodial = SI.pick_SE()

            if custodial and not sub_assistance and not acdc:
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        custodials[custodial].append( output )
            else:
                print "cannot find a custodial for",wfo.name
            is_closing = False

        ## disk copy
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)

        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )

        fraction_invalid = 0.01
        # consistency: DBS total should equal DBS-invalid + PhEDEx count
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## hook for just waiting ...
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            sub_assistance+="-invalidfiles"
            is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing:
            for output in wfi.request['OutputDatasets']:
                # retry once on failure before giving up on this output
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output )
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output )
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                sub_assistance+='-duplicates'
                is_closing = False

        ## for visualization later on
        if not wfo.name in fDB.record:
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
                'datasets' :{},
                'name' : wfo.name,
                'closeOutWorkflow' : None,
                }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = len(acdc)

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                # set it from away/assistance* to close
                wfo.status = 'close'
                session.commit()
        else:
            print wfo.name,"needs assistance"
            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            wfo.status = 'assistance'+sub_assistance
            if not options.test:
                print "setting",wfo.name,"to",wfo.status
                session.commit()

    fDB.summary()
    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low')
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
def completor(url, specific):
    """Force-complete long-running production workflows (v1).

    Scans workflows in status 'away' or 'assistance*', compares each
    output dataset's completion fraction against the campaign's
    'force-complete' threshold, and force-completes workflows that have
    been running past the campaign timeout.  Also honors per-user and
    McM force-complete override lists.

    Side effects: reads/writes %s/completions.json and
    %s/longlasting.json under monitor_dir, calls ReqMgr via
    forceComplete(), notifies requestors, and prints a summary.

    :param url: request-manager base url
    :param specific: substring filter on workflow names (None = all)
    """
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    # mcm is a "soft" dependency: run without it if the component is down
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    wfs = []
    wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
    wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## just take it in random order so that not always the same is seen
    random.shuffle( wfs )

    # cap the number of workflows per pass, unless a specific one was asked for
    max_per_round = UC.get('max_per_round').get('completor',None)
    if max_per_round and not specific: wfs = wfs[:max_per_round]

    ## by workflow a list of fraction / timestamps
    completions = json.loads( open('%s/completions.json'%monitor_dir).read())

    # campaign -> completion fraction above which force-complete is allowed
    good_fractions = {}
    # campaign -> days after which force-complete is allowed
    timeout = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']

    long_lasting = {}

    # user -> list of workflow names / prepids requested for force-complete
    overrides = getForceCompletes()
    if use_mcm:
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps( good_fractions ,indent=2)
    print json.dumps( overrides, indent=2)
    # budget of force-completes allowed in this round
    max_force = UC.get("max_force_complete")

    #wfs_no_location_in_GQ = set()
    #block_locations = defaultdict(lambda : defaultdict(list))
    #wfs_no_location_in_GQ = defaultdict(list)

    set_force_complete = set()

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at",wfo.name
        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip=False
        # only campaigns with a configured force-complete fraction are eligible
        if not any([c in wfo.name for c in good_fractions]): skip=True
        # honor explicit per-user / mcm override requests first
        for user,spec in overrides.items():
            if wfi.request['RequestStatus']!='force-complete':
                if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec):
                    sendEmail('force-complete requested','%s is asking for %s to be force complete'%(user,wfo.name))
                    # re-fetch for an up-to-date view before acting
                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url , wfi )
                    skip=True
                    wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False)
                    wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name))
                    break

        if wfo.status.startswith('assistance'): skip = True

        if skip: continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        # only actively-running workflows can be force completed
        if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue

        c = wfi.request['Campaign']
        if not c in good_fractions: continue
        good_fraction = good_fractions[c]
        # above this fraction the workflow is considered effectively done
        ignore_fraction = 2.

        lumi_expected = None
        event_expected = None
        if not 'TotalInputEvents' in wfi.request:
            if 'RequestNumEvents' in wfi.request:
                event_expected = wfi.request['RequestNumEvents']
            else:
                print "truncated, cannot do anything"
                continue
        else:
            lumi_expected = wfi.request['TotalInputLumis']
            event_expected = wfi.request['TotalInputEvents']

        # current time expressed in days since epoch
        now = time.mktime(time.gmtime()) / (60*60*24.)

        running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            # cannot figure out when the thing started running
            continue
        # most recent transition into running, in days since epoch
        then = running_log[-1]['UpdateTime'] / (60.*60.*24.)
        delay = now - then ## in days

        # indent the printout by one tab per week of running time
        (w,d) = divmod(delay, 7 )
        print "\t"*int(w)+"Running since",delay,"[days] priority=",priority

        monitor_delay = 7
        allowed_delay = 14
        if c in timeout:
            allowed_delay = timeout[c]

        monitor_delay = min(monitor_delay, allowed_delay)
        ### just skip if too early
        if delay <= monitor_delay: continue

        long_lasting[wfo.name] = { "delay" : delay }

        percent_completions = {}
        for output in wfi.request['OutputDatasets']:
            if "/DQM" in output: continue ## that does not count
            if not output in completions: completions[output] = { 'injected' : None, 'checkpoints' : [], 'workflow' : wfo.name}
            ## get completion fraction
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion=0.
            event_completion=0.
            if lumi_expected:
                lumi_completion = lumi_count / float( lumi_expected )
            if event_expected:
                event_completion = event_count / float( event_expected )
            #take the less optimistic
            percent_completions[output] = min( lumi_completion, event_completion )
            completions[output]['checkpoints'].append( (now, event_completion ) )

        if all([percent_completions[out] >= good_fraction for out in percent_completions]):
            wfi.sendLog('completor', "all is above %s \n%s"%( good_fraction, json.dumps( percent_completions, indent=2 ) ))
        else:
            # not complete enough: record progress and move to the next workflow
            long_lasting[wfo.name].update({ 'completion': sum(percent_completions.values()) / len(percent_completions), 'completions' : percent_completions })

            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog('completor', "%s not over bound %s\n%s"%(percent_completions.values(), good_fraction, json.dumps( long_lasting[wfo.name]['agents'], indent=2) ))
            continue

        if all([percent_completions[out] >= ignore_fraction for out in percent_completions]):
            print "all is done, just wait a bit"
            continue

        for output in percent_completions:
            completions[output]['injected'] = then

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')
        ran_at = wfi.request['SiteWhitelist']

        wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay))

        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days
        if delay <= allowed_delay: continue
        ## find ACDCs that might be running
        if max_force>0:
            forceComplete(url, wfi )
            set_force_complete.add( wfo.name )
            print "going for force-complete of",wfo.name
            wfi.sendLog('completor','going for force completing')
            wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name)
            max_force -=1
        else:
            wfi.sendLog('completor',"too many completion this round, cannot force complete")

        ## do it once only for testing
        #break

    if set_force_complete:
        sendLog('completor','The followings were set force-complete \n%s'%('\n'.join(set_force_complete)))
        #sendEmail('set force-complete', 'The followings were set force-complete \n%s'%('\n'.join(set_force_complete)))

    # persist the accumulated checkpoints and the long-lasting report
    open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text="These have been running for long"

    open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))

    for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days"% (wf, delay)
        if 'completion' in info:
            text += " %d%%"%( info['completion']*100 )

    #if wfs_no_location_in_GQ:
    #    sendEmail('workflow with no location in GQ',"there won't be able to run anytime soon\n%s"%( '\n'.join(wfs_no_location_in_GQ)))

    #sendEmail("long lasting workflow",text)
    ## you can check the log
    print text
def completor(url, specific): use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if use_mcm: mcm = McMClient(dev=False) CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() wfs = [] wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() ) ## just take it in random order so that not always the same is seen random.shuffle( wfs ) max_per_round = UC.get('max_per_round').get('completor',None) if max_per_round and not specific: wfs = wfs[:max_per_round] all_stuck = set() ## take into account what stagor was saying all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() )) ## take into account the block that needed to be repositioned recently all_stuck.update( [b.split('#')[0] for b in json.loads( open('%s/missing_blocks.json'%monitor_dir).read()) ] ) ## take into account all stuck block and dataset from transfer team all_stuck.update( getAllStuckDataset()) ## by workflow a list of fraction / timestamps completions = json.loads( open('%s/completions.json'%monitor_dir).read()) good_fractions = {} truncate_fractions = {} timeout = {} for c in CI.campaigns: if 'force-complete' in CI.campaigns[c]: good_fractions[c] = CI.campaigns[c]['force-complete'] if 'truncate-complete' in CI.campaigns[c]: truncate_fractions[c] = CI.campaigns[c]['truncate-complete'] if 'force-timeout' in CI.campaigns[c]: timeout[c] = CI.campaigns[c]['force-timeout'] long_lasting = {} overrides = getForceCompletes() if use_mcm: ## add all workflow that mcm wants to get force completed mcm_force = mcm.get('/restapi/requests/forcecomplete') ## assuming this will be a list of actual prepids overrides['mcm'] = mcm_force print "can force complete on" print json.dumps( good_fractions ,indent=2) print "can truncate complete on" print json.dumps( truncate_fractions ,indent=2) print "can overide on" print 
json.dumps( overrides, indent=2) max_force = UC.get("max_force_complete") #wfs_no_location_in_GQ = set() #block_locations = defaultdict(lambda : defaultdict(list)) #wfs_no_location_in_GQ = defaultdict(list) set_force_complete = set() for wfo in wfs: if specific and not specific in wfo.name: continue print "looking at",wfo.name ## get all of the same wfi = workflowInfo(url, wfo.name) pids = wfi.getPrepIDs() skip=False if not any([c in wfo.name for c in good_fractions]) and not any([c in wfo.name for c in truncate_fractions]): skip=True for user,spec in overrides.items(): if wfi.request['RequestStatus']!='force-complete': if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec): wfi = workflowInfo(url, wfo.name) forceComplete(url , wfi ) skip=True wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False) wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name)) break if wfo.status.startswith('assistance'): skip = True if skip: continue priority = wfi.request['RequestPriority'] if not 'Campaign' in wfi.request: continue if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue if wfi.request['RequestStatus'] in ['running-open','running-closed'] and (random.random() < 0.01): try: parse_one(url, wfo.name ) except Exception as e: print "failed error parsing" print str(e) c = wfi.request['Campaign'] #if not c in good_fractions: continue good_fraction = good_fractions.get(c,1000.) truncate_fraction = truncate_fractions.get(c,1000.) print "force at",good_fraction,"truncate at",truncate_fraction ignore_fraction = 2. 
lumi_expected = None event_expected = None if not 'TotalInputEvents' in wfi.request: if 'RequestNumEvents' in wfi.request: event_expected = wfi.request['RequestNumEvents'] else: print "truncated, cannot do anything" continue else: lumi_expected = wfi.request['TotalInputLumis'] event_expected = wfi.request['TotalInputEvents'] now = time.mktime(time.gmtime()) / (60*60*24.) running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition']) if not running_log: print "\tHas no running log" # cannot figure out when the thing started running continue then = running_log[-1]['UpdateTime'] / (60.*60.*24.) delay = now - then ## in days ## this is supposed to be the very initial request date, inherited from clones try: injected_on =time.mktime(time.strptime("-".join(map(lambda n : "%02d"%n, wfi.request['RequestDate'])), "%Y-%m-%d-%H-%M-%S")) / (60.*60.*24.) injection_delay = now - injected_on except: injection_delay = None (w,d) = divmod(delay, 7 ) print "\t"*int(w)+"Running since",delay,"[days] priority=",priority if injection_delay!=None and injection_delay > 50 and False: tail_cutting_priority = 200000 ## bump up to 200k print "Injected since",injection_delay,"[days] priority=",priority if wfi.request['RequestPriority'] != tail_cutting_priority: wfi.sendLog('completor','bumping priority to %d for being injected since'% tail_cutting_priority) reqMgrClient.changePriorityWorkflow(url, wfo.name, tail_cutting_priority) _,prim,_,_ = wfi.getIO() is_stuck = all_stuck & prim if is_stuck: wfi.sendLog('completor','%s is stuck'%','.join(is_stuck)) monitor_delay = 7 allowed_delay = 14 if c in timeout: allowed_delay = timeout[c] monitor_delay = min(monitor_delay, allowed_delay) ### just skip if too early if delay <= monitor_delay: continue long_lasting[wfo.name] = { "delay" : delay, "injection_delay" : injection_delay } percent_completions = {} for output in wfi.request['OutputDatasets']: if "/DQM" in output: continue ## that does 
not count if not output in completions: completions[output] = { 'injected' : None, 'checkpoints' : [], 'workflow' : wfo.name} ## get completion fraction event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) lumi_completion=0. event_completion=0. if lumi_expected: lumi_completion = lumi_count / float( lumi_expected ) if event_expected: event_completion = event_count / float( event_expected ) #take the less optimistic ## taking the max or not : to force-complete on over-pro percent_completions[output] = min( lumi_completion, event_completion ) completions[output]['checkpoints'].append( (now, event_completion ) ) if all([percent_completions[out] >= good_fraction for out in percent_completions]): wfi.sendLog('completor', "all is above %s \n%s"%( good_fraction, json.dumps( percent_completions, indent=2 ) )) elif is_stuck and (all([percent_completions[out] >= truncate_fraction for out in percent_completions])): wfi.sendLog('completor', "all is above %s truncation level, and the input is stuck\n%s"%( truncate_fraction, json.dumps( percent_completions, indent=2 ) ) ) sendEmail('workflow for truncation','check on %s'%wfo.name) ##continue ## because you do not want to get this one right now else: long_lasting[wfo.name].update({ 'completion': sum(percent_completions.values()) / len(percent_completions), 'completions' : percent_completions }) ## do something about the agents this workflow is in long_lasting[wfo.name]['agents'] = wfi.getAgents() wfi.sendLog('completor', "%s not over bound %.3f (complete) %.3f truncate \n%s"%(percent_completions.values(), good_fraction, truncate_fraction, json.dumps( long_lasting[wfo.name]['agents'], indent=2) )) continue if all([percent_completions[out] >= ignore_fraction for out in percent_completions]): print "all is done, just wait a bit" continue for output in percent_completions: completions[output]['injected'] = then #further check on delays cpuh = wfi.getComputingTime(unit='d') ran_at = wfi.request['SiteWhitelist'] 
wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay)) ##### WILL FORCE COMPLETE BELOW # only really force complete after n days if delay <= allowed_delay: continue ## find ACDCs that might be running if max_force>0: forceComplete(url, wfi ) set_force_complete.add( wfo.name ) print "going for force-complete of",wfo.name wfi.sendLog('completor','going for force completing') wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name) max_force -=1 else: wfi.sendLog('completor',"too many completion this round, cannot force complete") ## do it once only for testing #break if delay >= 40: print wfo.name,"has been running for",dealy,"days" ## bumping up the priority? if set_force_complete: sendLog('completor','The followings were set force-complete \n%s'%('\n'.join(set_force_complete))) open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2)) text="These have been running for long" open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 )) for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True): delay = info['delay'] text += "\n %s : %s days"% (wf, delay) if 'completion' in info: text += " %d%%"%( info['completion']*100 ) print text
def checkor(url, spec=None, options=None): fDB = closeoutInfo() if userLock(): return if duplicateLock(): return UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=["mcm"]) if not up.check(): return use_mcm = up.status["mcm"] wfs = [] if options.fetch: ## get all in running and check wfs.extend(session.query(Workflow).filter(Workflow.status == "away").all()) wfs.extend(session.query(Workflow).filter(Workflow.status == "assistance").all()) if options.nofetch: ## than get all in need for assistance wfs.extend(session.query(Workflow).filter(Workflow.status.startswith("assistance-")).all()) custodials = defaultdict(list) # sites : dataset list transfers = defaultdict(list) # sites : dataset list invalidations = [] # a list of files SI = global_SI CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split("/")[2].split("-")[0] except: if "Campaign" in wfi.request: campaign = wfi.request["Campaign"] return campaign by_passes = [] holdings = [] for bypassor, email in [ ("jbadillo", "*****@*****.**"), ("vlimant", "*****@*****.**"), ("jen_a", "*****@*****.**"), ]: bypass_file = "/afs/cern.ch/user/%s/%s/public/ops/bypass.json" % (bypassor[0], bypassor) if not os.path.isfile(bypass_file): print "no file", bypass_file continue try: by_passes.extend(json.loads(open(bypass_file).read())) except: print "cannot get by-passes from", bypass_file, "for", bypassor sendEmail("malformated by-pass information", "%s is not json readable" % (bypass_file), destination=[email]) holding_file = "/afs/cern.ch/user/%s/%s/public/ops/onhold.json" % (bypassor[0], bypassor) if not os.path.isfile(holding_file): print "no file", holding_file continue try: holdings.extend(json.loads(open(holding_file).read())) except: print "cannot get holdings from", holding_file, "for", bypassor sendEmail( "malformated by-pass information", "%s is not json readable" % (holding_file), destination=[email] ) total_running_time = 
5.0 * 60.0 sleep_time = max(0.5, total_running_time / len(wfs)) for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep(sleep_time) print "checking on", wfo.name ## get info wfi = workflowInfo(url, wfo.name) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request["RequestStatus"] if wfo.wm_status == "closed-out": ## manually closed-out print wfo.name, "is already", wfo.wm_status wfo.status = "close" session.commit() continue elif wfo.wm_status in [ "failed", "aborted", "aborted-archived", "rejected", "rejected-archived", "aborted-completed", ]: ## went into trouble wfo.status = "trouble" print wfo.name, "is in trouble", wfo.wm_status session.commit() continue elif wfo.wm_status in ["assigned", "acquired"]: ## not worth checking yet print wfo.name, "not running yet" session.commit() continue if "-onhold" in wfo.status: if wfo.name in holdings and wfo.name not in by_passes: print wfo.name, "on hold" continue if wfo.name in holdings and wfo.name not in by_passes: wfo.status = "assistance-onhold" print "setting", wfo.name, "on hold" session.commit() continue if wfo.wm_status != "completed" and not wfo.name in by_passes: ## for sure move on with closeout check if in completed print "no need to check on", wfo.name, "in status", wfo.wm_status session.commit() continue session.commit() sub_assistance = "" # if that string is filled, there will be need for manual assistance is_closing = True ## get it from somewhere by_pass_checks = False if wfo.name in by_passes: print "we can bypass checks on", wfo.name by_pass_checks = True for bypass in by_passes: if bypass in wfo.name: print "we can bypass", wfo.name, "because of keyword", bypass by_pass_checks = True break if not CI.go(wfi.request["Campaign"]) and not by_pass_checks: print "No go for", wfo.name continue # tuck out DQMIO/DQM wfi.request["OutputDatasets"] = [out for out in wfi.request["OutputDatasets"] if not "/DQM" in out] ## anything running on 
acdc familly = getWorkflowById(url, wfi.request["PrepID"], details=True) acdc = [] acdc_inactive = [] has_recovery_going = False had_any_recovery = False for member in familly: if member["RequestType"] != "Resubmission": continue if member["RequestName"] == wfo.name: continue if member["RequestDate"] < wfi.request["RequestDate"]: continue if member["RequestStatus"] in [ "running-open", "running-closed", "assignment-approved", "assigned", "acquired", ]: print wfo.name, "still has an ACDC running", member["RequestName"] acdc.append(member["RequestName"]) # print json.dumps(member,indent=2) ## hook for just waiting ... is_closing = False has_recovery_going = True elif member["RequestStatus"] == None: print member["RequestName"], "is not real" pass else: acdc_inactive.append(member["RequestName"]) had_any_recovery = True ## completion check percent_completions = {} # print "let's see who is crashing", wfo.name # print wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if not "TotalInputEvents" in wfi.request: event_expected, lumi_expected = 0, 0 if not "recovery" in wfo.status: sendEmail( "missing member of the request", "TotalInputEvents is missing from the workload of %s" % wfo.name, destination=["*****@*****.**"], ) else: event_expected, lumi_expected = wfi.request["TotalInputEvents"], wfi.request["TotalInputLumis"] if "RequestNumEvents" in wfi.request: event_expected = int(wfi.request["RequestNumEvents"]) elif "Task1" in wfi.request and "RequestNumEvents" in wfi.request["Task1"]: event_expected = int(wfi.request["Task1"]["RequestNumEvents"]) fractions_pass = {} for output in wfi.request["OutputDatasets"]: event_count, lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0.0 if lumi_expected: percent_completions[output] = lumi_count / float(lumi_expected) if event_expected: percent_completions[output] = max(percent_completions[output], event_count / float(event_expected)) fractions_pass[output] = 0.95 c = 
get_campaign(output, wfi) if c in CI.campaigns and "fractionpass" in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]["fractionpass"] print "overriding fraction to", fractions_pass[output], "for", output if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to", fractions_pass[output], "by command line for", output if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): print wfo.name, "is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? if has_recovery_going: sub_assistance += "-recovering" elif had_any_recovery: ## we want to have this looked at sub_assistance += "-manual" else: sub_assistance += "-recovery" is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request["OutputDatasets"]: events_per_lumi[output] = getDatasetEventsPerLumi(output) lumi_upper_limit = {} for output in wfi.request["OutputDatasets"]: upper_limit = 301.0 campaign = get_campaign(output, wfi) # if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and "lumisize" in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]["lumisize"] print "overriding the upper lumi size to", upper_limit, "for", campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to", upper_limit, "by command line" lumi_upper_limit[output] = upper_limit if any([events_per_lumi[out] >= lumi_upper_limit[out] for out in events_per_lumi]): print wfo.name, "has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? 
sub_assistance += "-biglumi" is_closing = False any_presence = {} for output in wfi.request["OutputDatasets"]: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request["OutputDatasets"]: custodial_presences[output] = [s for s in any_presence[output] if "MSS" in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence = {} for output in wfi.request["OutputDatasets"]: phedex_presence[output] = phedexClient.getFileCountDataset(url, output) vetoed_custodial_tier = UC.get("tiers_with_no_custodial") out_worth_checking = [ out for out in custodial_locations.keys() if out.split("/")[-1] not in vetoed_custodial_tier ] size_worth_checking = sum( [getDatasetSize(out) / 1023.0 for out in out_worth_checking] ) ## size in TBs of all outputs if not all(map(lambda sites: len(sites) != 0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name, "has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? 
custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:", custodial, "because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = get_campaign(output, wfi) if campaign in CI.campaigns and "custodial" in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]["custodial"] print "Setting custodial to", custodial, "from campaign configuration" break if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the unified configuration custodial:", custodial, "because of limited space" custodial = None ## get from the parent pick_custodial = True if not custodial and "InputDataset" in wfi.request: ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite(wfi.request["InputDataset"]) ###parents_custodial = findCustodialLocation(url, wfi.request['InputDataset']) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset", wfi.request[ "InputDataset" ], "does not have custodial in the first place. 
abort" sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor" % wfi.request["InputDataset"], ) is_closing = False pick_custodial = False if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:", custodial, "because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for", wfo.name sendEmail( "cannot find a custodial", "cannot find a custodial for %s probably because of the total output size %d" % (wfo.name, size_worth_checking), ) if custodial and ((not sub_assistance and not acdc) or by_pass_checks): ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output] >= 1: custodials[custodial].append(output) else: print "no file in phedex for", output, " not good to add to custodial requests" is_closing = False ## disk copy disk_copies = {} for output in wfi.request["OutputDatasets"]: disk_copies[output] = [s for s in any_presence[output] if (not "MSS" in s) and (not "Buffer" in s)] if not all(map(lambda sites: len(sites) != 0, disk_copies.values())): print wfo.name, "has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request["OutputDatasets"]: dbs_presence[output] = dbs3Client.getFileCountDataset(output) dbs_invalid[output] = dbs3Client.getFileCountDataset(output, onlyInvalid=True) fraction_invalid = 0.01 if ( not all( [ dbs_presence[out] == (dbs_invalid[out] + phedex_presence[out]) for out in wfi.request["OutputDatasets"] ] ) and not options.ignorefiles ): print wfo.name, "has a dbs,phedex mismatch" print 
json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## hook for just waiting ... is_closing = False if ( not all( [ (dbs_invalid[out] <= int(fraction_invalid * dbs_presence[out])) for out in wfi.request["OutputDatasets"] ] ) and not options.ignorefiles ): print wfo.name, "has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye sub_assistance += "-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing: print "starting duplicate checker for", wfo.name for output in wfi.request["OutputDatasets"]: print "\tchecking", output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi(output) except: try: duplications[output] = dbs3Client.duplicateRunLumi(output) except: print "was not possible to get the duplicate count for", output is_closing = False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name, "has duplicates" print json.dumps(duplications, indent=2) ## hook for making file invalidation ? 
sub_assistance += "-duplicates" is_closing = False ## for visualization later on if not wfo.name in fDB.record: # print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = {"datasets": {}, "name": wfo.name, "closeOutWorkflow": None} fDB.record[wfo.name]["closeOutWorkflow"] = is_closing for output in wfi.request["OutputDatasets"]: if not output in fDB.record[wfo.name]["datasets"]: fDB.record[wfo.name]["datasets"][output] = {} rec = fDB.record[wfo.name]["datasets"][output] rec["percentage"] = float("%.2f" % (percent_completions[output] * 100)) rec["duplicate"] = duplications[output] if output in duplications else "N/A" rec["phedexReqs"] = ( float("%.2f" % any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output]) != 0 else "N/A" ) rec["closeOutDataset"] = is_closing rec["transPerc"] = ( float("%.2f" % any_presence[output][disk_copies[output][0]][1]) if len(disk_copies[output]) != 0 else "N/A" ) rec["correctLumis"] = ( int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True ) rec["missingSubs"] = ( False if len(custodial_locations[output]) == 0 else ",".join(list(set(custodial_locations[output]))) ) rec["dbsFiles"] = dbs_presence[output] rec["dbsInvFiles"] = dbs_invalid[output] rec["phedexFiles"] = phedex_presence[output] rec["acdc"] = "%d / %d" % (len(acdc), len(acdc + acdc_inactive)) if by_pass_checks: ## force closing is_closing = True ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting", wfo.name, "closed-out" if not options.test: if wfo.wm_status in ["closed-out", "announced", "normal-archived"]: print wfo.name, "is already", wfo.wm_status, "not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer", res if not res in ["None", None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None, 
"None"]: wfo.status = "close" session.commit() else: print "could not close out", wfo.name, "will try again next time" else: ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it new_status = "assistance" + sub_assistance print wfo.name, "needs assistance with", new_status if sub_assistance and wfo.status != new_status and "PrepID" in wfi.request and not "manual" in wfo.status: pid = wfi.getPrepIDs()[0].replace("task_", "") # pid = wfi.request['PrepID'].replace('task_','') ## notify messages = { "recovery": "Samples completed with missing statistics:\n%s " % ( "\n".join( [ "%.2f %% complete for %s" % (percent_completions[output] * 100, output) for output in wfi.request["OutputDatasets"] ] ) ), "biglumi": "Samples completed with large luminosity blocks:\n%s " % ( "\n".join( [ "%d > %d for %s" % (events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request["OutputDatasets"] if (events_per_lumi[output] > lumi_upper_limit[output]) ] ) ), "duplicate": "Samples completed with duplicated luminosity blocks:\n%s" % ( "\n".join( [ "%s" % output for output in wfi.request["OutputDatasets"] if output in duplications and duplications[output] ] ) ), } text = "The request %s (%s) is facing issue in production.\n" % (pid, wfo.name) content = "" for case in messages: if case in new_status: content += "\n" + messages[case] + "\n" text += content text += "You are invited to check, while this is being taken care of by Ops.\n" text += "This is an automated message." 
if use_mcm and content: print "Sending notification back to requestor" print text batches = mcm.getA("batches", query="contains=%s&status=announced" % pid) if len(batches): ## go notify the batch bid = batches[-1]["prepid"] print "batch nofication to", bid mcm.put("/restapi/batches/notify", {"notes": text, "prepid": bid}) ## go notify the request print "request notification to", pid mcm.put("/restapi/requests/notify", {"message": text, "prepids": [pid]}) ## case where the workflow was in manual from recoveror if not "manual" in wfo.status or new_status != "assistance-recovery": wfo.status = new_status if not options.test: print "setting", wfo.name, "to", wfo.status session.commit() else: print "current status is", wfo.status, "not changing to anything" fDB.html() ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ",".join(custodials[site]), "=>", site if not options.test: result = makeReplicaRequest( url, site, list(set(custodials[site])), "custodial copy at production close-out", custodial="y", priority="low", approve=(site in SI.sites_auto_approve), ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ",".join(transfers[site]), "=>", site if not options.test: result = None # result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
#!/usr/bin/env python from utils import getDatasetPresence, getDatasetEventsAndLumis import json import sys url = 'cmsweb.cern.ch' ev, lumi = getDatasetEventsAndLumis(sys.argv[1]) if lumi: print ev, lumi, ev / float(lumi) presence = getDatasetPresence(url, sys.argv[1]) print json.dumps(presence, indent=2)
def closor(url, specific=None): CI = campaignInfo() ## manually closed-out workflows should get to close with checkor for wfo in session.query(Workflow).filter(Workflow.status=='close').all(): if specific and not specific in wfo.name: continue ## what is the expected #lumis wl = getWorkLoad(url, wfo.name) wfo.wm_status = wl['RequestStatus'] if wl['RequestStatus'] in ['announced','normal-archived']: ## manually announced ?? wfo.status = 'done' wfo.wm_status = wl['RequestStatus'] print wfo.name,"is done already",wfo.wm_status session.commit() if not 'TotalInputLumis' in wl: print wfo.name,"has not been assigned yet" continue expected_lumis = wl['TotalInputLumis'] ## what are the outputs outputs = wl['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = [] #print outputs if len(outputs): print wfo.name,wl['RequestStatus'] for out in outputs: event_count,lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter(Output.datasetname==out).first() if not odb: print "adding an output object",out odb = Output( datasetname = out ) odb.workflow = wfo session.add( odb ) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() print "\t%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,lumi_count/float(expected_lumis)*100.) 
#print wfo.fraction_for_closing, lumi_count, expected_lumis fraction = wfo.fraction_for_closing fraction = 0.0 all_OK.append((float(lumi_count) > float(expected_lumis*fraction))) ## only that status can let me go into announced if wl['RequestStatus'] in ['closed-out']: print wfo.name,"to be announced" results=[]#'dummy'] if not results: results.append(reqMgrClient.announceWorkflowCascade(url, wfo.name)) for (io,out) in enumerate(outputs): if all_OK[io]: results.append(setDatasetStatusDBS3.setStatusDBS3('https://cmsweb.cern.ch/dbs/prod/global/DBSWriter', out, 'VALID' ,'')) tier = out.split('/')[-1] to_DDM = (wl['RequestType'] == 'ReDigi' and not ('DQM' in tier)) campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wl and wl['Campaign']: campaign = wl['Campaign'] if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']: to_DDM = True ## inject to DDM everything from ReDigi if to_DDM: print "Sending",out," to DDM" subprocess.call(['python','assignDatasetToSite.py','--dataset='+out,'--exec']) else: print wfo.name,"no stats for announcing",out results.append(None) #print results if all(map(lambda result : result in ['None',None],results)): wfo.status = 'done' session.commit() print wfo.name,"is announced" else: print "ERROR with ",wfo.name,"to be announced",json.dumps( results ) else: print wfo.name,"not good for announcing:",wl['RequestStatus']
def closor(url, specific=None, options=None):
    """Close out and announce completed workflows (dynamo-injection variant).

    Walks workflows in local status 'close' (or, with ``options.announce``,
    those tagged 'announce*' but not yet 'announced'), records per-output
    statistics, checks each output has at least one full site copy, spawns
    DQM harvesting if needed, then announces the workflow, validates its
    datasets in DBS, and hands them to dynamo/DDM. Relval workflows are
    handled per release-validation batch and trigger a summary email.

    :param url: request manager host
    :param specific: optional substring filter on workflow names
    :param options: parsed CLI options (announce, limit, force, no_harvest, test)
    """
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return
    UC = unifiedConfiguration()
    CI = campaignInfo()
    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

    # "jump the line": announce while the request is still completing
    jump_the_line = options.announce if options else False
    if jump_the_line:
        print "announce option is on. Checking on things on-going ready to be announced"
        wfs = session.query(Workflow).filter(
            Workflow.status.contains('announce')).filter(
                sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        print "regular option. Checking on things done and to be announced"
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    wfs_n = [w.name for w in wfs]
    print "unique names?"
    print len(set(wfs_n)) == len(wfs_n)

    held = set()

    print len(wfs), "closing"
    random.shuffle(wfs)
    # cap the number of workflows treated per pass
    # NOTE(review): options is dereferenced without a None check here, unlike
    # the jump_the_line line above — confirm closor is never called with options=None.
    max_per_round = UC.get('max_per_round').get('closor', None)
    if options.limit: max_per_round = options.limit
    if max_per_round:
        ## order them by priority
        all_closedout = sorted(getWorkflows(url, 'closed-out', details=True),
                               key=lambda r: r['RequestPriority'])
        all_closedout = [r['RequestName'] for r in all_closedout]
        def rank(wfn):
            # unknown workflows rank lowest (0)
            return all_closedout.index(wfn) if wfn in all_closedout else 0
        wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True)
        wfs = wfs[:max_per_round]

    batch_go = {}                       # relval batch name -> can the batch be announced
    batch_warnings = defaultdict(set)   # relval batch name -> low-completion lines
    batch_goodness = UC.get("batch_goodness")

    for iwfo, wfo in enumerate(wfs):
        if specific and not specific in wfo.name: continue
        print "Progress [%d/%d]" % (iwfo, len(wfs))

        ## what is the expected #lumis
        wfi = workflowInfo(url, wfo.name)
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.isRelval():
            has_batch_go = False
            batch_name = wfi.getCampaign()
            if not batch_name in batch_go:
                ## do the esimatation whethere this can be announced : only once per batch
                in_batches = getWorkflowByCampaign(url, batch_name, details=True)
                batch_go[batch_name] = all(
                    map(
                        lambda s: not s in [
                            'completed', 'running-open', 'running-closed',
                            'acquired', 'assigned', 'assignment-approved'
                        ], [r['RequestStatus'] for r in in_batches]))
            ## already verified
            has_batch_go = batch_go[batch_name]
            if not has_batch_go:
                wfi.sendLog(
                    'closor',
                    'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close'
                    % (batch_name, batch_name))
                continue

        if wfi.request['RequestStatus'] in ['announced', 'normal-archived'
                                            ] and not options.force:
            ## manually announced ??
            # NOTE(review): no `continue` — the workflow still falls through to
            # the per-output processing below; confirm intended.
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog(
                'closor',
                '%s is announced already : %s' % (wfo.name, wfo.wm_status))
            session.commit()

        if jump_the_line:
            wfi.sendLog('closor', 'Announcing while completing')

        # expected lumis default to 1 so the percentage math never divides by zero
        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name, "has not been assigned yet, or the database is corrupted"
        elif wfi.request['TotalInputLumis'] == 0:
            print wfo.name, "is corrupted with 0 expected lumis"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda: False)
        stats = defaultdict(int)
        #print outputs
        if len(outputs):
            print wfo.name, wfi.request['RequestStatus']
        for out in outputs:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=out)
            # upsert the Output record for this dataset
            odb = session.query(Output).filter(
                Output.datasetname == out).first()
            if not odb:
                print "adding an output object", out
                odb = Output(datasetname=out)
                odb.workflow = wfo
                session.add(odb)
            odb.nlumis = lumi_count
            odb.nevents = event_count
            # NOTE(review): 'workfow_id' looks like a typo for 'workflow_id';
            # verify against the Output model's column name before changing.
            odb.workfow_id = wfo.id
            # keep the larger of stored vs freshly computed expected lumis
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()
            fraction = lumi_count / float(expected_lumis) * 100.
            completion_line = "%60s %d/%d = %3.2f%%" % (
                out, lumi_count, expected_lumis, fraction)
            wfi.sendLog('closor', "\t%s" % completion_line)
            # relval outputs below the goodness threshold are flagged in the batch email
            if wfi.isRelval() and fraction < batch_goodness:
                batch_warnings[wfi.getCampaign()].add(completion_line)
            stats[out] = lumi_count
            all_OK[out] = True

        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence(url, out)
            # info[0] is truthy when the site holds a full copy
            where = [site for site, info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out, "is in full at", ",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:
                going_to = wfi.request['NonCustodialSites'] + wfi.request[
                    'CustodialSites']
                wfi.sendLog(
                    'closor', "%s is not in full anywhere. send to %s" %
                    (out, ",".join(sorted(going_to))))
                at_destination = dict([(k, v) for (k, v) in presence.items()
                                       if k in going_to])
                else_where = dict([(k, v) for (k, v) in presence.items()
                                   if not k in going_to])
                print json.dumps(at_destination)
                print json.dumps(else_where, indent=2)
                ## do the full stuck transfer study, missing files and shit !
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to=there)
                    for l in late_info:
                        l.update({"workflow": wfo.name, "dataset": out})
                    all_late_files.extend(late_info)
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

        ## verify if we have to do harvesting
        if not options.no_harvest and not jump_the_line:
            (OK, requests) = spawn_harvesting(url, wfi, in_full)
            all_OK.update(OK)

        ## only that status can let me go into announced
        if all(all_OK.values()) and (
            (wfi.request['RequestStatus'] in ['closed-out'])
                or options.force or jump_the_line):
            print wfo.name, "to be announced"
            results = []
            if not results:
                for out in outputs:
                    # skip outputs with zero collected lumis
                    if out in stats and not stats[out]: continue
                    _, dsn, process_string, tier = out.split('/')
                    if all_OK[out]:
                        results.append(setDatasetStatus(out, 'VALID'))
                    if all_OK[out] and wfi.isRelval():
                        ## make the specific relval rules and the replicas
                        ## figure the destination(s) out
                        destinations = set()
                        if tier != "RECO" and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        if tier == "GEN-SIM":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-DIGI-RAW":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-RECO":
                            destinations.add('T1_US_FNAL_Disk')
                        if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        if destinations:
                            wfi.sendLog(
                                'closor', '%s to go to %s' %
                                (out, ', '.join(sorted(destinations))))
                        ## call to makereplicarequest under relval => done
                        for site in destinations:
                            result = makeReplicaRequest(
                                url,
                                site, [out],
                                'Copy for release validation consumption',
                                priority='normal',
                                approve=True,
                                mail=False,
                                group='RelVal')
                            try:
                                request_id = result['phedex'][
                                    'request_created'][0]['id']
                                results.append(True)
                            except:
                                results.append('Failed relval transfer')
                    elif all_OK[out]:
                        # non-relval output: decide DDM routing
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request[
                                    'Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[
                                campaign] and tier in CI.campaigns[campaign][
                                    'toDDM']:
                            to_DDM = True
                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM") + UC.get(
                                "tiers_to_DDM"):
                            print "tier", tier, "neither TO or NO DDM for", out
                            results.append('Not recognitized tier %s' % tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog(
                                'closor',
                                "could not recognize %s for injecting in DDM"
                                % out,
                                level='critical')
                            continue
                        n_copies = 1
                        destinations = []
                        # campaign can tune #copies and destination hosts
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[
                                campaign]:
                            ddm_instructions = CI.campaigns[campaign][
                                'DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier, indication in ddm_instructions.items(
                                ):
                                    if ddmtier == tier or ddmtier in [
                                            '*', 'all'
                                    ]:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']
                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination=" + ",".join(
                                destinations)
                        group_spec = ""  ## not used yet
                        ### should make this a campaign configuration
                        ## inject to DDM when necessary
                        if to_DDM:
                            print "Sending", out, " to DDM"
                            status = pass_to_dynamo(
                                [out],
                                N=n_copies,
                                sites=destinations if destinations else None,
                                group=group_spec if group_spec else None)
                            results.append(status)
                            if status == True:
                                wfi.sendLog(
                                    'closor',
                                    '%s is send to dynamo in %s copies %s %s'
                                    % (out, n_copies, sorted(destinations),
                                       group_spec))
                            else:
                                sendLog('closor',
                                        "could not add " + out +
                                        " to dynamo pool. check closor logs.",
                                        level='critical')
                                wfi.sendLog(
                                    'closor', "could not add " + out +
                                    " to dynamo pool. check closor logs.")
                    else:
                        print wfo.name, "no stats for announcing", out
                        results.append('No Stats')

                # success so far means every step returned None/'None'/True
                if all(
                        map(lambda result: result in ['None', None, True],
                            results)):
                    if not jump_the_line:
                        ## only announce if all previous are fine
                        res = reqMgrClient.announceWorkflowCascade(
                            url, wfo.name)
                        if not res in ['None', None]:
                            ## check the status again, it might well have toggled
                            wl_bis = workflowInfo(url, wfo.name)
                            wfo.wm_status = wl_bis.request['RequestStatus']
                            session.commit()
                            if wl_bis.request['RequestStatus'] in [
                                    'announced', 'normal-archived'
                            ]:
                                res = None
                            else:
                                ## retry ?
                                res = reqMgrClient.announceWorkflowCascade(
                                    url, wfo.name)
                        results.append(res)

            #print results
            if all(map(lambda result: result in ['None', None, True],
                       results)):
                if jump_the_line:
                    if not 'announced' in wfo.status:
                        wfo.status = wfo.status.replace(
                            'announce', 'announced')
                else:
                    wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor', "workflow outputs are announced")
            else:
                wfi.sendLog(
                    'closor', "Error with %s to be announced \n%s" %
                    (wfo.name, json.dumps(results)))

        elif wfi.request['RequestStatus'] in [
                'failed', 'aborted', 'aborted-archived', 'rejected',
                'rejected-archived', 'aborted-completed'
        ]:
            if wfi.isRelval():
                # relval failures are forgotten, not troubled: a replacement
                # workflow is expected instead
                wfo.status = 'forget'
                wfo.wm_status = wfi.request['RequestStatus']
                wfi.sendLog(
                    'closor',
                    "%s is %s, but will not be set in trouble to find a replacement."
                    % (wfo.name, wfo.wm_status))
            else:
                wfo.status = 'trouble'
                wfo.wm_status = wfi.request['RequestStatus']
            session.commit()
        else:
            print wfo.name, "not good for announcing:", wfi.request[
                'RequestStatus']
            wfi.sendLog('closor', "cannot be announced")
            held.add(wfo.name)

    # ---- end of per-workflow loop: report stuck files and held workflows ----
    days_late = 0.
    retries_late = 10

    really_late_files = [
        info for info in all_late_files if info['retries'] >= retries_late
    ]
    really_late_files = [
        info for info in really_late_files
        if info['delay'] / (60 * 60 * 24.) >= days_late
    ]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (
            len(really_late_files), days_late, retries_late,
            json.dumps(really_late_files, indent=2))
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor', subject)
        print subject
        open('%s/stuck_files.json' % monitor_dir,
             'w').write(json.dumps(really_late_files, indent=2))

    if held:
        sendLog('closor',
                "the workflows below are held up \n%s" %
                ("\n".join(sorted(held))),
                level='critical')

    #batches = json.loads(open('batches.json').read())
    # per-batch relval announcement email
    for bname, go in batch_go.items():
        if go:
            subject = "Release Validation Samples Batch %s" % bname
            issues = ""
            if batch_warnings[bname]:
                issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness
                issues += "\n".join(sorted(batch_warnings[bname]))
                issues += "\n\n"
            text = """
Dear all,

a batch of release validation workflows has finished.

Batch ID:

%s

Detail of the workflows

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

%s
This is an automated message.
""" % (bname, bname, issues)
            to = ['*****@*****.**']
            sendEmail(subject, text, destination=to)
def closor(url, specific=None, options=None):
    """Close out and announce completed workflows (os.popen/DDM variant).

    Same overall flow as the dynamo variant: walk workflows in local status
    'close' (or 'announce*' with ``options.announce``), record per-output
    statistics, require a full site copy, spawn harvesting, announce and
    validate datasets, then inject into the AnalysisOps DDM pool by shelling
    out to ``assignDatasetToSite.py`` (with one retry).

    :param url: request manager host
    :param specific: optional substring filter on workflow names
    :param options: parsed CLI options (announce, limit, force, no_harvest)
    """
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return
    UC = unifiedConfiguration()
    CI = campaignInfo()
    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

    # "jump the line": announce while the request is still completing
    jump_the_line = options.announce if options else False
    if jump_the_line:
        wfs = session.query(Workflow).filter(Workflow.status.contains('announce')).filter(sqlalchemy.not_(Workflow.status.contains('announced'))).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status=='close').all()

    held = set()

    print len(wfs),"closing"
    # cap the number of workflows treated per pass (random subset)
    max_per_round = UC.get('max_per_round').get('closor',None)
    if options.limit: max_per_round = options.limit
    random.shuffle( wfs )
    if max_per_round: wfs = wfs[:max_per_round]

    batch_go = {}                       # relval batch name -> can the batch be announced
    batch_warnings = defaultdict(set)   # relval batch name -> low-completion lines
    batch_goodness = UC.get("batch_goodness")

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis
        wfi = workflowInfo(url, wfo.name )
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.isRelval():
            has_batch_go = False
            batch_name = wfi.getCampaign()
            if not batch_name in batch_go:
                ## do the esimatation whethere this can be announced : only once per batch
                in_batches = getWorkflowByCampaign(url , batch_name, details=True)
                batch_go[ batch_name ] = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [r['RequestStatus'] for r in in_batches]))
            ## already verified
            has_batch_go = batch_go[batch_name]
            if not has_batch_go:
                wfi.sendLog('closor', 'Cannot close for now because the batch %s is not all close'% batch_name)
                continue

        if wfi.request['RequestStatus'] in ['announced','normal-archived'] and not options.force:
            ## manually announced ??
            # NOTE(review): no `continue` — the workflow still falls through to
            # the per-output processing below; confirm intended.
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,wfo.wm_status))
            session.commit()

        if jump_the_line:
            wfi.sendLog('closor','Announcing while completing')

        # expected lumis default to 1 so the percentage math never divides by zero
        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name,"has not been assigned yet, or the database is corrupted"
        elif wfi.request['TotalInputLumis']==0:
            print wfo.name,"is corrupted with 0 expected lumis"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda : False)
        stats = defaultdict(int)
        #print outputs
        if len(outputs):
            print wfo.name,wfi.request['RequestStatus']
        for out in outputs:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=out)
            # upsert the Output record for this dataset
            odb = session.query(Output).filter(Output.datasetname==out).first()
            if not odb:
                print "adding an output object",out
                odb = Output( datasetname = out )
                odb.workflow = wfo
                session.add( odb )
            odb.nlumis = lumi_count
            odb.nevents = event_count
            # NOTE(review): 'workfow_id' looks like a typo for 'workflow_id';
            # verify against the Output model's column name before changing.
            odb.workfow_id = wfo.id
            # keep the larger of stored vs freshly computed expected lumis
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()
            fraction = lumi_count/float(expected_lumis)*100.
            completion_line = "%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,fraction)
            wfi.sendLog('closor',"\t%s"% completion_line)
            # relval outputs below the goodness threshold are flagged in the batch email
            if wfi.isRelval() and fraction < batch_goodness:
                batch_warnings[ wfi.getCampaign()].add( completion_line )
            stats[out] = lumi_count
            all_OK[out] = True

        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence( url, out )
            # info[0] is truthy when the site holds a full copy
            where = [site for site,info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out,"is in full at",",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:
                going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites']
                wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to))))
                at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to])
                else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to])
                print json.dumps( at_destination )
                print json.dumps( else_where, indent=2 )
                ## do the full stuck transfer study, missing files and shit !
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to = there )
                    for l in late_info:
                        l.update({"workflow":wfo.name,"dataset":out})
                    all_late_files.extend( late_info )
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

        ## verify if we have to do harvesting
        if not options.no_harvest and not jump_the_line:
            (OK, requests) = spawn_harvesting(url, wfi, in_full)
            all_OK.update( OK )

        ## only that status can let me go into announced
        if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line):
            print wfo.name,"to be announced"
            results=[]
            if not results:
                for out in outputs:
                    # skip outputs with zero collected lumis
                    if out in stats and not stats[out]: continue
                    _,dsn,process_string,tier = out.split('/')
                    if all_OK[out]:
                        results.append(setDatasetStatus(out, 'VALID'))
                    if all_OK[out] and wfi.isRelval():
                        ## make the specific relval rules and the replicas
                        ## figure the destination(s) out
                        destinations = set()
                        if tier != "RECO" and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        if tier == "GEN-SIM":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-DIGI-RAW":
                            destinations.add('T1_US_FNAL_Disk')
                        if tier == "GEN-SIM-RECO":
                            destinations.add('T1_US_FNAL_Disk')
                        if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO":
                            destinations.add('T2_CH_CERN')
                        if destinations:
                            wfi.sendLog('closor', '%s to go to %s'%(out, ', '.join( sorted( destinations ))))
                        ## call to makereplicarequest under relval => done
                        for site in destinations:
                            result = makeReplicaRequest(url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal')
                            try:
                                request_id = result['phedex']['request_created'][0]['id']
                                results.append( True )
                            except:
                                results.append( 'Failed relval transfer' )
                    elif all_OK[out]:
                        # non-relval output: decide DDM routing
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request['Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True
                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"):
                            print "tier",tier,"neither TO or NO DDM for",out
                            results.append('Not recognitized tier %s'%tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical')
                            continue
                        # NOTE(review): default is 2 copies here (the other closor
                        # variants use 1) — confirm which is current policy.
                        n_copies = 2
                        destinations=[]
                        # campaign can tune #copies and destination hosts
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]:
                            ddm_instructions = CI.campaigns[campaign]['DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier,indication in ddm_instructions.items():
                                    if ddmtier==tier or ddmtier in ['*','all']:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']
                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination="+",".join( destinations )
                        group_spec = "" ## not used yet
                        ### should make this a campaign configuration
                        ## inject to DDM when necessary
                        if to_DDM:
                            print "Sending",out," to DDM"
                            # os.popen().close() returns None on success, an exit
                            # status otherwise; retry once with --debug 1 on failure
                            p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 0 --exec'%(n_copies, out,destination_spec, group_spec))
                            ddm_text = p.read()
                            print ddm_text
                            status = p.close()
                            if status!=None:
                                print "Failed DDM, retrying to send",out,"a second time"
                                p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 1 --exec'%(n_copies, out,destination_spec, group_spec))
                                ddm_text = p.read()
                                print ddm_text
                                status = p.close()
                                if status!=None:
                                    #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.")
                                    sendLog('closor',"could not add "+out+" to DDM pool. check closor logs.", level='critical')
                                    if options.force: status = True
                            results.append( status )
                            if status == None:
                                wfi.sendLog('closor',ddm_text)
                                wfi.sendLog('closor','%s is send to AnalysisOps DDM pool in %s copies %s'%( out, n_copies, destination_spec))
                    else:
                        print wfo.name,"no stats for announcing",out
                        results.append('No Stats')

                # success so far means every step returned None/'None'/True
                if all(map(lambda result : result in ['None',None,True],results)):
                    if not jump_the_line:
                        ## only announce if all previous are fine
                        res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                        if not res in ['None',None]:
                            ## check the status again, it might well have toggled
                            wl_bis = workflowInfo(url, wfo.name)
                            wfo.wm_status = wl_bis.request['RequestStatus']
                            session.commit()
                            if wl_bis.request['RequestStatus'] in ['announced','normal-archived']:
                                res = None
                            else:
                                ## retry ?
                                res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                        results.append( res )

            #print results
            if all(map(lambda result : result in ['None',None,True],results)):
                if jump_the_line:
                    if not 'announced' in wfo.status:
                        wfo.status = wfo.status.replace('announce','announced')
                else:
                    wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor',"workflow outputs are announced")
            else:
                wfi.sendLog('closor',"Error with %s to be announced \n%s"%( wfo.name, json.dumps( results )))

        elif wfi.request['RequestStatus'] in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            if wfi.isRelval():
                # relval failures are forgotten, not troubled: a replacement
                # workflow is expected instead
                wfo.status = 'forget'
                wfo.wm_status = wfi.request['RequestStatus']
                wfi.sendLog('closor',"%s is %s, but will not be set in trouble to find a replacement."%( wfo.name, wfo.wm_status))
            else:
                wfo.status = 'trouble'
                wfo.wm_status = wfi.request['RequestStatus']
            session.commit()
        else:
            print wfo.name,"not good for announcing:",wfi.request['RequestStatus']
            wfi.sendLog('closor',"cannot be announced")
            held.add( wfo.name )

    # ---- end of per-workflow loop: report stuck files and held workflows ----
    days_late = 0.
    retries_late = 10

    really_late_files = [info for info in all_late_files if info['retries']>=retries_late]
    really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late]

    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) )
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor',subject)
        print subject
        open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2))

    if held:
        sendLog('closor',"the workflows below are held up \n%s"%("\n".join( sorted(held) )), level='critical')

    #batches = json.loads(open('batches.json').read())
    # per-batch relval announcement email
    for bname,go in batch_go.items():
        if go:
            subject = "Release Validation Samples Batch %s"% bname
            issues=""
            if batch_warnings[ bname ]:
                issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness
                issues+="\n".join( sorted( batch_warnings[ bname ] ))
                issues+="\n\n"
            text = """
Dear all,

a batch of release validation workflows has finished.

Batch ID:

%s

Detail of the workflows

https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s

%s
This is an automated message.
"""%( bname, bname, issues)
            to = ['*****@*****.**']
            sendEmail(subject, text, destination=to )
def close(self):
    """Close out one workflow (object-oriented refactor of ``closor``).

    Reads its working state from the instance (self.url, self.wfo,
    self.batch_go, self.jump_the_line, ...) and records its verdict back on
    the instance instead of committing to the session directly:
    ``self.to_status`` / ``self.to_wm_status`` hold the status to apply,
    ``self.outs`` collects new Output rows, ``self.closing`` is set True when
    the workflow is fully announced, and ``self.held`` / ``self.batch_warnings``
    / ``self.all_late_files`` accumulate reporting material for the caller.

    NOTE(review): this method references a module-level ``options`` object
    (options.force, options.no_harvest) rather than instance state — confirm
    it is always populated before close() runs.
    """
    # global kill-switch: presence of this file aborts all closing
    if os.path.isfile('.closor_stop'):
        print "The closing of workflows is shortened"
        return

    url = self.url
    batch_go = self.batch_go
    CI = self.CI
    UC = self.UC
    wfo = self.wfo
    jump_the_line = self.jump_the_line
    batch_goodness = self.batch_goodness
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

    ## what is the expected #lumis
    self.wfi = workflowInfo(url, wfo.name )
    wfi = self.wfi
    wfo.wm_status = wfi.request['RequestStatus']

    if wfi.isRelval():
        has_batch_go = False
        batch_name = wfi.getCampaign()
        if not batch_name in batch_go:
            ## do the esimatation whethere this can be announced : only once per batch
            in_batches = getWorkflowByCampaign(url , batch_name, details=True)
            batch_go[ batch_name ] = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [r['RequestStatus'] for r in in_batches]))
        ## already verified
        has_batch_go = batch_go[batch_name]
        if not has_batch_go:
            wfi.sendLog('closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close'%( batch_name, batch_name))
            return

    if wfi.request['RequestStatus'] in ['announced','normal-archived'] and not options.force:
        ## manually announced ??
        self.to_status = 'done'
        self.to_wm_status = wfi.request['RequestStatus']
        wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,self.to_wm_status))
        return

    if jump_the_line:
        wfi.sendLog('closor','Announcing while completing')

    # expected lumis default to 1 so the percentage math never divides by zero
    expected_lumis = 1
    if not 'TotalInputLumis' in wfi.request:
        print wfo.name,"has not been assigned yet, or the database is corrupted"
    elif wfi.request['TotalInputLumis']==0:
        print wfo.name,"is corrupted with 0 expected lumis"
    else:
        expected_lumis = wfi.request['TotalInputLumis']

    ## what are the outputs
    outputs = wfi.request['OutputDatasets']
    ## check whether the number of lumis is as expected for each
    all_OK = defaultdict(lambda : False)
    stats = defaultdict(int)
    #print outputs
    if len(outputs):
        print wfo.name,wfi.request['RequestStatus']
    for out in outputs:
        event_count,lumi_count = getDatasetEventsAndLumis(dataset=out)
        # unlike the function variants, Output rows are accumulated on the
        # instance (self.outs) for the caller to persist, not session.add'ed here
        self.outs.append( Output( datasetname = out ))
        odb = self.outs[-1]
        odb.workflow = wfo
        odb.nlumis = lumi_count
        odb.nevents = event_count
        # NOTE(review): 'workfow_id' looks like a typo for 'workflow_id';
        # verify against the Output model's column name before changing.
        odb.workfow_id = wfo.id
        # keep the larger of stored vs freshly computed expected lumis
        if odb.expectedlumis < expected_lumis:
            odb.expectedlumis = expected_lumis
        else:
            expected_lumis = odb.expectedlumis
        odb.date = time.mktime(time.gmtime())
        fraction = lumi_count/float(expected_lumis)*100.
        completion_line = "%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,fraction)
        wfi.sendLog('closor',"\t%s"% completion_line)
        # relval outputs below the goodness threshold are flagged in the batch email
        if wfi.isRelval() and fraction < batch_goodness:
            self.batch_warnings[ wfi.getCampaign()].add( completion_line )
        stats[out] = lumi_count
        all_OK[out] = True

    ## check for at least one full copy prior to moving on
    in_full = {}
    for out in outputs:
        in_full[out] = []
        presence = getDatasetPresence( url, out )
        # info[0] is truthy when the site holds a full copy
        where = [site for site,info in presence.items() if info[0]]
        if where:
            all_OK[out] = True
            print out,"is in full at",",".join(where)
            in_full[out] = copy.deepcopy(where)
        else:
            going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites']
            wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to))))
            at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to])
            else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to])
            print json.dumps( at_destination )
            print json.dumps( else_where, indent=2 )
            ## do the full stuck transfer study, missing files and shit !
            for there in going_to:
                late_info = findLateFiles(url, out, going_to = there )
                for l in late_info:
                    l.update({"workflow":wfo.name,"dataset":out})
                self.all_late_files.extend( late_info )
            if check_fullcopy_to_announce:
                ## only set this false if the check is relevant
                all_OK[out] = False

    ## verify if we have to do harvesting
    if not options.no_harvest and not jump_the_line:
        (OK, requests) = spawn_harvesting(url, wfi, in_full)
        all_OK.update( OK )

    ## only that status can let me go into announced
    if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line):
        print wfo.name,"to be announced"
        results=[]
        if not results:
            for out in outputs:
                print "dealing with",out
                # skip outputs with zero collected lumis
                if out in stats and not stats[out]: continue
                _,dsn,process_string,tier = out.split('/')
                if all_OK[out]:
                    print "setting valid"
                    results.append(setDatasetStatus(out, 'VALID', withFiles=False))
                if all_OK[out] and wfi.isRelval():
                    ## make the specific relval rules and the replicas
                    ## figure the destination(s) out
                    destinations = set()
                    if tier != "RECO" and tier != "ALCARECO":
                        destinations.add('T2_CH_CERN')
                    if tier == "GEN-SIM":
                        destinations.add('T1_US_FNAL_Disk')
                    if tier == "GEN-SIM-DIGI-RAW":
                        destinations.add('T1_US_FNAL_Disk')
                    if tier == "GEN-SIM-RECO":
                        destinations.add('T1_US_FNAL_Disk')
                    if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO":
                        destinations.add('T2_CH_CERN')
                    if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO":
                        destinations.add('T2_CH_CERN')
                    if destinations:
                        wfi.sendLog('closor', '%s to go to %s'%(out, ', '.join( sorted( destinations ))))
                    ## call to makereplicarequest under relval => done
                    for site in destinations:
                        result = makeReplicaRequest(url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal')
                        try:
                            request_id = result['phedex']['request_created'][0]['id']
                            results.append( True )
                        except:
                            results.append( 'Failed relval transfer' )
                elif all_OK[out]:
                    # non-relval output: decide DDM routing
                    campaign = None
                    try:
                        campaign = out.split('/')[2].split('-')[0]
                    except:
                        if 'Campaign' in wfi.request and wfi.request['Campaign']:
                            campaign = wfi.request['Campaign']
                    to_DDM = False
                    ## campaign override
                    if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                        to_DDM = True
                    ## by typical enabling
                    if tier in UC.get("tiers_to_DDM"):
                        to_DDM = True
                    ## check for unitarity
                    if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"):
                        print "tier",tier,"neither TO or NO DDM for",out
                        results.append('Not recognitized tier %s'%tier)
                        #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                        sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical')
                        continue
                    n_copies = 1
                    destinations=[]
                    # campaign can tune #copies and destination hosts
                    if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]:
                        ddm_instructions = CI.campaigns[campaign]['DDMcopies']
                        if type(ddm_instructions) == int:
                            n_copies = CI.campaigns[campaign]['DDMcopies']
                        elif type(ddm_instructions) == dict:
                            ## a more fancy configuration
                            for ddmtier,indication in ddm_instructions.items():
                                if ddmtier==tier or ddmtier in ['*','all']:
                                    ## this is for us
                                    if 'N' in indication:
                                        n_copies = indication['N']
                                    if 'host' in indication:
                                        destinations = indication['host']
                    destination_spec = ""
                    if destinations:
                        destination_spec = "--destination="+",".join( destinations )
                    group_spec = "" ## not used yet
                    ### should make this a campaign configuration
                    ## inject to DDM when necessary
                    if to_DDM:
                        print "Sending",out," to DDM"
                        status = pass_to_dynamo( [out], N = n_copies, sites=destinations if destinations else None, group = group_spec if group_spec else None)
                        results.append( status )
                        if status == True:
                            wfi.sendLog('closor','%s is send to dynamo in %s copies %s %s'%( out, n_copies, sorted(destinations), group_spec))
                        else:
                            sendLog('closor',"could not add "+out+" to dynamo pool. check closor logs.", level='critical')
                            wfi.sendLog('closor',"could not add "+out+" to dynamo pool. check closor logs.")
                else:
                    print wfo.name,"no stats for announcing",out
                    results.append('No Stats')

            # success so far means every step returned None/'None'/True
            if all(map(lambda result : result in ['None',None,True],results)):
                if not jump_the_line:
                    ## only announce if all previous are fine
                    res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    if not res in ['None',None]:
                        ## check the status again, it might well have toggled
                        wl_bis = workflowInfo(url, wfo.name)
                        self.to_wm_status = wl_bis.request['RequestStatus']
                        if wl_bis.request['RequestStatus'] in ['announced','normal-archived']:
                            res = None
                        else:
                            ## retry ?
                            res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    results.append( res )

        print results
        if all(map(lambda result : result in ['None',None,True],results)):
            if jump_the_line:
                if not 'announced' in wfo.status:
                    self.to_status = wfo.status.replace('announce','announced')
            else:
                self.to_status = 'done'
            self.closing = True
            wfi.sendLog('closor',"workflow outputs are announced")
        else:
            wfi.sendLog('closor',"Error with %s to be announced \n%s"%( wfo.name, json.dumps( results )))

    elif wfi.request['RequestStatus'] in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
        if wfi.isRelval():
            # relval failures are forgotten, not troubled: a replacement
            # workflow is expected instead
            self.to_status = 'forget'
            self.to_wm_status = wfi.request['RequestStatus']
            wfi.sendLog('closor',"%s is %s, but will not be set in trouble to find a replacement."%( wfo.name, self.to_wm_status))
        else:
            self.to_status = 'trouble'
            self.to_wm_status = wfi.request['RequestStatus']
    else:
        print wfo.name,"not good for announcing:",wfi.request['RequestStatus']
        wfi.sendLog('closor',"cannot be announced")
        self.held.add( wfo.name )
def checkor(url, spec=None, options=None):
    """Close-out checker: inspect candidate workflows and either close
    them out in the request manager or tag them for operator assistance.

    For each selected workflow the following checks are run in order:
    running ACDC (recovery) requests, lumi-completion fraction per output,
    events-per-lumi upper limit, custodial (tape) location, disk copies,
    DBS/PhEDEx file-count consistency, invalid-file level and run/lumi
    duplication.  Every failed check appends a tag to an
    'assistance-...' status; a fully clean workflow is closed out via
    reqMgrClient.closeOutWorkflowCascade.

    :param url: base url passed to all request-manager/phedex helpers
    :param spec: optional substring filter on workflow names
    :param options: parsed command-line options; fields read here:
        fetch, nofetch, fractionpass, lumisize, ignorefiles,
        ignoreduplicates, test
    """
    fDB = closeoutInfo()

    ## McM is a soft dependency: run without it if it is down
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    ## select the workflows to consider
    wfs=[]
    if options.fetch:
        ## get all in running and check
        wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )
    if options.nofetch:
        ## than get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )

    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        # campaign is read from the processed-dataset part of the name when
        # possible, falling back to the request's 'Campaign' field
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    ## collect per-operator bypass and on-hold lists from afs
    by_passes = []
    holdings = []
    for bypassor,email in [('jbadillo','*****@*****.**'),('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            print "no file",bypass_file
            # NOTE(review): this continue also skips reading this operator's
            # onhold.json below -- confirm that is intended
            continue
        try:
            by_passes.extend( json.loads(open(bypass_file).read()))
        except:
            print "cannot get by-passes from",bypass_file,"for",bypassor
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])

        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            print "no file",holding_file
            continue
        try:
            holdings.extend( json.loads(open(holding_file).read()))
        except:
            print "cannot get holdings from",holding_file,"for",bypassor
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## pace the loop so a full pass takes about five minutes
    total_running_time = 5.*60.
    sleep_time = max(0.5, total_running_time / len(wfs))
    # NOTE(review): divides by len(wfs) -- raises ZeroDivisionError when no
    # workflow was selected; confirm callers guarantee a non-empty selection

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        time.sleep( sleep_time )
        print "checking on",wfo.name
        ## get info
        wfi = workflowInfo(url, wfo.name)
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']

        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            print wfo.name,"is already",wfo.wm_status
            wfo.status = 'close'
            session.commit()
            continue
        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            print wfo.name,"is in trouble",wfo.wm_status
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            print wfo.name,"not running yet"
            session.commit()
            continue

        ## on-hold handling: already on hold stays untouched, newly listed
        ## workflows are flagged assistance-onhold
        if '-onhold' in wfo.status:
            if wfo.name in holdings:
                print wfo.name,"on hold"
                continue
        if wfo.name in holdings:
            wfo.status = 'assistance-onhold'
            print "setting",wfo.name,"on hold"
            session.commit()
            continue

        if wfo.wm_status != 'completed':
            ## for sure move on with closeout check if in completed
            print "no need to check on",wfo.name,"in status",wfo.wm_status
            session.commit()
            continue

        session.commit()
        sub_assistance="" # if that string is filled, there will be need for manual assistance
        is_closing = True

        ## do the closed-out checks one by one

        ## get it from somewhere
        by_pass_checks = False
        if wfo.name in by_passes:
            print "we can bypass checks on",wfo.name
            by_pass_checks = True
        for bypass in by_passes:
            if bypass in wfo.name:
                print "we can bypass",wfo.name,"because of keyword",bypass
                by_pass_checks = True
                break

        # tuck out DQMIO/DQM
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not '/DQM' in out]

        ## anything running on acdc: look for newer Resubmission requests
        ## sharing this PrepID
        familly = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        has_recovery_going=False
        had_any_recovery = False
        for member in familly:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['RequestStatus'] in ['running-open','running-closed','assignment-approved','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                #print json.dumps(member,indent=2)
                ## hook for just waiting ...
                is_closing = False
                has_recovery_going=True
            elif member['RequestStatus']==None:
                print member['RequestName'],"is not real"
                pass
            else:
                acdc_inactive.append( member['RequestName'] )
                had_any_recovery = True

        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
        else:
            event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        fractions_pass = {}
        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.
            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )
            ## default pass threshold; can be overridden per campaign or
            ## from the command line
            fractions_pass[output] = 0.95
            c = get_campaign(output, wfi)
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                print "overriding fraction to",fractions_pass[output],"for",output
            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            print wfo.name,"is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            if has_recovery_going:
                sub_assistance+='-recovering'
            elif had_any_recovery:
                ## we want to have this looked at
                sub_assistance+='-manual'
            else:
                sub_assistance+='-recovery'
            is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )

        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = get_campaign(output, wfi)
            #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"
            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign
            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
            lumi_upper_limit[output] = upper_limit

        if any([ events_per_lumi[out] >= lumi_upper_limit[out] for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            sub_assistance+='-biglumi'
            is_closing = False

        ## site presence of each output, no site vetoed
        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)
            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )

        ## tiers for which no custodial copy is required
        vetoed_custodial_tier = ['MINIAODSIM']
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]):
                    custodial = custodial_locations[output][0]
            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = get_campaign(output, wfi)
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"
                        break
            ## get from the parent
            pick_custodial = True
            if not custodial and 'InputDataset' in wfi.request:
                ## this is terribly dangerous to assume only
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( wfi.request['InputDataset'])
                ###parents_custodial = findCustodialLocation(url, wfi.request['InputDataset'])
                if not parents_custodial:
                    parents_custodial = []
                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",wfi.request['InputDataset'],"does not have custodial in the first place. abort"
                    sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%wfi.request['InputDataset'])
                    is_closing = False
                    pick_custodial = False

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE()

            if custodial and ((not sub_assistance and not acdc) or by_pass_checks):
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            custodials[custodial].append( output )
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            else:
                print "cannot find a custodial for",wfo.name
            ## a missing custodial location always blocks the close-out
            is_closing = False

        ## disk copy
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)

        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        ## valid dbs files should equal phedex files; tolerate up to 1%
        ## invalid files before flagging
        fraction_invalid = 0.01
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## hook for just waiting ...
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            sub_assistance+="-invalidfiles"
            is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing:
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output )
                except:
                    try:
                        ## one more try before giving up on the dbs query
                        duplications[output] = dbs3Client.duplicateRunLumi( output )
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                sub_assistance+='-duplicates'
                is_closing = False

        ## for visualization later on
        if not wfo.name in fDB.record:
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
                'datasets' :{},
                'name' : wfo.name,
                'closeOutWorkflow' : None,
                }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))

        if by_pass_checks:
            ## force closing
            is_closing = True

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                print "close out answer",res
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it
            new_status = 'assistance'+sub_assistance
            print wfo.name,"needs assistance with",new_status

            if sub_assistance and wfo.status != new_status and 'PrepID' in wfi.request and not 'manual' in wfo.status:
                pid = wfi.getPrepIDs()[0].replace('task_','')
                #pid = wfi.request['PrepID'].replace('task_','')
                ## notify
                messages= {
                    'recovery' : 'Samples completed with missing lumi count:\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ) ),
                    'biglumi' : 'Samples completed with large luminosity blocks:\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])])),
                    'duplicate' : 'Samples completed with duplicated luminosity blocks:\n%s'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    }
                text ="The request %s (%s) is facing issue in production.\n" %( pid, wfo.name )
                content = ""
                for case in messages:
                    if case in new_status:
                        content+= "\n"+messages[case]+"\n"
                text += content
                text += "You are invited to check, while this is being taken care of by Ops.\n"
                text += "This is an automated message."

                if use_mcm and content:
                    print "Sending notification back to requestor"
                    print text
                    batches = mcm.getA('batches',query='contains=%s&status=announced'%pid)
                    if len(batches):
                        ## go notify the batch
                        bid = batches[-1]['prepid']
                        print "batch nofication to",bid
                        mcm.put('/restapi/batches/notify', { "notes" : text, "prepid" : bid})

                    ## go notify the request
                    print "request notification to",pid
                    mcm.put('/restapi/requests/notify',{ "message" : text, "prepids" : [pid] })

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    print "setting",wfo.name,"to",wfo.status
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    ## dump the close-out record to html
    fDB.html()

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
def closor(url, specific=None):
    """Announce workflows that have been fully closed out.

    Takes workflows in unified status 'close' (or matching *specific*),
    records per-output dataset statistics in the Output table, verifies
    that every output has at least one full copy somewhere, injects
    outputs into the DDM pool when required, sets the datasets VALID in
    DBS and finally announces the workflow cascade in ReqMgr.  Workflows
    that cannot be announced are collected in *held* and reported at the
    end, together with files found to be lagging in transfer.

    :param url: base url passed to all request-manager/phedex helpers
    :param specific: optional substring restricting the workflow selection
    """
    ## mutual exclusion with other agents
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()

    all_late_files = []
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

    ## manually closed-out workflows should get to close with checkor
    if specific:
        wfs = session.query(Workflow).filter( Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'close').all()

    held = set()

    ## cap the number of workflows treated in one pass, in random order
    max_per_round = UC.get('max_per_round').get('closor', None)
    random.shuffle(wfs)
    if max_per_round:
        wfs = wfs[:max_per_round]

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis
        wfi = workflowInfo(url, wfo.name)
        wfo.wm_status = wfi.request['RequestStatus']

        if wfi.request['RequestStatus'] in ['announced', 'normal-archived']:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wfi.request['RequestStatus']
            wfi.sendLog('closor', '%s is announced already : %s' % (wfo.name, wfo.wm_status))
        session.commit()

        expected_lumis = 1
        if not 'TotalInputLumis' in wfi.request:
            print wfo.name, "has not been assigned yet, or the database is corrupted"
        else:
            expected_lumis = wfi.request['TotalInputLumis']

        ## what are the outputs
        outputs = wfi.request['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = defaultdict(lambda: False)
        #print outputs
        if len(outputs):
            print wfo.name, wfi.request['RequestStatus']
        for out in outputs:
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=out)
            ## create or update the Output row for book-keeping
            odb = session.query(Output).filter( Output.datasetname == out).first()
            if not odb:
                print "adding an output object", out
                odb = Output(datasetname=out)
                odb.workflow = wfo
                session.add(odb)
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()
            wfi.sendLog('closor', "\t%60s %d/%d = %3.2f%%" % (out, lumi_count, expected_lumis, lumi_count / float(expected_lumis) * 100.))
            #print wfo.fraction_for_closing, lumi_count, expected_lumis
            #fraction = wfo.fraction_for_closing
            #fraction = 0.0
            #all_OK.append((float(lumi_count) > float(expected_lumis*fraction)))
            all_OK[out] = True

        ## check for at least one full copy prior to moving on
        in_full = {}
        for out in outputs:
            in_full[out] = []
            presence = getDatasetPresence(url, out)
            where = [site for site, info in presence.items() if info[0]]
            if where:
                all_OK[out] = True
                print out, "is in full at", ",".join(where)
                in_full[out] = copy.deepcopy(where)
            else:
                going_to = wfi.request['NonCustodialSites'] + wfi.request['CustodialSites']
                wfi.sendLog('closor', "%s is not in full anywhere. send to %s" % (out, ",".join(sorted(going_to))))
                at_destination = dict([(k, v) for (k, v) in presence.items() if k in going_to])
                else_where = dict([(k, v) for (k, v) in presence.items() if not k in going_to])
                print json.dumps(at_destination)
                print json.dumps(else_where, indent=2)
                ## do the full stuck transfer study, missing files and shit !
                for there in going_to:
                    late_info = findLateFiles(url, out, going_to=there)
                    for l in late_info:
                        l.update({"workflow": wfo.name, "dataset": out})
                    all_late_files.extend(late_info)
                if check_fullcopy_to_announce:
                    ## only set this false if the check is relevant
                    all_OK[out] = False

        ## verify if we have to do harvesting
        (OK, requests) = spawn_harvesting(url, wfi, in_full)
        all_OK.update(OK)

        ## only that status can let me go into announced
        if all(all_OK.values()) and wfi.request['RequestStatus'] in ['closed-out']:
            print wfo.name, "to be announced"
            results = []  #'dummy']
            if not results:
                for out in outputs:
                    if all_OK[out]:
                        ## validate the dataset in DBS before anything else
                        results.append(setDatasetStatus(out, 'VALID'))
                        tier = out.split('/')[-1]
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wfi.request and wfi.request['Campaign']:
                                campaign = wfi.request['Campaign']
                        to_DDM = False
                        ## campaign override
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True
                        ## by typical enabling
                        if tier in UC.get("tiers_to_DDM"):
                            to_DDM = True
                        ## check for unitarity
                        if not tier in UC.get("tiers_no_DDM") + UC.get("tiers_to_DDM"):
                            print "tier", tier, "neither TO or NO DDM for", out
                            results.append('Not recognitized tier %s' % tier)
                            #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                            sendLog('closor', "could not recognize %s for injecting in DDM" % out, level='critical')
                            continue

                        ## number of DDM copies and destinations, possibly
                        ## customized per campaign
                        n_copies = 2
                        destinations = []
                        if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]:
                            ddm_instructions = CI.campaigns[campaign]['DDMcopies']
                            if type(ddm_instructions) == int:
                                n_copies = CI.campaigns[campaign]['DDMcopies']
                            elif type(ddm_instructions) == dict:
                                ## a more fancy configuration
                                for ddmtier, indication in ddm_instructions.items():
                                    if ddmtier == tier or ddmtier in ['*', 'all']:
                                        ## this is for us
                                        if 'N' in indication:
                                            n_copies = indication['N']
                                        if 'host' in indication:
                                            destinations = indication['host']

                        destination_spec = ""
                        if destinations:
                            destination_spec = "--destination=" + ",".join(destinations)

                        ## inject to DDM when necessary; os.popen().close()
                        ## returns None on success, an exit status otherwise
                        if to_DDM:
                            #print "Sending",out," to DDM"
                            p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec' % (n_copies, out, destination_spec))
                            print p.read()
                            status = p.close()
                            if status != None:
                                print "Failed DDM, retrying a second time"
                                p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec' % (n_copies, out, destination_spec))
                                print p.read()
                                status = p.close()
                                if status != None:
                                    #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.")
                                    sendLog('closor', "could not add " + out + " to DDM pool. check closor logs.", level='critical')
                            results.append(status)
                            if status == None:
                                # NOTE(review): format args put n_copies first,
                                # so the message reads oddly -- confirm the
                                # intended argument order (log text only)
                                wfi.sendLog('closor', '%s is send to AnalysisOps DDM pool in %s copies %s' % (n_copies, out, destination_spec))
                    else:
                        print wfo.name, "no stats for announcing", out
                        results.append('No Stats')

            if all(map(lambda result: result in ['None', None, True], results)):
                ## only announce if all previous are fine
                res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                if not res in ['None', None]:
                    ## check the status again, it might well have toggled
                    wl_bis = workflowInfo(url, wfo.name)
                    wfo.wm_status = wl_bis.request['RequestStatus']
                    session.commit()
                    if wl_bis.request['RequestStatus'] in ['announced', 'normal-archived']:
                        res = None
                    else:
                        ## retry ?
                        res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                results.append(res)

            #print results
            if all(map(lambda result: result in ['None', None, True], results)):
                wfo.status = 'done'
                session.commit()
                wfi.sendLog('closor', "workflow is announced")
            else:
                print "ERROR with ", wfo.name, "to be announced", json.dumps(results)
        else:
            print wfo.name, "not good for announcing:", wfi.request['RequestStatus']
            wfi.sendLog('closor', "cannot be announced")
            held.add(wfo.name)

    ## report files that are still lagging in transfer
    days_late = 0.
    retries_late = 10
    really_late_files = [info for info in all_late_files if info['retries'] >= retries_late]
    really_late_files = [info for info in really_late_files if info['delay'] / (60 * 60 * 24.) >= days_late]
    if really_late_files:
        subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % (len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2))
        #sendEmail('waiting for files to announce', subject)
        sendLog('closor', subject, level='warning')
        sendLog('closor', subject)
        print subject
        open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2))

    if held:
        #sendEmail("held from announcing","the workflows below are held up, please check the logs https://cmst2.web.cern.ch/cmst2/unified/logs/closor/last.log \n%s"%("\n".join( held )))
        sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(held)), level='critical')
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock(): return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.new: ## get all in running and check ## you want to intersect with what is completed ! if options.strict: completed_wfi = getWorkflows(url, status='completed') for wfo in session.query(Workflow).filter(Workflow.status == 'away').all(): if wfo.name in completed_wfi: wfs.append( wfo ) else: print wfo.name,"is not completed" sendLog('checkor','%s is not completed'%( wfo.name)) else: wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) if options.current: ## recheck those already there, probably to just pass them along wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.old: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = global_SI CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign ## retrieve bypass and onhold configuration bypasses = [] holdings = [] #try: # already_notified = json.loads(open('already_notifified.json').read()) #except: # print "no record of already notified workflow. 
starting fresh" # already_notified = [] for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: holdings.extend( json.loads(open(holding_file).read())) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: mcm_force = mcm.get('/restapi/requests/forcecomplete') bypasses.extend( mcm_force ) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. 
sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) print len(wfs),"to consider, pausing for",sleep_time for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False pids = wfi.getPrepIDs() bypass_by_mcm = False for bypass in bypasses: if bypass in 
wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break if bypass in pids: wfi.sendLog('checkor',"we can bypass checks on %s because of prepid %s "%( wfo.name, bypass)) bypass_checks = True bypass_by_mcm = True break #if not CI.go( wfi.request['Campaign'] ) and not bypass_checks: # print "No go for",wfo.name # wfi.sendLog('checkor',"No go for %s"%wfi.request['Campaign']) # continue tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco campaigns = {} for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if 
member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') elif member['RequestStatus']==None: print member['RequestName'],"is not real" pass else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = int(wfi.request['Task1']['RequestNumEvents']) fractions_pass = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. 
if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) fractions_pass[output] = 0.95 c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): print wfo.name,"is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. 
campaign = campaigns[output] #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. 
for out in out_worth_checking ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. 
abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): 
print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: print "These %d files are missing in phedex"%(len(missing_phedex)) print "\n".join( missing_phedex ) if missing_dbs: print "These %d files are missing in dbs"%(len(missing_dbs)) print "\n".join( missing_dbs ) #if not bypass_checks: ## I don't think we can by pass this is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing or bypass_checks: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, 
verbose=True) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) ## and move on if is_closing: ## toggle status to closed-out in request manager print 
"setting",wfo.name,"closed-out" if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and bypass_by_mcm: ## shoot large on all prepids for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that add ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print 
wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from 
Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec: #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def time_point(label="", sub_lap=False):
    """Print a timing checkpoint.

    Reports wall-clock time since the overall start, and either the
    sub-lap or the full-lap delta.  All timing state lives in function
    attributes (time_point.start / .lap / .sub_lap), which the caller
    initializes before first use.
    """
    t_now = time.mktime(time.gmtime())
    print("Time check (%s) point at : %s" % (label, time.asctime(time.gmtime())))
    print("Since start: %s [s]" % (t_now - time_point.start))
    if sub_lap:
        # Intermediate checkpoint: only the sub-lap marker advances.
        print("Sub Lap : %s [s]" % (t_now - time_point.sub_lap))
        time_point.sub_lap = t_now
    else:
        # Full lap: reset both the lap and the sub-lap markers.
        print("Lap : %s [s]" % (t_now - time_point.lap))
        time_point.lap = t_now
        time_point.sub_lap = t_now
def get_campaign(output, wfi):
    """Map an output dataset to its campaign.

    For release-validation workflows the request's 'Campaign' field is
    authoritative; otherwise the acquisition era parsed from the dataset
    name (/PD/<Era>-<ProcString>-v<N>/TIER) is preferred, with the
    request field as fallback.  Note this era==campaign assumption is
    only a heuristic for non-relval workflows.

    :param output: output dataset name, e.g. '/PD/Era-ProcStr-v1/TIER'
    :param wfi: workflowInfo-like object exposing .request (dict) and .isRelval()
    :return: campaign name string, or None when neither source is available
    """
    wf_campaign = wfi.request['Campaign'] if 'Campaign' in wfi.request else None
    try:
        # second '/'-separated field is '<Era>-<ProcessingString>-v<N>'
        era = output.split('/')[2].split('-')[0]
    except (IndexError, AttributeError):
        # malformed or non-string dataset name: no era to extract
        # (was a bare except:, which also swallowed SystemExit/KeyboardInterrupt)
        era = None
    if wfi.isRelval():
        return wf_campaign
    return era if era else wf_campaign
bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: forcings = mcm.get('/restapi/requests/forcecomplete') #if forcings: # sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings))) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) in_manual = 0 ## now you have a record of what file was invalidated globally from TT TMDB_invalid = dataCache.get('file_invalidation') #try: # TMDB_invalid = set([row[3] for row in csv.reader( os.popen('curl -s "https://docs.google.com/spreadsheets/d/11fFsDOTLTtRcI4Q3gXw0GNj4ZS8IoXMoQDC3CbOo_2o/export?format=csv"'))]) # TMDB_invalid = map(lambda e : e.split(':')[-1], TMDB_invalid) # print len(TMDB_invalid),"globally invalidated files" #except Exception as e: # print "TMDB not fetched" # print str(e) # TMDB_invalid = [] print len(wfs),"to consider, pausing for",sleep_time max_per_round = UC.get('max_per_round').get('checkor',None) if options.limit: max_per_round=options.limit if max_per_round and not spec: wfs = wfs[:max_per_round] for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) time_point("Starting with %s"% wfo.name) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. 
wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break pids = wfi.getPrepIDs() force_by_mcm = False force_by_user = False for force in forcings: if force in pids: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force)) bypass_checks = True force_by_mcm = True break for user in overrides: for force in overrides[user]: if force in wfo.name: wfi.sendLog('checkor',"we can 
bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user)) bypass_checks = True force_by_user = True break tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco to_ddm_tier = copy.deepcopy(UC.get('tiers_to_DDM')) campaigns = {} ## this mapping of campaign per output dataset assumes era==campaing, which is not true for relval expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] ) for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] forced_already=False acdc_bads = [] true_familly = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if member['PrepID'] != wfi.request['PrepID'] : continue #if 
'OriginalRequestName' in member and (not 'ACDC' in member['OriginalRequestName']) and member['OriginalRequestName'] != wfo.name: continue if member['RequestStatus'] == None: continue if not set(member['OutputDatasets']).issubset( set(expected_outputs)): if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']: ##this is not good at all wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] ) #sendLog('checkor','inconsistent ACDC %s'%member['RequestName'], level='critical') acdc_bads.append( member['RequestName'] ) is_closing = False assistance_tags.add('manual') continue true_familly.append( member['RequestName'] ) #try: # parse_one(url, member['RequestName']) #except: # print "Could not make error report for",member['RequestName'] if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! 
is_closing = False assistance_tags.add('recovering') if (force_by_mcm or force_by_user) and not forced_already: wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name) wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False) forceComplete(url, wfi) forced_already=True else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') if acdc_bads: #sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) )) sendLog('checkor','For %s, ACDC %s is inconsistent, preventing from closing or will create a mess.'%( wfo.name, ','.join(acdc_bads) ), level='critical') time_point("checked workflow familly", sub_lap=True) ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = wfi.request['Task1']['RequestNumEvents'] for i in range(1,20): if 'Task%d'%i in wfi.request: ## this is wrong ibsolute if 'FilterEfficiency' in wfi.request['Task%d'%i]: event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency']) event_expected = int(event_expected) fractions_pass = {} events_per_lumi = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False time_point("execpted statistics", sub_lap=True) for output in wfi.request['OutputDatasets']: event_count,lumi_count = 
getDatasetEventsAndLumis(dataset=output) events_per_lumi[output] = event_count/float(lumi_count) if lumi_count else 100 percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) default_pass = UC.get('default_fraction_pass') fractions_pass[output] = default_pass c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: if type(CI.campaigns[c]['fractionpass']) == dict: tier = output.split('/')[-1] priority = str(wfi.request['RequestPriority']) ## defined per tier fractions_pass[output] = CI.campaigns[c]['fractionpass'].get('all', default_pass) if tier in CI.campaigns[c]['fractionpass']: fractions_pass[output] = CI.campaigns[c]['fractionpass'][tier] if priority in CI.campaigns[c]['fractionpass']: fractions_pass[output] = CI.campaigns[c]['fractionpass'][priority] else: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): possible_recoveries = wfi.getRecoveryDoc() if possible_recoveries == []: wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. 
passing through to annoucement'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) sendLog('checkor','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name, level='critical') #sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)#,destination=['*****@*****.**']) ## do not bypass for now, until Alan understands why we are loosing ACDC docs bypass_checks = True else: wfi.sendLog('checkor','%s is not completed \n%s \n%s'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) ## hook for creating automatically ACDC ? if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False time_point("checked output size", sub_lap=True) ## correct lumi < 300 event per lumi #for output in wfi.request['OutputDatasets']: #events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = campaigns[output] if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi','ReReco']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? 
if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) time_point("checked dataset presence", sub_lap=True) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] time_point("checked custodiality", sub_lap=True) ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) time_point("checked phedex count", sub_lap=True) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs size_worht_going_to_ddm = sum([getDatasetSize(out)/1023. for out in out_worth_checking if out.split('/')[-1] in to_ddm_tier ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? 
custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" group = None if campaign in CI.campaigns and 'phedex_group' in CI.campaigns[campaign]: group = CI.campaigns[campaign]['phedex_group'] print "using group",group,"for replica" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') tape_size_limit = options.tape_size_limit if options.tape_size_limit else UC.get("tape_size_limit") _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. 
abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if custodial and size_worht_going_to_ddm > tape_size_limit: print wfi.sendLog('checkor',"The total output size (%s TB) is too large for the limit set (%s TB)"%( size_worth_checking, tape_size_limit)) custodial = None if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: wfi.sendLog('checkor','Using %s as a tape destination for %s'%(custodial, output)) custodials[custodial].append( output ) if group: custodials[custodial][-1]+='@%s'%group ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False time_point("determined tape location", 
sub_lap=True) ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) time_point("dbs file count", sub_lap=True) if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: mismatch_notice = wfo.name+" has a dbs,phedex mismatch\n" mismatch_notice += "in dbs\n"+json.dumps(dbs_presence, indent=2) +"\n" mismatch_notice += "invalide in dbs\n"+json.dumps(dbs_invalid, indent=2) +"\n" mismatch_notice += "in phedex\n"+json.dumps(phedex_presence, indent=2) +"\n" wfi.sendLog('checkor',mismatch_notice) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex), "\n".join( missing_phedex ))) were_invalidated = sorted(set(missing_phedex) & set(TMDB_invalid )) if were_invalidated: wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated), "\n".join(were_invalidated))) sendLog('checkor',"These %d files were invalidated globally\n%s\nand are invalidated in dbs"%(len(were_invalidated), "\n".join(were_invalidated)), level='critical') dbs3Client.setFileStatus( were_invalidated, newstatus=0 ) if missing_dbs: wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs), "\n".join( 
missing_dbs ))) were_invalidated = sorted(set(missing_dbs) & set(TMDB_invalid )) if were_invalidated: wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated), "\n".join(were_invalidated))) #if not bypass_checks: ## I don't think we can by pass this is_closing = False time_point("checked file count", sub_lap=True) fraction_invalid = 0.20 if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignoreinvalid: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} files_per_rl = {} for output in wfi.request['OutputDatasets']: duplications[output] = "skiped" files_per_rl[output] = "skiped" time_point("checked invalidation", sub_lap=True) if (is_closing or bypass_checks) and (not options.ignoreduplicates): print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True) except: try: duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True) except Exception as e: wfi.sendLog('checkor','Not possible to check on duplicate lumi count on %s'%(output)) sendLog('checkor','Not possible to check on duplicate lumi count on %s\n%s'%(output,str(e)),level='critical') is_closing=False if is_closing and any(duplications.values()) and not options.ignoreduplicates: duplicate_notice = "" duplicate_notice += "%s has duplicates\n"%wfo.name duplicate_notice += json.dumps( 
duplications,indent=2) duplicate_notice += '\n' duplicate_notice += json.dumps( files_per_rl, indent=2) wfi.sendLog('checkor',duplicate_notice) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False time_point("checked duplicates", sub_lap=True) time_point("done with %s"%wfo.name) ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] #rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['percentage'] = math.floor(percent_completions[output]*10000)/100.## round down rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) rec['familly'] = true_familly now = time.gmtime() rec['timestamp'] = time.mktime(now) rec['updated'] = time.asctime(now)+' 
(GMT)' ## make the lumi summary if wfi.request['RequestType'] == 'ReReco': try: os.system('python Unified/lumi_summary.py %s 1 > /dev/null'%(wfi.request['PrepID'])) os.system('python Unified/lumi_plot.py %s > /dev/null'%(wfi.request['PrepID'])) wfi.sendLog('checkor','Lumi summary available at %s/datalumi/lumi.%s.html'%(unified_url,wfi.request['PrepID'])) except Exception as e: print str(e) ## make the error report ## and move on if is_closing: ## toggle status to closed-out in request manager wfi.sendLog('checkor',"setting %s closed-out"% wfo.name) if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and force_by_mcm: ## shoot large on all prepids, on closing the wf for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: if not 'custodial' in assistance_tags or wfi.isRelval(): ## do only the report for those for member in acdc+acdc_inactive+[wfo.name]: try: parse_one(url, member) except: print "Could not make error report for",member ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid 
custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that had ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') in_manual += 1 if 'recovery' in assistance_tags and 'manual' in assistance_tags: ## this is likely because something bad is happening, so leave it to manual assistance_tags = assistance_tags - set(['recovery']) assistance_tags.add('manual') in_manual += 1 ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) ###detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' #detailslink = 'https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s'%(wfo.name) ###perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) perflink = '%s/report/%s'%(unified_url,wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 
'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: wfi.sendLog('checkor','setting %s to %s'%(wfo.name, wfo.status)) session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec and in_manual!=0: sendEmail("fresh assistance status available","Fresh status are available at 
%s/assistance.html"%unified_url,destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: items_at = defaultdict(set) for i in custodials[site]: item, group = i.split('@') if '@' in i else (i,'DataOps') items_at[group].add( item ) for group,items in items_at.items(): print ','.join(items),'=>',site,'@',group if not options.test: result = makeReplicaRequest(url, site, sorted(items) ,"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) , group=group) print result print "File Invalidation" print invalidations
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock(): return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.new: ## get all in running and check ## you want to intersect with what is completed ! if options.strict: completed_wfi = getWorkflows(url, status='completed') for wfo in session.query(Workflow).filter(Workflow.status == 'away').all(): if wfo.name in completed_wfi: wfs.append( wfo ) else: print wfo.name,"is not completed" sendLog('checkor','%s is not completed'%( wfo.name)) else: wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) if options.current: ## recheck those already there, probably to just pass them along wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.old: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign ## retrieve bypass and onhold configuration bypasses = [] forcings = [] overrides = getForceCompletes() holdings = [] for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('prozober','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) 
sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: holdings.extend( json.loads(open(holding_file).read())) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: forcings = mcm.get('/restapi/requests/forcecomplete') if forcings: sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings))) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) print len(wfs),"to consider, pausing for",sleep_time max_per_round = UC.get('max_per_round').get('checkor',None) if max_per_round and not spec: wfs = wfs[:max_per_round] for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. 
wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break pids = wfi.getPrepIDs() force_by_mcm = False force_by_user = False for force in forcings: if force in pids: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force)) bypass_checks = True force_by_mcm = True break for user in overrides: for force in overrides[user]: if force in wfo.name: wfi.sendLog('checkor',"we can 
bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user)) bypass_checks = True force_by_user = True break tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco campaigns = {} expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] ) for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] forced_already=False acdc_bads = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if 'OriginalRequestName' in member and member['OriginalRequestName'] != wfo.name: continue if member['RequestStatus'] == None: continue if not set(member['OutputDatasets']).issubset( set(expected_outputs)): if not 
member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']: ##this is not good at all wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] ) acdc_bads.append( member['RequestName'] ) is_closing = False assistance_tags.add('manual') continue if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') if (force_by_mcm or force_by_user) and not forced_already: wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name) wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False) forceComplete(url, wfi) forced_already=True else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') if acdc_bads: sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) )) ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = wfi.request['Task1']['RequestNumEvents'] for i in range(1,20): if 'Task%d'%i in wfi.request: ## this is wrong ibsolute if 'FilterEfficiency' in wfi.request['Task%d'%i]: event_expected *= 
float(wfi.request['Task%d'%i]['FilterEfficiency']) event_expected = int(event_expected) fractions_pass = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) fractions_pass[output] = 0.95 c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): possible_recoveries = wfi.getRecoveryDoc() if possible_recoveries == []: wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name) bypass_checks = True else: wfi.sendLog('checkor','%s is not completed \n%s \n%s'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) ## hook for creating automatically ACDC ? 
if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = campaigns[output] #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? 
if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? 
custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. 
abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): 
print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex), "\n".join( missing_phedex ))) if missing_dbs: wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs), "\n".join( missing_dbs ))) #if not bypass_checks: ## I don't think we can by pass this is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing or bypass_checks: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( 
output , skipInvalid=True, verbose=True) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) now = time.gmtime() rec['timestamp'] = time.mktime(now) 
rec['updated'] = time.asctime(now)+' (GMT)' ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and force_by_mcm: ## shoot large on all prepids, on closing the wf for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that add ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## 
straight to manual assistance_tags.add('manual') ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= 
"\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec: #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def close(self):
    """Close out and announce a single workflow (``self.wfo``).

    Checks per-output lumi statistics, records ``Output`` objects, applies
    relval-specific batch gating and dataset-placement rules, then announces
    the workflow cascade in ReqMgr if everything is consistent.

    No return value: outcomes are communicated through instance attributes
    (``self.to_status``, ``self.to_wm_status``, ``self.closing``,
    ``self.outs``, ``self.batch_warnings``, ``self.batch_extreme_warnings``,
    ``self.held``) — presumably harvested by the caller; TODO confirm
    against the enclosing class (not visible in this chunk).

    NOTE(review): ``options`` is read as a global (``options.force``) —
    assumed to be the module-level command-line options object; verify.
    """
    # Operator kill-switch: dropping this file halts all closing activity.
    if os.path.isfile('.closor_stop'):
        print "The closing of workflows is shortened"
        return

    # Pull working state off the instance into locals.
    url = self.url
    batch_go = self.batch_go                  # per-batch go/no-go cache, shared across calls
    CI = self.CI                              # campaign configuration
    UC = self.UC                              # unified (global) configuration
    wfo = self.wfo                            # the workflow DB object being processed
    jump_the_line = self.jump_the_line        # announce while still completing
    batch_goodness = self.batch_goodness      # completion %% threshold for relval warnings
    check_parentage_to_announce = UC.get('check_parentage_to_announce')
    check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce')

    ## what is the expected #lumis
    self.wfi = workflowInfo(url, wfo.name)
    wfi = self.wfi
    wfo.wm_status = wfi.request['RequestStatus']

    if wfi.isRelval():
        # Relval workflows are only closed when their whole batch (campaign)
        # has finished: compute the batch verdict once and cache it.
        has_batch_go = False
        batch_name = wfi.getCampaign()
        if not batch_name in batch_go:
            ## do the esimatation whethere this can be announced : only once per batch
            in_batches = getWorkflowByCampaign(url, batch_name, details=True)
            # The batch can go only if no sibling request is still in an
            # active (non-final) state.
            batch_go[batch_name] = all(map(lambda s: not s in ['completed', 'running-open', 'running-closed', 'acquired', 'staged', 'staging', 'assigned', 'assignment-approved'], [r['RequestStatus'] for r in in_batches]))
        ## already verified
        has_batch_go = batch_go[batch_name]
        if not has_batch_go:
            wfi.sendLog('closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close' % (batch_name, batch_name))
            return

    if wfi.request['RequestStatus'] in ['announced', 'normal-archived'] and not options.force:
        ## manually announced ??
        # Someone already announced it outside of this tool: just record and leave.
        self.to_status = 'done'
        self.to_wm_status = wfi.request['RequestStatus']
        wfi.sendLog('closor', '%s is announced already : %s' % (wfo.name, self.to_wm_status))
        return

    if jump_the_line:
        wfi.sendLog('closor', 'Announcing while completing')

    # Expected lumi count; defaults to 1 so later percentage math never
    # divides by zero when the request record is incomplete.
    expected_lumis = 1
    if not 'TotalInputLumis' in wfi.request:
        print wfo.name, "has not been assigned yet, or the database is corrupted"
    elif wfi.request['TotalInputLumis'] == 0:
        print wfo.name, "is corrupted with 0 expected lumis"
    else:
        expected_lumis = wfi.request['TotalInputLumis']

    ## what are the outputs
    outputs = wfi.request['OutputDatasets']
    ## check whether the number of lumis is as expected for each
    all_OK = defaultdict(lambda: False)
    stats = defaultdict(int)
    #print outputs
    if len(outputs):
        print wfo.name, wfi.request['RequestStatus']

    for out in outputs:
        event_count, lumi_count = getDatasetEventsAndLumis(dataset=out)
        # Record a fresh Output object for this dataset on the instance.
        self.outs.append(Output(datasetname=out))
        odb = self.outs[-1]
        odb.workflow = wfo
        odb.nlumis = lumi_count
        odb.nevents = event_count
        odb.workfow_id = wfo.id  # NOTE(review): attribute name looks misspelled ("workfow") — matches the DB schema? verify before touching
        # Keep the larger of (request expectation, previously stored expectation).
        if odb.expectedlumis < expected_lumis:
            odb.expectedlumis = expected_lumis
        else:
            expected_lumis = odb.expectedlumis
        odb.date = time.mktime(time.gmtime())

        fraction = lumi_count / float(expected_lumis) * 100.
        completion_line = "%60s %d/%d = %3.2f%%" % (out, lumi_count, expected_lumis, fraction)
        wfi.sendLog('closor', "\t%s" % completion_line)
        # Relval outputs below the goodness threshold are collected for a
        # batch-level warning report; below 50% is flagged as extreme.
        if wfi.isRelval() and fraction < batch_goodness:
            self.batch_warnings[wfi.getCampaign()].add(completion_line)
            if fraction < 50:
                self.batch_extreme_warnings[wfi.getCampaign()].add(completion_line)
        stats[out] = lumi_count
        all_OK[out] = True

    ## check for at least one full copy prior to moving on
    # The full-copy verification below is disabled: every output is
    # unconditionally marked OK and the original check is kept commented out.
    #in_full = {}
    for out in outputs:
        all_OK[out] = True
        #in_full[out] = []
        #presence = getDatasetPresence( url, out )
        #where = [site for site,info in presence.items() if info[0]]
        #if where:
        #    all_OK[out] = True
        #    print out,"is in full at",",".join(where)
        #    in_full[out] = copy.deepcopy(where)
        #else:
        #    going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites']
        #    wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to))))
        #    at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to])
        #    else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to])
        #    print json.dumps( at_destination )
        #    print json.dumps( else_where, indent=2 )
        ## do the full stuck transfer study, missing files and shit !
        #for there in going_to:
        #    late_info = findLateFiles(url, out, going_to = there )
        #    for l in late_info:
        #        l.update({"workflow":wfo.name,"dataset":out})
        #    self.all_late_files.extend( late_info )
        #if check_fullcopy_to_announce: ## only set this false if the check is relevant
        #    all_OK[out] = False

    ## verify if we have to do harvesting
    # Harvesting spawn is also disabled (kept for reference).
    #if not options.no_harvest and not jump_the_line:
    #    #(OK, requests) = spawn_harvesting(url, wfi, in_full)
    #    sites_for_DQMHarvest = UC.get("sites_for_DQMHarvest")
    #    (OK, requests) = spawn_harvesting(url, wfi, sites_for_DQMHarvest)
    #    print "Harvesting workflow has been created and assigned to: "
    #    print sites_for_DQMHarvest
    #    all_OK.update( OK )

    ## only that status can let me go into announced
    if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line):
        print wfo.name, "to be announced"
        # 'results' accumulates per-step outcomes; any entry other than
        # None/'None'/True vetoes the final announcement.
        results = []
        if not results:
            for out in outputs:
                print "dealing with", out
                # Skip outputs known to have zero lumis recorded.
                if out in stats and not stats[out]:
                    continue
                _, dsn, process_string, tier = out.split('/')
                if all_OK[out]:
                    print "setting valid"
                    results.append(setDatasetStatus(out, 'VALID', withFiles=False))
                if all_OK[out] and wfi.isRelval():
                    ## make the specific relval rules and the replicas
                    ## figure the destination(s) out
                    # Hard-coded relval placement policy per data tier / dataset name.
                    destinations = set()
                    if tier in UC.get("tiers_to_rucio_relval"):
                        wfi.sendLog('closor', "Data Tier: %s is blacklisted, so skipping dataset placement for: %s" % (tier, out))
                        continue
                    if tier != "RECO" and tier != "ALCARECO":
                        destinations.add('T2_CH_CERN')
                    if tier == "GEN-SIM":
                        destinations.add('T1_US_FNAL_Disk')
                    if tier == "GEN-SIM-DIGI-RAW":
                        destinations.add('T1_US_FNAL_Disk')
                    if tier == "GEN-SIM-RECO":
                        destinations.add('T1_US_FNAL_Disk')
                    if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO":
                        destinations.add('T2_CH_CERN')
                    if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO":
                        destinations.add('T2_CH_CERN')
                    if destinations:
                        wfi.sendLog('closor', '%s to go to %s' % (out, ', '.join(sorted(destinations))))
                elif all_OK[out]:
                    # Non-relval placement: decide whether the output goes to DDM.
                    campaign = None
                    try:
                        # Campaign is conventionally the first dash-separated
                        # token of the processed-dataset field.
                        campaign = out.split('/')[2].split('-')[0]
                    except:
                        if 'Campaign' in wfi.request and wfi.request['Campaign']:
                            campaign = wfi.request['Campaign']
                    to_DDM = False
                    ## campaign override
                    if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                        to_DDM = True
                    ## by typical enabling
                    if tier in UC.get("tiers_to_rucio_nonrelval"):
                        wfi.sendLog('closor', "Data Tier: %s is blacklisted, so skipping dataset placement for: %s" % (tier, out))
                        continue
                    if tier in UC.get("tiers_to_DDM"):
                        to_DDM = True
                    ## check for unitarity
                    # Every tier must be classified either to-DDM or no-DDM;
                    # an unknown tier blocks the announcement (critical log).
                    if not tier in UC.get("tiers_no_DDM") + UC.get("tiers_to_DDM"):
                        print "tier", tier, "neither TO or NO DDM for", out
                        results.append('Not recognitized tier %s' % tier)
                        #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out)
                        sendLog('closor', "could not recognize %s for injecting in DDM" % out, level='critical')
                        continue
                    # Number of copies / destinations may be refined per campaign.
                    n_copies = 1
                    destinations = []
                    if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]:
                        ddm_instructions = CI.campaigns[campaign]['DDMcopies']
                        if type(ddm_instructions) == int:
                            n_copies = CI.campaigns[campaign]['DDMcopies']
                        elif type(ddm_instructions) == dict:
                            ## a more fancy configuration
                            for ddmtier, indication in ddm_instructions.items():
                                if ddmtier == tier or ddmtier in ['*', 'all']:
                                    ## this is for us
                                    if 'N' in indication:
                                        n_copies = indication['N']
                                    if 'host' in indication:
                                        destinations = indication['host']
                    destination_spec = ""
                    if destinations:
                        destination_spec = "--destination=" + ",".join(destinations)
                    group_spec = ""  ## not used yet
                else:
                    print wfo.name, "no stats for announcing", out
                    results.append('No Stats')

            # adding check for PrentageResolved flag from ReqMgr:
            if wfi.request['RequestType'] == 'StepChain' and check_parentage_to_announce:
                if wfi.request['ParentageResolved']:
                    results.append(True)
                else:
                    wfi.sendLog('closor', "Delayed announcement of %s due to unresolved Parentage dependencies" % wfi.request['RequestName'])
                    results.append('No ParentageResolved')

            if all(map(lambda result: result in ['None', None, True], results)):
                if not jump_the_line:
                    ## only announce if all previous are fine
                    res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    if not res in ['None', None]:
                        ## check the status again, it might well have toggled
                        # The first announce call may have succeeded despite a
                        # bad return; re-read the status before retrying once.
                        wl_bis = workflowInfo(url, wfo.name)
                        self.to_wm_status = wl_bis.request['RequestStatus']
                        if wl_bis.request['RequestStatus'] in ['announced', 'normal-archived']:
                            res = None
                        else:
                            res = reqMgrClient.announceWorkflowCascade(url, wfo.name)
                    results.append(res)

        print results
        if all(map(lambda result: result in ['None', None, True], results)):
            if jump_the_line:
                if not 'announced' in wfo.status:
                    self.to_status = wfo.status.replace('announce', 'announced')
            else:
                self.to_status = 'done'
            self.closing = True
            wfi.sendLog('closor', "workflow outputs are announced")
        else:
            wfi.sendLog('closor', "Error with %s to be announced \n%s" % (wfo.name, json.dumps(results)))

    elif wfi.request['RequestStatus'] in ['failed', 'aborted', 'aborted-archived', 'rejected', 'rejected-archived', 'aborted-completed']:
        # Terminal failure states: relvals are forgotten, everything else
        # goes to 'trouble' for a replacement to be made.
        if wfi.isRelval():
            self.to_status = 'forget'
            self.to_wm_status = wfi.request['RequestStatus']
            wfi.sendLog('closor', "%s is %s, but will not be set in trouble to find a replacement." % (wfo.name, self.to_wm_status))
        else:
            self.to_status = 'trouble'
            self.to_wm_status = wfi.request['RequestStatus']
    else:
        # Not announce-able yet: remember it so the driver can report held workflows.
        print wfo.name, "not good for announcing:", wfi.request['RequestStatus']
        wfi.sendLog('closor', "cannot be announced")
        self.held.add(wfo.name)
def completor(url, specific):
    """Force-complete long-running workflows.

    Scans workflows in 'away' or 'assistance*' status, measures how long
    each has been running and how complete its outputs are, and
    force-completes (up to ``max_force`` per round) those that have been
    running for more than 14 days while staying above their campaign's
    'force-complete' fraction. Also honours per-operator override lists
    read from AFS, and reports workflows whose global-queue blocks have
    no usable data location.

    :param url: request-manager base URL passed to the various clients
    :param specific: substring filter on workflow names (falsy = all)

    Side effects: writes completions.json and longlasting.json under
    ``monitor_dir``, sends e-mails, and triggers forceComplete in ReqMgr.
    """
    # Bail out early if essential services are down (mcm is optional here).
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check():
        return
    CI = campaignInfo()
    SI = siteInfo()

    wfs = []
    wfs.extend(session.query(Workflow).filter(Workflow.status == 'away').all())
    wfs.extend(session.query(Workflow).filter(Workflow.status.startswith('assistance')).all())
    ## just take it in random order so that not always the same is seen
    random.shuffle(wfs)

    ## by workflow a list of fraction / timestamps
    completions = json.loads(open('%s/completions.json' % monitor_dir).read())

    # Campaigns that opted in to force-completion, with their threshold fraction.
    good_fractions = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']

    long_lasting = {}

    # Per-operator force-complete override lists, read from their AFS public area.
    # NOTE(review): e-mail addresses appear redacted in this copy of the source.
    overrides = {}
    for rider, email in [('vlimant', '*****@*****.**'), ('jen_a', '*****@*****.**'), ('srimanob', '*****@*****.**')]:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json' % (rider[0], rider)
        if not os.path.isfile(rider_file):
            print "no file", rider_file
            continue
        try:
            overrides[rider] = json.loads(open(rider_file).read())
        except:
            # Malformed override file: warn the owner, keep going without it.
            print "cannot get force complete list from", rider
            sendEmail("malformated force complet file", "%s is not json readable" % rider_file, destination=[email])

    print "can force complete on"
    print json.dumps(good_fractions, indent=2)
    print json.dumps(overrides, indent=2)

    max_force = 5  # cap on automatic force-completes per invocation
    wfs_no_location_in_GQ = set()

    for wfo in wfs:
        if specific and not specific in wfo.name:
            continue
        print "looking at", wfo.name
        ## get all of the same
        wfi = workflowInfo(url, wfo.name)

        # Only campaigns present in good_fractions are eligible for the
        # automatic path; operator overrides are acted on immediately.
        skip = False
        if not any([c in wfo.name for c in good_fractions]):
            skip = True
        for user, spec in overrides.items():
            #print spec
            if wfo.name in spec and wfi.request['RequestStatus'] != 'force-complete':
                #skip=False ## do not do it automatically yet
                sendEmail('force-complete requested', '%s is asking for %s to be force complete' % (user, wfo.name))
                wfi = workflowInfo(url, wfo.name)
                forceComplete(url, wfi)
                skip = True
                wfi.notifyRequestor("The workflow %s was force completed by request of %s" % (wfo.name, user), do_batch=False)
                wfi.sendLog('completor', '%s is asking for %s to be force complete' % (user, wfo.name))
                break
        if wfo.status.startswith('assistance'):
            skip = True
        if skip:
            continue

        priority = wfi.request['RequestPriority']
        if not 'Campaign' in wfi.request:
            continue
        # Only actively running requests can be force-completed.
        if not wfi.request['RequestStatus'] in ['acquired', 'running-open', 'running-closed']:
            continue
        c = wfi.request['Campaign']
        if not c in good_fractions:
            continue
        good_fraction = good_fractions[c]
        ignore_fraction = 2.  # >200% completion: something else is wrong, just wait

        # Determine expected statistics; lumi-based if available, else event-based.
        lumi_expected = None
        event_expected = None
        if not 'TotalInputEvents' in wfi.request:
            if 'RequestNumEvents' in wfi.request:
                event_expected = wfi.request['RequestNumEvents']
            else:
                print "truncated, cannot do anything"
                continue
        else:
            lumi_expected = wfi.request['TotalInputLumis']
            event_expected = wfi.request['TotalInputEvents']

        # Times are handled in units of days since the epoch.
        now = time.mktime(time.gmtime()) / (60 * 60 * 24.)
        running_log = filter(lambda change: change["Status"] in ["running-open", "running-closed"], wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            # cannot figure out when the thing started running
            continue
        then = running_log[-1]['UpdateTime'] / (60. * 60. * 24.)
        delay = now - then  ## in days

        # Visual cue: indent the printout by one tab per week of running.
        (w, d) = divmod(delay, 7)
        print "\t" * int(w) + "Running since", delay, "[days] priority=", priority
        if delay <= 7:
            continue
        if delay >= 7:
            long_lasting[wfo.name] = {"delay": delay}
            pass

        # Compute the completion fraction of each (non-DQM) output dataset.
        percent_completions = {}
        for output in wfi.request['OutputDatasets']:
            if "/DQM" in output:
                continue  ## that does not count
            if not output in completions:
                completions[output] = {'injected': None, 'checkpoints': [], 'workflow': wfo.name}
            ## get completion fraction
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion = 0.
            event_completion = 0.
            if lumi_expected:
                lumi_completion = lumi_count / float(lumi_expected)
            if event_expected:
                event_completion = event_count / float(event_expected)
            #take the less optimistic
            percent_completions[output] = min(lumi_completion, event_completion)
            completions[output]['checkpoints'].append((now, event_completion))

        if all([percent_completions[out] >= good_fraction for out in percent_completions]):
            print "all is above", good_fraction, "for", wfo.name
            print json.dumps(percent_completions, indent=2)
        else:
            # Below threshold: record diagnostics (completion, agents, data
            # location problems in the global queue) and move on.
            print "\t", percent_completions.values(), "not over bound", good_fraction
            long_lasting[wfo.name].update({
                'completion': sum(percent_completions.values()) / len(percent_completions),
                'completions': percent_completions
            })
            #print json.dumps( percent_completions, indent=2 )
            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            print json.dumps(long_lasting[wfo.name]['agents'], indent=2)

            ## pick up on possible issue with global queue data location
            locs = wfi.getGQLocations()
            for b, loc in locs.items():
                if not loc:
                    print b, "has no location for GQ in", wfi.request['RequestName']
                    ## this is severe !
                    wfs_no_location_in_GQ.add(wfo.name)
                ## check location and site white list
                can_run = set([SI.SE_to_CE(se) for se in loc]) & set(wfi.request['SiteWhitelist'])
                if loc and not can_run:
                    print b, "is missing site to run within the whitelist"
                    wfs_no_location_in_GQ.add(wfo.name)
                can_run = can_run & set(SI.sites_ready)
                if loc and not can_run:
                    print b, "is missing available site to run"
                    wfs_no_location_in_GQ.add(wfo.name)
            continue

        if all([percent_completions[out] >= ignore_fraction for out in percent_completions]):
            print "all is done, just wait a bit"
            continue

        # Remember when the still-incomplete outputs started running.
        for output in percent_completions:
            completions[output]['injected'] = then

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')
        ran_at = wfi.request['SiteWhitelist']
        print "Required:", cpuh,
        print "Time spend:", delay
        # only really force complete after n days
        if delay <= 14:
            continue

        print "going for force-complete of", wfo.name
        ## find ACDCs that might be running
        if max_force > 0:
            forceComplete(url, wfi)
            max_force -= 1
        else:
            print "too many completion this round"
        ## do it once only for testing
        #break

    # Persist the accumulated bookkeeping for the monitoring pages.
    open('%s/completions.json' % monitor_dir, 'w').write(json.dumps(completions, indent=2))
    text = "These have been running for long"
    open('%s/longlasting.json' % monitor_dir, 'w').write(json.dumps(long_lasting, indent=2))
    for wf, info in sorted(long_lasting.items(), key=lambda tp: tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days" % (wf, delay)
        if 'completion' in info:
            text += " %d%%" % (info['completion'] * 100)
    if wfs_no_location_in_GQ:
        sendEmail('workflow with no location in GQ', "there won't be able to run anytime soon\n%s" % ('\n'.join(wfs_no_location_in_GQ)))
    #sendEmail("long lasting workflow",text) ## you can check the log
    print text