def assignor(url, specific=None, talk=True, options=None): if userLock() and not options.manual: return mlock = moduleLock() if mlock() and not options.manual: return if not componentInfo().check() and not options.manual: return UC = unifiedConfiguration() CI = campaignInfo() SI = siteInfo() SI = global_SI() ###NLI = newLockInfo() ###if not NLI.free() and not options.go: return LI = lockInfo() #if not LI.free() and not options.go and not options.manual: return n_assigned = 0 n_stalled = 0 wfos = [] fetch_from = [] if specific or options.early: fetch_from.extend(['considered', 'staging']) if specific: fetch_from.extend(['considered-tried']) if options.early: print "Option Early is on" fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from", fetch_from for status in fetch_from: print "getting wf in", status wfos.extend( session.query(Workflow).filter(Workflow.status == status).all()) print len(wfos) ## in case of partial, go for fetching a list from json ? 
#if options.partial and not specific: # pass aaa_mapping = json.loads(eosRead('%s/equalizor.json' % monitor_pub_dir))['mapping'] all_stuck = set() all_stuck.update( json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir))) max_per_round = UC.get('max_per_round').get('assignor', None) max_cpuh_block = UC.get('max_cpuh_block') # Temporarily switch off prioritization random.shuffle(wfos) ##order by priority instead of random """ if options.early: cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority']) cache = [r['RequestName'] for r in cache] def rank( wfn ): return cache.index( wfn ) if wfn in cache else 0 wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True) print "10 first",[wfo.name for wfo in wfos[:10]] print "10 last",[wfo.name for wfo in wfos[-10:]] else: random.shuffle( wfos ) """ for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue if not options.manual and 'rucio' in (wfo.name).lower(): continue print "\n\n" wfh = workflowInfo(url, wfo.name) if wfh.request['RequestStatus'] in [ 'rejected', 'aborted', 'aborted-completed', 'aborted-archived', 'rejected-archived' ] and wfh.isRelval(): wfo.status = 'forget' session.commit() n_stalled += 1 continue if options.priority and int( wfh.request['RequestPriority']) < options.priority: continue options_text = "" if options.early: options_text += ", early option is ON" wfh.sendLog('assignor', "%s to be assigned %s" % (wfo.name, options_text)) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed, sites_not_allowed) = wfh.getSiteWhiteList() output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) if not 
output_tiers: n_stalled += 1 wfh.sendLog('assignor', 'There is no output at all') sendLog('assignor', 'Workflow %s has no output at all' % (wfo.name), level='critical') continue is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor', "%s are stuck input" % (','.join(is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = (not wfh.isRelval()) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('assignor', 'These data tiers %s are not allowed' % (','.join(banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): msg = '%s is not an allowed secondary' % ( ', '.join(set(secondary) - set(allowed_secondary.keys()))) wfh.sendLog('assignor', msg) critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format( wfh.getPrepIDs()[0]) sendLog('assignor', critical_msg, level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary: # and 'parameters' in allowed_secondary[sec]: assign_parameters.update(allowed_secondary[sec]) if no_go: n_stalled += 1 ## make a very loud noise if >100k priority stalled continue ## check on current status for by-passed 
assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) blocks = wfh.getBlocks() if blocks: wfh.sendLog( 'assignor', "Needs {} blocks in input {}".format(len(blocks), '\n'.join(blocks))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns: assign_parameters.update(CI.campaigns[wfh.request['Campaign']]) if 'primary_AAA' in assign_parameters and primary: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] wfh.sendLog( 'assignor', "Initial values for primary_AAA=%s and secondary_AAA=%s" % (primary_aaa, secondary_aaa)) if primary_aaa: if "T2_CH_CERN_HLT" in sites_allowed: sites_allowed.remove("T2_CH_CERN_HLT") if "T2_CH_CERN_HLT" not in sites_not_allowed: sites_not_allowed.append("T2_CH_CERN_HLT") ## keep track of this, after secondary input location restriction : that's how you want to operate it initial_sites_allowed = copy.deepcopy(sites_allowed) set_lfn = '/store/mc' ## by default for prim in list(primary): set_lfn = getLFNbase(prim) ## if they are requested for processing, they should bbe all 
closed already # FIXME: remove this closeAllBlocks #closeAllBlocks(url, prim, blocks) ## should be 2 but for the time-being let's lower it to get things going _copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog('assignor', "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( 'assignor', '%s requires a large numbr of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical') wfh.sendLog( 'assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh) continue ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back to white list to original white list with any data wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed)) # TODO Alan on 1/april/2020: keep the AAA functionality if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_allowed: wfh.sendLog('assignor', "Overiding the primary on AAA setting to Off") primary_aaa = False else: aaa_grid = set(sites_allowed) for site in list(aaa_grid): aaa_grid.update(aaa_mapping.get(site, [])) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog( 'assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed)) isStoreResults = ('StoreResults' == wfh.request.setdefault( 'RequestType', None)) if isStoreResults: if 'MergedLFNBase' in wfh.request: set_lfn = wfh.request['MergedLFNBase'] else: n_stalled += 1 wfh.sendLog( 'assignor', "Cannot assign StoreResults request because MergedLFN is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because MergedLFN is missing', level='critical') continue if not primary_aaa: if isStoreResults: ## if we are dealing with a StoreResults request, we don't need to check dataset availability and ## 
should use the SiteWhiteList set in the original request if 'SiteWhitelist' in wfh.request: sites_allowed = wfh.request['SiteWhitelist'] else: wfh.sendLog( 'assignor', "Cannot assign StoreResults request because SiteWhitelist is missing" ) sendLog( 'assignor', 'Cannot assign StoreResults request because SiteWhitelist is missing', level='critical') n_stalled += 1 continue wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue if not len(sites_allowed) and not options.SiteWhitelist: if not options.early: wfh.sendLog('assignor', "cannot be assign with no matched sites") sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical') n_stalled += 1 continue t1t2_only = [ ce for ce in sites_allowed if [ce.startswith('T1') or ce.startswith('T2')] ] if t1t2_only: # try to pick from T1T2 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])] # then pick any otherwise else: sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] print "available=", SI.disk[sites_out[0]] wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, 'SiteBlacklist': sites_not_allowed, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog( 'assignor', "Reading primary through xrootd at %s" % sorted(sites_allowed)) if secondary_aaa: # Do not set TrustPUSitelist to True if there is no secondary if secondary: parameters['TrustPUSitelists'] = True wfh.sendLog( 'assignor', "Reading secondary through xrootd at %s" % sorted(sites_allowed)) ## 
plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team parameters['Team'] = team if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 def pick_options(options, parameters): ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v def pick_campaign(assign_parameters, parameters): ## pick up campaign specific assignment parameters parameters.update(assign_parameters.get('parameters', {})) if options.force_options: pick_campaign(assign_parameters, parameters) pick_options(options, parameters) else: ## campaign parameters update last pick_options(options, parameters) pick_campaign(assign_parameters, parameters) if not options.test: parameters['execute'] = True hold_split, split_check = wfh.checkSplitting() if hold_split and not options.go: if split_check: wfh.sendLog( 'assignor', 'Holding on to the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) else: wfh.sendLog('assignor', 'Change of splitting is on hold') n_stalled += 1 continue if split_check == None or split_check == False: n_stalled += 1 continue elif split_check: ## operate all recommended changes reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check) wfh.sendLog( 'assignor', 'Applying the change in splitting %s' % ('\n\n'.join([str(i) for i in split_check]))) split_check = True ## bypass completely and use the above # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in 
pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical') wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] # FIXME: decide which of the lines below needs to remain... eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical') wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( 'assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical') wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." 
) ## make sure to autoapprove all NonCustodialSites parameters['AutoApproveSubscriptionSites'] = list( set(parameters['NonCustodialSites'] + parameters.get('AutoApproveSubscriptionSites', []))) result = reqMgrClient.assignWorkflow( url, wfo.name, None, parameters) ## team is not relevant anymore here # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 wfh.sendLog( 'assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) if wfh.producePremix() and (not wfh.isRelval()): title = "Heavy workflow assigned to {}".format( parameters['SiteWhitelist']) body = "Workflow name: {}".format( wfh.request['RequestName']) body += "\nOutput dataset(s): {}".format( wfh.request['OutputDatasets']) body += "\nAssigned to: {}".format( parameters['SiteWhitelist']) sendEmail( title, body, destination=[ '*****@*****.**' ]) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs LI.lock(secure, reason='assigning') except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: wfh.sendLog( 'assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage)) sendLog('assignor', "Failed to assign %s.\n%s \n Please check the logs" % (wfo.name, reqMgrClient.assignWorkflow.errorMessage), level='critical') print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled)) if n_stalled and not options.go and not options.early: sendLog('assignor', "%s workflows cannot be assigned. Please take a look" % (n_stalled), level='critical')
def assignor(url, specific=None, talk=True, options=None): if userLock('assignor'): return CI = campaignInfo() SI = siteInfo() wfos = [] if specific: wfos = session.query(Workflow).filter(Workflow.name == specific).all() if not wfos: if specific: wfos = session.query(Workflow).filter( Workflow.status == 'considered').all() wfos.extend( session.query(Workflow).filter( Workflow.status == 'staging').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staged').all()) for wfo in wfos: if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print wfo.name, "to be assigned" wfh = workflowInfo(url, wfo.name) ## check if by configuration we gave it a GO if not CI.go(wfh.request['Campaign']) and not options.go: print "No go for", wfh.request['Campaign'] continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': print wfo.name, wfh.request['RequestStatus'], "skipping" if not options.test: continue ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: print "cannot decide on version number" continue (lheinput, primary, parent, secondary) = wfh.getIO() sites_allowed = getSiteWhiteList( (lheinput, primary, parent, secondary)) print "Allowed", sites_allowed sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] sites_custodial = [] if len(sites_custodial) == 0: print "No custodial, it's fine, it's covered in close-out" if len(sites_custodial) > 1: print "more than one custodial for", wfo.name sys.exit(36) secondary_locations = None for sec in list(secondary): presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>90.] 
one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if there ] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list( set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only sites_allowed = [ site for site in sites_allowed if any([ osite.startswith(site) for osite in one_secondary_locations ]) ] sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} for prim in list(primary): presence = getDatasetPresence(url, prim) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed]) sites_all_data = [ site for site in sites_with_data if any([ osite.startswith(site) for osite in [ psite for (psite, (there, frac)) in presence.items() if there ] ]) ] sites_with_data = [ site for site in sites_with_data if any([ osite.startswith(site) for osite in [ psite for (psite, frac) in presence.items() if frac[1] > 90. 
] ]) ] sites_with_any_data = [ site for site in sites_with_any_data if any([osite.startswith(site) for osite in presence.keys()]) ] if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list( set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] ## opportunistic running where any piece of data is available if secondary_locations and primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] opportunistic_sites = [ SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set(sites_allowed)) ] print "We could be running at", opportunistic_sites, "in addition" if available_fractions and not all( [available >= 1. for available in available_fractions.values()]): print "The input dataset is not located in full at any site" print json.dumps(available_fractions) if not options.test and not options.go: continue ## skip skip skip copies_wanted = 2. 
if available_fractions and not all([ available >= copies_wanted for available in available_fractions.values() ]): print "The input dataset is not available", copies_wanted, "times, only", available_fractions.values( ) if not options.go: continue ## default back to white list to original white list with any data print "Allowed", sites_allowed sites_allowed = sites_with_any_data print "Selected for any data", sites_allowed if options.restrict: print "Allowed", sites_allowed sites_allowed = sites_with_any_data print "Selected", sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for", list( set(sites_allowed) - set(sites_with_data)), "?" print "Whitelist site with any data", list( set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) if not len(sites_allowed): print wfo.name, "cannot be assign with no matched sites" continue parameters = { 'SiteWhitelist': sites_allowed, 'CustodialSites': sites_custodial, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': '/store/mc', ## to be figured out ! 
from Hi shit 'ProcessingVersion': version, } ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment parameters parameters.update(CI.parameters(wfh.request['Campaign'])) if not options.test: parameters['execute'] = True if not wfh.checkWorkflowSplitting(): ## needs to go to event based ? fail for now print "Falling back to event splitting ?" #parameters['SplittingAlgorithm'] = 'EventBased' continue ## plain assignment here team = 'production' if options and options.team: team = options.team result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() else: print "ERROR could not assign", wfo.name else: pass
def transferor(url, specific=None, talk=True, options=None): if userLock(): return mlock = moduleLock() if mlock(): return use_mcm = True up = componentInfo(soft=['mcm', 'wtc', 'jira']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() #NLI = newLockInfo() #if not NLI.free(): return LI = lockInfo() if not LI.free(): return mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_transfered = len( session.query(Workflow).filter(Workflow.status == 'staging').all()) #being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance-')).filter( ~Workflow.status.contains('custodial')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0, max_to_handle - being_handled) allowed_to_transfer = max(0, max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer" else: print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer" print "... 
done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] max_per_round = UC.get('max_per_round').get('transferor', None) print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) all_to_include = session.query(Workflow).filter( Workflow.status.startswith('considered')).all() if len(cache) > 2000: max_to_include = max_per_round random.shuffle(cache) ## randomize first by wf name cache = sorted(cache, key=lambda r: r['RequestPriority'], reverse=True) ## order by prio highest = [r['RequestName'] for r in cache[:max_to_include]] all_to_include = [wfo for wfo in all_to_include if wfo.name in highest] print "limiting what to consider to", max_to_include, "because there is too much stuff going on. Got", len( all_to_include) for wfo in all_to_include: print "\t", wfo.name if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" transfers_per_sites = defaultdict(int) input_sizes = defaultdict(float) ignored_input_sizes = defaultdict(float) input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority = None min_transfer_priority = None print "getting all wf in staging ..." 
#stucks = json.loads(open('%s/stuck_transfers.json'%monitor_pub_dir).read()) stucks = json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)) for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) print wfo.name, "staging" (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() blocks = wfh.getBlocks() for prim in primary: ds_s = dss.get(prim, blocks=blocks) if prim in stucks: wfh.sendLog( 'transferor', "%s appears stuck, so not counting it %s [GB]" % (prim, ds_s)) ignored_input_sizes[prim] = max(ds_s, ignored_input_sizes[prim]) else: input_sizes[prim] = max(ds_s, input_sizes[prim]) wfh.sendLog('transferor', "%s needs %s [GB]" % (wfo.name, ds_s)) if in_transfer_priority == None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort(key=lambda i: i[1]) print "\n".join(map(str, ignored_values)) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort(key=lambda i: i[1]) print "\n".join(map(str, considered_values)) except Exception as e: print "trying to print the summary of input size" print str(e) print "... 
done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority print "transfers per sites" print json.dumps(transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." input_blocks = {} for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() blocks = wfh.getBlocks() input_blocks[wfo.name] = blocks for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get(prim, blocks=blocks) input_sizes[prim] = max(prim_size, input_sizes[prim]) primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle(wfs_and_wfh) # Sort smallest transfers first; allows us to transfer as many as possible workflows. def prio_and_size(i, j): if int(i[1].request['RequestPriority']) == int( j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0))) else: return cmp(int(i[1].request['RequestPriority']), int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) if min_transfer_priority == None or in_transfer_priority == None: print "nothing is lining up for transfer" sendLog( "transferor", "No request in staging, using first request to set priority limit") if len(wfs_and_wfh): min_transfer_priority = wfs_and_wfh[0][1].request[ 'RequestPriority'] in_transfer_priority = 
wfs_and_wfh[0][1].request['RequestPriority'] else: return cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer" % ( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load" % ( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer" % ( st_in_transfer_already) print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % ( st_to_transfer) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB is the available limit" % transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. 
max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = defaultdict(float) went_over_budget = False destination_cache = {} no_goes = set() if max_per_round and not spec: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo, wfh) in wfs_and_wfh: print wfo.name, "to be transfered with priority", wfh.request[ 'RequestPriority'] if wfh.request['RequestStatus'] != 'assignment-approved': if wfh.request['RequestStatus'] in [ 'aborted', 'rejected', 'rejected-archived', 'aborted-archived' ]: if wfh.isRelval(): wfo.status = 'forget' else: wfo.status = 'trouble' ## so that we look or a replacement else: wfo.status = 'away' wfh.sendLog( 'transferor', '%s in status %s, setting %s' % (wfo.name, wfh.request['RequestStatus'], wfo.status)) continue (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() blocks = input_blocks.get(wfo.name, wfh.getBlocks()) if blocks: print "Reading only", len(blocks), "blocks in input" this_load = sum([dss.get(prim, blocks=blocks) for prim in primary]) no_budget = False if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over bubget.") else: wfh.sendLog('transferor', "Transfer will go over bubget.") wfh.sendLog( 'transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit" % (this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority != None and min_transfer_priority != None: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over budget" % (wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog( 'transferor', "%s minimum priority %s < %s : stop" % 
(min_transfer_priority, wfh.request['RequestPriority'], in_transfer_priority)) no_budget = True ## throtlle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add(wfo.name) allowed_secondary = {} overide_parameters = {} check_secondary = (not wfh.isRelval()) output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: overide_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'transferor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('transferor', 'These data tiers %s are not allowed in %s' % (','.join(banned_tier), wfo.name), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): msg = '%s is not an allowed secondary' % ( ', '.join(set(secondary) - set(allowed_secondary.keys()))) wfh.sendLog('transferor', msg) critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format( wfh.getPrepIDs()[0]) sendLog('transferor', critical_msg, level='critical') if not options.go: no_go = True for sec in secondary: if sec in allowed_secondary: overide_parameters.update(allowed_secondary[sec]) if 'SiteWhitelist' in overide_parameters: sites_allowed = list( set(sites_allowed) & set(overide_parameters['SiteWhitelist'])) wfh.sendLog( 'transferor', 'Intersecting with the overriding whitelist parameters, allowed sites become {}' .format(sites_allowed)) if no_go: continue if passing_along >= allowed_to_handle: #if 
int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog( 'transferor', " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s" % (max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_transfer)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s at a time. 
Currently %s transfering, and adding %s" % (max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue # break ## try this for a while to make things faster ## the site white list considers site, campaign, memory and core information if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary) + list(parent) + list(secondary): LI.lock(dataset, reason='staging') if not sites_allowed: wfh.sendLog('transferor', "not possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor', "%s has no possible sites to run at" % (wfo.name), level='critical') continue can_go = True staging = False allowed = True primary_destinations = set() if primary: copies_needed_from_CPUh, CPUh = wfh.getNCopies() if talk: print wfo.name, 'reads', ', '.join(primary), 'in primary' ## chope the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add(wfo.id) max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority'])) wfh.sendLog( 'transferor', "Would make %s from cpu requirement %s" % (copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog( 'transferor', "Maxed to %s by campaign configuration %s" % (copies_needed, wfh.request['Campaign'])) if blocks: print "limiting to blocks", "\n".join(sorted(blocks)) ### new ways of making the whole thing destinations, all_block_names = getDatasetDestinations( url, prim, within_sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) print json.dumps(destinations, indent=2) ## get where the dataset is in full and 
completed prim_location = [ site for (site, info) in destinations.items() if info['completion'] == 100 and info['data_fraction'] == 1 ] ## the rest is places it is going to be #prim_destination = [site for site in destinations.keys() if not site in prim_location] prim_destination = [ site for (site, info) in destinations.items() if info['data_fraction'] == 1 and info['completion'] != 100 ] ## veto the site with no current disk space, for things that are not relval prim_destination = [ site for site in prim_destination if (SI.disk[site] or wfh.isRelval()) ] if len(prim_location) >= copies_needed: wfh.sendLog( 'transferor', "The input is all fully in place at %s sites %s" % (len(prim_location), sorted(prim_location))) continue copies_needed = max(0, copies_needed - len(prim_location)) wfh.sendLog( 'transferor', "Counting existing copies ; now need %s" % copies_needed) copies_being_made = [ sum([ info['blocks'].keys().count(block) for site, info in destinations.items() if site in prim_destination ]) for block in all_block_names ] latching_on_transfers = set() [ latching_on_transfers.update(info['blocks'].values()) for site, info in destinations.items() if site in prim_destination ] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] prim_to_distribute = [ site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination ] ## take out the ones that cannot receive transfers potential_destinations = len(prim_to_distribute) #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer] prim_to_distribute = [ site for site in prim_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] ## do we want to restrict transfers if the amount of site in vetoe are too large ? 
wfh.sendLog( 'transferor', "Could be going to: %s" % sorted(prim_to_distribute)) if not prim_to_distribute or any([ transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute ]): ## means there is openings let me go print "There are transfer slots available:", [ (site, transfers_per_sites[site]) for site in prim_to_distribute ] else: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over transfer slots available" % (wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s per site at a time. Going overboard for %s" % (max_staging_per_site, sorted([ site for site in prim_to_distribute if transfers_per_sites[site] >= max_staging_per_site ]))) if not options.go: allowed = False break for latching in latching_on_transfers: existings = session.query(TransferImp).filter( TransferImp.phedexid == int(latching)).filter( TransferImp.workflow_id == wfo.id).all() if not existings: tri = TransferImp(phedexid=int(latching), workflow=wfo) print "adding", wfo.id, "with phedexid", latching session.add(tri) else: for existing in existings: existing.active = True session.flush() can_go = False transfer_sizes[prim] = max(this_load, transfer_sizes[prim]) staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? 
#copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0, copies_needed - min(copies_being_made)) wfh.sendLog( 'transferor', "Counting the copies being made ; then need %s" % copies_needed) if copies_needed == 0: wfh.sendLog( 'transferor', "The input is either fully in place or getting in full somewhere with %s" % latching_on_transfers) can_go = True continue elif len(prim_to_distribute) == 0: wfh.sendLog( 'transferor', "We are going to need extra copies of %s, but no destinations seems available" % (prim)) sendLog( 'transferor', "We are going to need extra copies of %s, but no destinations seems available" % (prim), level='critical') print json.dumps(prim_to_distribute, indent=2) print json.dumps(prim_location, indent=2) print json.dumps(prim_destination, indent=2) prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer ] prim_to_distribute = [ site for site in prim_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] print "changed to" print json.dumps(prim_to_distribute, indent=2) if len( prim_to_distribute ) > 0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk andpoint ? 
#tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops, sizes = getDatasetChops( prim, chop_threshold=options.chopsize, only_blocks=blocks) spreading = distributeToSites(chops, prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges, sizes=sizes) ## prune the blocks/destination that are already in the making, so that subscription don't overlap for site in spreading: for block in list(spreading[site]): if site in destinations and block in destinations[ site]['blocks'].keys(): ## prune it spreading[site].remove(block) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog( 'transferor', 'cannot send %s to any site, it cannot fit anywhere' % prim, level='critical') wfh.sendLog( 'transferor', "cannot send to any site. %s cannot seem to fit anywhere" % (prim)) staging = False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site] = blocks else: spreading[site] = [prim] transfer_sizes[prim] = max(this_load, transfer_sizes[prim]) can_go = False wfh.sendLog( 'transferor', "selected CE destinations %s" % (sorted(spreading.keys()))) for (site, items) in spreading.items(): all_transfers[site].extend(items) transfers_per_sites[site] += 1 primary_destinations.add(site) else: can_go = False allowed = False if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[ wfh.request['Campaign']]['SecondaryLocation'] if 'SecondaryLocation' in overide_parameters: override_sec_destination = overide_parameters[ 'SecondaryLocation'] print wfo.name, 'reads', ', '.join(secondary), 'in secondary' for sec in secondary: workflow_dependencies[sec].add(wfo.id) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by 
workflow with different whitelist destination_cache[sec], _ = getDatasetDestinations( url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = set( [SI.CE_to_SE(site) for site in sites_allowed]) destinations = dict([ (k, v) for (k, v) in destination_cache[sec].items() if k in se_allowed ]) ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [ destinations.pop(site) for (site, info) in destinations.items() if info['data_fraction'] < 0.9 ] print sec, json.dumps(destinations, indent=2) sec_location = [ site for (site, info) in destinations.items() if info['completion'] >= 95 ] sec_destination = [ site for site in destinations.keys() if not site in sec_location ] ## this is in SE else: ## old style presence = getDatasetPresence(url, sec) sec_location = [ site for site, pres in presence.items() if pres[1] > 90. 
] ## more than 90% of the minbias at sites subscriptions = listSubscriptions(url, sec) sec_destination = [site for site in subscriptions] ## how to make unified understand that it has to wait for the secondary if the sec_destination and #sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in sec_location ] #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [ site for site in sec_to_distribute if not SI.CE_to_SE(site) in sec_destination ] presitespace_sec_to_distribute = copy.deepcopy( sec_to_distribute) #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] #sec_to_distribute = [site for site in sec_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer] sec_to_distribute = [ site for site in sec_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] ## at this point you have a problem if len(sec_to_distribute) == 0 and len( presitespace_sec_to_distribute): sendLog( 'transferor', '%s is getting no possible destinations because of lack of space. To be decided what to do in general' % (sec), level='critical') if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list( set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog( 'transferor', "the dataset %s could be removed from %s" % (sec, not_needed_anymore)) sec_to_distribute = list( set(sec_to_distribute) & set(override_sec_destination)) if len(sec_to_distribute) > 0: print "secondary could go to", sorted(sec_to_distribute) sec_size = dss.get(sec) for site in sec_to_distribute: site_se = SI.CE_to_SE(site) if (SI.disk[site_se] * 1024.) 
> sec_size or wfh.isRelval(): wfh.sendLog('transferor', 'Sending %s to %s' % (sec, site)) all_transfers[site].append(sec) can_go = False else: print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[ site_se] * 1024, "GB need", sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog( 'transferor', '%s is too big (%s) for %s (%s). %s will not be able to run there.' % (sec, sec_size, site_se, SI.disk[site_se] * 1024, wfo.name), level='critical') wfh.sendLog( 'transferor', '%s is too big (%s) for %s (%s). will not be able to run there.' % (sec, sec_size, site_se, SI.disk[site_se] * 1024)) else: ## this is bas overall print "the secondary input does not have to be send to site" ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog( 'transferor', "latches on existing transfers, and nothing else, settin staging" ) wfo.status = 'staging' needs_transfer += 1 else: wfh.sendLog( 'transferor', "should just be assigned now to %s" % sorted(sites_allowed)) wfo.status = 'staged' passing_along += 1 wfh.sendLog('transferor', "setting %s status to %s" % (wfo.name, wfo.status)) #session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog( 'transferor', "setting %s status to %s" % (wfo.name, wfo.status)) #session.commit() wfh.sendLog('transferor', "needs a transfer") needs_transfer += 1 passing_along += 1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n" + "\n".join(sorted(no_goes)), level='critical') print "accumulated transfers" print json.dumps(all_transfers, 
indent=2) fake_id = -1 wf_id_in_prestaging = set() for (site, items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets #if site in SI.sites_veto_transfer: # print site,"does not want transfers" # continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for" % (site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [ block for block in blocks if not block.split('#')[0] in datasets ] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks' % len(blocks) details_text += '\n\t%d needed blocks for %s' % ( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets' % len(datasets) details_text += '\n\t%s' % sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to", site, "(CE)", site_se, "(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y', 'yes', 'go']: continue transfered_items = defaultdict(set) if execute: priority = 'normal' cds = [ ds for ds in set(datasets + block_datasets) if ds in max_priority ] ## bucketize the transfers by priority of workflows prioritized_items = defaultdict(set) for item in items_to_transfer: d = item.split('#')[0] p = max_priority.get(d, 80000) q = 'normal' if p > 100000: q = 
'reserved' elif p < 70000: q = 'low' prioritized_items[q].add(item) for priority, items in prioritized_items.items(): result = makeReplicaRequest(url, site_se, list(items), 'prestaging', priority=priority, approve=True) if result: these_transfers = [ o['id'] for o in result['phedex']['request_created'] ] #phedexids.extend( these_transfers ) for ph in these_transfers: transfered_items[ph].update(items) else: sendLog( 'transferor', 'Could not make a replica request for items %s to site %s' % (items, site_se), level='critical') #result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority, approve=True) #phedexids = [o['id'] for o in result['phedex']['request_created']]: #else: # #result= {'phedex':{'request_created' : []}} # phedexids = [] # fake_id-=1 if not transfered_items: sendLog( 'transferor', 'Could not make a replica request for items %s to site %s' % (items_to_transfer, site), level='critical') continue for phedexid, items in transfered_items.items(): print phedexid, "transfer created" for transfering in list( set(map(lambda it: it.split('#')[0], items))): for wfid in workflow_dependencies[transfering]: new_transfer = session.query(TransferImp).filter( TransferImp.phedexid == int(phedexid)).filter( TransferImp.workflow_id == wfid).first() if not new_transfer: new_transfer = TransferImp( phedexid=phedexid, workflow=session.query(Workflow).get(wfid)) session.add(new_transfer) else: new_transfer.active = True wf_id_in_prestaging.add(wfid) #session.commit() for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status != 'staging': if execute: tr_wf.status = 'staging' if talk: print "setting", tr_wf.name, "to staging" #session.commit() ## one big session commit at the end that everything went fine session.commit()
def new_recoveror(url, specific, options=None):
    """Decide, for each workflow in a 'recovery' status, whether to clone it or hand it to an operator.

    Workflows whose status contains 'recovery' are examined (plus those in
    'assistance-manual' when `specific` is given). A workflow is set to
    'assistance-clone' when its priority is at least 85000 or it was
    injected less than 14 days ago; otherwise it is set to
    'assistance-manual'. Each status change is committed immediately.

    Parameters:
      url      -- passed through to workflowInfo and singleRecovery.
      specific -- optional substring; when set, only workflows whose name
                  contains it are processed, and manual ones are included.
      options  -- accepted for interface compatibility; not read here.

    Returns None.
    """
    if userLock('recoveror'):
        return

    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check():
        return

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    wfs = session.query(Workflow).filter(
        Workflow.status.contains('recovery')).all()
    if specific:
        wfs.extend(
            session.query(Workflow).filter(
                Workflow.status == 'assistance-manual').all())

    # Best-effort fetch of the operator action list; it is not consumed yet,
    # so a failure must not stop the round, but it should not be silent.
    from_operator = None
    try:
        from_operator = json.loads(
            os.popen(
                'curl -s http://vocms0113.cern.ch/actions/test.json').read())
        ## now we have a list of things that we can take action on
    except Exception as e:
        print("could not get the operator actions: %s" % str(e))

    # compare ages in days, matching the "14 days" rule below
    seconds_per_day = 24. * 60. * 60.

    for wfo in wfs:
        if specific and not specific in wfo.name:
            continue

        if not specific and 'manual' in wfo.status:
            continue

        wfi = workflowInfo(url, wfo.name)

        send_recovery = False  ## will make all acdc
        send_clone = False  ## will make a clone
        send_back = False  ## should just reject. manual ?
        send_manual = False  ## will set in manual

        where_to_run, missing_to_run = wfi.getRecoveryInfo()

        task_to_recover = where_to_run.keys()

        ## if the site at which the recovery could run in drain or out ?
        for task in task_to_recover:
            not_ready = set(where_to_run[task]) - set(SI.sites_ready)
            if not_ready:
                print("the following sites are not ready for the ACDC " +
                      ",".join(sorted(not_ready)))
                ## do we have a way of telling if a site is going to be out for a long time ?

        # check on priority: high prio, restart
        if int(wfi.request['RequestPriority']) >= 85000:
            send_clone = True

        # check on age of the request
        # NOTE(review): both timestamps go through time.mktime (local time);
        # the difference is consistent only if RequestDate is in UTC like
        # time.gmtime() -- confirm.
        injection_time = time.mktime(
            time.strptime('.'.join(map(str, wfi.request['RequestDate'])),
                          "%Y.%m.%d.%H.%M.%S")) / seconds_per_day
        now = time.mktime(time.gmtime()) / seconds_per_day
        if float(now - injection_time) < 14.:
            ## less than 14 days, start over
            send_clone = True
        else:
            send_manual = True

        if not send_recovery:
            ## check on whether the stats is very low
            pass

        if send_recovery:
            ## make acdc for all tasks
            # NOTE(review): send_recovery is never set to True above, so this
            # branch is currently unreachable; task_to_recover is a list of
            # task names, so indexing it by task would fail -- to be revisited
            # before enabling.
            for task in task_to_recover:
                actions = list(
                    set([
                        case['solution']
                        for code, case in task_to_recover[task]
                    ]))
                acdc = singleRecovery(url, task, wfi.request, actions, do=True)
        elif send_clone:
            ## this will get it cloned
            wfo.status = 'assistance-clone'
            session.commit()
        elif send_manual:
            wfo.status = 'assistance-manual'
            # persist right away, like the clone branch does
            session.commit()
def assignor(url, specific=None, talk=True, options=None):
    """Assign workflows that are ready to run to sites through the request manager.

    Workflows are read from the local database ('staged', plus 'considered',
    'staging' and 'considered-tried' depending on `specific` and
    `options.early`, or exactly the statuses listed in `options.from_status`).
    For each one the site whitelist is derived from campaign configuration,
    secondary and primary input location and availability, the assignment
    parameters are built, and reqMgrClient.assignWorkflow is called.
    Workflows that cannot be assigned are counted as stalled and reported
    through sendLog.

    Args:
        url: request manager host handed to the helper functions.
        specific: optional comma-separated list of name fragments; only
            workflows whose name contains one of them are handled.
        talk: when true, print the presence of each primary dataset.
        options: parsed command-line options.  Many attributes are read
            without a None check (go, early, from_status, limit, priority,
            partial, ...), so a real object is required in practice.

    Returns:
        None.  Returns early when a lock is set, the component check fails,
        or the lock info is not free and `options.go` is unset.

    NOTE(review): the block nesting of this function was re-derived from the
    statement order of a whitespace-collapsed source; the run-dependent MC
    section in particular should be compared against the deployed version.
    """

    def read_json(path):
        # read and close the file instead of leaking the handle
        with open(path) as handle:
            return json.loads(handle.read())

    def pick_options(options, parameters):
        ## parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v is not None:
                    if isinstance(v, str) and ',' in v:
                        # a real list, also where filter() returns an iterator
                        parameters[key] = [
                            piece for piece in v.split(',') if piece
                        ]
                    else:
                        parameters[key] = v

    def pick_campaign(assign_parameters, parameters):
        ## pick up campaign specific assignment parameters
        parameters.update(assign_parameters.get('parameters', {}))

    if userLock():
        return
    if duplicateLock():
        return
    if not componentInfo().check():
        return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = global_SI()
    LI = lockInfo()
    if not LI.free() and not options.go:
        return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print("Option Early is on")

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print("Overriding to read from %s" % (fetch_from,))

    for status in fetch_from:
        print("getting wf in %s" % (status,))
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print(len(wfos))

    ## in case of partial, go for fetching a list from json ?

    dataset_endpoints = read_json('%s/dataset_endpoints.json' % monitor_dir)
    aaa_mapping = read_json('%s/equalizor.json' % monitor_pub_dir)['mapping']
    all_stuck = set()
    all_stuck.update(read_json('%s/stuck_transfers.json' % monitor_pub_dir))
    all_stuck.update(getAllStuckDataset())

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ## order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True),
                       key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]
        # name -> first position in the priority-ordered list; unknown names
        # rank 0, exactly as the list.index() based lookup did
        position = {}
        for idx, name in enumerate(cache):
            position.setdefault(name, idx)
        wfos = sorted(wfos,
                      key=lambda wfo: position.get(wfo.name, 0),
                      reverse=True)
        print("10 first %s" % ([wfo.name for wfo in wfos[:10]],))
        print("10 last %s" % ([wfo.name for wfo in wfos[-10:]],))
    else:
        random.shuffle(wfos)

    for wfo in wfos:
        if options.limit and (n_stalled + n_assigned) > options.limit:
            break
        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(sp in wfo.name for sp in specific.split(',')):
                continue

        print("\n\n")
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed',
                'aborted-archived', 'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early:
            options_text += ", early option is ON"
        if options.partial:
            options_text += ", partial option is ON"
            options_text += ", good fraction is %.2f" % options.good_enough

        wfh.sendLog('assignor',
                    "%s to be assigned%s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign not in CI.campaigns:
                continue
            campaign_conf = CI.campaigns[campaign]
            assign_parameters.update(campaign_conf)

            if 'secondaries' in campaign_conf:
                if campaign_conf['secondaries']:
                    allowed_secondary.update(campaign_conf['secondaries'])
                    check_secondary = True
            if 'banned_tier' in campaign_conf:
                banned_tier = list(
                    set(campaign_conf['banned_tier']) & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                wfh.sendLog(
                    'assignor', '%s is not an allowed secondary' % (', '.join(
                        set(secondary) - set(allowed_secondary.keys()))))
                sendLog('assignor',
                        '%s is not an allowed secondary' % (', '.join(
                            set(secondary) - set(allowed_secondary.keys()))),
                        level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print("%s %s" % (wfo.name, wfh.request['RequestStatus']))

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'],
                                       'SecondaryLocation', [])

        blocks = wfh.getBlockWhiteList()
        rwl = wfh.getRunWhiteList()
        if rwl:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set(blocks + getDatasetBlocks(dataset,
                                                            runs=rwl)))
        lwl = wfh.getLumiWhiteList()
        if lwl:
            ## augment with lumi white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(dataset, lumis=lwl)))

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print("Could do partial disk copy assignment")
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog(
                    'assignor',
                    "Overiding partial copy assignment to %.2f fraction" %
                    do_partial)

        do_partial = options.good_enough if options.partial else do_partial

        for sec in list(secondary):
            if override_sec_location:
                print("We don't care where the secondary is")
                print("Cannot pass for now")

            presence = getDatasetPresence(url, sec)
            print(sec)
            print(json.dumps(presence, indent=2))
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items()
                if frac > 98.
            ]

            if secondary_aaa:
                if not one_secondary_locations:
                    sec_availability = getDatasetBlocksFraction(url, sec)
                    if sec_availability >= 1. and options.go:
                        ## there is at least one copy of each block on disk. We should go ahead and let it go.
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is available %s times on disk, and usable"
                            % (sec, sec_availability))
                    else:
                        ## not even a copy on disk anywhere !!!!
                        sites_allowed = []  ## will block the assignment
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is nowhere on disk" % sec)
                # read remotely: do not restrict the whitelist to the
                # location of the secondary
                continue

            if secondary_locations is None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            sites_allowed = [
                site for site in sites_allowed
                if SI.CE_to_SE(site) in one_secondary_locations
            ]

        wfh.sendLog(
            'assignor', "From/after secondary requirement, now Allowed%s" %
            sorted(sites_allowed))

        ## keep track of this, after secondary input location restriction : that's how you want to operate it
        initial_sites_allowed = copy.deepcopy(sites_allowed)

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default
        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print("endpoints from stagor %s" % (dataset_endpoints[prim],))
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print(prim)
                print(json.dumps(presence, indent=2))
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed],
                only_blocks=blocks)
            if primary_aaa:
                available_fractions[prim] = getDatasetBlocksFraction(
                    url, prim, only_blocks=blocks)

            # storage elements flagged as holding the dataset, and those
            # holding more than 90 percent of it
            complete_at = [
                psite for (psite, (there, frac)) in presence.items() if there
            ]
            mostly_at = [
                psite for (psite, frac) in presence.items() if frac[1] > 90.
            ]

            sites_all_data = [
                site for site in sites_with_data
                if SI.CE_to_SE(site) in complete_at
            ]
            if primary_aaa:
                sites_all_data = list(
                    set([SI.SE_to_CE(psite) for psite in complete_at]))
            sites_with_data = [
                site for site in sites_with_data
                if SI.CE_to_SE(site) in mostly_at
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if SI.CE_to_SE(site) in presence.keys()
            ]
            if primary_aaa:
                sites_with_any_data = list(
                    set([SI.SE_to_CE(psite) for psite in presence.keys()]))

            wfh.sendLog(
                'assignor', "Holding the data but not allowed %s" % sorted(
                    list(
                        set([
                            se_site for se_site in presence.keys()
                            if not SI.SE_to_CE(se_site) in sites_allowed
                        ]))))
            if primary_locations is None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            allowed_storage = set(
                [SI.CE_to_SE(site) for site in sites_allowed])
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site)
                    for site in list((set(secondary_locations) &
                                      set(primary_locations)) -
                                     allowed_storage)
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site)
                    for site in list(set(primary_locations) - allowed_storage)
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog(
                'assignor', "We could be running in addition at %s" %
                sorted(opportunistic_sites))
            if any(
                [osite in SI.sites_not_ready
                 for osite in opportunistic_sites]):
                wfh.sendLog(
                    'assignor', "One of the usable site is in downtime %s" % ([
                        osite for osite in opportunistic_sites
                        if osite in SI.sites_not_ready
                    ]))
                down_time = True
                ## should this be send back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)

        if cpuh > max_cpuh_block and not options.go:
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                    wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[
                wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            # take one out for the efficiency
            copies_wanted = max(1, copies_wanted - less_copies_than_requested)
        ## else: find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going

        wfh.sendLog('assignor',
                    "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_all_data:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
                available_fractions = {}
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready
                                         for osite in end_sites):
                    print(
                        "Flip the status of downtime, since our destinations are good"
                    )
                    down_time = False
                print("with added endpoints %s" % (sorted(end_sites),))
            else:
                print(
                    "Cannot do partial assignment without knowin the endpoints"
                )
                n_stalled += 1
                continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog(
                'assignor',
                "The workflow can run at %s under low pressure currently" %
                (','.join(allowed_and_low)))
            copies_wanted = max(1., copies_wanted - 1.)

        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            not_even_once = not all([
                available >= 1. for available in available_fractions.values()
            ])
            above_good = all([
                available >= do_partial
                for available in available_fractions.values()
            ])
            wfh.sendLog(
                'assignor',
                "The input dataset is not available %s times, only %s" %
                (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog(
                    'assignor',
                    "sending back to considered because of site downtime, instead of waiting"
                )
                sendLog(
                    'assignor',
                    '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'
                    % (wfo.name),
                    level='delay')
                n_stalled += 1
                continue

            print(json.dumps(available_fractions))
            if (options.go and not_even_once) or not options.go:
                # the list of already reported workflows is best effort: a
                # missing or unreadable file simply means nothing is known
                known = []
                try:
                    known = read_json('cannot_assign.json')
                except (IOError, ValueError):
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (
                        do_partial and above_good):
                    wfh.sendLog(
                        'assignor',
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)))
                    known.append(wfo.name)
                    with open('cannot_assign.json', 'w') as handle:
                        handle.write(json.dumps(known, indent=2))

                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print("tried but status is %s" % (wfo.status,))
                if do_partial and above_good:
                    print("Will move on with partial locations")
                else:
                    n_stalled += 1
                    continue

        if not len(sites_allowed):
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading primary through xrootd at %s" %
                sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading secondary through xrootd at %s" %
                sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'):
            team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check is None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        # NOTE(review): the job-splitting computation is taken to apply to
        # every PU_RD processing string, with PU_RD2 only raising the number
        # of requested jobs - confirm against the deployed version.
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
            eventsPerJob = int(numEvents / (reqJobs * 1.4))
            lumisPerJob = int(eventsPerJob / eventsPerLumi)
            if lumisPerJob == 0:
                sendLog('assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob),
                        level='critical')
                wfh.sendLog(
                    'assignor',
                    "%s needs to be split by event with %s per job" %
                    (wfo.name, eventsPerJob))
                parameters['EventsPerJob'] = eventsPerJob
            else:
                spl = wfh.getSplittings()[0]
                # prefer the average estimate and fall back on the plain one;
                # the plain value used to be overwritten unconditionally
                if 'avg_events_per_job' in spl:
                    eventsPerJobEstimated = spl['avg_events_per_job']
                elif 'events_per_job' in spl:
                    eventsPerJobEstimated = spl['events_per_job']
                else:
                    eventsPerJobEstimated = None
                if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                    sendLog('assignor',
                            "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor', "%s was assigned with %s lumis/job" %
                        (wfo.name, lumisPerJob))
                    parameters['LumisPerJob'] = lumisPerJob
                else:
                    sendLog(
                        'assignor',
                        "leaving splitting untouched for %s, please check on %s"
                        % (pstring, wfo.name),
                        level='critical')
                    wfh.sendLog(
                        'assignor',
                        "leaving splitting untouched for PU_RD*, please check."
                    )

        if isHEPCloudReady(url) and wfh.isGoodForNERSC():
            parameters['Team'] = 'hepcloud'
            parameters['SiteWhitelist'] = ['T3_US_NERSC']
            if primary:
                parameters['TrustSitelists'] = True
            if secondary:
                parameters['TrustPUSitelists'] = True
            sendEmail("sending work to hepcloud",
                      "pleasse check on %s" % wfh.request['RequestName'],
                      destination=['*****@*****.**'])

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))

        ## team is not relevant anymore here
        result = reqMgrClient.assignWorkflow(url, wfo.name, None, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')
                except Exception as e:
                    print("fail in locking output")
                    print(str(e))
                    sendEmail("failed locking of output", str(e))
            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print("ERROR could not assign %s" % (wfo.name,))

    print("Assignment summary:")
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
def actor(url, options=None):
    """Carry out the operator-submitted recovery actions (clone or ACDC).

    The list of actions is downloaded from the actions page; each entry maps
    a workflow name to an 'Action' ('acdc' or 'clone'), its 'Parameters'
    and optionally the 'ACDCs' already made for it.

    * clone (only with `options.do`): the workflow and its listed ACDCs are
      invalidated, their output datasets are set INVALID and the workflow is
      cloned with singleClone.
    * acdc: for every task listed in the parameters a recovery workflow is
      created with singleRecovery and, with `options.ass`, assigned to the
      requested sites (or to the sites given by the recovery info).

    The action is removed with remove_action once it was handled while
    `options.do` is set.

    Args:
        url: request manager host handed to the helper functions.
        options: parsed command-line options; `do` and `ass` are read without
            a None check, so a real object is required in practice.

    Returns:
        None.

    NOTE(review): the block nesting of this function was re-derived from the
    statement order of a whitespace-collapsed source.
    """
    if userLock('actor'):
        return

    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check():
        return

    SI = siteInfo()
    UC = unifiedConfiguration()

    # Need to look at the actions page https://vocms0113.cern.ch:80/getaction (can add ?days=20) and perform any actions listed
    try:
        action_list = json.loads(
            os.popen(
                'curl -s -k https://vocms0113.cern.ch:80/getaction?days=15').
            read())
        ## now we have a list of things that we can take action on
    except Exception:
        print("Not able to load action list :(")
        sendLog('actor', 'Not able to load action list', level='critical')
        return

    print(action_list)
    if not action_list:
        print("EMPTY!")
        return

    for wfname in action_list:
        print('-' * 100)
        print("Looking at %s for recovery options" % (wfname,))

        entry = action_list[wfname]
        # reset for every workflow: an entry without 'Parameters' must not
        # inherit the parameters of the previous workflow
        tasks = entry['Parameters'] if 'Parameters' in entry else None
        requested = entry['Action'] if 'Action' in entry else None
        to_acdc = (requested == 'acdc')
        to_clone = (requested == 'clone')
        if to_acdc:
            print("Going to create ACDCs for  %s" % (wfname,))
        if to_clone:
            print("Going to clone  %s" % (wfname,))

        if not to_acdc and not to_clone:
            sendLog(
                'actor',
                'Action submitted for something other than acdc and clone for workflow %s'
                % wfname,
                level='critical')
            print("Can only do acdcs and clones! Skipping workflow  %s" %
                  (wfname,))
            continue
        if not tasks:
            sendLog('actor',
                    'Empty action submitted for workflow %s' % wfname,
                    level='critical')
            print("Moving on. Parameters is blank for " + wfname)
            continue

        wfi = workflowInfo(url, wfname)

        recover = True
        message_to_ops = ""
        message_to_user = ""

        #===========================================================
        if to_clone and options.do:
            print("Let's try kill and clone: ")
            wfi.sendLog('actor', 'Going to clone %s' % wfname)
            results = []
            datasets = set(wfi.request['OutputDatasets'])

            comment = ""
            if 'comment' in tasks:
                comment = ", reason: " + tasks['comment']
            wfi.sendLog(
                'actor',
                "invalidating the workflow by traffic controller %s" % comment)

            # Reject all workflows in the family
            # first reject the original workflow.
            reqMgrClient.invalidateWorkflow(
                url,
                wfi.request['RequestName'],
                current_status=wfi.request['RequestStatus'],
                cascade=False)
            # Then reject any ACDCs associated with that workflow
            if 'ACDCs' in entry:
                for child in entry['ACDCs']:
                    wfi.sendLog('actor', "rejecting %s" % child)
                    wfi_acdc = workflowInfo(url, child)
                    reqMgrClient.invalidateWorkflow(
                        url,
                        wfi_acdc.request['RequestName'],
                        current_status=wfi_acdc.request['RequestStatus'],
                        cascade=False)
                    datasets.update(wfi_acdc.request['OutputDatasets'])
            # Invalidate all associated output datasets
            for dataset in datasets:
                results.append(setDatasetStatus(dataset, 'INVALID'))

            if all(result in ['None', None, True] for result in results):
                wfi.sendLog('actor', "%s and children are rejected" % wfname)

            cloned = None
            try:
                cloned = singleClone(url, wfname, tasks, comment, options.do)
            except Exception:
                sendLog(
                    'actor',
                    'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.'
                    % wfname,
                    level='critical')
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                remove_action(wfname)
            if not cloned:
                recover = False
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                sendLog('actor',
                        'Failed to create clone for %s!' % wfname,
                        level='critical')
            else:
                wfi.sendLog('actor', "Workflow %s cloned" % wfname)

        #===========================================================
        elif to_acdc:
            if 'AllSteps' in tasks:
                allTasksDefaults = tasks['AllSteps']
                tasks.pop('AllSteps')
                # the per-task settings are dictionaries (they are indexed by
                # setting name below), so the common value is stored by key
                # whether or not the task already had that setting
                for setting in allTasksDefaults:
                    for task in tasks:
                        tasks[task][setting] = allTasksDefaults[setting]
            print("Tasks is ")
            print(tasks)

            all_tasks = wfi.getAllTasks()

            ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves

            try:
                WMErr = wfi.getWMErrors()
            except Exception:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because WMErr cannot be reached.'
                    % wfname,
                    level='critical')
                continue
            if not WMErr:
                sendLog('actor',
                        'Cannot create ACDCS for %s because WMErr is blank.' %
                        wfname,
                        level='critical')
                print("Moving on. WMErr is blank")
                continue

            try:
                where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo(
                )
                print("Where to run = ")
                print(where_to_run)
            except Exception:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because recovery info cannot be found.'
                    % wfname,
                    level='critical')
                print("Moving on. Cannot access recovery info for " + wfname)
                continue
            if not where_to_run:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because site list cannot be found.'
                    % wfname,
                    level='critical')
                print("Moving on. where to run is blank")
                continue

            message_to_ops = ""
            message_to_user = ""

            # count the tasks with failed jobs, leaving out the
            # LogCollect and Cleanup ones
            num_tasks_to_recover = 0
            for task in WMErr:
                if 'LogCollect' in task:
                    continue
                if 'Cleanup' in task:
                    continue
                if 'jobfailed' in WMErr[task]:
                    num_tasks_to_recover += 1

            if not num_tasks_to_recover:
                print("\tno error for %s" % (wfname,))

            if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
                ## we do not try to recover pLHE
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because it is a pLHE workflow.'
                    % wfname,
                    level='critical')
                print("We don't try to recover pLHE. Moving on.")
                recover = False

            recovering = set()
            for task in tasks:
                assign_to_sites = set()
                print("Task names is " + task)
                fulltaskname = '/' + wfname + '/' + task
                wrong_task = False
                for task_info in all_tasks:
                    if fulltaskname == task_info.pathName:
                        if task_info.taskType not in [
                                'Processing', 'Production', 'Merge'
                        ]:
                            wrong_task = True
                            wfi.sendLog(
                                'actor',
                                "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"
                                % (fulltaskname, task_info.taskType))
                if wrong_task:
                    continue
                print(tasks[task])
                actions = tasks[task]
                for action in actions:
                    if action.startswith('sites'):
                        if not isinstance(actions[action], list):
                            assign_to_sites = [SI.SE_to_CE(actions[action])]
                        else:
                            assign_to_sites = list(
                                set([
                                    SI.SE_to_CE(site)
                                    for site in actions[action]
                                ]))
                if not 'sites' in actions:
                    assign_to_sites = list(
                        set([SI.SE_to_CE(site)
                             for site in where_to_run[task]]))
                    print(
                        "Found %s as sites where to run the ACDC at, from the acdc doc of  %s"
                        % (sorted(assign_to_sites), wfname))
                print("Going to run at %s" % (sorted(assign_to_sites),))
                if not recover:
                    continue

                print("Initiating recovery")
                acdc = singleRecovery(url,
                                      fulltaskname,
                                      wfi.request,
                                      actions,
                                      do=options.do)
                if not acdc:
                    if options.do:
                        if recovering:
                            print(
                                wfname +
                                " has been partially ACDC'ed. Needs manual attention."
                            )
                            sendLog('actor',
                                    "%s has had %s/%s recoveries %s only" %
                                    (wfname, len(recovering),
                                     num_tasks_to_recover, list(recovering)),
                                    level='critical')
                            wfi.sendLog(
                                'actor',
                                "%s has had %s/%s recoveries %s only" %
                                (wfname, len(recovering),
                                 num_tasks_to_recover, list(recovering)))
                            break
                        else:
                            print(wfname + " failed recovery once")
                            recover = False
                            break
                    else:
                        print("no action to take further")
                        continue

                # ACDC was made correctly. Now we have to assign it.
                wfi.sendLog(
                    'actor', 'ACDC created for task %s. Actions taken \n%s' %
                    (fulltaskname, list(actions)))
                team = wfi.request['Teams'][0]
                parameters = {
                    'SiteWhitelist': sorted(assign_to_sites),
                    'AcquisitionEra': wfi.acquisitionEra(),
                    'ProcessingString': wfi.processingString(),
                    'MergedLFNBase': wfi.request['MergedLFNBase'],
                    'ProcessingVersion': wfi.request['ProcessingVersion'],
                }
                ## hackery for ACDC merge assignment
                if wfi.request[
                        'RequestType'] == 'TaskChain' and 'Merge' in task.split(
                            '/')[-1]:
                    parameters['AcquisitionEra'] = None
                    parameters['ProcessingString'] = None

                ## xrootd setttings on primary and secondary
                if 'xrootd' in actions:
                    if actions['xrootd'] == 'enabled':
                        print("Going to assign via xrootd")
                        parameters['TrustSitelists'] = True
                    elif actions['xrootd'] == 'disabled':
                        parameters['TrustSitelists'] = False
                elif ('TrustSitelists' in wfi.request
                      and wfi.request['TrustSitelists'] == 'true'):
                    parameters['TrustSitelists'] = True
                else:
                    parameters['TrustSitelists'] = False

                if 'TrustPUSitelists' in wfi.request and wfi.request[
                        'TrustPUSitelists']:
                    parameters['TrustPUSitelists'] = True

                if options.ass:
                    print("really doing the assignment of the ACDC %s" %
                          (acdc,))
                    parameters['execute'] = True
                    wfi.sendLog('actor',
                                "%s was assigned for recovery" % acdc)
                else:
                    print("no assignment done with this ACDC %s" % (acdc,))
                    sendLog('actor',
                            "%s needs to be assigned" % (acdc),
                            level='critical')
                    continue

                result = reqMgrClient.assignWorkflow(url, acdc, team,
                                                     parameters)
                if not result:
                    print("%s was not assigned" % (acdc,))
                    sendLog('actor',
                            "%s needs to be assigned" % (acdc),
                            level='critical')
                else:
                    recovering.add(acdc)
                wfi.sendLog('actor', "ACDCs created for %s" % wfname)

        #===========================================================
        if recover and options.do:
            remove_action(wfname)

        if message_to_user:
            print("%s to be notified to user(DUMMY) %s" %
                  (wfname, message_to_user))

        if message_to_ops:
            print('message')
def closor(url, specific=None, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') jump_the_line = options.announce if options else False if jump_the_line: print "announce option is on. Checking on things on-going ready to be announced" wfs = session.query(Workflow).filter( Workflow.status.contains('announce')).filter( sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: print "regular option. Checking on things done and to be announced" wfs = session.query(Workflow).filter(Workflow.status == 'close').all() wfs_n = [w.name for w in wfs] print "unique names?" print len(set(wfs_n)) == len(wfs_n) held = set() print len(wfs), "closing" random.shuffle(wfs) max_per_round = UC.get('max_per_round').get('closor', None) if options.limit: max_per_round = options.limit if max_per_round: ## order them by priority all_closedout = sorted(getWorkflows(url, 'closed-out', details=True), key=lambda r: r['RequestPriority']) all_closedout = [r['RequestName'] for r in all_closedout] def rank(wfn): return all_closedout.index(wfn) if wfn in all_closedout else 0 wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True) wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") for iwfo, wfo in enumerate(wfs): if specific and not specific in wfo.name: continue print "Progress [%d/%d]" % (iwfo, len(wfs)) ## what is the expected #lumis wfi = workflowInfo(url, wfo.name) wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url, batch_name, details=True) batch_go[batch_name] = all( map( lambda s: not s in [ 'completed', 'running-open', 
'running-closed', 'acquired', 'assigned', 'assignment-approved' ], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog( 'closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close' % (batch_name, batch_name)) continue if wfi.request['RequestStatus'] in ['announced', 'normal-archived' ] and not options.force: ## manually announced ?? wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', '%s is announced already : %s' % (wfo.name, wfo.wm_status)) session.commit() if jump_the_line: wfi.sendLog('closor', 'Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name, "has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis'] == 0: print wfo.name, "is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda: False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name, wfi.request['RequestStatus'] for out in outputs: event_count, lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter( Output.datasetname == out).first() if not odb: print "adding an output object", out odb = Output(datasetname=out) odb.workflow = wfo session.add(odb) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() fraction = lumi_count / float(expected_lumis) * 100. 
completion_line = "%60s %d/%d = %3.2f%%" % ( out, lumi_count, expected_lumis, fraction) wfi.sendLog('closor', "\t%s" % completion_line) if wfi.isRelval() and fraction < batch_goodness: batch_warnings[wfi.getCampaign()].add(completion_line) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence(url, out) where = [site for site, info in presence.items() if info[0]] if where: all_OK[out] = True print out, "is in full at", ",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites'] + wfi.request[ 'CustodialSites'] wfi.sendLog( 'closor', "%s is not in full anywhere. send to %s" % (out, ",".join(sorted(going_to)))) at_destination = dict([(k, v) for (k, v) in presence.items() if k in going_to]) else_where = dict([(k, v) for (k, v) in presence.items() if not k in going_to]) print json.dumps(at_destination) print json.dumps(else_where, indent=2) ## do the full stuck transfer study, missing files and shit ! 
for there in going_to: late_info = findLateFiles(url, out, going_to=there) for l in late_info: l.update({"workflow": wfo.name, "dataset": out}) all_late_files.extend(late_info) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update(OK) ## only that status can let me go into announced if all(all_OK.values()) and ( (wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name, "to be announced" results = [] if not results: for out in outputs: if out in stats and not stats[out]: continue _, dsn, process_string, tier = out.split('/') if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog( 'closor', '%s to go to %s' % (out, ', '.join(sorted(destinations)))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest( url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex'][ 'request_created'][0]['id'] results.append(True) except: results.append('Failed relval transfer') elif all_OK[out]: campaign = None try: campaign = 
out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request[ 'Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[ campaign] and tier in CI.campaigns[campaign][ 'toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM") + UC.get( "tiers_to_DDM"): print "tier", tier, "neither TO or NO DDM for", out results.append('Not recognitized tier %s' % tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog( 'closor', "could not recognize %s for injecting in DDM" % out, level='critical') continue n_copies = 1 destinations = [] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[ campaign]: ddm_instructions = CI.campaigns[campaign][ 'DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier, indication in ddm_instructions.items( ): if ddmtier == tier or ddmtier in [ '*', 'all' ]: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination=" + ",".join( destinations) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending", out, " to DDM" status = pass_to_dynamo( [out], N=n_copies, sites=destinations if destinations else None, group=group_spec if group_spec else None) results.append(status) if status == True: wfi.sendLog( 'closor', '%s is send to dynamo in %s copies %s %s' % (out, n_copies, sorted(destinations), group_spec)) else: sendLog('closor', "could not add " + out + " to dynamo pool. 
check closor logs.", level='critical') wfi.sendLog( 'closor', "could not add " + out + " to dynamo pool. check closor logs.") else: print wfo.name, "no stats for announcing", out results.append('No Stats') if all( map(lambda result: result in ['None', None, True], results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade( url, wfo.name) if not res in ['None', None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in [ 'announced', 'normal-archived' ]: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade( url, wfo.name) results.append(res) #print results if all(map(lambda result: result in ['None', None, True], results)): if jump_the_line: if not 'announced' in wfo.status: wfo.status = wfo.status.replace( 'announce', 'announced') else: wfo.status = 'done' session.commit() wfi.sendLog('closor', "workflow outputs are announced") else: wfi.sendLog( 'closor', "Error with %s to be announced \n%s" % (wfo.name, json.dumps(results))) elif wfi.request['RequestStatus'] in [ 'failed', 'aborted', 'aborted-archived', 'rejected', 'rejected-archived', 'aborted-completed' ]: if wfi.isRelval(): wfo.status = 'forget' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', "%s is %s, but will not be set in trouble to find a replacement." % (wfo.name, wfo.wm_status)) else: wfo.status = 'trouble' wfo.wm_status = wfi.request['RequestStatus'] session.commit() else: print wfo.name, "not good for announcing:", wfi.request[ 'RequestStatus'] wfi.sendLog('closor', "cannot be announced") held.add(wfo.name) days_late = 0. retries_late = 10 really_late_files = [ info for info in all_late_files if info['retries'] >= retries_late ] really_late_files = [ info for info in really_late_files if info['delay'] / (60 * 60 * 24.) 
>= days_late ] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % ( len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2)) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor', subject) print subject open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2)) if held: sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(sorted(held))), level='critical') #batches = json.loads(open('batches.json').read()) for bname, go in batch_go.items(): if go: subject = "Release Validation Samples Batch %s" % bname issues = "" if batch_warnings[bname]: issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness issues += "\n".join(sorted(batch_warnings[bname])) issues += "\n\n" text = """ Dear all, a batch of release validation workflows has finished. Batch ID: %s Detail of the workflows https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s %s This is an automated message. """ % (bname, bname, issues) to = ['*****@*****.**'] sendEmail(subject, text, destination=to)
def transferor(url, specific=None, talk=True, options=None): if userLock('transferor'): return if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance')).all()) max_to_handle = options.maxworkflows allowed_to_handle = max(0, max_to_handle - being_handled) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" print "... done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter( Workflow.status == 'considered').all(): if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" input_sizes = {} ## list the size of those in transfer already in_transfer_priority = 0 min_transfer_priority = 100000000 print "getting all wf in staging ..." 
for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) (_, primary, _, _) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get(prim) in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority in_transfer_already = sum(input_sizes.values()) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get(prim) print "... done" grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB is the available limit" % transfer_limit # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = {} went_over_budget = False for (wfo, wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name, "to be transfered" #wfh = workflowInfo( url, wfo.name) (_, primary, _, _) = wfh.getIO() this_load = sum([input_sizes[prim] for prim in primary]) if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: print "Transfer has gone over bubget." 
else: print "Transfer will go over bubget." print "%15.4f GB this load" % this_load print "%15.4f GB already this round" % sum(transfer_sizes.values()) print "%15.4f GB is the available limit" % transfer_limit went_over_budget = True if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: print "Higher priority sample", wfh.request[ 'RequestPriority'], ">=", in_transfer_priority, "go-on over budget" else: if not options.go: print min_transfer_priority, "minimum priority", wfh.request[ 'RequestPriority'], "<", in_transfer_priority, "stop" continue ## throtlle by campaign go if not CI.go(wfh.request['Campaign']): print "No go for", wfh.request['Campaign'] if not options.go: continue ## check if the batch is announced announced = False is_real = False for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break if not announced: print wfo.name, "does not look announced." # skipping?, rejecting?, reporting?" if not is_real: print wfo.name, "does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime( time.strptime('.'.join(map(str, wfh.request['RequestDate'])), "%Y.%m.%d.%H.%M.%S")) / (60. * 60.) now = time.mktime(time.gmtime()) / (60. * 60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining" % ( now - injection_time) continue passing_along += 1 if passing_along >= allowed_to_handle: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: print "Higher priority sample", wfh.request[ 'RequestPriority'], ">=", in_transfer_priority, "go-on over", max_to_handle else: print "Not allowed to pass more than", max_to_handle, "at a time. 
Currently", being_handled, "handled, and adding", passing_along break (lheinput, primary, parent, secondary) = wfh.getIO() if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput, primary, parent, secondary)) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters( wfh.request['Campaign'])['SiteWhitelist'] blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] can_go = True staging = False if primary: if talk: print wfo.name, 'reads', ', '.join(primary), 'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority'])) sites_really_allowed = [ site for site in sites_allowed if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] print "Sites allowed minus the vetoed transfer" print sites_really_allowed copies_needed = int( 0.35 * len(sites_really_allowed) ) + 1 ## should just go for a fixed number based if the white list grows that big print "Would make", copies_needed, "copies" if options.maxcopy > 0: copies_needed = min(options.maxcopy, copies_needed) ## remove the sites that do not want transfers print "need", copies_needed workflow_dependencies[prim].add(wfo.id) presence = getDatasetPresence(url, prim) prim_location = [ site for site, pres in presence.items() if pres[0] == True ] if len(prim_location) >= copies_needed: print "The output is all fully in place at", len( prim_location), "sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0, copies_needed - len(prim_location)) print "now need", copies_needed subscriptions = listSubscriptions(url, prim) prim_destination = list( set([ site for (site, (tid, decision)) in subscriptions.items() if decision and not any([ site.endswith(veto) for veto in ['MSS', 'Export', 'Buffer'] ]) ])) ## need to reject 
from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place prim_destination = [ site for site in prim_destination if not site in prim_location ] ## add transfer dependencies latching_on_transfers = list( set([ tid for (site, (tid, decision)) in subscriptions.items() if decision and site in prim_destination and not any([ site.endswith(veto) for veto in ['MSS', 'Export', 'Buffer'] ]) ])) print latching_on_transfers for latching in latching_on_transfers: tfo = session.query(Transfer).filter( Transfer.phedexid == latching).first() if not tfo: tfo = Transfer(phedexid=latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding", wfo.id, "to", tfo.id, "with phedexid", latching l = copy.deepcopy(tfo.workflows_id) l.append(wfo.id) tfo.workflows_id = l if not options.test: session.commit() else: session.flush( ) ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? 
copies_needed = max(0, copies_needed - len(prim_destination)) print "then need", copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with", latching_on_transfers can_go = True continue prim_to_distribute = [ site for site in sites_allowed if not any( [osite.startswith(site) for osite in prim_location]) ] prim_to_distribute = [ site for site in prim_to_distribute if not any( [osite.startswith(site) for osite in prim_destination]) ] ## take out the ones that cannot receive transfers prim_to_distribute = [ site for site in prim_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if len( prim_to_distribute ) > 0: ## maybe that a parameter we can play with to limit the if not options or options.chop: spreading = distributeToSites(getDatasetChops(prim), prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges) else: spreading = {} for site in prim_to_distribute: spreading[site] = [prim] can_go = False transfer_sizes[prim] = input_sizes[prim] for (site, items) in spreading.items(): all_transfers[site].extend(items) if secondary: if talk: print wfo.name, 'reads', ', '.join(secondary), 'in secondary' for sec in secondary: workflow_dependencies[sec].add(wfo.id) presence = getDatasetPresence(url, sec) sec_location = [ site for site, pres in presence.items() if pres[1] > 90. 
] ## more than 90% of the minbias at sites subscriptions = listSubscriptions(url, sec) sec_destination = [site for site in subscriptions] sec_to_distribute = [ site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any( [osite.startswith(site) for osite in sec_destination]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if len(sec_to_distribute) > 0: for site in sec_to_distribute: all_transfers[site].append(sec) can_go = False ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name, "latches on existing transfers, and nothing else" wfo.status = 'staging' else: print wfo.name, "should just be assigned NOW to", sites_allowed wfo.status = 'staged' print "setting status to", wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name, "latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to", wfo.status session.commit() print wfo.name, "needs a transfer" needs_transfer += 1 #print json.dumps(all_transfers) fake_id = -1 wf_id_in_prestaging = set() for (site, items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site, "does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. 
## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to", site, "(CE)", site_se, "(SE) for" else: print "Would make a replica to", site, "(CE)", site_se, "(SE) for" print "\t", len(blocks), "blocks" ## remove blocks if full dataset is send out blocks = [ block for block in blocks if not block.split('#')[0] in datasets ] print "\t", len(blocks), "needed blocks for", list( set([block.split('#')[0] for block in blocks])) print "\t", len(datasets), "datasets" print "\t", datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y', 'yes', 'go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result = {'phedex': {'request_created': []}} fake_id -= 1 if not result: print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter( Transfer.phedexid == phedexid).first() print phedexid, "transfer created" if not new_transfer: new_transfer = Transfer(phedexid=phedexid) session.add(new_transfer) new_transfer.workflows_id = set() for transfering in list( 
set(map(lambda it: it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering]) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status != 'staging': if execute: tr_wf.status = 'staging' if talk: print "setting", tr_wf.name, "to staging" session.commit()
def closor(url, specific=None, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') jump_the_line = options.announce if options else False if jump_the_line: wfs = session.query(Workflow).filter(Workflow.status.contains('announce')).filter(sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: wfs = session.query(Workflow).filter(Workflow.status=='close').all() held = set() print len(wfs),"closing" max_per_round = UC.get('max_per_round').get('closor',None) if options.limit: max_per_round = options.limit random.shuffle( wfs ) if max_per_round: wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") for wfo in wfs: if specific and not specific in wfo.name: continue ## what is the expected #lumis wfi = workflowInfo(url, wfo.name ) wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url , batch_name, details=True) batch_go[ batch_name ] = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog('closor', 'Cannot close for now because the batch %s is not all close'% batch_name) continue if wfi.request['RequestStatus'] in ['announced','normal-archived'] and not options.force: ## manually announced ?? 
wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,wfo.wm_status)) session.commit() if jump_the_line: wfi.sendLog('closor','Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name,"has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis']==0: print wfo.name,"is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda : False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name,wfi.request['RequestStatus'] for out in outputs: event_count,lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter(Output.datasetname==out).first() if not odb: print "adding an output object",out odb = Output( datasetname = out ) odb.workflow = wfo session.add( odb ) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() fraction = lumi_count/float(expected_lumis)*100. 
completion_line = "%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,fraction) wfi.sendLog('closor',"\t%s"% completion_line) if wfi.isRelval() and fraction < batch_goodness: batch_warnings[ wfi.getCampaign()].add( completion_line ) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence( url, out ) where = [site for site,info in presence.items() if info[0]] if where: all_OK[out] = True print out,"is in full at",",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites'] wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to)))) at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to]) else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to]) print json.dumps( at_destination ) print json.dumps( else_where, indent=2 ) ## do the full stuck transfer study, missing files and shit ! 
for there in going_to: late_info = findLateFiles(url, out, going_to = there ) for l in late_info: l.update({"workflow":wfo.name,"dataset":out}) all_late_files.extend( late_info ) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update( OK ) ## only that status can let me go into announced if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name,"to be announced" results=[] if not results: for out in outputs: if out in stats and not stats[out]: continue _,dsn,process_string,tier = out.split('/') if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog('closor', '%s to go to %s'%(out, ', '.join( sorted( destinations )))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest(url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex']['request_created'][0]['id'] results.append( True ) except: results.append( 'Failed relval transfer' ) elif all_OK[out]: campaign = None try: campaign = 
out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request['Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"): print "tier",tier,"neither TO or NO DDM for",out results.append('Not recognitized tier %s'%tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical') continue n_copies = 2 destinations=[] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]: ddm_instructions = CI.campaigns[campaign]['DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier,indication in ddm_instructions.items(): if ddmtier==tier or ddmtier in ['*','all']: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination="+",".join( destinations ) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending",out," to DDM" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 0 --exec'%(n_copies, out,destination_spec, group_spec)) ddm_text = p.read() print ddm_text status = p.close() if status!=None: print "Failed DDM, retrying to send",out,"a second time" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 1 --exec'%(n_copies, out,destination_spec, group_spec)) ddm_text = p.read() print ddm_text status = 
p.close() if status!=None: #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.") sendLog('closor',"could not add "+out+" to DDM pool. check closor logs.", level='critical') if options.force: status = True results.append( status ) if status == None: wfi.sendLog('closor',ddm_text) wfi.sendLog('closor','%s is send to AnalysisOps DDM pool in %s copies %s'%( out, n_copies, destination_spec)) else: print wfo.name,"no stats for announcing",out results.append('No Stats') if all(map(lambda result : result in ['None',None,True],results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade(url, wfo.name) if not res in ['None',None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in ['announced','normal-archived']: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade(url, wfo.name) results.append( res ) #print results if all(map(lambda result : result in ['None',None,True],results)): if jump_the_line: if not 'announced' in wfo.status: wfo.status = wfo.status.replace('announce','announced') else: wfo.status = 'done' session.commit() wfi.sendLog('closor',"workflow outputs are announced") else: wfi.sendLog('closor',"Error with %s to be announced \n%s"%( wfo.name, json.dumps( results ))) elif wfi.request['RequestStatus'] in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: if wfi.isRelval(): wfo.status = 'forget' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor',"%s is %s, but will not be set in trouble to find a replacement."%( wfo.name, wfo.wm_status)) else: wfo.status = 'trouble' wfo.wm_status = wfi.request['RequestStatus'] session.commit() else: print wfo.name,"not good for announcing:",wfi.request['RequestStatus'] wfi.sendLog('closor',"cannot be announced") held.add( 
wfo.name ) days_late = 0. retries_late = 10 really_late_files = [info for info in all_late_files if info['retries']>=retries_late] really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) ) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor',subject) print subject open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2)) if held: sendLog('closor',"the workflows below are held up \n%s"%("\n".join( sorted(held) )), level='critical') #batches = json.loads(open('batches.json').read()) for bname,go in batch_go.items(): if go: subject = "Release Validation Samples Batch %s"% bname issues="" if batch_warnings[ bname ]: issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness issues+="\n".join( sorted( batch_warnings[ bname ] )) issues+="\n\n" text = """ Dear all, a batch of release validation workflows has finished. Batch ID: %s Detail of the workflows https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s %s This is an automated message. """%( bname, bname, issues) to = ['*****@*****.**'] sendEmail(subject, text, destination=to )
def actor(url, options=None): if moduleLock(wait=False, silent=True)(): return if userLock('actor'): return up = componentInfo(mcm=False, soft=['mcm']) if not up.check(): return # CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() # Need to look at the actions page https://vocms0113.cern.ch:80/getaction (can add ?days=20) and perform any actions listed try: action_list = json.loads( os.popen( 'curl -s -k https://vocms0113.cern.ch:80/getaction?days=15'). read()) ## now we have a list of things that we can take action on except: try: action_list = json.loads( os.popen( 'curl -s -k https://vocms0113.cern.ch/getaction?days=15'). read()) except: print "Not able to load action list :(" sendLog('actor', 'Not able to load action list', level='critical') return if options.actions: action_list = json.loads(open(options.actions).read()) print json.dumps(action_list, indent=2) if not action_list: print "EMPTY!" return wf_list = action_list.keys() print json.dumps(sorted(wf_list), indent=2) if options.spec: wf_list = [wf for wf in wf_list if options.spec in wf] max_per_round = UC.get('max_per_round').get('actor', None) if max_per_round: random.shuffle(wf_list) wf_list = wf_list[:max_per_round] for wfname in wf_list: print '-' * 100 print "Looking at", wfname, "for recovery options" to_clone = False to_acdc = False to_force = False to_hold = False something_to_do = False tasks = action_list[wfname].get('Parameters', None) to_acdc = action_list[wfname].get('Action', None) == 'acdc' to_clone = action_list[wfname].get('Action', None) == 'clone' to_force = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters', {}).get('action', None) in ['by-pass', 'bypass'] to_hold = action_list[wfname].get( 'Action', None) == 'special' and action_list[wfname].get( 'Parameters', {}).get('action', None) in ['onhold', 'on-hold'] if not to_acdc and not to_clone and not to_force and not to_hold: sendLog( 'actor', 'Action submitted for something 
other than acdc, clone, bypass or hold for workflow %s' % wfname, level='critical') print json.dumps(action_list[wfname], indent=2) continue if not tasks and to_acdc: sendLog('actor', 'Empty action submitted for workflow %s' % wfname, level='critical') print "Moving on. Parameters is blank for " + wfname continue wfi = workflowInfo(url, wfname) recover = True message_to_ops = "" message_to_user = "" #=========================================================== if to_clone and options.do: print "Let's try kill and clone: " wfi.sendLog('actor', 'Going to clone %s' % wfname) results = [] datasets = set(wfi.request['OutputDatasets']) comment = "" if 'comment' in tasks: comment = ", reason: " + tasks['comment'] wfi.sendLog( 'actor', "invalidating the workflow by traffic controller %s" % comment) #Reject all workflows in the family #first reject the original workflow. reqMgrClient.invalidateWorkflow( url, wfi.request['RequestName'], current_status=wfi.request['RequestStatus'], cascade=False) #Then reject any ACDCs associated with that workflow family = getWorkflowById(url, wfi.request['PrepID'], details=True) for fwl in family: print "rejecting", fwl['RequestName'], fwl['RequestStatus'] wfi.sendLog( 'actor', "rejecting %s, previous status %s" % (fwl['RequestName'], fwl['RequestStatus'])) reqMgrClient.invalidateWorkflow( url, fwl['RequestName'], current_status=fwl['RequestStatus'], cascade=False) datasets.update(fwl['OutputDatasets']) #Invalidate all associated output datasets for dataset in datasets: results.append(setDatasetStatus(dataset, 'INVALID')) if all(map(lambda result: result in ['None', None, True], results)): wfi.sendLog('actor', "%s and children are rejected" % wfname) cloned = None try: cloned = singleClone(url, wfname, tasks, comment, options.do) except Exception as e: sendLog( 'actor', 'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.' 
% wfname, level='critical') wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname) print str(e) ##let's not remove the action other the workflow goes to "trouble" and the WTC cannot set the action again #remove_action(wfname) if not cloned: recover = False wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname) sendLog('actor', 'Failed to create clone for %s!' % wfname, level='critical') else: wfi.sendLog('actor', "Workflow %s cloned" % wfname) #=========================================================== elif to_force: wfi.sendLog('actor', 'Bypassing from workflow traffic controler request') forcing = json.loads( open( '/afs/cern.ch/user/v/vlimant/public/ops/forcecomplete.json' ).read()) forcing.append(wfname) open('/afs/cern.ch/user/v/vlimant/public/ops/forcecomplete.json', 'w').write(json.dumps(sorted(set(forcing)))) elif to_hold: wfi.sendLog('actor', 'Holding on workflow traffic controler request') holding = json.loads( open('/afs/cern.ch/user/v/vlimant/public/ops/onhold.json'). read()) holding.append(wfname) open('/afs/cern.ch/user/v/vlimant/public/ops/onhold.json', 'w').write(json.dumps(sorted(set(holding)))) #=========================================================== elif to_acdc: if 'AllSteps' in tasks: allTasksDefaults = tasks['AllSteps'] tasks.pop('AllSteps') for setting in allTasksDefaults: for task in tasks: if setting in tasks[task]: tasks[task][setting] = allTasksDefaults[setting] else: tasks[task].append( {setting: allTasksDefaults[setting]}) print "Tasks is " print json.dumps(tasks, indent=2) all_tasks = wfi.getAllTasks() ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves try: WMErr = wfi.getWMErrors() # print WMErr except: sendLog( 'actor', 'Cannot create ACDCS for %s because WMErr cannot be reached.' % wfname, level='critical') continue if not WMErr: wfi.sendLog('actor', 'WMErrors is blank for %s.' % wfname) print "FYI getWMErrors is blank. 
Presumably there are only unreported errors" # continue try: where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo( ) print "Where to run = " print where_to_run except: sendLog( 'actor', 'Cannot create ACDCS for %s because recovery info cannot be found.' % wfname, level='critical') print "Moving on. Cannot access recovery info for " + wfname continue if not where_to_run: sendLog( 'actor', 'Cannot create ACDCS for %s because site list cannot be found.' % wfname, level='critical') print "Moving on. where to run is blank" continue message_to_ops = "" message_to_user = "" num_tasks_to_recover = 0 if WMErr: for task in WMErr: if 'LogCollect' in task: continue if 'Cleanup' in task: continue if not 'jobfailed' in WMErr[task]: continue else: num_tasks_to_recover += 1 # print "Task to recover: " + task if not num_tasks_to_recover: print "\tno error for", wfname # recover = False if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']: ## we do not try to recover pLHE sendLog( 'actor', 'Cannot create ACDCS for %s because it is a pLHE workflow.' % wfname, level='critical') print "We don't try to recover pLHE. Moving on." recover = False # sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname) # if wfi.request['RequestType'] in ['ReReco']: # recover= False # print 'cannot submit action. ReReco' # sendEmail('cannot submit action', '%s is request type ReReco'%wfname) recovering = set() for task in tasks: assign_to_sites = set() print "Task names is " + task fulltaskname = '/' + wfname + '/' + task # print "Full task name is " + fulltaskname wrong_task = False for task_info in all_tasks: if fulltaskname == task_info.pathName: if task_info.taskType not in [ 'Processing', 'Production', 'Merge' ]: wrong_task = True wfi.sendLog( 'actor', "Skipping task %s because the taskType is %s. 
Can only ACDC Processing, Production, or Merge tasks" % (fulltaskname, task_info.taskType)) if wrong_task: continue print tasks[task] actions = tasks[task] for action in actions: if action.startswith('sites'): if type(actions[action]) != list: assign_to_sites = [SI.SE_to_CE(actions[action])] else: assign_to_sites = list( set([ SI.SE_to_CE(site) for site in actions[action] ])) # if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']: # recover = False; # print "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname # wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname) if not 'sites' in actions: assign_to_sites = list( set([SI.SE_to_CE(site) for site in where_to_run[task]])) print "Found", sorted( assign_to_sites ), "as sites where to run the ACDC at, from the acdc doc of ", wfname print "Going to run at", sorted(assign_to_sites) if recover: print "Initiating recovery" acdc = singleRecovery(url, fulltaskname, wfi.request, actions, do=options.do) if not acdc: if options.do: if recovering: print wfname + " has been partially ACDC'ed. Needs manual attention." sendLog( 'actor', "%s has had %s/%s recoveries %s only" % (wfname, len(recovering), num_tasks_to_recover, list(recovering)), level='critical') wfi.sendLog( 'actor', "%s has had %s/%s recoveries %s only" % (wfname, len(recovering), num_tasks_to_recover, list(recovering))) break else: print wfname + " failed recovery once" recover = False break else: print "no action to take further" # sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical') continue else: #ACDC was made correctly. Now we have to assign it. wfi.sendLog( 'actor', 'ACDC created for task %s. 
Actions taken \n%s' % (fulltaskname, json.dumps(actions))) #team = wfi.request['Teams'][0] team = 'production' parameters = { 'SiteWhitelist': sorted(assign_to_sites), 'AcquisitionEra': wfi.acquisitionEra(), 'ProcessingString': wfi.processingString(), 'MergedLFNBase': wfi.request['MergedLFNBase'], 'ProcessingVersion': wfi.request['ProcessingVersion'], } ## hackery for ACDC merge assignment if wfi.request[ 'RequestType'] == 'TaskChain' and 'Merge' in task.split( '/')[-1]: parameters['AcquisitionEra'] = None parameters['ProcessingString'] = None ## xrootd setttings on primary and secondary if 'xrootd' in actions: if actions['xrootd'] == 'enabled': print "Going to assign via xrootd" parameters['TrustSitelists'] = True elif actions['xrootd'] == 'disabled': parameters['TrustSitelists'] = False elif ('TrustSitelists' in wfi.request and wfi.request['TrustSitelists'] == 'true'): parameters['TrustSitelists'] = True else: parameters['TrustSitelists'] = False if 'secondary' in actions: if actions['secondary'] == 'enabled': print 'Enabling reading the secondary input via xrootd' parameters['TrustPUSitelists'] = True elif actions['secondary'] == 'disabled': parameters['TrustPUSitelists'] = False #in case secondary is blank or not set to enabled or disabled elif 'TrustPUSitelists' in wfi.request and wfi.request[ 'TrustPUSitelists']: parameters['TrustPUSitelists'] = True elif 'TrustPUSitelists' in wfi.request and wfi.request[ 'TrustPUSitelists']: parameters['TrustPUSitelists'] = True if options.ass: print "really doing the assignment of the ACDC", acdc parameters['execute'] = True wfi.sendLog('actor', "%s was assigned for recovery" % acdc) else: print "no assignment done with this ACDC", acdc sendLog('actor', "%s needs to be assigned" % (acdc), level='critical') continue # print parameters result = reqMgrClient.assignWorkflow( url, acdc, team, parameters) if not result: print acdc, "was not assigned" sendLog('actor', "%s needs to be assigned" % (acdc), level='critical') else: 
recovering.add(acdc) wfi.sendLog('actor', "ACDCs created for %s" % wfname) #=========================================================== if recover and options.do: remove_action(wfname) if message_to_user: print wfname, "to be notified to user(DUMMY)", message_to_user if message_to_ops: print 'message' #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**']) # sendLog('recoveror',message_to_ops,level='warning') return
def closor(url, specific=None, options=None): if userLock(): return mlock = moduleLock() if mlock(): return up = componentInfo(soft=['mcm','wtc']) if not up.check(): return UC = unifiedConfiguration() CI = campaignInfo() BI = batchInfo() CloseI = closeoutInfo() all_late_files = [] jump_the_line = options.announce if options else False if jump_the_line: print "announce option is on. Checking on things on-going ready to be announced" wfs = session.query(Workflow).filter(Workflow.status.contains('announce')).filter(sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: print "regular option. Checking on things done and to be announced" wfs = session.query(Workflow).filter(Workflow.status=='close').all() if specific: wfs = [wfo for wfo in wfs if specific in wfo.name] wfs_n = [w.name for w in wfs] print "unique names?" print len(set(wfs_n)) == len(wfs_n) held = set() print len(wfs),"closing" random.shuffle( wfs ) max_per_round = UC.get('max_per_round').get('closor',None) if options.limit: max_per_round = options.limit if max_per_round: ## order them by priority all_closedout = sorted(getWorkflows(url, 'closed-out', details=True), key = lambda r : r['RequestPriority']) all_closedout = [r['RequestName'] for r in all_closedout] def rank( wfn ): return all_closedout.index( wfn ) if wfn in all_closedout else 0 wfs = sorted( wfs, key = lambda wfo : rank( wfo.name ),reverse=True) wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") closers = [] print len(wfs),"closing" th_start = time.mktime(time.gmtime()) for iwfo,wfo in enumerate(wfs): if specific and not specific in wfo.name: continue closers.append( CloseBuster( wfo = wfo, url = url, CI = CI, UC = UC, jump_the_line = jump_the_line, batch_goodness = batch_goodness, batch_go = batch_go, #stats = stats, batch_warnings = batch_warnings, all_late_files = all_late_files, held = held, )) run_threads = ThreadHandler( threads = closers, n_threads = 
options.threads, sleepy = 10, timeout = None, verbose = True, label = 'closor') run_threads.start() ## waiting on all to complete while run_threads.is_alive(): #print "Waiting on closing threads",time.asctime(time.gmtime()) time.sleep(5) JC = JIRAClient() if up.status.get('jira',False) else None print len(run_threads.threads),"finished thread to gather information from" failed_threads = 0 for to in run_threads.threads: if to.failed: failed_threads += 1 continue if to.outs: for outO in to.outs: out = outO.datasetname odb = session.query(Output).filter(Output.datasetname==out).first() if not odb: print "adding an output object",out session.add( outO ) else: odb.date = outO.date if to.to_status: to.wfo.status = to.to_status if JC and to.to_status == "done" and to.wfi: jiras = JC.find({"prepid" : to.wfi.request['PrepID']}) for jira in jiras: JC.close(jira.key) if to.to_wm_status: to.wfo.wm_status = to.to_wm_status if to.closing: CloseI.pop( to.wfo.name ) session.commit() th_stop = time.mktime(time.gmtime()) if wfs: time_spend_per_workflow = (th_stop-th_start) / float(len(wfs)) print "Average time spend per workflow is", time_spend_per_workflow if float(failed_threads/run_threads.n_threads) > 0: sendLog('checkor','%d/%d threads have failed, better check this out'% (failed_threads, run_threads.n_threads), level='critical') sendEmail('checkor','%d/%d threads have failed, better check this out'% (failed_threads,run_threads.n_threads)) days_late = 0. 
retries_late = 10 really_late_files = [info for info in all_late_files if info['retries']>=retries_late] really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) ) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor',subject) print subject open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2)) if held: sendLog('closor',"the workflows below are held up \n%s"%("\n".join( sorted(held) )), level='critical') for bname,go in batch_go.items(): if go: subject = "Release Validation Samples Batch %s"% bname issues="" if batch_warnings[ bname ]: issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness issues+="\n".join( sorted( batch_warnings[ bname ] )) issues+="\n\n" text = """ Dear all, a batch of release validation workflows has finished. Batch ID: %s Detail of the workflows https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s %s This is an automated message. """%( bname, bname, issues) to = ['*****@*****.**'] sendEmail(subject, text, destination=to ) ## just announced ; take it out now. BI.pop( bname ) if os.path.isfile('.closor_stop'): print "The loop on workflows was shortened" sendEmail('closor','Closor loop was shortened artificially using .closor_stop') os.system('rm -f .closor_stop')
def assignor(url ,specific = None, talk=True, options=None): if userLock('assignor'): return CI = campaignInfo() SI = siteInfo() wfos=[] if specific: wfos = session.query(Workflow).filter(Workflow.name==specific).all() if not wfos: if specific: wfos = session.query(Workflow).filter(Workflow.status=='considered').all() wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) for wfo in wfos: if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print wfo.name,"to be assigned" wfh = workflowInfo( url, wfo.name) ## check if by configuration we gave it a GO if not CI.go( wfh.request['Campaign'] ) and not options.go: print "No go for",wfh.request['Campaign'] continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': print wfo.name,wfh.request['RequestStatus'],"skipping" if not options.test: continue ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: print "cannot decide on version number" continue (lheinput,primary,parent,secondary) = wfh.getIO() sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) print "Allowed",sites_allowed sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] sites_custodial = [] if len(sites_custodial)==0: print "No custodial, it's fine, it's covered in close-out" if len(sites_custodial)>1: print "more than one custodial for",wfo.name sys.exit(36) secondary_locations=None for sec in list(secondary): presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>90.] 
one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} for prim in list(primary): presence = getDatasetPresence( url, prim ) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] ) sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_with_any_data = [site for site in sites_with_any_data if any([osite.startswith(site) for osite in presence.keys()])] if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] ## opportunistic running where any piece of data is available if secondary_locations and primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & 
set(primary_locations)) - set(sites_allowed))] print "We could be running at",opportunistic_sites,"in addition" if available_fractions and not all([available>=1. for available in available_fractions.values()]): print "The input dataset is not located in full at any site" print json.dumps(available_fractions) if not options.test and not options.go: continue ## skip skip skip copies_wanted = 2. if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): print "The input dataset is not available",copies_wanted,"times, only",available_fractions.values() if not options.go: continue ## default back to white list to original white list with any data print "Allowed",sites_allowed sites_allowed = sites_with_any_data print "Selected for any data",sites_allowed if options.restrict: print "Allowed",sites_allowed sites_allowed = sites_with_any_data print "Selected",sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?" print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) if not len(sites_allowed): print wfo.name,"cannot be assign with no matched sites" continue parameters={ 'SiteWhitelist' : sites_allowed, 'CustodialSites' : sites_custodial, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : '/store/mc', ## to be figured out ! 
from Hi shit 'ProcessingVersion' : version, } ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment parameters parameters.update( CI.parameters(wfh.request['Campaign']) ) if not options.test: parameters['execute'] = True if not wfh.checkWorkflowSplitting(): ## needs to go to event based ? fail for now print "Falling back to event splitting ?" #parameters['SplittingAlgorithm'] = 'EventBased' continue ## plain assignment here team='production' if options and options.team: team = options.team result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() else: print "ERROR could not assign",wfo.name else: pass
def transferor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() LI = lockInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0,max_to_handle - being_handled) allowed_to_transfer = max(0,max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer" else: print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) needing_locks=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." 
cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status=='considered').all(): print "\t",wfo.name if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority=0 min_transfer_priority=100000000 print "getting all wf in staging ..." for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) (lheinput,primary,parent,secondary) = wfh.getIO() sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: input_sizes[prim] = dss.get( prim ) print "\t",wfo.name,"needs",input_sizes[prim],"GB" in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... 
done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority print "transfers per sites" print json.dumps( transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) # shuffle first by name random.shuffle( wfs_and_wfh ) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: input_sizes[prim] = dss.get( prim ) print "... done" cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already ) print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer ) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print 
"%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False destination_cache = {} for (wfo,wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name,"to be transfered" #wfh = workflowInfo( url, wfo.name) (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: print "Transfer has gone over bubget." else: print "Transfer will go over bubget." print "%15.4f GB this load"%this_load print "%15.4f GB already this round"%sum(transfer_sizes.values()) print "%15.4f GB is the available limit"%transfer_limit went_over_budget=True if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget" else: if not options.go: print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop" continue ## throtlle by campaign go if not CI.go( wfh.request['Campaign'] ): print "No go for",wfh.request['Campaign'] if not options.go: sendEmail("no go for managing","No go for "+wfh.request['Campaign']) continue ## check if the batch is announced def check_mcm(wfn): announced=False is_real=False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: print "could not get mcm batch announcement, 
assuming not real" return announced,is_real if not use_mcm: announced,is_real = False,True else: announced,is_real = check_mcm( wfo.name ) if not announced: print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?" if not is_real: print wfo.name,"does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle else: print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along if not options.go: break if this_load and needs_transfer >= allowed_to_transfer: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_transfer else: print "Not allowed to transfer more than",max_to_transfer,"at a time. 
Currently",being_transfered,"transfering, and adding",needs_transfer if not options.go: continue (lheinput,primary,parent,secondary) = wfh.getIO() for dataset in list(primary)+list(parent)+list(secondary): ## lock everything flat NLI.lock( dataset ) if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] if 'SiteBlacklist' in CI.parameters(wfh.request['Campaign']): sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfh.request['Campaign'])['SiteBlacklist'])) ## reduce right away to sites in case of memory limitation memory_allowed = SI.sitesByMemory( wfh.request['Memory'] ) if memory_allowed!=None: print "sites allowing", wfh.request['Memory'],"are",memory_allowed sites_allowed = list(set(sites_allowed) & set(memory_allowed)) if not sites_allowed: print wfo.name,"has no possible sites to run at" print "available for",wfh.request['Memory'],"are",memory_allowed sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## should make the block selection here pass if 'LumiList' in wfh.request and wfh.request['LumiList']: ## same, we could be doing the white list here too pass if blocks: print "Reading",len(blocks),"in whitelist" can_go = True staging=False allowed=True if primary: copies_needed_from_CPUh,CPUh = wfh.getNCopies() if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) sites_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in 
SI.sites_veto_transfer])] print "Sites allowed minus the vetoed transfer" print sorted(sites_allowed) copies_needed_from_site = int(0.35*len(sites_allowed))+1 ## should just go for a fixed number based if the white list grows that big print "Would make",copies_needed_from_site,"copies from site white list" copies_needed = copies_needed_from_site print "Would make",copies_needed_from_CPUh,"from cpu requirement",CPUh copies_needed = copies_needed_from_CPUh if options.maxcopy>0: ## stop maxing things out ?? #copies_needed = min(options.maxcopy,copies_needed) #print "Maxed to",copies_needed if copies_needed_from_CPUh > options.maxcopy: sendEmail('An example of more than three copies','for %s it could have been beneficial to make %s copies'%( wfo.name, copies_needed_from_CPUh)) if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign,copies_needed_from_site) print "Maxed to",copies_needed,"by campaign configuration",wfh.request['Campaign'] ## remove the sites that do not want transfers workflow_dependencies[prim].add( wfo.id ) ##################################### ###### JR 3/8/15 #### deprecating this """ presence = getDatasetPresence( url, prim , within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) prim_location = [site for site,pres in presence.items() if pres[0]==True] prim_parts = [site for site,pres in presence.items() if pres[0]==False] if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed subscriptions = listSubscriptions( url , prim , sites_allowed ) prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if 
decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) ## remove the subscription where the dataset is in parts at #prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']]) and not site in prim_parts])) ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place prim_destination = [site for site in prim_destination if not site in prim_location] ## add transfer dependencies latching_on_transfers = list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) print latching_on_transfers """ ###### JR 3/8/15 #### deprecating this ##################################### ### new ways of making the whole thing destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks ) #destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='DataOps') #anaops_destinations,anaops_all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='AnalysisOps' ) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1] ## the rest is places it is going to be prim_destination = [site for site in destinations.keys() if not site in prim_location] ## need to take out the transfer veto prim_destination = [site for site in prim_destination if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] for dsite in prim_destination: needing_locks[dsite].append( prim ) 
if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites",prim_location continue copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names] latching_on_transfers = set() [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]): ## means there is openings let me go print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute] for site in sites_allowed: #increment accross the board, regardless of real destination: could be changed transfers_per_sites[site] += 1 else: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over transfer slots available" else: print "Not allowed to transfer more than",max_staging_per_site," per site at a time. 
Going overboard for",[site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site] if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0,copies_needed - min(copies_being_made)) print "then need",copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with",latching_on_transfers can_go = True continue if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks) spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site]=blocks else: spreading[site]=[prim] transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified can_go = False print "selected CE destinations",spreading.keys() for (site,items) in spreading.items(): all_transfers[site].extend( items ) if not allowed: print "Not allowed to move on with",wfo.name continue if secondary: if talk: print 
wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) if False: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) destinations = destination_cache[sec] ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9] sec_location = [site for (site,info) in destinations.items() if info['completion']>=95] sec_destination = [site for site in destinations.keys() if not site in sec_location] else: ## old style presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] for site in sec_location: needing_locks[site].append( sec ) for site in sec_destination: needing_locks[site].append( sec ) sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len( sec_to_distribute )>0: sec_size = dss.get( sec ) for site in sec_to_distribute: site_se =SI.CE_to_SE(site) if (SI.disk[site_se]*1024.) 
> sec_size: all_transfers[site].append( sec ) can_go = False else: print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name,"latches on existing transfers, and nothing else" wfo.status = 'staging' needs_transfer+=1 else: print wfo.name,"should just be assigned NOW to",sites_allowed wfo.status = 'staged' passing_along+=1 print "setting status to",wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name,"latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to",wfo.status session.commit() print wfo.name,"needs a transfer" needs_transfer+=1 passing_along+=1 print "accumulated locks of dataset in place" print json.dumps(needing_locks, indent=2) for site,items in needing_locks.items(): for item in items: LI.lock( item, SI.CE_to_SE(site), 'usable input') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. 
## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to",site,"(CE)",site_se,"(SE) for" else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) print "\t",len(datasets),"datasets" print "\t",datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ #for item in list(set([it.split('#')[0] for it in items_to_transfer])): for item in items_to_transfer: LI.lock( item, site_se, 'pre-staging') else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first() print phedexid,"transfer created" if not new_transfer: new_transfer = 
Transfer( phedexid = phedexid) session.add( new_transfer ) new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def closor(url, specific=None, options=None): if userLock(): return mlock = moduleLock() if mlock() and not options.manual: return up = componentInfo(soft=['mcm', 'wtc']) if not up.check(): return UC = unifiedConfiguration() CI = campaignInfo() BI = batchInfo() CloseI = closeoutInfo() all_late_files = [] jump_the_line = options.announce if options else False if jump_the_line: print "announce option is on. Checking on things on-going ready to be announced" wfs = session.query(Workflow).filter( Workflow.status.contains('announce')).filter( sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: print "regular option. Checking on things done and to be announced" wfs = session.query(Workflow).filter(Workflow.status == 'close').all() if specific: wfs = [wfo for wfo in wfs if specific in wfo.name] wfs_n = [w.name for w in wfs] print "unique names?" print len(set(wfs_n)) == len(wfs_n) held = set() print len(wfs), "closing" random.shuffle(wfs) max_per_round = UC.get('max_per_round').get('closor', None) if options.limit: max_per_round = options.limit if max_per_round: ## order them by priority all_closedout = sorted(getWorkflows(url, 'closed-out', details=True), key=lambda r: r['RequestPriority']) all_closedout = [r['RequestName'] for r in all_closedout] def rank(wfn): return all_closedout.index(wfn) if wfn in all_closedout else 0 wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True) wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_extreme_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") closers = [] print len(wfs), "closing" th_start = time.mktime(time.gmtime()) for iwfo, wfo in enumerate(wfs): if specific and not specific in wfo.name: continue if not options.manual and ( 'cmsunified_task_HIG-RunIIFall17wmLHEGS-05036__v1_T_200712_005621_4159' .lower() in (wfo.name).lower() or 'pdmvserv_task_HIG-RunIISummer16NanoAODv7-03979__v1_T_200915_013748_1986' .lower() in (wfo.name).lower()): continue 
closers.append( CloseBuster( wfo=wfo, url=url, CI=CI, UC=UC, jump_the_line=jump_the_line, batch_goodness=batch_goodness, batch_go=batch_go, #stats = stats, batch_warnings=batch_warnings, batch_extreme_warnings=batch_extreme_warnings, all_late_files=all_late_files, held=held, )) run_threads = ThreadHandler(threads=closers, n_threads=options.threads, sleepy=10, timeout=None, verbose=True, label='closor') run_threads.start() ## waiting on all to complete while run_threads.is_alive(): #print "Waiting on closing threads",time.asctime(time.gmtime()) time.sleep(5) JC = JIRAClient() if up.status.get('jira', False) else None print len( run_threads.threads), "finished thread to gather information from" failed_threads = 0 for to in run_threads.threads: if to.failed: failed_threads += 1 continue if to.outs: for outO in to.outs: out = outO.datasetname odb = session.query(Output).filter( Output.datasetname == out).first() if not odb: print "adding an output object", out session.add(outO) else: odb.date = outO.date if to.to_status: to.wfo.status = to.to_status if JC and to.to_status == "done" and to.wfi: jiras = JC.find({"prepid": to.wfi.request['PrepID']}) for jira in jiras: JC.close(jira.key) if to.to_wm_status: to.wfo.wm_status = to.to_wm_status if to.closing: CloseI.pop(to.wfo.name) session.commit() th_stop = time.mktime(time.gmtime()) if wfs: time_spend_per_workflow = (th_stop - th_start) / float(len(wfs)) print "Average time spend per workflow is", time_spend_per_workflow if float(failed_threads / run_threads.n_threads) > 0: sendLog('checkor', '%d/%d threads have failed, better check this out' % (failed_threads, run_threads.n_threads), level='critical') sendEmail( 'checkor', '%d/%d threads have failed, better check this out' % (failed_threads, run_threads.n_threads)) days_late = 0. 
retries_late = 10 really_late_files = [ info for info in all_late_files if info['retries'] >= retries_late ] really_late_files = [ info for info in really_late_files if info['delay'] / (60 * 60 * 24.) >= days_late ] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % ( len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2)) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor', subject) print subject open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2)) if held: sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(sorted(held))), level='critical') for bname, go in batch_go.items(): if go: subject = "Release Validation Samples Batch %s" % bname issues = "" #if batch_warnings[ bname ]: # issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness # issues+="\n".join( sorted( batch_warnings[ bname ] )) # issues+="\n\n" if batch_extreme_warnings[bname]: subject = "Low Statistics for %s" % bname issues = "The following datasets have outstanding completion (<50%%) issues:\n\n" issues += "\n".join(sorted(batch_extreme_warnings[bname])) issues += "\n\n" elif batch_warnings[bname]: issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness issues += "\n".join(sorted(batch_warnings[bname])) issues += "\n\n" text = "" text += "Dear all,\n\n" text += "A batch of release validation workflows has finished.\n\n" text += "Batch ID:\n\n" text += "%s\n\n" % (bname) text += "Detail of the workflows\n\n" text += "https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s\n\n" % ( bname) text += "%s\n\n" % (issues) text += "This is an automated message.\n\n" text += "" to = ['*****@*****.**'] sendEmail(subject, text, destination=to) ## just announced ; take it out now. 
BI.pop(bname) deleteCampaignConfig(bname) if os.path.isfile('.closor_stop'): print "The loop on workflows was shortened" sendEmail('closor', 'Closor loop was shortened artificially using .closor_stop') os.system('rm -f .closor_stop')
def checkor(url, spec=None, options=None): fDB = closeoutInfo() if userLock(): return if duplicateLock(): return UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=["mcm"]) if not up.check(): return use_mcm = up.status["mcm"] wfs = [] if options.fetch: ## get all in running and check wfs.extend(session.query(Workflow).filter(Workflow.status == "away").all()) wfs.extend(session.query(Workflow).filter(Workflow.status == "assistance").all()) if options.nofetch: ## than get all in need for assistance wfs.extend(session.query(Workflow).filter(Workflow.status.startswith("assistance-")).all()) custodials = defaultdict(list) # sites : dataset list transfers = defaultdict(list) # sites : dataset list invalidations = [] # a list of files SI = global_SI CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split("/")[2].split("-")[0] except: if "Campaign" in wfi.request: campaign = wfi.request["Campaign"] return campaign by_passes = [] holdings = [] for bypassor, email in [ ("jbadillo", "*****@*****.**"), ("vlimant", "*****@*****.**"), ("jen_a", "*****@*****.**"), ]: bypass_file = "/afs/cern.ch/user/%s/%s/public/ops/bypass.json" % (bypassor[0], bypassor) if not os.path.isfile(bypass_file): print "no file", bypass_file continue try: by_passes.extend(json.loads(open(bypass_file).read())) except: print "cannot get by-passes from", bypass_file, "for", bypassor sendEmail("malformated by-pass information", "%s is not json readable" % (bypass_file), destination=[email]) holding_file = "/afs/cern.ch/user/%s/%s/public/ops/onhold.json" % (bypassor[0], bypassor) if not os.path.isfile(holding_file): print "no file", holding_file continue try: holdings.extend(json.loads(open(holding_file).read())) except: print "cannot get holdings from", holding_file, "for", bypassor sendEmail( "malformated by-pass information", "%s is not json readable" % (holding_file), destination=[email] ) total_running_time = 
5.0 * 60.0 sleep_time = max(0.5, total_running_time / len(wfs)) for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep(sleep_time) print "checking on", wfo.name ## get info wfi = workflowInfo(url, wfo.name) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request["RequestStatus"] if wfo.wm_status == "closed-out": ## manually closed-out print wfo.name, "is already", wfo.wm_status wfo.status = "close" session.commit() continue elif wfo.wm_status in [ "failed", "aborted", "aborted-archived", "rejected", "rejected-archived", "aborted-completed", ]: ## went into trouble wfo.status = "trouble" print wfo.name, "is in trouble", wfo.wm_status session.commit() continue elif wfo.wm_status in ["assigned", "acquired"]: ## not worth checking yet print wfo.name, "not running yet" session.commit() continue if "-onhold" in wfo.status: if wfo.name in holdings and wfo.name not in by_passes: print wfo.name, "on hold" continue if wfo.name in holdings and wfo.name not in by_passes: wfo.status = "assistance-onhold" print "setting", wfo.name, "on hold" session.commit() continue if wfo.wm_status != "completed" and not wfo.name in by_passes: ## for sure move on with closeout check if in completed print "no need to check on", wfo.name, "in status", wfo.wm_status session.commit() continue session.commit() sub_assistance = "" # if that string is filled, there will be need for manual assistance is_closing = True ## get it from somewhere by_pass_checks = False if wfo.name in by_passes: print "we can bypass checks on", wfo.name by_pass_checks = True for bypass in by_passes: if bypass in wfo.name: print "we can bypass", wfo.name, "because of keyword", bypass by_pass_checks = True break if not CI.go(wfi.request["Campaign"]) and not by_pass_checks: print "No go for", wfo.name continue # tuck out DQMIO/DQM wfi.request["OutputDatasets"] = [out for out in wfi.request["OutputDatasets"] if not "/DQM" in out] ## anything running on 
acdc familly = getWorkflowById(url, wfi.request["PrepID"], details=True) acdc = [] acdc_inactive = [] has_recovery_going = False had_any_recovery = False for member in familly: if member["RequestType"] != "Resubmission": continue if member["RequestName"] == wfo.name: continue if member["RequestDate"] < wfi.request["RequestDate"]: continue if member["RequestStatus"] in [ "running-open", "running-closed", "assignment-approved", "assigned", "acquired", ]: print wfo.name, "still has an ACDC running", member["RequestName"] acdc.append(member["RequestName"]) # print json.dumps(member,indent=2) ## hook for just waiting ... is_closing = False has_recovery_going = True elif member["RequestStatus"] == None: print member["RequestName"], "is not real" pass else: acdc_inactive.append(member["RequestName"]) had_any_recovery = True ## completion check percent_completions = {} # print "let's see who is crashing", wfo.name # print wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if not "TotalInputEvents" in wfi.request: event_expected, lumi_expected = 0, 0 if not "recovery" in wfo.status: sendEmail( "missing member of the request", "TotalInputEvents is missing from the workload of %s" % wfo.name, destination=["*****@*****.**"], ) else: event_expected, lumi_expected = wfi.request["TotalInputEvents"], wfi.request["TotalInputLumis"] if "RequestNumEvents" in wfi.request: event_expected = int(wfi.request["RequestNumEvents"]) elif "Task1" in wfi.request and "RequestNumEvents" in wfi.request["Task1"]: event_expected = int(wfi.request["Task1"]["RequestNumEvents"]) fractions_pass = {} for output in wfi.request["OutputDatasets"]: event_count, lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0.0 if lumi_expected: percent_completions[output] = lumi_count / float(lumi_expected) if event_expected: percent_completions[output] = max(percent_completions[output], event_count / float(event_expected)) fractions_pass[output] = 0.95 c = 
get_campaign(output, wfi) if c in CI.campaigns and "fractionpass" in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]["fractionpass"] print "overriding fraction to", fractions_pass[output], "for", output if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to", fractions_pass[output], "by command line for", output if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): print wfo.name, "is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? if has_recovery_going: sub_assistance += "-recovering" elif had_any_recovery: ## we want to have this looked at sub_assistance += "-manual" else: sub_assistance += "-recovery" is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request["OutputDatasets"]: events_per_lumi[output] = getDatasetEventsPerLumi(output) lumi_upper_limit = {} for output in wfi.request["OutputDatasets"]: upper_limit = 301.0 campaign = get_campaign(output, wfi) # if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and "lumisize" in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]["lumisize"] print "overriding the upper lumi size to", upper_limit, "for", campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to", upper_limit, "by command line" lumi_upper_limit[output] = upper_limit if any([events_per_lumi[out] >= lumi_upper_limit[out] for out in events_per_lumi]): print wfo.name, "has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? 
sub_assistance += "-biglumi" is_closing = False any_presence = {} for output in wfi.request["OutputDatasets"]: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request["OutputDatasets"]: custodial_presences[output] = [s for s in any_presence[output] if "MSS" in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence = {} for output in wfi.request["OutputDatasets"]: phedex_presence[output] = phedexClient.getFileCountDataset(url, output) vetoed_custodial_tier = UC.get("tiers_with_no_custodial") out_worth_checking = [ out for out in custodial_locations.keys() if out.split("/")[-1] not in vetoed_custodial_tier ] size_worth_checking = sum( [getDatasetSize(out) / 1023.0 for out in out_worth_checking] ) ## size in TBs of all outputs if not all(map(lambda sites: len(sites) != 0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name, "has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? 
custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:", custodial, "because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = get_campaign(output, wfi) if campaign in CI.campaigns and "custodial" in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]["custodial"] print "Setting custodial to", custodial, "from campaign configuration" break if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the unified configuration custodial:", custodial, "because of limited space" custodial = None ## get from the parent pick_custodial = True if not custodial and "InputDataset" in wfi.request: ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite(wfi.request["InputDataset"]) ###parents_custodial = findCustodialLocation(url, wfi.request['InputDataset']) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset", wfi.request[ "InputDataset" ], "does not have custodial in the first place. 
abort" sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor" % wfi.request["InputDataset"], ) is_closing = False pick_custodial = False if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:", custodial, "because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for", wfo.name sendEmail( "cannot find a custodial", "cannot find a custodial for %s probably because of the total output size %d" % (wfo.name, size_worth_checking), ) if custodial and ((not sub_assistance and not acdc) or by_pass_checks): ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output] >= 1: custodials[custodial].append(output) else: print "no file in phedex for", output, " not good to add to custodial requests" is_closing = False ## disk copy disk_copies = {} for output in wfi.request["OutputDatasets"]: disk_copies[output] = [s for s in any_presence[output] if (not "MSS" in s) and (not "Buffer" in s)] if not all(map(lambda sites: len(sites) != 0, disk_copies.values())): print wfo.name, "has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request["OutputDatasets"]: dbs_presence[output] = dbs3Client.getFileCountDataset(output) dbs_invalid[output] = dbs3Client.getFileCountDataset(output, onlyInvalid=True) fraction_invalid = 0.01 if ( not all( [ dbs_presence[out] == (dbs_invalid[out] + phedex_presence[out]) for out in wfi.request["OutputDatasets"] ] ) and not options.ignorefiles ): print wfo.name, "has a dbs,phedex mismatch" print 
json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## hook for just waiting ... is_closing = False if ( not all( [ (dbs_invalid[out] <= int(fraction_invalid * dbs_presence[out])) for out in wfi.request["OutputDatasets"] ] ) and not options.ignorefiles ): print wfo.name, "has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye sub_assistance += "-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing: print "starting duplicate checker for", wfo.name for output in wfi.request["OutputDatasets"]: print "\tchecking", output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi(output) except: try: duplications[output] = dbs3Client.duplicateRunLumi(output) except: print "was not possible to get the duplicate count for", output is_closing = False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name, "has duplicates" print json.dumps(duplications, indent=2) ## hook for making file invalidation ? 
sub_assistance += "-duplicates" is_closing = False ## for visualization later on if not wfo.name in fDB.record: # print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = {"datasets": {}, "name": wfo.name, "closeOutWorkflow": None} fDB.record[wfo.name]["closeOutWorkflow"] = is_closing for output in wfi.request["OutputDatasets"]: if not output in fDB.record[wfo.name]["datasets"]: fDB.record[wfo.name]["datasets"][output] = {} rec = fDB.record[wfo.name]["datasets"][output] rec["percentage"] = float("%.2f" % (percent_completions[output] * 100)) rec["duplicate"] = duplications[output] if output in duplications else "N/A" rec["phedexReqs"] = ( float("%.2f" % any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output]) != 0 else "N/A" ) rec["closeOutDataset"] = is_closing rec["transPerc"] = ( float("%.2f" % any_presence[output][disk_copies[output][0]][1]) if len(disk_copies[output]) != 0 else "N/A" ) rec["correctLumis"] = ( int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True ) rec["missingSubs"] = ( False if len(custodial_locations[output]) == 0 else ",".join(list(set(custodial_locations[output]))) ) rec["dbsFiles"] = dbs_presence[output] rec["dbsInvFiles"] = dbs_invalid[output] rec["phedexFiles"] = phedex_presence[output] rec["acdc"] = "%d / %d" % (len(acdc), len(acdc + acdc_inactive)) if by_pass_checks: ## force closing is_closing = True ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting", wfo.name, "closed-out" if not options.test: if wfo.wm_status in ["closed-out", "announced", "normal-archived"]: print wfo.name, "is already", wfo.wm_status, "not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer", res if not res in ["None", None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None, 
"None"]: wfo.status = "close" session.commit() else: print "could not close out", wfo.name, "will try again next time" else: ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it new_status = "assistance" + sub_assistance print wfo.name, "needs assistance with", new_status if sub_assistance and wfo.status != new_status and "PrepID" in wfi.request and not "manual" in wfo.status: pid = wfi.getPrepIDs()[0].replace("task_", "") # pid = wfi.request['PrepID'].replace('task_','') ## notify messages = { "recovery": "Samples completed with missing statistics:\n%s " % ( "\n".join( [ "%.2f %% complete for %s" % (percent_completions[output] * 100, output) for output in wfi.request["OutputDatasets"] ] ) ), "biglumi": "Samples completed with large luminosity blocks:\n%s " % ( "\n".join( [ "%d > %d for %s" % (events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request["OutputDatasets"] if (events_per_lumi[output] > lumi_upper_limit[output]) ] ) ), "duplicate": "Samples completed with duplicated luminosity blocks:\n%s" % ( "\n".join( [ "%s" % output for output in wfi.request["OutputDatasets"] if output in duplications and duplications[output] ] ) ), } text = "The request %s (%s) is facing issue in production.\n" % (pid, wfo.name) content = "" for case in messages: if case in new_status: content += "\n" + messages[case] + "\n" text += content text += "You are invited to check, while this is being taken care of by Ops.\n" text += "This is an automated message." 
if use_mcm and content: print "Sending notification back to requestor" print text batches = mcm.getA("batches", query="contains=%s&status=announced" % pid) if len(batches): ## go notify the batch bid = batches[-1]["prepid"] print "batch nofication to", bid mcm.put("/restapi/batches/notify", {"notes": text, "prepid": bid}) ## go notify the request print "request notification to", pid mcm.put("/restapi/requests/notify", {"message": text, "prepids": [pid]}) ## case where the workflow was in manual from recoveror if not "manual" in wfo.status or new_status != "assistance-recovery": wfo.status = new_status if not options.test: print "setting", wfo.name, "to", wfo.status session.commit() else: print "current status is", wfo.status, "not changing to anything" fDB.html() ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ",".join(custodials[site]), "=>", site if not options.test: result = makeReplicaRequest( url, site, list(set(custodials[site])), "custodial copy at production close-out", custodial="y", priority="low", approve=(site in SI.sites_auto_approve), ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ",".join(transfers[site]), "=>", site if not options.test: result = None # result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def assignor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return #if notRunningBefore( 'stagor' ): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = global_SI #LI = lockInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos=[] if specific: wfos = session.query(Workflow).filter(Workflow.name==specific).all() if not wfos: if specific: wfos = session.query(Workflow).filter(Workflow.status=='considered').all() wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) for wfo in wfos: if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n",wfo.name,"\n\tto be assigned" wfh = workflowInfo( url, wfo.name) ## check if by configuration we gave it a GO if not CI.go( wfh.request['Campaign'] ) and not options.go: print "No go for",wfh.request['Campaign'] n_stalled+=1 continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': if not options.test: print wfo.name,wfh.request['RequestStatus'],"setting away and skipping" ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name,wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: print "cannot decide on version number" n_stalled+=1 continue #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList() print "Site white list",sorted(sites_allowed) override_sec_location 
= CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) c_sites_allowed = CI.get(wfh.request['Campaign'], 'SiteWhitelist' , []) if c_sites_allowed: print "Would like to use the new whitelist, but will not until things went through a bit" sendEmail("using a restricted site white list","for %s"%(c_sites_allowed)) sites_allowed = list(set(sites_allowed) & set(c_sites_allowed)) c_black_list = CI.get(wfh.request['Campaign'], 'SiteBlacklist', []) if c_black_list: print "Reducing the whitelist due to black list in campaign configuration" print "Removing",c_black_list sites_allowed = list(set(sites_allowed) - set(c_black_list)) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] ncores = wfh.request.get('Multicore',1) memory_allowed = SI.sitesByMemory( wfh.request['Memory'] , maxCore=ncores) if memory_allowed!=None: print "sites allowing", wfh.request['Memory'],"MB and",ncores,"core are",memory_allowed sites_allowed = list(set(sites_allowed) & set(memory_allowed)) print "Allowed",sorted(sites_allowed) secondary_locations=None for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] 
#one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] print "From secondary requirement, now Allowed",sorted(sites_allowed) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default for prim in list(primary): set_lfn = getLFNbase( prim ) presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in 
presence.keys()] print "Holding the data but not allowed",list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] print "We could be running at",sorted(opportunistic_sites),"in addition" if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): print "One of the destination site is in downtime" down_time = True ## should this be send back to considered ? """ if available_fractions and not all([available>=1. 
for available in available_fractions.values()]): print "The input dataset is not located in full over sites" print json.dumps(available_fractions) if not options.test and not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known: sendEmail( "cannot be assigned","%s is not full over sites \n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) n_stalled+=1 continue ## skip skip skip """ ## should be 2 but for the time-being let's lower it to get things going copies_wanted,cpuh = wfh.getNCopies() less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): not_even_once = not all([available>=1. for available in available_fractions.values()]) print "The input dataset is not available",copies_wanted,"times, only",available_fractions.values() if down_time and not options.go: wfo.status = 'considered' session.commit() print "sending back to considered because of site downtime, instead of waiting" sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known: sendEmail( "cannot be assigned","%s is not sufficiently available. Probably phedex information lagging behind. 
\n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) n_stalled+=1 continue ## default back to white list to original white list with any data print "Allowed",sites_allowed if options.primary_aaa: sites_allowed = initial_sites_allowed options.useSiteListAsLocation = True else: sites_allowed = sites_with_any_data print "Selected for any data",sites_allowed if options.restrict: print "Allowed",sites_allowed sites_allowed = sites_with_any_data print "Selected",sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?" print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) if not len(sites_allowed): print wfo.name,"cannot be assign with no matched sites" sendEmail( "cannot be assigned","%s has no whitelist"%(wfo.name)) n_stalled+=1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] print "Placing the output on", sites_out parameters={ 'SiteWhitelist' : sites_allowed, #'CustodialSites' : sites_custodial, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : set_lfn, 'ProcessingVersion' : version, } ## plain assignment here team='production' if options and 
options.team: team = options.team #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) < 10000: # team = 'highprio' # sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**']) if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]): ## consider SDSC parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC'] parameters['useSiteListAsLocation'] = True team = 'allocation-based' sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**']) if wfh.request['Campaign']=='RunIIWinter15GS' and random.random() < -1.0: parameters['SiteWhitelist'] = ['T3_US_SDSC'] team = 'allocation-based' sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**']) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if type(v)==str and ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment parameters parameters.update( CI.parameters(wfh.request['Campaign']) ) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check!=True: parameters.update( split_check ) if 'EventBased' in split_check.values(): print "Falling back to event splitting." sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. 
Fallback to EventBased splitting"%wfo.name) elif 'EventsPerJob' in split_check.values(): print "Modifying the number of job per event" sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: print "There is no go for assigning that request without event splitting" sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) print "need to go down to",eventsPerJob,"events per job" parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: print "need to go down to",lumisPerJob,"in assignment" sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: print "the regular splitting should work for",pstring sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']: ## lock all outputs flat NLI.lock( secure ) #for site 
in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" print "Assigned",n_assigned print "Stalled",n_stalled
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock(): return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.new: ## get all in running and check ## you want to intersect with what is completed ! if options.strict: completed_wfi = getWorkflows(url, status='completed') for wfo in session.query(Workflow).filter(Workflow.status == 'away').all(): if wfo.name in completed_wfi: wfs.append( wfo ) else: print wfo.name,"is not completed" sendLog('checkor','%s is not completed'%( wfo.name)) else: wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) if options.current: ## recheck those already there, probably to just pass them along wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.old: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = global_SI CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign ## retrieve bypass and onhold configuration bypasses = [] holdings = [] #try: # already_notified = json.loads(open('already_notifified.json').read()) #except: # print "no record of already notified workflow. 
starting fresh" # already_notified = [] for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: holdings.extend( json.loads(open(holding_file).read())) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: mcm_force = mcm.get('/restapi/requests/forcecomplete') bypasses.extend( mcm_force ) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. 
sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) print len(wfs),"to consider, pausing for",sleep_time for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False pids = wfi.getPrepIDs() bypass_by_mcm = False for bypass in bypasses: if bypass in 
wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break if bypass in pids: wfi.sendLog('checkor',"we can bypass checks on %s because of prepid %s "%( wfo.name, bypass)) bypass_checks = True bypass_by_mcm = True break #if not CI.go( wfi.request['Campaign'] ) and not bypass_checks: # print "No go for",wfo.name # wfi.sendLog('checkor',"No go for %s"%wfi.request['Campaign']) # continue tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco campaigns = {} for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if 
member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') elif member['RequestStatus']==None: print member['RequestName'],"is not real" pass else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = int(wfi.request['Task1']['RequestNumEvents']) fractions_pass = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. 
if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) fractions_pass[output] = 0.95 c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): print wfo.name,"is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. 
campaign = campaigns[output] #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. 
for out in out_worth_checking ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. 
abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): 
print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: print "These %d files are missing in phedex"%(len(missing_phedex)) print "\n".join( missing_phedex ) if missing_dbs: print "These %d files are missing in dbs"%(len(missing_dbs)) print "\n".join( missing_dbs ) #if not bypass_checks: ## I don't think we can by pass this is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing or bypass_checks: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, 
verbose=True) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) ## and move on if is_closing: ## toggle status to closed-out in request manager print 
"setting",wfo.name,"closed-out" if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and bypass_by_mcm: ## shoot large on all prepids for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that add ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print 
wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from 
Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec: #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def closor(url, specific=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') ## manually closed-out workflows should get to close with checkor if specific: wfs = session.query(Workflow).filter( Workflow.name.contains(specific)).all() else: wfs = session.query(Workflow).filter(Workflow.status == 'close').all() held = set() max_per_round = UC.get('max_per_round').get('closor', None) random.shuffle(wfs) if max_per_round: wfs = wfs[:max_per_round] for wfo in wfs: if specific and not specific in wfo.name: continue ## what is the expected #lumis wfi = workflowInfo(url, wfo.name) wfo.wm_status = wfi.request['RequestStatus'] if wfi.request['RequestStatus'] in ['announced', 'normal-archived']: ## manually announced ?? wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', '%s is announced already : %s' % (wfo.name, wfo.wm_status)) session.commit() expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name, "has not been assigned yet, or the database is corrupted" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda: False) #print outputs if len(outputs): print wfo.name, wfi.request['RequestStatus'] for out in outputs: event_count, lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter( Output.datasetname == out).first() if not odb: print "adding an output object", out odb = Output(datasetname=out) odb.workflow = wfo session.add(odb) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) 
session.commit() wfi.sendLog( 'closor', "\t%60s %d/%d = %3.2f%%" % (out, lumi_count, expected_lumis, lumi_count / float(expected_lumis) * 100.)) #print wfo.fraction_for_closing, lumi_count, expected_lumis #fraction = wfo.fraction_for_closing #fraction = 0.0 #all_OK.append((float(lumi_count) > float(expected_lumis*fraction))) all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence(url, out) where = [site for site, info in presence.items() if info[0]] if where: all_OK[out] = True print out, "is in full at", ",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites'] + wfi.request[ 'CustodialSites'] wfi.sendLog( 'closor', "%s is not in full anywhere. send to %s" % (out, ",".join(sorted(going_to)))) at_destination = dict([(k, v) for (k, v) in presence.items() if k in going_to]) else_where = dict([(k, v) for (k, v) in presence.items() if not k in going_to]) print json.dumps(at_destination) print json.dumps(else_where, indent=2) ## do the full stuck transfer study, missing files and shit ! 
for there in going_to: late_info = findLateFiles(url, out, going_to=there) for l in late_info: l.update({"workflow": wfo.name, "dataset": out}) all_late_files.extend(late_info) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update(OK) ## only that status can let me go into announced if all(all_OK.values()) and wfi.request['RequestStatus'] in [ 'closed-out' ]: print wfo.name, "to be announced" results = [] #'dummy'] if not results: for out in outputs: if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) tier = out.split('/')[-1] campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request[ 'Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[ campaign] and tier in CI.campaigns[campaign][ 'toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM") + UC.get( "tiers_to_DDM"): print "tier", tier, "neither TO or NO DDM for", out results.append('Not recognitized tier %s' % tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog( 'closor', "could not recognize %s for injecting in DDM" % out, level='critical') continue n_copies = 2 destinations = [] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[ campaign]: ddm_instructions = CI.campaigns[campaign][ 'DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier, indication in ddm_instructions.items( ): if ddmtier == tier or ddmtier in [ '*', 'all' ]: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in 
indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination=" + ",".join( destinations) ## inject to DDM when necessary if to_DDM: #print "Sending",out," to DDM" p = os.popen( 'python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec' % (n_copies, out, destination_spec)) print p.read() status = p.close() if status != None: print "Failed DDM, retrying a second time" p = os.popen( 'python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --exec' % (n_copies, out, destination_spec)) print p.read() status = p.close() if status != None: #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.") sendLog('closor', "could not add " + out + " to DDM pool. check closor logs.", level='critical') results.append(status) if status == None: wfi.sendLog( 'closor', '%s is send to AnalysisOps DDM pool in %s copies %s' % (n_copies, out, destination_spec)) else: print wfo.name, "no stats for announcing", out results.append('No Stats') if all( map(lambda result: result in ['None', None, True], results)): ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade(url, wfo.name) if not res in ['None', None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in [ 'announced', 'normal-archived' ]: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade( url, wfo.name) results.append(res) #print results if all(map(lambda result: result in ['None', None, True], results)): wfo.status = 'done' session.commit() wfi.sendLog('closor', "workflow is announced") else: print "ERROR with ", wfo.name, "to be announced", json.dumps( results) else: print wfo.name, "not good for announcing:", wfi.request[ 'RequestStatus'] wfi.sendLog('closor', "cannot be announced") held.add(wfo.name) days_late = 0. 
retries_late = 10 really_late_files = [ info for info in all_late_files if info['retries'] >= retries_late ] really_late_files = [ info for info in really_late_files if info['delay'] / (60 * 60 * 24.) >= days_late ] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % ( len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2)) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor', subject) print subject open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2)) if held: #sendEmail("held from announcing","the workflows below are held up, please check the logs https://cmst2.web.cern.ch/cmst2/unified/logs/closor/last.log \n%s"%("\n".join( held ))) sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(held)), level='critical')
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock(): return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.new: ## get all in running and check ## you want to intersect with what is completed ! if options.strict: completed_wfi = getWorkflows(url, status='completed') for wfo in session.query(Workflow).filter(Workflow.status == 'away').all(): if wfo.name in completed_wfi: wfs.append( wfo ) else: print wfo.name,"is not completed" sendLog('checkor','%s is not completed'%( wfo.name)) else: wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) if options.current: ## recheck those already there, probably to just pass them along wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.old: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign ## retrieve bypass and onhold configuration bypasses = [] forcings = [] overrides = getForceCompletes() holdings = [] for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('prozober','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) 
sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: holdings.extend( json.loads(open(holding_file).read())) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: forcings = mcm.get('/restapi/requests/forcecomplete') if forcings: sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings))) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) print len(wfs),"to consider, pausing for",sleep_time max_per_round = UC.get('max_per_round').get('checkor',None) if max_per_round and not spec: wfs = wfs[:max_per_round] for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. 
wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break pids = wfi.getPrepIDs() force_by_mcm = False force_by_user = False for force in forcings: if force in pids: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force)) bypass_checks = True force_by_mcm = True break for user in overrides: for force in overrides[user]: if force in wfo.name: wfi.sendLog('checkor',"we can 
bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user)) bypass_checks = True force_by_user = True break tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco campaigns = {} expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] ) for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] forced_already=False acdc_bads = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if 'OriginalRequestName' in member and member['OriginalRequestName'] != wfo.name: continue if member['RequestStatus'] == None: continue if not set(member['OutputDatasets']).issubset( set(expected_outputs)): if not 
member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']: ##this is not good at all wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] ) acdc_bads.append( member['RequestName'] ) is_closing = False assistance_tags.add('manual') continue if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') if (force_by_mcm or force_by_user) and not forced_already: wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name) wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False) forceComplete(url, wfi) forced_already=True else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') if acdc_bads: sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) )) ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = wfi.request['Task1']['RequestNumEvents'] for i in range(1,20): if 'Task%d'%i in wfi.request: ## this is wrong ibsolute if 'FilterEfficiency' in wfi.request['Task%d'%i]: event_expected *= 
float(wfi.request['Task%d'%i]['FilterEfficiency']) event_expected = int(event_expected) fractions_pass = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) fractions_pass[output] = 0.95 c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): possible_recoveries = wfi.getRecoveryDoc() if possible_recoveries == []: wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name) bypass_checks = True else: wfi.sendLog('checkor','%s is not completed \n%s \n%s'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) ## hook for creating automatically ACDC ? 
if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = campaigns[output] #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? 
if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? 
custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. 
abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): 
print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex), "\n".join( missing_phedex ))) if missing_dbs: wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs), "\n".join( missing_dbs ))) #if not bypass_checks: ## I don't think we can by pass this is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing or bypass_checks: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( 
output , skipInvalid=True, verbose=True) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) now = time.gmtime() rec['timestamp'] = time.mktime(now) 
rec['updated'] = time.asctime(now)+' (GMT)' ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and force_by_mcm: ## shoot large on all prepids, on closing the wf for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that add ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## 
straight to manual assistance_tags.add('manual') ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= 
"\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec: #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return #if notRunningBefore( 'stagor' ): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = global_SI #LI = lockInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos = [] if specific or options.early: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staging').all()) if specific: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered-tried').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staged').all()) #if specific: # #wfos = session.query(Workflow).filter(Workflow.name==specific).all() # wfos = session.query(Workflow).filter(Workflow.name.contains(specific)).all() #if not wfos: # if specific: # wfos = session.query(Workflow).filter(Workflow.status=='considered').all() # wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) # wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) wfh.sendLog('assignor', "%s to be assigned" % wfo.name) ## check if by configuration we gave it a GO if not CI.go(wfh.request['Campaign']) and not options.go: wfh.sendLog('assignor', "No go for %s" % wfh.request['Campaign']) n_stalled += 1 continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' 
session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist']))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) secondary_locations = None for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check", "but we cannot yet IMO") #pass presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. 
] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list( set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [ site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations ] wfh.sendLog( 'assignor', "From secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default for prim in list(primary): set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [ psite for (psite, (there, frac)) in presence.items() if there ] ] sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.] 
] sites_with_any_data = [ site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys() ] wfh.sendLog( 'assignor', "Holding the data but not allowed %s" % sorted( list( set([ se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed ])))) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list( set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog( 'assignor', "We could be running in addition at %s" % sorted(opportunistic_sites)) if any( [osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( 'assignor', "One of the usable site is in downtime %s" % ([ osite in SI.sites_not_ready for osite in opportunistic_sites ])) down_time = True ## should this be send back to considered ? """ if available_fractions and not all([available>=1. 
for available in available_fractions.values()]): print "The input dataset is not located in full over sites" print json.dumps(available_fractions) if not options.test and not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known: sendEmail( "cannot be assigned","%s is not full over sites \n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) n_stalled+=1 continue ## skip skip skip """ ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max( 1, copies_wanted - less_copies_than_requested) # take one out for the efficiency wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([ available >= copies_wanted for available in available_fractions.values() ]): not_even_once = not all([ available >= 1. for available in available_fractions.values() ]) wfh.sendLog( 'assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog( 'assignor', "sending back to considered because of site downtime, instead of waiting" ) sendEmail( "cannot be assigned due to downtime", "%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. 
sending back to considered." % wfo.name) continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early: wfh.sendLog( 'assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) sendEmail( "cannot be assigned", "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) known.append(wfo.name) open('cannot_assign.json', 'w').write(json.dumps(known, indent=2)) n_stalled += 1 if options.early: if wfo.status == 'considered': wfh.sendLog('assignor', "setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is", wfo.status continue ## default back to white list to original white list with any data print "Allowed", sites_allowed if options.primary_aaa: sites_allowed = initial_sites_allowed #options.useSiteListAsLocation = True options.TrustSitelists = True else: sites_allowed = sites_with_any_data wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) if options.restrict: print "Allowed", sites_allowed sites_allowed = sites_with_any_data print "Selected", sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for", list( set(sites_allowed) - set(sites_with_data)), "?" 
print "Whitelist site with any data", list( set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) if not len(sites_allowed): wfh.sendLog('assignor', "cannot be assign with no matched sites") sendEmail("cannot be assigned", "%s has no whitelist" % (wfo.name)) n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] ## one last modification now that we know we can assign, and to make sure all ressource can be used by the request : set all ON sites to whitelist ###sites_allowed = original_sites_allowed ## not needed, afterall as secondary jobs go their own ways wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, #'CustodialSites' : sites_custodial, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team ## high priority team agent #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) 
< 10000: # team = 'highprio' # sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]): # ## consider SDSC # parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC'] # parameters['useSiteListAsLocation'] = True # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if wfh.request['Campaign']==R'unIIWinter15GS' and random.random() < -1.0: # parameters['SiteWhitelist'] = ['T3_US_SDSC'] # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**']) if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: parameters[key] = filter(None, v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment parameters parameters.update(CI.parameters(wfh.request['Campaign'])) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check != True: parameters.update(split_check) if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") sendEmail( "Fallback to EventBased", "the workflow %s is too 
heavy to be processed as it is. Fallback to EventBased splitting" % wfo.name) elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of job per event") sendEmail( "Modifying the job per events", "the workflow %s is too heavy in number of jobs explosion" % wfo.name) # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: sendEmail( "issue with event splitting for run-dependent MC", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: sendEmail( "setting lumi splitting for run-dependent MC", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: sendEmail("leaving splitting untouched for PU_RD*", "please check on " + wfo.name) wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." 
) result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs flat NLI.lock(secure) #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = siteInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos = [] if specific or options.early: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staging').all()) if specific: wfos.extend( session.query(Workflow).filter( Workflow.status == 'considered-tried').all()) wfos.extend( session.query(Workflow).filter(Workflow.status == 'staged').all()) dataset_endpoints = json.loads( open('%s/dataset_endpoints.json' % monitor_dir).read()) max_per_round = UC.get('max_per_round').get('assignor', None) max_cpuh_block = UC.get('max_cpuh_block') random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) wfh.sendLog('assignor', "%s to be assigned" % wfo.name) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: n_stalled += 1 no_go = True allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: allowed_secondary.update(CI.campaigns[campaign]['secondaries']) if (secondary and allowed_secondary) and ( set(secondary) & allowed_secondary != set(secondary)): wfh.sendLog( 'assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - allowed_secondary))) #sendEmail('secondary not allowed','%s is not an 
allowed secondary'%( ', '.join(set(secondary)-allowed_secondary))) sendLog('assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - allowed_secondary)), level='critical') if not options.go: n_stalled += 1 no_go = True if no_go: continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] != 'assignment-approved': if not options.test: wfh.sendLog('assignor', "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name, wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor', "cannot decide on version number") n_stalled += 1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist']))) wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed)) secondary_locations = None primary_aaa = options.primary_aaa if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'primary_AAA' in CI.campaigns[ wfh.request['Campaign']]: primary_aaa = primary_aaa or CI.campaigns[ wfh.request['Campaign']]['primary_AAA'] secondary_aaa = options.secondary_aaa if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'secondary_AAA' in CI.campaigns[ 
wfh.request['Campaign']]: secondary_aaa = secondary_aaa or CI.campaigns[ wfh.request['Campaign']]['secondary_AAA'] for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check", "but we cannot yet IMO") #pass if secondary_aaa: #just continue without checking continue presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [ site for (site, (there, frac)) in presence.items() if frac > 98. ] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list( set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [ site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations ] wfh.sendLog( 'assignor', "From secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor", dataset_endpoints[prim] endpoints.update(dataset_endpoints[prim]) set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) for site in 
sites_allowed], only_blocks=blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [ psite for (psite, (there, frac)) in presence.items() if there ] ] sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.] ] sites_with_any_data = [ site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys() ] wfh.sendLog( 'assignor', "Holding the data but not allowed %s" % sorted( list( set([ se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed ])))) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list( set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog( 'assignor', "We could be running in 
addition at %s" % sorted(opportunistic_sites)) if any( [osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( 'assignor', "One of the usable site is in downtime %s" % ([ osite in SI.sites_not_ready for osite in opportunistic_sites ])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog('assignor', "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( 'assignor', '%s requires a large numbr of CPUh %s , not assigning, please check with requester' % (wfo.name, cpuh), level='critical') wfh.sendLog( 'assignor', "Requiring a large number of CPUh %s, not assigning" % cpuh) continue if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max( 1, copies_wanted - less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor', "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([ available >= copies_wanted for available in available_fractions.values() ]): not_even_once = not all([ available >= 1. 
for available in available_fractions.values() ]) wfh.sendLog( 'assignor', "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog( 'assignor', "sending back to considered because of site downtime, instead of waiting" ) #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog( 'assignor', '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.' % (wfo.name), level='delay') continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial: wfh.sendLog( 'assignor', "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) sendEmail( "cannot be assigned", "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions))) known.append(wfo.name) open('cannot_assign.json', 'w').write(json.dumps(known, indent=2)) n_stalled += 1 if options.early: if wfo.status == 'considered': wfh.sendLog('assignor', "setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is", wfo.status if options.partial: print "Will move on with partial locations" else: continue ## default back to white list to original white list with any data print "Allowed", sorted(sites_allowed) if primary_aaa: sites_allowed = initial_sites_allowed options.TrustSitelists = True wfh.sendLog( 'assignor', "Selected to read primary through xrootd %s" % sorted(sites_allowed)) else: sites_allowed = sites_with_any_data 
wfh.sendLog('assignor', "Selected for any data %s" % sorted(sites_allowed)) if secondary_aaa: options.TrustPUSitelists = True wfh.sendLog( 'assignor', "Reading secondary through xrootd from %s" % sorted(sites_allowed)) ### check on endpoints for on-going transfers if endpoints and options.partial: sites_allowed = list( set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints])) print "with added endpoints", sorted(sites_allowed) if not len(sites_allowed): wfh.sendLog('assignor', "cannot be assign with no matched sites") sendLog('assignor', '%s has no whitelist' % wfo.name, level='critical') n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [ SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed]) ] wfh.sendLog('assignor', "Placing the output on %s" % sites_out) parameters = { 'SiteWhitelist': sites_allowed, 'NonCustodialSites': sites_out, 'AutoApproveSubscriptionSites': list(set(sites_out)), 'AcquisitionEra': wfh.acquisitionEra(), 'ProcessingString': wfh.processingString(), 'MergedLFNBase': set_lfn, 'ProcessingVersion': version, } ## plain assignment here team = 'production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and ',' in v: 
parameters[key] = filter(None, v.split(',')) else: parameters[key] = v if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 ## pick up campaign specific assignment parameters parameters.update(CI.parameters(wfh.request['Campaign'])) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check != True: parameters.update(split_check) if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog( 'assignor', 'the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting' % wfo.name, level='critical') elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of job per event") #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog( 'assignor', "the workflow %s is too heavy in number of jobs explosion" % wfo.name, level='critical') # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level='critical') wfh.sendLog( 'assignor', "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters['EventsPerJob'] = 
eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl[ 'events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl[ 'avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level='critical') wfh.sendLog( 'assignor', "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( 'assignor', "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level='critical') wfh.sendLog( 'assignor', "leaving splitting untouched for PU_RD*, please check." ) result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned += 1 wfh.sendLog( 'assignor', "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list( sec) + new_wfi.request['OutputDatasets']: ## lock all outputs flat NLI.lock(secure) #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock() and not options.go: return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] def time_point(label="",sub_lap=False): now = time.mktime(time.gmtime()) nows = time.asctime(time.gmtime()) print "Time check (%s) point at : %s"%(label, nows) print "Since start: %s [s]"% ( now - time_point.start) if sub_lap: print "Sub Lap : %s [s]"% ( now - time_point.sub_lap ) time_point.sub_lap = now else: print "Lap : %s [s]"% ( now - time_point.lap ) time_point.lap = now time_point.sub_lap = now time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime()) runnings = session.query(Workflow).filter(Workflow.status == 'away').all() standings = session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() ## intersect with what is actually in completed status in request manager now all_completed = set(getWorkflows(url, 'completed' )) wfs=[] if options.strict: ## the one which were running and now have completed print "strict option is on: checking workflows that freshly completed" wfs.extend( filter(lambda wfo: wfo.name in all_completed , runnings)) if options.update: print "update option is on: checking workflows that have not completed yet" wfs.extend( filter(lambda wfo: not wfo.name in all_completed , runnings)) if options.clear: print "clear option is on: checking workflows that are ready to toggle closed-out" wfs.extend( filter(lambda wfo: 'custodial' in wfo.status, standings)) if options.review: print "review option is on: checking the workflows that needed intervention" wfs.extend( filter(lambda wfo: not 'custodial' in wfo.status, standings)) ## what is left out are the wf which were running and ended up aborted/failed/... 
custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) if use_mcm else None def get_campaign(output, wfi): ## this should be a perfect matching of output->task->campaign campaign = None era = None wf_campaign = None if 'Campaign' in wfi.request: wf_campaign = wfi.request['Campaign'] try: era = output.split('/')[2].split('-')[0] except: era = None if wfi.isRelval(): campaign = wf_campaign else: campaign = era if era else wf_campaign return campaign ## retrieve bypass and onhold configuration bypasses = [] forcings = [] overrides = getForceCompletes() holdings = [] actors = UC.get('allowed_bypass') for bypassor,email in actors: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: extending = json.loads(open(holding_file).read()) print bypassor,"is holding",extending holdings.extend( extending ) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in actors: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: 
bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: forcings = mcm.get('/restapi/requests/forcecomplete') #if forcings: # sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings))) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) in_manual = 0 ## now you have a record of what file was invalidated globally from TT TMDB_invalid = dataCache.get('file_invalidation') #try: # TMDB_invalid = set([row[3] for row in csv.reader( os.popen('curl -s "https://docs.google.com/spreadsheets/d/11fFsDOTLTtRcI4Q3gXw0GNj4ZS8IoXMoQDC3CbOo_2o/export?format=csv"'))]) # TMDB_invalid = map(lambda e : e.split(':')[-1], TMDB_invalid) # print len(TMDB_invalid),"globally invalidated files" #except Exception as e: # print "TMDB not fetched" # print str(e) # TMDB_invalid = [] print len(wfs),"to consider, pausing for",sleep_time max_per_round = UC.get('max_per_round').get('checkor',None) if options.limit: max_per_round=options.limit if max_per_round and not spec: wfs = wfs[:max_per_round] for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) time_point("Starting with %s"% wfo.name) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. 
wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break pids = wfi.getPrepIDs() force_by_mcm = False force_by_user = False for force in forcings: if force in pids: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force)) bypass_checks = True force_by_mcm = True break for user in overrides: for force in overrides[user]: if force in wfo.name: wfi.sendLog('checkor',"we can 
bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user)) bypass_checks = True force_by_user = True break tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco to_ddm_tier = copy.deepcopy(UC.get('tiers_to_DDM')) campaigns = {} ## this mapping of campaign per output dataset assumes era==campaing, which is not true for relval expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] ) for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] forced_already=False acdc_bads = [] true_familly = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if member['PrepID'] != wfi.request['PrepID'] : continue #if 
'OriginalRequestName' in member and (not 'ACDC' in member['OriginalRequestName']) and member['OriginalRequestName'] != wfo.name: continue if member['RequestStatus'] == None: continue if not set(member['OutputDatasets']).issubset( set(expected_outputs)): if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']: ##this is not good at all wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] ) #sendLog('checkor','inconsistent ACDC %s'%member['RequestName'], level='critical') acdc_bads.append( member['RequestName'] ) is_closing = False assistance_tags.add('manual') continue true_familly.append( member['RequestName'] ) #try: # parse_one(url, member['RequestName']) #except: # print "Could not make error report for",member['RequestName'] if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! 
is_closing = False assistance_tags.add('recovering') if (force_by_mcm or force_by_user) and not forced_already: wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name) wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False) forceComplete(url, wfi) forced_already=True else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') if acdc_bads: #sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) )) sendLog('checkor','For %s, ACDC %s is inconsistent, preventing from closing or will create a mess.'%( wfo.name, ','.join(acdc_bads) ), level='critical') time_point("checked workflow familly", sub_lap=True) ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = wfi.request['Task1']['RequestNumEvents'] for i in range(1,20): if 'Task%d'%i in wfi.request: ## this is wrong ibsolute if 'FilterEfficiency' in wfi.request['Task%d'%i]: event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency']) event_expected = int(event_expected) fractions_pass = {} events_per_lumi = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False time_point("execpted statistics", sub_lap=True) for output in wfi.request['OutputDatasets']: event_count,lumi_count = 
getDatasetEventsAndLumis(dataset=output) events_per_lumi[output] = event_count/float(lumi_count) if lumi_count else 100 percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) default_pass = UC.get('default_fraction_pass') fractions_pass[output] = default_pass c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: if type(CI.campaigns[c]['fractionpass']) == dict: tier = output.split('/')[-1] priority = str(wfi.request['RequestPriority']) ## defined per tier fractions_pass[output] = CI.campaigns[c]['fractionpass'].get('all', default_pass) if tier in CI.campaigns[c]['fractionpass']: fractions_pass[output] = CI.campaigns[c]['fractionpass'][tier] if priority in CI.campaigns[c]['fractionpass']: fractions_pass[output] = CI.campaigns[c]['fractionpass'][priority] else: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): possible_recoveries = wfi.getRecoveryDoc() if possible_recoveries == []: wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. 
passing through to annoucement'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) sendLog('checkor','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name, level='critical') #sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)#,destination=['*****@*****.**']) ## do not bypass for now, until Alan understands why we are loosing ACDC docs bypass_checks = True else: wfi.sendLog('checkor','%s is not completed \n%s \n%s'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) ## hook for creating automatically ACDC ? if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False time_point("checked output size", sub_lap=True) ## correct lumi < 300 event per lumi #for output in wfi.request['OutputDatasets']: #events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = campaigns[output] if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi','ReReco']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? 
if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) time_point("checked dataset presence", sub_lap=True) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] time_point("checked custodiality", sub_lap=True) ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) time_point("checked phedex count", sub_lap=True) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs size_worht_going_to_ddm = sum([getDatasetSize(out)/1023. for out in out_worth_checking if out.split('/')[-1] in to_ddm_tier ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? 
custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" group = None if campaign in CI.campaigns and 'phedex_group' in CI.campaigns[campaign]: group = CI.campaigns[campaign]['phedex_group'] print "using group",group,"for replica" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') tape_size_limit = options.tape_size_limit if options.tape_size_limit else UC.get("tape_size_limit") _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. 
abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if custodial and size_worht_going_to_ddm > tape_size_limit: print wfi.sendLog('checkor',"The total output size (%s TB) is too large for the limit set (%s TB)"%( size_worth_checking, tape_size_limit)) custodial = None if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: wfi.sendLog('checkor','Using %s as a tape destination for %s'%(custodial, output)) custodials[custodial].append( output ) if group: custodials[custodial][-1]+='@%s'%group ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False time_point("determined tape location", 
sub_lap=True) ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) time_point("dbs file count", sub_lap=True) if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: mismatch_notice = wfo.name+" has a dbs,phedex mismatch\n" mismatch_notice += "in dbs\n"+json.dumps(dbs_presence, indent=2) +"\n" mismatch_notice += "invalide in dbs\n"+json.dumps(dbs_invalid, indent=2) +"\n" mismatch_notice += "in phedex\n"+json.dumps(phedex_presence, indent=2) +"\n" wfi.sendLog('checkor',mismatch_notice) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex), "\n".join( missing_phedex ))) were_invalidated = sorted(set(missing_phedex) & set(TMDB_invalid )) if were_invalidated: wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated), "\n".join(were_invalidated))) sendLog('checkor',"These %d files were invalidated globally\n%s\nand are invalidated in dbs"%(len(were_invalidated), "\n".join(were_invalidated)), level='critical') dbs3Client.setFileStatus( were_invalidated, newstatus=0 ) if missing_dbs: wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs), "\n".join( 
missing_dbs ))) were_invalidated = sorted(set(missing_dbs) & set(TMDB_invalid )) if were_invalidated: wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated), "\n".join(were_invalidated))) #if not bypass_checks: ## I don't think we can by pass this is_closing = False time_point("checked file count", sub_lap=True) fraction_invalid = 0.20 if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignoreinvalid: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} files_per_rl = {} for output in wfi.request['OutputDatasets']: duplications[output] = "skiped" files_per_rl[output] = "skiped" time_point("checked invalidation", sub_lap=True) if (is_closing or bypass_checks) and (not options.ignoreduplicates): print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True) except: try: duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True) except Exception as e: wfi.sendLog('checkor','Not possible to check on duplicate lumi count on %s'%(output)) sendLog('checkor','Not possible to check on duplicate lumi count on %s\n%s'%(output,str(e)),level='critical') is_closing=False if is_closing and any(duplications.values()) and not options.ignoreduplicates: duplicate_notice = "" duplicate_notice += "%s has duplicates\n"%wfo.name duplicate_notice += json.dumps( 
duplications,indent=2) duplicate_notice += '\n' duplicate_notice += json.dumps( files_per_rl, indent=2) wfi.sendLog('checkor',duplicate_notice) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False time_point("checked duplicates", sub_lap=True) time_point("done with %s"%wfo.name) ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] #rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['percentage'] = math.floor(percent_completions[output]*10000)/100.## round down rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) rec['familly'] = true_familly now = time.gmtime() rec['timestamp'] = time.mktime(now) rec['updated'] = time.asctime(now)+' 
(GMT)' ## make the lumi summary if wfi.request['RequestType'] == 'ReReco': try: os.system('python Unified/lumi_summary.py %s 1 > /dev/null'%(wfi.request['PrepID'])) os.system('python Unified/lumi_plot.py %s > /dev/null'%(wfi.request['PrepID'])) wfi.sendLog('checkor','Lumi summary available at %s/datalumi/lumi.%s.html'%(unified_url,wfi.request['PrepID'])) except Exception as e: print str(e) ## make the error report ## and move on if is_closing: ## toggle status to closed-out in request manager wfi.sendLog('checkor',"setting %s closed-out"% wfo.name) if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and force_by_mcm: ## shoot large on all prepids, on closing the wf for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: if not 'custodial' in assistance_tags or wfi.isRelval(): ## do only the report for those for member in acdc+acdc_inactive+[wfo.name]: try: parse_one(url, member) except: print "Could not make error report for",member ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid 
custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that had ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') in_manual += 1 if 'recovery' in assistance_tags and 'manual' in assistance_tags: ## this is likely because something bad is happening, so leave it to manual assistance_tags = assistance_tags - set(['recovery']) assistance_tags.add('manual') in_manual += 1 ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) ###detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' #detailslink = 'https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s'%(wfo.name) ###perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) perflink = '%s/report/%s'%(unified_url,wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 
'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: wfi.sendLog('checkor','setting %s to %s'%(wfo.name, wfo.status)) session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec and in_manual!=0: sendEmail("fresh assistance status available","Fresh status are available at 
%s/assistance.html"%unified_url,destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: items_at = defaultdict(set) for i in custodials[site]: item, group = i.split('@') if '@' in i else (i,'DataOps') items_at[group].add( item ) for group,items in items_at.items(): print ','.join(items),'=>',site,'@',group if not options.test: result = makeReplicaRequest(url, site, sorted(items) ,"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) , group=group) print result print "File Invalidation" print invalidations
def closor(url, specific=None, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') ## manually closed-out workflows should get to close with checkor if specific: wfs = session.query(Workflow).filter(Workflow.status=='close').filter(Workflow.name.contains(specific)).all() else: wfs = session.query(Workflow).filter(Workflow.status=='close').all() held = set() print len(wfs),"closing" max_per_round = UC.get('max_per_round').get('closor',None) if options.limit: max_per_round = options.limit random.shuffle( wfs ) if max_per_round: wfs = wfs[:max_per_round] for wfo in wfs: if specific and not specific in wfo.name: continue ## what is the expected #lumis wfi = workflowInfo(url, wfo.name ) wfo.wm_status = wfi.request['RequestStatus'] if wfi.request['RequestStatus'] in ['announced','normal-archived']: ## manually announced ?? 
wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,wfo.wm_status)) session.commit() expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name,"has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis']==0: print wfo.name,"is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda : False) #print outputs if len(outputs): print wfo.name,wfi.request['RequestStatus'] for out in outputs: event_count,lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter(Output.datasetname==out).first() if not odb: print "adding an output object",out odb = Output( datasetname = out ) odb.workflow = wfo session.add( odb ) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() wfi.sendLog('closor',"\t%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,lumi_count/float(expected_lumis)*100.)) #print wfo.fraction_for_closing, lumi_count, expected_lumis #fraction = wfo.fraction_for_closing #fraction = 0.0 #all_OK.append((float(lumi_count) > float(expected_lumis*fraction))) all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence( url, out ) where = [site for site,info in presence.items() if info[0]] if where: all_OK[out] = True print out,"is in full at",",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites'] wfi.sendLog('closor',"%s is not in full anywhere. 
send to %s"%(out, ",".join(sorted(going_to)))) at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to]) else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to]) print json.dumps( at_destination ) print json.dumps( else_where, indent=2 ) ## do the full stuck transfer study, missing files and shit ! for there in going_to: late_info = findLateFiles(url, out, going_to = there ) for l in late_info: l.update({"workflow":wfo.name,"dataset":out}) all_late_files.extend( late_info ) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update( OK ) ## only that status can let me go into announced if all(all_OK.values()) and wfi.request['RequestStatus'] in ['closed-out']: print wfo.name,"to be announced" results=[]#'dummy'] if not results: for out in outputs: if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) tier = out.split('/')[-1] campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request['Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"): print "tier",tier,"neither TO or NO DDM for",out results.append('Not recognitized tier %s'%tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical') continue n_copies = 2 destinations=[] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]: ddm_instructions = 
CI.campaigns[campaign]['DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier,indication in ddm_instructions.items(): if ddmtier==tier or ddmtier in ['*','all']: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination="+",".join( destinations ) ## inject to DDM when necessary if to_DDM: print "Sending",out," to DDM" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --debug 0 --exec'%(n_copies, out,destination_spec)) ddm_text = p.read() print ddm_text status = p.close() if status!=None: print "Failed DDM, retrying to send",out,"a second time" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s --debug 1 --exec'%(n_copies, out,destination_spec)) ddm_text = p.read() print ddm_text status = p.close() if status!=None: #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.") sendLog('closor',"could not add "+out+" to DDM pool. check closor logs.", level='critical') results.append( status ) if status == None: wfi.sendLog('closor',ddm_text) wfi.sendLog('closor','%s is send to AnalysisOps DDM pool in %s copies %s'%( out, n_copies, destination_spec)) else: print wfo.name,"no stats for announcing",out results.append('No Stats') if all(map(lambda result : result in ['None',None,True],results)): ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade(url, wfo.name) if not res in ['None',None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in ['announced','normal-archived']: res = None else: ## retry ? 
res = reqMgrClient.announceWorkflowCascade(url, wfo.name) results.append( res ) #print results if all(map(lambda result : result in ['None',None,True],results)): wfo.status = 'done' session.commit() wfi.sendLog('closor',"workflow is announced") else: print "ERROR with ",wfo.name,"to be announced",json.dumps( results ) else: print wfo.name,"not good for announcing:",wfi.request['RequestStatus'] wfi.sendLog('closor',"cannot be announced") held.add( wfo.name ) days_late = 0. retries_late = 10 really_late_files = [info for info in all_late_files if info['retries']>=retries_late] really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) ) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor',subject) print subject open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2)) if held: #sendEmail("held from announcing","the workflows below are held up, please check the logs https://cmst2.web.cern.ch/cmst2/unified/logs/closor/last.log \n%s"%("\n".join( held ))) sendLog('closor',"the workflows below are held up \n%s"%("\n".join( sorted(held) )), level='critical')
def transferor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_transfered = len( session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0, max_to_handle - being_handled) allowed_to_transfer = max(0, max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer" else: print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer" print "... done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] print "getting all wf to consider ..." 
cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter( Workflow.status.startswith('considered')).all(): print "\t", wfo.name if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} ignored_input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority = None min_transfer_priority = None print "getting all wf in staging ..." stucks = json.loads(open('%s/stuck_transfers.json' % monitor_dir).read()) for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ds_s = dss.get(prim) if prim in stucks: sendLog('transferor', "%s appears stuck, so not counting it %s [GB]" % (prim, ds_s), wfi=wfh) ignored_input_sizes[prim] = ds_s else: input_sizes[prim] = ds_s sendLog('transferor', "%s needs %s [GB]" % (wfo.name, ds_s), wfi=wfh) if in_transfer_priority == None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, 
int(wfh.request['RequestPriority'])) if min_transfer_priority == None or in_transfer_priority == None: print "nothing is lining up for transfer" sendEmail("no request in staging", "no request in staging") return pass try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort(key=lambda i: i[1]) print "\n".join(map(str, ignored_values)) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort(key=lambda i: i[1]) print "\n".join(map(str, considered_values)) except Exception as e: print "trying to print the summary of input size" print str(e) print "... done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority print "transfers per sites" print json.dumps(transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get(prim) input_sizes[prim] = prim_size primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle(wfs_and_wfh) # Sort smallest transfers first; allows us to transfer as many as possible workflows. 
def prio_and_size(i, j): if int(i[1].request['RequestPriority']) == int( j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0))) else: return cmp(int(i[1].request['RequestPriority']), int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer" % ( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load" % ( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer" % ( st_in_transfer_already) print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % ( st_to_transfer) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB 
is the available limit" % transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = {} went_over_budget = False destination_cache = {} no_goes = set() max_per_round = UC.get('max_per_round').get('transferor', None) if max_per_round and not spec: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo, wfh) in wfs_and_wfh: print wfo.name, "to be transfered with priority", wfh.request[ 'RequestPriority'] if wfh.request['RequestStatus'] != 'assignment-approved': if wfh.request['RequestStatus'] in [ 'aborted', 'rejected', 'rejected-archived', 'aborted-archived' ]: wfo.status = 'trouble' ## so that we look or a replacement else: wfo.status = 'away' wfh.sendLog( 'transferor', '%s in status %s, setting %s' % (wfo.name, wfh.request['RequestStatus'], wfo.status)) continue (_, primary, _, _) = wfh.getIO() this_load = sum([input_sizes[prim] for prim in primary]) no_budget = False if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over bubget.") else: wfh.sendLog('transferor', "Transfer will go over bubget.") wfh.sendLog( 'transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit" % (this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority != None and min_transfer_priority != None: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over budget" % (wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog( 'transferor', "%s minimum priority %s < %s : stop" % (min_transfer_priority, wfh.request['RequestPriority'], 
in_transfer_priority)) no_budget = True ## throtlle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add(wfo.name) allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: allowed_secondary.update(CI.campaigns[campaign]['secondaries']) if secondary: if (secondary and allowed_secondary) and ( set(secondary) & allowed_secondary != set(secondary)): wfh.sendLog( 'assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - allowed_secondary))) no_go = True if no_go: continue ## check if the batch is announced def check_mcm(wfn): announced = False is_real = False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break except: try: for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break except: print "could not get mcm batch announcement, assuming not real" return announced, is_real if not use_mcm: announced, is_real = False, True else: if wfh.request['RequestType'] in ['ReReco']: announced, is_real = True, True else: announced, is_real = check_mcm(wfo.name) if not announced: wfh.sendLog('transferor', "does not look announced.") if not is_real: wfh.sendLog('transferor', "does not appear to be genuine.") ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime( time.strptime('.'.join(map(str, wfh.request['RequestDate'])), "%Y.%m.%d.%H.%M.%S")) / (60. * 60.) now = time.mktime(time.gmtime()) / (60. * 60.) 
if float(now - injection_time) < 4.: if not options.go and not announced: wfh.sendLog( 'transferor', "It is too soon to start transfer: %3.2fH remaining" % (now - injection_time)) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog( 'transferor', " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s" % (max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_transfer)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s at a time. 
Currently %s transfering, and adding %s" % (max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue ## the site white list considers site, campaign, memory and core information (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary) + list(parent) + list(secondary): ## lock everything flat NLI.lock(dataset) if not sites_allowed: wfh.sendLog('transferor', "not possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor', "%s has no possible sites to run at" % (wfo.name), level='critical') continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist']))) if 'LumiList' in wfh.request and wfh.request['LumiList']: ## augment with the lumi white list blocks = list( set(blocks + getDatasetBlocks(dataset, lumis=wfh.request['LumiList']))) if blocks: print "Reading", len(blocks), "in block whitelist" can_go = True staging = False allowed = True primary_destinations = set() if primary: copies_needed_from_CPUh, CPUh = wfh.getNCopies() if talk: print wfo.name, 'reads', ', '.join(primary), 'in primary' ## chope the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add(wfo.id) max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority'])) wfh.sendLog( 'transferor', "Would make %s from cpu requirement %s" % (copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ 
wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog( 'transferor', "Maxed to %s by campaign configuration %s" % (copies_needed, wfh.request['Campaign'])) ### new ways of making the whole thing destinations, all_block_names = getDatasetDestinations( url, prim, within_sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [ site for (site, info) in destinations.items() if info['completion'] == 100 and info['data_fraction'] == 1 ] ## the rest is places it is going to be prim_destination = [ site for site in destinations.keys() if not site in prim_location ] if len(prim_location) >= copies_needed: wfh.sendLog( 'transferor', "The input is all fully in place at %s sites %s" % (len(prim_location), sorted(prim_location))) continue copies_needed = max(0, copies_needed - len(prim_location)) wfh.sendLog( 'transferor', "not counting existing copies ; now need %s" % copies_needed) copies_being_made = [ sum([ info['blocks'].keys().count(block) for site, info in destinations.items() if site in prim_destination ]) for block in all_block_names ] latching_on_transfers = set() [ latching_on_transfers.update(info['blocks'].values()) for site, info in destinations.items() if site in prim_destination ] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] prim_to_distribute = [ site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination ] ## take out the ones that cannot receive transfers prim_to_distribute = [ site for site in prim_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] wfh.sendLog( 'transferor', 
"Could be going to: %s" % sorted(prim_to_distribute)) if not prim_to_distribute or any([ transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute ]): ## means there is openings let me go print "There are transfer slots available:", [ (site, transfers_per_sites[site]) for site in prim_to_distribute ] #for site in sites_allowed: # #increment accross the board, regardless of real destination: could be changed # transfers_per_sites[site] += 1 else: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over transfer slots available" % (wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s per site at a time. Going overboard for %s" % (max_staging_per_site, sorted([ site for site in prim_to_distribute if transfers_per_sites[site] >= max_staging_per_site ]))) if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter( Transfer.phedexid == int(latching)).first() if not tfo: tfo = session.query(Transfer).filter( Transfer.phedexid == -int(latching)).first() if not tfo: tfo = Transfer(phedexid=latching) tfo.workflows_id = [] session.add(tfo) else: tfo.phedexid = latching ## make it positive ever if not wfo.id in tfo.workflows_id: print "adding", wfo.id, "to", tfo.id, "with phedexid", latching l = copy.deepcopy(tfo.workflows_id) l.append(wfo.id) tfo.workflows_id = l if not options.test: session.commit() else: session.flush( ) ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? 
#copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0, copies_needed - min(copies_being_made)) wfh.sendLog( 'transferor', "Not counting the copies being made ; then need %s" % copies_needed) if copies_needed == 0: wfh.sendLog( 'transferor', "The output is either fully in place or getting in full somewhere with %s" % latching_on_transfers) can_go = True continue elif len(prim_to_distribute) == 0: wfh.sendLog( 'transferor', "We are going to need extra copies, but no destinations seems available" ) prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] prim_to_distribute = [ site for site in prim_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if len( prim_to_distribute ) > 0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk andpoint ? #tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops, sizes = getDatasetChops( prim, chop_threshold=options.chopsize, only_blocks=blocks) spreading = distributeToSites(chops, prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog( 'transferor', 'cannot send %s to any site, it cannot fit anywhere' % prim, level='critical') wfh.sendLog( 'transferor', "cannot send to any site. 
%s cannot seem to fit anywhere" % (prim)) staging = False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site] = blocks else: spreading[site] = [prim] transfer_sizes[prim] = input_sizes[ prim] ## this is approximate if blocks are specified can_go = False wfh.sendLog( 'transferor', "selected CE destinations %s" % (sorted(spreading.keys()))) for (site, items) in spreading.items(): all_transfers[site].extend(items) transfers_per_sites[site] += 1 primary_destinations.add(site) if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[ wfh.request['Campaign']]['SecondaryLocation'] print wfo.name, 'reads', ', '.join(secondary), 'in secondary' for sec in secondary: workflow_dependencies[sec].add(wfo.id) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec], _ = getDatasetDestinations( url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] destinations = dict([ (k, v) for (k, v) in destination_cache[sec].items() if site in se_allowed ]) ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [ destinations.pop(site) for (site, info) in destinations.items() if info['data_fraction'] < 0.9 ] sec_location = [ site for (site, info) in destinations.items() if info['completion'] >= 95 ] sec_destination = [ site for site in destinations.keys() if not site in sec_location ] else: ## old style presence = getDatasetPresence(url, sec) sec_location = [ site for site, pres in 
presence.items() if pres[1] > 90. ] ## more than 90% of the minbias at sites subscriptions = listSubscriptions(url, sec) sec_destination = [site for site in subscriptions] sec_to_distribute = [ site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any( [osite.startswith(site) for osite in sec_destination]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list( set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog( 'transferor', "the dataset %s could be removed from %s" % (sec, not_needed_anymore)) sec_to_distribute = list( set(sec_to_distribute) & set(override_sec_destination)) if len(sec_to_distribute) > 0: print "secondary could go to", sorted(sec_to_distribute) sec_size = dss.get(sec) for site in sec_to_distribute: site_se = SI.CE_to_SE(site) if (SI.disk[site_se] * 1024.) 
> sec_size: all_transfers[site].append(sec) can_go = False else: print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[ site_se] * 1024, "GB need", sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog('transferor', '%s is too big (%s) for %s (%s)' % (sec, sec_size, site_se, SI.disk[site_se] * 1024), level='critical') else: print "the secondary input does not have to be send to site" ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog( 'transferor', "latches on existing transfers, and nothing else, settin staging" ) wfo.status = 'staging' needs_transfer += 1 else: wfh.sendLog( 'transferor', "should just be assigned now to %s" % sorted(sites_allowed)) wfo.status = 'staged' passing_along += 1 wfh.sendLog('transferor', "setting status to %s" % wfo.status) session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog('transferor', "setting status to %s" % wfo.status) session.commit() wfh.sendLog('transferor', "needs a transfer") needs_transfer += 1 passing_along += 1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n" + "\n".join(no_goes), level='critical') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id = -1 wf_id_in_prestaging = set() for (site, items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site, "does not want transfers" 
continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for" % (site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [ block for block in blocks if not block.split('#')[0] in datasets ] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks' % len(blocks) details_text += '\n\t%d needed blocks for %s' % ( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets' % len(datasets) details_text += '\n\t%s' % sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to", site, "(CE)", site_se, "(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y', 'yes', 'go']: continue if execute: priority = 'normal' cds = [ ds for ds in datasets + block_datasets if ds in max_priority ] if cds and False: ## I don't think this is working. 
subscription should be updated on the fly and regularly for raising the priority if needed ## decide on an overall priority : that's a bit too large though if any([max_priority[ds] >= 90000 for ds in cds]): priority = 'high' elif all([max_priority[ds] < 80000 for ds in cds]): priority = 'low' result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority) else: result = {'phedex': {'request_created': []}} fake_id -= 1 if not result: print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter( Transfer.phedexid == int(phedexid)).first() if not new_transfer: new_transfer = session.query(Transfer).filter( Transfer.phedexid == -int(phedexid)).first() print phedexid, "transfer created" if not new_transfer: new_transfer = Transfer(phedexid=phedexid) session.add(new_transfer) else: new_transfer.phedexid = phedexid ## make it positive again new_transfer.workflows_id = set() for transfering in list( set(map(lambda it: it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering]) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status != 'staging': if execute: tr_wf.status = 'staging' if talk: print "setting", tr_wf.name, "to staging" session.commit()
def transferor(url ,specific = None, talk=True, options=None): if userLock('transferor'): return if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()) max_to_handle = options.maxworkflows allowed_to_handle = max(0,max_to_handle - being_handled) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status=='considered').all(): if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" input_sizes = {} ## list the size of those in transfer already in_transfer_priority=0 min_transfer_priority=100000000 print "getting all wf in staging ..." 
for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) (_,primary,_,_) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get( prim ) in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority in_transfer_already = sum(input_sizes.values()) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get( prim ) print "... done" grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print "%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False for (wfo,wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name,"to be transfered" #wfh = workflowInfo( url, wfo.name) (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: print "Transfer has gone over bubget." else: print "Transfer will go over bubget." 
print "%15.4f GB this load"%this_load print "%15.4f GB already this round"%sum(transfer_sizes.values()) print "%15.4f GB is the available limit"%transfer_limit went_over_budget=True if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget" else: if not options.go: print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop" continue ## throtlle by campaign go if not CI.go( wfh.request['Campaign'] ): print "No go for",wfh.request['Campaign'] if not options.go: continue ## check if the batch is announced announced=False is_real=False for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break if not announced: print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?" if not is_real: print wfo.name,"does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time) continue passing_along += 1 if passing_along >= allowed_to_handle: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle else: print "Not allowed to pass more than",max_to_handle,"at a time. 
Currently",being_handled,"handled, and adding",passing_along break (lheinput,primary,parent,secondary) = wfh.getIO() if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] can_go = True staging=False if primary: if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) sites_really_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] print "Sites allowed minus the vetoed transfer" print sites_really_allowed copies_needed = int(0.35*len(sites_really_allowed))+1 ## should just go for a fixed number based if the white list grows that big print "Would make",copies_needed,"copies" if options.maxcopy>0: copies_needed = min(options.maxcopy,copies_needed) ## remove the sites that do not want transfers print "need",copies_needed workflow_dependencies[prim].add( wfo.id ) presence = getDatasetPresence( url, prim ) prim_location = [site for site,pres in presence.items() if pres[0]==True] if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed subscriptions = listSubscriptions( url , prim ) prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) ## need to reject from that list the ones with a full copy 
already: i.e the transfer corresponds to the copy in place prim_destination = [site for site in prim_destination if not site in prim_location] ## add transfer dependencies latching_on_transfers = list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) print latching_on_transfers for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? 
copies_needed = max(0,copies_needed - len(prim_destination)) print "then need",copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with",latching_on_transfers can_go = True continue prim_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in prim_location])] prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in prim_destination])] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: spreading = distributeToSites( getDatasetChops(prim), prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges) else: spreading = {} for site in prim_to_distribute: spreading[site]=[prim] can_go = False transfer_sizes[prim] = input_sizes[prim] for (site,items) in spreading.items(): all_transfers[site].extend( items ) if secondary: if talk: print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] 
## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len( sec_to_distribute )>0: for site in sec_to_distribute: all_transfers[site].append( sec ) can_go = False ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name,"latches on existing transfers, and nothing else" wfo.status = 'staging' else: print wfo.name,"should just be assigned NOW to",sites_allowed wfo.status = 'staged' print "setting status to",wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name,"latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to",wfo.status session.commit() print wfo.name,"needs a transfer" needs_transfer+=1 #print json.dumps(all_transfers) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. 
## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to",site,"(CE)",site_se,"(SE) for" else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) print "\t",len(datasets),"datasets" print "\t",datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first() print phedexid,"transfer created" if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : 
it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def recoveror(url, specific, options=None): if userLock('recoveror'): return up = componentInfo(soft=['mcm', 'wtc']) if not up.check(): return CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() use_recoveror = UC.get('use_recoveror') if not use_recoveror and not options.go: print "We are told not to run recoveror" return def make_int_keys(d): for code in d: d[int(code)] = d.pop(code) error_codes_to_recover = UC.get('error_codes_to_recover') error_codes_to_block = UC.get('error_codes_to_block') error_codes_to_notify = UC.get('error_codes_to_notify') make_int_keys(error_codes_to_recover) make_int_keys(error_codes_to_block) make_int_keys(error_codes_to_notify) #wfs = session.query(Workflow).filter(Workflow.status == 'assistance-recovery').all() wfs = session.query(Workflow).filter( Workflow.status.contains('recovery')).all() if specific: wfs.extend( session.query(Workflow).filter( Workflow.status == 'assistance-manual').all()) for wfo in wfs: if specific and not specific in wfo.name: continue if not specific and 'manual' in wfo.status: continue wfi = workflowInfo(url, wfo.name) ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves all_errors = {} try: ## this is clearly very truncated and should be changed completely wfi.getSummary() all_errors = wfi.summary['errors'] except: pass print '-' * 100 print "Looking at", wfo.name, "for recovery options" recover = True if not 'MergedLFNBase' in wfi.request: print "f****d up" sendEmail('missing lfn', '%s wl cache is screwed up' % wfo.name) recover = False if not len(all_errors): print "\tno error for", wfo.name recover = False task_to_recover = defaultdict(list) message_to_ops = "" message_to_user = "" if 'LheInputFilese' in wfi.request and wfi.request['LheInputFiles']: ## we do not try to recover pLHE recover = False if wfi.request['RequestType'] in ['MonteCarlo', 'ReReco']: recover = False if 'Campaign' in wfi.request: c = 
wfi.request['Campaign'] if c in CI.campaigns and 'recover' in CI.campaigns[c]: recover = CI.campaigns[c]['recover'] for task, errors in all_errors.items(): print "\tTask", task ## collect all error codes and #jobs regardless of step at which it occured all_codes = [] for name, codes in errors.items(): if type(codes) == int: continue all_codes.extend([ (int(code), info['jobs'], name, list(set([e['type'] for e in info['errors']])), list(set([e['details'] for e in info['errors']]))) for code, info in codes.items() ]) all_codes.sort(key=lambda i: i[1], reverse=True) sum_failed = sum([l[1] for l in all_codes]) for errorCode, njobs, name, types, details in all_codes: rate = 100 * njobs / float(sum_failed) #print ("\t\t %10d (%6s%%) failures with error code %10d (%"+str(max_legend)+"s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, legend, name) print( "\t\t %10d (%6s%%) failures with error code %10d (%30s) at stage %s" ) % (njobs, "%4.2f" % rate, errorCode, ','.join(types), name) added_in_recover = False #if options.go: # force the recovery of any task with error ? 
if errorCode in error_codes_to_recover: ## the error code is registered for case in error_codes_to_recover[errorCode]: match = case['details'] matched = (match == None) if not matched: matched = False for detail in details: if match in detail: print "[recover] Could find keyword", match, "in" print 50 * "#" print detail print 50 * "#" matched = True break if matched and rate > case['rate']: print "\t\t => we should be able to recover that", case[ 'legend'] task_to_recover[task].append((code, case)) added_in_recover = True message_to_user = "" else: print "\t\t recoverable but not frequent enough, needs", case[ 'rate'] if errorCode in error_codes_to_block: for case in error_codes_to_block[errorCode]: match = case['details'] matched = (match == None) if not matched: matched = False for detail in details: if match in detail: print "[block] Could find keyword", match, "in" print 50 * "#" print detail print 50 * "#" matched = True break if matched and rate > case['rate']: print "\t\t => that error means no ACDC on that workflow", case[ 'legend'] if not options.go: message_to_ops += "%s has an error %s blocking an ACDC.\n%s\n " % ( wfo.name, errorCode, '#' * 50) recover = False added_in_recover = False if errorCode in error_codes_to_notify and not added_in_recover: print "\t\t => we should notify people on this" message_to_user += "%s has an error %s in processing.\n%s\n" % ( wfo.name, errorCode, '#' * 50) if message_to_user: print wfo.name, "to be notified to user(DUMMY)", message_to_user if message_to_ops: #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**']) sendLog('recoveror', message_to_ops, level='warning') if len(task_to_recover) != len(all_errors): print "Should not be doing partial ACDC. 
skipping" #sendEmail('recoveror','do not want to make partial acdc on %s'%wfo.name) sendLog('recoveror', 'do not want to make partial acdc on %s' % wfo.name, level='warning') recover = False if task_to_recover and recover: print "Initiating recovery" print ', '.join(task_to_recover.keys()), "to be recovered" recovering = set() for task in task_to_recover: print "Will be making a recovery workflow for", task ## from here you can fetch known solutions, to known error codes actions = list( set([ case['solution'] for code, case in task_to_recover[task] ])) acdc = singleRecovery(url, task, wfi.request, actions, do=options.do) if not acdc: if options.do: if recovering: print wfo.name, "has been partially ACDCed. Needs manual attention" #sendEmail( "failed ACDC partial recovery","%s has had %s/%s recoveries %s only"%( wfo.name, len(recovering), len(task_to_recover), list(recovering)), destination=['*****@*****.**']) sendLog('recoveror', "%s has had %s/%s recoveries %s only" % (wfo.name, len(recovering), len(task_to_recover), list(recovering)), level='critical') continue else: print wfo.name, "failed recovery once" #break continue else: print "no action to take further" sendLog('recoveror', "ACDC for %s can be done automatically" % wfo.name, level='critical') continue ## and assign it ? team = wfi.request['Team'] #assign_to_sites = set(SI.sites_ready) ## that needs to be massaged to prevent assigning to something out. 
assign_to_sites = set(SI.all_sites) parameters = { #'SiteWhitelist' : wfi.request['SiteWhitelist'], 'SiteWhitelist': sorted(assign_to_sites), 'AcquisitionEra': wfi.acquisitionEra(), 'ProcessingString': wfi.processingString(), 'MergedLFNBase': wfi.request['MergedLFNBase'], 'ProcessingVersion': wfi.request['ProcessingVersion'], } ## hackery for ACDC merge assignment if wfi.request[ 'RequestType'] == 'TaskChain' and 'Merge' in task.split( '/')[-1]: parameters['AcquisitionEra'] = None parameters['ProcessingString'] = None ## xrootd setttings on primary and secondary if 'TrustSitelists' in wfi.request and wfi.request[ 'TrustSitelists']: parameters['TrustSitelists'] = True if 'TrustPUSitelists' in wfi.request and wfi.request[ 'TrustPUSitelists']: parameters['TrustPUSitelists'] = True if options.ass: print "really doing the assignment of the ACDC", acdc parameters['execute'] = True wfi.sendLog('recoveror', "%s was assigned for recovery" % acdc) else: print "no assignment done with this ACDC", acdc sendLog('recoveror', "%s needs to be assigned" % (acdc), level='critical') result = reqMgrClient.assignWorkflow(url, acdc, team, parameters) if not result: print acdc, "was not asigned" sendLog('recoveror', "%s needs to be assigned" % (acdc), level='critical') else: recovering.add(acdc) current = None if recovering: #if all went well, set the status to -recovering current = wfo.status if options.ass: current = current.replace('recovery', 'recovering') else: current = 'assistance-manual' print 'created ACDC: ' + ', '.join(recovering) else: ## was set to be recovered, and no acdc was made current = 'assistance-manual' if current: print wfo.name, "setting the status to", current wfo.status = current session.commit() else: ## this workflow should be handled manually at that point print wfo.name, "needs manual intervention" wfo.status = 'assistance-manual' session.commit()
def new_recoveror(url, specific, options=None): if userLock('recoveror'): return up = componentInfo(soft=['mcm','wtc','jira']) if not up.check(): return CI = campaignInfo() SI = siteInfo() UC = unifiedConfiguration() wfs = session.query(Workflow).filter(Workflow.status.contains('recovery')).all() if specific: wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-manual').all() ) try: from_operator = json.loads(os.popen('curl -s http://vocms0113.cern.ch/actions/test.json').read()) ## now we have a list of things that we can take action on except: pass for wfo in wfs: if specific and not specific in wfo.name:continue if not specific and 'manual' in wfo.status: continue wfi = workflowInfo(url, wfo.name) send_recovery = False ## will make all acdc send_clone = False ## will make a clone send_back = False ## should just reject. manual ? send_manual = False ## will set in manual where_to_run, missing_to_run = wfi.getRecoveryInfo() task_to_recover = where_to_run.keys() ## if the site at which the recovery could run in drain or out ? for task in task_to_recover: not_ready = set(where_to_run[task]) - set(SI.sites_ready) if not_ready: print "the following sites are not ready for the ACDC",",".join( sorted(not_ready) ) ## do we have a way of telling if a site is going to be out for a long time ? # check on priority: high prio, restart if wfi.request['RequestPriority'] >= 85000: send_clone = True # check on age of the request injection_time = time.mktime(time.strptime('.'.join(map(str,wfi.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) 
if float(now - injection_time) <14.: ## less than 14 days, start over send_clone = True else: send_manual = True if not send_recovery: ## check on whether the stats is very low pass if send_recovery: ## make acdc for all tasks for task in task_to_recover: actions = list(set([case['solution'] for code,case in task_to_recover[task] ])) acdc = singleRecovery(url, task, wfi.request , actions, do = True) elif send_clone: ## this will get it cloned wfo.status = 'assistance-clone' session.commit() elif send_manual: wfo.status = 'assistance-manual'
def transferor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0,max_to_handle - being_handled) allowed_to_transfer = max(0,max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer" else: print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." 
cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status.startswith('considered')).all(): print "\t",wfo.name if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} ignored_input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority=None min_transfer_priority=None print "getting all wf in staging ..." stucks = json.loads(open('%s/stuck_transfers.json'%monitor_dir).read()) for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ds_s = dss.get( prim ) if prim in stucks: sendLog('transferor', "%s appears stuck, so not counting it %s [GB]"%( prim, ds_s), wfi=wfh) ignored_input_sizes[prim] = ds_s else: input_sizes[prim] = ds_s sendLog('transferor', "%s needs %s [GB]"%( wfo.name, ds_s), wfi=wfh) if in_transfer_priority==None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority==None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, 
int(wfh.request['RequestPriority'])) if min_transfer_priority==None or in_transfer_priority ==None: print "nothing is lining up for transfer" sendEmail("no request in staging","no request in staging") return pass try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort( key = lambda i : i[1] ) print "\n".join( map(str, ignored_values ) ) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort( key = lambda i : i[1] ) print "\n".join( map(str, considered_values) ) except Exception as e: print "trying to print the summary of input size" print str(e) print "... done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority print "transfers per sites" print json.dumps( transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get( prim ) input_sizes[prim] = prim_size primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle( wfs_and_wfh ) # Sort smallest transfers first; allows us to transfer as many as possible workflows. 
def prio_and_size( i, j): if int(i[1].request['RequestPriority']) == int(j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0)) ) else: return cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already ) print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer ) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print "%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available 
limit"%transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False destination_cache = {} no_goes = set() max_per_round = UC.get('max_per_round').get('transferor',None) if max_per_round and not spec: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo,wfh) in wfs_and_wfh: print wfo.name,"to be transfered with priority",wfh.request['RequestPriority'] if wfh.request['RequestStatus']!='assignment-approved': if wfh.request['RequestStatus'] in ['aborted','rejected','rejected-archived','aborted-archived']: wfo.status = 'trouble' ## so that we look or a replacement else: wfo.status = 'away' wfh.sendLog('transferor', '%s in status %s, setting %s'%( wfo.name,wfh.request['RequestStatus'],wfo.status)) continue (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) no_budget = False if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over bubget.") else: wfh.sendLog('transferor', "Transfer will go over bubget.") wfh.sendLog('transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"%(this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over budget"%( wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog('transferor',"%s minimum priority %s < %s : stop"%( min_transfer_priority,wfh.request['RequestPriority'],in_transfer_priority)) no_budget = True ## throtlle by campaign go 
no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add( wfo.name ) allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) if secondary: if (secondary and allowed_secondary) and (set(secondary)&allowed_secondary!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary))) no_go = True if no_go: continue ## check if the batch is announced def check_mcm(wfn): announced=False is_real=False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: print "could not get mcm batch announcement, assuming not real" return announced,is_real if not use_mcm: announced,is_real = False,True else: if wfh.request['RequestType'] in ['ReReco']: announced,is_real = True,True else: announced,is_real = check_mcm( wfo.name ) if not announced: wfh.sendLog('transferor', "does not look announced.") if not is_real: wfh.sendLog('transferor', "does not appear to be genuine.") ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) 
if float(now - injection_time) < 4.: if not options.go and not announced: wfh.sendLog('transferor', "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time)) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%( wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog('transferor'," Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"%( max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%(wfh.request['RequestPriority'], in_transfer_priority,max_to_transfer)) else: wfh.sendLog('transferor',"Not allowed to transfer more than %s at a time. 
Currently %s transfering, and adding %s"%( max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue ## the site white list considers site, campaign, memory and core information (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList() if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary)+list(parent)+list(secondary): ## lock everything flat NLI.lock( dataset ) if not sites_allowed: wfh.sendLog('transferor',"not possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor',"%s has no possible sites to run at"%( wfo.name ),level='critical') continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) )) if 'LumiList' in wfh.request and wfh.request['LumiList']: ## augment with the lumi white list blocks = list(set( blocks + getDatasetBlocks( dataset, lumis= wfh.request['LumiList'] ) )) if blocks: print "Reading",len(blocks),"in block whitelist" can_go = True staging=False allowed=True primary_destinations = set() if primary: copies_needed_from_CPUh,CPUh = wfh.getNCopies() if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add( wfo.id ) max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) wfh.sendLog('transferor',"Would make %s from cpu requirement %s"%( copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: 
copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog('transferor',"Maxed to %s by campaign configuration %s"%( copies_needed, wfh.request['Campaign'])) ### new ways of making the whole thing destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks ) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1] ## the rest is places it is going to be prim_destination = [site for site in destinations.keys() if not site in prim_location] if len(prim_location) >= copies_needed: wfh.sendLog('transferor',"The input is all fully in place at %s sites %s"%( len(prim_location), sorted(prim_location))) continue copies_needed = max(0,copies_needed - len(prim_location)) wfh.sendLog('transferor',"not counting existing copies ; now need %s"% copies_needed) copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names] latching_on_transfers = set() [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] wfh.sendLog('transferor',"Could be going to: %s"% sorted( prim_to_distribute)) if not 
prim_to_distribute or any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]): ## means there is openings let me go print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute] #for site in sites_allowed: # #increment accross the board, regardless of real destination: could be changed # transfers_per_sites[site] += 1 else: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: wfh.sendLog('transferor', "Higher priority sample %s >= %s go-on over transfer slots available"%(wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog('transferor',"Not allowed to transfer more than %s per site at a time. Going overboard for %s"%( max_staging_per_site, sorted([site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site]))) if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == int(latching)).first() if not tfo: tfo = session.query(Transfer).filter(Transfer.phedexid == -int(latching)).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) else: tfo.phedexid = latching ## make it positive ever if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? 
#copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0,copies_needed - min(copies_being_made)) wfh.sendLog('transferor', "Not counting the copies being made ; then need %s"% copies_needed) if copies_needed == 0: wfh.sendLog('transferor', "The output is either fully in place or getting in full somewhere with %s"% latching_on_transfers) can_go = True continue elif len(prim_to_distribute)==0: wfh.sendLog('transferor', "We are going to need extra copies, but no destinations seems available") prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk andpoint ? #tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks) spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog('transferor','cannot send %s to any site, it cannot fit anywhere'% prim, level='critical') wfh.sendLog('transferor', "cannot send to any site. 
%s cannot seem to fit anywhere"%(prim)) staging=False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site]=blocks else: spreading[site]=[prim] transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified can_go = False wfh.sendLog('transferor', "selected CE destinations %s"%(sorted( spreading.keys()))) for (site,items) in spreading.items(): all_transfers[site].extend( items ) transfers_per_sites[site] += 1 primary_destinations.add( site ) if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[wfh.request['Campaign']]['SecondaryLocation'] print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec],_ = getDatasetDestinations(url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] destinations = dict([(k,v) for (k,v) in destination_cache[sec].items() if site in se_allowed]) ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9] sec_location = [site for (site,info) in destinations.items() if info['completion']>=95] sec_destination = [site for site in destinations.keys() if not site in sec_location] else: ## old style presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] 
## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list(set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog('transferor', "the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sec_to_distribute = list(set(sec_to_distribute) & set(override_sec_destination)) if len( sec_to_distribute )>0: print "secondary could go to",sorted(sec_to_distribute) sec_size = dss.get( sec ) for site in sec_to_distribute: site_se =SI.CE_to_SE(site) if (SI.disk[site_se]*1024.) 
> sec_size: all_transfers[site].append( sec ) can_go = False else: print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog('transferor', '%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024), level='critical') else: print "the secondary input does not have to be send to site" ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog('transferor', "latches on existing transfers, and nothing else, settin staging") wfo.status = 'staging' needs_transfer+=1 else: wfh.sendLog('transferor', "should just be assigned now to %s"%sorted(sites_allowed)) wfo.status = 'staged' passing_along+=1 wfh.sendLog('transferor', "setting status to %s"%wfo.status) session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog('transferor', "setting status to %s"%wfo.status) session.commit() wfh.sendLog('transferor',"needs a transfer") needs_transfer+=1 passing_along+=1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n"+"\n".join( no_goes ), level='critical') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer 
size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for"%( site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks'%len(blocks) details_text += '\n\t%d needed blocks for %s'%( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets'% len(datasets) details_text += '\n\t%s'%sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: priority = 'normal' cds = [ds for ds in datasets+block_datasets if ds in max_priority] if cds and False: ## I don't think this is working. 
subscription should be updated on the fly and regularly for raising the priority if needed ## decide on an overall priority : that's a bit too large though if any([max_priority[ds]>=90000 for ds in cds]): priority = 'high' elif all([max_priority[ds]<80000 for ds in cds]): priority = 'low' result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority) else: result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == int(phedexid)).first() if not new_transfer: new_transfer = session.query(Transfer).filter(Transfer.phedexid == -int(phedexid)).first() print phedexid,"transfer created" if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) else: new_transfer.phedexid = phedexid ## make it positive again new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def recoveror(url, specific, options=None):
    """Look at workflows waiting for recovery and create/assign ACDCs for them.

    For every selected workflow the error summary is compared with the
    ``error_codes_to_recover`` / ``error_codes_to_block`` /
    ``error_codes_to_notify`` tables of the unified configuration.  When every
    failing task has at least one recoverable error (and nothing vetoes the
    recovery) one ACDC per task is requested through ``singleRecovery`` and
    handed to ``reqMgrClient.assignWorkflow``; otherwise the workflow is set
    to ``assistance-manual``.

    Parameters:
        url: passed through to ``workflowInfo``, ``singleRecovery`` and
            ``reqMgrClient.assignWorkflow``.
        specific: when truthy, only workflows whose name contains this string
            are handled, and workflows in ``assistance-manual`` are added to
            the selection.
        options: object whose ``go``, ``do`` and ``ass`` attributes are read.
            Despite the ``None`` default these attributes are dereferenced
            without a ``None`` check.

    Returns:
        None.  The outcome is the workflow status written through ``session``
        plus the messages sent with ``sendLog`` / ``sendEmail``.
    """
    if userLock('recoveror'):
        return

    up = componentInfo(soft=['mcm', 'wtc', 'jira'])
    if not up.check():
        return

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    use_recoveror = UC.get('use_recoveror')

    if not use_recoveror and not options.go:
        print("We are told not to run recoveror")
        return

    def make_int_keys(d):
        """Re-key ``d`` in place so that every key is an int."""
        # iterate over a snapshot of the keys: popping/inserting while
        # iterating the dictionary itself may skip entries
        for code in list(d.keys()):
            d[int(code)] = d.pop(code)

    def case_applies(case, details, rate, tag):
        """Tell whether a configured ``case`` fires for one error code.

        A case fires when its 'details' keyword is None or is found in one of
        the error ``details``, and ``rate`` is above the 'rate' threshold of
        the case.  ``tag`` only labels the printout.
        """
        keyword = case['details']
        matched = keyword is None
        if not matched:
            for detail in details:
                if keyword in detail:
                    print("[%s] Could find keyword %s in" % (tag, keyword))
                    print(50 * "#")
                    print(detail)
                    print(50 * "#")
                    matched = True
                    break
        return matched and rate > case['rate']

    error_codes_to_recover = UC.get('error_codes_to_recover')
    error_codes_to_block = UC.get('error_codes_to_block')
    error_codes_to_notify = UC.get('error_codes_to_notify')
    make_int_keys(error_codes_to_recover)
    make_int_keys(error_codes_to_block)
    make_int_keys(error_codes_to_notify)

    wfs = session.query(Workflow).filter(Workflow.status.contains('recovery')).all()
    if specific:
        wfs.extend(session.query(Workflow).filter(Workflow.status == 'assistance-manual').all())

    for wfo in wfs:
        if specific and specific not in wfo.name:
            continue
        if not specific and 'manual' in wfo.status:
            continue

        wfi = workflowInfo(url, wfo.name)

        ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves
        all_errors = {}
        try:
            ## this is clearly very truncated and should be changed completely
            wfi.getSummary()
            all_errors = wfi.summary['errors']
        except Exception as err:
            # best effort: without a summary the workflow falls through to the
            # "no error" path below, but say why instead of hiding it
            print("\tcould not get the error summary of %s : %s" % (wfo.name, err))

        print('-' * 100)
        print("Looking at %s for recovery options" % wfo.name)

        recover = True

        if 'MergedLFNBase' not in wfi.request:
            print("f****d up")
            sendEmail('missing lfn', '%s wl cache is screwed up' % wfo.name)
            recover = False

        if not len(all_errors):
            print("\tno error for %s" % wfo.name)
            recover = False

        task_to_recover = defaultdict(list)
        message_to_ops = ""
        message_to_user = ""

        ## we do not try to recover pLHE
        # The key used to be misspelled ('LheInputFilese'), which made this
        # guard dead code.
        # NOTE(review): a textual 'False' is treated as "not set" on the
        # assumption that the flag can arrive as a string -- confirm against
        # the request schema.
        lhe_input = wfi.request['LheInputFiles'] if 'LheInputFiles' in wfi.request else None
        if lhe_input and lhe_input not in ('False', 'false'):
            recover = False

        if wfi.request['RequestType'] in ['MonteCarlo', 'ReReco']:
            recover = False

        # the campaign configuration has the last word on the vetoes above
        if 'Campaign' in wfi.request:
            c = wfi.request['Campaign']
            if c in CI.campaigns and 'recover' in CI.campaigns[c]:
                recover = CI.campaigns[c]['recover']

        for task, errors in all_errors.items():
            print("\tTask %s" % (task,))
            ## collect all error codes and #jobs regardless of step at which it occured
            all_codes = []
            for name, codes in errors.items():
                if isinstance(codes, int):
                    # plain integer entries carry no per-code information
                    continue
                for code, info in codes.items():
                    all_codes.append((
                        int(code),
                        info['jobs'],
                        name,
                        list(set(e['type'] for e in info['errors'])),
                        list(set(e['details'] for e in info['errors'])),
                    ))

            all_codes.sort(key=lambda i: i[1], reverse=True)
            sum_failed = sum(entry[1] for entry in all_codes)

            for errorCode, njobs, name, types, details in all_codes:
                # guard against a summary where every entry reports zero jobs
                rate = 100 * njobs / float(sum_failed) if sum_failed else 0.
                print("\t\t %10d (%6s%%) failures with error code %10d (%30s) at stage %s" % (
                    njobs, "%4.2f" % rate, errorCode, ','.join(types), name))

                added_in_recover = False

                if errorCode in error_codes_to_recover:
                    ## the error code is registered
                    for case in error_codes_to_recover[errorCode]:
                        if case_applies(case, details, rate, 'recover'):
                            print("\t\t => we should be able to recover that %s" % (case['legend'],))
                            # record the code being examined (this used to pick
                            # up a stale comprehension variable instead)
                            task_to_recover[task].append((errorCode, case))
                            added_in_recover = True
                            # a recoverable error wipes the user notifications
                            # accumulated so far
                            message_to_user = ""
                        else:
                            print("\t\t recoverable but not frequent enough, needs %s" % (case['rate'],))

                if errorCode in error_codes_to_block:
                    for case in error_codes_to_block[errorCode]:
                        if case_applies(case, details, rate, 'block'):
                            print("\t\t => that error means no ACDC on that workflow %s" % (case['legend'],))
                            # NOTE(review): the veto is kept under the
                            # "not options.go" guard so that --go can force a
                            # recovery; the nesting was reconstructed, confirm.
                            if not options.go:
                                message_to_ops += "%s has an error %s blocking an ACDC.\n%s\n " % (
                                    wfo.name, errorCode, '#' * 50)
                                recover = False
                                added_in_recover = False

                if errorCode in error_codes_to_notify and not added_in_recover:
                    print("\t\t => we should notify people on this")
                    message_to_user += "%s has an error %s in processing.\n%s\n" % (
                        wfo.name, errorCode, '#' * 50)

        if message_to_user:
            print("%s to be notified to user(DUMMY) %s" % (wfo.name, message_to_user))

        if message_to_ops:
            sendLog('recoveror', message_to_ops, level='warning')

        # every failing task needs at least one recoverable error
        if len(task_to_recover) != len(all_errors):
            print("Should not be doing partial ACDC. skipping")
            sendLog('recoveror', 'do not want to make partial acdc on %s' % wfo.name, level='warning')
            recover = False

        if task_to_recover and recover:
            print("Initiating recovery")
            print("%s to be recovered" % ', '.join(task_to_recover.keys()))

            recovering = set()
            for task in task_to_recover:
                print("Will be making a recovery workflow for %s" % (task,))

                ## from here you can fetch known solutions, to known error codes
                actions = list(set(case['solution'] for _, case in task_to_recover[task]))
                acdc = singleRecovery(url, task, wfi.request, actions, do=options.do)

                if not acdc:
                    if not options.do:
                        print("no action to take further")
                        sendLog('recoveror', "ACDC for %s can be done automatically" % wfo.name, level='critical')
                    elif recovering:
                        print("%s has been partially ACDCed. Needs manual attention" % wfo.name)
                        sendLog('recoveror', "%s has had %s/%s recoveries %s only" % (
                            wfo.name, len(recovering), len(task_to_recover), list(recovering)), level='critical')
                    else:
                        print("%s failed recovery once" % wfo.name)
                    continue

                ## and assign it ?
                team = wfi.request['Team']
                assign_to_sites = set(SI.all_sites)
                parameters = {
                    'SiteWhitelist': sorted(assign_to_sites),
                    'AcquisitionEra': wfi.acquisitionEra(),
                    'ProcessingString': wfi.processingString(),
                    'MergedLFNBase': wfi.request['MergedLFNBase'],
                    'ProcessingVersion': wfi.request['ProcessingVersion'],
                }

                ## hackery for ACDC merge assignment
                if wfi.request['RequestType'] == 'TaskChain' and 'Merge' in task.split('/')[-1]:
                    parameters['AcquisitionEra'] = None
                    parameters['ProcessingString'] = None

                ## xrootd setttings on primary and secondary
                if 'TrustSitelists' in wfi.request and wfi.request['TrustSitelists']:
                    parameters['TrustSitelists'] = True
                if 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']:
                    parameters['TrustPUSitelists'] = True

                if options.ass:
                    print("really doing the assignment of the ACDC %s" % (acdc,))
                    parameters['execute'] = True
                    wfi.sendLog('recoveror', "%s was assigned for recovery" % acdc)
                else:
                    print("no assignment done with this ACDC %s" % (acdc,))
                    sendLog('recoveror', "%s needs to be assigned" % (acdc), level='critical')

                # called in both cases; 'execute' is only set when options.ass is on
                result = reqMgrClient.assignWorkflow(url, acdc, team, parameters)
                if not result:
                    print("%s was not asigned" % (acdc,))
                    sendLog('recoveror', "%s needs to be assigned" % (acdc), level='critical')
                else:
                    recovering.add(acdc)

            if recovering and options.ass:
                # if all went well, set the status to -recovering
                current = wfo.status.replace('recovery', 'recovering')
            else:
                ## ACDC created but not assigned, or no acdc was made at all
                current = 'assistance-manual'
            if recovering:
                print('created ACDC: ' + ', '.join(recovering))

            if current:
                print("%s setting the status to %s" % (wfo.name, current))
                wfo.status = current
                session.commit()
        else:
            ## this workflow should be handled manually at that point
            print("%s needs manual intervention" % wfo.name)
            wfo.status = 'assistance-manual'
            session.commit()
def assignor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() #SI = siteInfo() SI = global_SI() #NLI = newLockInfo() #if not NLI.free() and not options.go: return LI = lockInfo() if not LI.free() and not options.go: return n_assigned = 0 n_stalled = 0 wfos=[] fetch_from = [] if specific or options.early: fetch_from.extend(['considered','staging']) if specific: fetch_from.extend(['considered-tried']) fetch_from.extend(['staged']) if options.from_status: fetch_from = options.from_status.split(',') print "Overriding to read from",fetch_from for status in fetch_from: wfos.extend(session.query(Workflow).filter(Workflow.status==status).all()) ## in case of partial, go for fetching a list from json ? #if options.partial and not specific: # pass dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read()) aaa_mapping = json.loads(open('%s/equalizor.json'%monitor_pub_dir).read())['mapping'] all_stuck = set() all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() )) all_stuck.update( getAllStuckDataset()) max_per_round = UC.get('max_per_round').get('assignor',None) max_cpuh_block = UC.get('max_cpuh_block') random.shuffle( wfos ) for wfo in wfos: if options.limit and (n_stalled+n_assigned)>options.limit: break if max_per_round and (n_stalled+n_assigned)>max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo( url, wfo.name) if options.priority and int(wfh.request['RequestPriority']) < options.priority: continue options_text="" if options.early: options_text+=", early option is ON" if options.partial: options_text+=", partial option is ON" options_text+=", good fraction is %.2f"%options.good_enough wfh.sendLog('assignor',"%s to be assigned%s"%(wfo.name, options_text)) ## the 
site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList() output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) is_stuck = (all_stuck & primary) if is_stuck: wfh.sendLog('assignor',"%s are stuck input"%(','.join( is_stuck))) ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: no_go = True allowed_secondary = {} assign_parameters = {} check_secondary = False for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: assign_parameters.update( CI.campaigns[campaign] ) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]: banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go=True wfh.sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier))) sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)), level='critical') if secondary and check_secondary: if (set(secondary)&set(allowed_secondary.keys())!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys())))) sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))), level='critical') if not options.go: no_go = True ## then get whether there is something more to be done by secondary for sec in secondary: if sec in allowed_secondary:# and 'parameters' in allowed_secondary[sec]: assign_parameters.update( allowed_secondary[sec] ) if no_go: n_stalled+=1 continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': if not options.test: 
wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name,wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor',"cannot decide on version number") n_stalled+=1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy( sites_allowed ) wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) )) wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed)) secondary_locations=None primary_aaa = options.primary_aaa secondary_aaa = options.secondary_aaa do_partial = False #options.good_enough if options.partial else 0 if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns: assign_parameters.update( CI.campaigns[wfh.request['Campaign']] ) if 'primary_AAA' in assign_parameters: primary_aaa = primary_aaa or assign_parameters['primary_AAA'] if 'secondary_AAA' in assign_parameters: secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA'] if 'partial_copy' in assign_parameters: ## can this only work if there is a stuck input ? maybe not ## this is a number. 
0 means no print "Could do partial disk copy assignment" if is_stuck or options.partial: do_partial = assign_parameters['partial_copy'] wfh.sendLog('assignor',"Overiding partial copy assignment to %.2f fraction"% do_partial) #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name)) do_partial = options.good_enough if options.partial else do_partial for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" #sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass if secondary_aaa: #just continue without checking continue presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog('assignor',"From secondary requirement, now Allowed%s"%sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from 
stagor",dataset_endpoints[prim] endpoints.update( dataset_endpoints[prim] ) set_lfn = getLFNbase( prim ) presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])))) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) 
- set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite for osite in opportunistic_sites if osite in SI.sites_not_ready])) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted,cpuh = wfh.getNCopies() wfh.sendLog('assignor',"we need %s CPUh"%cpuh) if cpuh>max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical') wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh) continue if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently ## default back 
to white list to original white list with any data wfh.sendLog('assignor',"Allowed sites :%s"% sorted(sites_allowed)) if primary_aaa: ## remove the sites not reachable localy if not in having the data if not sites_all_data: wfh.sendLog('assignor',"Overiding the primary on AAA setting to Off") primary_aaa=False else: aaa_grid = set(sites_all_data) for site in list(aaa_grid): aaa_grid.update( aaa_mapping.get(site,[]) ) sites_allowed = list(set(initial_sites_allowed) & aaa_grid) wfh.sendLog('assignor',"Selected to read primary through xrootd %s"%sorted(sites_allowed)) if not primary_aaa: sites_allowed = sites_with_any_data wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed)) ### check on endpoints for on-going transfers if do_partial: if endpoints: end_sites = [SI.SE_to_CE(s) for s in endpoints] sites_allowed = list(set(sites_allowed + end_sites)) if down_time and not any(osite in SI.sites_not_ready for osite in end_sites): print "Flip the status of downtime, since our destinations are good" down_time = False print "with added endpoints",sorted(end_sites) else: print "Cannot do partial assignment without knowin the endpoints" n_stalled+=1 continue #if not len(sites_allowed): # if not options.early: # wfh.sendLog('assignor',"cannot be assign with no matched sites") # sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') # n_stalled+=1 # continue low_pressure = SI.sites_low_pressure(0.4) ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started allowed_and_low = sorted(set(low_pressure) & set(sites_allowed)) if allowed_and_low: wfh.sendLog('assignor',"The workflow can run at %s under low pressure currently"%( ','.join( allowed_and_low ))) copies_wanted = max(1., copies_wanted-1.) if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): not_even_once = not all([available>=1. 
for available in available_fractions.values()]) above_good = all([available >= do_partial for available in available_fractions.values()]) wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting") #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay') n_stalled+=1 continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good): wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) if options.early: if wfo.status == 'considered': wfh.sendLog('assignor',"setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is",wfo.status if do_partial and above_good: print "Will move on with partial locations" else: n_stalled+=1 continue if not len(sites_allowed): if not options.early: wfh.sendLog('assignor',"cannot be assign with no matched sites") sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') n_stalled+=1 continue t1_only = [ce for ce 
in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] wfh.sendLog('assignor',"Placing the output on %s"%sites_out) parameters={ 'SiteWhitelist' : sites_allowed, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : set_lfn, 'ProcessingVersion' : version, } if primary_aaa: parameters['TrustSitelists'] = True wfh.sendLog('assignor',"Reading primary through xrootd at %s"%sorted(sites_allowed)) if secondary_aaa: parameters['TrustPUSitelists'] = True wfh.sendLog('assignor',"Reading secondary through xrootd at %s"%sorted(sites_allowed)) if 'parameters' in assign_parameters: parameters.update( assign_parameters['parameters'] ) ## plain assignment here team='production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT","%s was assigned to HLT"%wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if type(v)==str and ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v if lheinput: ## throttle reading LHE article wfh.sendLog('assignor', 'Setting the number of events per job to 500k max') parameters['EventsPerJob'] = 500000 ## pick up campaign specific assignment parameters 
#parameters.update( CI.parameters(wfh.request['Campaign']) ) parameters.update( assign_parameters.get('parameters',{}) ) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check!=True: parameters.update( split_check ) if 'NoGo' in split_check.values(): wfh.sendLog('assignor', "Failing splitting check") sendLog('assignor','the workflow %s is failing the splitting check. Verify in the logs'% wfo.name, level='critical') n_stalled+=1 continue if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog('assignor','the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting ?'%wfo.name, level='critical') ## we have a problem here, that EventBased should never be used as a backup if not options.go: n_stalled+=1 continue continue ## skip all together elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of events per job") #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog('assignor',"the workflow %s is too heavy in number of jobs explosion"%wfo.name, level='critical') elif 'EventsPerLumi' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of events per lumi to be able to process this") # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per 
job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical') wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical') wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical') wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.") result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 wfh.sendLog('assignor',"Properly assigned\n%s"%(json.dumps( parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']: ## lock all outputs flat #NLI.lock( secure ) LI.lock( secure, reason = 'assigning') #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 
'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: wfh.sendLog('assignor',"Failed to assign. Please check the logs") print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
def assignor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = siteInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos = [] if specific or options.early: wfos.extend(session.query(Workflow).filter(Workflow.status == "considered").all()) wfos.extend(session.query(Workflow).filter(Workflow.status == "staging").all()) if specific: wfos.extend(session.query(Workflow).filter(Workflow.status == "considered-tried").all()) wfos.extend(session.query(Workflow).filter(Workflow.status == "staged").all()) dataset_endpoints = json.loads(open("%s/dataset_endpoints.json" % monitor_dir).read()) max_per_round = UC.get("max_per_round").get("assignor", None) max_cpuh_block = UC.get("max_cpuh_block") random.shuffle(wfos) for wfo in wfos: if options.limit and (n_stalled + n_assigned) > options.limit: break if max_per_round and (n_stalled + n_assigned) > max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name, specific.split(","))): continue # if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo(url, wfo.name) wfh.sendLog("assignor", "%s to be assigned" % wfo.name) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() ## check if by configuration we gave it a GO no_go = False if not wfh.go(log=True) and not options.go: n_stalled += 1 no_go = True allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and "secondaries" in CI.campaigns[campaign]: allowed_secondary.update(CI.campaigns[campaign]["secondaries"]) if (secondary and allowed_secondary) and (set(secondary) & allowed_secondary != set(secondary)): wfh.sendLog("assignor", "%s is not an allowed secondary" % (", ".join(set(secondary) - allowed_secondary))) # sendEmail('secondary not allowed','%s is not an allowed 
secondary'%( ', '.join(set(secondary)-allowed_secondary))) sendLog( "assignor", "%s is not an allowed secondary" % (", ".join(set(secondary) - allowed_secondary)), level="critical", ) if not options.go: n_stalled += 1 no_go = True if no_go: continue ## check on current status for by-passed assignment if wfh.request["RequestStatus"] != "assignment-approved": if not options.test: wfh.sendLog("assignor", "setting %s away and skipping" % wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request["RequestStatus"] wfo.status = "away" session.commit() continue else: print wfo.name, wfh.request["RequestStatus"] ## retrieve from the schema, dbs and reqMgr what should be the next version version = wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog("assignor", "cannot decide on version number") n_stalled += 1 wfo.status = "trouble" session.commit() continue original_sites_allowed = copy.deepcopy(sites_allowed) wfh.sendLog("assignor", "Site white list %s" % sorted(sites_allowed)) override_sec_location = CI.get(wfh.request["Campaign"], "SecondaryLocation", []) blocks = [] if "BlockWhitelist" in wfh.request: blocks = wfh.request["BlockWhitelist"] if "RunWhitelist" in wfh.request and wfh.request["RunWhitelist"]: ## augment with run white list for dataset in primary: blocks = list(set(blocks + getDatasetBlocks(dataset, runs=wfh.request["RunWhitelist"]))) wfh.sendLog("assignor", "Allowed %s" % sorted(sites_allowed)) secondary_locations = None primary_aaa = options.primary_aaa if ( "Campaign" in wfh.request and wfh.request["Campaign"] in CI.campaigns and "primary_AAA" in CI.campaigns[wfh.request["Campaign"]] ): primary_aaa = primary_aaa or CI.campaigns[wfh.request["Campaign"]]["primary_AAA"] secondary_aaa = options.secondary_aaa if ( "Campaign" in wfh.request and wfh.request["Campaign"] in CI.campaigns and "secondary_AAA" in 
CI.campaigns[wfh.request["Campaign"]] ): secondary_aaa = secondary_aaa or CI.campaigns[wfh.request["Campaign"]]["secondary_AAA"] for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check", "but we cannot yet IMO") # pass if secondary_aaa: # just continue without checking continue presence = getDatasetPresence(url, sec) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site, (there, frac)) in presence.items() if frac > 98.0] # one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations == None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only # sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog("assignor", "From secondary requirement, now Allowed%s" % sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy(sites_allowed) sites_with_data = copy.deepcopy(sites_allowed) sites_with_any_data = copy.deepcopy(sites_allowed) primary_locations = None available_fractions = {} set_lfn = "/store/mc" ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor", dataset_endpoints[prim] endpoints.update(dataset_endpoints[prim]) set_lfn = getLFNbase(prim) presence = getDatasetPresence(url, prim, only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction( url, prim, sites=[SI.CE_to_SE(site) 
for site in sites_allowed], only_blocks=blocks ) # sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] # sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, (there, frac)) in presence.items() if there] ] sites_with_data = [ site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.0] ] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] wfh.sendLog( "assignor", "Holding the data but not allowed %s" % sorted( list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])) ), ) if primary_locations == None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys())) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites = [] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO # opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list( (set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]) ) ] elif primary_locations: opportunistic_sites = [ SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed])) ] else: opportunistic_sites = [] wfh.sendLog("assignor", "We could be 
running in addition at %s" % sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog( "assignor", "One of the usable site is in downtime %s" % ([osite in SI.sites_not_ready for osite in opportunistic_sites]), ) down_time = True ## should this be send back to considered ? ## should be 2 but for the time-being let's lower it to get things going copies_wanted, cpuh = wfh.getNCopies() wfh.sendLog("assignor", "we need %s CPUh" % cpuh) if cpuh > max_cpuh_block and not options.go: # sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog( "assignor", "%s requires a large numbr of CPUh %s , not assigning, please check with requester" % (wfo.name, cpuh), level="critical", ) wfh.sendLog("assignor", "Requiring a large number of CPUh %s, not assigning" % cpuh) continue if ( "Campaign" in wfh.request and wfh.request["Campaign"] in CI.campaigns and "maxcopies" in CI.campaigns[wfh.request["Campaign"]] ): copies_needed_from_campaign = CI.campaigns[wfh.request["Campaign"]]["maxcopies"] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1, copies_wanted - less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog("assignor", "needed availability fraction %s" % copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([available >= copies_wanted for available in available_fractions.values()]): not_even_once = not all([available >= 1.0 for available in available_fractions.values()]) wfh.sendLog( "assignor", "The input dataset is not available %s times, only 
%s" % (copies_wanted, available_fractions.values()), ) if down_time and not options.go and not options.early: wfo.status = "considered" session.commit() wfh.sendLog("assignor", "sending back to considered because of site downtime, instead of waiting") # sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog( "assignor", "%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered." % (wfo.name), level="delay", ) continue # pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open("cannot_assign.json").read()) except: pass if ( not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial ): wfh.sendLog( "assignor", "cannot be assigned, %s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)), ) sendEmail( "cannot be assigned", "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)), ) known.append(wfo.name) open("cannot_assign.json", "w").write(json.dumps(known, indent=2)) n_stalled += 1 if options.early: if wfo.status == "considered": wfh.sendLog("assignor", "setting considered-tried") wfo.status = "considered-tried" session.commit() else: print "tried but status is", wfo.status if options.partial: print "Will move on with partial locations" else: continue ## default back to white list to original white list with any data print "Allowed", sorted(sites_allowed) if primary_aaa: sites_allowed = initial_sites_allowed options.TrustSitelists = True wfh.sendLog("assignor", "Selected to read primary through xrootd %s" % sorted(sites_allowed)) else: sites_allowed = sites_with_any_data wfh.sendLog("assignor", "Selected for any data %s" % sorted(sites_allowed)) if secondary_aaa: options.TrustPUSitelists = 
True wfh.sendLog("assignor", "Reading secondary through xrootd from %s" % sorted(sites_allowed)) ### check on endpoints for on-going transfers if endpoints and options.partial: sites_allowed = list(set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints])) print "with added endpoints", sorted(sites_allowed) if not len(sites_allowed): wfh.sendLog("assignor", "cannot be assign with no matched sites") sendLog("assignor", "%s has no whitelist" % wfo.name, level="critical") n_stalled += 1 continue t1_only = [ce for ce in sites_allowed if ce.startswith("T1")] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] wfh.sendLog("assignor", "Placing the output on %s" % sites_out) parameters = { "SiteWhitelist": sites_allowed, "NonCustodialSites": sites_out, "AutoApproveSubscriptionSites": list(set(sites_out)), "AcquisitionEra": wfh.acquisitionEra(), "ProcessingString": wfh.processingString(), "MergedLFNBase": set_lfn, "ProcessingVersion": version, } ## plain assignment here team = "production" if os.getenv("UNIFIED_TEAM"): team = os.getenv("UNIFIED_TEAM") if options and options.team: team = options.team if False and "T2_CH_CERN" in parameters["SiteWhitelist"]: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters["SiteWhitelist"] = ["T2_CH_CERN_HLT"] team = "hlt" ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v = getattr(options, key) if v != None: if type(v) == str and "," in v: parameters[key] = filter(None, v.split(",")) else: parameters[key] = v if lheinput: ## throttle reading LHE article 
wfh.sendLog("assignor", "Setting the number of events per job to 500k max") parameters["EventsPerJob"] = 500000 ## pick up campaign specific assignment parameters parameters.update(CI.parameters(wfh.request["Campaign"])) if not options.test: parameters["execute"] = True split_check = wfh.checkWorkflowSplitting() if split_check != True: parameters.update(split_check) if "EventBased" in split_check.values(): wfh.sendLog("assignor", "Falling back to event splitting.") # sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog( "assignor", "the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting" % wfo.name, level="critical", ) elif "EventsPerJob" in split_check.values(): wfh.sendLog("assignor", "Modifying the number of job per event") # sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog( "assignor", "the workflow %s is too heavy in number of jobs explosion" % wfo.name, level="critical" ) # Handle run-dependent MC pstring = wfh.processingString() if "PU_RD" in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi)) reqJobs = 500 if "PU_RD2" in pstring: reqJobs = 2000 eventsPerJob = int(numEvents / (reqJobs * 1.4)) lumisPerJob = int(eventsPerJob / eventsPerLumi) if lumisPerJob == 0: # sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog( "assignor", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob), level="critical", ) wfh.sendLog("assignor", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob)) parameters["EventsPerJob"] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl["events_per_job"] if "events_per_job" 
in spl else None eventsPerJobEstimated = spl["avg_events_per_job"] if "avg_events_per_job" in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: # sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog( "assignor", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level="critical" ) wfh.sendLog("assignor", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob)) parameters["LumisPerJob"] = lumisPerJob else: # sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog( "assignor", "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name), level="critical", ) wfh.sendLog("assignor", "leaving splitting untouched for PU_RD*, please check.") result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = "away" session.commit() n_assigned += 1 wfh.sendLog("assignor", "Properly assigned\n%s" % (json.dumps(parameters, indent=2))) try: ## refetch information and lock output new_wfi = workflowInfo(url, wfo.name) (_, prim, _, sec) = new_wfi.getIO() for secure in list(prim) + list(sec) + new_wfi.request["OutputDatasets"]: ## lock all outputs flat NLI.lock(secure) # for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output", str(e)) else: print "ERROR could not assign", wfo.name else: pass print "Assignment summary:" sendLog("assignor", "Assigned %d Stalled %s" % (n_assigned, n_stalled))
def recoveror(url,specific,options=None): if userLock('recoveror'): return up = componentInfo() CI = campaignInfo() UC = unifiedConfiguration() def make_int_keys( d ): for code in d: d[int(code)] = d.pop(code) error_codes_to_recover = UC.get('error_codes_to_recover') error_codes_to_block = UC.get('error_codes_to_block') error_codes_to_notify = UC.get('error_codes_to_notify') make_int_keys( error_codes_to_recover ) make_int_keys( error_codes_to_block ) make_int_keys( error_codes_to_notify ) wfs = session.query(Workflow).filter(Workflow.status == 'assistance-recovery').all() if specific: wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-manual').all() ) for wfo in wfs: if specific and not specific in wfo.name:continue if not specific and 'manual' in wfo.status: continue wfi = workflowInfo(url, wfo.name, deprecated=True) ## need deprecated info for mergedlfnbase ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves all_errors = None try: wfi.getSummary() all_errors = wfi.summary['errors'] except: pass print '-'*100 print "Looking at",wfo.name,"for recovery options" if not len(all_errors): print "\tno error for",wfo.name task_to_recover = defaultdict(list) message_to_ops = "" message_to_user = "" recover=True if 'LheInputFilese' in wfi.request and wfi.request['LheInputFiles']: ## we do not try to recover pLHE recover = False if 'Campaign' in wfi.request: c = wfi.request['Campaign'] if c in CI.campaigns and 'recover' in CI.campaigns[c]: recover=CI.campaigns[c]['recover'] for task,errors in all_errors.items(): print "\tTask",task ## collect all error codes and #jobs regardless of step at which it occured all_codes = [] for name, codes in errors.items(): if type(codes)==int: continue all_codes.extend( [(int(code),info['jobs'],name,list(set([e['type'] for e in info['errors']])),list(set([e['details'] for e in info['errors']])) ) for code,info in codes.items()] ) 
all_codes.sort(key=lambda i:i[1], reverse=True) sum_failed = sum([l[1] for l in all_codes]) for errorCode,njobs,name,types,details in all_codes: rate = 100*njobs/float(sum_failed) #print ("\t\t %10d (%6s%%) failures with error code %10d (%"+str(max_legend)+"s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, legend, name) print ("\t\t %10d (%6s%%) failures with error code %10d (%30s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, ','.join(types), name) added_in_recover=False #if options.go: # force the recovery of any task with error ? if errorCode in error_codes_to_recover: ## the error code is registered for case in error_codes_to_recover[errorCode]: match = case['details'] matched= (match==None) if not matched: matched=False for detail in details: if match in detail: print "[recover] Could find keyword",match,"in" print 50*"#" print detail print 50*"#" matched = True break if matched and rate > case['rate']: print "\t\t => we should be able to recover that", case['legend'] task_to_recover[task].append( (code,case) ) added_in_recover=True message_to_user = "" else: print "\t\t recoverable but not frequent enough, needs",case['rate'] if errorCode in error_codes_to_block: for case in error_codes_to_block[errorCode]: match = case['details'] matched= (match==None) if not matched: matched=False for detail in details: if match in detail: print "[block] Could find keyword",match,"in" print 50*"#" print detail print 50*"#" matched = True break if matched and rate > case['rate']: print "\t\t => that error means no ACDC on that workflow", case['legend'] if not options.go: message_to_ops += "%s has an error %s blocking an ACDC.\n%s\n "%( wfo.name, errorCode, '#'*50 ) recover = False added_in_recover=False if errorCode in error_codes_to_notify and not added_in_recover: print "\t\t => we should notify people on this" message_to_user += "%s has an error %s in processing.\n%s\n" %( wfo.name, errorCode, '#'*50 ) if message_to_user: print wfo.name,"to be notified to 
user(DUMMY)",message_to_user if message_to_ops: sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**','*****@*****.**']) if task_to_recover and recover: print "Initiating recovery" print ', '.join(task_to_recover.keys()),"to be recovered" recovering=set() for task in task_to_recover: print "Will be making a recovery workflow for",task ## from here you can fetch known solutions, to known error codes actions = list(set([case['solution'] for code,case in task_to_recover[task] ])) acdc = singleRecovery(url, task, wfi.request , actions, do = options.do) if not acdc: if options.do: if recovering: print wfo.name,"has been partially ACDCed. Needs manual attention" sendEmail( "failed ACDC partial recovery","%s has had %s/%s recoveries %s only"%( wfo.name, len(recovering), len(task_to_recover), list(recovering)), destination=['*****@*****.**','*****@*****.**']) continue else: print wfo.name,"failed recovery once" break else: print "no action to take further" sendEmail("an ACDC that can be done automatically","please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details", destination=['*****@*****.**','*****@*****.**']) continue ## and assign it ? 
team = wfi.request['Teams'][0] parameters={ 'SiteWhitelist' : wfi.request['SiteWhitelist'], 'AcquisitionEra' : wfi.acquisitionEra(), 'ProcessingString' : wfi.processingString(), 'MergedLFNBase' : wfi.deprecated_request['MergedLFNBase'], 'ProcessingVersion' : wfi.request['ProcessingVersion'], } if options.ass: print "really doing the assignment of the ACDC",acdc parameters['execute']=True sendEmail("an ACDC was done and WAS assigned", "%s was assigned, please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details"%( acdc ), destination=['*****@*****.**','*****@*****.**']) else: print "no assignment done with this ACDC",acdc sendEmail("an ACDC was done and need to be assigned", "%s needs to be assigned, please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details"%( acdc ), destination=['*****@*****.**','*****@*****.**']) result = reqMgrClient.assignWorkflow(url, acdc, team, parameters) recovering.add( acdc ) if recovering: #if all went well, set the status to -recovering current = wfo.status if options.ass: current = current.replace('recovery','recovering') else: current = 'assistance-manual' print wfo.name,"setting the status to",current print ', '.join( recovering ) wfo.status = current session.commit() else: ## this workflow should be handled manually at that point print wfo.name,"needs manual intervention" wfo.status = 'assistance-manual' session.commit()
def assignor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() SI = global_SI #LI = lockInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos=[] if specific or options.early: wfos.extend( session.query(Workflow).filter(Workflow.status=='considered').all()) wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) if specific: wfos.extend( session.query(Workflow).filter(Workflow.status=='considered-tried').all()) wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read()) max_per_round = UC.get('max_per_round').get('assignor',None) max_cpuh_block = UC.get('max_cpuh_block') random.shuffle( wfos ) for wfo in wfos: if options.limit and (n_stalled+n_assigned)>options.limit: break if max_per_round and (n_stalled+n_assigned)>max_per_round: break if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n" wfh = workflowInfo( url, wfo.name) wfh.sendLog('assignor',"%s to be assigned"%wfo.name) ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList() ## check if by configuration we gave it a GO no_go = False allowed_secondary = set() for campaign in wfh.getCampaigns(): if not CI.go( campaign ): wfh.sendLog('assignor',"No go for %s"%campaign) if not options.go: n_stalled+=1 no_go = True break if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) if (secondary and allowed_secondary) and (set(secondary)&allowed_secondary!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary))) #sendEmail('secondary 
not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary))) sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary)), level='critical') if not options.go: n_stalled+=1 no_go = True if no_go: continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': if not options.test: wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name) ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name,wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: wfh.sendLog('assignor',"cannot decide on version number") n_stalled+=1 wfo.status = 'trouble' session.commit() continue original_sites_allowed = copy.deepcopy( sites_allowed ) wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed)) override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', []) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) )) wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed)) secondary_locations=None for sec in list(secondary): if override_sec_location: print "We don't care where the secondary is" print "Cannot pass for now" sendEmail("tempting to pass sec location check","but we cannot yet IMO") #pass presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.] 
#one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] wfh.sendLog('assignor',"From secondary requirement, now Allowed%s"%sorted(sites_allowed)) initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} set_lfn = '/store/mc' ## by default endpoints = set() for prim in list(primary): if prim in dataset_endpoints: print "endpoints from stagor",dataset_endpoints[prim] endpoints.update( dataset_endpoints[prim] ) set_lfn = getLFNbase( prim ) presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] sites_with_data = [site for site in sites_with_data if 
SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])))) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations and primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites)) if any([osite in SI.sites_not_ready for osite in opportunistic_sites]): wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite in SI.sites_not_ready for osite in opportunistic_sites])) down_time = True ## should this be send back to considered ? 
## should be 2 but for the time-being let's lower it to get things going copies_wanted,cpuh = wfh.getNCopies() wfh.sendLog('assignor',"we need %s CPUh"%cpuh) if cpuh>max_cpuh_block and not options.go: #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**']) sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical') wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh) continue if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_wanted = min(copies_needed_from_campaign, copies_wanted) if not options.early: less_copies_than_requested = UC.get("less_copies_than_requested") copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency else: ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going pass wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted) ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): not_even_once = not all([available>=1. 
for available in available_fractions.values()]) wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values())) if down_time and not options.go and not options.early: wfo.status = 'considered' session.commit() wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting") #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay') continue #pass print json.dumps(available_fractions) if (options.go and not_even_once) or not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial: wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) n_stalled+=1 if options.early: if wfo.status == 'considered': wfh.sendLog('assignor',"setting considered-tried") wfo.status = 'considered-tried' session.commit() else: print "tried but status is",wfo.status if options.partial: print "Will move on with partial locations" else: continue ## default back to white list to original white list with any data print "Allowed",sorted(sites_allowed) if options.primary_aaa: sites_allowed = initial_sites_allowed #options.useSiteListAsLocation = True options.TrustSitelists = True else: sites_allowed = sites_with_any_data wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed)) if 
options.restrict: print "Allowed",sites_allowed sites_allowed = sites_with_any_data print "Selected",sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?" print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) ### check on endpoints for on-going transfers if endpoints and options.partial: sites_allowed = list(set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints])) print "with added endpoints",sorted(sites_allowed) #if options.partial: # continue if not len(sites_allowed): wfh.sendLog('assignor',"cannot be assign with no matched sites") #sendEmail( "cannot be assigned","%s has no whitelist"%(wfo.name)) sendLog('assignor','%s has no whitelist'% wfo.name, level='critical') n_stalled+=1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] ## one last modification now that we know we can assign, and to make sure all ressource can be used by the request : set all ON sites to whitelist ###sites_allowed = original_sites_allowed ## not needed, afterall as secondary jobs go their own ways wfh.sendLog('assignor',"Placing the output on %s"%sites_out) parameters={ 'SiteWhitelist' : sites_allowed, #'CustodialSites' : sites_custodial, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : 
wfh.processingString(), 'MergedLFNBase' : set_lfn, 'ProcessingVersion' : version, } ## plain assignment here team='production' if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM') if options and options.team: team = options.team ## high priority team agent #if wfh.request['RequestPriority'] >= 100000 and (wfh.request['TimePerEvent']*int(wfh.getRequestNumEvents()))/(8*3600.) < 10000: # team = 'highprio' # sendEmail("sending work with highprio team","%s"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]): # ## consider SDSC # parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC'] # parameters['useSiteListAsLocation'] = True # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**']) ## SDSC redirection #if wfh.request['Campaign']==R'unIIWinter15GS' and random.random() < -1.0: # parameters['SiteWhitelist'] = ['T3_US_SDSC'] # team = 'allocation-based' # sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**']) if False and 'T2_CH_CERN' in parameters['SiteWhitelist']: ## add some check on ### the amount pending to HLT ### the size of the request ### the priority of the request (maybe not if we decide to overflow during runs) parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT'] team = 'hlt' ## reduce the splitting by factor of 4, regardless of type of splitting sendEmail("sending work to HLT","%s was assigned to HLT"%wfo.name) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if type(v)==str and ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment 
parameters parameters.update( CI.parameters(wfh.request['Campaign']) ) if not options.test: parameters['execute'] = True split_check = wfh.checkWorkflowSplitting() if split_check!=True: parameters.update( split_check ) if 'EventBased' in split_check.values(): wfh.sendLog('assignor', "Falling back to event splitting.") #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) sendLog('assignor','the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting'%wfo.name, level='critical') elif 'EventsPerJob' in split_check.values(): wfh.sendLog('assignor', "Modifying the number of job per event") #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name) sendLog('assignor',"the workflow %s is too heavy in number of jobs explosion"%wfo.name, level='critical') # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical') wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: 
#sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical') wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical') wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.") result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']: ## lock all outputs flat NLI.lock( secure ) #for site in [SI.CE_to_SE(site) for site in sites_allowed]: # for output in new_wfi.request['OutputDatasets']: # LI.lock( output, site, 'dataset in production') # for primary in prim: # LI.lock( primary, site, 'dataset used in input') # for secondary in sec: # LI.lock( secondary, site, 'required for mixing' ) except Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))
def actor(url, options=None):
    """Carry out the operator actions queued in the workflow traffic controller.

    The pending actions come from ``wtcClient().get_actions()`` or, when
    ``options.actions`` is set, from that JSON file. Each workflow is handled
    according to its ``Action`` field:

    * ``clone``   -- invalidate the workflow family, clone the workflow and
      set the original to ``trouble`` (only when ``options.do`` is set);
    * ``special`` with ``Parameters['action']`` in ``by-pass``/``bypass`` or
      ``onhold``/``on-hold`` -- register a ``force`` or ``hold`` entry in
      ``wtcInfo``;
    * ``acdc``    -- create one ACDC per requested task and, when
      ``options.ass`` is set, assign it.

    When the handling succeeded and ``options.do`` is set, the action is
    removed from the controller. If that removal fails, a lock file is
    touched and the process exits with status -1.

    Args:
        url: request manager host, forwarded to the workflow helpers.
        options: parsed command line options. The attributes ``actions``,
            ``spec``, ``do`` and ``ass`` are read unconditionally, so the
            ``None`` default is not usable in practice.

    Returns:
        None.

    NOTE(review): this module defines ``actor`` a second time further down;
    being later, that definition is the one bound at import time. One of the
    two copies should be dropped.
    """
    # Every print below takes one pre-formatted string, so the output is the
    # same whether ``print`` is parsed as a statement or as a function.
    mlock = moduleLock(wait=False, silent=True)
    if mlock():
        return
    if userLock('actor'):
        return

    up = componentInfo(soft=['mcm'])
    if not up.check():
        return

    SI = siteInfo()
    UC = unifiedConfiguration()
    WC = wtcClient()
    WI = wtcInfo()
    JC = JIRAClient()

    action_list = WC.get_actions()
    if action_list is None:
        print("Not able to load action list")
        sendLog('actor', 'Not able to load action list', level='critical')
        return

    if options.actions:
        # Local override of the action list; close the file deterministically.
        with open(options.actions) as actions_file:
            action_list = json.loads(actions_file.read())

    print(json.dumps(action_list, indent=2))
    if not action_list:
        print("EMPTY!")
        return

    wf_list = list(action_list.keys())
    print(json.dumps(sorted(wf_list), indent=2))
    if options.spec:
        wf_list = [wf for wf in wf_list if options.spec in wf]

    max_per_round = UC.get('max_per_round').get('actor', None)
    if max_per_round:
        # Random subset so that the same workflows are not always served first.
        random.shuffle(wf_list)
        wf_list = wf_list[:max_per_round]

    for wfname in wf_list:
        print('-' * 100)
        print("Looking at %s for recovery options" % wfname)

        entry = action_list[wfname]
        requested = entry.get('Action', None)
        tasks = entry.get('Parameters', None)
        to_acdc = requested == 'acdc'
        to_clone = requested == 'clone'
        # 'Parameters' is only dereferenced for 'special' actions.
        to_force = (requested == 'special'
                    and entry.get('Parameters', {}).get('action', None)
                    in ['by-pass', 'bypass'])
        to_hold = (requested == 'special'
                   and entry.get('Parameters', {}).get('action', None)
                   in ['onhold', 'on-hold'])

        if not to_acdc and not to_clone and not to_force and not to_hold:
            sendLog(
                'actor',
                'Action submitted for something other than acdc, clone, bypass or hold for workflow %s'
                % wfname,
                level='critical')
            print(json.dumps(entry, indent=2))
            continue

        if not tasks and to_acdc:
            sendLog('actor',
                    'Empty action submitted for workflow %s' % wfname,
                    level='critical')
            print("Moving on. Parameters is blank for " + wfname)
            continue

        wfi = workflowInfo(url, wfname)

        # Stays True while the action can be removed from the controller.
        recover = True

        #===========================================================
        if to_clone and options.do:
            print("Let's try kill and clone: ")
            wfi.sendLog('actor', 'Going to clone %s' % wfname)
            comment = ""
            # A clone request may come without Parameters (tasks is None).
            if tasks and 'comment' in tasks:
                comment = ", reason: " + tasks['comment']
            wfi.sendLog(
                'actor',
                "invalidating the workflow by traffic controller %s" % comment)

            # Reject all workflows in the family
            inv_results = invalidate(url,
                                     wfi,
                                     only_resub=False,
                                     with_output=True)
            if all(inv_results):
                wfi.sendLog('actor', "%s and children are rejected" % wfname)
            else:
                wfi.sendLog('actor',
                            "Failed to reject the request and dependents")
                sendLog('actor',
                        'Failed to reject the familly of %s' % wfname,
                        level='critical')
                continue

            cloned = None
            try:
                cloned = singleClone(url, wfname, tasks, comment, options.do)
            except Exception as e:
                sendLog(
                    'actor',
                    'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.'
                    % wfname,
                    level='critical')
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                print(str(e))
                # Do not remove the action: otherwise the workflow goes to
                # "trouble" and the WTC cannot set the action again.
            if not cloned:
                recover = False
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                sendLog('actor',
                        'Failed to create clone for %s!' % wfname,
                        level='critical')
            else:
                wfi.sendLog('actor',
                            "Workflow %s cloned into %s" % (wfname, cloned))
                # set to trouble for swift replacement
                for wfo in session.query(Workflow).filter(
                        Workflow.name == wfname).all():
                    wfo.status = 'trouble'
                    session.commit()
        #===========================================================
        elif to_force:
            wfi.sendLog(
                'actor',
                'Force-completing from workflow traffic controler request')
            WI.add(action='force',
                   keyword=wfname,
                   user=entry.get('user', 'unified'))
        elif to_hold:
            wfi.sendLog('actor',
                        'Holding on workflow traffic controler request')
            WI.add(action='hold',
                   keyword=wfname,
                   user=entry.get('user', 'unified'))
        #===========================================================
        elif to_acdc:
            if 'AllSteps' in tasks:
                # Settings under 'AllSteps' are applied to every task. Each
                # per-task entry is a dict (it is indexed by setting name
                # below), so the value is assigned whether or not the task
                # already carries that setting.
                all_tasks_defaults = tasks.pop('AllSteps')
                for setting in all_tasks_defaults:
                    for task in tasks:
                        tasks[task][setting] = all_tasks_defaults[setting]
            print("Tasks is ")
            print(json.dumps(tasks, indent=2))

            all_tasks = wfi.getAllTasks()

            # need a way to verify that this is the first round of ACDC, since
            # the second round will have to be on the ACDC themselves
            try:
                WMErr = wfi.getWMErrors()
            except Exception:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because WMErr cannot be reached.'
                    % wfname,
                    level='critical')
                continue
            if not WMErr:
                # Not fatal: recovery is still attempted.
                wfi.sendLog('actor', 'WMErrors is blank for %s.' % wfname)
                print(
                    "FYI getWMErrors is blank. Presumably there are only unreported errors"
                )

            try:
                (where_to_run, _missing_to_run,
                 _missing_to_run_at) = wfi.getRecoveryInfo()
            except Exception:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because recovery info cannot be found.'
                    % wfname,
                    level='critical')
                print("Moving on. Cannot access recovery info for " + wfname)
                continue
            print("Where to run = ")
            print(where_to_run)
            if not where_to_run:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because recovery info cannot be found.'
                    % wfname,
                    level='critical')
                continue

            # Only used in the "partially ACDC'ed" report below.
            num_tasks_to_recover = 0
            if WMErr:
                for task in WMErr:
                    if 'LogCollect' in task or 'Cleanup' in task:
                        continue
                    if 'jobfailed' in WMErr[task]:
                        num_tasks_to_recover += 1

            if not num_tasks_to_recover:
                print("\tno error for %s" % wfname)

            if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
                # we do not try to recover pLHE
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because it is a pLHE workflow.'
                    % wfname,
                    level='critical')
                print("We don't try to recover pLHE. Moving on.")
                recover = False

            recovering = set()
            for task in tasks:
                assign_to_sites = set()
                print("Task names is " + task)
                fulltaskname = '/' + wfname + '/' + task
                print("Full task name is " + fulltaskname)
                print(where_to_run.keys())

                wrong_task = False
                for task_info in all_tasks:
                    if fulltaskname == task_info.pathName:
                        if task_info.taskType not in [
                                'Processing', 'Production', 'Merge'
                        ]:
                            wrong_task = True
                            wfi.sendLog(
                                'actor',
                                "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"
                                % (fulltaskname, task_info.taskType))
                if fulltaskname not in where_to_run:
                    wrong_task = True
                    wfi.sendLog(
                        'actor',
                        "Skipping task %s because there is no acdc doc for it anyways."
                        % (fulltaskname))
                if wrong_task:
                    continue

                print(tasks[task])
                actions = tasks[task]
                for action in actions:
                    if action.startswith('sites'):
                        if not isinstance(actions[action], list):
                            assign_to_sites = [SI.SE_to_CE(actions[action])]
                        else:
                            assign_to_sites = list(
                                {SI.SE_to_CE(site)
                                 for site in actions[action]})
                if 'sites' not in actions:
                    # No explicit site list: fall back to the recovery info.
                    assign_to_sites = list(
                        {SI.SE_to_CE(site)
                         for site in where_to_run[fulltaskname]})
                    print(
                        "Found %s as sites where to run the ACDC at, from the acdc doc of  %s"
                        % (sorted(assign_to_sites), wfname))
                print("Going to run at %s" % (sorted(assign_to_sites), ))

                if not recover:
                    continue

                print("Initiating recovery")
                acdc = singleRecovery(url,
                                      fulltaskname,
                                      wfi.request,
                                      actions,
                                      do=options.do)
                if not acdc:
                    if not options.do:
                        print("no action to take further")
                        continue
                    if recovering:
                        # Some ACDC were already assigned: report and stop,
                        # leaving `recover` untouched.
                        print(wfname +
                              " has been partially ACDC'ed. Needs manual attention.")
                        sendLog('actor',
                                "%s has had %s/%s recoveries %s only" %
                                (wfname, len(recovering),
                                 num_tasks_to_recover, list(recovering)),
                                level='critical')
                        wfi.sendLog(
                            'actor', "%s has had %s/%s recoveries %s only" %
                            (wfname, len(recovering), num_tasks_to_recover,
                             list(recovering)))
                    else:
                        print(wfname + " failed recovery once")
                        recover = False
                    break

                # ACDC was made correctly. Now we have to assign it.
                wfi.sendLog(
                    'actor', 'ACDC created for task %s. Actions taken \n%s' %
                    (fulltaskname, json.dumps(actions)))
                jira_comment = "%s created ACDC for task %s with action %s" % (
                    entry.get('user', 'unified'),
                    task.split('/')[-1],
                    json.dumps(actions),
                )
                reason = entry.get('Reason', None)
                if reason:
                    jira_comment += '\ndue to: %s' % (reason)

                team = 'production'
                parameters = {
                    'SiteWhitelist': sorted(assign_to_sites),
                    'AcquisitionEra': wfi.acquisitionEra(),
                    'ProcessingString': wfi.processingString(),
                    'MergedLFNBase': wfi.request['MergedLFNBase'],
                    'ProcessingVersion': wfi.request['ProcessingVersion'],
                }
                # hackery for ACDC merge assignment
                if (wfi.request['RequestType'] == 'TaskChain'
                        and 'Merge' in task.split('/')[-1]):
                    parameters['AcquisitionEra'] = None
                    parameters['ProcessingString'] = None

                # xrootd settings on primary and secondary
                if 'xrootd' in actions:
                    if actions['xrootd'] == 'enabled':
                        print("Going to assign via xrootd")
                        parameters['TrustSitelists'] = True
                    elif actions['xrootd'] == 'disabled':
                        parameters['TrustSitelists'] = False
                elif ('TrustSitelists' in wfi.request
                      and wfi.request['TrustSitelists'] == 'true'):
                    parameters['TrustSitelists'] = True
                else:
                    parameters['TrustSitelists'] = False

                if 'secondary' in actions:
                    if actions['secondary'] == 'enabled':
                        print('Enabling reading the secondary input via xrootd')
                        parameters['TrustPUSitelists'] = True
                    elif actions['secondary'] == 'disabled':
                        parameters['TrustPUSitelists'] = False
                    # in case secondary is blank or not set to enabled or disabled
                    elif ('TrustPUSitelists' in wfi.request
                          and wfi.request['TrustPUSitelists']):
                        parameters['TrustPUSitelists'] = True
                elif ('TrustPUSitelists' in wfi.request
                      and wfi.request['TrustPUSitelists']):
                    parameters['TrustPUSitelists'] = True

                if options.ass:
                    print("really doing the assignment of the ACDC %s" %
                          (acdc, ))
                    parameters['execute'] = True
                else:
                    print("no assignment done with this ACDC %s" % (acdc, ))
                    sendLog('actor',
                            "%s needs to be assigned" % (acdc),
                            level='critical')
                    wfi.sendLog('actor',
                                "%s needs to be assigned by hand" % (acdc))
                    continue

                result = reqMgrClient.assignWorkflow(url, acdc, team,
                                                     parameters)
                if not result:
                    print("%s was not assigned" % (acdc, ))
                    sendLog('actor',
                            "%s failed to be assigned" % (acdc),
                            level='critical')
                    wfi.sendLog(
                        'actor',
                        "%s failed to get assigned for recovery" % acdc)
                else:
                    wfi.sendLog('actor',
                                "%s was assigned for recovery" % acdc)
                    recovering.add(acdc)

                # Best effort: a JIRA failure must not abort the recovery.
                try:
                    if jira_comment:
                        jiras = JC.find({'prepid': wfi.request['PrepID']})
                        if len(jiras) == 1:
                            # put a comment on the single corresponding ticket
                            JC.comment(jiras[0].key, jira_comment)
                            JC.progress(jiras[0].key)
                except Exception as e:
                    print("failed with JIRA")
                    print(str(e))

        #===========================================================
        if recover and options.do:
            r = WC.remove_action(wfname)
            if not r:
                sendLog(
                    'actor',
                    'not able to remove the action, interlocking the module',
                    level='critical')
                os.system('touch %s/actor.failed-%s.lock' %
                          (base_eos_dir, os.getpid()))
                sys.exit(-1)

            # update the status with recovering removing manual
            # NOTE(review): assumed to belong to this branch, i.e. the status
            # only changes once the action was really handled -- confirm.
            for wfo in session.query(Workflow).filter(
                    Workflow.name == wfname).all():
                wfo.status = wfo.status.replace('manual', 'recovering')
                session.commit()
def actor(url, options=None):
    """Carry out the operator actions queued in the workflow traffic controller.

    The pending actions come from ``wtcClient().get_actions()`` or, when
    ``options.actions`` is set, from that JSON file. Each workflow is handled
    according to its ``Action`` field:

    * ``clone``   -- invalidate the workflow family, clone the workflow and
      set the original to ``trouble`` (only when ``options.do`` is set);
    * ``special`` with ``Parameters['action']`` in ``by-pass``/``bypass`` or
      ``onhold``/``on-hold`` -- register a ``force`` or ``hold`` entry in
      ``wtcInfo``;
    * ``acdc``    -- create one ACDC per requested task and, when
      ``options.ass`` is set, assign it.

    When the handling succeeded and ``options.do`` is set, the action is
    removed from the controller. If that removal fails, a lock file is
    touched and the process exits with status -1.

    Args:
        url: request manager host, forwarded to the workflow helpers.
        options: parsed command line options. The attributes ``actions``,
            ``spec``, ``do`` and ``ass`` are read unconditionally, so the
            ``None`` default is not usable in practice.

    Returns:
        None.

    NOTE(review): an earlier definition of ``actor`` in this module is
    overridden by this one; the earlier copy should be dropped.
    """
    # Every print below takes one pre-formatted string, so the output is the
    # same whether ``print`` is parsed as a statement or as a function.
    mlock = moduleLock(wait=False, silent=True)
    if mlock():
        return
    if userLock('actor'):
        return

    up = componentInfo(soft=['mcm'])
    if not up.check():
        return

    SI = siteInfo()
    UC = unifiedConfiguration()
    WC = wtcClient()
    WI = wtcInfo()
    JC = JIRAClient()

    action_list = WC.get_actions()
    if action_list is None:
        print("Not able to load action list")
        sendLog('actor', 'Not able to load action list', level='critical')
        return

    if options.actions:
        # Local override of the action list; close the file deterministically.
        with open(options.actions) as actions_file:
            action_list = json.loads(actions_file.read())

    print(json.dumps(action_list, indent=2))
    if not action_list:
        print("EMPTY!")
        return

    wf_list = list(action_list.keys())
    print(json.dumps(sorted(wf_list), indent=2))
    if options.spec:
        wf_list = [wf for wf in wf_list if options.spec in wf]

    max_per_round = UC.get('max_per_round').get('actor', None)
    if max_per_round:
        # Random subset so that the same workflows are not always served first.
        random.shuffle(wf_list)
        wf_list = wf_list[:max_per_round]

    for wfname in wf_list:
        print('-' * 100)
        print("Looking at %s for recovery options" % wfname)

        entry = action_list[wfname]
        requested = entry.get('Action', None)
        tasks = entry.get('Parameters', None)
        to_acdc = requested == 'acdc'
        to_clone = requested == 'clone'
        # 'Parameters' is only dereferenced for 'special' actions.
        to_force = (requested == 'special'
                    and entry.get('Parameters', {}).get('action', None)
                    in ['by-pass', 'bypass'])
        to_hold = (requested == 'special'
                   and entry.get('Parameters', {}).get('action', None)
                   in ['onhold', 'on-hold'])

        if not to_acdc and not to_clone and not to_force and not to_hold:
            sendLog(
                'actor',
                'Action submitted for something other than acdc, clone, bypass or hold for workflow %s'
                % wfname,
                level='critical')
            print(json.dumps(entry, indent=2))
            continue

        if not tasks and to_acdc:
            sendLog('actor',
                    'Empty action submitted for workflow %s' % wfname,
                    level='critical')
            print("Moving on. Parameters is blank for " + wfname)
            continue

        wfi = workflowInfo(url, wfname)

        # Stays True while the action can be removed from the controller.
        recover = True

        #===========================================================
        if to_clone and options.do:
            print("Let's try kill and clone: ")
            wfi.sendLog('actor', 'Going to clone %s' % wfname)
            comment = ""
            # A clone request may come without Parameters (tasks is None).
            if tasks and 'comment' in tasks:
                comment = ", reason: " + tasks['comment']
            wfi.sendLog(
                'actor',
                "invalidating the workflow by traffic controller %s" % comment)

            # Reject all workflows in the family
            inv_results = invalidate(url,
                                     wfi,
                                     only_resub=False,
                                     with_output=True)
            if all(inv_results):
                wfi.sendLog('actor', "%s and children are rejected" % wfname)
            else:
                wfi.sendLog('actor',
                            "Failed to reject the request and dependents")
                sendLog('actor',
                        'Failed to reject the familly of %s' % wfname,
                        level='critical')
                continue

            cloned = None
            try:
                cloned = singleClone(url, wfname, tasks, comment, options.do)
            except Exception as e:
                sendLog(
                    'actor',
                    'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.'
                    % wfname,
                    level='critical')
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                print(str(e))
                # Do not remove the action: otherwise the workflow goes to
                # "trouble" and the WTC cannot set the action again.
            if not cloned:
                recover = False
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                sendLog('actor',
                        'Failed to create clone for %s!' % wfname,
                        level='critical')
            else:
                wfi.sendLog('actor',
                            "Workflow %s cloned into %s" % (wfname, cloned))
                # set to trouble for swift replacement
                for wfo in session.query(Workflow).filter(
                        Workflow.name == wfname).all():
                    wfo.status = 'trouble'
                    session.commit()
        #===========================================================
        elif to_force:
            wfi.sendLog(
                'actor',
                'Force-completing from workflow traffic controler request')
            WI.add(action='force',
                   keyword=wfname,
                   user=entry.get('user', 'unified'))
        elif to_hold:
            wfi.sendLog('actor',
                        'Holding on workflow traffic controler request')
            WI.add(action='hold',
                   keyword=wfname,
                   user=entry.get('user', 'unified'))
        #===========================================================
        elif to_acdc:
            if 'AllSteps' in tasks:
                # Settings under 'AllSteps' are applied to every task. Each
                # per-task entry is a dict (it is indexed by setting name
                # below), so the value is assigned whether or not the task
                # already carries that setting.
                all_tasks_defaults = tasks.pop('AllSteps')
                for setting in all_tasks_defaults:
                    for task in tasks:
                        tasks[task][setting] = all_tasks_defaults[setting]
            print("Tasks is ")
            print(json.dumps(tasks, indent=2))

            all_tasks = wfi.getAllTasks()

            # need a way to verify that this is the first round of ACDC, since
            # the second round will have to be on the ACDC themselves
            try:
                WMErr = wfi.getWMErrors()
            except Exception:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because WMErr cannot be reached.'
                    % wfname,
                    level='critical')
                continue
            if not WMErr:
                # Not fatal: recovery is still attempted.
                wfi.sendLog('actor', 'WMErrors is blank for %s.' % wfname)
                print(
                    "FYI getWMErrors is blank. Presumably there are only unreported errors"
                )

            try:
                (where_to_run, _missing_to_run,
                 _missing_to_run_at) = wfi.getRecoveryInfo()
            except Exception:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because recovery info cannot be found.'
                    % wfname,
                    level='critical')
                print("Moving on. Cannot access recovery info for " + wfname)
                continue
            print("Where to run = ")
            print(where_to_run)
            if not where_to_run:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because recovery info cannot be found.'
                    % wfname,
                    level='critical')
                continue

            # Only used in the "partially ACDC'ed" report below.
            num_tasks_to_recover = 0
            if WMErr:
                for task in WMErr:
                    if 'LogCollect' in task or 'Cleanup' in task:
                        continue
                    if 'jobfailed' in WMErr[task]:
                        num_tasks_to_recover += 1

            if not num_tasks_to_recover:
                print("\tno error for %s" % wfname)

            if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
                # we do not try to recover pLHE
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because it is a pLHE workflow.'
                    % wfname,
                    level='critical')
                print("We don't try to recover pLHE. Moving on.")
                recover = False

            recovering = set()
            for task in tasks:
                assign_to_sites = set()
                print("Task names is " + task)
                fulltaskname = '/' + wfname + '/' + task
                print("Full task name is " + fulltaskname)
                print(where_to_run.keys())

                wrong_task = False
                for task_info in all_tasks:
                    if fulltaskname == task_info.pathName:
                        if task_info.taskType not in [
                                'Processing', 'Production', 'Merge'
                        ]:
                            wrong_task = True
                            wfi.sendLog(
                                'actor',
                                "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"
                                % (fulltaskname, task_info.taskType))
                if fulltaskname not in where_to_run:
                    wrong_task = True
                    wfi.sendLog(
                        'actor',
                        "Skipping task %s because there is no acdc doc for it anyways."
                        % (fulltaskname))
                if wrong_task:
                    continue

                print(tasks[task])
                actions = tasks[task]
                for action in actions:
                    if action.startswith('sites'):
                        if not isinstance(actions[action], list):
                            assign_to_sites = [SI.SE_to_CE(actions[action])]
                        else:
                            assign_to_sites = list(
                                {SI.SE_to_CE(site)
                                 for site in actions[action]})
                if 'sites' not in actions:
                    # No explicit site list: fall back to the recovery info.
                    assign_to_sites = list(
                        {SI.SE_to_CE(site)
                         for site in where_to_run[fulltaskname]})
                    print(
                        "Found %s as sites where to run the ACDC at, from the acdc doc of  %s"
                        % (sorted(assign_to_sites), wfname))
                print("Going to run at %s" % (sorted(assign_to_sites), ))

                if not recover:
                    continue

                print("Initiating recovery")
                acdc = singleRecovery(url,
                                      fulltaskname,
                                      wfi.request,
                                      actions,
                                      do=options.do)
                if not acdc:
                    if not options.do:
                        print("no action to take further")
                        continue
                    if recovering:
                        # Some ACDC were already assigned: report and stop,
                        # leaving `recover` untouched.
                        print(wfname +
                              " has been partially ACDC'ed. Needs manual attention.")
                        sendLog('actor',
                                "%s has had %s/%s recoveries %s only" %
                                (wfname, len(recovering),
                                 num_tasks_to_recover, list(recovering)),
                                level='critical')
                        wfi.sendLog(
                            'actor', "%s has had %s/%s recoveries %s only" %
                            (wfname, len(recovering), num_tasks_to_recover,
                             list(recovering)))
                    else:
                        print(wfname + " failed recovery once")
                        recover = False
                    break

                # ACDC was made correctly. Now we have to assign it.
                wfi.sendLog(
                    'actor', 'ACDC created for task %s. Actions taken \n%s' %
                    (fulltaskname, json.dumps(actions)))
                jira_comment = "%s created ACDC for task %s with action %s" % (
                    entry.get('user', 'unified'),
                    task.split('/')[-1],
                    json.dumps(actions),
                )
                reason = entry.get('Reason', None)
                if reason:
                    jira_comment += '\ndue to: %s' % (reason)

                team = 'production'
                parameters = {
                    'SiteWhitelist': sorted(assign_to_sites),
                    'AcquisitionEra': wfi.acquisitionEra(),
                    'ProcessingString': wfi.processingString(),
                    'MergedLFNBase': wfi.request['MergedLFNBase'],
                    'ProcessingVersion': wfi.request['ProcessingVersion'],
                }
                # hackery for ACDC merge assignment
                if (wfi.request['RequestType'] == 'TaskChain'
                        and 'Merge' in task.split('/')[-1]):
                    parameters['AcquisitionEra'] = None
                    parameters['ProcessingString'] = None

                # xrootd settings on primary and secondary
                if 'xrootd' in actions:
                    if actions['xrootd'] == 'enabled':
                        print("Going to assign via xrootd")
                        parameters['TrustSitelists'] = True
                    elif actions['xrootd'] == 'disabled':
                        parameters['TrustSitelists'] = False
                elif ('TrustSitelists' in wfi.request
                      and wfi.request['TrustSitelists'] == 'true'):
                    parameters['TrustSitelists'] = True
                else:
                    parameters['TrustSitelists'] = False

                if 'secondary' in actions:
                    if actions['secondary'] == 'enabled':
                        print('Enabling reading the secondary input via xrootd')
                        parameters['TrustPUSitelists'] = True
                    elif actions['secondary'] == 'disabled':
                        parameters['TrustPUSitelists'] = False
                    # in case secondary is blank or not set to enabled or disabled
                    elif ('TrustPUSitelists' in wfi.request
                          and wfi.request['TrustPUSitelists']):
                        parameters['TrustPUSitelists'] = True
                elif ('TrustPUSitelists' in wfi.request
                      and wfi.request['TrustPUSitelists']):
                    parameters['TrustPUSitelists'] = True

                if options.ass:
                    print("really doing the assignment of the ACDC %s" %
                          (acdc, ))
                    parameters['execute'] = True
                else:
                    print("no assignment done with this ACDC %s" % (acdc, ))
                    sendLog('actor',
                            "%s needs to be assigned" % (acdc),
                            level='critical')
                    wfi.sendLog('actor',
                                "%s needs to be assigned by hand" % (acdc))
                    continue

                result = reqMgrClient.assignWorkflow(url, acdc, team,
                                                     parameters)
                if not result:
                    print("%s was not assigned" % (acdc, ))
                    sendLog('actor',
                            "%s failed to be assigned" % (acdc),
                            level='critical')
                    wfi.sendLog(
                        'actor',
                        "%s failed to get assigned for recovery" % acdc)
                else:
                    wfi.sendLog('actor',
                                "%s was assigned for recovery" % acdc)
                    recovering.add(acdc)

                # Best effort: a JIRA failure must not abort the recovery.
                try:
                    if jira_comment:
                        jiras = JC.find({'prepid': wfi.request['PrepID']})
                        if len(jiras) == 1:
                            # put a comment on the single corresponding ticket
                            JC.comment(jiras[0].key, jira_comment)
                            JC.progress(jiras[0].key)
                except Exception as e:
                    print("failed with JIRA")
                    print(str(e))

        #===========================================================
        if recover and options.do:
            r = WC.remove_action(wfname)
            if not r:
                sendLog(
                    'actor',
                    'not able to remove the action, interlocking the module',
                    level='critical')
                os.system('touch %s/actor.failed-%s.lock' %
                          (base_eos_dir, os.getpid()))
                sys.exit(-1)

            # update the status with recovering removing manual
            # NOTE(review): assumed to belong to this branch, i.e. the status
            # only changes once the action was really handled -- confirm.
            for wfo in session.query(Workflow).filter(
                    Workflow.name == wfname).all():
                wfo.status = wfo.status.replace('manual', 'recovering')
                session.commit()