for ds in dss: availability = getDatasetBlocksFraction(url, ds ) if availability>=1: continue blocks= [] chops,sizes = getDatasetChops(ds, chop_threshold = 4000, only_blocks=blocks) spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes) print json.dumps(spreading, indent=2) for site,items in spreading.items(): full_spread[site].update( items ) to_lock.add( ds ) addHocLocks = set(json.loads( open('addhoc_lock.json').read())) addHocLocks.update( to_lock ) if act: open('addhoc_lock.json','w').write( json.dumps( sorted(addHocLocks), indent=2 ) ) for l in to_lock: if act: LI.lock(l, 'add HOC') for site,items in full_spread.items(): print site se = SI.CE_to_SE(site) print se print len(items) print json.dumps(sorted(items), indent=2) if act: print makeReplicaRequest(url, se , list(set(items)), 'pre-staging', priority='high', approve=True, mail=False)
print "\tshould be replicated to %s"%(','.join(sswl)) wfi = workflowInfo(url, wf) copies_wanted,cpuh = wfi.getNCopies() go_to = si.CE_to_SE(si.pick_CE(sswl)) #go_to = random.choice( swl ) try_me[go_to].add( b ) ## pick a site that should host this ! #wfi.sendLog('GQ','Sending %s to %s'%( b, go_to)) RB = replacedBlocks() for site,blocks in try_me.items(): replace_blocks = [b for b in blocks if not RB.test(b) ] blocks = replace_blocks if UC.get('block_repositionning'): if blocks: RB.add( blocks ) result = makeReplicaRequest(url, site, list(blocks), 'item relocation', priority='reserved', approve=True, mail=False) sendLog('GQ','replacing %s at %s \n%s'%( '\n,'.join(blocks), site, result),level='warning') else: sendLog('GQ','tempting to put %s at %s'%( '\n,'.join(blocks), site),level='warning') eosFile('%s/GQ.json'%monitor_dir,'w').write( json.dumps( jobs_for, indent=2) ).close() eosFile('%s/GQ.txt'%monitor_dir,'w').write( report ).close() if heavy_duty: sendLog('GQ','There are some heavy duty workflows in the system, likely to cause damage\n%s\n%s'%(sorted(heavy_duty.keys()), json.dumps( heavy_duty, indent=2) ),level='critical')
def close(self): if os.path.isfile('.closor_stop'): print "The closing of workflows is shortened" return url = self.url batch_go = self.batch_go CI = self.CI UC = self.UC wfo = self.wfo jump_the_line = self.jump_the_line batch_goodness = self.batch_goodness check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') ## what is the expected #lumis self.wfi = workflowInfo(url, wfo.name) wfi = self.wfi wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url, batch_name, details=True) batch_go[batch_name] = all( map( lambda s: not s in [ 'completed', 'running-open', 'running-closed', 'acquired', 'staged', 'staging', 'assigned', 'assignment-approved' ], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog( 'closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close' % (batch_name, batch_name)) return if wfi.request['RequestStatus'] in ['announced', 'normal-archived' ] and not options.force: ## manually announced ?? self.to_status = 'done' self.to_wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', '%s is announced already : %s' % (wfo.name, self.to_wm_status)) return if jump_the_line: wfi.sendLog('closor', 'Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name, "has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis'] == 0: print wfo.name, "is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda: False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name, wfi.request['RequestStatus'] for out in outputs: event_count, lumi_count = getDatasetEventsAndLumis(dataset=out) self.outs.append(Output(datasetname=out)) odb = self.outs[-1] odb.workflow = wfo odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) fraction = lumi_count / float(expected_lumis) * 100. completion_line = "%60s %d/%d = %3.2f%%" % ( out, lumi_count, expected_lumis, fraction) wfi.sendLog('closor', "\t%s" % completion_line) if wfi.isRelval() and fraction < batch_goodness: self.batch_warnings[wfi.getCampaign()].add(completion_line) if fraction < 50: self.batch_extreme_warnings[wfi.getCampaign()].add( completion_line) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence(url, out) where = [site for site, info in presence.items() if info[0]] if where: all_OK[out] = True print out, "is in full at", ",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites'] + wfi.request[ 'CustodialSites'] wfi.sendLog( 'closor', "%s is not in full anywhere. send to %s" % (out, ",".join(sorted(going_to)))) at_destination = dict([(k, v) for (k, v) in presence.items() if k in going_to]) else_where = dict([(k, v) for (k, v) in presence.items() if not k in going_to]) print json.dumps(at_destination) print json.dumps(else_where, indent=2) ## do the full stuck transfer study, missing files and shit ! for there in going_to: late_info = findLateFiles(url, out, going_to=there) for l in late_info: l.update({"workflow": wfo.name, "dataset": out}) self.all_late_files.extend(late_info) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update(OK) ## only that status can let me go into announced if all(all_OK.values()) and ( (wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name, "to be announced" results = [] if not results: for out in outputs: print "dealing with", out if out in stats and not stats[out]: continue _, dsn, process_string, tier = out.split('/') if all_OK[out]: print "setting valid" results.append( setDatasetStatus(out, 'VALID', withFiles=False)) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog( 'closor', '%s to go to %s' % (out, ', '.join(sorted(destinations)))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest( url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex'][ 'request_created'][0]['id'] results.append(True) except: results.append('Failed relval transfer') elif all_OK[out]: campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request[ 'Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[ campaign] and tier in CI.campaigns[campaign][ 'toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM") + UC.get( "tiers_to_DDM"): print "tier", tier, "neither TO or NO DDM for", out results.append('Not recognitized tier %s' % tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog( 'closor', "could not recognize %s for injecting in DDM" % out, level='critical') continue n_copies = 1 destinations = [] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[ campaign]: ddm_instructions = CI.campaigns[campaign][ 'DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier, indication in ddm_instructions.items( ): if ddmtier == tier or ddmtier in [ '*', 'all' ]: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination=" + ",".join( destinations) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending", out, " to DDM" status = pass_to_dynamo( [out], N=n_copies, sites=destinations if destinations else None, group=group_spec if group_spec else None) results.append(status) if status == True: wfi.sendLog( 'closor', '%s is send to dynamo in %s copies %s %s' % (out, n_copies, sorted(destinations), group_spec)) else: sendLog('closor', "could not add " + out + " to dynamo pool. check closor logs.", level='critical') wfi.sendLog( 'closor', "could not add " + out + " to dynamo pool. check closor logs.") else: print wfo.name, "no stats for announcing", out results.append('No Stats') if all( map(lambda result: result in ['None', None, True], results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade( url, wfo.name) if not res in ['None', None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) self.to_wm_status = wl_bis.request['RequestStatus'] if wl_bis.request['RequestStatus'] in [ 'announced', 'normal-archived' ]: res = None else: res = reqMgrClient.announceWorkflowCascade( url, wfo.name) results.append(res) print results if all(map(lambda result: result in ['None', None, True], results)): if jump_the_line: if not 'announced' in wfo.status: self.to_status = wfo.status.replace( 'announce', 'announced') else: self.to_status = 'done' self.closing = True wfi.sendLog('closor', "workflow outputs are announced") else: wfi.sendLog( 'closor', "Error with %s to be announced \n%s" % (wfo.name, json.dumps(results))) elif wfi.request['RequestStatus'] in [ 'failed', 'aborted', 'aborted-archived', 'rejected', 'rejected-archived', 'aborted-completed' ]: if wfi.isRelval(): self.to_status = 'forget' self.to_wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', "%s is %s, but will not be set in trouble to find a replacement." % (wfo.name, self.to_wm_status)) else: self.to_status = 'trouble' self.to_wm_status = wfi.request['RequestStatus'] else: print wfo.name, "not good for announcing:", wfi.request[ 'RequestStatus'] wfi.sendLog('closor', "cannot be announced") self.held.add(wfo.name)
def main(): mysqlconn = MySQLdb.connect(host='dbod-cmsrv1.cern.ch', user='******', passwd="relval", port=5506) curs = mysqlconn.cursor() curs.execute("use " + dbname + ";") #curs.execute("lock tables batches write, batches_archive write, workflows write, workflows_archive write, datasets write, clone_reinsert_requests write") curs.execute("select * from batches") batches = curs.fetchall() batches_colnames = [desc[0] for desc in curs.description] for batch in batches: batch_dict = dict(zip(batches_colnames, batch)) if batch_dict["status"] != "assigned": continue userid = batch_dict["useridyear"] + "_" + batch_dict[ "useridmonth"] + "_" + batch_dict["useridday"] + "_" + str( batch_dict["useridnum"]) + "_" + str( batch_dict["batch_version_num"]) print " userid ==> " + userid curs.execute( "select workflow_name from workflows where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num =" + str(batch_dict["batch_version_num"]) + ";") wfs = curs.fetchall() n_workflows = 0 n_completed = 0 for wf in wfs: n_workflows = n_workflows + 1 conn = httplib.HTTPSConnection( url, cert_file=os.getenv('X509_USER_PROXY'), key_file=os.getenv('X509_USER_PROXY')) r1 = conn.request('GET', '/reqmgr2/data/request?name=' + wf[0], headers={"Accept": "application/json"}) #r1=conn.request('GET','/couchdb/wmstats/_all_docs?keys=["'+wf[0]+'"]&include_docs=true') r2 = conn.getresponse() data = r2.read() if r2.status != 200: os.system( 'echo \"' + wf[0] + '\" | mail -s \"announcor.py error 1\" [email protected] --' ) sys.exit(0) s = json.loads(data) for status in s['result'][0][wf[0]]['RequestTransition']: if status['Status'] == "completed" or status[ 'Status'] == "force-complete": n_completed = n_completed + 1 break print "datetime.datetime.now() = " + str(datetime.datetime.now()) print "n_workflows = " + str(n_workflows) print "n_completed = " + str(n_completed) if n_workflows != n_completed: continue #string="2015_09_30_1_0" #if not (string.split('_')[0] == batch_dict["useridyear"] and string.split('_')[1] == batch_dict["useridmonth"] and string.split('_')[2] == batch_dict["useridday"] and string.split('_')[3] == str(batch_dict["useridnum"]) and string.split('_')[4] == str(batch_dict["batch_version_num"])): # continue wf_list = [] for wf in wfs: print wf[0] wf_list.append(wf[0]) job_failure_information = collect_job_failure_information.collect_job_failure_information( wf_list) needs_assistance = assistance_decision.assistance_decision( job_failure_information) if needs_assistance: curs.execute( "update batches set status=\"assistance\", current_status_start_time=\"" + datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S") + "\" where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num =" + str(batch_dict["batch_version_num"]) + ";") mysqlconn.commit() os.system( 'echo \"batch_id: ' + userid + '\" | mail -s \"a batch of relval workflows needs assistance\" [email protected]' ) continue #if there is a '\r' character in the body of an e-mail, it does not get sent description = batch_dict["description"].replace('\r', '') for wf in wf_list: too_many_events_check.too_many_events_check(wf) dset_nevents_list = collect_dsets_and_nevents.collect_dsets_and_nevents( wf_list) print_dsets_and_nevents.print_dsets_and_nevents( dset_nevents_list, userid + ".txt") ret = os.system( "cp " + userid + ".txt /afs/cern.ch/user/r/relval/webpage/relval_stats/" + userid + ".txt") if ret == 0: os.system("rm " + userid + ".txt") else: os.system( 'echo \"' + userid + '\" | mail -s \"announcement_loop.py error 2\" [email protected]' ) sys.exit(0) dsets_list = [] for dset_nevents in dset_nevents_list: dsets_list.append(dset_nevents[0]) for dset in dsets_list: setDatasetStatusDBS3.setStatusDBS3( "https://cmsweb.cern.ch/dbs/prod/global/DBSWriter", dset, "VALID", True) for wf in wf_list: reqMgrClient.closeOutWorkflow("cmsweb.cern.ch", wf) reqMgrClient.announceWorkflow("cmsweb.cern.ch", wf) msg = MIMEMultipart() reply_to = [] #send_to = ["*****@*****.**","*****@*****.**"] send_to = [ "*****@*****.**", "*****@*****.**", "*****@*****.**" ] #send_to = ["*****@*****.**"] #msg['In-Reply-To'] = hn_message_id #msg['References'] = hn_message_id msg['From'] = "*****@*****.**" msg['reply-to'] = COMMASPACE.join(reply_to) msg['To'] = COMMASPACE.join(send_to) msg['Date'] = formatdate(localtime=True) msg['Subject'] = batch_dict["announcement_title"] msg['Message-ID'] = email.Utils.make_msgid() messageText = "Dear all,\n" messageText = messageText + "\n" messageText = messageText + "A batch of relval workflows has finished.\n" messageText = messageText + "\n" messageText = messageText + "Batch ID:\n" messageText = messageText + "\n" messageText = messageText + userid + "\n" if batch_dict["batch_version_num"] > 0: messageText = messageText + "\n" messageText = messageText + "original workflow name ==> clone name:\n" messageText = messageText + "\n" curs.execute( "select workflow_name,original_workflow_name from workflows where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num =" + str(batch_dict["batch_version_num"]) + ";") workflows = curs.fetchall() for workflow in workflows: messageText = messageText + workflow[1] + " ==> " + workflow[ 0] + "\n" messageText = messageText + "\n" messageText = messageText + "List of datasets:\n" messageText = messageText + "\n" messageText = messageText + "http://cms-project-relval.web.cern.ch/cms-project-relval/relval_stats/" + userid + ".txt\n" messageText = messageText + "\n" messageText = messageText + "Description:\n" messageText = messageText + "\n" messageText = messageText + description.rstrip('\n') messageText = messageText + "\n" #messageText=messageText+"\n" [istherefailureinformation, return_string ] = print_job_failure_information.print_job_failure_information( job_failure_information) if istherefailureinformation: messageText = messageText + "\n" messageText = messageText + return_string messageText = messageText + "\n" messageText = messageText + "\n" messageText = messageText + "RelVal Batch Manager" #put the announcement message into an e-mail to the relval hypernews and also in a url output_file = open( "/afs/cern.ch/user/r/relval/webpage/relval_announcements/" + userid + ".txt", 'w') output_file.write(messageText) try: msg.attach(MIMEText(messageText)) smtpObj = smtplib.SMTP() smtpObj.connect() smtpObj.sendmail("*****@*****.**", send_to, msg.as_string()) smtpObj.close() except Exception as e: print "Error: unable to send email: %s" % (str(e)) dsets_fnal_disk_list = [] dsets_cern_disk_list = [] for dset in dsets_list: #print dset.split('/') # we were asked to transfer some specific datasets to the cern tier 2 if dset.split('/')[3] != "RECO" and dset.split( '/')[3] != "ALCARECO": dsets_cern_disk_list.append(dset) if dset.split('/')[3] == "GEN-SIM": dsets_fnal_disk_list.append(dset) if dset.split('/')[3] == "GEN-SIM-DIGI-RAW": dsets_fnal_disk_list.append(dset) if dset.split('/')[3] == "GEN-SIM-RECO": dsets_fnal_disk_list.append(dset) if "RelValTTBar" in dset.split( '/')[1] and "TkAlMinBias" in dset.split( '/')[2] and dset.split('/')[3] != "ALCARECO": dsets_cern_disk_list.append(dset) if "MinimumBias" in dset.split( '/')[1] and "SiStripCalMinBias" in dset.split( '/')[2] and dset.split('/')[3] != "ALCARECO": dsets_cern_disk_list.append(dset) result = utils.makeReplicaRequest("cmsweb.cern.ch", "T2_CH_CERN", dsets_cern_disk_list, "relval datasets", group="RelVal") if result != None: phedexid = result['phedex']['request_created'][0]['id'] utils.approveSubscription("cmsweb.cern.ch", phedexid) result = utils.makeReplicaRequest("cmsweb.cern.ch", "T1_US_FNAL_Disk", dsets_fnal_disk_list, "relval datasets", group="RelVal") if result != None: phedexid = result['phedex']['request_created'][0]['id'] utils.approveSubscription("cmsweb.cern.ch", phedexid) result = utils.makeMoveRequest("cmsweb.cern.ch", "T0_CH_CERN_MSS", dsets_list, "relval datasets", group="RelVal") if result != None: phedexid = result['phedex']['request_created'][0]['id'] #even if you disapprove the subscription at the source, it will still deleted the datasets that are at the source but not subscribed their utils.disapproveSubscription("cmsweb.cern.ch", phedexid, ["T2_CH_CERN"]) utils.disapproveSubscription("cmsweb.cern.ch", phedexid, ["T1_US_FNAL_Disk"]) utils.disapproveSubscription("cmsweb.cern.ch", phedexid, ["T1_FR_CCIN2P3_Disk"]) utils.approveSubscription("cmsweb.cern.ch", phedexid, ["T0_CH_CERN_MSS"]) #phedexid = result['phedex']['request_created'][0]['id'] #utils.approveSubscription("cmsweb.cern.ch",phedexid) curs.execute( "update batches set status=\"announced\", current_status_start_time=\"" + datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S") + "\" where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num =" + str(batch_dict["batch_version_num"]) + ";") mysqlconn.commit()
def transferor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_transfered = len( session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0, max_to_handle - being_handled) allowed_to_transfer = max(0, max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer" else: print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer" print "... done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter( Workflow.status.startswith('considered')).all(): print "\t", wfo.name if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} ignored_input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority = None min_transfer_priority = None print "getting all wf in staging ..." stucks = json.loads(open('%s/stuck_transfers.json' % monitor_dir).read()) for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ds_s = dss.get(prim) if prim in stucks: sendLog('transferor', "%s appears stuck, so not counting it %s [GB]" % (prim, ds_s), wfi=wfh) ignored_input_sizes[prim] = ds_s else: input_sizes[prim] = ds_s sendLog('transferor', "%s needs %s [GB]" % (wfo.name, ds_s), wfi=wfh) if in_transfer_priority == None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None or in_transfer_priority == None: print "nothing is lining up for transfer" sendEmail("no request in staging", "no request in staging") return pass try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort(key=lambda i: i[1]) print "\n".join(map(str, ignored_values)) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort(key=lambda i: i[1]) print "\n".join(map(str, considered_values)) except Exception as e: print "trying to print the summary of input size" print str(e) print "... done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority print "transfers per sites" print json.dumps(transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get(prim) input_sizes[prim] = prim_size primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle(wfs_and_wfh) # Sort smallest transfers first; allows us to transfer as many as possible workflows. def prio_and_size(i, j): if int(i[1].request['RequestPriority']) == int( j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0))) else: return cmp(int(i[1].request['RequestPriority']), int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer" % ( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load" % ( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer" % ( st_in_transfer_already) print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % ( st_to_transfer) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB is the available limit" % transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = {} went_over_budget = False destination_cache = {} no_goes = set() max_per_round = UC.get('max_per_round').get('transferor', None) if max_per_round and not spec: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo, wfh) in wfs_and_wfh: print wfo.name, "to be transfered with priority", wfh.request[ 'RequestPriority'] if wfh.request['RequestStatus'] != 'assignment-approved': if wfh.request['RequestStatus'] in [ 'aborted', 'rejected', 'rejected-archived', 'aborted-archived' ]: wfo.status = 'trouble' ## so that we look or a replacement else: wfo.status = 'away' wfh.sendLog( 'transferor', '%s in status %s, setting %s' % (wfo.name, wfh.request['RequestStatus'], wfo.status)) continue (_, primary, _, _) = wfh.getIO() this_load = sum([input_sizes[prim] for prim in primary]) no_budget = False if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over bubget.") else: wfh.sendLog('transferor', "Transfer will go over bubget.") wfh.sendLog( 'transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit" % (this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority != None and min_transfer_priority != None: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over budget" % (wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog( 'transferor', "%s minimum priority %s < %s : stop" % (min_transfer_priority, wfh.request['RequestPriority'], in_transfer_priority)) no_budget = True ## throtlle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add(wfo.name) allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: allowed_secondary.update(CI.campaigns[campaign]['secondaries']) if secondary: if (secondary and allowed_secondary) and ( set(secondary) & allowed_secondary != set(secondary)): wfh.sendLog( 'assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - allowed_secondary))) no_go = True if no_go: continue ## check if the batch is announced def check_mcm(wfn): announced = False is_real = False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break except: try: for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break except: print "could not get mcm batch announcement, assuming not real" return announced, is_real if not use_mcm: announced, is_real = False, True else: if wfh.request['RequestType'] in ['ReReco']: announced, is_real = True, True else: announced, is_real = check_mcm(wfo.name) if not announced: wfh.sendLog('transferor', "does not look announced.") if not is_real: wfh.sendLog('transferor', "does not appear to be genuine.") ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime( time.strptime('.'.join(map(str, wfh.request['RequestDate'])), "%Y.%m.%d.%H.%M.%S")) / (60. * 60.) now = time.mktime(time.gmtime()) / (60. * 60.) if float(now - injection_time) < 4.: if not options.go and not announced: wfh.sendLog( 'transferor', "It is too soon to start transfer: %3.2fH remaining" % (now - injection_time)) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog( 'transferor', " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s" % (max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_transfer)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s" % (max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue ## the site white list considers site, campaign, memory and core information (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary) + list(parent) + list(secondary): ## lock everything flat NLI.lock(dataset) if not sites_allowed: wfh.sendLog('transferor', "not possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor', "%s has no possible sites to run at" % (wfo.name), level='critical') continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist']))) if 'LumiList' in wfh.request and wfh.request['LumiList']: ## augment with the lumi white list blocks = list( set(blocks + getDatasetBlocks(dataset, lumis=wfh.request['LumiList']))) if blocks: print "Reading", len(blocks), "in block whitelist" can_go = True staging = False allowed = True primary_destinations = set() if primary: copies_needed_from_CPUh, CPUh = wfh.getNCopies() if talk: print wfo.name, 'reads', ', '.join(primary), 'in primary' ## chope the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add(wfo.id) max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority'])) wfh.sendLog( 'transferor', "Would make %s from cpu requirement %s" % (copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog( 'transferor', "Maxed to %s by campaign configuration %s" % (copies_needed, wfh.request['Campaign'])) ### new ways of making the whole thing destinations, all_block_names = getDatasetDestinations( url, prim, within_sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [ site for (site, info) in destinations.items() if info['completion'] == 100 and info['data_fraction'] == 1 ] ## the rest is places it is going to be prim_destination = [ site for site in destinations.keys() if not site in prim_location ] if len(prim_location) >= copies_needed: wfh.sendLog( 'transferor', "The input is all fully in place at %s sites %s" % (len(prim_location), sorted(prim_location))) continue copies_needed = max(0, copies_needed - len(prim_location)) wfh.sendLog( 'transferor', "not counting existing copies ; now need %s" % copies_needed) copies_being_made = [ sum([ info['blocks'].keys().count(block) for site, info in destinations.items() if site in prim_destination ]) for block in all_block_names ] latching_on_transfers = set() [ latching_on_transfers.update(info['blocks'].values()) for site, info in destinations.items() if site in prim_destination ] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] prim_to_distribute = [ site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination ] ## take out the ones that cannot receive transfers prim_to_distribute = [ site for site in prim_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] wfh.sendLog( 'transferor', "Could be going to: %s" % sorted(prim_to_distribute)) if not prim_to_distribute or any([ transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute ]): ## means there is openings let me go print "There are transfer slots available:", [ (site, transfers_per_sites[site]) for site in prim_to_distribute ] #for site in sites_allowed: # #increment accross the board, regardless of real destination: could be changed # transfers_per_sites[site] += 1 else: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over transfer slots available" % (wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s per site at a time. Going overboard for %s" % (max_staging_per_site, sorted([ site for site in prim_to_distribute if transfers_per_sites[site] >= max_staging_per_site ]))) if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter( Transfer.phedexid == int(latching)).first() if not tfo: tfo = session.query(Transfer).filter( Transfer.phedexid == -int(latching)).first() if not tfo: tfo = Transfer(phedexid=latching) tfo.workflows_id = [] session.add(tfo) else: tfo.phedexid = latching ## make it positive ever if not wfo.id in tfo.workflows_id: print "adding", wfo.id, "to", tfo.id, "with phedexid", latching l = copy.deepcopy(tfo.workflows_id) l.append(wfo.id) tfo.workflows_id = l if not options.test: session.commit() else: session.flush( ) ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0, copies_needed - min(copies_being_made)) wfh.sendLog( 'transferor', "Not counting the copies being made ; then need %s" % copies_needed) if copies_needed == 0: wfh.sendLog( 'transferor', "The output is either fully in place or getting in full somewhere with %s" % latching_on_transfers) can_go = True continue elif len(prim_to_distribute) == 0: wfh.sendLog( 'transferor', "We are going to need extra copies, but no destinations seems available" ) prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] prim_to_distribute = [ site for site in prim_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if len( prim_to_distribute ) > 0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk andpoint ? #tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops, sizes = getDatasetChops( prim, chop_threshold=options.chopsize, only_blocks=blocks) spreading = distributeToSites(chops, prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog( 'transferor', 'cannot send %s to any site, it cannot fit anywhere' % prim, level='critical') wfh.sendLog( 'transferor', "cannot send to any site. %s cannot seem to fit anywhere" % (prim)) staging = False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site] = blocks else: spreading[site] = [prim] transfer_sizes[prim] = input_sizes[ prim] ## this is approximate if blocks are specified can_go = False wfh.sendLog( 'transferor', "selected CE destinations %s" % (sorted(spreading.keys()))) for (site, items) in spreading.items(): all_transfers[site].extend(items) transfers_per_sites[site] += 1 primary_destinations.add(site) if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[ wfh.request['Campaign']]['SecondaryLocation'] print wfo.name, 'reads', ', '.join(secondary), 'in secondary' for sec in secondary: workflow_dependencies[sec].add(wfo.id) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec], _ = getDatasetDestinations( url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] destinations = dict([ (k, v) for (k, v) in destination_cache[sec].items() if site in se_allowed ]) ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [ destinations.pop(site) for (site, info) in destinations.items() if info['data_fraction'] < 0.9 ] sec_location = [ site for (site, info) in destinations.items() if info['completion'] >= 95 ] sec_destination = [ site for site in destinations.keys() if not site in sec_location ] else: ## old style presence = getDatasetPresence(url, sec) sec_location = [ site for site, pres in presence.items() if pres[1] > 90. ] ## more than 90% of the minbias at sites subscriptions = listSubscriptions(url, sec) sec_destination = [site for site in subscriptions] sec_to_distribute = [ site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any( [osite.startswith(site) for osite in sec_destination]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list( set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog( 'transferor', "the dataset %s could be removed from %s" % (sec, not_needed_anymore)) sec_to_distribute = list( set(sec_to_distribute) & set(override_sec_destination)) if len(sec_to_distribute) > 0: print "secondary could go to", sorted(sec_to_distribute) sec_size = dss.get(sec) for site in sec_to_distribute: site_se = SI.CE_to_SE(site) if (SI.disk[site_se] * 1024.) > sec_size: all_transfers[site].append(sec) can_go = False else: print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[ site_se] * 1024, "GB need", sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog('transferor', '%s is too big (%s) for %s (%s)' % (sec, sec_size, site_se, SI.disk[site_se] * 1024), level='critical') else: print "the secondary input does not have to be send to site" ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog( 'transferor', "latches on existing transfers, and nothing else, settin staging" ) wfo.status = 'staging' needs_transfer += 1 else: wfh.sendLog( 'transferor', "should just be assigned now to %s" % sorted(sites_allowed)) wfo.status = 'staged' passing_along += 1 wfh.sendLog('transferor', "setting status to %s" % wfo.status) session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog('transferor', "setting status to %s" % wfo.status) session.commit() wfh.sendLog('transferor', "needs a transfer") needs_transfer += 1 passing_along += 1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n" + "\n".join(no_goes), level='critical') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id = -1 wf_id_in_prestaging = set() for (site, items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site, "does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for" % (site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [ block for block in blocks if not block.split('#')[0] in datasets ] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks' % len(blocks) details_text += '\n\t%d needed blocks for %s' % ( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets' % len(datasets) details_text += '\n\t%s' % sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to", site, "(CE)", site_se, "(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y', 'yes', 'go']: continue if execute: priority = 'normal' cds = [ ds for ds in datasets + block_datasets if ds in max_priority ] if cds and False: ## I don't think this is working. subscription should be updated on the fly and regularly for raising the priority if needed ## decide on an overall priority : that's a bit too large though if any([max_priority[ds] >= 90000 for ds in cds]): priority = 'high' elif all([max_priority[ds] < 80000 for ds in cds]): priority = 'low' result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority) else: result = {'phedex': {'request_created': []}} fake_id -= 1 if not result: print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter( Transfer.phedexid == int(phedexid)).first() if not new_transfer: new_transfer = session.query(Transfer).filter( Transfer.phedexid == -int(phedexid)).first() print phedexid, "transfer created" if not new_transfer: new_transfer = Transfer(phedexid=phedexid) session.add(new_transfer) else: new_transfer.phedexid = phedexid ## make it positive again new_transfer.workflows_id = set() for transfering in list( set(map(lambda it: it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering]) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status != 'staging': if execute: tr_wf.status = 'staging' if talk: print "setting", tr_wf.name, "to staging" session.commit()
def closor(url, specific=None, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') jump_the_line = options.announce if options else False if jump_the_line: wfs = session.query(Workflow).filter(Workflow.status.contains('announce')).filter(sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: wfs = session.query(Workflow).filter(Workflow.status=='close').all() held = set() print len(wfs),"closing" max_per_round = UC.get('max_per_round').get('closor',None) if options.limit: max_per_round = options.limit random.shuffle( wfs ) if max_per_round: wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") for wfo in wfs: if specific and not specific in wfo.name: continue ## what is the expected #lumis wfi = workflowInfo(url, wfo.name ) wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url , batch_name, details=True) batch_go[ batch_name ] = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog('closor', 'Cannot close for now because the batch %s is not all close'% batch_name) continue if wfi.request['RequestStatus'] in ['announced','normal-archived'] and not options.force: ## manually announced ?? wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,wfo.wm_status)) session.commit() if jump_the_line: wfi.sendLog('closor','Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name,"has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis']==0: print wfo.name,"is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda : False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name,wfi.request['RequestStatus'] for out in outputs: event_count,lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter(Output.datasetname==out).first() if not odb: print "adding an output object",out odb = Output( datasetname = out ) odb.workflow = wfo session.add( odb ) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() fraction = lumi_count/float(expected_lumis)*100. completion_line = "%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,fraction) wfi.sendLog('closor',"\t%s"% completion_line) if wfi.isRelval() and fraction < batch_goodness: batch_warnings[ wfi.getCampaign()].add( completion_line ) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence( url, out ) where = [site for site,info in presence.items() if info[0]] if where: all_OK[out] = True print out,"is in full at",",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites'] wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to)))) at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to]) else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to]) print json.dumps( at_destination ) print json.dumps( else_where, indent=2 ) ## do the full stuck transfer study, missing files and shit ! for there in going_to: late_info = findLateFiles(url, out, going_to = there ) for l in late_info: l.update({"workflow":wfo.name,"dataset":out}) all_late_files.extend( late_info ) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update( OK ) ## only that status can let me go into announced if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name,"to be announced" results=[] if not results: for out in outputs: if out in stats and not stats[out]: continue _,dsn,process_string,tier = out.split('/') if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog('closor', '%s to go to %s'%(out, ', '.join( sorted( destinations )))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest(url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex']['request_created'][0]['id'] results.append( True ) except: results.append( 'Failed relval transfer' ) elif all_OK[out]: campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request['Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"): print "tier",tier,"neither TO or NO DDM for",out results.append('Not recognitized tier %s'%tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical') continue n_copies = 2 destinations=[] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]: ddm_instructions = CI.campaigns[campaign]['DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier,indication in ddm_instructions.items(): if ddmtier==tier or ddmtier in ['*','all']: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination="+",".join( destinations ) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending",out," to DDM" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 0 --exec'%(n_copies, out,destination_spec, group_spec)) ddm_text = p.read() print ddm_text status = p.close() if status!=None: print "Failed DDM, retrying to send",out,"a second time" p = os.popen('python assignDatasetToSite.py --nCopies=%d --dataset=%s %s %s --debug 1 --exec'%(n_copies, out,destination_spec, group_spec)) ddm_text = p.read() print ddm_text status = p.close() if status!=None: #sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.") sendLog('closor',"could not add "+out+" to DDM pool. check closor logs.", level='critical') if options.force: status = True results.append( status ) if status == None: wfi.sendLog('closor',ddm_text) wfi.sendLog('closor','%s is send to AnalysisOps DDM pool in %s copies %s'%( out, n_copies, destination_spec)) else: print wfo.name,"no stats for announcing",out results.append('No Stats') if all(map(lambda result : result in ['None',None,True],results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade(url, wfo.name) if not res in ['None',None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in ['announced','normal-archived']: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade(url, wfo.name) results.append( res ) #print results if all(map(lambda result : result in ['None',None,True],results)): if jump_the_line: if not 'announced' in wfo.status: wfo.status = wfo.status.replace('announce','announced') else: wfo.status = 'done' session.commit() wfi.sendLog('closor',"workflow outputs are announced") else: wfi.sendLog('closor',"Error with %s to be announced \n%s"%( wfo.name, json.dumps( results ))) elif wfi.request['RequestStatus'] in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: if wfi.isRelval(): wfo.status = 'forget' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor',"%s is %s, but will not be set in trouble to find a replacement."%( wfo.name, wfo.wm_status)) else: wfo.status = 'trouble' wfo.wm_status = wfi.request['RequestStatus'] session.commit() else: print wfo.name,"not good for announcing:",wfi.request['RequestStatus'] wfi.sendLog('closor',"cannot be announced") held.add( wfo.name ) days_late = 0. retries_late = 10 really_late_files = [info for info in all_late_files if info['retries']>=retries_late] really_late_files = [info for info in really_late_files if info['delay']/(60*60*24.)>=days_late] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s'%(len(really_late_files), days_late, retries_late, json.dumps( really_late_files , indent=2) ) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor',subject) print subject open('%s/stuck_files.json'%monitor_dir,'w').write( json.dumps( really_late_files , indent=2)) if held: sendLog('closor',"the workflows below are held up \n%s"%("\n".join( sorted(held) )), level='critical') #batches = json.loads(open('batches.json').read()) for bname,go in batch_go.items(): if go: subject = "Release Validation Samples Batch %s"% bname issues="" if batch_warnings[ bname ]: issues="The following datasets have outstanding completion (<%d%%) issues:\n\n"% batch_goodness issues+="\n".join( sorted( batch_warnings[ bname ] )) issues+="\n\n" text = """ Dear all, a batch of release validation workflows has finished. Batch ID: %s Detail of the workflows https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s %s This is an automated message. """%( bname, bname, issues) to = ['*****@*****.**'] sendEmail(subject, text, destination=to )
def main(): mysqlconn = MySQLdb.connect(host='dbod-cmsrv1.cern.ch', user='******', passwd="relval", port=5506) #conn = MySQLdb.connect(host='localhost', user='******', passwd='relval') curs = mysqlconn.cursor() curs.execute("use "+dbname+";") #curs.execute("lock tables batches write, batches_archive write, workflows write, workflows_archive write, datasets write, clone_reinsert_requests write") #workflow = line.rstrip('\n') #curs.execute("insert into workflows set hn_req=\""+hnrequest+"\", workflow_name=\""+workflow+"\";") curs.execute("select * from batches") batches=curs.fetchall() batches_colnames = [desc[0] for desc in curs.description] for batch in batches: #if batch[0] != 21: # continue blocks_dsets_to_transfer=[] blocks_not_at_site=[] batch_dict = dict(zip(batches_colnames, batch)) site = batch_dict["site"] if "T2" in site: site_disk = site elif "T1" in site: site_disk = site + "_Disk" else: os.system('echo '+site+' | mail -s \"input_dset_checkor.py error 1\" [email protected]') print "Neither T1 nor T2 is in site name, exiting" sys.exit(1) if site == "T2_CH_CERN_T0": site_disk = "T2_CH_CERN" if site == "T2_CH_CERN_AI": site_disk = "T2_CH_CERN" #print batch #print "" userid = batch_dict["useridyear"]+"_"+batch_dict["useridmonth"]+"_"+batch_dict["useridday"]+"_"+str(batch_dict["useridnum"])+"_"+str(batch_dict["batch_version_num"]) #if status == "waiting_for_transfer" and count % 10 == 0: if batch_dict["status"] == "waiting_for_transfer": print " userid ==> "+str(userid) #count = 0 all_dsets_blocks_at_site=True curs.execute("select workflow_name from workflows where useridyear = \""+ batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+"\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num = "+str(batch_dict["batch_version_num"])+";") wfs=curs.fetchall() for wf in wfs: print wf[0] headers = {"Content-type": "application/json", "Accept": "application/json"} conn = httplib.HTTPSConnection('cmsweb.cern.ch', cert_file = os.getenv('X509_USER_PROXY'), key_file = os.getenv('X509_USER_PROXY')) r1=conn.request("GET",'/reqmgr2/data/request/'+wf[0],headers=headers) r2=conn.getresponse() schema = json.loads(r2.read()) schema = schema['result'][0][wf[0]] isthereanmcpileupdataset=False for key, value in schema.items(): if type(value) is dict and key.startswith("Task"): if 'MCPileup' in value: isthereanmcpileupdataset=True ismcpileupdatasetatsite=utils.checkIfDatasetIsSubscribedToASite("cmsweb.cern.ch",value["MCPileup"],site_disk) if 'InputDataset' in value: inputdset=value['InputDataset'] if 'RunWhitelist' in value: runwhitelist=value['RunWhitelist'] blocks_fname=os.popen("mktemp").read().rstrip("\n") list_of_blocks=utils.getListOfBlocks(inputdset,str(runwhitelist)) for block in list_of_blocks: #this block (/DoubleMu/...) is not registered in phedex, so it cannot be subscribed to any site if block == "/DoubleMu/Run2011A-ZMu-08Nov2011-v1/RAW-RECO#93c53d22-25b2-11e1-8c62-003048f02c8a": continue isblockatsite = utils.checkIfBlockIsAtASite("cmsweb.cern.ch",block,site_disk) if not isblockatsite: all_dsets_blocks_at_site=False else: isdatasetatsite=utils.checkIfDatasetIsSubscribedToASite("cmsweb.cern.ch",inputdset,site_disk) if not isdatasetatsite: all_dsets_blocks_at_site=False if all_dsets_blocks_at_site and (not isthereanmcpileupdataset or ismcpileupdatasetatsite): curs.execute("update batches set status=\"input_dsets_ready\", current_status_start_time=\""+datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S")+"\" where useridyear = \""+ batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+"\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num = "+str(batch_dict["batch_version_num"])+";") mysqlconn.commit() if batch_dict["status"] == "approved": print " userid ==> "+str(userid) #print "checking input datasets for workflows in batch "+str(batchid) curs.execute("select workflow_name from workflows where useridyear = \""+ batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+"\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num = "+str(batch_dict["batch_version_num"])+";") wfs=curs.fetchall() for wf in wfs: print wf[0] headers = {"Content-type": "application/json", "Accept": "application/json"} conn = httplib.HTTPSConnection('cmsweb.cern.ch', cert_file = os.getenv('X509_USER_PROXY'), key_file = os.getenv('X509_USER_PROXY')) r1=conn.request("GET",'/reqmgr2/data/request/'+wf[0],headers=headers) r2=conn.getresponse() schema = json.loads(r2.read()) schema = schema['result'][0][wf[0]] for key, value in schema.items(): if type(value) is dict and key.startswith("Task"): if 'MCPileup' in value: isdatasetatsite=utils.checkIfDatasetIsSubscribedToASite("cmsweb.cern.ch",value['MCPileup'],site_disk) if not isdatasetatsite: blocks_dsets_to_transfer.append(value['MCPileup']) if 'InputDataset' in value: subscribed_to_disk=False inputdset=value['InputDataset'] if 'RunWhitelist' in value: runwhitelist=value['RunWhitelist'] list_of_blocks=utils.getListOfBlocks(inputdset,str(runwhitelist)) for block in list_of_blocks: #this block (/DoubleMu/...) is not registered in phedex, so it cannot be subscribed to any site if block == "/DoubleMu/Run2011A-ZMu-08Nov2011-v1/RAW-RECO#93c53d22-25b2-11e1-8c62-003048f02c8a": continue isblocksubscribedtosite=utils.checkIfBlockIsSubscribedToASite("cmsweb.cern.ch",block,site_disk) isblockatsite=utils.checkIfBlockIsAtASite("cmsweb.cern.ch",block,site_disk) if not isblocksubscribedtosite: blocks_dsets_to_transfer.append(block) if not isblockatsite: blocks_not_at_site.append(block) else: isdatasetsubscribedtosite=utils.checkIfDatasetIsSubscribedToASite("cmsweb.cern.ch",inputdset,site_disk) isdatasetatsite=utils.checkIfDatasetIsSubscribedToASite("cmsweb.cern.ch",inputdset,site_disk) if not isdatasetsubscribedtosite: blocks_dsets_to_transfer.append(inputdset) if not isdatasetatsite: blocks_not_at_site.append(inputdset) if blocks_dsets_to_transfer != []: print "transfering the following blocks:" print blocks_dsets_to_transfer result=utils.makeReplicaRequest(url="cmsweb.cern.ch", site=site_disk, datasets=blocks_dsets_to_transfer, comments="relval datasets", group = "RelVal") phedexid = result['phedex']['request_created'][0]['id'] utils.approveSubscription("cmsweb.cern.ch",phedexid) curs.execute("update batches set status=\"waiting_for_transfer\", current_status_start_time=\""+datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S")+"\" where useridyear = \""+ batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+"\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num = "+str(batch_dict["batch_version_num"])+";") mysqlconn.commit() elif blocks_not_at_site != []: print blocks_not_at_site curs.execute("update batches set status=\"waiting_for_transfer\", current_status_start_time=\""+datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S")+"\" where useridyear = \""+ batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+"\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num = "+str(batch_dict["batch_version_num"])+";") else: curs.execute("update batches set status=\"input_dsets_ready\", current_status_start_time=\""+datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S")+"\" where useridyear = \""+ batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+"\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num = "+str(batch_dict["batch_version_num"])+";") mysqlconn.commit()
def transferor(url ,specific = None, talk=True, options=None): if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() all_transfers=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset data_to_wf = {} for wfo in session.query(Workflow).filter(Workflow.status=='considered').all(): if specific and not specific in wfo.name: continue print wfo.name,"to be transfered" wfh = workflowInfo( url, wfo.name) #injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) #now = time.mktime(time.gmtime()) / (60.*60.) #if float(now - injection_time) < 4.: # print "It is too soon to transfer", now, injection_time # continue (lheinput,primary,parent,secondary) = wfh.getIO() if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] can_go = True if primary: if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: workflow_dependencies[prim].add( wfo.id ) presence = getDatasetPresence( url, prim ) prim_location = [site for site,pres in presence.items() if pres[0]==True] subscriptions = listSubscriptions( url , prim ) prim_destination = [site for site in subscriptions] prim_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in prim_location])] prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in prim_destination])] if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: spreading = distributeToSites( [[prim]]+getDatasetChops(prim), prim_to_distribute, n_copies = 3, weights=SI.cpu_pledges) else: spreading = {} for site in prim_to_distribute: spreading[site]=[prim] can_go = False for (site,items) in spreading.items(): all_transfers[site].extend( items ) if secondary: if talk: print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] if len( sec_to_distribute )>0: for site in sec_to_distribute: all_transfers[site].append( sec ) can_go = False ## is that possible to do something more if can_go: print wfo.name,"should just be assigned NOW to",sites_allowed wfo.status = 'staged' session.commit() continue else: print wfo.name,"needs a transfer" #print json.dumps(all_transfers) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] print "Making a replica to",site,"(CE)",site_se,"(SE) for" print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] print "\t",len(blocks),"needed blocks" print "\t",len(datasets),"datasets" print "\t",datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging') else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first() if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def close(self): if os.path.isfile('.closor_stop'): print "The closing of workflows is shortened" return url = self.url batch_go = self.batch_go CI = self.CI UC = self.UC wfo = self.wfo jump_the_line = self.jump_the_line batch_goodness = self.batch_goodness check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') ## what is the expected #lumis self.wfi = workflowInfo(url, wfo.name ) wfi = self.wfi wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url , batch_name, details=True) batch_go[ batch_name ] = all(map(lambda s : not s in ['completed','running-open','running-closed','acquired','assigned','assignment-approved'], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog('closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close'%( batch_name, batch_name)) return if wfi.request['RequestStatus'] in ['announced','normal-archived'] and not options.force: ## manually announced ?? self.to_status = 'done' self.to_wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor','%s is announced already : %s'%( wfo.name,self.to_wm_status)) return if jump_the_line: wfi.sendLog('closor','Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name,"has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis']==0: print wfo.name,"is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda : False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name,wfi.request['RequestStatus'] for out in outputs: event_count,lumi_count = getDatasetEventsAndLumis(dataset=out) self.outs.append( Output( datasetname = out )) odb = self.outs[-1] odb.workflow = wfo odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) fraction = lumi_count/float(expected_lumis)*100. completion_line = "%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,fraction) wfi.sendLog('closor',"\t%s"% completion_line) if wfi.isRelval() and fraction < batch_goodness: self.batch_warnings[ wfi.getCampaign()].add( completion_line ) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence( url, out ) where = [site for site,info in presence.items() if info[0]] if where: all_OK[out] = True print out,"is in full at",",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites']+wfi.request['CustodialSites'] wfi.sendLog('closor',"%s is not in full anywhere. send to %s"%(out, ",".join(sorted(going_to)))) at_destination = dict([(k,v) for (k,v) in presence.items() if k in going_to]) else_where = dict([(k,v) for (k,v) in presence.items() if not k in going_to]) print json.dumps( at_destination ) print json.dumps( else_where, indent=2 ) ## do the full stuck transfer study, missing files and shit ! for there in going_to: late_info = findLateFiles(url, out, going_to = there ) for l in late_info: l.update({"workflow":wfo.name,"dataset":out}) self.all_late_files.extend( late_info ) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update( OK ) ## only that status can let me go into announced if all(all_OK.values()) and ((wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name,"to be announced" results=[] if not results: for out in outputs: print "dealing with",out if out in stats and not stats[out]: continue _,dsn,process_string,tier = out.split('/') if all_OK[out]: print "setting valid" results.append(setDatasetStatus(out, 'VALID', withFiles=False)) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog('closor', '%s to go to %s'%(out, ', '.join( sorted( destinations )))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest(url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex']['request_created'][0]['id'] results.append( True ) except: results.append( 'Failed relval transfer' ) elif all_OK[out]: campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request['Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM")+UC.get("tiers_to_DDM"): print "tier",tier,"neither TO or NO DDM for",out results.append('Not recognitized tier %s'%tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog('closor', "could not recognize %s for injecting in DDM"% out, level='critical') continue n_copies = 1 destinations=[] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[campaign]: ddm_instructions = CI.campaigns[campaign]['DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier,indication in ddm_instructions.items(): if ddmtier==tier or ddmtier in ['*','all']: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination="+",".join( destinations ) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending",out," to DDM" status = pass_to_dynamo( [out], N = n_copies, sites=destinations if destinations else None, group = group_spec if group_spec else None) results.append( status ) if status == True: wfi.sendLog('closor','%s is send to dynamo in %s copies %s %s'%( out, n_copies, sorted(destinations), group_spec)) else: sendLog('closor',"could not add "+out+" to dynamo pool. check closor logs.", level='critical') wfi.sendLog('closor',"could not add "+out+" to dynamo pool. check closor logs.") else: print wfo.name,"no stats for announcing",out results.append('No Stats') if all(map(lambda result : result in ['None',None,True],results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade(url, wfo.name) if not res in ['None',None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) self.to_wm_status = wl_bis.request['RequestStatus'] if wl_bis.request['RequestStatus'] in ['announced','normal-archived']: res = None else: res = reqMgrClient.announceWorkflowCascade(url, wfo.name) results.append( res ) print results if all(map(lambda result : result in ['None',None,True],results)): if jump_the_line: if not 'announced' in wfo.status: self.to_status = wfo.status.replace('announce','announced') else: self.to_status = 'done' self.closing = True wfi.sendLog('closor',"workflow outputs are announced") else: wfi.sendLog('closor',"Error with %s to be announced \n%s"%( wfo.name, json.dumps( results ))) elif wfi.request['RequestStatus'] in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: if wfi.isRelval(): self.to_status = 'forget' self.to_wm_status = wfi.request['RequestStatus'] wfi.sendLog('closor',"%s is %s, but will not be set in trouble to find a replacement."%( wfo.name, self.to_wm_status)) else: self.to_status = 'trouble' self.to_wm_status = wfi.request['RequestStatus'] else: print wfo.name,"not good for announcing:",wfi.request['RequestStatus'] wfi.sendLog('closor',"cannot be announced") self.held.add( wfo.name )
def main(): mysqlconn = MySQLdb.connect(host='dbod-cmsrv1.cern.ch', user='******', passwd="relval", port=5506) curs = mysqlconn.cursor() curs.execute("use "+dbname+";") #curs.execute("lock tables batches write, batches_archive write, workflows write, workflows_archive write, datasets write, clone_reinsert_requests write") curs.execute("select * from batches") batches=curs.fetchall() batches_colnames = [desc[0] for desc in curs.description] for batch in batches: batch_dict= dict(zip(batches_colnames, batch)) if batch_dict["status"] != "assigned": continue userid=batch_dict["useridyear"]+"_"+batch_dict["useridmonth"]+"_"+batch_dict["useridday"]+"_"+str(batch_dict["useridnum"])+"_"+str(batch_dict["batch_version_num"]) print " userid ==> "+userid curs.execute("select workflow_name from workflows where useridyear = \""+batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+ "\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num ="+str(batch_dict["batch_version_num"])+";") wfs=curs.fetchall() n_workflows=0 n_completed=0 for wf in wfs: n_workflows=n_workflows+1 conn = httplib.HTTPSConnection(url, cert_file = os.getenv('X509_USER_PROXY'), key_file = os.getenv('X509_USER_PROXY')) r1=conn.request('GET','/reqmgr2/data/request?name='+wf[0],headers={"Accept": "application/json"}) r2=conn.getresponse() data = r2.read() if r2.status != 200: time.sleep(10) #try it again conn = httplib.HTTPSConnection(url, cert_file = os.getenv('X509_USER_PROXY'), key_file = os.getenv('X509_USER_PROXY')) r1=conn.request('GET','/reqmgr2/data/request?name='+wf[0],headers={"Accept": "application/json"}) r2=conn.getresponse() data = r2.read() if r2.status != 200: time.sleep(10) #try it a third time conn = httplib.HTTPSConnection(url, cert_file = os.getenv('X509_USER_PROXY'), key_file = os.getenv('X509_USER_PROXY')) r1=conn.request('GET','/reqmgr2/data/request?name='+wf[0],headers={"Accept": "application/json"}) r2=conn.getresponse() data = r2.read() if r2.status != 200: os.system('echo \"'+wf[0]+'\" | mail -s \"announcor.py error 1\" [email protected]') sys.exit(1) s = json.loads(data) for status in s['result'][0][wf[0]]['RequestTransition']: if status['Status'] == "completed" or status['Status'] == "force-complete": #conn = httplib.HTTPSConnection(url, cert_file = os.getenv('X509_USER_PROXY'), key_file = os.getenv('X509_USER_PROXY')) #r1=conn.request('GET',"/wmstatsserver/data/isfinished/"+wf[0],headers={"Accept": "application/json"}) #r2=conn.getresponse() #data = r2.read() #s = json.loads(data) #if s['result'][0] == "true": # n_completed=n_completed+1 n_completed=n_completed+1 break print "datetime.datetime.now() = " + str(datetime.datetime.now()) print "n_workflows = " + str(n_workflows) print "n_completed = " + str(n_completed) if n_workflows != n_completed: continue #string="2016_04_11_1_0" #if not (string.split('_')[0] == batch_dict["useridyear"] and string.split('_')[1] == batch_dict["useridmonth"] and string.split('_')[2] == batch_dict["useridday"] and string.split('_')[3] == str(batch_dict["useridnum"]) and string.split('_')[4] == str(batch_dict["batch_version_num"])): # continue wf_list = [] for wf in wfs: print wf[0] wf_list.append(wf[0]) job_failure_information=collect_job_failure_information.collect_job_failure_information(wf_list) needs_assistance = assistance_decision.assistance_decision(job_failure_information) if needs_assistance: curs.execute("update batches set status=\"assistance\", current_status_start_time=\""+datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S")+"\" where useridyear = \""+batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+ "\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num ="+str(batch_dict["batch_version_num"])+";") mysqlconn.commit() os.system('echo \"batch_id: '+userid+'\" | mail -s \"a batch of relval workflows needs assistance\" [email protected]') continue #if there is a '\r' character in the body of an e-mail, it does not get sent description=batch_dict["description"].replace('\r','') for wf in wf_list: too_many_events_check.too_many_events_check(wf) dset_nevents_list=collect_dsets_and_nevents.collect_dsets_and_nevents(wf_list) print_dsets_and_nevents.print_dsets_and_nevents(dset_nevents_list, userid+".txt") ret=os.system("cp "+userid+".txt /afs/cern.ch/user/r/relval/webpage/relval_stats/"+userid+".txt") if ret == 0: os.system("rm "+userid+".txt") else: os.system('echo \"'+userid+'\" | mail -s \"announcor.py error 2\" [email protected]') sys.exit(0) dsets_list = [] for dset_nevents in dset_nevents_list: dsets_list.append(dset_nevents[0]) for dset in dsets_list: setDatasetStatusDBS3.setStatusDBS3("https://cmsweb.cern.ch/dbs/prod/global/DBSWriter", dset, "VALID", True) for wf in wf_list: reqMgrClient.closeOutWorkflow("cmsweb.cern.ch",wf) reqMgrClient.announceWorkflow("cmsweb.cern.ch",wf) msg = MIMEMultipart() reply_to = [] #send_to = ["*****@*****.**","*****@*****.**"] send_to = ["*****@*****.**","*****@*****.**","*****@*****.**"] #send_to = ["*****@*****.**"] #msg['In-Reply-To'] = hn_message_id #msg['References'] = hn_message_id msg['From'] = "*****@*****.**" msg['reply-to'] = COMMASPACE.join(reply_to) msg['To'] = COMMASPACE.join(send_to) msg['Date'] = formatdate(localtime=True) msg['Subject'] = batch_dict["announcement_title"] msg['Message-ID'] = email.Utils.make_msgid() messageText="Dear all,\n" messageText=messageText+"\n" messageText=messageText+"A batch of relval workflows has finished.\n" messageText=messageText+"\n" messageText=messageText+"Batch ID:\n" messageText=messageText+"\n" messageText=messageText+userid+"\n" if batch_dict["batch_version_num"] > 0: messageText=messageText+"\n" messageText=messageText+"original workflow name ==> clone name:\n" messageText=messageText+"\n" curs.execute("select workflow_name,original_workflow_name from workflows where useridyear = \""+batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+ "\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num ="+str(batch_dict["batch_version_num"])+";") workflows=curs.fetchall() for workflow in workflows: messageText=messageText+workflow[1] + " ==> "+workflow[0] + "\n" messageText=messageText+"\n" messageText=messageText+"List of datasets:\n" messageText=messageText+"\n" messageText=messageText+"http://cms-project-relval.web.cern.ch/cms-project-relval/relval_stats/"+userid+".txt\n" messageText=messageText+"\n" messageText=messageText+"Description:\n" messageText=messageText+"\n" messageText=messageText+description.rstrip('\n') messageText=messageText+"\n" #messageText=messageText+"\n" [istherefailureinformation,return_string]=print_job_failure_information.print_job_failure_information(job_failure_information) if istherefailureinformation: messageText=messageText+"\n" messageText=messageText+return_string messageText=messageText+"\n" messageText=messageText+"\n" messageText=messageText+"RelVal Batch Manager" #put the announcement message into an e-mail to the relval hypernews and also in a url output_file = open("/afs/cern.ch/user/r/relval/webpage/relval_announcements/"+userid+".txt", 'w') output_file.write(messageText) try: msg.attach(MIMEText(messageText)) smtpObj = smtplib.SMTP() smtpObj.connect() smtpObj.sendmail("*****@*****.**", send_to, msg.as_string()) smtpObj.close() except Exception as e: print "Error: unable to send email: %s" %(str(e)) dsets_fnal_disk_list = [] dsets_cern_disk_list = [] for dset in dsets_list: #print dset.split('/') # we were asked to transfer some specific datasets to the cern tier 2 if dset.split('/')[3] != "RECO" and dset.split('/')[3] != "ALCARECO": dsets_cern_disk_list.append(dset) if dset.split('/')[3] == "GEN-SIM": dsets_fnal_disk_list.append(dset) if dset.split('/')[3] == "GEN-SIM-DIGI-RAW": dsets_fnal_disk_list.append(dset) if dset.split('/')[3] == "GEN-SIM-RECO": dsets_fnal_disk_list.append(dset) if "RelValTTBar" in dset.split('/')[1] and "TkAlMinBias" in dset.split('/')[2] and dset.split('/')[3] != "ALCARECO": dsets_cern_disk_list.append(dset) if "MinimumBias" in dset.split('/')[1] and "SiStripCalMinBias" in dset.split('/')[2] and dset.split('/')[3] != "ALCARECO": dsets_cern_disk_list.append(dset) result=utils.makeReplicaRequest("cmsweb.cern.ch", "T2_CH_CERN", dsets_cern_disk_list, "relval datasets",group="RelVal") if result != None: phedexid = result['phedex']['request_created'][0]['id'] utils.approveSubscription("cmsweb.cern.ch",phedexid) result=utils.makeReplicaRequest("cmsweb.cern.ch", "T1_US_FNAL_Disk", dsets_fnal_disk_list, "relval datasets", group = "RelVal") if result != None: phedexid = result['phedex']['request_created'][0]['id'] utils.approveSubscription("cmsweb.cern.ch",phedexid) result=utils.makeMoveRequest("cmsweb.cern.ch", "T0_CH_CERN_MSS", dsets_list, "relval datasets", group = "RelVal") if result != None: phedexid = result['phedex']['request_created'][0]['id'] #even if you disapprove the subscription at the source, it will still deleted the datasets that are at the source but not subscribed their utils.disapproveSubscription("cmsweb.cern.ch",phedexid,["T2_CH_CERN"]) utils.disapproveSubscription("cmsweb.cern.ch",phedexid,["T1_US_FNAL_Disk"]) utils.disapproveSubscription("cmsweb.cern.ch",phedexid,["T1_FR_CCIN2P3_Disk"]) utils.disapproveSubscription("cmsweb.cern.ch",phedexid,["T1_DE_KIT_Disk"]) utils.approveSubscription("cmsweb.cern.ch",phedexid,["T0_CH_CERN_MSS"]) #phedexid = result['phedex']['request_created'][0]['id'] #utils.approveSubscription("cmsweb.cern.ch",phedexid) curs.execute("update batches set status=\"announced\", current_status_start_time=\""+datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S")+"\" where useridyear = \""+batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+ "\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num ="+str(batch_dict["batch_version_num"])+";") mysqlconn.commit()
def transferor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() LI = lockInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0,max_to_handle - being_handled) allowed_to_transfer = max(0,max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer" else: print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) needing_locks=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status=='considered').all(): print "\t",wfo.name if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority=0 min_transfer_priority=100000000 print "getting all wf in staging ..." for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) (lheinput,primary,parent,secondary) = wfh.getIO() sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: input_sizes[prim] = dss.get( prim ) print "\t",wfo.name,"needs",input_sizes[prim],"GB" in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority print "transfers per sites" print json.dumps( transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) # shuffle first by name random.shuffle( wfs_and_wfh ) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: input_sizes[prim] = dss.get( prim ) print "... done" cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already ) print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer ) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print "%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False destination_cache = {} for (wfo,wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name,"to be transfered" #wfh = workflowInfo( url, wfo.name) (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: print "Transfer has gone over bubget." else: print "Transfer will go over bubget." print "%15.4f GB this load"%this_load print "%15.4f GB already this round"%sum(transfer_sizes.values()) print "%15.4f GB is the available limit"%transfer_limit went_over_budget=True if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget" else: if not options.go: print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop" continue ## throtlle by campaign go if not CI.go( wfh.request['Campaign'] ): print "No go for",wfh.request['Campaign'] if not options.go: sendEmail("no go for managing","No go for "+wfh.request['Campaign']) continue ## check if the batch is announced def check_mcm(wfn): announced=False is_real=False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: print "could not get mcm batch announcement, assuming not real" return announced,is_real if not use_mcm: announced,is_real = False,True else: announced,is_real = check_mcm( wfo.name ) if not announced: print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?" if not is_real: print wfo.name,"does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle else: print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along if not options.go: break if this_load and needs_transfer >= allowed_to_transfer: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_transfer else: print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"transfering, and adding",needs_transfer if not options.go: continue (lheinput,primary,parent,secondary) = wfh.getIO() for dataset in list(primary)+list(parent)+list(secondary): ## lock everything flat NLI.lock( dataset ) if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] if 'SiteBlacklist' in CI.parameters(wfh.request['Campaign']): sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfh.request['Campaign'])['SiteBlacklist'])) ## reduce right away to sites in case of memory limitation memory_allowed = SI.sitesByMemory( wfh.request['Memory'] ) if memory_allowed!=None: print "sites allowing", wfh.request['Memory'],"are",memory_allowed sites_allowed = list(set(sites_allowed) & set(memory_allowed)) if not sites_allowed: print wfo.name,"has no possible sites to run at" print "available for",wfh.request['Memory'],"are",memory_allowed sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## should make the block selection here pass if 'LumiList' in wfh.request and wfh.request['LumiList']: ## same, we could be doing the white list here too pass if blocks: print "Reading",len(blocks),"in whitelist" can_go = True staging=False allowed=True if primary: copies_needed_from_CPUh,CPUh = wfh.getNCopies() if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) sites_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] print "Sites allowed minus the vetoed transfer" print sorted(sites_allowed) copies_needed_from_site = int(0.35*len(sites_allowed))+1 ## should just go for a fixed number based if the white list grows that big print "Would make",copies_needed_from_site,"copies from site white list" copies_needed = copies_needed_from_site print "Would make",copies_needed_from_CPUh,"from cpu requirement",CPUh copies_needed = copies_needed_from_CPUh if options.maxcopy>0: ## stop maxing things out ?? #copies_needed = min(options.maxcopy,copies_needed) #print "Maxed to",copies_needed if copies_needed_from_CPUh > options.maxcopy: sendEmail('An example of more than three copies','for %s it could have been beneficial to make %s copies'%( wfo.name, copies_needed_from_CPUh)) if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign,copies_needed_from_site) print "Maxed to",copies_needed,"by campaign configuration",wfh.request['Campaign'] ## remove the sites that do not want transfers workflow_dependencies[prim].add( wfo.id ) ##################################### ###### JR 3/8/15 #### deprecating this """ presence = getDatasetPresence( url, prim , within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) prim_location = [site for site,pres in presence.items() if pres[0]==True] prim_parts = [site for site,pres in presence.items() if pres[0]==False] if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed subscriptions = listSubscriptions( url , prim , sites_allowed ) prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) ## remove the subscription where the dataset is in parts at #prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']]) and not site in prim_parts])) ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place prim_destination = [site for site in prim_destination if not site in prim_location] ## add transfer dependencies latching_on_transfers = list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) print latching_on_transfers """ ###### JR 3/8/15 #### deprecating this ##################################### ### new ways of making the whole thing destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks ) #destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='DataOps') #anaops_destinations,anaops_all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='AnalysisOps' ) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1] ## the rest is places it is going to be prim_destination = [site for site in destinations.keys() if not site in prim_location] ## need to take out the transfer veto prim_destination = [site for site in prim_destination if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] for dsite in prim_destination: needing_locks[dsite].append( prim ) if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites",prim_location continue copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names] latching_on_transfers = set() [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]): ## means there is openings let me go print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute] for site in sites_allowed: #increment accross the board, regardless of real destination: could be changed transfers_per_sites[site] += 1 else: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over transfer slots available" else: print "Not allowed to transfer more than",max_staging_per_site," per site at a time. Going overboard for",[site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site] if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0,copies_needed - min(copies_being_made)) print "then need",copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with",latching_on_transfers can_go = True continue if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks) spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site]=blocks else: spreading[site]=[prim] transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified can_go = False print "selected CE destinations",spreading.keys() for (site,items) in spreading.items(): all_transfers[site].extend( items ) if not allowed: print "Not allowed to move on with",wfo.name continue if secondary: if talk: print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) if False: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) destinations = destination_cache[sec] ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9] sec_location = [site for (site,info) in destinations.items() if info['completion']>=95] sec_destination = [site for site in destinations.keys() if not site in sec_location] else: ## old style presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] for site in sec_location: needing_locks[site].append( sec ) for site in sec_destination: needing_locks[site].append( sec ) sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len( sec_to_distribute )>0: sec_size = dss.get( sec ) for site in sec_to_distribute: site_se =SI.CE_to_SE(site) if (SI.disk[site_se]*1024.) > sec_size: all_transfers[site].append( sec ) can_go = False else: print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name,"latches on existing transfers, and nothing else" wfo.status = 'staging' needs_transfer+=1 else: print wfo.name,"should just be assigned NOW to",sites_allowed wfo.status = 'staged' passing_along+=1 print "setting status to",wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name,"latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to",wfo.status session.commit() print wfo.name,"needs a transfer" needs_transfer+=1 passing_along+=1 print "accumulated locks of dataset in place" print json.dumps(needing_locks, indent=2) for site,items in needing_locks.items(): for item in items: LI.lock( item, SI.CE_to_SE(site), 'usable input') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to",site,"(CE)",site_se,"(SE) for" else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) print "\t",len(datasets),"datasets" print "\t",datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ #for item in list(set([it.split('#')[0] for it in items_to_transfer])): for item in items_to_transfer: LI.lock( item, site_se, 'pre-staging') else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first() print phedexid,"transfer created" if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def checkor(url, spec=None, options=None): fDB = closeoutInfo() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.fetch: ## get all in running and check wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.nofetch: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign by_passes = [] holdings = [] for bypassor,email in [('jbadillo','*****@*****.**'),('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): print "no file",bypass_file continue try: by_passes.extend( json.loads(open(bypass_file).read())) except: print "cannot get by-passes from",bypass_file,"for",bypassor sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): print "no file",holding_file continue try: holdings.extend( json.loads(open(holding_file).read())) except: print "cannot get holdings from",holding_file,"for",bypassor sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) total_running_time = 5.*60. sleep_time = max(0.5, total_running_time / len(wfs)) for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) print "checking on",wfo.name ## get info wfi = workflowInfo(url, wfo.name) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out print wfo.name,"is already",wfo.wm_status wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' print wfo.name,"is in trouble",wfo.wm_status session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet print wfo.name,"not running yet" session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings: print wfo.name,"on hold" continue if wfo.name in holdings: wfo.status = 'assistance-onhold' print "setting",wfo.name,"on hold" session.commit() continue if wfo.wm_status != 'completed': ## for sure move on with closeout check if in completed print "no need to check on",wfo.name,"in status",wfo.wm_status session.commit() continue session.commit() sub_assistance="" # if that string is filled, there will be need for manual assistance is_closing = True ## do the closed-out checks one by one ## get it from somewhere by_pass_checks = False if wfo.name in by_passes: print "we can bypass checks on",wfo.name by_pass_checks = True for bypass in by_passes: if bypass in wfo.name: print "we can bypass",wfo.name,"because of keyword",bypass by_pass_checks = True break # tuck out DQMIO/DQM wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not '/DQM' in out] ## anything running on acdc familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] has_recovery_going=False had_any_recovery = False for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if member['RequestStatus'] in ['running-open','running-closed','assignment-approved','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) #print json.dumps(member,indent=2) ## hook for just waiting ... is_closing = False has_recovery_going=True elif member['RequestStatus']==None: print member['RequestName'],"is not real" pass else: acdc_inactive.append( member['RequestName'] ) had_any_recovery = True ## completion check percent_completions = {} # print "let's see who is crashing", wfo.name # print wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] fractions_pass = {} for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) fractions_pass[output] = 0.95 c = get_campaign(output, wfi) if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] print "overriding fraction to",fractions_pass[output],"for",output if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): print wfo.name,"is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? if has_recovery_going: sub_assistance+='-recovering' elif had_any_recovery: ## we want to have this looked at sub_assistance+='-manual' else: sub_assistance+='-recovery' is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = get_campaign(output, wfi) #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if any([ events_per_lumi[out] >= lumi_upper_limit[out] for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? sub_assistance+='-biglumi' is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) vetoed_custodial_tier = ['MINIAODSIM'] out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = get_campaign(output, wfi) if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" break ## get from the parent pick_custodial = True if not custodial and 'InputDataset' in wfi.request: ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( wfi.request['InputDataset']) ###parents_custodial = findCustodialLocation(url, wfi.request['InputDataset']) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",wfi.request['InputDataset'],"does not have custodial in the first place. abort" sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%wfi.request['InputDataset']) is_closing = False pick_custodial = False if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE() if custodial and ((not sub_assistance and not acdc) or by_pass_checks): ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) else: print "no file in phedex for",output," not good to add to custodial requests" else: print "cannot find a custodial for",wfo.name is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## hook for just waiting ... is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( output ) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output ) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? sub_assistance+='-duplicates' is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) if by_pass_checks: ## force closing is_closing = True ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() else: print "could not close out",wfo.name,"will try again next time" else: ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it new_status = 'assistance'+sub_assistance print wfo.name,"needs assistance with",new_status if sub_assistance and wfo.status != new_status and 'PrepID' in wfi.request and not 'manual' in wfo.status: pid = wfi.getPrepIDs()[0].replace('task_','') #pid = wfi.request['PrepID'].replace('task_','') ## notify messages= { 'recovery' : 'Samples completed with missing lumi count:\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ) ), 'biglumi' : 'Samples completed with large luminosity blocks:\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])])), 'duplicate' : 'Samples completed with duplicated luminosity blocks:\n%s'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), } text ="The request %s (%s) is facing issue in production.\n" %( pid, wfo.name ) content = "" for case in messages: if case in new_status: content+= "\n"+messages[case]+"\n" text += content text += "You are invited to check, while this is being taken care of by Ops.\n" text += "This is an automated message." if use_mcm and content: print "Sending notification back to requestor" print text batches = mcm.getA('batches',query='contains=%s&status=announced'%pid) if len(batches): ## go notify the batch bid = batches[-1]['prepid'] print "batch nofication to",bid mcm.put('/restapi/batches/notify', { "notes" : text, "prepid" : bid}) ## go notify the request print "request notification to",pid mcm.put('/restapi/requests/notify',{ "message" : text, "prepids" : [pid] }) ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else: print "current status is",wfo.status,"not changing to anything" fDB.html() ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def checkor(url, spec=None, options=None): fDB = falseDB() wfs=[] if options.fetch: #workflows = getWorkflows(url, status='completed') #for wf in workflows: # wfo = session.query(Workflow).filter(Workflow.name == wf ).first() # if wfo: # if not wfo.status in ['away','assistance']: continue # wfs.append(wfo ) wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) else: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = siteInfo() CI = campaignInfo() def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign for wfo in wfs: if spec and not (spec in wfo.name): continue ## get info wfi = workflowInfo(url, wfo.name) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out print wfo.name,"is already",wfo.wm_status wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived']: ## went into trouble wfo.status = 'trouble' print wfo.name,"is in trouble",wfo.wm_status session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet print wfo.name,"not running yet" session.commit() continue if wfo.wm_status != 'completed': ## for sure move on with closeout check if in completed print "no need to check on",wfo.name,"in status",wfo.wm_status session.commit() continue session.commit() sub_assistance="" # if that string is filled, there will be need for manual assistance is_closing = True ## do the closed-out checks one by one # tuck out DQMIO/DQM wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not '/DQM' in out] ## anything running on acdc familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] for member in familly: if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if member['RequestType'] != 'Resubmission': continue if member['RequestStatus'] in ['running-opened','running-closed','assignment-approved','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) #print json.dumps(member,indent=2) ## hook for just waiting ... is_closing = False ## completion check percent_completions = {} event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] fractions_pass = {} for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) fractions_pass[output] = 0.95 c = get_campaign(output, wfi) if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] print "overriding fraction to",fractions_pass[output],"for",output if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output if not all([percent_completions[out] > fractions_pass[out] for out in fractions_pass]): print wfo.name,"is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? sub_assistance+='-recovery' is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 300. campaign = get_campaign(output, wfi) if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if any([ events_per_lumi[out] > lumi_upper_limit[out] for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? sub_assistance+='-biglumi' is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] vetoed_custodial_tier = ['MINIAODSIM'] out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = get_campaign(output, wfi) if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" break ## get from the parent if not custodial and 'InputDataset' in wfi.request: parents_custodial = findCustodialLocation(url, wfi.request['InputDataset']) if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",wfi.request['InputDataset'],"does not have custodial in the first place. abort" continue if not custodial: ## pick one at random custodial = SI.pick_SE() if custodial and not sub_assistance and not acdc: ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): custodials[custodial].append( output ) else: print "cannot find a custodial for",wfo.name is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## hook for just waiting ... is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing: for output in wfi.request['OutputDatasets']: try: duplications[output] = dbs3Client.duplicateRunLumi( output ) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output ) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? sub_assistance+='-duplicates' is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = len(acdc) ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: reqMgrClient.closeOutWorkflowCascade(url, wfo.name) # set it from away/assistance* to close wfo.status = 'close' session.commit() else: print wfo.name,"needs assistance" ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it wfo.status = 'assistance'+sub_assistance if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() fDB.summary() ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low') print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
print json.dumps(spreading, indent=2) for site, items in spreading.items(): full_spread[site].update(items) to_lock.add(ds) addHocLocks = set(json.loads(open('addhoc_lock.json').read())) addHocLocks.update(to_lock) if act: open('addhoc_lock.json', 'w').write(json.dumps(sorted(addHocLocks), indent=2)) for l in to_lock: if act: LI.lock(l, 'add HOC') for site, items in full_spread.items(): print site se = SI.CE_to_SE(site) print se print len(items) print json.dumps(sorted(items), indent=2) if act: print makeReplicaRequest(url, se, list(set(items)), 'pre-staging', priority='high', approve=True, mail=False)
all_blocks_at_sites[site].update(blocks) print "\t", out print "\t\t", len(blocks_at_sites), "sites", sorted( blocks_at_sites.keys()), "with unsubscribed blocks" if len(all_blocks_at_sites.keys()) == 0 and len(wfs): ## no subscription to be done at this time, let me know #sendEmail('no unsubscribed blocks','while catching up %s does not need to be there anymore'%( one_status )) pass print len(all_blocks_at_sites.keys()), "sites to subscribe things at" for site, blocks in all_blocks_at_sites.items(): if 'Buffer' in site or 'Export' in site or 'MSS' in site: continue if not site in done: done[site] = [] blocks = [block for block in blocks if not block in done[site]] print "Would subscribe", len(blocks), "blocks to", site print "\tSubscribe", len(blocks), "blocks to", site done[site].extend(blocks) if blocks: print makeReplicaRequest(url, site, list(blocks), "Production blocks", priority="low", approve=True, mail=False) time.sleep(1) #open('myblock_done.json','w').write( json.dumps( done, indent=2 ))
def closor(url, specific=None, options=None): if userLock(): return if duplicateLock(): return if not componentInfo().check(): return UC = unifiedConfiguration() CI = campaignInfo() all_late_files = [] check_fullcopy_to_announce = UC.get('check_fullcopy_to_announce') jump_the_line = options.announce if options else False if jump_the_line: print "announce option is on. Checking on things on-going ready to be announced" wfs = session.query(Workflow).filter( Workflow.status.contains('announce')).filter( sqlalchemy.not_(Workflow.status.contains('announced'))).all() else: print "regular option. Checking on things done and to be announced" wfs = session.query(Workflow).filter(Workflow.status == 'close').all() wfs_n = [w.name for w in wfs] print "unique names?" print len(set(wfs_n)) == len(wfs_n) held = set() print len(wfs), "closing" random.shuffle(wfs) max_per_round = UC.get('max_per_round').get('closor', None) if options.limit: max_per_round = options.limit if max_per_round: ## order them by priority all_closedout = sorted(getWorkflows(url, 'closed-out', details=True), key=lambda r: r['RequestPriority']) all_closedout = [r['RequestName'] for r in all_closedout] def rank(wfn): return all_closedout.index(wfn) if wfn in all_closedout else 0 wfs = sorted(wfs, key=lambda wfo: rank(wfo.name), reverse=True) wfs = wfs[:max_per_round] batch_go = {} batch_warnings = defaultdict(set) batch_goodness = UC.get("batch_goodness") for iwfo, wfo in enumerate(wfs): if specific and not specific in wfo.name: continue print "Progress [%d/%d]" % (iwfo, len(wfs)) ## what is the expected #lumis wfi = workflowInfo(url, wfo.name) wfo.wm_status = wfi.request['RequestStatus'] if wfi.isRelval(): has_batch_go = False batch_name = wfi.getCampaign() if not batch_name in batch_go: ## do the esimatation whethere this can be announced : only once per batch in_batches = getWorkflowByCampaign(url, batch_name, details=True) batch_go[batch_name] = all( map( lambda s: not s in [ 'completed', 'running-open', 'running-closed', 'acquired', 'assigned', 'assignment-approved' ], [r['RequestStatus'] for r in in_batches])) ## already verified has_batch_go = batch_go[batch_name] if not has_batch_go: wfi.sendLog( 'closor', 'Cannot close for now because the batch <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?campaign=%s>%s</a> is not all close' % (batch_name, batch_name)) continue if wfi.request['RequestStatus'] in ['announced', 'normal-archived' ] and not options.force: ## manually announced ?? wfo.status = 'done' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', '%s is announced already : %s' % (wfo.name, wfo.wm_status)) session.commit() if jump_the_line: wfi.sendLog('closor', 'Announcing while completing') expected_lumis = 1 if not 'TotalInputLumis' in wfi.request: print wfo.name, "has not been assigned yet, or the database is corrupted" elif wfi.request['TotalInputLumis'] == 0: print wfo.name, "is corrupted with 0 expected lumis" else: expected_lumis = wfi.request['TotalInputLumis'] ## what are the outputs outputs = wfi.request['OutputDatasets'] ## check whether the number of lumis is as expected for each all_OK = defaultdict(lambda: False) stats = defaultdict(int) #print outputs if len(outputs): print wfo.name, wfi.request['RequestStatus'] for out in outputs: event_count, lumi_count = getDatasetEventsAndLumis(dataset=out) odb = session.query(Output).filter( Output.datasetname == out).first() if not odb: print "adding an output object", out odb = Output(datasetname=out) odb.workflow = wfo session.add(odb) odb.nlumis = lumi_count odb.nevents = event_count odb.workfow_id = wfo.id if odb.expectedlumis < expected_lumis: odb.expectedlumis = expected_lumis else: expected_lumis = odb.expectedlumis odb.date = time.mktime(time.gmtime()) session.commit() fraction = lumi_count / float(expected_lumis) * 100. completion_line = "%60s %d/%d = %3.2f%%" % ( out, lumi_count, expected_lumis, fraction) wfi.sendLog('closor', "\t%s" % completion_line) if wfi.isRelval() and fraction < batch_goodness: batch_warnings[wfi.getCampaign()].add(completion_line) stats[out] = lumi_count all_OK[out] = True ## check for at least one full copy prior to moving on in_full = {} for out in outputs: in_full[out] = [] presence = getDatasetPresence(url, out) where = [site for site, info in presence.items() if info[0]] if where: all_OK[out] = True print out, "is in full at", ",".join(where) in_full[out] = copy.deepcopy(where) else: going_to = wfi.request['NonCustodialSites'] + wfi.request[ 'CustodialSites'] wfi.sendLog( 'closor', "%s is not in full anywhere. send to %s" % (out, ",".join(sorted(going_to)))) at_destination = dict([(k, v) for (k, v) in presence.items() if k in going_to]) else_where = dict([(k, v) for (k, v) in presence.items() if not k in going_to]) print json.dumps(at_destination) print json.dumps(else_where, indent=2) ## do the full stuck transfer study, missing files and shit ! for there in going_to: late_info = findLateFiles(url, out, going_to=there) for l in late_info: l.update({"workflow": wfo.name, "dataset": out}) all_late_files.extend(late_info) if check_fullcopy_to_announce: ## only set this false if the check is relevant all_OK[out] = False ## verify if we have to do harvesting if not options.no_harvest and not jump_the_line: (OK, requests) = spawn_harvesting(url, wfi, in_full) all_OK.update(OK) ## only that status can let me go into announced if all(all_OK.values()) and ( (wfi.request['RequestStatus'] in ['closed-out']) or options.force or jump_the_line): print wfo.name, "to be announced" results = [] if not results: for out in outputs: if out in stats and not stats[out]: continue _, dsn, process_string, tier = out.split('/') if all_OK[out]: results.append(setDatasetStatus(out, 'VALID')) if all_OK[out] and wfi.isRelval(): ## make the specific relval rules and the replicas ## figure the destination(s) out destinations = set() if tier != "RECO" and tier != "ALCARECO": destinations.add('T2_CH_CERN') if tier == "GEN-SIM": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-DIGI-RAW": destinations.add('T1_US_FNAL_Disk') if tier == "GEN-SIM-RECO": destinations.add('T1_US_FNAL_Disk') if "RelValTTBar" in dsn and "TkAlMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if "MinimumBias" in dsn and "SiStripCalMinBias" in process_string and tier != "ALCARECO": destinations.add('T2_CH_CERN') if destinations: wfi.sendLog( 'closor', '%s to go to %s' % (out, ', '.join(sorted(destinations)))) ## call to makereplicarequest under relval => done for site in destinations: result = makeReplicaRequest( url, site, [out], 'Copy for release validation consumption', priority='normal', approve=True, mail=False, group='RelVal') try: request_id = result['phedex'][ 'request_created'][0]['id'] results.append(True) except: results.append('Failed relval transfer') elif all_OK[out]: campaign = None try: campaign = out.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request and wfi.request[ 'Campaign']: campaign = wfi.request['Campaign'] to_DDM = False ## campaign override if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[ campaign] and tier in CI.campaigns[campaign][ 'toDDM']: to_DDM = True ## by typical enabling if tier in UC.get("tiers_to_DDM"): to_DDM = True ## check for unitarity if not tier in UC.get("tiers_no_DDM") + UC.get( "tiers_to_DDM"): print "tier", tier, "neither TO or NO DDM for", out results.append('Not recognitized tier %s' % tier) #sendEmail("failed DDM injection","could not recognize %s for injecting in DDM"% out) sendLog( 'closor', "could not recognize %s for injecting in DDM" % out, level='critical') continue n_copies = 1 destinations = [] if to_DDM and campaign and campaign in CI.campaigns and 'DDMcopies' in CI.campaigns[ campaign]: ddm_instructions = CI.campaigns[campaign][ 'DDMcopies'] if type(ddm_instructions) == int: n_copies = CI.campaigns[campaign]['DDMcopies'] elif type(ddm_instructions) == dict: ## a more fancy configuration for ddmtier, indication in ddm_instructions.items( ): if ddmtier == tier or ddmtier in [ '*', 'all' ]: ## this is for us if 'N' in indication: n_copies = indication['N'] if 'host' in indication: destinations = indication['host'] destination_spec = "" if destinations: destination_spec = "--destination=" + ",".join( destinations) group_spec = "" ## not used yet ### should make this a campaign configuration ## inject to DDM when necessary if to_DDM: print "Sending", out, " to DDM" status = pass_to_dynamo( [out], N=n_copies, sites=destinations if destinations else None, group=group_spec if group_spec else None) results.append(status) if status == True: wfi.sendLog( 'closor', '%s is send to dynamo in %s copies %s %s' % (out, n_copies, sorted(destinations), group_spec)) else: sendLog('closor', "could not add " + out + " to dynamo pool. check closor logs.", level='critical') wfi.sendLog( 'closor', "could not add " + out + " to dynamo pool. check closor logs.") else: print wfo.name, "no stats for announcing", out results.append('No Stats') if all( map(lambda result: result in ['None', None, True], results)): if not jump_the_line: ## only announce if all previous are fine res = reqMgrClient.announceWorkflowCascade( url, wfo.name) if not res in ['None', None]: ## check the status again, it might well have toggled wl_bis = workflowInfo(url, wfo.name) wfo.wm_status = wl_bis.request['RequestStatus'] session.commit() if wl_bis.request['RequestStatus'] in [ 'announced', 'normal-archived' ]: res = None else: ## retry ? res = reqMgrClient.announceWorkflowCascade( url, wfo.name) results.append(res) #print results if all(map(lambda result: result in ['None', None, True], results)): if jump_the_line: if not 'announced' in wfo.status: wfo.status = wfo.status.replace( 'announce', 'announced') else: wfo.status = 'done' session.commit() wfi.sendLog('closor', "workflow outputs are announced") else: wfi.sendLog( 'closor', "Error with %s to be announced \n%s" % (wfo.name, json.dumps(results))) elif wfi.request['RequestStatus'] in [ 'failed', 'aborted', 'aborted-archived', 'rejected', 'rejected-archived', 'aborted-completed' ]: if wfi.isRelval(): wfo.status = 'forget' wfo.wm_status = wfi.request['RequestStatus'] wfi.sendLog( 'closor', "%s is %s, but will not be set in trouble to find a replacement." % (wfo.name, wfo.wm_status)) else: wfo.status = 'trouble' wfo.wm_status = wfi.request['RequestStatus'] session.commit() else: print wfo.name, "not good for announcing:", wfi.request[ 'RequestStatus'] wfi.sendLog('closor', "cannot be announced") held.add(wfo.name) days_late = 0. retries_late = 10 really_late_files = [ info for info in all_late_files if info['retries'] >= retries_late ] really_late_files = [ info for info in really_late_files if info['delay'] / (60 * 60 * 24.) >= days_late ] if really_late_files: subject = 'These %d files are lagging for %d days and %d retries announcing dataset \n%s' % ( len(really_late_files), days_late, retries_late, json.dumps(really_late_files, indent=2)) #sendEmail('waiting for files to announce', subject) sendLog('closor', subject, level='warning') sendLog('closor', subject) print subject open('%s/stuck_files.json' % monitor_dir, 'w').write(json.dumps(really_late_files, indent=2)) if held: sendLog('closor', "the workflows below are held up \n%s" % ("\n".join(sorted(held))), level='critical') #batches = json.loads(open('batches.json').read()) for bname, go in batch_go.items(): if go: subject = "Release Validation Samples Batch %s" % bname issues = "" if batch_warnings[bname]: issues = "The following datasets have outstanding completion (<%d%%) issues:\n\n" % batch_goodness issues += "\n".join(sorted(batch_warnings[bname])) issues += "\n\n" text = """ Dear all, a batch of release validation workflows has finished. Batch ID: %s Detail of the workflows https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?campaign=%s %s This is an automated message. """ % (bname, bname, issues) to = ['*****@*****.**'] sendEmail(subject, text, destination=to)
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock() and not options.go: return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] def time_point(label="",sub_lap=False): now = time.mktime(time.gmtime()) nows = time.asctime(time.gmtime()) print "Time check (%s) point at : %s"%(label, nows) print "Since start: %s [s]"% ( now - time_point.start) if sub_lap: print "Sub Lap : %s [s]"% ( now - time_point.sub_lap ) time_point.sub_lap = now else: print "Lap : %s [s]"% ( now - time_point.lap ) time_point.lap = now time_point.sub_lap = now time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime()) runnings = session.query(Workflow).filter(Workflow.status == 'away').all() standings = session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() ## intersect with what is actually in completed status in request manager now all_completed = set(getWorkflows(url, 'completed' )) wfs=[] if options.strict: ## the one which were running and now have completed print "strict option is on: checking workflows that freshly completed" wfs.extend( filter(lambda wfo: wfo.name in all_completed , runnings)) if options.update: print "update option is on: checking workflows that have not completed yet" wfs.extend( filter(lambda wfo: not wfo.name in all_completed , runnings)) if options.clear: print "clear option is on: checking workflows that are ready to toggle closed-out" wfs.extend( filter(lambda wfo: 'custodial' in wfo.status, standings)) if options.review: print "review option is on: checking the workflows that needed intervention" wfs.extend( filter(lambda wfo: not 'custodial' in wfo.status, standings)) ## what is left out are the wf which were running and ended up aborted/failed/... custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) if use_mcm else None def get_campaign(output, wfi): ## this should be a perfect matching of output->task->campaign campaign = None era = None wf_campaign = None if 'Campaign' in wfi.request: wf_campaign = wfi.request['Campaign'] try: era = output.split('/')[2].split('-')[0] except: era = None if wfi.isRelval(): campaign = wf_campaign else: campaign = era if era else wf_campaign return campaign ## retrieve bypass and onhold configuration bypasses = [] forcings = [] overrides = getForceCompletes() holdings = [] actors = UC.get('allowed_bypass') for bypassor,email in actors: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: extending = json.loads(open(holding_file).read()) print bypassor,"is holding",extending holdings.extend( extending ) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in actors: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: forcings = mcm.get('/restapi/requests/forcecomplete') #if forcings: # sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings))) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) in_manual = 0 ## now you have a record of what file was invalidated globally from TT TMDB_invalid = dataCache.get('file_invalidation') #try: # TMDB_invalid = set([row[3] for row in csv.reader( os.popen('curl -s "https://docs.google.com/spreadsheets/d/11fFsDOTLTtRcI4Q3gXw0GNj4ZS8IoXMoQDC3CbOo_2o/export?format=csv"'))]) # TMDB_invalid = map(lambda e : e.split(':')[-1], TMDB_invalid) # print len(TMDB_invalid),"globally invalidated files" #except Exception as e: # print "TMDB not fetched" # print str(e) # TMDB_invalid = [] print len(wfs),"to consider, pausing for",sleep_time max_per_round = UC.get('max_per_round').get('checkor',None) if options.limit: max_per_round=options.limit if max_per_round and not spec: wfs = wfs[:max_per_round] for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) time_point("Starting with %s"% wfo.name) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break pids = wfi.getPrepIDs() force_by_mcm = False force_by_user = False for force in forcings: if force in pids: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force)) bypass_checks = True force_by_mcm = True break for user in overrides: for force in overrides[user]: if force in wfo.name: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user)) bypass_checks = True force_by_user = True break tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco to_ddm_tier = copy.deepcopy(UC.get('tiers_to_DDM')) campaigns = {} ## this mapping of campaign per output dataset assumes era==campaing, which is not true for relval expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] ) for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] forced_already=False acdc_bads = [] true_familly = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if member['PrepID'] != wfi.request['PrepID'] : continue #if 'OriginalRequestName' in member and (not 'ACDC' in member['OriginalRequestName']) and member['OriginalRequestName'] != wfo.name: continue if member['RequestStatus'] == None: continue if not set(member['OutputDatasets']).issubset( set(expected_outputs)): if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']: ##this is not good at all wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] ) #sendLog('checkor','inconsistent ACDC %s'%member['RequestName'], level='critical') acdc_bads.append( member['RequestName'] ) is_closing = False assistance_tags.add('manual') continue true_familly.append( member['RequestName'] ) #try: # parse_one(url, member['RequestName']) #except: # print "Could not make error report for",member['RequestName'] if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') if (force_by_mcm or force_by_user) and not forced_already: wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name) wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False) forceComplete(url, wfi) forced_already=True else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') if acdc_bads: #sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) )) sendLog('checkor','For %s, ACDC %s is inconsistent, preventing from closing or will create a mess.'%( wfo.name, ','.join(acdc_bads) ), level='critical') time_point("checked workflow familly", sub_lap=True) ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = wfi.request['Task1']['RequestNumEvents'] for i in range(1,20): if 'Task%d'%i in wfi.request: ## this is wrong ibsolute if 'FilterEfficiency' in wfi.request['Task%d'%i]: event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency']) event_expected = int(event_expected) fractions_pass = {} events_per_lumi = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False time_point("execpted statistics", sub_lap=True) for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) events_per_lumi[output] = event_count/float(lumi_count) if lumi_count else 100 percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) default_pass = UC.get('default_fraction_pass') fractions_pass[output] = default_pass c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: if type(CI.campaigns[c]['fractionpass']) == dict: tier = output.split('/')[-1] priority = str(wfi.request['RequestPriority']) ## defined per tier fractions_pass[output] = CI.campaigns[c]['fractionpass'].get('all', default_pass) if tier in CI.campaigns[c]['fractionpass']: fractions_pass[output] = CI.campaigns[c]['fractionpass'][tier] if priority in CI.campaigns[c]['fractionpass']: fractions_pass[output] = CI.campaigns[c]['fractionpass'][priority] else: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): possible_recoveries = wfi.getRecoveryDoc() if possible_recoveries == []: wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) sendLog('checkor','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name, level='critical') #sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)#,destination=['*****@*****.**']) ## do not bypass for now, until Alan understands why we are loosing ACDC docs bypass_checks = True else: wfi.sendLog('checkor','%s is not completed \n%s \n%s'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) ## hook for creating automatically ACDC ? if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False time_point("checked output size", sub_lap=True) ## correct lumi < 300 event per lumi #for output in wfi.request['OutputDatasets']: #events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = campaigns[output] if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi','ReReco']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) time_point("checked dataset presence", sub_lap=True) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] time_point("checked custodiality", sub_lap=True) ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) time_point("checked phedex count", sub_lap=True) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs size_worht_going_to_ddm = sum([getDatasetSize(out)/1023. for out in out_worth_checking if out.split('/')[-1] in to_ddm_tier ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" group = None if campaign in CI.campaigns and 'phedex_group' in CI.campaigns[campaign]: group = CI.campaigns[campaign]['phedex_group'] print "using group",group,"for replica" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') tape_size_limit = options.tape_size_limit if options.tape_size_limit else UC.get("tape_size_limit") _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if custodial and size_worht_going_to_ddm > tape_size_limit: print wfi.sendLog('checkor',"The total output size (%s TB) is too large for the limit set (%s TB)"%( size_worth_checking, tape_size_limit)) custodial = None if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: wfi.sendLog('checkor','Using %s as a tape destination for %s'%(custodial, output)) custodials[custodial].append( output ) if group: custodials[custodial][-1]+='@%s'%group ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False time_point("determined tape location", sub_lap=True) ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) time_point("dbs file count", sub_lap=True) if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: mismatch_notice = wfo.name+" has a dbs,phedex mismatch\n" mismatch_notice += "in dbs\n"+json.dumps(dbs_presence, indent=2) +"\n" mismatch_notice += "invalide in dbs\n"+json.dumps(dbs_invalid, indent=2) +"\n" mismatch_notice += "in phedex\n"+json.dumps(phedex_presence, indent=2) +"\n" wfi.sendLog('checkor',mismatch_notice) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex), "\n".join( missing_phedex ))) were_invalidated = sorted(set(missing_phedex) & set(TMDB_invalid )) if were_invalidated: wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated), "\n".join(were_invalidated))) sendLog('checkor',"These %d files were invalidated globally\n%s\nand are invalidated in dbs"%(len(were_invalidated), "\n".join(were_invalidated)), level='critical') dbs3Client.setFileStatus( were_invalidated, newstatus=0 ) if missing_dbs: wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs), "\n".join( missing_dbs ))) were_invalidated = sorted(set(missing_dbs) & set(TMDB_invalid )) if were_invalidated: wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated), "\n".join(were_invalidated))) #if not bypass_checks: ## I don't think we can by pass this is_closing = False time_point("checked file count", sub_lap=True) fraction_invalid = 0.20 if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignoreinvalid: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} files_per_rl = {} for output in wfi.request['OutputDatasets']: duplications[output] = "skiped" files_per_rl[output] = "skiped" time_point("checked invalidation", sub_lap=True) if (is_closing or bypass_checks) and (not options.ignoreduplicates): print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True) except: try: duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True) except Exception as e: wfi.sendLog('checkor','Not possible to check on duplicate lumi count on %s'%(output)) sendLog('checkor','Not possible to check on duplicate lumi count on %s\n%s'%(output,str(e)),level='critical') is_closing=False if is_closing and any(duplications.values()) and not options.ignoreduplicates: duplicate_notice = "" duplicate_notice += "%s has duplicates\n"%wfo.name duplicate_notice += json.dumps( duplications,indent=2) duplicate_notice += '\n' duplicate_notice += json.dumps( files_per_rl, indent=2) wfi.sendLog('checkor',duplicate_notice) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False time_point("checked duplicates", sub_lap=True) time_point("done with %s"%wfo.name) ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] #rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['percentage'] = math.floor(percent_completions[output]*10000)/100.## round down rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) rec['familly'] = true_familly now = time.gmtime() rec['timestamp'] = time.mktime(now) rec['updated'] = time.asctime(now)+' (GMT)' ## make the lumi summary if wfi.request['RequestType'] == 'ReReco': try: os.system('python Unified/lumi_summary.py %s 1 > /dev/null'%(wfi.request['PrepID'])) os.system('python Unified/lumi_plot.py %s > /dev/null'%(wfi.request['PrepID'])) wfi.sendLog('checkor','Lumi summary available at %s/datalumi/lumi.%s.html'%(unified_url,wfi.request['PrepID'])) except Exception as e: print str(e) ## make the error report ## and move on if is_closing: ## toggle status to closed-out in request manager wfi.sendLog('checkor',"setting %s closed-out"% wfo.name) if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and force_by_mcm: ## shoot large on all prepids, on closing the wf for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: if not 'custodial' in assistance_tags or wfi.isRelval(): ## do only the report for those for member in acdc+acdc_inactive+[wfo.name]: try: parse_one(url, member) except: print "Could not make error report for",member ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that had ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') in_manual += 1 if 'recovery' in assistance_tags and 'manual' in assistance_tags: ## this is likely because something bad is happening, so leave it to manual assistance_tags = assistance_tags - set(['recovery']) assistance_tags.add('manual') in_manual += 1 ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) ###detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' #detailslink = 'https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s'%(wfo.name) ###perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) perflink = '%s/report/%s'%(unified_url,wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: wfi.sendLog('checkor','setting %s to %s'%(wfo.name, wfo.status)) session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec and in_manual!=0: sendEmail("fresh assistance status available","Fresh status are available at %s/assistance.html"%unified_url,destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: items_at = defaultdict(set) for i in custodials[site]: item, group = i.split('@') if '@' in i else (i,'DataOps') items_at[group].add( item ) for group,items in items_at.items(): print ','.join(items),'=>',site,'@',group if not options.test: result = makeReplicaRequest(url, site, sorted(items) ,"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) , group=group) print result print "File Invalidation" print invalidations
def checkor(url, spec=None, options=None): fDB = closeoutInfo() if userLock(): return if duplicateLock(): return UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=["mcm"]) if not up.check(): return use_mcm = up.status["mcm"] wfs = [] if options.fetch: ## get all in running and check wfs.extend(session.query(Workflow).filter(Workflow.status == "away").all()) wfs.extend(session.query(Workflow).filter(Workflow.status == "assistance").all()) if options.nofetch: ## than get all in need for assistance wfs.extend(session.query(Workflow).filter(Workflow.status.startswith("assistance-")).all()) custodials = defaultdict(list) # sites : dataset list transfers = defaultdict(list) # sites : dataset list invalidations = [] # a list of files SI = global_SI CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split("/")[2].split("-")[0] except: if "Campaign" in wfi.request: campaign = wfi.request["Campaign"] return campaign by_passes = [] holdings = [] for bypassor, email in [ ("jbadillo", "*****@*****.**"), ("vlimant", "*****@*****.**"), ("jen_a", "*****@*****.**"), ]: bypass_file = "/afs/cern.ch/user/%s/%s/public/ops/bypass.json" % (bypassor[0], bypassor) if not os.path.isfile(bypass_file): print "no file", bypass_file continue try: by_passes.extend(json.loads(open(bypass_file).read())) except: print "cannot get by-passes from", bypass_file, "for", bypassor sendEmail("malformated by-pass information", "%s is not json readable" % (bypass_file), destination=[email]) holding_file = "/afs/cern.ch/user/%s/%s/public/ops/onhold.json" % (bypassor[0], bypassor) if not os.path.isfile(holding_file): print "no file", holding_file continue try: holdings.extend(json.loads(open(holding_file).read())) except: print "cannot get holdings from", holding_file, "for", bypassor sendEmail( "malformated by-pass information", "%s is not json readable" % (holding_file), destination=[email] ) total_running_time = 5.0 * 60.0 sleep_time = max(0.5, total_running_time / len(wfs)) for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep(sleep_time) print "checking on", wfo.name ## get info wfi = workflowInfo(url, wfo.name) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request["RequestStatus"] if wfo.wm_status == "closed-out": ## manually closed-out print wfo.name, "is already", wfo.wm_status wfo.status = "close" session.commit() continue elif wfo.wm_status in [ "failed", "aborted", "aborted-archived", "rejected", "rejected-archived", "aborted-completed", ]: ## went into trouble wfo.status = "trouble" print wfo.name, "is in trouble", wfo.wm_status session.commit() continue elif wfo.wm_status in ["assigned", "acquired"]: ## not worth checking yet print wfo.name, "not running yet" session.commit() continue if "-onhold" in wfo.status: if wfo.name in holdings and wfo.name not in by_passes: print wfo.name, "on hold" continue if wfo.name in holdings and wfo.name not in by_passes: wfo.status = "assistance-onhold" print "setting", wfo.name, "on hold" session.commit() continue if wfo.wm_status != "completed" and not wfo.name in by_passes: ## for sure move on with closeout check if in completed print "no need to check on", wfo.name, "in status", wfo.wm_status session.commit() continue session.commit() sub_assistance = "" # if that string is filled, there will be need for manual assistance is_closing = True ## get it from somewhere by_pass_checks = False if wfo.name in by_passes: print "we can bypass checks on", wfo.name by_pass_checks = True for bypass in by_passes: if bypass in wfo.name: print "we can bypass", wfo.name, "because of keyword", bypass by_pass_checks = True break if not CI.go(wfi.request["Campaign"]) and not by_pass_checks: print "No go for", wfo.name continue # tuck out DQMIO/DQM wfi.request["OutputDatasets"] = [out for out in wfi.request["OutputDatasets"] if not "/DQM" in out] ## anything running on acdc familly = getWorkflowById(url, wfi.request["PrepID"], details=True) acdc = [] acdc_inactive = [] has_recovery_going = False had_any_recovery = False for member in familly: if member["RequestType"] != "Resubmission": continue if member["RequestName"] == wfo.name: continue if member["RequestDate"] < wfi.request["RequestDate"]: continue if member["RequestStatus"] in [ "running-open", "running-closed", "assignment-approved", "assigned", "acquired", ]: print wfo.name, "still has an ACDC running", member["RequestName"] acdc.append(member["RequestName"]) # print json.dumps(member,indent=2) ## hook for just waiting ... is_closing = False has_recovery_going = True elif member["RequestStatus"] == None: print member["RequestName"], "is not real" pass else: acdc_inactive.append(member["RequestName"]) had_any_recovery = True ## completion check percent_completions = {} # print "let's see who is crashing", wfo.name # print wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if not "TotalInputEvents" in wfi.request: event_expected, lumi_expected = 0, 0 if not "recovery" in wfo.status: sendEmail( "missing member of the request", "TotalInputEvents is missing from the workload of %s" % wfo.name, destination=["*****@*****.**"], ) else: event_expected, lumi_expected = wfi.request["TotalInputEvents"], wfi.request["TotalInputLumis"] if "RequestNumEvents" in wfi.request: event_expected = int(wfi.request["RequestNumEvents"]) elif "Task1" in wfi.request and "RequestNumEvents" in wfi.request["Task1"]: event_expected = int(wfi.request["Task1"]["RequestNumEvents"]) fractions_pass = {} for output in wfi.request["OutputDatasets"]: event_count, lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0.0 if lumi_expected: percent_completions[output] = lumi_count / float(lumi_expected) if event_expected: percent_completions[output] = max(percent_completions[output], event_count / float(event_expected)) fractions_pass[output] = 0.95 c = get_campaign(output, wfi) if c in CI.campaigns and "fractionpass" in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]["fractionpass"] print "overriding fraction to", fractions_pass[output], "for", output if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to", fractions_pass[output], "by command line for", output if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): print wfo.name, "is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? if has_recovery_going: sub_assistance += "-recovering" elif had_any_recovery: ## we want to have this looked at sub_assistance += "-manual" else: sub_assistance += "-recovery" is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request["OutputDatasets"]: events_per_lumi[output] = getDatasetEventsPerLumi(output) lumi_upper_limit = {} for output in wfi.request["OutputDatasets"]: upper_limit = 301.0 campaign = get_campaign(output, wfi) # if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and "lumisize" in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]["lumisize"] print "overriding the upper lumi size to", upper_limit, "for", campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to", upper_limit, "by command line" lumi_upper_limit[output] = upper_limit if any([events_per_lumi[out] >= lumi_upper_limit[out] for out in events_per_lumi]): print wfo.name, "has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? sub_assistance += "-biglumi" is_closing = False any_presence = {} for output in wfi.request["OutputDatasets"]: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request["OutputDatasets"]: custodial_presences[output] = [s for s in any_presence[output] if "MSS" in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence = {} for output in wfi.request["OutputDatasets"]: phedex_presence[output] = phedexClient.getFileCountDataset(url, output) vetoed_custodial_tier = UC.get("tiers_with_no_custodial") out_worth_checking = [ out for out in custodial_locations.keys() if out.split("/")[-1] not in vetoed_custodial_tier ] size_worth_checking = sum( [getDatasetSize(out) / 1023.0 for out in out_worth_checking] ) ## size in TBs of all outputs if not all(map(lambda sites: len(sites) != 0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name, "has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:", custodial, "because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = get_campaign(output, wfi) if campaign in CI.campaigns and "custodial" in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]["custodial"] print "Setting custodial to", custodial, "from campaign configuration" break if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the unified configuration custodial:", custodial, "because of limited space" custodial = None ## get from the parent pick_custodial = True if not custodial and "InputDataset" in wfi.request: ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite(wfi.request["InputDataset"]) ###parents_custodial = findCustodialLocation(url, wfi.request['InputDataset']) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset", wfi.request[ "InputDataset" ], "does not have custodial in the first place. abort" sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor" % wfi.request["InputDataset"], ) is_closing = False pick_custodial = False if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:", custodial, "because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for", wfo.name sendEmail( "cannot find a custodial", "cannot find a custodial for %s probably because of the total output size %d" % (wfo.name, size_worth_checking), ) if custodial and ((not sub_assistance and not acdc) or by_pass_checks): ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output] >= 1: custodials[custodial].append(output) else: print "no file in phedex for", output, " not good to add to custodial requests" is_closing = False ## disk copy disk_copies = {} for output in wfi.request["OutputDatasets"]: disk_copies[output] = [s for s in any_presence[output] if (not "MSS" in s) and (not "Buffer" in s)] if not all(map(lambda sites: len(sites) != 0, disk_copies.values())): print wfo.name, "has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request["OutputDatasets"]: dbs_presence[output] = dbs3Client.getFileCountDataset(output) dbs_invalid[output] = dbs3Client.getFileCountDataset(output, onlyInvalid=True) fraction_invalid = 0.01 if ( not all( [ dbs_presence[out] == (dbs_invalid[out] + phedex_presence[out]) for out in wfi.request["OutputDatasets"] ] ) and not options.ignorefiles ): print wfo.name, "has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## hook for just waiting ... is_closing = False if ( not all( [ (dbs_invalid[out] <= int(fraction_invalid * dbs_presence[out])) for out in wfi.request["OutputDatasets"] ] ) and not options.ignorefiles ): print wfo.name, "has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye sub_assistance += "-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing: print "starting duplicate checker for", wfo.name for output in wfi.request["OutputDatasets"]: print "\tchecking", output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi(output) except: try: duplications[output] = dbs3Client.duplicateRunLumi(output) except: print "was not possible to get the duplicate count for", output is_closing = False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name, "has duplicates" print json.dumps(duplications, indent=2) ## hook for making file invalidation ? sub_assistance += "-duplicates" is_closing = False ## for visualization later on if not wfo.name in fDB.record: # print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = {"datasets": {}, "name": wfo.name, "closeOutWorkflow": None} fDB.record[wfo.name]["closeOutWorkflow"] = is_closing for output in wfi.request["OutputDatasets"]: if not output in fDB.record[wfo.name]["datasets"]: fDB.record[wfo.name]["datasets"][output] = {} rec = fDB.record[wfo.name]["datasets"][output] rec["percentage"] = float("%.2f" % (percent_completions[output] * 100)) rec["duplicate"] = duplications[output] if output in duplications else "N/A" rec["phedexReqs"] = ( float("%.2f" % any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output]) != 0 else "N/A" ) rec["closeOutDataset"] = is_closing rec["transPerc"] = ( float("%.2f" % any_presence[output][disk_copies[output][0]][1]) if len(disk_copies[output]) != 0 else "N/A" ) rec["correctLumis"] = ( int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True ) rec["missingSubs"] = ( False if len(custodial_locations[output]) == 0 else ",".join(list(set(custodial_locations[output]))) ) rec["dbsFiles"] = dbs_presence[output] rec["dbsInvFiles"] = dbs_invalid[output] rec["phedexFiles"] = phedex_presence[output] rec["acdc"] = "%d / %d" % (len(acdc), len(acdc + acdc_inactive)) if by_pass_checks: ## force closing is_closing = True ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting", wfo.name, "closed-out" if not options.test: if wfo.wm_status in ["closed-out", "announced", "normal-archived"]: print wfo.name, "is already", wfo.wm_status, "not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer", res if not res in ["None", None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None, "None"]: wfo.status = "close" session.commit() else: print "could not close out", wfo.name, "will try again next time" else: ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it new_status = "assistance" + sub_assistance print wfo.name, "needs assistance with", new_status if sub_assistance and wfo.status != new_status and "PrepID" in wfi.request and not "manual" in wfo.status: pid = wfi.getPrepIDs()[0].replace("task_", "") # pid = wfi.request['PrepID'].replace('task_','') ## notify messages = { "recovery": "Samples completed with missing statistics:\n%s " % ( "\n".join( [ "%.2f %% complete for %s" % (percent_completions[output] * 100, output) for output in wfi.request["OutputDatasets"] ] ) ), "biglumi": "Samples completed with large luminosity blocks:\n%s " % ( "\n".join( [ "%d > %d for %s" % (events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request["OutputDatasets"] if (events_per_lumi[output] > lumi_upper_limit[output]) ] ) ), "duplicate": "Samples completed with duplicated luminosity blocks:\n%s" % ( "\n".join( [ "%s" % output for output in wfi.request["OutputDatasets"] if output in duplications and duplications[output] ] ) ), } text = "The request %s (%s) is facing issue in production.\n" % (pid, wfo.name) content = "" for case in messages: if case in new_status: content += "\n" + messages[case] + "\n" text += content text += "You are invited to check, while this is being taken care of by Ops.\n" text += "This is an automated message." if use_mcm and content: print "Sending notification back to requestor" print text batches = mcm.getA("batches", query="contains=%s&status=announced" % pid) if len(batches): ## go notify the batch bid = batches[-1]["prepid"] print "batch nofication to", bid mcm.put("/restapi/batches/notify", {"notes": text, "prepid": bid}) ## go notify the request print "request notification to", pid mcm.put("/restapi/requests/notify", {"message": text, "prepids": [pid]}) ## case where the workflow was in manual from recoveror if not "manual" in wfo.status or new_status != "assistance-recovery": wfo.status = new_status if not options.test: print "setting", wfo.name, "to", wfo.status session.commit() else: print "current status is", wfo.status, "not changing to anything" fDB.html() ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ",".join(custodials[site]), "=>", site if not options.test: result = makeReplicaRequest( url, site, list(set(custodials[site])), "custodial copy at production close-out", custodial="y", priority="low", approve=(site in SI.sites_auto_approve), ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ",".join(transfers[site]), "=>", site if not options.test: result = None # result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def transferor(url, specific=None, talk=True, options=None): if userLock(): return mlock = moduleLock() if mlock(): return use_mcm = True up = componentInfo(soft=['mcm', 'wtc', 'jira']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() #NLI = newLockInfo() #if not NLI.free(): return LI = lockInfo() if not LI.free(): return mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_transfered = len( session.query(Workflow).filter(Workflow.status == 'staging').all()) #being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance-')).filter( ~Workflow.status.contains('custodial')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0, max_to_handle - being_handled) allowed_to_transfer = max(0, max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer" else: print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer" print "... done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] max_per_round = UC.get('max_per_round').get('transferor', None) print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) all_to_include = session.query(Workflow).filter( Workflow.status.startswith('considered')).all() if len(cache) > 2000: max_to_include = max_per_round random.shuffle(cache) ## randomize first by wf name cache = sorted(cache, key=lambda r: r['RequestPriority'], reverse=True) ## order by prio highest = [r['RequestName'] for r in cache[:max_to_include]] all_to_include = [wfo for wfo in all_to_include if wfo.name in highest] print "limiting what to consider to", max_to_include, "because there is too much stuff going on. Got", len( all_to_include) for wfo in all_to_include: print "\t", wfo.name if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" transfers_per_sites = defaultdict(int) input_sizes = defaultdict(float) ignored_input_sizes = defaultdict(float) input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority = None min_transfer_priority = None print "getting all wf in staging ..." #stucks = json.loads(open('%s/stuck_transfers.json'%monitor_pub_dir).read()) stucks = json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)) for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) print wfo.name, "staging" (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() blocks = wfh.getBlocks() for prim in primary: ds_s = dss.get(prim, blocks=blocks) if prim in stucks: wfh.sendLog( 'transferor', "%s appears stuck, so not counting it %s [GB]" % (prim, ds_s)) ignored_input_sizes[prim] = max(ds_s, ignored_input_sizes[prim]) else: input_sizes[prim] = max(ds_s, input_sizes[prim]) wfh.sendLog('transferor', "%s needs %s [GB]" % (wfo.name, ds_s)) if in_transfer_priority == None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort(key=lambda i: i[1]) print "\n".join(map(str, ignored_values)) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort(key=lambda i: i[1]) print "\n".join(map(str, considered_values)) except Exception as e: print "trying to print the summary of input size" print str(e) print "... done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority print "transfers per sites" print json.dumps(transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." input_blocks = {} for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() blocks = wfh.getBlocks() input_blocks[wfo.name] = blocks for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get(prim, blocks=blocks) input_sizes[prim] = max(prim_size, input_sizes[prim]) primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle(wfs_and_wfh) # Sort smallest transfers first; allows us to transfer as many as possible workflows. def prio_and_size(i, j): if int(i[1].request['RequestPriority']) == int( j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0))) else: return cmp(int(i[1].request['RequestPriority']), int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) if min_transfer_priority == None or in_transfer_priority == None: print "nothing is lining up for transfer" sendLog( "transferor", "No request in staging, using first request to set priority limit") if len(wfs_and_wfh): min_transfer_priority = wfs_and_wfh[0][1].request[ 'RequestPriority'] in_transfer_priority = wfs_and_wfh[0][1].request['RequestPriority'] else: return cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer" % ( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load" % ( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer" % ( st_in_transfer_already) print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % ( st_to_transfer) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB is the available limit" % transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = defaultdict(float) went_over_budget = False destination_cache = {} no_goes = set() if max_per_round and not spec: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo, wfh) in wfs_and_wfh: print wfo.name, "to be transfered with priority", wfh.request[ 'RequestPriority'] if wfh.request['RequestStatus'] != 'assignment-approved': if wfh.request['RequestStatus'] in [ 'aborted', 'rejected', 'rejected-archived', 'aborted-archived' ]: if wfh.isRelval(): wfo.status = 'forget' else: wfo.status = 'trouble' ## so that we look or a replacement else: wfo.status = 'away' wfh.sendLog( 'transferor', '%s in status %s, setting %s' % (wfo.name, wfh.request['RequestStatus'], wfo.status)) continue (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() blocks = input_blocks.get(wfo.name, wfh.getBlocks()) if blocks: print "Reading only", len(blocks), "blocks in input" this_load = sum([dss.get(prim, blocks=blocks) for prim in primary]) no_budget = False if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over bubget.") else: wfh.sendLog('transferor', "Transfer will go over bubget.") wfh.sendLog( 'transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit" % (this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority != None and min_transfer_priority != None: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over budget" % (wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog( 'transferor', "%s minimum priority %s < %s : stop" % (min_transfer_priority, wfh.request['RequestPriority'], in_transfer_priority)) no_budget = True ## throtlle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add(wfo.name) allowed_secondary = {} overide_parameters = {} check_secondary = (not wfh.isRelval()) output_tiers = list( set([o.split('/')[-1] for o in wfh.request['OutputDatasets']])) for campaign in wfh.getCampaigns(): if campaign in CI.campaigns: overide_parameters.update(CI.campaigns[campaign]) if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: if CI.campaigns[campaign]['secondaries']: allowed_secondary.update( CI.campaigns[campaign]['secondaries']) check_secondary = True if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[ campaign]: banned_tier = list( set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers)) if banned_tier: no_go = True wfh.sendLog( 'transferor', 'These data tiers %s are not allowed' % (','.join(banned_tier))) sendLog('transferor', 'These data tiers %s are not allowed in %s' % (','.join(banned_tier), wfo.name), level='critical') if secondary and check_secondary: if (set(secondary) & set(allowed_secondary.keys()) != set(secondary)): msg = '%s is not an allowed secondary' % ( ', '.join(set(secondary) - set(allowed_secondary.keys()))) wfh.sendLog('transferor', msg) critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format( wfh.getPrepIDs()[0]) sendLog('transferor', critical_msg, level='critical') if not options.go: no_go = True for sec in secondary: if sec in allowed_secondary: overide_parameters.update(allowed_secondary[sec]) if 'SiteWhitelist' in overide_parameters: sites_allowed = list( set(sites_allowed) & set(overide_parameters['SiteWhitelist'])) wfh.sendLog( 'transferor', 'Intersecting with the overriding whitelist parameters, allowed sites become {}' .format(sites_allowed)) if no_go: continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog( 'transferor', " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s" % (max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_transfer)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s" % (max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue # break ## try this for a while to make things faster ## the site white list considers site, campaign, memory and core information if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary) + list(parent) + list(secondary): LI.lock(dataset, reason='staging') if not sites_allowed: wfh.sendLog('transferor', "not possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor', "%s has no possible sites to run at" % (wfo.name), level='critical') continue can_go = True staging = False allowed = True primary_destinations = set() if primary: copies_needed_from_CPUh, CPUh = wfh.getNCopies() if talk: print wfo.name, 'reads', ', '.join(primary), 'in primary' ## chope the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add(wfo.id) max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority'])) wfh.sendLog( 'transferor', "Would make %s from cpu requirement %s" % (copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog( 'transferor', "Maxed to %s by campaign configuration %s" % (copies_needed, wfh.request['Campaign'])) if blocks: print "limiting to blocks", "\n".join(sorted(blocks)) ### new ways of making the whole thing destinations, all_block_names = getDatasetDestinations( url, prim, within_sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [ site for (site, info) in destinations.items() if info['completion'] == 100 and info['data_fraction'] == 1 ] ## the rest is places it is going to be #prim_destination = [site for site in destinations.keys() if not site in prim_location] prim_destination = [ site for (site, info) in destinations.items() if info['data_fraction'] == 1 and info['completion'] != 100 ] ## veto the site with no current disk space, for things that are not relval prim_destination = [ site for site in prim_destination if (SI.disk[site] or wfh.isRelval()) ] if len(prim_location) >= copies_needed: wfh.sendLog( 'transferor', "The input is all fully in place at %s sites %s" % (len(prim_location), sorted(prim_location))) continue copies_needed = max(0, copies_needed - len(prim_location)) wfh.sendLog( 'transferor', "Counting existing copies ; now need %s" % copies_needed) copies_being_made = [ sum([ info['blocks'].keys().count(block) for site, info in destinations.items() if site in prim_destination ]) for block in all_block_names ] latching_on_transfers = set() [ latching_on_transfers.update(info['blocks'].values()) for site, info in destinations.items() if site in prim_destination ] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] prim_to_distribute = [ site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination ] ## take out the ones that cannot receive transfers potential_destinations = len(prim_to_distribute) #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer] prim_to_distribute = [ site for site in prim_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] ## do we want to restrict transfers if the amount of site in vetoe are too large ? wfh.sendLog( 'transferor', "Could be going to: %s" % sorted(prim_to_distribute)) if not prim_to_distribute or any([ transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute ]): ## means there is openings let me go print "There are transfer slots available:", [ (site, transfers_per_sites[site]) for site in prim_to_distribute ] else: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over transfer slots available" % (wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s per site at a time. Going overboard for %s" % (max_staging_per_site, sorted([ site for site in prim_to_distribute if transfers_per_sites[site] >= max_staging_per_site ]))) if not options.go: allowed = False break for latching in latching_on_transfers: existings = session.query(TransferImp).filter( TransferImp.phedexid == int(latching)).filter( TransferImp.workflow_id == wfo.id).all() if not existings: tri = TransferImp(phedexid=int(latching), workflow=wfo) print "adding", wfo.id, "with phedexid", latching session.add(tri) else: for existing in existings: existing.active = True session.flush() can_go = False transfer_sizes[prim] = max(this_load, transfer_sizes[prim]) staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0, copies_needed - min(copies_being_made)) wfh.sendLog( 'transferor', "Counting the copies being made ; then need %s" % copies_needed) if copies_needed == 0: wfh.sendLog( 'transferor', "The input is either fully in place or getting in full somewhere with %s" % latching_on_transfers) can_go = True continue elif len(prim_to_distribute) == 0: wfh.sendLog( 'transferor', "We are going to need extra copies of %s, but no destinations seems available" % (prim)) sendLog( 'transferor', "We are going to need extra copies of %s, but no destinations seems available" % (prim), level='critical') print json.dumps(prim_to_distribute, indent=2) print json.dumps(prim_location, indent=2) print json.dumps(prim_destination, indent=2) prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer ] prim_to_distribute = [ site for site in prim_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] print "changed to" print json.dumps(prim_to_distribute, indent=2) if len( prim_to_distribute ) > 0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk andpoint ? #tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops, sizes = getDatasetChops( prim, chop_threshold=options.chopsize, only_blocks=blocks) spreading = distributeToSites(chops, prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges, sizes=sizes) ## prune the blocks/destination that are already in the making, so that subscription don't overlap for site in spreading: for block in list(spreading[site]): if site in destinations and block in destinations[ site]['blocks'].keys(): ## prune it spreading[site].remove(block) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog( 'transferor', 'cannot send %s to any site, it cannot fit anywhere' % prim, level='critical') wfh.sendLog( 'transferor', "cannot send to any site. %s cannot seem to fit anywhere" % (prim)) staging = False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site] = blocks else: spreading[site] = [prim] transfer_sizes[prim] = max(this_load, transfer_sizes[prim]) can_go = False wfh.sendLog( 'transferor', "selected CE destinations %s" % (sorted(spreading.keys()))) for (site, items) in spreading.items(): all_transfers[site].extend(items) transfers_per_sites[site] += 1 primary_destinations.add(site) else: can_go = False allowed = False if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[ wfh.request['Campaign']]['SecondaryLocation'] if 'SecondaryLocation' in overide_parameters: override_sec_destination = overide_parameters[ 'SecondaryLocation'] print wfo.name, 'reads', ', '.join(secondary), 'in secondary' for sec in secondary: workflow_dependencies[sec].add(wfo.id) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec], _ = getDatasetDestinations( url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = set( [SI.CE_to_SE(site) for site in sites_allowed]) destinations = dict([ (k, v) for (k, v) in destination_cache[sec].items() if k in se_allowed ]) ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [ destinations.pop(site) for (site, info) in destinations.items() if info['data_fraction'] < 0.9 ] print sec, json.dumps(destinations, indent=2) sec_location = [ site for (site, info) in destinations.items() if info['completion'] >= 95 ] sec_destination = [ site for site in destinations.keys() if not site in sec_location ] ## this is in SE else: ## old style presence = getDatasetPresence(url, sec) sec_location = [ site for site, pres in presence.items() if pres[1] > 90. ] ## more than 90% of the minbias at sites subscriptions = listSubscriptions(url, sec) sec_destination = [site for site in subscriptions] ## how to make unified understand that it has to wait for the secondary if the sec_destination and #sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in sec_location ] #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [ site for site in sec_to_distribute if not SI.CE_to_SE(site) in sec_destination ] presitespace_sec_to_distribute = copy.deepcopy( sec_to_distribute) #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] #sec_to_distribute = [site for site in sec_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer] sec_to_distribute = [ site for site in sec_to_distribute if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval()) ] ## at this point you have a problem if len(sec_to_distribute) == 0 and len( presitespace_sec_to_distribute): sendLog( 'transferor', '%s is getting no possible destinations because of lack of space. To be decided what to do in general' % (sec), level='critical') if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list( set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog( 'transferor', "the dataset %s could be removed from %s" % (sec, not_needed_anymore)) sec_to_distribute = list( set(sec_to_distribute) & set(override_sec_destination)) if len(sec_to_distribute) > 0: print "secondary could go to", sorted(sec_to_distribute) sec_size = dss.get(sec) for site in sec_to_distribute: site_se = SI.CE_to_SE(site) if (SI.disk[site_se] * 1024.) > sec_size or wfh.isRelval(): wfh.sendLog('transferor', 'Sending %s to %s' % (sec, site)) all_transfers[site].append(sec) can_go = False else: print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[ site_se] * 1024, "GB need", sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog( 'transferor', '%s is too big (%s) for %s (%s). %s will not be able to run there.' % (sec, sec_size, site_se, SI.disk[site_se] * 1024, wfo.name), level='critical') wfh.sendLog( 'transferor', '%s is too big (%s) for %s (%s). will not be able to run there.' % (sec, sec_size, site_se, SI.disk[site_se] * 1024)) else: ## this is bas overall print "the secondary input does not have to be send to site" ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog( 'transferor', "latches on existing transfers, and nothing else, settin staging" ) wfo.status = 'staging' needs_transfer += 1 else: wfh.sendLog( 'transferor', "should just be assigned now to %s" % sorted(sites_allowed)) wfo.status = 'staged' passing_along += 1 wfh.sendLog('transferor', "setting %s status to %s" % (wfo.name, wfo.status)) #session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog( 'transferor', "setting %s status to %s" % (wfo.name, wfo.status)) #session.commit() wfh.sendLog('transferor', "needs a transfer") needs_transfer += 1 passing_along += 1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n" + "\n".join(sorted(no_goes)), level='critical') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id = -1 wf_id_in_prestaging = set() for (site, items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets #if site in SI.sites_veto_transfer: # print site,"does not want transfers" # continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for" % (site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [ block for block in blocks if not block.split('#')[0] in datasets ] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks' % len(blocks) details_text += '\n\t%d needed blocks for %s' % ( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets' % len(datasets) details_text += '\n\t%s' % sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to", site, "(CE)", site_se, "(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y', 'yes', 'go']: continue transfered_items = defaultdict(set) if execute: priority = 'normal' cds = [ ds for ds in set(datasets + block_datasets) if ds in max_priority ] ## bucketize the transfers by priority of workflows prioritized_items = defaultdict(set) for item in items_to_transfer: d = item.split('#')[0] p = max_priority.get(d, 80000) q = 'normal' if p > 100000: q = 'reserved' elif p < 70000: q = 'low' prioritized_items[q].add(item) for priority, items in prioritized_items.items(): result = makeReplicaRequest(url, site_se, list(items), 'prestaging', priority=priority, approve=True) if result: these_transfers = [ o['id'] for o in result['phedex']['request_created'] ] #phedexids.extend( these_transfers ) for ph in these_transfers: transfered_items[ph].update(items) else: sendLog( 'transferor', 'Could not make a replica request for items %s to site %s' % (items, site_se), level='critical') #result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority, approve=True) #phedexids = [o['id'] for o in result['phedex']['request_created']]: #else: # #result= {'phedex':{'request_created' : []}} # phedexids = [] # fake_id-=1 if not transfered_items: sendLog( 'transferor', 'Could not make a replica request for items %s to site %s' % (items_to_transfer, site), level='critical') continue for phedexid, items in transfered_items.items(): print phedexid, "transfer created" for transfering in list( set(map(lambda it: it.split('#')[0], items))): for wfid in workflow_dependencies[transfering]: new_transfer = session.query(TransferImp).filter( TransferImp.phedexid == int(phedexid)).filter( TransferImp.workflow_id == wfid).first() if not new_transfer: new_transfer = TransferImp( phedexid=phedexid, workflow=session.query(Workflow).get(wfid)) session.add(new_transfer) else: new_transfer.active = True wf_id_in_prestaging.add(wfid) #session.commit() for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status != 'staging': if execute: tr_wf.status = 'staging' if talk: print "setting", tr_wf.name, "to staging" #session.commit() ## one big session commit at the end that everything went fine session.commit()
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock(): return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.new: ## get all in running and check ## you want to intersect with what is completed ! if options.strict: completed_wfi = getWorkflows(url, status='completed') for wfo in session.query(Workflow).filter(Workflow.status == 'away').all(): if wfo.name in completed_wfi: wfs.append( wfo ) else: print wfo.name,"is not completed" sendLog('checkor','%s is not completed'%( wfo.name)) else: wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) if options.current: ## recheck those already there, probably to just pass them along wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.old: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = global_SI CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign ## retrieve bypass and onhold configuration bypasses = [] holdings = [] #try: # already_notified = json.loads(open('already_notifified.json').read()) #except: # print "no record of already notified workflow. starting fresh" # already_notified = [] for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: holdings.extend( json.loads(open(holding_file).read())) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: mcm_force = mcm.get('/restapi/requests/forcecomplete') bypasses.extend( mcm_force ) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) print len(wfs),"to consider, pausing for",sleep_time for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False pids = wfi.getPrepIDs() bypass_by_mcm = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break if bypass in pids: wfi.sendLog('checkor',"we can bypass checks on %s because of prepid %s "%( wfo.name, bypass)) bypass_checks = True bypass_by_mcm = True break #if not CI.go( wfi.request['Campaign'] ) and not bypass_checks: # print "No go for",wfo.name # wfi.sendLog('checkor',"No go for %s"%wfi.request['Campaign']) # continue tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco campaigns = {} for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') elif member['RequestStatus']==None: print member['RequestName'],"is not real" pass else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = int(wfi.request['Task1']['RequestNumEvents']) fractions_pass = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) fractions_pass[output] = 0.95 c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): print wfo.name,"is not completed" print json.dumps(percent_completions, indent=2) print json.dumps(fractions_pass, indent=2) ## hook for creating automatically ACDC ? if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = campaigns[output] #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: print "These %d files are missing in phedex"%(len(missing_phedex)) print "\n".join( missing_phedex ) if missing_dbs: print "These %d files are missing in dbs"%(len(missing_dbs)) print "\n".join( missing_dbs ) #if not bypass_checks: ## I don't think we can by pass this is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing or bypass_checks: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and bypass_by_mcm: ## shoot large on all prepids for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that add ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec: #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
def main(): mysqlconn = MySQLdb.connect(host='dbod-cmsrv1.cern.ch', user='******', passwd="relval", port=5506) #conn = MySQLdb.connect(host='localhost', user='******', passwd='relval') curs = mysqlconn.cursor() curs.execute("use " + dbname + ";") #curs.execute("lock tables batches write, batches_archive write, workflows write, workflows_archive write, datasets write, clone_reinsert_requests write") #workflow = line.rstrip('\n') #curs.execute("insert into workflows set hn_req=\""+hnrequest+"\", workflow_name=\""+workflow+"\";") curs.execute("select * from batches") batches = curs.fetchall() batches_colnames = [desc[0] for desc in curs.description] for batch in batches: #if batch[0] != 21: # continue blocks_dsets_to_transfer = [] blocks_not_at_site = [] batch_dict = dict(zip(batches_colnames, batch)) site = batch_dict["site"] if "T2" in site: site_disk = site elif "T1" in site: site_disk = site + "_Disk" else: os.system( 'echo ' + site + ' | mail -s \"input_dset_checkor.py error 1\" [email protected]' ) print "Neither T1 nor T2 is in site name, exiting" sys.exit(1) if site == "T2_CH_CERN_T0": site_disk = "T2_CH_CERN" if site == "T2_CH_CERN_AI": site_disk = "T2_CH_CERN" #print batch #print "" userid = batch_dict["useridyear"] + "_" + batch_dict[ "useridmonth"] + "_" + batch_dict["useridday"] + "_" + str( batch_dict["useridnum"]) + "_" + str( batch_dict["batch_version_num"]) #if status == "waiting_for_transfer" and count % 10 == 0: if batch_dict["status"] == "waiting_for_transfer": print " userid ==> " + str(userid) #count = 0 all_dsets_blocks_at_site = True curs.execute( "select workflow_name from workflows where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num = " + str(batch_dict["batch_version_num"]) + ";") wfs = curs.fetchall() for wf in wfs: print wf[0] headers = { "Content-type": "application/json", "Accept": "application/json" } conn = httplib.HTTPSConnection( 'cmsweb.cern.ch', cert_file=os.getenv('X509_USER_PROXY'), key_file=os.getenv('X509_USER_PROXY')) r1 = conn.request("GET", '/reqmgr2/data/request/' + wf[0], headers=headers) r2 = conn.getresponse() schema = json.loads(r2.read()) schema = schema['result'][0][wf[0]] isthereanmcpileupdataset = False for key, value in schema.items(): if type(value) is dict and key.startswith("Task"): if 'MCPileup' in value: isthereanmcpileupdataset = True ismcpileupdatasetatsite = utils.checkIfDatasetIsSubscribedToASite( "cmsweb.cern.ch", value["MCPileup"], site_disk) if 'InputDataset' in value: inputdset = value['InputDataset'] if 'RunWhitelist' in value: runwhitelist = value['RunWhitelist'] blocks_fname = os.popen( "mktemp").read().rstrip("\n") list_of_blocks = utils.getListOfBlocks( inputdset, str(runwhitelist)) for block in list_of_blocks: #this block (/DoubleMu/...) is not registered in phedex, so it cannot be subscribed to any site if block == "/DoubleMu/Run2011A-ZMu-08Nov2011-v1/RAW-RECO#93c53d22-25b2-11e1-8c62-003048f02c8a": continue isblockatsite = utils.checkIfBlockIsAtASite( "cmsweb.cern.ch", block, site_disk) if not isblockatsite: all_dsets_blocks_at_site = False else: isdatasetatsite = utils.checkIfDatasetIsSubscribedToASite( "cmsweb.cern.ch", inputdset, site_disk) if not isdatasetatsite: all_dsets_blocks_at_site = False if all_dsets_blocks_at_site and (not isthereanmcpileupdataset or ismcpileupdatasetatsite): curs.execute( "update batches set status=\"input_dsets_ready\", current_status_start_time=\"" + datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S") + "\" where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num = " + str(batch_dict["batch_version_num"]) + ";") mysqlconn.commit() if batch_dict["status"] == "approved": print " userid ==> " + str(userid) #print "checking input datasets for workflows in batch "+str(batchid) curs.execute( "select workflow_name from workflows where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num = " + str(batch_dict["batch_version_num"]) + ";") wfs = curs.fetchall() for wf in wfs: print wf[0] headers = { "Content-type": "application/json", "Accept": "application/json" } conn = httplib.HTTPSConnection( 'cmsweb.cern.ch', cert_file=os.getenv('X509_USER_PROXY'), key_file=os.getenv('X509_USER_PROXY')) r1 = conn.request("GET", '/reqmgr2/data/request/' + wf[0], headers=headers) r2 = conn.getresponse() schema = json.loads(r2.read()) schema = schema['result'][0][wf[0]] for key, value in schema.items(): if type(value) is dict and key.startswith("Task"): if 'MCPileup' in value: isdatasetatsite = utils.checkIfDatasetIsSubscribedToASite( "cmsweb.cern.ch", value['MCPileup'], site_disk) if not isdatasetatsite: blocks_dsets_to_transfer.append( value['MCPileup']) if 'InputDataset' in value: subscribed_to_disk = False inputdset = value['InputDataset'] if 'RunWhitelist' in value: runwhitelist = value['RunWhitelist'] list_of_blocks = utils.getListOfBlocks( inputdset, str(runwhitelist)) for block in list_of_blocks: #this block (/DoubleMu/...) is not registered in phedex, so it cannot be subscribed to any site if block == "/DoubleMu/Run2011A-ZMu-08Nov2011-v1/RAW-RECO#93c53d22-25b2-11e1-8c62-003048f02c8a": continue isblocksubscribedtosite = utils.checkIfBlockIsSubscribedToASite( "cmsweb.cern.ch", block, site_disk) isblockatsite = utils.checkIfBlockIsAtASite( "cmsweb.cern.ch", block, site_disk) if not isblocksubscribedtosite: blocks_dsets_to_transfer.append(block) if not isblockatsite: blocks_not_at_site.append(block) else: isdatasetsubscribedtosite = utils.checkIfDatasetIsSubscribedToASite( "cmsweb.cern.ch", inputdset, site_disk) isdatasetatsite = utils.checkIfDatasetIsSubscribedToASite( "cmsweb.cern.ch", inputdset, site_disk) if not isdatasetsubscribedtosite: blocks_dsets_to_transfer.append(inputdset) if not isdatasetatsite: blocks_not_at_site.append(inputdset) if blocks_dsets_to_transfer != []: print "transfering the following blocks:" print blocks_dsets_to_transfer result = utils.makeReplicaRequest( url="cmsweb.cern.ch", site=site_disk, datasets=blocks_dsets_to_transfer, comments="relval datasets", group="RelVal") phedexid = result['phedex']['request_created'][0]['id'] utils.approveSubscription("cmsweb.cern.ch", phedexid) curs.execute( "update batches set status=\"waiting_for_transfer\", current_status_start_time=\"" + datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S") + "\" where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num = " + str(batch_dict["batch_version_num"]) + ";") mysqlconn.commit() elif blocks_not_at_site != []: print blocks_not_at_site curs.execute( "update batches set status=\"waiting_for_transfer\", current_status_start_time=\"" + datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S") + "\" where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num = " + str(batch_dict["batch_version_num"]) + ";") else: curs.execute( "update batches set status=\"input_dsets_ready\", current_status_start_time=\"" + datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S") + "\" where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num = " + str(batch_dict["batch_version_num"]) + ";") mysqlconn.commit()
def checkor(url, spec=None, options=None): if userLock(): return if duplicateLock(): return fDB = closeoutInfo() UC = unifiedConfiguration() use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] wfs=[] if options.new: ## get all in running and check ## you want to intersect with what is completed ! if options.strict: completed_wfi = getWorkflows(url, status='completed') for wfo in session.query(Workflow).filter(Workflow.status == 'away').all(): if wfo.name in completed_wfi: wfs.append( wfo ) else: print wfo.name,"is not completed" sendLog('checkor','%s is not completed'%( wfo.name)) else: wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() ) if options.current: ## recheck those already there, probably to just pass them along wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() ) if options.old: ## than get all in need for assistance wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() ) custodials = defaultdict(list) #sites : dataset list transfers = defaultdict(list) #sites : dataset list invalidations = [] #a list of files SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) def get_campaign(output, wfi): campaign = None try: campaign = output.split('/')[2].split('-')[0] except: if 'Campaign' in wfi.request: campaign = wfi.request['Campaign'] return campaign ## retrieve bypass and onhold configuration bypasses = [] forcings = [] overrides = getForceCompletes() holdings = [] for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('prozober','*****@*****.**')]: bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor) if not os.path.isfile(bypass_file): #sendLog('checkor','no file %s',bypass_file) continue try: bypasses.extend( json.loads(open(bypass_file).read())) except: sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email]) holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor) if not os.path.isfile(holding_file): #sendLog('checkor',"no file %s"%holding_file) continue try: holdings.extend( json.loads(open(holding_file).read())) except: sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor)) sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email]) ## once this was force-completed, you want to bypass for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]: rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider) if not os.path.isfile(rider_file): print "no file",rider_file #sendLog('checkor',"no file %s"%rider_file) continue try: bypasses.extend( json.loads(open( rider_file ).read() ) ) except: sendLog('checkor',"cannot get force complete list from %s"%rider) sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email]) if use_mcm: forcings = mcm.get('/restapi/requests/forcecomplete') if forcings: sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings))) pattern_fraction_pass = UC.get('pattern_fraction_pass') total_running_time = 5.*60. sleep_time = 1 if len(wfs): sleep_time = min(max(0.5, total_running_time / len(wfs)), 10) random.shuffle( wfs ) print len(wfs),"to consider, pausing for",sleep_time max_per_round = UC.get('max_per_round').get('checkor',None) if max_per_round and not spec: wfs = wfs[:max_per_round] for wfo in wfs: if spec and not (spec in wfo.name): continue time.sleep( sleep_time ) ## get info wfi = workflowInfo(url, wfo.name) wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status)) ## make sure the wm status is up to date. # and send things back/forward if necessary. wfo.wm_status = wfi.request['RequestStatus'] if wfo.wm_status == 'closed-out': ## manually closed-out wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status)) wfo.status = 'close' session.commit() continue elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']: ## went into trouble wfo.status = 'trouble' wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status)) session.commit() continue elif wfo.wm_status in ['assigned','acquired']: ## not worth checking yet wfi.sendLog('checkor',"%s is not running yet"%wfo.name) session.commit() continue if '-onhold' in wfo.status: if wfo.name in holdings and wfo.name not in bypasses: wfi.sendLog('checkor',"%s is on hold"%wfo.name) continue if wfo.wm_status != 'completed': #and not wfo.name in bypasses: ## for sure move on with closeout check if in completed wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status)) session.commit() continue if wfo.name in holdings and wfo.name not in bypasses: wfo.status = 'assistance-onhold' wfi.sendLog('checkor',"setting %s on hold"%wfo.name) session.commit() continue session.commit() #sub_assistance="" # if that string is filled, there will be need for manual assistance existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance assistance_tags = set() is_closing = True ## get it from somewhere bypass_checks = False for bypass in bypasses: if bypass in wfo.name: wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass)) bypass_checks = True break pids = wfi.getPrepIDs() force_by_mcm = False force_by_user = False for force in forcings: if force in pids: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force)) bypass_checks = True force_by_mcm = True break for user in overrides: for force in overrides[user]: if force in wfo.name: wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user)) bypass_checks = True force_by_user = True break tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm* vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco campaigns = {} expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] ) for out in wfi.request['OutputDatasets']: c = get_campaign(out, wfi) campaigns[out] = c if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]: vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override'])) ## add those that we need to check for custodial copy tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] )) wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])] check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] )) check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check )) check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) ) wfi.sendLog('checkor', check_output_text ) ## anything running on acdc : getting the real prepid is not worth it familly = getWorkflowById(url, wfi.request['PrepID'], details=True) acdc = [] acdc_inactive = [] forced_already=False acdc_bads = [] for member in familly: if member['RequestType'] != 'Resubmission': continue if member['RequestName'] == wfo.name: continue if member['RequestDate'] < wfi.request['RequestDate']: continue if 'OriginalRequestName' in member and member['OriginalRequestName'] != wfo.name: continue if member['RequestStatus'] == None: continue if not set(member['OutputDatasets']).issubset( set(expected_outputs)): if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']: ##this is not good at all wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] ) acdc_bads.append( member['RequestName'] ) is_closing = False assistance_tags.add('manual') continue if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']: print wfo.name,"still has an ACDC running",member['RequestName'] acdc.append( member['RequestName'] ) ## cannot be bypassed! is_closing = False assistance_tags.add('recovering') if (force_by_mcm or force_by_user) and not forced_already: wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name) wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False) forceComplete(url, wfi) forced_already=True else: acdc_inactive.append( member['RequestName'] ) assistance_tags.add('recovered') if acdc_bads: sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) )) ## completion check percent_completions = {} if not 'TotalInputEvents' in wfi.request: event_expected,lumi_expected = 0,0 if not 'recovery' in wfo.status: #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**']) sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical') else: event_expected,lumi_expected = wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis'] if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']): event_expected = int(wfi.request['RequestNumEvents']) elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']: event_expected = wfi.request['Task1']['RequestNumEvents'] for i in range(1,20): if 'Task%d'%i in wfi.request: ## this is wrong ibsolute if 'FilterEfficiency' in wfi.request['Task%d'%i]: event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency']) event_expected = int(event_expected) fractions_pass = {} over_100_pass = False (lhe,prim,_,_) = wfi.getIO() if lhe or prim: over_100_pass = False for output in wfi.request['OutputDatasets']: event_count,lumi_count = getDatasetEventsAndLumis(dataset=output) percent_completions[output] = 0. if lumi_expected: percent_completions[output] = lumi_count / float( lumi_expected ) if event_expected: wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected )) percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) ) fractions_pass[output] = 0.95 c = campaigns[output] if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]: fractions_pass[output] = CI.campaigns[c]['fractionpass'] wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output)) if options.fractionpass: fractions_pass[output] = options.fractionpass print "overriding fraction to",fractions_pass[output],"by command line for",output for key in pattern_fraction_pass: if key in output: fractions_pass[output] = pattern_fraction_pass[key] print "overriding fraction to",fractions_pass[output],"by dataset key",key if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]): possible_recoveries = wfi.getRecoveryDoc() if possible_recoveries == []: wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to annoucement'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name) bypass_checks = True else: wfi.sendLog('checkor','%s is not completed \n%s \n%s'%( wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) )) ## hook for creating automatically ACDC ? if not bypass_checks: assistance_tags.add('recovery') is_closing = False if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]): print wfo.name,"is over completed" print json.dumps(percent_completions, indent=2) if not bypass_checks: assistance_tags.add('over100') is_closing = False ## correct lumi < 300 event per lumi events_per_lumi = {} for output in wfi.request['OutputDatasets']: events_per_lumi[output] = getDatasetEventsPerLumi( output ) lumi_upper_limit = {} for output in wfi.request['OutputDatasets']: upper_limit = 301. campaign = campaigns[output] #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request: # upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency'] # print "setting the upper limit of lumisize to",upper_limit,"by request configuration" if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]: upper_limit = CI.campaigns[campaign]['lumisize'] print "overriding the upper lumi size to",upper_limit,"for",campaign if options.lumisize: upper_limit = options.lumisize print "overriding the upper lumi size to",upper_limit,"by command line" lumi_upper_limit[output] = upper_limit if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1 if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]): print wfo.name,"has big lumisections" print json.dumps(events_per_lumi, indent=2) ## hook for rejecting the request ? if not bypass_checks: assistance_tags.add('biglumi') is_closing = False any_presence = {} for output in wfi.request['OutputDatasets']: any_presence[output] = getDatasetPresence(url, output, vetoes=[]) ## custodial copy custodial_locations = {} custodial_presences = {} for output in wfi.request['OutputDatasets']: custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s] custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output) if not custodial_locations[output]: custodial_locations[output] = [] ## presence in phedex phedex_presence ={} for output in wfi.request['OutputDatasets']: phedex_presence[output] = phedexClient.getFileCountDataset(url, output ) out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier] size_worth_checking = sum([getDatasetSize(out)/1023. for out in out_worth_checking ]) ## size in TBs of all outputs if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])): print wfo.name,"has not all custodial location" print json.dumps(custodial_locations, indent=2) ########## ## hook for making a custodial replica ? custodial = None ## get from other outputs for output in out_worth_checking: if len(custodial_locations[output]): custodial = custodial_locations[output][0] if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the other output custodial:",custodial,"because of limited space" custodial = None ## try to get it from campaign configuration if not custodial: for output in out_worth_checking: campaign = campaigns[output] if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]: custodial = CI.campaigns[campaign]['custodial'] print "Setting custodial to",custodial,"from campaign configuration" if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the campaign configuration custodial:",custodial,"because of limited space" custodial = None ## get from the parent pick_custodial = True use_parent_custodial = UC.get('use_parent_custodial') _,prim,_,_ = wfi.getIO() if not custodial and prim and use_parent_custodial: parent_dataset = prim.pop() ## this is terribly dangerous to assume only parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset ) ###parents_custodial = findCustodialLocation(url, parent_dataset) if not parents_custodial: parents_custodial = [] if len(parents_custodial): custodial = parents_custodial[0] else: print "the input dataset",parent_dataset,"does not have custodial in the first place. abort" #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset) ## does not work for RAWOADSIM sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset) ## cannot be bypassed, this is an issue to fix is_closing = False pick_custodial = False assistance_tags.add('parentcustodial') if custodial and float(SI.storage[custodial]) < size_worth_checking: print "cannot use the parent custodial:",custodial,"because of limited space" custodial = None if not custodial and pick_custodial: ## pick one at random custodial = SI.pick_SE(size=size_worth_checking) if not custodial: print "cannot find a custodial for",wfo.name wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking)) sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical') if custodial and (is_closing or bypass_checks): print "picked",custodial,"for tape copy" ## remember how much you added this round already ; this stays locally SI.storage[custodial] -= size_worth_checking ## register the custodial request, if there are no other big issues for output in out_worth_checking: if not len(custodial_locations[output]): if phedex_presence[output]>=1: custodials[custodial].append( output ) ## let's wait and see if that's needed assistance_tags.add('custodial') else: print "no file in phedex for",output," not good to add to custodial requests" #cannot be bypassed is_closing = False ## disk copy disk_copies = {} for output in wfi.request['OutputDatasets']: disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)] if not all(map( lambda sites : len(sites)!=0, disk_copies.values())): print wfo.name,"has not all output on disk" print json.dumps(disk_copies, indent=2) ## presence in dbs dbs_presence = {} dbs_invalid = {} for output in wfi.request['OutputDatasets']: dbs_presence[output] = dbs3Client.getFileCountDataset( output ) dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True) fraction_invalid = 0.01 if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs,phedex mismatch" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) if not 'recovering' in assistance_tags: assistance_tags.add('filemismatch') #print this for show and tell if no recovery on-going for out in dbs_presence: _,_,missing_phedex,missing_dbs = getDatasetFiles(url, out) if missing_phedex: wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex), "\n".join( missing_phedex ))) if missing_dbs: wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs), "\n".join( missing_dbs ))) #if not bypass_checks: ## I don't think we can by pass this is_closing = False if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles: print wfo.name,"has a dbs invalid file level too high" print json.dumps(dbs_presence, indent=2) print json.dumps(dbs_invalid, indent=2) print json.dumps(phedex_presence, indent=2) ## need to be going and taking an eye assistance_tags.add('invalidfiles') if not bypass_checks: #sub_assistance+="-invalidfiles" is_closing = False ## put that heavy part at the end ## duplication check duplications = {} if is_closing or bypass_checks: print "starting duplicate checker for",wfo.name for output in wfi.request['OutputDatasets']: print "\tchecking",output duplications[output] = True try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: try: duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True) except: print "was not possible to get the duplicate count for",output is_closing=False if any(duplications.values()) and not options.ignoreduplicates: print wfo.name,"has duplicates" print json.dumps(duplications,indent=2) ## hook for making file invalidation ? ## it shouldn't be allowed to bypass it assistance_tags.add('duplicates') is_closing = False ## for visualization later on if not wfo.name in fDB.record: #print "adding",wfo.name,"to close out record" fDB.record[wfo.name] = { 'datasets' :{}, 'name' : wfo.name, 'closeOutWorkflow' : None, } fDB.record[wfo.name]['closeOutWorkflow'] = is_closing fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority'] fDB.record[wfo.name]['prepid'] = wfi.request['PrepID'] for output in wfi.request['OutputDatasets']: if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {} rec = fDB.record[wfo.name]['datasets'][output] rec['percentage'] = float('%.2f'%(percent_completions[output]*100)) rec['duplicate'] = duplications[output] if output in duplications else 'N/A' rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A' rec['closeOutDataset'] = is_closing rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A' rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output]))) rec['dbsFiles'] = dbs_presence[output] rec['dbsInvFiles'] = dbs_invalid[output] rec['phedexFiles'] = phedex_presence[output] rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive)) now = time.gmtime() rec['timestamp'] = time.mktime(now) rec['updated'] = time.asctime(now)+' (GMT)' ## and move on if is_closing: ## toggle status to closed-out in request manager print "setting",wfo.name,"closed-out" if not options.test: if wfo.wm_status in ['closed-out','announced','normal-archived']: print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does" res = None else: res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) print "close out answer",res if not res in ["None",None]: print "try to get the current status again" wfi_bis = workflowInfo(url, wfo.name) if wfi_bis.request['RequestStatus'] == 'closed-out': print "the request did toggle to closed-out" res = None if not res in ["None",None]: print "retrying to closing out" print res res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name) if res in [None,"None"]: wfo.status = 'close' session.commit() if use_mcm and force_by_mcm: ## shoot large on all prepids, on closing the wf for pid in pids: mcm.delete('/restapi/requests/forcecomplete/%s'%pid) else: print "could not close out",wfo.name,"will try again next time" else: ## full known list #recovering # has active ACDC ##OUT #recovered #had inactive ACDC #recovery #not over the pass bar #over100 # over 100% #biglumi # has a big lumiblock #parentcustodial # the parent does not have a valid subscription yet #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear #filemismatch # there is a dbs/phedex mismatch #duplicates #a lumi section is there twice ## manual is not added yet, and should be so by recoveror print wfo.name,"was tagged with :",list(assistance_tags) if 'recovering' in assistance_tags: ## if active ACDC, being under threshold, filemismatch do not matter assistance_tags = assistance_tags - set(['recovery','filemismatch']) if 'recovery' in assistance_tags and 'recovered' in assistance_tags: ## should not set -recovery to anything that add ACDC already assistance_tags = assistance_tags - set(['recovery','recovered']) ## straight to manual assistance_tags.add('manual') ## that means there is something that needs to be done acdc, lumi invalidation, custodial, name it print wfo.name,"needs assistance with",",".join( assistance_tags ) print wfo.name,"existing conditions",",".join( existing_assistance_tags ) ######################################### ##### notification to requester ######### go_notify=False if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags: go_notify=True if go_notify: #if wfo.name in already_notified: # print "double notification" # sendEmail('double notification','please take a look at %s'%(wfo.name)) #else: # already_notified.append( wfo.name ) detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s' perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name) splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name) ## notify templates messages= { 'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ), 'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink), 'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ), 'filemismatch': 'Samples completed with inconsistency in DBS/Phedex', #'manual' : 'Workflow completed and requires manual checks by Ops', } content = "The request PREPID (WORKFLOW) is facing issue in production.\n" motive = False for case in messages: if case in assistance_tags: content+= "\n"+messages[case]+"\n" motive = True content += "You are invited to check, while this is being taken care of by Comp-Ops.\n" content += "This is an automated message from Comp-Ops.\n" items_notified = set() if use_mcm and motive: wfi.notifyRequestor( content , mcm = mcm) ######################################### ## logic to set the status further if assistance_tags: new_status = 'assistance-'+'-'.join(sorted(assistance_tags) ) else: new_status = 'assistance' ## case where the workflow was in manual from recoveror if not 'manual' in wfo.status or new_status!='assistance-recovery': wfo.status = new_status if not options.test: print "setting",wfo.name,"to",wfo.status session.commit() else: print "current status is",wfo.status,"not changing to anything" #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2)) fDB.html() if not spec: #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**']) #it's a bit annoying pass ## custodial requests print "Custodials" print json.dumps(custodials, indent=2) for site in custodials: print ','.join(custodials[site]),'=>',site if not options.test: result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) ) print result print "Transfers" print json.dumps(transfers, indent=2) ## replicas requests for site in transfers: print ','.join(transfers[site]),'=>',site if not options.test: result = None #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out") print result print "File Invalidation" print invalidations
for out in outs: blocks_at_sites = getDatasetBlockAndSite(url, out, group="") deletions = getDatasetOnGoingDeletion(url, out) if len(deletions): print "\t\tshould not subscribe with on-going deletions",out continue for site,blocks in blocks_at_sites.items(): if 'Buffer' in site or 'Export' in site or 'MSS' in site: continue all_blocks_at_sites[site].update( blocks ) print "\t",out print "\t\t",len(blocks_at_sites),"sites",sorted(blocks_at_sites.keys()),"with unsubscribed blocks" if len(all_blocks_at_sites.keys())==0 and len(wfs): ## no subscription to be done at this time, let me know sendEmail('no unsubscribed blocks','while catching up %s does not need to be there anymore'%( one_status )) print len(all_blocks_at_sites.keys()),"sites to subscribe things at" for site,blocks in all_blocks_at_sites.items(): if 'Buffer' in site or 'Export' in site or 'MSS' in site: continue if not site in done: done[site] = [] blocks = [block for block in blocks if not block in done[site]] print "Would subscribe",len(blocks),"blocks to",site print "\tSubscribe",len(blocks),"blocks to",site done[site].extend( blocks ) if blocks: print makeReplicaRequest(url, site, list(blocks), "Production blocks", priority="low", approve=True,mail=False) time.sleep(1) open('myblock_done.json','w').write( json.dumps( done, indent=2 ))
def transferor(url ,specific = None, talk=True, options=None): if userLock('transferor'): return if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()) max_to_handle = options.maxworkflows allowed_to_handle = max(0,max_to_handle - being_handled) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status=='considered').all(): if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" input_sizes = {} ## list the size of those in transfer already in_transfer_priority=0 min_transfer_priority=100000000 print "getting all wf in staging ..." for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) (_,primary,_,_) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get( prim ) in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority in_transfer_already = sum(input_sizes.values()) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get( prim ) print "... done" grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print "%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False for (wfo,wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name,"to be transfered" #wfh = workflowInfo( url, wfo.name) (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: print "Transfer has gone over bubget." else: print "Transfer will go over bubget." print "%15.4f GB this load"%this_load print "%15.4f GB already this round"%sum(transfer_sizes.values()) print "%15.4f GB is the available limit"%transfer_limit went_over_budget=True if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget" else: if not options.go: print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop" continue ## throtlle by campaign go if not CI.go( wfh.request['Campaign'] ): print "No go for",wfh.request['Campaign'] if not options.go: continue ## check if the batch is announced announced=False is_real=False for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break if not announced: print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?" if not is_real: print wfo.name,"does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time) continue passing_along += 1 if passing_along >= allowed_to_handle: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle else: print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along break (lheinput,primary,parent,secondary) = wfh.getIO() if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] can_go = True staging=False if primary: if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) sites_really_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] print "Sites allowed minus the vetoed transfer" print sites_really_allowed copies_needed = int(0.35*len(sites_really_allowed))+1 ## should just go for a fixed number based if the white list grows that big print "Would make",copies_needed,"copies" if options.maxcopy>0: copies_needed = min(options.maxcopy,copies_needed) ## remove the sites that do not want transfers print "need",copies_needed workflow_dependencies[prim].add( wfo.id ) presence = getDatasetPresence( url, prim ) prim_location = [site for site,pres in presence.items() if pres[0]==True] if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed subscriptions = listSubscriptions( url , prim ) prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place prim_destination = [site for site in prim_destination if not site in prim_location] ## add transfer dependencies latching_on_transfers = list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) print latching_on_transfers for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? copies_needed = max(0,copies_needed - len(prim_destination)) print "then need",copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with",latching_on_transfers can_go = True continue prim_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in prim_location])] prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in prim_destination])] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: spreading = distributeToSites( getDatasetChops(prim), prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges) else: spreading = {} for site in prim_to_distribute: spreading[site]=[prim] can_go = False transfer_sizes[prim] = input_sizes[prim] for (site,items) in spreading.items(): all_transfers[site].extend( items ) if secondary: if talk: print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len( sec_to_distribute )>0: for site in sec_to_distribute: all_transfers[site].append( sec ) can_go = False ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name,"latches on existing transfers, and nothing else" wfo.status = 'staging' else: print wfo.name,"should just be assigned NOW to",sites_allowed wfo.status = 'staged' print "setting status to",wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name,"latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to",wfo.status session.commit() print wfo.name,"needs a transfer" needs_transfer+=1 #print json.dumps(all_transfers) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to",site,"(CE)",site_se,"(SE) for" else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) print "\t",len(datasets),"datasets" print "\t",datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first() print phedexid,"transfer created" if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def transferor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0,max_to_handle - being_handled) allowed_to_transfer = max(0,max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer" else: print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status.startswith('considered')).all(): print "\t",wfo.name if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} ignored_input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority=None min_transfer_priority=None print "getting all wf in staging ..." stucks = json.loads(open('%s/stuck_transfers.json'%monitor_dir).read()) for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ds_s = dss.get( prim ) if prim in stucks: sendLog('transferor', "%s appears stuck, so not counting it %s [GB]"%( prim, ds_s), wfi=wfh) ignored_input_sizes[prim] = ds_s else: input_sizes[prim] = ds_s sendLog('transferor', "%s needs %s [GB]"%( wfo.name, ds_s), wfi=wfh) if in_transfer_priority==None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority==None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority==None or in_transfer_priority ==None: print "nothing is lining up for transfer" sendEmail("no request in staging","no request in staging") return pass try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort( key = lambda i : i[1] ) print "\n".join( map(str, ignored_values ) ) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort( key = lambda i : i[1] ) print "\n".join( map(str, considered_values) ) except Exception as e: print "trying to print the summary of input size" print str(e) print "... done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority print "transfers per sites" print json.dumps( transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get( prim ) input_sizes[prim] = prim_size primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle( wfs_and_wfh ) # Sort smallest transfers first; allows us to transfer as many as possible workflows. def prio_and_size( i, j): if int(i[1].request['RequestPriority']) == int(j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0)) ) else: return cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already ) print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer ) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print "%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False destination_cache = {} no_goes = set() max_per_round = UC.get('max_per_round').get('transferor',None) if max_per_round and not spec: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo,wfh) in wfs_and_wfh: print wfo.name,"to be transfered with priority",wfh.request['RequestPriority'] if wfh.request['RequestStatus']!='assignment-approved': if wfh.request['RequestStatus'] in ['aborted','rejected','rejected-archived','aborted-archived']: wfo.status = 'trouble' ## so that we look or a replacement else: wfo.status = 'away' wfh.sendLog('transferor', '%s in status %s, setting %s'%( wfo.name,wfh.request['RequestStatus'],wfo.status)) continue (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) no_budget = False if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over bubget.") else: wfh.sendLog('transferor', "Transfer will go over bubget.") wfh.sendLog('transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"%(this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over budget"%( wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog('transferor',"%s minimum priority %s < %s : stop"%( min_transfer_priority,wfh.request['RequestPriority'],in_transfer_priority)) no_budget = True ## throtlle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add( wfo.name ) allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) if secondary: if (secondary and allowed_secondary) and (set(secondary)&allowed_secondary!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary))) no_go = True if no_go: continue ## check if the batch is announced def check_mcm(wfn): announced=False is_real=False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: print "could not get mcm batch announcement, assuming not real" return announced,is_real if not use_mcm: announced,is_real = False,True else: if wfh.request['RequestType'] in ['ReReco']: announced,is_real = True,True else: announced,is_real = check_mcm( wfo.name ) if not announced: wfh.sendLog('transferor', "does not look announced.") if not is_real: wfh.sendLog('transferor', "does not appear to be genuine.") ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: if not options.go and not announced: wfh.sendLog('transferor', "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time)) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%( wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog('transferor'," Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"%( max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%(wfh.request['RequestPriority'], in_transfer_priority,max_to_transfer)) else: wfh.sendLog('transferor',"Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"%( max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue ## the site white list considers site, campaign, memory and core information (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList() if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary)+list(parent)+list(secondary): ## lock everything flat NLI.lock( dataset ) if not sites_allowed: wfh.sendLog('transferor',"not possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor',"%s has no possible sites to run at"%( wfo.name ),level='critical') continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) )) if 'LumiList' in wfh.request and wfh.request['LumiList']: ## augment with the lumi white list blocks = list(set( blocks + getDatasetBlocks( dataset, lumis= wfh.request['LumiList'] ) )) if blocks: print "Reading",len(blocks),"in block whitelist" can_go = True staging=False allowed=True primary_destinations = set() if primary: copies_needed_from_CPUh,CPUh = wfh.getNCopies() if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add( wfo.id ) max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) wfh.sendLog('transferor',"Would make %s from cpu requirement %s"%( copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog('transferor',"Maxed to %s by campaign configuration %s"%( copies_needed, wfh.request['Campaign'])) ### new ways of making the whole thing destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks ) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1] ## the rest is places it is going to be prim_destination = [site for site in destinations.keys() if not site in prim_location] if len(prim_location) >= copies_needed: wfh.sendLog('transferor',"The input is all fully in place at %s sites %s"%( len(prim_location), sorted(prim_location))) continue copies_needed = max(0,copies_needed - len(prim_location)) wfh.sendLog('transferor',"not counting existing copies ; now need %s"% copies_needed) copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names] latching_on_transfers = set() [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] wfh.sendLog('transferor',"Could be going to: %s"% sorted( prim_to_distribute)) if not prim_to_distribute or any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]): ## means there is openings let me go print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute] #for site in sites_allowed: # #increment accross the board, regardless of real destination: could be changed # transfers_per_sites[site] += 1 else: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: wfh.sendLog('transferor', "Higher priority sample %s >= %s go-on over transfer slots available"%(wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog('transferor',"Not allowed to transfer more than %s per site at a time. Going overboard for %s"%( max_staging_per_site, sorted([site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site]))) if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == int(latching)).first() if not tfo: tfo = session.query(Transfer).filter(Transfer.phedexid == -int(latching)).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) else: tfo.phedexid = latching ## make it positive ever if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0,copies_needed - min(copies_being_made)) wfh.sendLog('transferor', "Not counting the copies being made ; then need %s"% copies_needed) if copies_needed == 0: wfh.sendLog('transferor', "The output is either fully in place or getting in full somewhere with %s"% latching_on_transfers) can_go = True continue elif len(prim_to_distribute)==0: wfh.sendLog('transferor', "We are going to need extra copies, but no destinations seems available") prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk andpoint ? #tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks) spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog('transferor','cannot send %s to any site, it cannot fit anywhere'% prim, level='critical') wfh.sendLog('transferor', "cannot send to any site. %s cannot seem to fit anywhere"%(prim)) staging=False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site]=blocks else: spreading[site]=[prim] transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified can_go = False wfh.sendLog('transferor', "selected CE destinations %s"%(sorted( spreading.keys()))) for (site,items) in spreading.items(): all_transfers[site].extend( items ) transfers_per_sites[site] += 1 primary_destinations.add( site ) if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[wfh.request['Campaign']]['SecondaryLocation'] print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec],_ = getDatasetDestinations(url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] destinations = dict([(k,v) for (k,v) in destination_cache[sec].items() if site in se_allowed]) ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9] sec_location = [site for (site,info) in destinations.items() if info['completion']>=95] sec_destination = [site for site in destinations.keys() if not site in sec_location] else: ## old style presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list(set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog('transferor', "the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sec_to_distribute = list(set(sec_to_distribute) & set(override_sec_destination)) if len( sec_to_distribute )>0: print "secondary could go to",sorted(sec_to_distribute) sec_size = dss.get( sec ) for site in sec_to_distribute: site_se =SI.CE_to_SE(site) if (SI.disk[site_se]*1024.) > sec_size: all_transfers[site].append( sec ) can_go = False else: print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog('transferor', '%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024), level='critical') else: print "the secondary input does not have to be send to site" ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog('transferor', "latches on existing transfers, and nothing else, settin staging") wfo.status = 'staging' needs_transfer+=1 else: wfh.sendLog('transferor', "should just be assigned now to %s"%sorted(sites_allowed)) wfo.status = 'staged' passing_along+=1 wfh.sendLog('transferor', "setting status to %s"%wfo.status) session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog('transferor', "setting status to %s"%wfo.status) session.commit() wfh.sendLog('transferor',"needs a transfer") needs_transfer+=1 passing_along+=1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n"+"\n".join( no_goes ), level='critical') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for"%( site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks'%len(blocks) details_text += '\n\t%d needed blocks for %s'%( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets'% len(datasets) details_text += '\n\t%s'%sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: priority = 'normal' cds = [ds for ds in datasets+block_datasets if ds in max_priority] if cds and False: ## I don't think this is working. subscription should be updated on the fly and regularly for raising the priority if needed ## decide on an overall priority : that's a bit too large though if any([max_priority[ds]>=90000 for ds in cds]): priority = 'high' elif all([max_priority[ds]<80000 for ds in cds]): priority = 'low' result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority) else: result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == int(phedexid)).first() if not new_transfer: new_transfer = session.query(Transfer).filter(Transfer.phedexid == -int(phedexid)).first() print phedexid,"transfer created" if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) else: new_transfer.phedexid = phedexid ## make it positive again new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def transferor(url, specific=None, talk=True, options=None): if userLock('transferor'): return if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance')).all()) max_to_handle = options.maxworkflows allowed_to_handle = max(0, max_to_handle - being_handled) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" print "... done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter( Workflow.status == 'considered').all(): if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" input_sizes = {} ## list the size of those in transfer already in_transfer_priority = 0 min_transfer_priority = 100000000 print "getting all wf in staging ..." for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) (_, primary, _, _) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get(prim) in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority in_transfer_already = sum(input_sizes.values()) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get(prim) print "... done" grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB is the available limit" % transfer_limit # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = {} went_over_budget = False for (wfo, wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name, "to be transfered" #wfh = workflowInfo( url, wfo.name) (_, primary, _, _) = wfh.getIO() this_load = sum([input_sizes[prim] for prim in primary]) if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: print "Transfer has gone over bubget." else: print "Transfer will go over bubget." print "%15.4f GB this load" % this_load print "%15.4f GB already this round" % sum(transfer_sizes.values()) print "%15.4f GB is the available limit" % transfer_limit went_over_budget = True if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: print "Higher priority sample", wfh.request[ 'RequestPriority'], ">=", in_transfer_priority, "go-on over budget" else: if not options.go: print min_transfer_priority, "minimum priority", wfh.request[ 'RequestPriority'], "<", in_transfer_priority, "stop" continue ## throtlle by campaign go if not CI.go(wfh.request['Campaign']): print "No go for", wfh.request['Campaign'] if not options.go: continue ## check if the batch is announced announced = False is_real = False for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break if not announced: print wfo.name, "does not look announced." # skipping?, rejecting?, reporting?" if not is_real: print wfo.name, "does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime( time.strptime('.'.join(map(str, wfh.request['RequestDate'])), "%Y.%m.%d.%H.%M.%S")) / (60. * 60.) now = time.mktime(time.gmtime()) / (60. * 60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining" % ( now - injection_time) continue passing_along += 1 if passing_along >= allowed_to_handle: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: print "Higher priority sample", wfh.request[ 'RequestPriority'], ">=", in_transfer_priority, "go-on over", max_to_handle else: print "Not allowed to pass more than", max_to_handle, "at a time. Currently", being_handled, "handled, and adding", passing_along break (lheinput, primary, parent, secondary) = wfh.getIO() if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput, primary, parent, secondary)) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters( wfh.request['Campaign'])['SiteWhitelist'] blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] can_go = True staging = False if primary: if talk: print wfo.name, 'reads', ', '.join(primary), 'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority'])) sites_really_allowed = [ site for site in sites_allowed if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] print "Sites allowed minus the vetoed transfer" print sites_really_allowed copies_needed = int( 0.35 * len(sites_really_allowed) ) + 1 ## should just go for a fixed number based if the white list grows that big print "Would make", copies_needed, "copies" if options.maxcopy > 0: copies_needed = min(options.maxcopy, copies_needed) ## remove the sites that do not want transfers print "need", copies_needed workflow_dependencies[prim].add(wfo.id) presence = getDatasetPresence(url, prim) prim_location = [ site for site, pres in presence.items() if pres[0] == True ] if len(prim_location) >= copies_needed: print "The output is all fully in place at", len( prim_location), "sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0, copies_needed - len(prim_location)) print "now need", copies_needed subscriptions = listSubscriptions(url, prim) prim_destination = list( set([ site for (site, (tid, decision)) in subscriptions.items() if decision and not any([ site.endswith(veto) for veto in ['MSS', 'Export', 'Buffer'] ]) ])) ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place prim_destination = [ site for site in prim_destination if not site in prim_location ] ## add transfer dependencies latching_on_transfers = list( set([ tid for (site, (tid, decision)) in subscriptions.items() if decision and site in prim_destination and not any([ site.endswith(veto) for veto in ['MSS', 'Export', 'Buffer'] ]) ])) print latching_on_transfers for latching in latching_on_transfers: tfo = session.query(Transfer).filter( Transfer.phedexid == latching).first() if not tfo: tfo = Transfer(phedexid=latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding", wfo.id, "to", tfo.id, "with phedexid", latching l = copy.deepcopy(tfo.workflows_id) l.append(wfo.id) tfo.workflows_id = l if not options.test: session.commit() else: session.flush( ) ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? copies_needed = max(0, copies_needed - len(prim_destination)) print "then need", copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with", latching_on_transfers can_go = True continue prim_to_distribute = [ site for site in sites_allowed if not any( [osite.startswith(site) for osite in prim_location]) ] prim_to_distribute = [ site for site in prim_to_distribute if not any( [osite.startswith(site) for osite in prim_destination]) ] ## take out the ones that cannot receive transfers prim_to_distribute = [ site for site in prim_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if len( prim_to_distribute ) > 0: ## maybe that a parameter we can play with to limit the if not options or options.chop: spreading = distributeToSites(getDatasetChops(prim), prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges) else: spreading = {} for site in prim_to_distribute: spreading[site] = [prim] can_go = False transfer_sizes[prim] = input_sizes[prim] for (site, items) in spreading.items(): all_transfers[site].extend(items) if secondary: if talk: print wfo.name, 'reads', ', '.join(secondary), 'in secondary' for sec in secondary: workflow_dependencies[sec].add(wfo.id) presence = getDatasetPresence(url, sec) sec_location = [ site for site, pres in presence.items() if pres[1] > 90. ] ## more than 90% of the minbias at sites subscriptions = listSubscriptions(url, sec) sec_destination = [site for site in subscriptions] sec_to_distribute = [ site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any( [osite.startswith(site) for osite in sec_destination]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if len(sec_to_distribute) > 0: for site in sec_to_distribute: all_transfers[site].append(sec) can_go = False ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name, "latches on existing transfers, and nothing else" wfo.status = 'staging' else: print wfo.name, "should just be assigned NOW to", sites_allowed wfo.status = 'staged' print "setting status to", wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name, "latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to", wfo.status session.commit() print wfo.name, "needs a transfer" needs_transfer += 1 #print json.dumps(all_transfers) fake_id = -1 wf_id_in_prestaging = set() for (site, items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site, "does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to", site, "(CE)", site_se, "(SE) for" else: print "Would make a replica to", site, "(CE)", site_se, "(SE) for" print "\t", len(blocks), "blocks" ## remove blocks if full dataset is send out blocks = [ block for block in blocks if not block.split('#')[0] in datasets ] print "\t", len(blocks), "needed blocks for", list( set([block.split('#')[0] for block in blocks])) print "\t", len(datasets), "datasets" print "\t", datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y', 'yes', 'go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result = {'phedex': {'request_created': []}} fake_id -= 1 if not result: print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter( Transfer.phedexid == phedexid).first() print phedexid, "transfer created" if not new_transfer: new_transfer = Transfer(phedexid=phedexid) session.add(new_transfer) new_transfer.workflows_id = set() for transfering in list( set(map(lambda it: it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering]) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status != 'staging': if execute: tr_wf.status = 'staging' if talk: print "setting", tr_wf.name, "to staging" session.commit()