def main(): mysqlconn = MySQLdb.connect(host='dbod-cmsrv1.cern.ch', user='******', passwd="relval", port=5506) curs = mysqlconn.cursor() curs.execute("use " + dbname + ";") #curs.execute("lock tables batches write, batches_archive write, workflows write, workflows_archive write, datasets write, clone_reinsert_requests write") curs.execute("select * from batches") batches = curs.fetchall() batches_colnames = [desc[0] for desc in curs.description] for batch in batches: batch_dict = dict(zip(batches_colnames, batch)) if batch_dict["status"] != "assigned": continue userid = batch_dict["useridyear"] + "_" + batch_dict[ "useridmonth"] + "_" + batch_dict["useridday"] + "_" + str( batch_dict["useridnum"]) + "_" + str( batch_dict["batch_version_num"]) print " userid ==> " + userid curs.execute( "select workflow_name from workflows where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num =" + str(batch_dict["batch_version_num"]) + ";") wfs = curs.fetchall() n_workflows = 0 n_completed = 0 for wf in wfs: n_workflows = n_workflows + 1 conn = httplib.HTTPSConnection( url, cert_file=os.getenv('X509_USER_PROXY'), key_file=os.getenv('X509_USER_PROXY')) r1 = conn.request('GET', '/reqmgr2/data/request?name=' + wf[0], headers={"Accept": "application/json"}) #r1=conn.request('GET','/couchdb/wmstats/_all_docs?keys=["'+wf[0]+'"]&include_docs=true') r2 = conn.getresponse() data = r2.read() if r2.status != 200: os.system( 'echo \"' + wf[0] + '\" | mail -s \"announcor.py error 1\" [email protected] --' ) sys.exit(0) s = json.loads(data) for status in s['result'][0][wf[0]]['RequestTransition']: if status['Status'] == "completed" or status[ 'Status'] == "force-complete": n_completed = n_completed + 1 break print "datetime.datetime.now() = " + str(datetime.datetime.now()) print "n_workflows = " + str(n_workflows) print "n_completed = " + str(n_completed) if n_workflows != n_completed: continue #string="2015_09_30_1_0" #if not (string.split('_')[0] == batch_dict["useridyear"] and string.split('_')[1] == batch_dict["useridmonth"] and string.split('_')[2] == batch_dict["useridday"] and string.split('_')[3] == str(batch_dict["useridnum"]) and string.split('_')[4] == str(batch_dict["batch_version_num"])): # continue wf_list = [] for wf in wfs: print wf[0] wf_list.append(wf[0]) job_failure_information = collect_job_failure_information.collect_job_failure_information( wf_list) needs_assistance = assistance_decision.assistance_decision( job_failure_information) if needs_assistance: curs.execute( "update batches set status=\"assistance\", current_status_start_time=\"" + datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S") + "\" where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num =" + str(batch_dict["batch_version_num"]) + ";") mysqlconn.commit() os.system( 'echo \"batch_id: ' + userid + '\" | mail -s \"a batch of relval workflows needs assistance\" [email protected]' ) continue #if there is a '\r' character in the body of an e-mail, it does not get sent description = batch_dict["description"].replace('\r', '') for wf in wf_list: too_many_events_check.too_many_events_check(wf) dset_nevents_list = collect_dsets_and_nevents.collect_dsets_and_nevents( wf_list) print_dsets_and_nevents.print_dsets_and_nevents( dset_nevents_list, userid + ".txt") ret = os.system( "cp " + userid + ".txt /afs/cern.ch/user/r/relval/webpage/relval_stats/" + userid + ".txt") if ret == 0: os.system("rm " + userid + ".txt") else: os.system( 'echo \"' + userid + '\" | mail -s \"announcement_loop.py error 2\" [email protected]' ) sys.exit(0) dsets_list = [] for dset_nevents in dset_nevents_list: dsets_list.append(dset_nevents[0]) for dset in dsets_list: setDatasetStatusDBS3.setStatusDBS3( "https://cmsweb.cern.ch/dbs/prod/global/DBSWriter", dset, "VALID", True) for wf in wf_list: reqMgrClient.closeOutWorkflow("cmsweb.cern.ch", wf) reqMgrClient.announceWorkflow("cmsweb.cern.ch", wf) msg = MIMEMultipart() reply_to = [] #send_to = ["*****@*****.**","*****@*****.**"] send_to = [ "*****@*****.**", "*****@*****.**", "*****@*****.**" ] #send_to = ["*****@*****.**"] #msg['In-Reply-To'] = hn_message_id #msg['References'] = hn_message_id msg['From'] = "*****@*****.**" msg['reply-to'] = COMMASPACE.join(reply_to) msg['To'] = COMMASPACE.join(send_to) msg['Date'] = formatdate(localtime=True) msg['Subject'] = batch_dict["announcement_title"] msg['Message-ID'] = email.Utils.make_msgid() messageText = "Dear all,\n" messageText = messageText + "\n" messageText = messageText + "A batch of relval workflows has finished.\n" messageText = messageText + "\n" messageText = messageText + "Batch ID:\n" messageText = messageText + "\n" messageText = messageText + userid + "\n" if batch_dict["batch_version_num"] > 0: messageText = messageText + "\n" messageText = messageText + "original workflow name ==> clone name:\n" messageText = messageText + "\n" curs.execute( "select workflow_name,original_workflow_name from workflows where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num =" + str(batch_dict["batch_version_num"]) + ";") workflows = curs.fetchall() for workflow in workflows: messageText = messageText + workflow[1] + " ==> " + workflow[ 0] + "\n" messageText = messageText + "\n" messageText = messageText + "List of datasets:\n" messageText = messageText + "\n" messageText = messageText + "http://cms-project-relval.web.cern.ch/cms-project-relval/relval_stats/" + userid + ".txt\n" messageText = messageText + "\n" messageText = messageText + "Description:\n" messageText = messageText + "\n" messageText = messageText + description.rstrip('\n') messageText = messageText + "\n" #messageText=messageText+"\n" [istherefailureinformation, return_string ] = print_job_failure_information.print_job_failure_information( job_failure_information) if istherefailureinformation: messageText = messageText + "\n" messageText = messageText + return_string messageText = messageText + "\n" messageText = messageText + "\n" messageText = messageText + "RelVal Batch Manager" #put the announcement message into an e-mail to the relval hypernews and also in a url output_file = open( "/afs/cern.ch/user/r/relval/webpage/relval_announcements/" + userid + ".txt", 'w') output_file.write(messageText) try: msg.attach(MIMEText(messageText)) smtpObj = smtplib.SMTP() smtpObj.connect() smtpObj.sendmail("*****@*****.**", send_to, msg.as_string()) smtpObj.close() except Exception as e: print "Error: unable to send email: %s" % (str(e)) dsets_fnal_disk_list = [] dsets_cern_disk_list = [] for dset in dsets_list: #print dset.split('/') # we were asked to transfer some specific datasets to the cern tier 2 if dset.split('/')[3] != "RECO" and dset.split( '/')[3] != "ALCARECO": dsets_cern_disk_list.append(dset) if dset.split('/')[3] == "GEN-SIM": dsets_fnal_disk_list.append(dset) if dset.split('/')[3] == "GEN-SIM-DIGI-RAW": dsets_fnal_disk_list.append(dset) if dset.split('/')[3] == "GEN-SIM-RECO": dsets_fnal_disk_list.append(dset) if "RelValTTBar" in dset.split( '/')[1] and "TkAlMinBias" in dset.split( '/')[2] and dset.split('/')[3] != "ALCARECO": dsets_cern_disk_list.append(dset) if "MinimumBias" in dset.split( '/')[1] and "SiStripCalMinBias" in dset.split( '/')[2] and dset.split('/')[3] != "ALCARECO": dsets_cern_disk_list.append(dset) result = utils.makeReplicaRequest("cmsweb.cern.ch", "T2_CH_CERN", dsets_cern_disk_list, "relval datasets", group="RelVal") if result != None: phedexid = result['phedex']['request_created'][0]['id'] utils.approveSubscription("cmsweb.cern.ch", phedexid) result = utils.makeReplicaRequest("cmsweb.cern.ch", "T1_US_FNAL_Disk", dsets_fnal_disk_list, "relval datasets", group="RelVal") if result != None: phedexid = result['phedex']['request_created'][0]['id'] utils.approveSubscription("cmsweb.cern.ch", phedexid) result = utils.makeMoveRequest("cmsweb.cern.ch", "T0_CH_CERN_MSS", dsets_list, "relval datasets", group="RelVal") if result != None: phedexid = result['phedex']['request_created'][0]['id'] #even if you disapprove the subscription at the source, it will still deleted the datasets that are at the source but not subscribed their utils.disapproveSubscription("cmsweb.cern.ch", phedexid, ["T2_CH_CERN"]) utils.disapproveSubscription("cmsweb.cern.ch", phedexid, ["T1_US_FNAL_Disk"]) utils.disapproveSubscription("cmsweb.cern.ch", phedexid, ["T1_FR_CCIN2P3_Disk"]) utils.approveSubscription("cmsweb.cern.ch", phedexid, ["T0_CH_CERN_MSS"]) #phedexid = result['phedex']['request_created'][0]['id'] #utils.approveSubscription("cmsweb.cern.ch",phedexid) curs.execute( "update batches set status=\"announced\", current_status_start_time=\"" + datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S") + "\" where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num =" + str(batch_dict["batch_version_num"]) + ";") mysqlconn.commit()
def main(): mysqlconn = MySQLdb.connect(host='dbod-cmsrv1.cern.ch', user='******', passwd="relval", port=5506) #conn = MySQLdb.connect(host='localhost', user='******', passwd='relval') curs = mysqlconn.cursor() curs.execute("use " + dbname + ";") #curs.execute("lock tables batches write, batches_archive write, workflows write, workflows_archive write, datasets write, clone_reinsert_requests write") #workflow = line.rstrip('\n') #curs.execute("insert into workflows set hn_req=\""+hnrequest+"\", workflow_name=\""+workflow+"\";") curs.execute("select * from batches") batches = curs.fetchall() batches_colnames = [desc[0] for desc in curs.description] for batch in batches: #if batch[0] != 21: # continue blocks_dsets_to_transfer = [] blocks_not_at_site = [] batch_dict = dict(zip(batches_colnames, batch)) site = batch_dict["site"] if "T2" in site: site_disk = site elif "T1" in site: site_disk = site + "_Disk" else: os.system( 'echo ' + site + ' | mail -s \"input_dset_checkor.py error 1\" [email protected]' ) print "Neither T1 nor T2 is in site name, exiting" sys.exit(1) if site == "T2_CH_CERN_T0": site_disk = "T2_CH_CERN" if site == "T2_CH_CERN_AI": site_disk = "T2_CH_CERN" #print batch #print "" userid = batch_dict["useridyear"] + "_" + batch_dict[ "useridmonth"] + "_" + batch_dict["useridday"] + "_" + str( batch_dict["useridnum"]) + "_" + str( batch_dict["batch_version_num"]) #if status == "waiting_for_transfer" and count % 10 == 0: if batch_dict["status"] == "waiting_for_transfer": print " userid ==> " + str(userid) #count = 0 all_dsets_blocks_at_site = True curs.execute( "select workflow_name from workflows where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num = " + str(batch_dict["batch_version_num"]) + ";") wfs = curs.fetchall() for wf in wfs: print wf[0] headers = { "Content-type": "application/json", "Accept": "application/json" } conn = httplib.HTTPSConnection( 'cmsweb.cern.ch', cert_file=os.getenv('X509_USER_PROXY'), key_file=os.getenv('X509_USER_PROXY')) r1 = conn.request("GET", '/reqmgr2/data/request/' + wf[0], headers=headers) r2 = conn.getresponse() schema = json.loads(r2.read()) schema = schema['result'][0][wf[0]] isthereanmcpileupdataset = False for key, value in schema.items(): if type(value) is dict and key.startswith("Task"): if 'MCPileup' in value: isthereanmcpileupdataset = True ismcpileupdatasetatsite = utils.checkIfDatasetIsSubscribedToASite( "cmsweb.cern.ch", value["MCPileup"], site_disk) if 'InputDataset' in value: inputdset = value['InputDataset'] if 'RunWhitelist' in value: runwhitelist = value['RunWhitelist'] blocks_fname = os.popen( "mktemp").read().rstrip("\n") list_of_blocks = utils.getListOfBlocks( inputdset, str(runwhitelist)) for block in list_of_blocks: #this block (/DoubleMu/...) is not registered in phedex, so it cannot be subscribed to any site if block == "/DoubleMu/Run2011A-ZMu-08Nov2011-v1/RAW-RECO#93c53d22-25b2-11e1-8c62-003048f02c8a": continue isblockatsite = utils.checkIfBlockIsAtASite( "cmsweb.cern.ch", block, site_disk) if not isblockatsite: all_dsets_blocks_at_site = False else: isdatasetatsite = utils.checkIfDatasetIsSubscribedToASite( "cmsweb.cern.ch", inputdset, site_disk) if not isdatasetatsite: all_dsets_blocks_at_site = False if all_dsets_blocks_at_site and (not isthereanmcpileupdataset or ismcpileupdatasetatsite): curs.execute( "update batches set status=\"input_dsets_ready\", current_status_start_time=\"" + datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S") + "\" where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num = " + str(batch_dict["batch_version_num"]) + ";") mysqlconn.commit() if batch_dict["status"] == "approved": print " userid ==> " + str(userid) #print "checking input datasets for workflows in batch "+str(batchid) curs.execute( "select workflow_name from workflows where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num = " + str(batch_dict["batch_version_num"]) + ";") wfs = curs.fetchall() for wf in wfs: print wf[0] headers = { "Content-type": "application/json", "Accept": "application/json" } conn = httplib.HTTPSConnection( 'cmsweb.cern.ch', cert_file=os.getenv('X509_USER_PROXY'), key_file=os.getenv('X509_USER_PROXY')) r1 = conn.request("GET", '/reqmgr2/data/request/' + wf[0], headers=headers) r2 = conn.getresponse() schema = json.loads(r2.read()) schema = schema['result'][0][wf[0]] for key, value in schema.items(): if type(value) is dict and key.startswith("Task"): if 'MCPileup' in value: isdatasetatsite = utils.checkIfDatasetIsSubscribedToASite( "cmsweb.cern.ch", value['MCPileup'], site_disk) if not isdatasetatsite: blocks_dsets_to_transfer.append( value['MCPileup']) if 'InputDataset' in value: subscribed_to_disk = False inputdset = value['InputDataset'] if 'RunWhitelist' in value: runwhitelist = value['RunWhitelist'] list_of_blocks = utils.getListOfBlocks( inputdset, str(runwhitelist)) for block in list_of_blocks: #this block (/DoubleMu/...) is not registered in phedex, so it cannot be subscribed to any site if block == "/DoubleMu/Run2011A-ZMu-08Nov2011-v1/RAW-RECO#93c53d22-25b2-11e1-8c62-003048f02c8a": continue isblocksubscribedtosite = utils.checkIfBlockIsSubscribedToASite( "cmsweb.cern.ch", block, site_disk) isblockatsite = utils.checkIfBlockIsAtASite( "cmsweb.cern.ch", block, site_disk) if not isblocksubscribedtosite: blocks_dsets_to_transfer.append(block) if not isblockatsite: blocks_not_at_site.append(block) else: isdatasetsubscribedtosite = utils.checkIfDatasetIsSubscribedToASite( "cmsweb.cern.ch", inputdset, site_disk) isdatasetatsite = utils.checkIfDatasetIsSubscribedToASite( "cmsweb.cern.ch", inputdset, site_disk) if not isdatasetsubscribedtosite: blocks_dsets_to_transfer.append(inputdset) if not isdatasetatsite: blocks_not_at_site.append(inputdset) if blocks_dsets_to_transfer != []: print "transfering the following blocks:" print blocks_dsets_to_transfer result = utils.makeReplicaRequest( url="cmsweb.cern.ch", site=site_disk, datasets=blocks_dsets_to_transfer, comments="relval datasets", group="RelVal") phedexid = result['phedex']['request_created'][0]['id'] utils.approveSubscription("cmsweb.cern.ch", phedexid) curs.execute( "update batches set status=\"waiting_for_transfer\", current_status_start_time=\"" + datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S") + "\" where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num = " + str(batch_dict["batch_version_num"]) + ";") mysqlconn.commit() elif blocks_not_at_site != []: print blocks_not_at_site curs.execute( "update batches set status=\"waiting_for_transfer\", current_status_start_time=\"" + datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S") + "\" where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num = " + str(batch_dict["batch_version_num"]) + ";") else: curs.execute( "update batches set status=\"input_dsets_ready\", current_status_start_time=\"" + datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S") + "\" where useridyear = \"" + batch_dict["useridyear"] + "\" and useridmonth = \"" + batch_dict["useridmonth"] + "\" and useridday = \"" + batch_dict["useridday"] + "\" and useridnum = " + str(batch_dict["useridnum"]) + " and batch_version_num = " + str(batch_dict["batch_version_num"]) + ";") mysqlconn.commit()
def stagor(url,specific =None, options=None): if not componentInfo().check(): return SI = siteInfo() CI = campaignInfo() UC = unifiedConfiguration() done_by_wf_id = {} done_by_input = {} completion_by_input = {} good_enough = 100.0 lost = json.loads(open('lost_blocks_datasets.json').read()) still_lost = [] for dataset in lost: l = findLostBlocks(url ,dataset) if not l: print dataset,"is not really lost" else: still_lost.append( dataset ) open('lost_blocks_datasets.json','w').write( json.dumps( still_lost, indent=2) ) if options.fast: print "doing the fast check of staged with threshold:",options.goodavailability for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all(): if specific and not specific in wfo.name: continue wfi = workflowInfo(url, wfo.name) sites_allowed = getSiteWhiteList( wfi.getIO() ) if 'SiteWhitelist' in CI.parameters(wfi.request['Campaign']): sites_allowed = CI.parameters(wfi.request['Campaign'])['SiteWhitelist'] if 'SiteBlacklist' in CI.parameters(wfi.request['Campaign']): sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfi.request['Campaign'])['SiteBlacklist'])) _,primaries,_,secondaries = wfi.getIO() se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] all_check = True for dataset in list(primaries):#+list(secondaries) ? #print se_allowed available = getDatasetBlocksFraction( url , dataset , sites=se_allowed ) all_check &= (available >= options.goodavailability) if not all_check: break if all_check: print "\t\t",wfo.name,"can go staged" wfo.status = 'staged' session.commit() else: print "\t",wfo.name,"can wait a bit more" return for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all(): wfi = workflowInfo(url, wfo.name) _,primaries,_,secondaries = wfi.getIO() for dataset in list(primaries)+list(secondaries): done_by_input[dataset] = {} completion_by_input[dataset] = {} print wfo.name,"needs",dataset for transfer in session.query(Transfer).all(): if specific and str(transfer.phedexid)!=str(specific): continue skip=True for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf: if tr_wf.status == 'staging': print "\t",transfer.phedexid,"is staging for",tr_wf.name skip=False if skip: continue if transfer.phedexid<0: continue ## check the status of transfers checks = checkTransferApproval(url, transfer.phedexid) approved = all(checks.values()) if not approved: print transfer.phedexid,"is not yet approved" approveSubscription(url, transfer.phedexid) continue ## check on transfer completion checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True) if not specific: for dsname in checks: if not dsname in done_by_input: done_by_input[dsname]={} if not dsname in completion_by_input: completion_by_input[dsname] = {} done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values())) completion_by_input[dsname][transfer.phedexid]=checks[dsname].values() if checks: print "Checks for",transfer.phedexid,[node.values() for node in checks.values()] done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()])))) else: ## it is empty, is that a sign that all is done and away ? print "ERROR with the scubscriptions API of ",transfer.phedexid print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists" done = False ## the thing above is NOT giving the right number #done = False for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf:# and tr_wf.status == 'staging': if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={} done_by_wf_id[tr_wf.id][transfer.phedexid]=done if done: ## transfer.status = 'done' print transfer.phedexid,"is done" else: print transfer.phedexid,"not finished" pprint.pprint( checks ) #print done_by_input print "\n----\n" for dsname in done_by_input: fractions = None if dsname in completion_by_input: fractions = itertools.chain.from_iterable([check.values() for check in completion_by_input.values()]) ## the workflows in the waiting room for the dataset using_its = getWorkflowByInput(url, dsname) #print using_its using_wfos = [] for using_it in using_its: wf = session.query(Workflow).filter(Workflow.name == using_it).first() if wf: using_wfos.append( wf ) if not len(done_by_input[dsname]): print "For dataset",dsname,"there are no transfer report. That's an issue." for wf in using_wfos: if wf.status == 'staging': if UC.get("stagor_sends_back"): print "sending",wf.name,"back to considered" wf.status = 'considered' session.commit() sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name) else: print "would send",wf.name,"back to considered" sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name) continue #need_sites = int(len(done_by_input[dsname].values())*0.7)+1 need_sites = len(done_by_input[dsname].values()) #if need_sites > 10: need_sites = int(need_sites/2.) got = done_by_input[dsname].values().count(True) if all([wf.status != 'staging' for wf in using_wfos]): ## not a single ds-using wf is in staging => moved on already ## just forget about it print "presence of",dsname,"does not matter anymore" print "\t",done_by_input[dsname] print "\t",[wf.status for wf in using_wfos] print "\tneeds",need_sites continue #?? ## should the need_sites reduces with time ? # with dataset choping, reducing that number might work as a block black-list. if len(done_by_input[dsname].values()) and all(done_by_input[dsname].values()): print dsname,"is everywhere we wanted" ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is with us. setting staged and move on" wf.status = 'staged' session.commit() elif fractions and len(list(fractions))>1 and set(fractions)==1: print dsname,"is everywhere at the same fraction" print "We do not want this in the end. we want the data we asked for" continue ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is with us everywhere the same. setting staged and move on" wf.status = 'staged' session.commit() elif got >= need_sites: print dsname,"is almost everywhere we wanted" #print "We do not want this in the end. we want the data we asked for" #continue ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is almost with us. setting staged and move on" wf.status = 'staged' session.commit() else: print "incomplete",dsname lost = findLostBlocks(url, dsname) try: known_lost = json.loads(open('lost_blocks_datasets.json').read()) except: print "enable to get the known_lost from local json file" known_lost = [] if lost and not dsname in known_lost: lost_names = [item['name'] for item in lost] ## make a deeper investigation of the block location to see whether it's really no-where no-where print "We have lost",len(lost),"blocks",lost_names #print json.dumps( lost , indent=2 ) sendEmail('we have lost a few blocks', str(len(lost))+" in total.\nDetails \n:"+json.dumps( lost , indent=2 )) known_lost.append(dsname) rr= open('lost_blocks_datasets.json','w') rr.write( json.dumps( known_lost, indent=2)) rr.close() ## should the status be change to held-staging and pending on a ticket print "\t",done_by_input[dsname] print "\tneeds",need_sites print "\tgot",got for wfid in done_by_wf_id: #print done_by_wf_id[wfid].values() ## ask that all related transfer get into a valid state if all(done_by_wf_id[wfid].values()): pass
def outcleanor(url, options): if options.approve: for user in ['*Vlimant']: #,'*Cremonesi']: deletes = listDelete(url, user=user) for (site, who, tid) in deletes: if 'MSS' in site: continue ### ever print site, who, tid print "approving deletion" print approveSubscription( url, tid, nodes=[site], comments='Production cleaning by data ops') return sites_and_datasets = defaultdict(list) our_copies = defaultdict(list) wf_cleaned = {} wfs = [] for fetch in options.fetch.split(','): wfs.extend( session.query(Workflow).filter(Workflow.status == fetch).all()) random.shuffle(wfs) last_answer = None for wfo in wfs: if options.number and len(wf_cleaned) >= options.number: print "Reached", options.number, "cleaned" break print '-' * 100 wfi = workflowInfo(url, wfo.name) goes = {} # boolean per output for dataset in wfi.request['OutputDatasets']: goes[dataset] = False keep_one_out = True status = getDatasetStatus(dataset) print "\n\tLooking at", dataset, status, "\n" vetoes = None if status == 'INVALID': vetoes = ['Export', 'Buffer'] ## can take themselves out keep_one_out = False # just wipe clean elif status == None: print dataset, "actually does not exist. skip" goes[dataset] = True continue elif status in ['PRODUCTION', 'VALID' ] and wfo.status in ['forget', 'trouble']: print dataset, "should probably be invalidated. (", wfo.status, ") skip" keep_one_out = False # just wipe clean continue ## you are not sure. just skip it for the time being elif status == 'PRODUCTION' and wfo.status in ['clean']: print dataset, "should probably be set valid .skip" continue ## you are not sure. just skip it for the time being if status == 'VALID' and dataset.startswith('/MinBias'): print "This is a /MinBias. skip" continue if '/DQM' in dataset: keep_one_out = False total_size = getDatasetSize(dataset) our_presence = getDatasetPresence(url, dataset, complete=None, group="DataOps", vetoes=vetoes) also_our_presence = getDatasetPresence(url, dataset, complete=None, group="", vetoes=vetoes) ## merge in one unique dict for site in also_our_presence: if site in our_presence: there, frac = our_presence[site] other, ofrac = also_our_presence[site] our_presence[site] = (max(there, other), max(frac, ofrac)) else: our_presence[site] = also_our_presence[site] if our_presence: print our_presence ## analysis ops copies need to be taken into account anaops_presence = getDatasetPresence(url, dataset, complete=None, group="AnalysisOps") own_by_anaops = anaops_presence.keys() ## all our copies to_be_cleaned = our_presence.keys() if not len(to_be_cleaned): print "nowhere to be found of ours,", len( own_by_anaops), "in analysi ops pool" goes[dataset] = True continue print "Where we own bits of dataset" print to_be_cleaned if len(own_by_anaops): ## remove site with the anaops copies to_be_cleaned = list(set(to_be_cleaned) - set(own_by_anaops)) keep_one_out = False ## in that case, just remove our copies print "Own by anaops (therefore not keep a copy of ours)" print own_by_anaops else: ## we should not be looking at anything that was not passed to DDM, otherwise we'll be cutting the grass under our feet using_the_same = getWorkflowByInput(url, dataset, details=True) conflict = False for other in using_the_same: if other['RequestName'] == wfo.name: continue if other['RequestType'] == 'Resubmission': continue if not other['RequestStatus'] in [ 'announced', 'normal-archived', 'aborted', 'rejected', 'aborted-archived', 'rejected-archived', 'closed-out', 'None', None ]: print other['RequestName'], 'is in status', other[ 'RequestStatus'], 'preventing from cleaning', dataset conflict = True break if conflict: continue ## not being used. a bit less dangerous to clean-out ## keep one full copy out there full_copies = [ site for (site, (there, fract)) in our_presence.items() if there ] if keep_one_out: if not len(full_copies): print "we do not own a full copy of", dataset, status, wfo.status, ".skip" continue stay_there = random.choice( full_copies) #at a place own by ops print "Where we keep a full copy", stay_there to_be_cleaned.remove(stay_there) our_copies[stay_there].append(dataset) else: print "We do not want to keep a copy of ", dataset, status, wfo.status if len(to_be_cleaned): print "Where we can clean" print to_be_cleaned for site in to_be_cleaned: sites_and_datasets[site].append( (dataset, total_size * our_presence[site][1] / 100., status)) goes[dataset] = True else: print "no cleaning to be done" goes[dataset] = True print wfo.name, "scrutinized" if all(goes.values()): print "\t", wfo.name, "can toggle -out" def ask(): global last_answer last_answer = raw_input('go on ?') return last_answer if options.auto or ask() in ['y', '']: if all(goes.values()): wfo.status = wfo.status + '-out' wf_cleaned[wfo.name] = wfo.status continue elif last_answer in ['q', 'n']: break else: return if options.auto: pass elif last_answer in ['q']: return print "Potential cleanups" for (site, items) in sites_and_datasets.items(): cleanup = sum([size for (_, size, _) in items]) print "\n\t potential cleanup of", "%8.4f" % cleanup, "GB at ", site print "\n".join([ds + " " + st for ds, _, st in items]) datasets = [ds for ds, _, st in items] print "Copies and bits we are going to delete" print json.dumps(sites_and_datasets, indent=2) print "Copies we are keeping" print json.dumps(our_copies, indent=2) print "Workflows cleaned for output" print json.dumps(wf_cleaned, indent=2) stamp = time.strftime("%Y%m%d%H%M%S", time.localtime()) open('outcleaning_%s.json' % stamp, 'w').write(json.dumps(sites_and_datasets, indent=2)) open('keepcopies_%s.json' % stamp, 'w').write(json.dumps(our_copies, indent=2)) open('wfcleanout_%s.json' % stamp, 'w').write(json.dumps(wf_cleaned, indent=2)) if (not options.test) and (options.auto or raw_input( "Satisfied ? (y will trigger status change and deletion requests)") in ['y']): for (site, items) in sites_and_datasets.items(): datasets = [ds for ds, _, st in items] print "making deletion to", site result = makeDeleteRequest( url, site, datasets, "Cleanup output after production. DataOps will take care of approving it." ) print result ## approve it right away ? if 'MSS' in site: continue if 'Export' in site: continue if 'Buffer' in site: continue for did in [ item['id'] for item in result['phedex']['request_created'] ]: print "auto-approve disabled, but ready" #approveSubscription(url, did, nodes = [site], comments = 'Auto-approving production cleaning deletion') pass session.commit() else: print "Not making the deletion and changing statuses"
def transferor(url, specific=None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_transfered = len( session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0, max_to_handle - being_handled) allowed_to_transfer = max(0, max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer" else: print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer" print "... done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter( Workflow.status.startswith('considered')).all(): print "\t", wfo.name if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} ignored_input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority = None min_transfer_priority = None print "getting all wf in staging ..." stucks = json.loads(open('%s/stuck_transfers.json' % monitor_dir).read()) for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ds_s = dss.get(prim) if prim in stucks: sendLog('transferor', "%s appears stuck, so not counting it %s [GB]" % (prim, ds_s), wfi=wfh) ignored_input_sizes[prim] = ds_s else: input_sizes[prim] = ds_s sendLog('transferor', "%s needs %s [GB]" % (wfo.name, ds_s), wfi=wfh) if in_transfer_priority == None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority == None or in_transfer_priority == None: print "nothing is lining up for transfer" sendEmail("no request in staging", "no request in staging") return pass try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort(key=lambda i: i[1]) print "\n".join(map(str, ignored_values)) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort(key=lambda i: i[1]) print "\n".join(map(str, considered_values)) except Exception as e: print "trying to print the summary of input size" print str(e) print "... done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority print "transfers per sites" print json.dumps(transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get(prim) input_sizes[prim] = prim_size primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle(wfs_and_wfh) # Sort smallest transfers first; allows us to transfer as many as possible workflows. def prio_and_size(i, j): if int(i[1].request['RequestPriority']) == int( j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0))) else: return cmp(int(i[1].request['RequestPriority']), int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer" % ( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load" % ( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer" % ( st_in_transfer_already) print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % ( st_to_transfer) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB is the available limit" % transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = {} went_over_budget = False destination_cache = {} no_goes = set() max_per_round = UC.get('max_per_round').get('transferor', None) if max_per_round and not spec: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo, wfh) in wfs_and_wfh: print wfo.name, "to be transfered with priority", wfh.request[ 'RequestPriority'] if wfh.request['RequestStatus'] != 'assignment-approved': if wfh.request['RequestStatus'] in [ 'aborted', 'rejected', 'rejected-archived', 'aborted-archived' ]: wfo.status = 'trouble' ## so that we look or a replacement else: wfo.status = 'away' wfh.sendLog( 'transferor', '%s in status %s, setting %s' % (wfo.name, wfh.request['RequestStatus'], wfo.status)) continue (_, primary, _, _) = wfh.getIO() this_load = sum([input_sizes[prim] for prim in primary]) no_budget = False if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over bubget.") else: wfh.sendLog('transferor', "Transfer will go over bubget.") wfh.sendLog( 'transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit" % (this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority != None and min_transfer_priority != None: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over budget" % (wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog( 'transferor', "%s minimum priority %s < %s : stop" % (min_transfer_priority, wfh.request['RequestPriority'], in_transfer_priority)) no_budget = True ## throtlle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add(wfo.name) allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and 'secondaries' in CI.campaigns[ campaign]: allowed_secondary.update(CI.campaigns[campaign]['secondaries']) if secondary: if (secondary and allowed_secondary) and ( set(secondary) & allowed_secondary != set(secondary)): wfh.sendLog( 'assignor', '%s is not an allowed secondary' % (', '.join(set(secondary) - allowed_secondary))) no_go = True if no_go: continue ## check if the batch is announced def check_mcm(wfn): announced = False is_real = False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break except: try: for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break except: print "could not get mcm batch announcement, assuming not real" return announced, is_real if not use_mcm: announced, is_real = False, True else: if wfh.request['RequestType'] in ['ReReco']: announced, is_real = True, True else: announced, is_real = check_mcm(wfo.name) if not announced: wfh.sendLog('transferor', "does not look announced.") if not is_real: wfh.sendLog('transferor', "does not appear to be genuine.") ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime( time.strptime('.'.join(map(str, wfh.request['RequestDate'])), "%Y.%m.%d.%H.%M.%S")) / (60. * 60.) now = time.mktime(time.gmtime()) / (60. * 60.) if float(now - injection_time) < 4.: if not options.go and not announced: wfh.sendLog( 'transferor', "It is too soon to start transfer: %3.2fH remaining" % (now - injection_time)) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog( 'transferor', " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s" % (max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority != None and min_transfer_priority != None: if int(wfh.request['RequestPriority'] ) >= in_transfer_priority and int( wfh.request['RequestPriority'] ) != min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over %s" % (wfh.request['RequestPriority'], in_transfer_priority, max_to_transfer)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s" % (max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue ## the site white list considers site, campaign, memory and core information (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList() if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary) + list(parent) + list(secondary): ## lock everything flat NLI.lock(dataset) if not sites_allowed: wfh.sendLog('transferor', "not possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor', "%s has no possible sites to run at" % (wfo.name), level='critical') continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list( set(blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist']))) if 'LumiList' in wfh.request and wfh.request['LumiList']: ## augment with the lumi white list blocks = list( set(blocks + getDatasetBlocks(dataset, lumis=wfh.request['LumiList']))) if blocks: print "Reading", len(blocks), "in block whitelist" can_go = True staging = False allowed = True primary_destinations = set() if primary: copies_needed_from_CPUh, CPUh = wfh.getNCopies() if talk: print wfo.name, 'reads', ', '.join(primary), 'in primary' ## chope the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add(wfo.id) max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority'])) wfh.sendLog( 'transferor', "Would make %s from cpu requirement %s" % (copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request[ 'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[ wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[ wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog( 'transferor', "Maxed to %s by campaign configuration %s" % (copies_needed, wfh.request['Campaign'])) ### new ways of making the whole thing destinations, all_block_names = getDatasetDestinations( url, prim, within_sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [ site for (site, info) in destinations.items() if info['completion'] == 100 and info['data_fraction'] == 1 ] ## the rest is places it is going to be prim_destination = [ site for site in destinations.keys() if not site in prim_location ] if len(prim_location) >= copies_needed: wfh.sendLog( 'transferor', "The input is all fully in place at %s sites %s" % (len(prim_location), sorted(prim_location))) continue copies_needed = max(0, copies_needed - len(prim_location)) wfh.sendLog( 'transferor', "not counting existing copies ; now need %s" % copies_needed) copies_being_made = [ sum([ info['blocks'].keys().count(block) for site, info in destinations.items() if site in prim_destination ]) for block in all_block_names ] latching_on_transfers = set() [ latching_on_transfers.update(info['blocks'].values()) for site, info in destinations.items() if site in prim_destination ] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] prim_to_distribute = [ site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination ] ## take out the ones that cannot receive transfers prim_to_distribute = [ site for site in prim_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] wfh.sendLog( 'transferor', "Could be going to: %s" % sorted(prim_to_distribute)) if not prim_to_distribute or any([ transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute ]): ## means there is openings let me go print "There are transfer slots available:", [ (site, transfers_per_sites[site]) for site in prim_to_distribute ] #for site in sites_allowed: # #increment accross the board, regardless of real destination: could be changed # transfers_per_sites[site] += 1 else: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: wfh.sendLog( 'transferor', "Higher priority sample %s >= %s go-on over transfer slots available" % (wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog( 'transferor', "Not allowed to transfer more than %s per site at a time. Going overboard for %s" % (max_staging_per_site, sorted([ site for site in prim_to_distribute if transfers_per_sites[site] >= max_staging_per_site ]))) if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter( Transfer.phedexid == int(latching)).first() if not tfo: tfo = session.query(Transfer).filter( Transfer.phedexid == -int(latching)).first() if not tfo: tfo = Transfer(phedexid=latching) tfo.workflows_id = [] session.add(tfo) else: tfo.phedexid = latching ## make it positive ever if not wfo.id in tfo.workflows_id: print "adding", wfo.id, "to", tfo.id, "with phedexid", latching l = copy.deepcopy(tfo.workflows_id) l.append(wfo.id) tfo.workflows_id = l if not options.test: session.commit() else: session.flush( ) ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0, copies_needed - min(copies_being_made)) wfh.sendLog( 'transferor', "Not counting the copies being made ; then need %s" % copies_needed) if copies_needed == 0: wfh.sendLog( 'transferor', "The output is either fully in place or getting in full somewhere with %s" % latching_on_transfers) can_go = True continue elif len(prim_to_distribute) == 0: wfh.sendLog( 'transferor', "We are going to need extra copies, but no destinations seems available" ) prim_to_distribute = [ site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location ] prim_to_distribute = [ site for site in prim_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if len( prim_to_distribute ) > 0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk andpoint ? #tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops, sizes = getDatasetChops( prim, chop_threshold=options.chopsize, only_blocks=blocks) spreading = distributeToSites(chops, prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog( 'transferor', 'cannot send %s to any site, it cannot fit anywhere' % prim, level='critical') wfh.sendLog( 'transferor', "cannot send to any site. %s cannot seem to fit anywhere" % (prim)) staging = False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site] = blocks else: spreading[site] = [prim] transfer_sizes[prim] = input_sizes[ prim] ## this is approximate if blocks are specified can_go = False wfh.sendLog( 'transferor', "selected CE destinations %s" % (sorted(spreading.keys()))) for (site, items) in spreading.items(): all_transfers[site].extend(items) transfers_per_sites[site] += 1 primary_destinations.add(site) if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[ wfh.request['Campaign']]['SecondaryLocation'] print wfo.name, 'reads', ', '.join(secondary), 'in secondary' for sec in secondary: workflow_dependencies[sec].add(wfo.id) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec], _ = getDatasetDestinations( url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] destinations = dict([ (k, v) for (k, v) in destination_cache[sec].items() if site in se_allowed ]) ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [ destinations.pop(site) for (site, info) in destinations.items() if info['data_fraction'] < 0.9 ] sec_location = [ site for (site, info) in destinations.items() if info['completion'] >= 95 ] sec_destination = [ site for site in destinations.keys() if not site in sec_location ] else: ## old style presence = getDatasetPresence(url, sec) sec_location = [ site for site, pres in presence.items() if pres[1] > 90. ] ## more than 90% of the minbias at sites subscriptions = listSubscriptions(url, sec) sec_destination = [site for site in subscriptions] sec_to_distribute = [ site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any( [osite.startswith(site) for osite in sec_destination]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list( set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog( 'transferor', "the dataset %s could be removed from %s" % (sec, not_needed_anymore)) sec_to_distribute = list( set(sec_to_distribute) & set(override_sec_destination)) if len(sec_to_distribute) > 0: print "secondary could go to", sorted(sec_to_distribute) sec_size = dss.get(sec) for site in sec_to_distribute: site_se = SI.CE_to_SE(site) if (SI.disk[site_se] * 1024.) > sec_size: all_transfers[site].append(sec) can_go = False else: print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[ site_se] * 1024, "GB need", sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog('transferor', '%s is too big (%s) for %s (%s)' % (sec, sec_size, site_se, SI.disk[site_se] * 1024), level='critical') else: print "the secondary input does not have to be send to site" ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog( 'transferor', "latches on existing transfers, and nothing else, settin staging" ) wfo.status = 'staging' needs_transfer += 1 else: wfh.sendLog( 'transferor', "should just be assigned now to %s" % sorted(sites_allowed)) wfo.status = 'staged' passing_along += 1 wfh.sendLog('transferor', "setting status to %s" % wfo.status) session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog('transferor', "setting status to %s" % wfo.status) session.commit() wfh.sendLog('transferor', "needs a transfer") needs_transfer += 1 passing_along += 1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n" + "\n".join(no_goes), level='critical') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id = -1 wf_id_in_prestaging = set() for (site, items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site, "does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for" % (site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [ block for block in blocks if not block.split('#')[0] in datasets ] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks' % len(blocks) details_text += '\n\t%d needed blocks for %s' % ( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets' % len(datasets) details_text += '\n\t%s' % sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to", site, "(CE)", site_se, "(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y', 'yes', 'go']: continue if execute: priority = 'normal' cds = [ ds for ds in datasets + block_datasets if ds in max_priority ] if cds and False: ## I don't think this is working. subscription should be updated on the fly and regularly for raising the priority if needed ## decide on an overall priority : that's a bit too large though if any([max_priority[ds] >= 90000 for ds in cds]): priority = 'high' elif all([max_priority[ds] < 80000 for ds in cds]): priority = 'low' result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority) else: result = {'phedex': {'request_created': []}} fake_id -= 1 if not result: print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter( Transfer.phedexid == int(phedexid)).first() if not new_transfer: new_transfer = session.query(Transfer).filter( Transfer.phedexid == -int(phedexid)).first() print phedexid, "transfer created" if not new_transfer: new_transfer = Transfer(phedexid=phedexid) session.add(new_transfer) else: new_transfer.phedexid = phedexid ## make it positive again new_transfer.workflows_id = set() for transfering in list( set(map(lambda it: it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering]) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status != 'staging': if execute: tr_wf.status = 'staging' if talk: print "setting", tr_wf.name, "to staging" session.commit()
def transferor(url, specific=None, talk=True, options=None): if userLock('transferor'): return if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance')).all()) max_to_handle = options.maxworkflows allowed_to_handle = max(0, max_to_handle - being_handled) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" print "... done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter( Workflow.status == 'considered').all(): if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" input_sizes = {} ## list the size of those in transfer already in_transfer_priority = 0 min_transfer_priority = 100000000 print "getting all wf in staging ..." for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) (_, primary, _, _) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get(prim) in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority in_transfer_already = sum(input_sizes.values()) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get(prim) print "... done" grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB is the available limit" % transfer_limit # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = {} went_over_budget = False for (wfo, wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name, "to be transfered" #wfh = workflowInfo( url, wfo.name) (_, primary, _, _) = wfh.getIO() this_load = sum([input_sizes[prim] for prim in primary]) if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: print "Transfer has gone over bubget." else: print "Transfer will go over bubget." print "%15.4f GB this load" % this_load print "%15.4f GB already this round" % sum(transfer_sizes.values()) print "%15.4f GB is the available limit" % transfer_limit went_over_budget = True if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: print "Higher priority sample", wfh.request[ 'RequestPriority'], ">=", in_transfer_priority, "go-on over budget" else: if not options.go: print min_transfer_priority, "minimum priority", wfh.request[ 'RequestPriority'], "<", in_transfer_priority, "stop" continue ## throtlle by campaign go if not CI.go(wfh.request['Campaign']): print "No go for", wfh.request['Campaign'] if not options.go: continue ## check if the batch is announced announced = False is_real = False for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break if not announced: print wfo.name, "does not look announced." # skipping?, rejecting?, reporting?" if not is_real: print wfo.name, "does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime( time.strptime('.'.join(map(str, wfh.request['RequestDate'])), "%Y.%m.%d.%H.%M.%S")) / (60. * 60.) now = time.mktime(time.gmtime()) / (60. * 60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining" % ( now - injection_time) continue passing_along += 1 if passing_along >= allowed_to_handle: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: print "Higher priority sample", wfh.request[ 'RequestPriority'], ">=", in_transfer_priority, "go-on over", max_to_handle else: print "Not allowed to pass more than", max_to_handle, "at a time. Currently", being_handled, "handled, and adding", passing_along break (lheinput, primary, parent, secondary) = wfh.getIO() if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput, primary, parent, secondary)) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters( wfh.request['Campaign'])['SiteWhitelist'] blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] can_go = True staging = False if primary: if talk: print wfo.name, 'reads', ', '.join(primary), 'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority'])) sites_really_allowed = [ site for site in sites_allowed if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] print "Sites allowed minus the vetoed transfer" print sites_really_allowed copies_needed = int( 0.35 * len(sites_really_allowed) ) + 1 ## should just go for a fixed number based if the white list grows that big print "Would make", copies_needed, "copies" if options.maxcopy > 0: copies_needed = min(options.maxcopy, copies_needed) ## remove the sites that do not want transfers print "need", copies_needed workflow_dependencies[prim].add(wfo.id) presence = getDatasetPresence(url, prim) prim_location = [ site for site, pres in presence.items() if pres[0] == True ] if len(prim_location) >= copies_needed: print "The output is all fully in place at", len( prim_location), "sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0, copies_needed - len(prim_location)) print "now need", copies_needed subscriptions = listSubscriptions(url, prim) prim_destination = list( set([ site for (site, (tid, decision)) in subscriptions.items() if decision and not any([ site.endswith(veto) for veto in ['MSS', 'Export', 'Buffer'] ]) ])) ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place prim_destination = [ site for site in prim_destination if not site in prim_location ] ## add transfer dependencies latching_on_transfers = list( set([ tid for (site, (tid, decision)) in subscriptions.items() if decision and site in prim_destination and not any([ site.endswith(veto) for veto in ['MSS', 'Export', 'Buffer'] ]) ])) print latching_on_transfers for latching in latching_on_transfers: tfo = session.query(Transfer).filter( Transfer.phedexid == latching).first() if not tfo: tfo = Transfer(phedexid=latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding", wfo.id, "to", tfo.id, "with phedexid", latching l = copy.deepcopy(tfo.workflows_id) l.append(wfo.id) tfo.workflows_id = l if not options.test: session.commit() else: session.flush( ) ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? copies_needed = max(0, copies_needed - len(prim_destination)) print "then need", copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with", latching_on_transfers can_go = True continue prim_to_distribute = [ site for site in sites_allowed if not any( [osite.startswith(site) for osite in prim_location]) ] prim_to_distribute = [ site for site in prim_to_distribute if not any( [osite.startswith(site) for osite in prim_destination]) ] ## take out the ones that cannot receive transfers prim_to_distribute = [ site for site in prim_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if len( prim_to_distribute ) > 0: ## maybe that a parameter we can play with to limit the if not options or options.chop: spreading = distributeToSites(getDatasetChops(prim), prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges) else: spreading = {} for site in prim_to_distribute: spreading[site] = [prim] can_go = False transfer_sizes[prim] = input_sizes[prim] for (site, items) in spreading.items(): all_transfers[site].extend(items) if secondary: if talk: print wfo.name, 'reads', ', '.join(secondary), 'in secondary' for sec in secondary: workflow_dependencies[sec].add(wfo.id) presence = getDatasetPresence(url, sec) sec_location = [ site for site, pres in presence.items() if pres[1] > 90. ] ## more than 90% of the minbias at sites subscriptions = listSubscriptions(url, sec) sec_destination = [site for site in subscriptions] sec_to_distribute = [ site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any( [osite.startswith(site) for osite in sec_destination]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if len(sec_to_distribute) > 0: for site in sec_to_distribute: all_transfers[site].append(sec) can_go = False ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name, "latches on existing transfers, and nothing else" wfo.status = 'staging' else: print wfo.name, "should just be assigned NOW to", sites_allowed wfo.status = 'staged' print "setting status to", wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name, "latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to", wfo.status session.commit() print wfo.name, "needs a transfer" needs_transfer += 1 #print json.dumps(all_transfers) fake_id = -1 wf_id_in_prestaging = set() for (site, items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site, "does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to", site, "(CE)", site_se, "(SE) for" else: print "Would make a replica to", site, "(CE)", site_se, "(SE) for" print "\t", len(blocks), "blocks" ## remove blocks if full dataset is send out blocks = [ block for block in blocks if not block.split('#')[0] in datasets ] print "\t", len(blocks), "needed blocks for", list( set([block.split('#')[0] for block in blocks])) print "\t", len(datasets), "datasets" print "\t", datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y', 'yes', 'go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result = {'phedex': {'request_created': []}} fake_id -= 1 if not result: print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter( Transfer.phedexid == phedexid).first() print phedexid, "transfer created" if not new_transfer: new_transfer = Transfer(phedexid=phedexid) session.add(new_transfer) new_transfer.workflows_id = set() for transfering in list( set(map(lambda it: it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering]) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status != 'staging': if execute: tr_wf.status = 'staging' if talk: print "setting", tr_wf.name, "to staging" session.commit()
def main(): mysqlconn = MySQLdb.connect(host='dbod-cmsrv1.cern.ch', user='******', passwd="relval", port=5506) #conn = MySQLdb.connect(host='localhost', user='******', passwd='relval') curs = mysqlconn.cursor() curs.execute("use "+dbname+";") #curs.execute("lock tables batches write, batches_archive write, workflows write, workflows_archive write, datasets write, clone_reinsert_requests write") #workflow = line.rstrip('\n') #curs.execute("insert into workflows set hn_req=\""+hnrequest+"\", workflow_name=\""+workflow+"\";") curs.execute("select * from batches") batches=curs.fetchall() batches_colnames = [desc[0] for desc in curs.description] for batch in batches: #if batch[0] != 21: # continue blocks_dsets_to_transfer=[] blocks_not_at_site=[] batch_dict = dict(zip(batches_colnames, batch)) site = batch_dict["site"] if "T2" in site: site_disk = site elif "T1" in site: site_disk = site + "_Disk" else: os.system('echo '+site+' | mail -s \"input_dset_checkor.py error 1\" [email protected]') print "Neither T1 nor T2 is in site name, exiting" sys.exit(1) if site == "T2_CH_CERN_T0": site_disk = "T2_CH_CERN" if site == "T2_CH_CERN_AI": site_disk = "T2_CH_CERN" #print batch #print "" userid = batch_dict["useridyear"]+"_"+batch_dict["useridmonth"]+"_"+batch_dict["useridday"]+"_"+str(batch_dict["useridnum"])+"_"+str(batch_dict["batch_version_num"]) #if status == "waiting_for_transfer" and count % 10 == 0: if batch_dict["status"] == "waiting_for_transfer": print " userid ==> "+str(userid) #count = 0 all_dsets_blocks_at_site=True curs.execute("select workflow_name from workflows where useridyear = \""+ batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+"\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num = "+str(batch_dict["batch_version_num"])+";") wfs=curs.fetchall() for wf in wfs: print wf[0] headers = {"Content-type": "application/json", "Accept": "application/json"} conn = httplib.HTTPSConnection('cmsweb.cern.ch', cert_file = os.getenv('X509_USER_PROXY'), key_file = os.getenv('X509_USER_PROXY')) r1=conn.request("GET",'/reqmgr2/data/request/'+wf[0],headers=headers) r2=conn.getresponse() schema = json.loads(r2.read()) schema = schema['result'][0][wf[0]] isthereanmcpileupdataset=False for key, value in schema.items(): if type(value) is dict and key.startswith("Task"): if 'MCPileup' in value: isthereanmcpileupdataset=True ismcpileupdatasetatsite=utils.checkIfDatasetIsSubscribedToASite("cmsweb.cern.ch",value["MCPileup"],site_disk) if 'InputDataset' in value: inputdset=value['InputDataset'] if 'RunWhitelist' in value: runwhitelist=value['RunWhitelist'] blocks_fname=os.popen("mktemp").read().rstrip("\n") list_of_blocks=utils.getListOfBlocks(inputdset,str(runwhitelist)) for block in list_of_blocks: #this block (/DoubleMu/...) is not registered in phedex, so it cannot be subscribed to any site if block == "/DoubleMu/Run2011A-ZMu-08Nov2011-v1/RAW-RECO#93c53d22-25b2-11e1-8c62-003048f02c8a": continue isblockatsite = utils.checkIfBlockIsAtASite("cmsweb.cern.ch",block,site_disk) if not isblockatsite: all_dsets_blocks_at_site=False else: isdatasetatsite=utils.checkIfDatasetIsSubscribedToASite("cmsweb.cern.ch",inputdset,site_disk) if not isdatasetatsite: all_dsets_blocks_at_site=False if all_dsets_blocks_at_site and (not isthereanmcpileupdataset or ismcpileupdatasetatsite): curs.execute("update batches set status=\"input_dsets_ready\", current_status_start_time=\""+datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S")+"\" where useridyear = \""+ batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+"\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num = "+str(batch_dict["batch_version_num"])+";") mysqlconn.commit() if batch_dict["status"] == "approved": print " userid ==> "+str(userid) #print "checking input datasets for workflows in batch "+str(batchid) curs.execute("select workflow_name from workflows where useridyear = \""+ batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+"\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num = "+str(batch_dict["batch_version_num"])+";") wfs=curs.fetchall() for wf in wfs: print wf[0] headers = {"Content-type": "application/json", "Accept": "application/json"} conn = httplib.HTTPSConnection('cmsweb.cern.ch', cert_file = os.getenv('X509_USER_PROXY'), key_file = os.getenv('X509_USER_PROXY')) r1=conn.request("GET",'/reqmgr2/data/request/'+wf[0],headers=headers) r2=conn.getresponse() schema = json.loads(r2.read()) schema = schema['result'][0][wf[0]] for key, value in schema.items(): if type(value) is dict and key.startswith("Task"): if 'MCPileup' in value: isdatasetatsite=utils.checkIfDatasetIsSubscribedToASite("cmsweb.cern.ch",value['MCPileup'],site_disk) if not isdatasetatsite: blocks_dsets_to_transfer.append(value['MCPileup']) if 'InputDataset' in value: subscribed_to_disk=False inputdset=value['InputDataset'] if 'RunWhitelist' in value: runwhitelist=value['RunWhitelist'] list_of_blocks=utils.getListOfBlocks(inputdset,str(runwhitelist)) for block in list_of_blocks: #this block (/DoubleMu/...) is not registered in phedex, so it cannot be subscribed to any site if block == "/DoubleMu/Run2011A-ZMu-08Nov2011-v1/RAW-RECO#93c53d22-25b2-11e1-8c62-003048f02c8a": continue isblocksubscribedtosite=utils.checkIfBlockIsSubscribedToASite("cmsweb.cern.ch",block,site_disk) isblockatsite=utils.checkIfBlockIsAtASite("cmsweb.cern.ch",block,site_disk) if not isblocksubscribedtosite: blocks_dsets_to_transfer.append(block) if not isblockatsite: blocks_not_at_site.append(block) else: isdatasetsubscribedtosite=utils.checkIfDatasetIsSubscribedToASite("cmsweb.cern.ch",inputdset,site_disk) isdatasetatsite=utils.checkIfDatasetIsSubscribedToASite("cmsweb.cern.ch",inputdset,site_disk) if not isdatasetsubscribedtosite: blocks_dsets_to_transfer.append(inputdset) if not isdatasetatsite: blocks_not_at_site.append(inputdset) if blocks_dsets_to_transfer != []: print "transfering the following blocks:" print blocks_dsets_to_transfer result=utils.makeReplicaRequest(url="cmsweb.cern.ch", site=site_disk, datasets=blocks_dsets_to_transfer, comments="relval datasets", group = "RelVal") phedexid = result['phedex']['request_created'][0]['id'] utils.approveSubscription("cmsweb.cern.ch",phedexid) curs.execute("update batches set status=\"waiting_for_transfer\", current_status_start_time=\""+datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S")+"\" where useridyear = \""+ batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+"\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num = "+str(batch_dict["batch_version_num"])+";") mysqlconn.commit() elif blocks_not_at_site != []: print blocks_not_at_site curs.execute("update batches set status=\"waiting_for_transfer\", current_status_start_time=\""+datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S")+"\" where useridyear = \""+ batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+"\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num = "+str(batch_dict["batch_version_num"])+";") else: curs.execute("update batches set status=\"input_dsets_ready\", current_status_start_time=\""+datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S")+"\" where useridyear = \""+ batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+"\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num = "+str(batch_dict["batch_version_num"])+";") mysqlconn.commit()
def stagor(url,specific =None): done_by_wf_id = {} done_by_input = {} completion_by_input = {} good_enough = 100.0 for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all(): ## implement the grace period for by-passing the transfer. pass for transfer in session.query(Transfer).all(): if specific and str(transfer.phedexid)!=str(specific): continue skip=True for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf: if tr_wf.status == 'staging': skip=False break if skip: continue if transfer.phedexid<0: continue ## check the status of transfers checks = checkTransferApproval(url, transfer.phedexid) approved = all(checks.values()) if not approved: print transfer.phedexid,"is not yet approved" approveSubscription(url, transfer.phedexid) continue ## check on transfer completion checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True) if not specific: for dsname in checks: if not dsname in done_by_input: done_by_input[dsname]={} if not dsname in completion_by_input: completion_by_input[dsname] = {} done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values())) completion_by_input[dsname][transfer.phedexid]=checks[dsname].values() if checks: print "Checks for",transfer.phedexid,[node.values() for node in checks.values()] done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()])))) else: ## it is empty, is that a sign that all is done and away ? print "ERROR with the scubscriptions API of ",transfer.phedexid print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists" done = False ## the thing above is NOT giving the right number #done = False for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf:# and tr_wf.status == 'staging': if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={} done_by_wf_id[tr_wf.id][transfer.phedexid]=done if done: ## transfer.status = 'done' print transfer.phedexid,"is done" else: print transfer.phedexid,"not finished" pprint.pprint( checks ) #print done_by_input print "\n----\n" for dsname in done_by_input: fractions = None if dsname in completion_by_input: fractions = itertools.chain.from_iterable([check.values() for check in completion_by_input.values()]) ## the workflows in the waiting room for the dataset using_its = getWorkflowByInput(url, dsname) #print using_its using_wfos = [] for using_it in using_its: wf = session.query(Workflow).filter(Workflow.name == using_it).first() if wf: using_wfos.append( wf ) #need_sites = int(len(done_by_input[dsname].values())*0.7)+1 need_sites = len(done_by_input[dsname].values()) if need_sites > 10: need_sites = int(need_sites/2.) got = done_by_input[dsname].values().count(True) if all([wf.status != 'staging' for wf in using_wfos]): ## not a single ds-using wf is in staging => moved on already ## just forget about it print "presence of",dsname,"does not matter anymore" print "\t",done_by_input[dsname] print "\t",[wf.status for wf in using_wfos] print "\tneeds",need_sites continue #?? ## should the need_sites reduces with time ? # with dataset choping, reducing that number might work as a block black-list. if all(done_by_input[dsname].values()): print dsname,"is everywhere we wanted" ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is with us. setting staged and move on" wf.status = 'staged' session.commit() elif fractions and len(list(fractions))>1 and set(fractions)==1: print dsname,"is everywhere at the same fraction" print "We do not want this in the end. we want the data we asked for" continue ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is with us everywhere the same. setting staged and move on" wf.status = 'staged' session.commit() elif got >= need_sites: print dsname,"is almost everywhere we wanted" #print "We do not want this in the end. we want the data we asked for" #continue ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is almost with us. setting staged and move on" wf.status = 'staged' session.commit() else: print dsname print "\t",done_by_input[dsname] print "\tneeds",need_sites print "\tgot",got for wfid in done_by_wf_id: #print done_by_wf_id[wfid].values() ## ask that all related transfer get into a valid state if all(done_by_wf_id[wfid].values()): pass
def stagor(url, specific=None): done_by_wf_id = {} done_by_input = {} completion_by_input = {} good_enough = 100.0 for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): ## implement the grace period for by-passing the transfer. pass for transfer in session.query(Transfer).all(): if specific and str(transfer.phedexid) != str(specific): continue skip = True for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf: if tr_wf.status == 'staging': skip = False break if skip: continue if transfer.phedexid < 0: continue ## check the status of transfers checks = checkTransferApproval(url, transfer.phedexid) approved = all(checks.values()) if not approved: print transfer.phedexid, "is not yet approved" approveSubscription(url, transfer.phedexid) continue ## check on transfer completion checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True) if not specific: for dsname in checks: if not dsname in done_by_input: done_by_input[dsname] = {} if not dsname in completion_by_input: completion_by_input[dsname] = {} done_by_input[dsname][transfer.phedexid] = all( map(lambda i: i >= good_enough, checks[dsname].values())) completion_by_input[dsname][ transfer.phedexid] = checks[dsname].values() if checks: print "Checks for", transfer.phedexid, [ node.values() for node in checks.values() ] done = all( map( lambda i: i >= good_enough, list( itertools.chain.from_iterable( [node.values() for node in checks.values()])))) else: ## it is empty, is that a sign that all is done and away ? print "ERROR with the scubscriptions API of ", transfer.phedexid print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists" done = False ## the thing above is NOT giving the right number #done = False for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf: # and tr_wf.status == 'staging': if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id] = {} done_by_wf_id[tr_wf.id][transfer.phedexid] = done if done: ## transfer.status = 'done' print transfer.phedexid, "is done" else: print transfer.phedexid, "not finished" pprint.pprint(checks) #print done_by_input print "\n----\n" for dsname in done_by_input: fractions = None if dsname in completion_by_input: fractions = itertools.chain.from_iterable( [check.values() for check in completion_by_input.values()]) ## the workflows in the waiting room for the dataset using_its = getWorkflowByInput(url, dsname) #print using_its using_wfos = [] for using_it in using_its: wf = session.query(Workflow).filter( Workflow.name == using_it).first() if wf: using_wfos.append(wf) #need_sites = int(len(done_by_input[dsname].values())*0.7)+1 need_sites = len(done_by_input[dsname].values()) if need_sites > 10: need_sites = int(need_sites / 2.) got = done_by_input[dsname].values().count(True) if all([wf.status != 'staging' for wf in using_wfos]): ## not a single ds-using wf is in staging => moved on already ## just forget about it print "presence of", dsname, "does not matter anymore" print "\t", done_by_input[dsname] print "\t", [wf.status for wf in using_wfos] print "\tneeds", need_sites continue #?? ## should the need_sites reduces with time ? # with dataset choping, reducing that number might work as a block black-list. if all(done_by_input[dsname].values()): print dsname, "is everywhere we wanted" ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name, "is with us. setting staged and move on" wf.status = 'staged' session.commit() elif fractions and len(list(fractions)) > 1 and set(fractions) == 1: print dsname, "is everywhere at the same fraction" print "We do not want this in the end. we want the data we asked for" continue ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name, "is with us everywhere the same. setting staged and move on" wf.status = 'staged' session.commit() elif got >= need_sites: print dsname, "is almost everywhere we wanted" #print "We do not want this in the end. we want the data we asked for" #continue ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name, "is almost with us. setting staged and move on" wf.status = 'staged' session.commit() else: print dsname print "\t", done_by_input[dsname] print "\tneeds", need_sites print "\tgot", got for wfid in done_by_wf_id: #print done_by_wf_id[wfid].values() ## ask that all related transfer get into a valid state if all(done_by_wf_id[wfid].values()): pass
def transferor(url ,specific = None, talk=True, options=None): if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() all_transfers=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset data_to_wf = {} for wfo in session.query(Workflow).filter(Workflow.status=='considered').all(): if specific and not specific in wfo.name: continue print wfo.name,"to be transfered" wfh = workflowInfo( url, wfo.name) #injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) #now = time.mktime(time.gmtime()) / (60.*60.) #if float(now - injection_time) < 4.: # print "It is too soon to transfer", now, injection_time # continue (lheinput,primary,parent,secondary) = wfh.getIO() if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] can_go = True if primary: if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: workflow_dependencies[prim].add( wfo.id ) presence = getDatasetPresence( url, prim ) prim_location = [site for site,pres in presence.items() if pres[0]==True] subscriptions = listSubscriptions( url , prim ) prim_destination = [site for site in subscriptions] prim_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in prim_location])] prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in prim_destination])] if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: spreading = distributeToSites( [[prim]]+getDatasetChops(prim), prim_to_distribute, n_copies = 3, weights=SI.cpu_pledges) else: spreading = {} for site in prim_to_distribute: spreading[site]=[prim] can_go = False for (site,items) in spreading.items(): all_transfers[site].extend( items ) if secondary: if talk: print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] if len( sec_to_distribute )>0: for site in sec_to_distribute: all_transfers[site].append( sec ) can_go = False ## is that possible to do something more if can_go: print wfo.name,"should just be assigned NOW to",sites_allowed wfo.status = 'staged' session.commit() continue else: print wfo.name,"needs a transfer" #print json.dumps(all_transfers) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] print "Making a replica to",site,"(CE)",site_se,"(SE) for" print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] print "\t",len(blocks),"needed blocks" print "\t",len(datasets),"datasets" print "\t",datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging') else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first() if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def transferor(url ,specific = None, talk=True, options=None): if userLock('transferor'): return if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()) max_to_handle = options.maxworkflows allowed_to_handle = max(0,max_to_handle - being_handled) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status=='considered').all(): if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" input_sizes = {} ## list the size of those in transfer already in_transfer_priority=0 min_transfer_priority=100000000 print "getting all wf in staging ..." for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) (_,primary,_,_) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get( prim ) in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority in_transfer_already = sum(input_sizes.values()) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get( prim ) print "... done" grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print "%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False for (wfo,wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name,"to be transfered" #wfh = workflowInfo( url, wfo.name) (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: print "Transfer has gone over bubget." else: print "Transfer will go over bubget." print "%15.4f GB this load"%this_load print "%15.4f GB already this round"%sum(transfer_sizes.values()) print "%15.4f GB is the available limit"%transfer_limit went_over_budget=True if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget" else: if not options.go: print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop" continue ## throtlle by campaign go if not CI.go( wfh.request['Campaign'] ): print "No go for",wfh.request['Campaign'] if not options.go: continue ## check if the batch is announced announced=False is_real=False for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break if not announced: print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?" if not is_real: print wfo.name,"does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time) continue passing_along += 1 if passing_along >= allowed_to_handle: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle else: print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along break (lheinput,primary,parent,secondary) = wfh.getIO() if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] can_go = True staging=False if primary: if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) sites_really_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] print "Sites allowed minus the vetoed transfer" print sites_really_allowed copies_needed = int(0.35*len(sites_really_allowed))+1 ## should just go for a fixed number based if the white list grows that big print "Would make",copies_needed,"copies" if options.maxcopy>0: copies_needed = min(options.maxcopy,copies_needed) ## remove the sites that do not want transfers print "need",copies_needed workflow_dependencies[prim].add( wfo.id ) presence = getDatasetPresence( url, prim ) prim_location = [site for site,pres in presence.items() if pres[0]==True] if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed subscriptions = listSubscriptions( url , prim ) prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place prim_destination = [site for site in prim_destination if not site in prim_location] ## add transfer dependencies latching_on_transfers = list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) print latching_on_transfers for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? copies_needed = max(0,copies_needed - len(prim_destination)) print "then need",copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with",latching_on_transfers can_go = True continue prim_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in prim_location])] prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in prim_destination])] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: spreading = distributeToSites( getDatasetChops(prim), prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges) else: spreading = {} for site in prim_to_distribute: spreading[site]=[prim] can_go = False transfer_sizes[prim] = input_sizes[prim] for (site,items) in spreading.items(): all_transfers[site].extend( items ) if secondary: if talk: print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len( sec_to_distribute )>0: for site in sec_to_distribute: all_transfers[site].append( sec ) can_go = False ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name,"latches on existing transfers, and nothing else" wfo.status = 'staging' else: print wfo.name,"should just be assigned NOW to",sites_allowed wfo.status = 'staged' print "setting status to",wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name,"latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to",wfo.status session.commit() print wfo.name,"needs a transfer" needs_transfer+=1 #print json.dumps(all_transfers) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to",site,"(CE)",site_se,"(SE) for" else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) print "\t",len(datasets),"datasets" print "\t",datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first() print phedexid,"transfer created" if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def main(): mysqlconn = MySQLdb.connect(host='dbod-cmsrv1.cern.ch', user='******', passwd="relval", port=5506) curs = mysqlconn.cursor() curs.execute("use "+dbname+";") #curs.execute("lock tables batches write, batches_archive write, workflows write, workflows_archive write, datasets write, clone_reinsert_requests write") curs.execute("select * from batches") batches=curs.fetchall() batches_colnames = [desc[0] for desc in curs.description] for batch in batches: batch_dict= dict(zip(batches_colnames, batch)) if batch_dict["status"] != "assigned": continue userid=batch_dict["useridyear"]+"_"+batch_dict["useridmonth"]+"_"+batch_dict["useridday"]+"_"+str(batch_dict["useridnum"])+"_"+str(batch_dict["batch_version_num"]) print " userid ==> "+userid curs.execute("select workflow_name from workflows where useridyear = \""+batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+ "\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num ="+str(batch_dict["batch_version_num"])+";") wfs=curs.fetchall() n_workflows=0 n_completed=0 for wf in wfs: n_workflows=n_workflows+1 conn = httplib.HTTPSConnection(url, cert_file = os.getenv('X509_USER_PROXY'), key_file = os.getenv('X509_USER_PROXY')) r1=conn.request('GET','/reqmgr2/data/request?name='+wf[0],headers={"Accept": "application/json"}) r2=conn.getresponse() data = r2.read() if r2.status != 200: time.sleep(10) #try it again conn = httplib.HTTPSConnection(url, cert_file = os.getenv('X509_USER_PROXY'), key_file = os.getenv('X509_USER_PROXY')) r1=conn.request('GET','/reqmgr2/data/request?name='+wf[0],headers={"Accept": "application/json"}) r2=conn.getresponse() data = r2.read() if r2.status != 200: time.sleep(10) #try it a third time conn = httplib.HTTPSConnection(url, cert_file = os.getenv('X509_USER_PROXY'), key_file = os.getenv('X509_USER_PROXY')) r1=conn.request('GET','/reqmgr2/data/request?name='+wf[0],headers={"Accept": "application/json"}) r2=conn.getresponse() data = r2.read() if r2.status != 200: os.system('echo \"'+wf[0]+'\" | mail -s \"announcor.py error 1\" [email protected]') sys.exit(1) s = json.loads(data) for status in s['result'][0][wf[0]]['RequestTransition']: if status['Status'] == "completed" or status['Status'] == "force-complete": #conn = httplib.HTTPSConnection(url, cert_file = os.getenv('X509_USER_PROXY'), key_file = os.getenv('X509_USER_PROXY')) #r1=conn.request('GET',"/wmstatsserver/data/isfinished/"+wf[0],headers={"Accept": "application/json"}) #r2=conn.getresponse() #data = r2.read() #s = json.loads(data) #if s['result'][0] == "true": # n_completed=n_completed+1 n_completed=n_completed+1 break print "datetime.datetime.now() = " + str(datetime.datetime.now()) print "n_workflows = " + str(n_workflows) print "n_completed = " + str(n_completed) if n_workflows != n_completed: continue #string="2016_04_11_1_0" #if not (string.split('_')[0] == batch_dict["useridyear"] and string.split('_')[1] == batch_dict["useridmonth"] and string.split('_')[2] == batch_dict["useridday"] and string.split('_')[3] == str(batch_dict["useridnum"]) and string.split('_')[4] == str(batch_dict["batch_version_num"])): # continue wf_list = [] for wf in wfs: print wf[0] wf_list.append(wf[0]) job_failure_information=collect_job_failure_information.collect_job_failure_information(wf_list) needs_assistance = assistance_decision.assistance_decision(job_failure_information) if needs_assistance: curs.execute("update batches set status=\"assistance\", current_status_start_time=\""+datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S")+"\" where useridyear = \""+batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+ "\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num ="+str(batch_dict["batch_version_num"])+";") mysqlconn.commit() os.system('echo \"batch_id: '+userid+'\" | mail -s \"a batch of relval workflows needs assistance\" [email protected]') continue #if there is a '\r' character in the body of an e-mail, it does not get sent description=batch_dict["description"].replace('\r','') for wf in wf_list: too_many_events_check.too_many_events_check(wf) dset_nevents_list=collect_dsets_and_nevents.collect_dsets_and_nevents(wf_list) print_dsets_and_nevents.print_dsets_and_nevents(dset_nevents_list, userid+".txt") ret=os.system("cp "+userid+".txt /afs/cern.ch/user/r/relval/webpage/relval_stats/"+userid+".txt") if ret == 0: os.system("rm "+userid+".txt") else: os.system('echo \"'+userid+'\" | mail -s \"announcor.py error 2\" [email protected]') sys.exit(0) dsets_list = [] for dset_nevents in dset_nevents_list: dsets_list.append(dset_nevents[0]) for dset in dsets_list: setDatasetStatusDBS3.setStatusDBS3("https://cmsweb.cern.ch/dbs/prod/global/DBSWriter", dset, "VALID", True) for wf in wf_list: reqMgrClient.closeOutWorkflow("cmsweb.cern.ch",wf) reqMgrClient.announceWorkflow("cmsweb.cern.ch",wf) msg = MIMEMultipart() reply_to = [] #send_to = ["*****@*****.**","*****@*****.**"] send_to = ["*****@*****.**","*****@*****.**","*****@*****.**"] #send_to = ["*****@*****.**"] #msg['In-Reply-To'] = hn_message_id #msg['References'] = hn_message_id msg['From'] = "*****@*****.**" msg['reply-to'] = COMMASPACE.join(reply_to) msg['To'] = COMMASPACE.join(send_to) msg['Date'] = formatdate(localtime=True) msg['Subject'] = batch_dict["announcement_title"] msg['Message-ID'] = email.Utils.make_msgid() messageText="Dear all,\n" messageText=messageText+"\n" messageText=messageText+"A batch of relval workflows has finished.\n" messageText=messageText+"\n" messageText=messageText+"Batch ID:\n" messageText=messageText+"\n" messageText=messageText+userid+"\n" if batch_dict["batch_version_num"] > 0: messageText=messageText+"\n" messageText=messageText+"original workflow name ==> clone name:\n" messageText=messageText+"\n" curs.execute("select workflow_name,original_workflow_name from workflows where useridyear = \""+batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+ "\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num ="+str(batch_dict["batch_version_num"])+";") workflows=curs.fetchall() for workflow in workflows: messageText=messageText+workflow[1] + " ==> "+workflow[0] + "\n" messageText=messageText+"\n" messageText=messageText+"List of datasets:\n" messageText=messageText+"\n" messageText=messageText+"http://cms-project-relval.web.cern.ch/cms-project-relval/relval_stats/"+userid+".txt\n" messageText=messageText+"\n" messageText=messageText+"Description:\n" messageText=messageText+"\n" messageText=messageText+description.rstrip('\n') messageText=messageText+"\n" #messageText=messageText+"\n" [istherefailureinformation,return_string]=print_job_failure_information.print_job_failure_information(job_failure_information) if istherefailureinformation: messageText=messageText+"\n" messageText=messageText+return_string messageText=messageText+"\n" messageText=messageText+"\n" messageText=messageText+"RelVal Batch Manager" #put the announcement message into an e-mail to the relval hypernews and also in a url output_file = open("/afs/cern.ch/user/r/relval/webpage/relval_announcements/"+userid+".txt", 'w') output_file.write(messageText) try: msg.attach(MIMEText(messageText)) smtpObj = smtplib.SMTP() smtpObj.connect() smtpObj.sendmail("*****@*****.**", send_to, msg.as_string()) smtpObj.close() except Exception as e: print "Error: unable to send email: %s" %(str(e)) dsets_fnal_disk_list = [] dsets_cern_disk_list = [] for dset in dsets_list: #print dset.split('/') # we were asked to transfer some specific datasets to the cern tier 2 if dset.split('/')[3] != "RECO" and dset.split('/')[3] != "ALCARECO": dsets_cern_disk_list.append(dset) if dset.split('/')[3] == "GEN-SIM": dsets_fnal_disk_list.append(dset) if dset.split('/')[3] == "GEN-SIM-DIGI-RAW": dsets_fnal_disk_list.append(dset) if dset.split('/')[3] == "GEN-SIM-RECO": dsets_fnal_disk_list.append(dset) if "RelValTTBar" in dset.split('/')[1] and "TkAlMinBias" in dset.split('/')[2] and dset.split('/')[3] != "ALCARECO": dsets_cern_disk_list.append(dset) if "MinimumBias" in dset.split('/')[1] and "SiStripCalMinBias" in dset.split('/')[2] and dset.split('/')[3] != "ALCARECO": dsets_cern_disk_list.append(dset) result=utils.makeReplicaRequest("cmsweb.cern.ch", "T2_CH_CERN", dsets_cern_disk_list, "relval datasets",group="RelVal") if result != None: phedexid = result['phedex']['request_created'][0]['id'] utils.approveSubscription("cmsweb.cern.ch",phedexid) result=utils.makeReplicaRequest("cmsweb.cern.ch", "T1_US_FNAL_Disk", dsets_fnal_disk_list, "relval datasets", group = "RelVal") if result != None: phedexid = result['phedex']['request_created'][0]['id'] utils.approveSubscription("cmsweb.cern.ch",phedexid) result=utils.makeMoveRequest("cmsweb.cern.ch", "T0_CH_CERN_MSS", dsets_list, "relval datasets", group = "RelVal") if result != None: phedexid = result['phedex']['request_created'][0]['id'] #even if you disapprove the subscription at the source, it will still deleted the datasets that are at the source but not subscribed their utils.disapproveSubscription("cmsweb.cern.ch",phedexid,["T2_CH_CERN"]) utils.disapproveSubscription("cmsweb.cern.ch",phedexid,["T1_US_FNAL_Disk"]) utils.disapproveSubscription("cmsweb.cern.ch",phedexid,["T1_FR_CCIN2P3_Disk"]) utils.disapproveSubscription("cmsweb.cern.ch",phedexid,["T1_DE_KIT_Disk"]) utils.approveSubscription("cmsweb.cern.ch",phedexid,["T0_CH_CERN_MSS"]) #phedexid = result['phedex']['request_created'][0]['id'] #utils.approveSubscription("cmsweb.cern.ch",phedexid) curs.execute("update batches set status=\"announced\", current_status_start_time=\""+datetime.datetime.now().strftime("%y:%m:%d %H:%M:%S")+"\" where useridyear = \""+batch_dict["useridyear"]+"\" and useridmonth = \""+batch_dict["useridmonth"]+ "\" and useridday = \""+batch_dict["useridday"]+"\" and useridnum = "+str(batch_dict["useridnum"])+" and batch_version_num ="+str(batch_dict["batch_version_num"])+";") mysqlconn.commit()
def transferor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() LI = lockInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0,max_to_handle - being_handled) allowed_to_transfer = max(0,max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer" else: print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) needing_locks=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status=='considered').all(): print "\t",wfo.name if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority=0 min_transfer_priority=100000000 print "getting all wf in staging ..." for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) (lheinput,primary,parent,secondary) = wfh.getIO() sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: input_sizes[prim] = dss.get( prim ) print "\t",wfo.name,"needs",input_sizes[prim],"GB" in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority print "transfers per sites" print json.dumps( transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) # shuffle first by name random.shuffle( wfs_and_wfh ) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: input_sizes[prim] = dss.get( prim ) print "... done" cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already ) print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer ) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print "%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False destination_cache = {} for (wfo,wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name,"to be transfered" #wfh = workflowInfo( url, wfo.name) (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: print "Transfer has gone over bubget." else: print "Transfer will go over bubget." print "%15.4f GB this load"%this_load print "%15.4f GB already this round"%sum(transfer_sizes.values()) print "%15.4f GB is the available limit"%transfer_limit went_over_budget=True if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget" else: if not options.go: print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop" continue ## throtlle by campaign go if not CI.go( wfh.request['Campaign'] ): print "No go for",wfh.request['Campaign'] if not options.go: sendEmail("no go for managing","No go for "+wfh.request['Campaign']) continue ## check if the batch is announced def check_mcm(wfn): announced=False is_real=False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: print "could not get mcm batch announcement, assuming not real" return announced,is_real if not use_mcm: announced,is_real = False,True else: announced,is_real = check_mcm( wfo.name ) if not announced: print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?" if not is_real: print wfo.name,"does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle else: print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along if not options.go: break if this_load and needs_transfer >= allowed_to_transfer: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_transfer else: print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"transfering, and adding",needs_transfer if not options.go: continue (lheinput,primary,parent,secondary) = wfh.getIO() for dataset in list(primary)+list(parent)+list(secondary): ## lock everything flat NLI.lock( dataset ) if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] if 'SiteBlacklist' in CI.parameters(wfh.request['Campaign']): sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfh.request['Campaign'])['SiteBlacklist'])) ## reduce right away to sites in case of memory limitation memory_allowed = SI.sitesByMemory( wfh.request['Memory'] ) if memory_allowed!=None: print "sites allowing", wfh.request['Memory'],"are",memory_allowed sites_allowed = list(set(sites_allowed) & set(memory_allowed)) if not sites_allowed: print wfo.name,"has no possible sites to run at" print "available for",wfh.request['Memory'],"are",memory_allowed sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## should make the block selection here pass if 'LumiList' in wfh.request and wfh.request['LumiList']: ## same, we could be doing the white list here too pass if blocks: print "Reading",len(blocks),"in whitelist" can_go = True staging=False allowed=True if primary: copies_needed_from_CPUh,CPUh = wfh.getNCopies() if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) sites_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] print "Sites allowed minus the vetoed transfer" print sorted(sites_allowed) copies_needed_from_site = int(0.35*len(sites_allowed))+1 ## should just go for a fixed number based if the white list grows that big print "Would make",copies_needed_from_site,"copies from site white list" copies_needed = copies_needed_from_site print "Would make",copies_needed_from_CPUh,"from cpu requirement",CPUh copies_needed = copies_needed_from_CPUh if options.maxcopy>0: ## stop maxing things out ?? #copies_needed = min(options.maxcopy,copies_needed) #print "Maxed to",copies_needed if copies_needed_from_CPUh > options.maxcopy: sendEmail('An example of more than three copies','for %s it could have been beneficial to make %s copies'%( wfo.name, copies_needed_from_CPUh)) if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign,copies_needed_from_site) print "Maxed to",copies_needed,"by campaign configuration",wfh.request['Campaign'] ## remove the sites that do not want transfers workflow_dependencies[prim].add( wfo.id ) ##################################### ###### JR 3/8/15 #### deprecating this """ presence = getDatasetPresence( url, prim , within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) prim_location = [site for site,pres in presence.items() if pres[0]==True] prim_parts = [site for site,pres in presence.items() if pres[0]==False] if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed subscriptions = listSubscriptions( url , prim , sites_allowed ) prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) ## remove the subscription where the dataset is in parts at #prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']]) and not site in prim_parts])) ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place prim_destination = [site for site in prim_destination if not site in prim_location] ## add transfer dependencies latching_on_transfers = list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) print latching_on_transfers """ ###### JR 3/8/15 #### deprecating this ##################################### ### new ways of making the whole thing destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks ) #destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='DataOps') #anaops_destinations,anaops_all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='AnalysisOps' ) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1] ## the rest is places it is going to be prim_destination = [site for site in destinations.keys() if not site in prim_location] ## need to take out the transfer veto prim_destination = [site for site in prim_destination if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] for dsite in prim_destination: needing_locks[dsite].append( prim ) if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites",prim_location continue copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names] latching_on_transfers = set() [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]): ## means there is openings let me go print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute] for site in sites_allowed: #increment accross the board, regardless of real destination: could be changed transfers_per_sites[site] += 1 else: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over transfer slots available" else: print "Not allowed to transfer more than",max_staging_per_site," per site at a time. Going overboard for",[site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site] if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0,copies_needed - min(copies_being_made)) print "then need",copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with",latching_on_transfers can_go = True continue if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks) spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site]=blocks else: spreading[site]=[prim] transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified can_go = False print "selected CE destinations",spreading.keys() for (site,items) in spreading.items(): all_transfers[site].extend( items ) if not allowed: print "Not allowed to move on with",wfo.name continue if secondary: if talk: print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) if False: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) destinations = destination_cache[sec] ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9] sec_location = [site for (site,info) in destinations.items() if info['completion']>=95] sec_destination = [site for site in destinations.keys() if not site in sec_location] else: ## old style presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] for site in sec_location: needing_locks[site].append( sec ) for site in sec_destination: needing_locks[site].append( sec ) sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len( sec_to_distribute )>0: sec_size = dss.get( sec ) for site in sec_to_distribute: site_se =SI.CE_to_SE(site) if (SI.disk[site_se]*1024.) > sec_size: all_transfers[site].append( sec ) can_go = False else: print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name,"latches on existing transfers, and nothing else" wfo.status = 'staging' needs_transfer+=1 else: print wfo.name,"should just be assigned NOW to",sites_allowed wfo.status = 'staged' passing_along+=1 print "setting status to",wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name,"latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to",wfo.status session.commit() print wfo.name,"needs a transfer" needs_transfer+=1 passing_along+=1 print "accumulated locks of dataset in place" print json.dumps(needing_locks, indent=2) for site,items in needing_locks.items(): for item in items: LI.lock( item, SI.CE_to_SE(site), 'usable input') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to",site,"(CE)",site_se,"(SE) for" else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) print "\t",len(datasets),"datasets" print "\t",datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ #for item in list(set([it.split('#')[0] for it in items_to_transfer])): for item in items_to_transfer: LI.lock( item, site_se, 'pre-staging') else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first() print phedexid,"transfer created" if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def stagor(url,specific =None, options=None): if not componentInfo().check(): return SI = global_SI CI = campaignInfo() UC = unifiedConfiguration() done_by_wf_id = {} done_by_input = {} completion_by_input = {} good_enough = 100.0 lost = json.loads(open('lost_blocks_datasets.json').read()) still_lost = [] for dataset in lost: l = findLostBlocks(url ,dataset) if not l: print dataset,"is not really lost" else: still_lost.append( dataset ) open('lost_blocks_datasets.json','w').write( json.dumps( still_lost, indent=2) ) cached_transfer_statuses = json.loads(open('cached_transfer_statuses.json').read()) if options.fast: print "doing the fast check of staged with threshold:",options.goodavailability for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all(): if specific and not specific in wfo.name: continue wfi = workflowInfo(url, wfo.name) (_,primaries,_,secondaries,sites_allowed) = wfi.getSiteWhiteList() if 'SiteWhitelist' in CI.parameters(wfi.request['Campaign']): sites_allowed = CI.parameters(wfi.request['Campaign'])['SiteWhitelist'] if 'SiteBlacklist' in CI.parameters(wfi.request['Campaign']): sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfi.request['Campaign'])['SiteBlacklist'])) se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] all_check = True n_copies = wfi.getNCopies() for dataset in list(primaries):#+list(secondaries) ? #print se_allowed available = getDatasetBlocksFraction( url , dataset , sites=se_allowed ) #all_check &= (available >= options.goodavailability) all_check &= (available >= n_copies) if not all_check: break if all_check: print "\t\t",wfo.name,"can go staged" wfo.status = 'staged' session.commit() else: print "\t",wfo.name,"can wait a bit more" return for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all(): wfi = workflowInfo(url, wfo.name) if wfi.request['RequestStatus'] in ['running-open','running-closed','completed']: print wfo.name,"is",wfi.request['RequestStatus'] wfi.status='away' session.commit() continue _,primaries,_,secondaries = wfi.getIO() for dataset in list(primaries)+list(secondaries): done_by_input[dataset] = {} completion_by_input[dataset] = {} print wfo.name,"needs",dataset ## this loop is very expensive and will not function at some point. ## transfer objects should probably be deleted as some point for transfer in session.query(Transfer).filter(Transfer.phedexid>0).all(): if specific and str(transfer.phedexid)!=str(specific): continue skip=True for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf: if tr_wf.status == 'staging': print "\t",transfer.phedexid,"is staging for",tr_wf.name skip=False if skip: print "setting",transfer.phedexid,"to negative value" transfer.phedexid = -transfer.phedexid session.commit() continue if transfer.phedexid<0: continue ## check the status of transfers checks = checkTransferApproval(url, transfer.phedexid) approved = all(checks.values()) if not approved: print transfer.phedexid,"is not yet approved" approveSubscription(url, transfer.phedexid) continue ## check on transfer completion if str(transfer.phedexid) in cached_transfer_statuses: ### use a cache for transfer that already looked done print "read",transfer.phedexid,"from cache" checks = cached_transfer_statuses[str(transfer.phedexid)] else: checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True) if not specific: for dsname in checks: if not dsname in done_by_input: done_by_input[dsname]={} if not dsname in completion_by_input: completion_by_input[dsname] = {} done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values())) completion_by_input[dsname][transfer.phedexid]=checks[dsname].values() if checks: print "Checks for",transfer.phedexid,[node.values() for node in checks.values()] done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()])))) else: ## it is empty, is that a sign that all is done and away ? print "ERROR with the scubscriptions API of ",transfer.phedexid print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists" done = False ## the thing above is NOT giving the right number #done = False for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf:# and tr_wf.status == 'staging': if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={} done_by_wf_id[tr_wf.id][transfer.phedexid]=done ## for those that are in staging, and the destination site is in drain #if not done and tr_wf.status == 'staging': if done: ## transfer.status = 'done' print transfer.phedexid,"is done" cached_transfer_statuses[str(transfer.phedexid)] = copy.deepcopy(checks) else: print transfer.phedexid,"not finished" pprint.pprint( checks ) open('cached_transfer_statuses.json','w').write( json.dumps( cached_transfer_statuses, indent=2)) missing_in_action = defaultdict(list) #print done_by_input print "\n----\n" for dsname in done_by_input: fractions = None if dsname in completion_by_input: fractions = itertools.chain.from_iterable([check.values() for check in completion_by_input.values()]) ## the workflows in the waiting room for the dataset using_its = getWorkflowByInput(url, dsname) #print using_its using_wfos = [] for using_it in using_its: wf = session.query(Workflow).filter(Workflow.name == using_it).first() if wf: using_wfos.append( wf ) if not len(done_by_input[dsname]): print "For dataset",dsname,"there are no transfer report. That's an issue." for wf in using_wfos: if wf.status == 'staging': if UC.get("stagor_sends_back"): print "sending",wf.name,"back to considered" wf.status = 'considered' session.commit() sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name) else: print "would send",wf.name,"back to considered" sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name) continue #need_sites = int(len(done_by_input[dsname].values())*0.7)+1 need_sites = len(done_by_input[dsname].values()) #if need_sites > 10: need_sites = int(need_sites/2.) got = done_by_input[dsname].values().count(True) if all([wf.status != 'staging' for wf in using_wfos]): ## not a single ds-using wf is in staging => moved on already ## just forget about it #print "presence of",dsname,"does not matter anymore" #print "\t",done_by_input[dsname] #print "\t",[wf.status for wf in using_wfos] #print "\tneeds",need_sites continue ## should the need_sites reduces with time ? # with dataset choping, reducing that number might work as a block black-list. if len(done_by_input[dsname].values()) and all(done_by_input[dsname].values()): print dsname,"is everywhere we wanted" ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is with us. setting staged and move on" wf.status = 'staged' session.commit() elif fractions and len(list(fractions))>1 and set(fractions)==1: print dsname,"is everywhere at the same fraction" print "We do not want this in the end. we want the data we asked for" continue ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is with us everywhere the same. setting staged and move on" wf.status = 'staged' session.commit() elif got >= need_sites: print dsname,"is almost everywhere we wanted" #print "We do not want this in the end. we want the data we asked for" #continue ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is almost with us. setting staged and move on" wf.status = 'staged' session.commit() else: print "incomplete",dsname lost = findLostBlocks(url, dsname) lost_names = [item['name'] for item in lost] try: known_lost = json.loads(open('lost_blocks_datasets.json').read()) except: print "enable to get the known_lost from local json file" known_lost = [] if lost: print "We have lost",len(lost),"blocks",lost_names #print json.dumps( lost , indent=2 ) if lost and not dsname in known_lost: ## make a deeper investigation of the block location to see whether it's really no-where no-where sendEmail('we have lost a few blocks', str(len(lost))+" in total.\nDetails \n:"+json.dumps( lost , indent=2 )) known_lost.append(dsname) rr= open('lost_blocks_datasets.json','w') rr.write( json.dumps( known_lost, indent=2)) rr.close() ## should the status be change to held-staging and pending on a ticket missings = [pid for (pid,d) in done_by_input[dsname].items() if d==False] print "\t",done_by_input[dsname] print "\tneeds",need_sites print "\tgot",got print "\tmissing",missings missing_in_action[dsname].extend( missings ) open('/afs/cern.ch/user/c/cmst2/www/unified/incomplete_transfers.json','w').write( json.dumps(missing_in_action, indent=2) ) print "Stuck transfers and datasets" print json.dumps( missing_in_action, indent=2 ) print "Going further and make a report of stuck transfers" datasets_by_phid = defaultdict(set) for dataset in missing_in_action: for phid in missing_in_action[dataset]: #print dataset,"stuck through",phid datasets_by_phid[phid].add( dataset ) bad_destinations = defaultdict(set) bad_sources = defaultdict(set) report = "" really_stuck_dataset = set() for phid,datasets in datasets_by_phid.items(): issues = checkTransferLag( url, phid , datasets=list(datasets) ) for dataset in issues: for block in issues[dataset]: for destination in issues[dataset][block]: (block_size,destination_size,delay,rate,dones) = issues[dataset][block][destination] ## count x_Buffer and x_MSS as one source redones=[] for d in dones: if d.endswith('Buffer') or d.endswith('Export'): if d.replace('Buffer','MSS').replace('Export','MSS') in dones: continue else: redones.append( d ) else: redones.append( d ) dones = list(set( redones )) #dones = filter(lambda s : (s.endswith('Buffer') and not s.replace('Buffer','MSS') in dones) or (not s.endswith('Buffer')) , dones) if delay>7 and rate<0.0004: if len(dones)>1: ## its the destination that sucks bad_destinations[destination].add( block ) else: dum=[bad_sources[d].add( block ) for d in dones] really_stuck_dataset.add( dataset ) report += "%s is not getting to %s, out of %s faster than %f [GB/s] since %f [d]\n"%(block,destination,", ".join(dones), rate, delay) print "\n"*2 ## create tickets right away ? report+="\nbad sources "+",".join(bad_sources.keys())+"\n" for site,blocks in bad_sources.items(): report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks)) report+="\nbad destinations "+",".join(bad_destinations.keys())+"\n" for site,blocks in bad_destinations.items(): report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks)) print report open('/afs/cern.ch/user/c/cmst2/www/unified/stuck_transfers.json','w').write( json.dumps(dict([(k,v) for (k,v) in missing_in_action.items() if k in really_stuck_dataset]), indent=2) ) open('/afs/cern.ch/user/c/cmst2/www/unified/logs/incomplete_transfers.log','w').write( report )
def transferor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() #allowed_secondary = UC.get('') print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0,max_to_handle - being_handled) allowed_to_transfer = max(0,max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer" else: print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status.startswith('considered')).all(): print "\t",wfo.name if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} ignored_input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority=None min_transfer_priority=None print "getting all wf in staging ..." stucks = json.loads(open('%s/stuck_transfers.json'%monitor_dir).read()) for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) #(lheinput,primary,parent,secondary) = wfh.getIO() #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList() for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ds_s = dss.get( prim ) if prim in stucks: sendLog('transferor', "%s appears stuck, so not counting it %s [GB]"%( prim, ds_s), wfi=wfh) ignored_input_sizes[prim] = ds_s else: input_sizes[prim] = ds_s sendLog('transferor', "%s needs %s [GB]"%( wfo.name, ds_s), wfi=wfh) if in_transfer_priority==None: in_transfer_priority = int(wfh.request['RequestPriority']) else: in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority==None: min_transfer_priority = int(wfh.request['RequestPriority']) else: min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) if min_transfer_priority==None or in_transfer_priority ==None: print "nothing is lining up for transfer" sendEmail("no request in staging","no request in staging") return pass try: print "Ignored input sizes" ignored_values = list(ignored_input_sizes.items()) ignored_values.sort( key = lambda i : i[1] ) print "\n".join( map(str, ignored_values ) ) print "Considered input sizes" considered_values = list(input_sizes.items()) considered_values.sort( key = lambda i : i[1] ) print "\n".join( map(str, considered_values) ) except Exception as e: print "trying to print the summary of input size" print str(e) print "... done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority print "transfers per sites" print json.dumps( transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) ## list the size of all inputs primary_input_per_workflow_gb = defaultdict(float) print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: ## do not count it if it appears stalled ! prim_size = dss.get( prim ) input_sizes[prim] = prim_size primary_input_per_workflow_gb[wfo.name] += prim_size print "... done" # shuffle first by name random.shuffle( wfs_and_wfh ) # Sort smallest transfers first; allows us to transfer as many as possible workflows. def prio_and_size( i, j): if int(i[1].request['RequestPriority']) == int(j[1].request['RequestPriority']): return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0)) ) else: return cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority'])) #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True) #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) )) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already ) print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer ) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print "%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False destination_cache = {} no_goes = set() max_per_round = UC.get('max_per_round').get('transferor',None) if max_per_round and not spec: wfs_and_wfh = wfs_and_wfh[:max_per_round] for (wfo,wfh) in wfs_and_wfh: print wfo.name,"to be transfered with priority",wfh.request['RequestPriority'] if wfh.request['RequestStatus']!='assignment-approved': if wfh.request['RequestStatus'] in ['aborted','rejected','rejected-archived','aborted-archived']: wfo.status = 'trouble' ## so that we look or a replacement else: wfo.status = 'away' wfh.sendLog('transferor', '%s in status %s, setting %s'%( wfo.name,wfh.request['RequestStatus'],wfo.status)) continue (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) no_budget = False if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: wfh.sendLog('transferor', "Transfer has gone over bubget.") else: wfh.sendLog('transferor', "Transfer will go over bubget.") wfh.sendLog('transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"%(this_load, sum(transfer_sizes.values()), transfer_limit)) #if sum(transfer_sizes.values()) > transfer_limit: went_over_budget = True if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over budget"%( wfh.request['RequestPriority'], in_transfer_priority)) else: if not options.go: wfh.sendLog('transferor',"%s minimum priority %s < %s : stop"%( min_transfer_priority,wfh.request['RequestPriority'],in_transfer_priority)) no_budget = True ## throtlle by campaign go no_go = False if not wfh.go(log=True) and not options.go: no_go = True no_goes.add( wfo.name ) allowed_secondary = set() for campaign in wfh.getCampaigns(): if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]: allowed_secondary.update( CI.campaigns[campaign]['secondaries'] ) if secondary: if (secondary and allowed_secondary) and (set(secondary)&allowed_secondary!=set(secondary)): wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary))) no_go = True if no_go: continue ## check if the batch is announced def check_mcm(wfn): announced=False is_real=False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: print "could not get mcm batch announcement, assuming not real" return announced,is_real if not use_mcm: announced,is_real = False,True else: if wfh.request['RequestType'] in ['ReReco']: announced,is_real = True,True else: announced,is_real = check_mcm( wfo.name ) if not announced: wfh.sendLog('transferor', "does not look announced.") if not is_real: wfh.sendLog('transferor', "does not appear to be genuine.") ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: if not options.go and not announced: wfh.sendLog('transferor', "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time)) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%( wfh.request['RequestPriority'], in_transfer_priority, max_to_handle)) else: wfh.sendLog('transferor'," Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"%( max_to_handle, being_handled, passing_along)) if not options.go: ## should not allow to jump that fence break if this_load and needs_transfer >= allowed_to_transfer: if in_transfer_priority!=None and min_transfer_priority!=None: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%(wfh.request['RequestPriority'], in_transfer_priority,max_to_transfer)) else: wfh.sendLog('transferor',"Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"%( max_to_transfer, being_transfered, needs_transfer)) if not options.go: no_budget = True if no_budget: continue ## the site white list considers site, campaign, memory and core information (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList() if options and options.tosites: sites_allowed = options.tosites.split(',') for dataset in list(primary)+list(parent)+list(secondary): ## lock everything flat NLI.lock( dataset ) if not sites_allowed: wfh.sendLog('transferor',"not possible site to run at") #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) sendLog('transferor',"%s has no possible sites to run at"%( wfo.name ),level='critical') continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## augment with run white list for dataset in primary: blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) )) if 'LumiList' in wfh.request and wfh.request['LumiList']: ## augment with the lumi white list blocks = list(set( blocks + getDatasetBlocks( dataset, lumis= wfh.request['LumiList'] ) )) if blocks: print "Reading",len(blocks),"in block whitelist" can_go = True staging=False allowed=True primary_destinations = set() if primary: copies_needed_from_CPUh,CPUh = wfh.getNCopies() if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: ## keep track of what needs what workflow_dependencies[prim].add( wfo.id ) max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) wfh.sendLog('transferor',"Would make %s from cpu requirement %s"%( copies_needed_from_CPUh, CPUh)) copies_needed = copies_needed_from_CPUh if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign, copies_needed) wfh.sendLog('transferor',"Maxed to %s by campaign configuration %s"%( copies_needed, wfh.request['Campaign'])) ### new ways of making the whole thing destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks ) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1] ## the rest is places it is going to be prim_destination = [site for site in destinations.keys() if not site in prim_location] if len(prim_location) >= copies_needed: wfh.sendLog('transferor',"The input is all fully in place at %s sites %s"%( len(prim_location), sorted(prim_location))) continue copies_needed = max(0,copies_needed - len(prim_location)) wfh.sendLog('transferor',"not counting existing copies ; now need %s"% copies_needed) copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names] latching_on_transfers = set() [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] wfh.sendLog('transferor',"Could be going to: %s"% sorted( prim_to_distribute)) if not prim_to_distribute or any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]): ## means there is openings let me go print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute] #for site in sites_allowed: # #increment accross the board, regardless of real destination: could be changed # transfers_per_sites[site] += 1 else: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: wfh.sendLog('transferor', "Higher priority sample %s >= %s go-on over transfer slots available"%(wfh.request['RequestPriority'], in_transfer_priority)) else: wfh.sendLog('transferor',"Not allowed to transfer more than %s per site at a time. Going overboard for %s"%( max_staging_per_site, sorted([site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site]))) if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == int(latching)).first() if not tfo: tfo = session.query(Transfer).filter(Transfer.phedexid == -int(latching)).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) else: tfo.phedexid = latching ## make it positive ever if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0,copies_needed - min(copies_being_made)) wfh.sendLog('transferor', "Not counting the copies being made ; then need %s"% copies_needed) if copies_needed == 0: wfh.sendLog('transferor', "The output is either fully in place or getting in full somewhere with %s"% latching_on_transfers) can_go = True continue elif len(prim_to_distribute)==0: wfh.sendLog('transferor', "We are going to need extra copies, but no destinations seems available") prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: ### hard include the tape disk andpoint ? #tapes = [site for site in getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')] chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks) spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) if not spreading: sendLog('transferor','cannot send %s to any site, it cannot fit anywhere'% prim, level='critical') wfh.sendLog('transferor', "cannot send to any site. %s cannot seem to fit anywhere"%(prim)) staging=False can_go = False else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site]=blocks else: spreading[site]=[prim] transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified can_go = False wfh.sendLog('transferor', "selected CE destinations %s"%(sorted( spreading.keys()))) for (site,items) in spreading.items(): all_transfers[site].extend( items ) transfers_per_sites[site] += 1 primary_destinations.add( site ) if not allowed: wfh.sendLog('transferor', "Not allowed to move on with") continue if secondary: override_sec_destination = [] if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]: override_sec_destination = CI.campaigns[wfh.request['Campaign']]['SecondaryLocation'] print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) if True: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec],_ = getDatasetDestinations(url, sec) ## NO SITE WHITE LIST ADDED #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) ## limit to the site whitelist NOW se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] destinations = dict([(k,v) for (k,v) in destination_cache[sec].items() if site in se_allowed]) ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9] sec_location = [site for (site,info) in destinations.items() if info['completion']>=95] sec_destination = [site for site in destinations.keys() if not site in sec_location] else: ## old style presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if override_sec_destination: ## intersect with where we want the PU to be not_needed_anymore = list(set(sec_to_distribute) - set(override_sec_destination)) #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sendLog('transferor', "the dataset %s could be removed from %s"%( sec, not_needed_anymore )) sec_to_distribute = list(set(sec_to_distribute) & set(override_sec_destination)) if len( sec_to_distribute )>0: print "secondary could go to",sorted(sec_to_distribute) sec_size = dss.get( sec ) for site in sec_to_distribute: site_se =SI.CE_to_SE(site) if (SI.disk[site_se]*1024.) > sec_size: all_transfers[site].append( sec ) can_go = False else: print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size if primary_destinations and site in primary_destinations: #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) sendLog('transferor', '%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024), level='critical') else: print "the secondary input does not have to be send to site" ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones wfh.sendLog('transferor', "latches on existing transfers, and nothing else, settin staging") wfo.status = 'staging' needs_transfer+=1 else: wfh.sendLog('transferor', "should just be assigned now to %s"%sorted(sites_allowed)) wfo.status = 'staged' passing_along+=1 wfh.sendLog('transferor', "setting status to %s"%wfo.status) session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one wfh.sendLog('transferor', "latches on existing transfers") if not options.test: wfo.status = 'staging' wfh.sendLog('transferor', "setting status to %s"%wfo.status) session.commit() wfh.sendLog('transferor',"needs a transfer") needs_transfer+=1 passing_along+=1 if no_goes: #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes )) sendLog('transferor', "No go for \n"+"\n".join( no_goes ), level='critical') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] block_datasets = list(set([it.split('#')[0] for it in blocks])) datasets = [it for it in items_to_transfer if not '#' in it] details_text = "Making a replica to %s (CE) %s (SE) for"%( site, site_se) #print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) #print "\t",len(datasets),"datasets" #print "\t",datasets details_text += '\n\t%d blocks'%len(blocks) details_text += '\n\t%d needed blocks for %s'%( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks])))) details_text += '\n\t%d datasets'% len(datasets) details_text += '\n\t%s'%sorted(datasets) items_to_transfer = blocks + datasets if execute: sendLog('transferor', details_text) else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print details_text ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: priority = 'normal' cds = [ds for ds in datasets+block_datasets if ds in max_priority] if cds and False: ## I don't think this is working. subscription should be updated on the fly and regularly for raising the priority if needed ## decide on an overall priority : that's a bit too large though if any([max_priority[ds]>=90000 for ds in cds]): priority = 'high' elif all([max_priority[ds]<80000 for ds in cds]): priority = 'low' result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority) else: result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == int(phedexid)).first() if not new_transfer: new_transfer = session.query(Transfer).filter(Transfer.phedexid == -int(phedexid)).first() print phedexid,"transfer created" if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) else: new_transfer.phedexid = phedexid ## make it positive again new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def stagor(url,specific =None, options=None): if not componentInfo().check(): return SI = siteInfo() CI = campaignInfo() UC = unifiedConfiguration() done_by_wf_id = {} done_by_input = {} completion_by_input = {} good_enough = 100.0 lost_blocks = json.loads(open('%s/lost_blocks_datasets.json'%monitor_dir).read()) lost_files = json.loads(open('%s/lost_files_datasets.json'%monitor_dir).read()) known_lost_blocks = {} known_lost_files = {} for dataset in set(lost_blocks.keys()+lost_files.keys()): b,f = findLostBlocksFiles(url, dataset) if dataset in lost_blocks and not b: print dataset,"has no really lost blocks" else: known_lost_blocks[dataset] = [i['name'] for i in b] if dataset in lost_files and not f: print dataset,"has no really lost files" else: known_lost_files[dataset] = [i['name'] for i in f] try: cached_transfer_statuses = json.loads(open('cached_transfer_statuses.json').read()) except: print "inexisting transfer statuses. starting fresh" cached_transfer_statuses = {} transfer_statuses = {} ## pop all that are now in negative values for phedexid in cached_transfer_statuses.keys(): transfers = session.query(Transfer).filter(Transfer.phedexid==int(phedexid)).all() if not transfers: print phedexid,"does not look relevant to be in cache anymore. poping" print cached_transfer_statuses.pop( phedexid ) ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging wfois = [] needs = defaultdict(list) for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all(): wfi = workflowInfo(url, wfo.name) if wfi.request['RequestStatus'] in ['running-open','running-closed','completed','assigned','acquired']: wfi.sendLog('stagor', "is in status %s"%wfi.request['RequestStatus']) wfi.status='away' session.commit() continue if not wfi.request['RequestStatus'] in ['assignment-approved']: ## should be setting 'away' too print wfo.name,"is",wfi.request['RequestStatus'] sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus'])) wfois.append( (wfo,wfi) ) _,primaries,_,secondaries = wfi.getIO() for dataset in list(primaries)+list(secondaries): needs[wfo.name].append( dataset) done_by_input[dataset] = {} completion_by_input[dataset] = {} wfi.sendLog('stagor', '%s needs %s'%( wfo.name, dataset)) open('%s/dataset_requirements.json'%monitor_dir,'w').write( json.dumps( needs, indent=2)) dataset_endpoints = defaultdict(set) endpoint_in_downtime = defaultdict(set) #endpoint_completed = defaultdict(set) endpoint_incompleted = defaultdict(set) #endpoint = defaultdict(set) send_back_to_considered = set() ## phedexid are set negative when not relevant anymore # probably there is a db schema that would allow much faster and simpler query for transfer in session.query(Transfer).filter(Transfer.phedexid>0).all(): if specific and str(transfer.phedexid)!=str(specific): continue skip=True for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf: if tr_wf.status == 'staging': sendLog('stagor',"\t%s is staging for %s"%(transfer.phedexid, tr_wf.name)) skip=False if skip: sendLog('stagor',"setting %s to negative value"%transfer.phedexid) transfer.phedexid = -transfer.phedexid session.commit() continue if transfer.phedexid<0: continue ## check the status of transfers checks = checkTransferApproval(url, transfer.phedexid) approved = all(checks.values()) if not approved: sendLog('stagor', "%s is not yet approved"%transfer.phedexid) approveSubscription(url, transfer.phedexid) continue ## check on transfer completion if str(transfer.phedexid) in cached_transfer_statuses: ### use a cache for transfer that already looked done sendLog('stagor',"read %s from cache"%transfer.phedexid) checks = cached_transfer_statuses[str(transfer.phedexid)] else: checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True) ## just write this out transfer_statuses[str(transfer.phedexid)] = copy.deepcopy(checks) if not specific: for dsname in checks: if not dsname in done_by_input: done_by_input[dsname]={} if not dsname in completion_by_input: completion_by_input[dsname] = {} done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values())) completion_by_input[dsname][transfer.phedexid]=checks[dsname].values() if checks: sendLog('stagor',"Checks for %s are %s"%( transfer.phedexid, [node.values() for node in checks.values()])) done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()])))) else: ## it is empty, is that a sign that all is done and away ? print "ERROR with the scubscriptions API of ",transfer.phedexid print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists" done = False ## the thing above is NOT giving the right number #done = False for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf:# and tr_wf.status == 'staging': if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={} done_by_wf_id[tr_wf.id][transfer.phedexid]=done ## for those that are in staging, and the destination site is in drain #if not done and tr_wf.status == 'staging': for ds in checks: for s,v in checks[ds].items(): dataset_endpoints[ds].add( s ) if done: ## transfer.status = 'done' sendLog('stagor',"%s is done"%transfer.phedexid) cached_transfer_statuses[str(transfer.phedexid)] = copy.deepcopy(checks) else: sendLog('stagor',"%s is not finished %s"%(transfer.phedexid, pprint.pformat( checks ))) pprint.pprint( checks ) ## check if the destination is in down-time for ds in checks: sites_incomplete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v<good_enough] sites_incomplete_down = [s for s in sites_incomplete if not s in SI.sites_ready] if sites_incomplete_down: sendLog('stagor',"%s are in downtime, while waiting for %s to get there"%( ",".join(sites_incomplete_down), ds)) #sites_complete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v>=good_enough] #endpoint[ds].update( sites_complete ) #endpoint[ds].update( sites_incomplete ) #endpoint_completed[ds].update( sites_complete ) endpoint_incompleted[ds].update( sites_incomplete ) endpoint_in_downtime[ds].update( sites_incomplete_down ) print "End point in down time" for k in endpoint_in_downtime: endpoint_in_downtime[k] = list(endpoint_in_downtime[k]) for k in dataset_endpoints: dataset_endpoints[k] = list(dataset_endpoints[k]) print json.dumps( endpoint_in_downtime , indent=2) open('cached_transfer_statuses.json','w').write( json.dumps( cached_transfer_statuses, indent=2)) open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2)) open('%s/dataset_endpoints.json'%monitor_dir,'w').write( json.dumps(dataset_endpoints, indent=2)) already_stuck = json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() ) missing_in_action = defaultdict(list) print "-"*10,"Checking on workflows in staging","-"*10 #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM'] #for what in forget_about: # if not done_by_input[what]: # done_by_input[what] = {'fake':True} ## come back to workflows and check if they can go available_cache = defaultdict(lambda : defaultdict(float)) presence_cache = defaultdict(dict) for wfo,wfi in wfois: print "#"*30 ## the site white list takes site, campaign, memory and core information (_,primaries,_,secondaries,sites_allowed) = wfi.getSiteWhiteList(verbose=False) se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] se_allowed.sort() se_allowed_key = ','.join(se_allowed) readys={} for need in list(primaries)+list(secondaries): if not need in done_by_input: wfi.sendLog('stagor',"missing transfer report for %s"%need) readys[need] = False ## should warn someone about this !!! ## it cannot happen, by construction sendEmail('missing transfer report','%s does not have a transfer report'%(need)) continue if not done_by_input[need] and need in list(secondaries): wfi.sendLog('stagor',"assuming it is OK for secondary %s to have no attached transfers"% need) readys[need] = True done_by_input[need] = { "fake" : True } continue if len(done_by_input[need]) and all(done_by_input[need].values()): wfi.sendLog('stagor',"%s is ready"%need) print json.dumps( done_by_input[need] , indent=2) readys[need] = True else: wfi.sendLog('stagor',"%s is not ready"%need) print json.dumps( done_by_input[need] , indent=2) readys[need] = False if readys and all(readys.values()): if wfo.status == 'staging': wfi.sendLog('stagor',"all needs are fullfilled, setting staged") wfo.status = 'staged' session.commit() else: wfi.sendLog('stagor',"all needs are fullfilled, already") print json.dumps( readys, indent=2 ) else: wfi.sendLog('stagor',"missing requirements") copies_needed,_ = wfi.getNCopies() jump_ahead = False re_transfer = False ## there is missing input let's do something more elaborated for need in list(primaries):#+list(secondaries): if endpoint_in_downtime[need] == endpoint_incompleted[need]: #print need,"is going to an end point in downtime" wfi.sendLog('stagor',"%s has only incomplete endpoint in downtime"%need) re_transfer=True if not se_allowed_key in available_cache[need]: available_cache[need][se_allowed_key] = getDatasetBlocksFraction( url , need, sites=se_allowed ) if available_cache[need][se_allowed_key] >= copies_needed: wfi.sendLog('stagor',"assuming it is OK to move on like this already for %s"%need) jump_ahead = True ## compute a time since staging to filter jump starting ? # check whether the inputs is already in the stuck list ... for need in list(primaries)+list(secondaries): if need in already_stuck: wfi.sendLog('stagor',"%s is stuck, so try to jump ahead"%need) jump_ahead = True if jump_ahead or re_transfer: details_text = "checking on availability for %s to jump ahead"%wfo.name details_text += '\n%s wants %s copies'%(wfo.name,copies_needed) copies_needed = max(1,copies_needed-1) details_text += '\nlowering by one unit to %s'%copies_needed wfi.sendLog('stagor', details_text) all_check = True prim_where = set() for need in list(primaries): if not se_allowed_key in presence_cache[need]: presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed) presence = presence_cache[need][se_allowed_key] prim_where.update( presence.keys() ) available = available_cache[need][se_allowed_key] this_check = (available >= copies_needed) wfi.sendLog('stagor', "%s is available %s times %s"%( need, available, this_check)) all_check &= this_check if not all_check: break for need in list(secondaries): ## I do not want to check on the secon this_check = all(done_by_input[need].values()) wfi.sendLog('stagor',"%s is all transfered %s"%(need, json.dumps(done_by_input[need], indent=2))) all_check&= this_check #if not se_allowed_key in presence_cache[need]: # presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed) ## restrict to where the primary is #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where]) #this_check = all([there for (there,frac) in presence.values()]) #print need,"is present at all sites:",this_check #all_check&= this_check if all_check: wfi.sendLog('stagor',"needs are sufficiently fullfilled, setting staged") wfo.status = 'staged' session.commit() else: print wfo.name,"has to wait a bit more" wfi.sendLog('stagor',"needs to wait a bit more") else: wfi.sendLog('stagor',"not checking availability") if re_transfer: wfi.sendLog('stagor',"Sending back to considered because of endpoint in downtime") if wfo.status == 'staging': wfo.status = 'considered' session.commit() send_back_to_considered.add( wfo.name ) if send_back_to_considered: #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered))) sendLog('stagor', "sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)), level='critical') print "-"*10,"Checking on non-available datasets","-"*10 ## now check on those that are not fully available for dsname in available_cache.keys(): ## squash the se_allowed_key key available_cache[dsname] = min( available_cache[dsname].values() ) for dsname,available in available_cache.items(): using_its = getWorkflowByInput(url, dsname) #print using_its using_wfos = [] for using_it in using_its: wf = session.query(Workflow).filter(Workflow.name == using_it).first() if wf: using_wfos.append( wf ) if not len(done_by_input[dsname]): print "For dataset",dsname,"there are no transfer report. That's an issue." for wf in using_wfos: if wf.status == 'staging': if UC.get("stagor_sends_back"): print "sending",wf.name,"back to considered" wf.status = 'considered' session.commit() #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name) sendLog('stagor', "%s was send back and might be trouble"% wf.name, level='critical') else: print "would send",wf.name,"back to considered" #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name) sendLog('stagor', "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name, level='critical') continue ## not compatible with checking on secondary availability #if all([wf.status != 'staging' for wf in using_wfos]): # ## means despite all checks that input is not needed # continue if available < 1.: print "incomplete",dsname ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only lost_blocks,lost_files = findLostBlocksFiles( url, dsname ) lost_block_names = [item['name'] for item in lost_blocks] lost_file_names = [item['name'] for item in lost_files] if lost_blocks: #print json.dumps( lost , indent=2 ) ## estimate for how much ! fraction_loss,_,n_missing = getDatasetBlockFraction(dsname, lost_block_names) print "We have lost",len(lost_block_names),"blocks",lost_block_names,"for %f%%"%(100.*fraction_loss) if fraction_loss > 0.05: ## 95% completion mark #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss)) sendLog('stagor', '%s is missing %d blocks, for %d events, %3.2f %% loss'%(dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='warning') ## the workflow should be rejected ! for wf in using_wfos: if wf.status == 'staging': print wf.name,"is doomed. setting to trouble" wf.status = 'trouble' session.commit() #sendEmail('doomed workflow','%s has too much loss on the input dataset %s. please check on stagor logs https://cmst2.web.cern.ch/cmst2/unified/logs/stagor/last.log'%(wf.name, dsname)) sendLog('stagor', '%s has too much loss on the input dataset %s. Missing %d blocks, for %d events, %3.2f %% loss'%(wf.name, dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='critical') else: ## probably enough to make a ggus and remove if not dsname in known_lost_blocks: #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) )) sendLog('stagor', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ), level='critical') known_lost_blocks[dsname] = [i['name'] for i in lost_blocks] if lost_files: fraction_loss,_,n_missing = getDatasetFileFraction(dsname, lost_file_names) print "We have lost",len(lost_file_names),"files",lost_file_names,"for %f%%"%fraction_loss if fraction_loss > 0.05: #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss)) sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss), level='critical') for wf in using_wfos: if wf.status == 'staging': print wf.name,"is doomed. setting to trouble" wf.status = 'trouble' session.commit() else: ## probably enough to make a ggus and remove if not dsname in known_lost_files: #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names))) sendLog('stagor', '%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)), level='critical') known_lost_files[dsname] = [i['name'] for i in lost_files] ## should the status be change to held-staging and pending on a ticket missings = [pid for (pid,d) in done_by_input[dsname].items() if d==False] print "\t",done_by_input[dsname] print "\tneeds",len(done_by_input[dsname]) print "\tgot",done_by_input[dsname].values().count(True) print "\tmissing",missings missing_in_action[dsname].extend( missings ) rr= open('%s/lost_blocks_datasets.json'%monitor_dir,'w') rr.write( json.dumps( known_lost_blocks, indent=2)) rr.close() rr= open('%s/lost_files_datasets.json'%monitor_dir,'w') rr.write( json.dumps( known_lost_files, indent=2)) rr.close() open('%s/incomplete_transfers.json'%monitor_dir,'w').write( json.dumps(missing_in_action, indent=2) ) print "Stuck transfers and datasets" print json.dumps( missing_in_action, indent=2 ) print "Going further and make a report of stuck transfers" datasets_by_phid = defaultdict(set) for dataset in missing_in_action: for phid in missing_in_action[dataset]: #print dataset,"stuck through",phid datasets_by_phid[phid].add( dataset ) bad_destinations = defaultdict(set) bad_sources = defaultdict(set) report = "" really_stuck_dataset = set() transfer_timeout = UC.get("transfer_timeout") transfer_lowrate = UC.get("transfer_lowrate") for phid,datasets in datasets_by_phid.items(): issues = checkTransferLag( url, phid , datasets=list(datasets) ) for dataset in issues: for block in issues[dataset]: for destination in issues[dataset][block]: (block_size,destination_size,delay,rate,dones) = issues[dataset][block][destination] ## count x_Buffer and x_MSS as one source redones=[] for d in dones: if d.endswith('Buffer') or d.endswith('Export'): if d.replace('Buffer','MSS').replace('Export','MSS') in dones: continue else: redones.append( d ) else: redones.append( d ) dones = list(set( redones )) #dones = filter(lambda s : (s.endswith('Buffer') and not s.replace('Buffer','MSS') in dones) or (not s.endswith('Buffer')) , dones) if delay>transfer_timeout and rate<transfer_lowrate: if len(dones)>1: ## its the destination that sucks bad_destinations[destination].add( block ) else: dum=[bad_sources[d].add( block ) for d in dones] really_stuck_dataset.add( dataset ) print "add",dataset,"to really stuck" report += "%s is not getting to %s, out of %s faster than %f [GB/s] since %f [d]\n"%(block,destination,", ".join(dones), rate, delay) print "\n"*2 ## create tickets right away ? report+="\nbad sources "+",".join(bad_sources.keys())+"\n" for site,blocks in bad_sources.items(): report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks)) report+="\nbad destinations "+",".join(bad_destinations.keys())+"\n" for site,blocks in bad_destinations.items(): report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks)) print '\n'*2,"Datasets really stuck" print '\n'.join( really_stuck_dataset ) print '\n'*2,"report written at https://cmst2.web.cern.ch/cmst2/unified/logs/incomplete_transfers.log" print report stuck_transfers = dict([(k,v) for (k,v) in missing_in_action.items() if k in really_stuck_dataset]) print '\n'*2,'Stuck dataset transfers' print json.dumps(stuck_transfers , indent=2) open('%s/stuck_transfers.json'%monitor_dir,'w').write( json.dumps(stuck_transfers , indent=2) ) open('%s/logs/incomplete_transfers.log'%monitor_dir,'w').write( report )
def outcleanor(url, options): if options.approve: for user in ['*Vlimant']:#,'*Cremonesi']: deletes = listDelete( url , user = user) for (site,who,tid) in deletes: if 'MSS' in site: continue### ever print site,who,tid print "approving deletion" print approveSubscription(url, tid, nodes = [site], comments = 'Production cleaning by data ops') return sites_and_datasets = defaultdict(list) our_copies = defaultdict(list) wf_cleaned = {} wfs = [] for fetch in options.fetch.split(','): wfs.extend(session.query(Workflow).filter(Workflow.status==fetch).all()) random.shuffle( wfs ) last_answer = None for wfo in wfs : if options.number and len(wf_cleaned)>= options.number: print "Reached",options.number,"cleaned" break print '-'*100 wfi = workflowInfo(url, wfo.name) goes = {} # boolean per output for dataset in wfi.request['OutputDatasets']: goes[dataset] = False keep_one_out = True status = getDatasetStatus( dataset ) print "\n\tLooking at",dataset,status,"\n" vetoes = None if status == 'INVALID': vetoes = ['Export','Buffer'] ## can take themselves out keep_one_out = False # just wipe clean elif status == None: print dataset,"actually does not exist. skip" goes[dataset] = True continue elif status in ['PRODUCTION','VALID'] and wfo.status in ['forget','trouble']: print dataset,"should probably be invalidated. (",wfo.status,") skip" keep_one_out = False # just wipe clean continue ## you are not sure. just skip it for the time being elif status == 'PRODUCTION' and wfo.status in ['clean']: print dataset,"should probably be set valid .skip" continue ## you are not sure. just skip it for the time being if status == 'VALID' and dataset.startswith('/MinBias'): print "This is a /MinBias. skip" continue if '/DQM' in dataset: keep_one_out = False total_size = getDatasetSize( dataset ) our_presence = getDatasetPresence(url, dataset, complete=None, group="DataOps", vetoes=vetoes) also_our_presence = getDatasetPresence(url, dataset, complete=None, group="", vetoes=vetoes) ## merge in one unique dict for site in also_our_presence: if site in our_presence: there,frac = our_presence[site] other,ofrac = also_our_presence[site] our_presence[site] = (max(there,other),max(frac,ofrac)) else: our_presence[site] = also_our_presence[site] if our_presence: print our_presence ## analysis ops copies need to be taken into account anaops_presence = getDatasetPresence(url, dataset, complete=None, group="AnalysisOps") own_by_anaops = anaops_presence.keys() ## all our copies to_be_cleaned = our_presence.keys() if not len(to_be_cleaned): print "nowhere to be found of ours,",len(own_by_anaops),"in analysi ops pool" goes[dataset] = True continue print "Where we own bits of dataset" print to_be_cleaned if len(own_by_anaops): ## remove site with the anaops copies to_be_cleaned = list(set(to_be_cleaned) - set(own_by_anaops)) keep_one_out = False ## in that case, just remove our copies print "Own by anaops (therefore not keep a copy of ours)" print own_by_anaops else: ## we should not be looking at anything that was not passed to DDM, otherwise we'll be cutting the grass under our feet using_the_same = getWorkflowByInput(url, dataset, details=True) conflict = False for other in using_the_same: if other['RequestName'] == wfo.name: continue if other['RequestType'] == 'Resubmission': continue if not other['RequestStatus'] in ['announced','normal-archived','aborted','rejected','aborted-archived','rejected-archived','closed-out','None',None]: print other['RequestName'],'is in status',other['RequestStatus'],'preventing from cleaning',dataset conflict=True break if conflict: continue ## not being used. a bit less dangerous to clean-out ## keep one full copy out there full_copies = [site for (site,(there,fract)) in our_presence.items() if there] if keep_one_out: if not len(full_copies): print "we do not own a full copy of",dataset,status,wfo.status,".skip" continue stay_there = random.choice( full_copies ) #at a place own by ops print "Where we keep a full copy", stay_there to_be_cleaned.remove( stay_there ) our_copies[stay_there].append( dataset ) else: print "We do not want to keep a copy of ",dataset,status,wfo.status if len(to_be_cleaned): print "Where we can clean" print to_be_cleaned for site in to_be_cleaned: sites_and_datasets[site].append( (dataset, total_size*our_presence[site][1]/100., status) ) goes[dataset] = True else: print "no cleaning to be done" goes[dataset] = True print wfo.name,"scrutinized" if all(goes.values()): print "\t",wfo.name,"can toggle -out" def ask(): global last_answer last_answer = raw_input('go on ?') return last_answer if options.auto or ask() in ['y','']: if all(goes.values()): wfo.status = wfo.status+'-out' wf_cleaned[wfo.name] = wfo.status continue elif last_answer in ['q','n']: break else: return if options.auto: pass elif last_answer in ['q']: return print "Potential cleanups" for (site,items) in sites_and_datasets.items(): cleanup = sum([size for (_,size,_) in items]) print "\n\t potential cleanup of","%8.4f"%cleanup,"GB at ",site print "\n".join([ds+" "+st for ds,_,st in items]) datasets = [ ds for ds,_,st in items] print "Copies and bits we are going to delete" print json.dumps( sites_and_datasets, indent=2) print "Copies we are keeping" print json.dumps( our_copies, indent=2 ) print "Workflows cleaned for output" print json.dumps( wf_cleaned, indent=2 ) stamp = time.strftime("%Y%m%d%H%M%S", time.localtime()) open('outcleaning_%s.json'%stamp,'w').write( json.dumps( sites_and_datasets, indent=2)) open('keepcopies_%s.json'%stamp,'w').write( json.dumps( our_copies, indent=2)) open('wfcleanout_%s.json'%stamp,'w').write( json.dumps( wf_cleaned, indent=2)) if (not options.test) and (options.auto or raw_input("Satisfied ? (y will trigger status change and deletion requests)") in ['y']): for (site,items) in sites_and_datasets.items(): datasets = [ ds for ds,_,st in items] print "making deletion to",site result = makeDeleteRequest(url, site, datasets, "Cleanup output after production. DataOps will take care of approving it.") print result ## approve it right away ? if 'MSS' in site: continue if 'Export' in site: continue if 'Buffer' in site: continue for did in [item['id'] for item in result['phedex']['request_created']]: print "auto-approve disabled, but ready" #approveSubscription(url, did, nodes = [site], comments = 'Auto-approving production cleaning deletion') pass session.commit() else: print "Not making the deletion and changing statuses"
def stagor(url,specific =None, good_enough = 99.9): done_by_wf_id = {} done_by_input = {} for transfer in session.query(Transfer).all(): if specific and str(transfer.phedexid)!=str(specific): continue skip=True for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf: if tr_wf.status == 'staging': skip=False break if skip: continue if transfer.phedexid<0: continue ## check the status of transfers checks = checkTransferApproval(url, transfer.phedexid) approved = all(checks.values()) if not approved: print transfer.phedexid,"is not yet approved" approveSubscription(url, transfer.phedexid) continue ## check on transfer completion checks = checkTransferStatus(url, transfer.phedexid) if not specific: for dsname in checks: if not dsname in done_by_input: done_by_input[dsname]={} done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>good_enough, checks[dsname].values())) if checks: done = all(map(lambda i:i>good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()])))) else: ## it is empty, is that a sign that all is done and away ? print "ERROR with the scubscriptions API" done = False ## the thing above is NOT giving the right number #done = False for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf:# and tr_wf.status == 'staging': if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={} done_by_wf_id[tr_wf.id][transfer.phedexid]=done if done: ## transfer.status = 'done' print transfer.phedexid,"is done" else: print transfer.phedexid,"not finished" pprint.pprint( checks ) #print done_by_input print "\n----\n" for dsname in done_by_input: if all(done_by_input[dsname].values()): print dsname,"is everywhere we wanted" ## the input dataset is fully transfered, should consider setting the corresponding wf to staged using_its = getWorkflowByInput(url, dsname) #print using_its for using_it in using_its: wf = session.query(Workflow).filter(Workflow.name == using_it).first() if wf and wf.status == 'staging': print wf.name,"is with us. setting staged and move on" wf.status = 'staged' session.commit() else: print dsname,done_by_input[dsname] for wfid in done_by_wf_id: #print done_by_wf_id[wfid].values() ## ask that all related transfer get into a valid state if all(done_by_wf_id[wfid].values()): pass