def stagor(url,specific =None, options=None): if not componentInfo().check(): return SI = siteInfo() CI = campaignInfo() UC = unifiedConfiguration() done_by_wf_id = {} done_by_input = {} completion_by_input = {} good_enough = 100.0 lost = json.loads(open('lost_blocks_datasets.json').read()) still_lost = [] for dataset in lost: l = findLostBlocks(url ,dataset) if not l: print dataset,"is not really lost" else: still_lost.append( dataset ) open('lost_blocks_datasets.json','w').write( json.dumps( still_lost, indent=2) ) if options.fast: print "doing the fast check of staged with threshold:",options.goodavailability for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all(): if specific and not specific in wfo.name: continue wfi = workflowInfo(url, wfo.name) sites_allowed = getSiteWhiteList( wfi.getIO() ) if 'SiteWhitelist' in CI.parameters(wfi.request['Campaign']): sites_allowed = CI.parameters(wfi.request['Campaign'])['SiteWhitelist'] if 'SiteBlacklist' in CI.parameters(wfi.request['Campaign']): sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfi.request['Campaign'])['SiteBlacklist'])) _,primaries,_,secondaries = wfi.getIO() se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] all_check = True for dataset in list(primaries):#+list(secondaries) ? 
#print se_allowed available = getDatasetBlocksFraction( url , dataset , sites=se_allowed ) all_check &= (available >= options.goodavailability) if not all_check: break if all_check: print "\t\t",wfo.name,"can go staged" wfo.status = 'staged' session.commit() else: print "\t",wfo.name,"can wait a bit more" return for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all(): wfi = workflowInfo(url, wfo.name) _,primaries,_,secondaries = wfi.getIO() for dataset in list(primaries)+list(secondaries): done_by_input[dataset] = {} completion_by_input[dataset] = {} print wfo.name,"needs",dataset for transfer in session.query(Transfer).all(): if specific and str(transfer.phedexid)!=str(specific): continue skip=True for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf: if tr_wf.status == 'staging': print "\t",transfer.phedexid,"is staging for",tr_wf.name skip=False if skip: continue if transfer.phedexid<0: continue ## check the status of transfers checks = checkTransferApproval(url, transfer.phedexid) approved = all(checks.values()) if not approved: print transfer.phedexid,"is not yet approved" approveSubscription(url, transfer.phedexid) continue ## check on transfer completion checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True) if not specific: for dsname in checks: if not dsname in done_by_input: done_by_input[dsname]={} if not dsname in completion_by_input: completion_by_input[dsname] = {} done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values())) completion_by_input[dsname][transfer.phedexid]=checks[dsname].values() if checks: print "Checks for",transfer.phedexid,[node.values() for node in checks.values()] done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()])))) else: ## it is empty, is that a sign that all is done and away ? 
print "ERROR with the scubscriptions API of ",transfer.phedexid print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists" done = False ## the thing above is NOT giving the right number #done = False for wfid in transfer.workflows_id: tr_wf = session.query(Workflow).get(wfid) if tr_wf:# and tr_wf.status == 'staging': if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={} done_by_wf_id[tr_wf.id][transfer.phedexid]=done if done: ## transfer.status = 'done' print transfer.phedexid,"is done" else: print transfer.phedexid,"not finished" pprint.pprint( checks ) #print done_by_input print "\n----\n" for dsname in done_by_input: fractions = None if dsname in completion_by_input: fractions = itertools.chain.from_iterable([check.values() for check in completion_by_input.values()]) ## the workflows in the waiting room for the dataset using_its = getWorkflowByInput(url, dsname) #print using_its using_wfos = [] for using_it in using_its: wf = session.query(Workflow).filter(Workflow.name == using_it).first() if wf: using_wfos.append( wf ) if not len(done_by_input[dsname]): print "For dataset",dsname,"there are no transfer report. That's an issue." for wf in using_wfos: if wf.status == 'staging': if UC.get("stagor_sends_back"): print "sending",wf.name,"back to considered" wf.status = 'considered' session.commit() sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name) else: print "would send",wf.name,"back to considered" sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name) continue #need_sites = int(len(done_by_input[dsname].values())*0.7)+1 need_sites = len(done_by_input[dsname].values()) #if need_sites > 10: need_sites = int(need_sites/2.) 
got = done_by_input[dsname].values().count(True) if all([wf.status != 'staging' for wf in using_wfos]): ## not a single ds-using wf is in staging => moved on already ## just forget about it print "presence of",dsname,"does not matter anymore" print "\t",done_by_input[dsname] print "\t",[wf.status for wf in using_wfos] print "\tneeds",need_sites continue #?? ## should the need_sites reduces with time ? # with dataset choping, reducing that number might work as a block black-list. if len(done_by_input[dsname].values()) and all(done_by_input[dsname].values()): print dsname,"is everywhere we wanted" ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is with us. setting staged and move on" wf.status = 'staged' session.commit() elif fractions and len(list(fractions))>1 and set(fractions)==1: print dsname,"is everywhere at the same fraction" print "We do not want this in the end. we want the data we asked for" continue ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is with us everywhere the same. setting staged and move on" wf.status = 'staged' session.commit() elif got >= need_sites: print dsname,"is almost everywhere we wanted" #print "We do not want this in the end. we want the data we asked for" #continue ## the input dataset is fully transfered, should consider setting the corresponding wf to staged for wf in using_wfos: if wf.status == 'staging': print wf.name,"is almost with us. 
setting staged and move on" wf.status = 'staged' session.commit() else: print "incomplete",dsname lost = findLostBlocks(url, dsname) try: known_lost = json.loads(open('lost_blocks_datasets.json').read()) except: print "enable to get the known_lost from local json file" known_lost = [] if lost and not dsname in known_lost: lost_names = [item['name'] for item in lost] ## make a deeper investigation of the block location to see whether it's really no-where no-where print "We have lost",len(lost),"blocks",lost_names #print json.dumps( lost , indent=2 ) sendEmail('we have lost a few blocks', str(len(lost))+" in total.\nDetails \n:"+json.dumps( lost , indent=2 )) known_lost.append(dsname) rr= open('lost_blocks_datasets.json','w') rr.write( json.dumps( known_lost, indent=2)) rr.close() ## should the status be change to held-staging and pending on a ticket print "\t",done_by_input[dsname] print "\tneeds",need_sites print "\tgot",got for wfid in done_by_wf_id: #print done_by_wf_id[wfid].values() ## ask that all related transfer get into a valid state if all(done_by_wf_id[wfid].values()): pass
def assignor(url, specific=None, talk=True, options=None):
    """Assign 'staged' workflows through reqMgrClient.assignWorkflow.

    For each selected workflow the site white list is reduced according to
    where the secondary and primary inputs are present, the assignment
    parameters are assembled (defaults, then command line options, then
    campaign parameters) and the workflow is assigned. When not in test mode
    and the assignment call returns a truthy result, the workflow status is
    set to 'away' and committed.

    url      -- passed through to all helper queries.
    specific -- exact workflow name, or comma separated substrings of
                workflow names; when nothing matches exactly, 'considered',
                'staging' and 'staged' workflows are filtered with it.
    talk     -- when true, print the presence of each primary input.
    options  -- object read for ``go``, ``test``, ``restrict``, ``team``,
                ``ProcessingVersion`` and every name listed in
                ``reqMgrClient.assignWorkflow.keys``.
                NOTE(review): several accesses are not guarded against
                ``options`` being None although that is the default.
    """
    # returns right away when userLock('assignor') is truthy
    if userLock('assignor'): return
    CI = campaignInfo()
    SI = siteInfo()
    wfos = []
    if specific:
        wfos = session.query(Workflow).filter(Workflow.name == specific).all()
    if not wfos:
        # no exact match: widen the selection; without 'specific' only the
        # 'staged' workflows are taken
        if specific:
            wfos = session.query(Workflow).filter(
                Workflow.status == 'considered').all()
            wfos.extend(
                session.query(Workflow).filter(
                    Workflow.status == 'staging').all())
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == 'staged').all())
    for wfo in wfos:
        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print wfo.name, "to be assigned"
        wfh = workflowInfo(url, wfo.name)

        ## check if by configuration we gave it a GO
        if not CI.go(wfh.request['Campaign']) and not options.go:
            print "No go for", wfh.request['Campaign']
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            print wfo.name, wfh.request['RequestStatus'], "skipping"
            if not options.test:
                continue

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                print "cannot decide on version number"
                continue

        (lheinput, primary, parent, secondary) = wfh.getIO()
        sites_allowed = getSiteWhiteList(
            (lheinput, primary, parent, secondary))
        print "Allowed", sites_allowed
        sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
        # the custodial list is always empty here, so only the first of the
        # two checks below can trigger
        sites_custodial = []
        if len(sites_custodial) == 0:
            print "No custodial, it's fine, it's covered in close-out"
        if len(sites_custodial) > 1:
            print "more than one custodial for", wfo.name
            sys.exit(36)

        secondary_locations = None
        for sec in list(secondary):
            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>90.]
            # presence values are unpacked as (there, frac) pairs; only the
            # 'there' flag is used for the secondary
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items() if there
            ]
            # running intersection over all secondaries
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            sites_allowed = [
                site for site in sites_allowed
                if any([
                    osite.startswith(site)
                    for osite in one_secondary_locations
                ])
            ]

        # three views of the white list, each narrowed per primary below
        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        for prim in list(primary):
            presence = getDatasetPresence(url, prim)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed])
            # NOTE(review): filtered from sites_with_data (not sites_all_data)
            # and not read again in this function
            sites_all_data = [
                site for site in sites_with_data
                if any([
                    osite.startswith(site)
                    for osite in [
                        psite for (psite, (there, frac)) in presence.items()
                        if there
                    ]
                ])
            ]
            # sites whose presence fraction (second element) is above 90
            sites_with_data = [
                site for site in sites_with_data
                if any([
                    osite.startswith(site)
                    for osite in [
                        psite for (psite, frac) in presence.items()
                        if frac[1] > 90.
                    ]
                ])
            ]
            # sites listed in the presence at all
            sites_with_any_data = [
                site for site in sites_with_any_data
                if any([osite.startswith(site) for osite in presence.keys()])
            ]
            # running intersection over all primaries
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        ## opportunistic running where any piece of data is available
        if secondary_locations and primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            opportunistic_sites = [
                SI.SE_to_CE(site)
                for site in list((set(secondary_locations) & set(
                    primary_locations)) - set(sites_allowed))
            ]
            print "We could be running at", opportunistic_sites, "in addition"

        # every primary needs a fraction of at least 1., unless test or go
        if available_fractions and not all(
            [available >= 1. for available in available_fractions.values()]):
            print "The input dataset is not located in full at any site"
            print json.dumps(available_fractions)
            if not options.test and not options.go:
                continue  ## skip skip skip

        # and a fraction of at least copies_wanted, unless go
        copies_wanted = 2.
        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            print "The input dataset is not available", copies_wanted, "times, only", available_fractions.values(
            )
            if not options.go:
                continue

        ## default back to white list to original white list with any data
        print "Allowed", sites_allowed
        sites_allowed = sites_with_any_data
        print "Selected for any data", sites_allowed

        if options.restrict:
            # NOTE(review): same assignment as just above, so 'restrict' does
            # not narrow the list any further here
            print "Allowed", sites_allowed
            sites_allowed = sites_with_any_data
            print "Selected", sites_allowed
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Sites with 90% data not matching site white list (block choping!)"
                print "Resorting to AAA reading for", list(
                    set(sites_allowed) - set(sites_with_data)), "?"
                print "Whitelist site with any data", list(
                    set(sites_allowed) - set(sites_with_any_data))
                #options.useSiteListAsLocation = True
                #print "Not commissioned yet"
                #continue
            #print "We could be running at",opportunistic_sites,"in addition"
            ##sites_allowed = list(set(sites_allowed+ opportunistic_sites))

        if not len(sites_allowed):
            print wfo.name, "cannot be assign with no matched sites"
            continue

        parameters = {
            'SiteWhitelist': sites_allowed,
            'CustodialSites': sites_custodial,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': '/store/mc',  ## to be figured out ! from Hi shit
            'ProcessingVersion': version,
        }

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v != None:
                    # comma separated values become a list without empty items
                    if ',' in v:
                        parameters[key] = filter(None, v.split(','))
                    else:
                        parameters[key] = v

        ## pick up campaign specific assignment parameters
        parameters.update(CI.parameters(wfh.request['Campaign']))

        # the 'execute' key is only set outside of test mode
        if not options.test:
            parameters['execute'] = True

        if not wfh.checkWorkflowSplitting():
            ## needs to go to event based ? fail for now
            print "Falling back to event splitting ?"
            #parameters['SplittingAlgorithm'] = 'EventBased'
            continue

        ## plain assignment here
        team = 'production'
        if options and options.team:
            team = options.team
        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
            else:
                print "ERROR could not assign", wfo.name
        else:
            pass
def assignor(url ,specific = None, talk=True, options=None): CI = campaignInfo() SI = siteInfo() wfos=[] if specific: wfos = session.query(Workflow).filter(Workflow.name==specific).all() if not wfos: if specific: wfos = session.query(Workflow).filter(Workflow.status=='considered').all() wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) for wfo in wfos: if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print wfo.name,"to be assigned" wfh = workflowInfo( url, wfo.name) #wl = getWorkLoad(url, wfo.name ) if not CI.go( wfh.request['Campaign'] ): print "No go for",wfh.request['Campaign'] continue injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: print "It is too soon to inject: %3.2fH remaining"%(now - injection_time) if not options.test: continue #grace_period = 4 #days #if float(now - injection_time) > grace_period*24.: # print "it has been",grace_period,"need to do something" # options.restrict = True #else: # print now,injection_time,now - injection_time #print wl if wfh.request['RequestStatus'] !='assignment-approved': print wfo.name,wfh.request['RequestStatus'],"skipping" if not options.test: continue version=wfh.getNextVersion() (lheinput,primary,parent,secondary) = wfh.getIO() sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) sites_custodial = list(set(itertools.chain.from_iterable([findCustodialLocation(url, prim) for prim in primary]))) sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] if len(sites_custodial)==0: sites_custodial = [SI.pick_SE()] print "picked",sites_custodial," as custodial for",wfo.name if len(sites_custodial)>1: print "more than one custodial for",wfo.name sys.exit(36) 
sites_with_data = copy.deepcopy( sites_allowed ) for prim in list(primary)+list(secondary): presence = getDatasetPresence( url, prim ) if talk: print prim,presence sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_with_data = list(set(sites_with_data)) if options.restrict: if talk: print sites_allowed sites_allowed = sites_with_data else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)) #options.useSiteListAsLocation = True print "Not commissioned yet" continue if not len(sites_allowed): print wfo.name,"cannot be assign with no matched sites" continue parameters={ 'SiteWhitelist' : sites_allowed, 'CustodialSites' : sites_custodial, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out+sites_custodial)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : '/store/mc', ## to be figured out ! from Hi shit 'ProcessingVersion' : version, } ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v ## take care of a few exceptions if (wfh.request['Memory']*1000) > 3000000: parameters['MaxRSS'] = 4000000 ## pick up campaign specific assignment parameters parameters.update( CI.parameters(wfh.request['Campaign']) ) if not options.test: parameters['execute'] = True if not wfh.checkWorkflowSplitting(): ## needs to go to event based ? fail for now print "Falling back to event splitting ?" 
parameters['SplittingAlgorithm'] = 'EventBased' ## plain assignment here result = reqMgrClient.assignWorkflow(url, wfo.name, 'production', parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() else: print "ERROR could not assign",wfo.name else: pass
def transferor(url ,specific = None, talk=True, options=None): if userLock('transferor'): return if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()) max_to_handle = options.maxworkflows allowed_to_handle = max(0,max_to_handle - being_handled) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status=='considered').all(): if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" input_sizes = {} ## list the size of those in transfer already in_transfer_priority=0 min_transfer_priority=100000000 print "getting all wf in staging ..." 
for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) (_,primary,_,_) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get( prim ) in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority in_transfer_already = sum(input_sizes.values()) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get( prim ) print "... done" grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print "%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False for (wfo,wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name,"to be transfered" #wfh = workflowInfo( url, wfo.name) (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: print "Transfer has gone over bubget." else: print "Transfer will go over bubget." 
print "%15.4f GB this load"%this_load print "%15.4f GB already this round"%sum(transfer_sizes.values()) print "%15.4f GB is the available limit"%transfer_limit went_over_budget=True if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget" else: if not options.go: print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop" continue ## throtlle by campaign go if not CI.go( wfh.request['Campaign'] ): print "No go for",wfh.request['Campaign'] if not options.go: continue ## check if the batch is announced announced=False is_real=False for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break if not announced: print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?" if not is_real: print wfo.name,"does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time) continue passing_along += 1 if passing_along >= allowed_to_handle: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle else: print "Not allowed to pass more than",max_to_handle,"at a time. 
Currently",being_handled,"handled, and adding",passing_along break (lheinput,primary,parent,secondary) = wfh.getIO() if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] can_go = True staging=False if primary: if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) sites_really_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] print "Sites allowed minus the vetoed transfer" print sites_really_allowed copies_needed = int(0.35*len(sites_really_allowed))+1 ## should just go for a fixed number based if the white list grows that big print "Would make",copies_needed,"copies" if options.maxcopy>0: copies_needed = min(options.maxcopy,copies_needed) ## remove the sites that do not want transfers print "need",copies_needed workflow_dependencies[prim].add( wfo.id ) presence = getDatasetPresence( url, prim ) prim_location = [site for site,pres in presence.items() if pres[0]==True] if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed subscriptions = listSubscriptions( url , prim ) prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) ## need to reject from that list the ones with a full copy 
already: i.e the transfer corresponds to the copy in place prim_destination = [site for site in prim_destination if not site in prim_location] ## add transfer dependencies latching_on_transfers = list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) print latching_on_transfers for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? 
copies_needed = max(0,copies_needed - len(prim_destination)) print "then need",copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with",latching_on_transfers can_go = True continue prim_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in prim_location])] prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in prim_destination])] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: spreading = distributeToSites( getDatasetChops(prim), prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges) else: spreading = {} for site in prim_to_distribute: spreading[site]=[prim] can_go = False transfer_sizes[prim] = input_sizes[prim] for (site,items) in spreading.items(): all_transfers[site].extend( items ) if secondary: if talk: print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] 
## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len( sec_to_distribute )>0: for site in sec_to_distribute: all_transfers[site].append( sec ) can_go = False ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name,"latches on existing transfers, and nothing else" wfo.status = 'staging' else: print wfo.name,"should just be assigned NOW to",sites_allowed wfo.status = 'staged' print "setting status to",wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name,"latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to",wfo.status session.commit() print wfo.name,"needs a transfer" needs_transfer+=1 #print json.dumps(all_transfers) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. 
## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to",site,"(CE)",site_se,"(SE) for" else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) print "\t",len(datasets),"datasets" print "\t",datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first() print phedexid,"transfer created" if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : 
it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def assignor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return #if notRunningBefore( 'stagor' ): return if not componentInfo().check(): return CI = campaignInfo() SI = siteInfo() LI = lockInfo() NLI = newLockInfo() n_assigned = 0 n_stalled = 0 wfos=[] if specific: wfos = session.query(Workflow).filter(Workflow.name==specific).all() if not wfos: if specific: wfos = session.query(Workflow).filter(Workflow.status=='considered').all() wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) for wfo in wfos: if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print "\n\n",wfo.name,"\n\tto be assigned" wfh = workflowInfo( url, wfo.name) ## check if by configuration we gave it a GO if not CI.go( wfh.request['Campaign'] ) and not options.go: print "No go for",wfh.request['Campaign'] n_stalled+=1 continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': if not options.test: print wfo.name,wfh.request['RequestStatus'],"setting away and skipping" ## the module picking up from away will do what is necessary of it wfo.wm_status = wfh.request['RequestStatus'] wfo.status = 'away' session.commit() continue else: print wfo.name,wfh.request['RequestStatus'] ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: print "cannot decide on version number" n_stalled+=1 continue (lheinput,primary,parent,secondary) = wfh.getIO() sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) print "Site white list",sorted(sites_allowed) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = 
CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] if 'SiteBlacklist' in CI.parameters(wfh.request['Campaign']): print "Reducing the whitelist due to black list in campaign configuration" print "Removing",CI.parameters(wfh.request['Campaign'])['SiteBlacklist'] sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfh.request['Campaign'])['SiteBlacklist'])) blocks = [] if 'BlockWhitelist' in wfh.request: blocks = wfh.request['BlockWhitelist'] memory_allowed = SI.sitesByMemory( wfh.request['Memory'] ) if memory_allowed!=None: print "sites allowing", wfh.request['Memory'],"are",sorted(memory_allowed) sites_allowed = list(set(sites_allowed) & set(memory_allowed)) print "Allowed",sorted(sites_allowed) secondary_locations=None for sec in list(secondary): presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>90.] one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations] print "From secondary requirement, now Allowed",sorted(sites_allowed) sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} for prim in list(primary): presence = getDatasetPresence( url, prim , only_blocks=blocks) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in 
sites_allowed] , only_blocks = blocks) #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]] sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]] sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()] print "Holding the data but not allowed",list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed])) if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] down_time = False ## opportunistic running where any piece of data is available if secondary_locations or primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] if secondary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))] elif primary_locations: opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))] else: opportunistic_sites = [] print "We could be running at",sorted(opportunistic_sites),"in addition" if any([osite in SI.sites_not_ready for osite in 
opportunistic_sites]): print "One of the destination site is in downtime" down_time = True ## should this be send back to considered ? """ if available_fractions and not all([available>=1. for available in available_fractions.values()]): print "The input dataset is not located in full over sites" print json.dumps(available_fractions) if not options.test and not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known: sendEmail( "cannot be assigned","%s is not full over sites \n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) n_stalled+=1 continue ## skip skip skip """ ## should be 2 but for the time-being let's lower it to get things going copies_wanted,cpuh = wfh.getNCopies() if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): print "The input dataset is not available",copies_wanted,"times, only",available_fractions.values() if down_time: wfo.status = 'considered' session.commit() print "sending back to considered because of site downtime, instead of waiting" sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name) continue #pass print json.dumps(available_fractions) if not options.go: known = [] try: known = json.loads(open('cannot_assign.json').read()) except: pass if not wfo.name in known: sendEmail( "cannot be assigned","%s is not sufficiently available. Probably phedex information lagging behind. 
\n %s"%(wfo.name,json.dumps(available_fractions))) known.append( wfo.name ) open('cannot_assign.json','w').write(json.dumps( known, indent=2)) n_stalled+=1 continue ## default back to white list to original white list with any data print "Allowed",sites_allowed sites_allowed = sites_with_any_data print "Selected for any data",sites_allowed if options.restrict: print "Allowed",sites_allowed sites_allowed = sites_with_any_data print "Selected",sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?" print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) if not len(sites_allowed): print wfo.name,"cannot be assign with no matched sites" sendEmail( "cannot be assigned","%s has no whitelist"%(wfo.name)) n_stalled+=1 continue t1_only = [ce for ce in sites_allowed if ce.startswith('T1')] if t1_only: # try to pick from T1 only first sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])] else: # then pick any otherwise sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] print "Placing the output on", sites_out parameters={ 'SiteWhitelist' : sites_allowed, #'CustodialSites' : sites_custodial, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : '/store/mc', ## to be figured out 'ProcessingVersion' : version, } ## plain assignment here team='production' if options and options.team: team = options.team if "T2_US_UCSD" in sites_with_data and 
random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]): ## consider SDSC parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC'] parameters['useSiteListAsLocation'] = True team = 'allocation-based' sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**']) if wfh.request['Campaign']=='RunIIWinter15GS' and random.random() < -1.0: parameters['SiteWhitelist'] = ['T3_US_SDSC'] team = 'allocation-based' sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**']) ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment parameters parameters.update( CI.parameters(wfh.request['Campaign']) ) if not options.test: parameters['execute'] = True if not wfh.checkWorkflowSplitting(): print "Falling back to event splitting." parameters['SplittingAlgorithm'] = 'EventBased' sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name) ## needs to go to event based ? fail for now #print "Falling back to event splitting ?" #sendEmail("Cannot assign","the workflow %s is too heavy to be processed as it is. 
Could fallback to EventBased splitting"%wfo.name) #continue # Handle run-dependent MC pstring = wfh.processingString() if 'PU_RD' in pstring: numEvents = wfh.getRequestNumEvents() eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary] eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi)) reqJobs = 500 if 'PU_RD2' in pstring: reqJobs = 2000 eventsPerJob = int(numEvents/(reqJobs*1.4)) lumisPerJob = int(eventsPerJob/eventsPerLumi) if lumisPerJob==0: print "There is no go for assigning that request without event splitting" sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob)) print "need to go down to",eventsPerJob,"events per job" parameters['EventsPerJob'] = eventsPerJob else: spl = wfh.getSplittings()[0] eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob: print "need to go down to",lumisPerJob,"in assignment" sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob)) parameters['LumisPerJob'] = lumisPerJob else: print "the regular splitting should work for",pstring sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name) result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() n_assigned+=1 try: ## refetch information and lock output new_wfi = workflowInfo( url, wfo.name) (_,prim,_,sec) = new_wfi.getIO() for output in new_wfi.request['OutputDatasets']: ## lock all outputs flat NLI.lock( output ) for site in [SI.CE_to_SE(site) for site in sites_allowed]: for output in new_wfi.request['OutputDatasets']: LI.lock( output, site, 'dataset in production') for primary in prim: LI.lock( primary, site, 'dataset used in input') for secondary in sec: LI.lock( secondary, site, 'required for mixing' ) except 
Exception as e: print "fail in locking output" print str(e) sendEmail("failed locking of output",str(e)) else: print "ERROR could not assign",wfo.name else: pass print "Assignment summary:" print "Assigned",n_assigned print "Stalled",n_stalled
def transferor(url ,specific = None, talk=True, options=None): if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() all_transfers=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset data_to_wf = {} for wfo in session.query(Workflow).filter(Workflow.status=='considered').all(): if specific and not specific in wfo.name: continue print wfo.name,"to be transfered" wfh = workflowInfo( url, wfo.name) #injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) #now = time.mktime(time.gmtime()) / (60.*60.) #if float(now - injection_time) < 4.: # print "It is too soon to transfer", now, injection_time # continue (lheinput,primary,parent,secondary) = wfh.getIO() if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] can_go = True if primary: if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: workflow_dependencies[prim].add( wfo.id ) presence = getDatasetPresence( url, prim ) prim_location = [site for site,pres in presence.items() if pres[0]==True] subscriptions = listSubscriptions( url , prim ) prim_destination = [site for site in subscriptions] prim_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in prim_location])] prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in prim_destination])] if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: spreading = 
distributeToSites( [[prim]]+getDatasetChops(prim), prim_to_distribute, n_copies = 3, weights=SI.cpu_pledges) else: spreading = {} for site in prim_to_distribute: spreading[site]=[prim] can_go = False for (site,items) in spreading.items(): all_transfers[site].extend( items ) if secondary: if talk: print wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] if len( sec_to_distribute )>0: for site in sec_to_distribute: all_transfers[site].append( sec ) can_go = False ## is that possible to do something more if can_go: print wfo.name,"should just be assigned NOW to",sites_allowed wfo.status = 'staged' session.commit() continue else: print wfo.name,"needs a transfer" #print json.dumps(all_transfers) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] print "Making a replica to",site,"(CE)",site_se,"(SE) for" print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] print "\t",len(blocks),"needed blocks" print "\t",len(datasets),"datasets" print "\t",datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: 
## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging') else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first() if not new_transfer: new_transfer = Transfer( phedexid = phedexid) session.add( new_transfer ) new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def assignor(url ,specific = None, talk=True, options=None): if userLock('assignor'): return CI = campaignInfo() SI = siteInfo() wfos=[] if specific: wfos = session.query(Workflow).filter(Workflow.name==specific).all() if not wfos: if specific: wfos = session.query(Workflow).filter(Workflow.status=='considered').all() wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all()) wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all()) for wfo in wfos: if specific: if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue #if not specific in wfo.name: continue print wfo.name,"to be assigned" wfh = workflowInfo( url, wfo.name) ## check if by configuration we gave it a GO if not CI.go( wfh.request['Campaign'] ) and not options.go: print "No go for",wfh.request['Campaign'] continue ## check on current status for by-passed assignment if wfh.request['RequestStatus'] !='assignment-approved': print wfo.name,wfh.request['RequestStatus'],"skipping" if not options.test: continue ## retrieve from the schema, dbs and reqMgr what should be the next version version=wfh.getNextVersion() if not version: if options and options.ProcessingVersion: version = options.ProcessingVersion else: print "cannot decide on version number" continue (lheinput,primary,parent,secondary) = wfh.getIO() sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) print "Allowed",sites_allowed sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])] sites_custodial = [] if len(sites_custodial)==0: print "No custodial, it's fine, it's covered in close-out" if len(sites_custodial)>1: print "more than one custodial for",wfo.name sys.exit(36) secondary_locations=None for sec in list(secondary): presence = getDatasetPresence( url, sec ) print sec print json.dumps(presence, indent=2) #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>90.] 
one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there] if secondary_locations==None: secondary_locations = one_secondary_locations else: secondary_locations = list(set(secondary_locations) & set(one_secondary_locations)) ## reduce the site white list to site with secondary only sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])] sites_all_data = copy.deepcopy( sites_allowed ) sites_with_data = copy.deepcopy( sites_allowed ) sites_with_any_data = copy.deepcopy( sites_allowed ) primary_locations = None available_fractions = {} for prim in list(primary): presence = getDatasetPresence( url, prim ) if talk: print prim print json.dumps(presence, indent=2) available_fractions[prim] = getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] ) sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])] sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])] sites_with_any_data = [site for site in sites_with_any_data if any([osite.startswith(site) for osite in presence.keys()])] if primary_locations==None: primary_locations = presence.keys() else: primary_locations = list(set(primary_locations) & set(presence.keys() )) sites_with_data = list(set(sites_with_data)) sites_with_any_data = list(set(sites_with_any_data)) opportunistic_sites=[] ## opportunistic running where any piece of data is available if secondary_locations and primary_locations: ## intersection of both any pieces of the primary and good IO #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))] opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & 
set(primary_locations)) - set(sites_allowed))] print "We could be running at",opportunistic_sites,"in addition" if available_fractions and not all([available>=1. for available in available_fractions.values()]): print "The input dataset is not located in full at any site" print json.dumps(available_fractions) if not options.test and not options.go: continue ## skip skip skip copies_wanted = 2. if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]): print "The input dataset is not available",copies_wanted,"times, only",available_fractions.values() if not options.go: continue ## default back to white list to original white list with any data print "Allowed",sites_allowed sites_allowed = sites_with_any_data print "Selected for any data",sites_allowed if options.restrict: print "Allowed",sites_allowed sites_allowed = sites_with_any_data print "Selected",sites_allowed else: if set(sites_with_data) != set(sites_allowed): ## the data is not everywhere we wanted to run at : enable aaa print "Sites with 90% data not matching site white list (block choping!)" print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?" print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data)) #options.useSiteListAsLocation = True #print "Not commissioned yet" #continue #print "We could be running at",opportunistic_sites,"in addition" ##sites_allowed = list(set(sites_allowed+ opportunistic_sites)) if not len(sites_allowed): print wfo.name,"cannot be assign with no matched sites" continue parameters={ 'SiteWhitelist' : sites_allowed, 'CustodialSites' : sites_custodial, 'NonCustodialSites' : sites_out, 'AutoApproveSubscriptionSites' : list(set(sites_out)), 'AcquisitionEra' : wfh.acquisitionEra(), 'ProcessingString' : wfh.processingString(), 'MergedLFNBase' : '/store/mc', ## to be figured out ! 
from Hi shit 'ProcessingVersion' : version, } ##parse options entered in command line if any if options: for key in reqMgrClient.assignWorkflow.keys: v=getattr(options,key) if v!=None: if ',' in v: parameters[key] = filter(None,v.split(',')) else: parameters[key] = v ## pick up campaign specific assignment parameters parameters.update( CI.parameters(wfh.request['Campaign']) ) if not options.test: parameters['execute'] = True if not wfh.checkWorkflowSplitting(): ## needs to go to event based ? fail for now print "Falling back to event splitting ?" #parameters['SplittingAlgorithm'] = 'EventBased' continue ## plain assignment here team='production' if options and options.team: team = options.team result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters) # set status if not options.test: if result: wfo.status = 'away' session.commit() else: print "ERROR could not assign",wfo.name else: pass
def transferor(url ,specific = None, talk=True, options=None): if userLock(): return if duplicateLock(): return use_mcm = True up = componentInfo(mcm=use_mcm, soft=['mcm']) if not up.check(): return use_mcm = up.status['mcm'] if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() LI = lockInfo() NLI = newLockInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all()) being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all()) being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all()) max_to_handle = options.maxworkflows max_to_transfer = options.maxstaging allowed_to_handle = max(0,max_to_handle - being_handled) allowed_to_transfer = max(0,max_to_transfer - being_transfered) wf_buffer = 5 if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer" else: print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer" if allowed_to_transfer <= wf_buffer: print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer" else: print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer" print "... done" all_transfers=defaultdict(list) needing_locks=defaultdict(list) workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset wfs_and_wfh=[] print "getting all wf to consider ..." 
cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter(Workflow.status=='considered').all(): print "\t",wfo.name if specific and not specific in wfo.name: continue cache_r =filter(lambda d:d['RequestName']==wfo.name, cache) if len(cache_r): wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) ) else: wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) ) print "... done" transfers_per_sites = defaultdict(int) input_sizes = {} input_cput = {} input_st = {} ## list the size of those in transfer already in_transfer_priority=0 min_transfer_priority=100000000 print "getting all wf in staging ..." for wfo in session.query(Workflow).filter(Workflow.status=='staging').all(): wfh = workflowInfo( url, wfo.name, spec=False) (lheinput,primary,parent,secondary) = wfh.getIO() sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list transfers_per_sites[site] += 1 #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: input_sizes[prim] = dss.get( prim ) print "\t",wfo.name,"needs",input_sizes[prim],"GB" in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... 
done" print "Max priority in transfer already",in_transfer_priority print "Min priority in transfer already",min_transfer_priority print "transfers per sites" print json.dumps( transfers_per_sites, indent=2) in_transfer_already = sum(input_sizes.values()) cput_in_transfer_already = sum(input_cput.values()) st_in_transfer_already = sum(input_st.values()) # shuffle first by name random.shuffle( wfs_and_wfh ) #sort by priority higher first wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo,wfh) in wfs_and_wfh: (_,primary,_,_) = wfh.getIO() #input_cput[wfo.name] = wfh.getComputingTime() #input_st[wfo.name] = wfh.getSystemTime() for prim in primary: input_sizes[prim] = dss.get( prim ) print "... done" cput_grand_total = sum(input_cput.values()) cput_to_transfer = cput_grand_total - cput_in_transfer_already st_grand_total = sum(input_st.values()) st_to_transfer = st_grand_total - st_in_transfer_already print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots()) print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots()) print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already ) print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer ) grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered"%in_transfer_already print "%15.4f GB is the current requested transfer load"%to_transfer print 
"%15.4f GB is the global transfer limit"%grand_transfer_limit print "%15.4f GB is the available limit"%transfer_limit max_staging_per_site = options.maxstagingpersite # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer=0 ## so that we can count'em passing_along = 0 transfer_sizes={} went_over_budget=False destination_cache = {} for (wfo,wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name,"to be transfered" #wfh = workflowInfo( url, wfo.name) (_,primary,_,_) = wfh.getIO() this_load=sum([input_sizes[prim] for prim in primary]) if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ): if went_over_budget: print "Transfer has gone over bubget." else: print "Transfer will go over bubget." print "%15.4f GB this load"%this_load print "%15.4f GB already this round"%sum(transfer_sizes.values()) print "%15.4f GB is the available limit"%transfer_limit went_over_budget=True if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget" else: if not options.go: print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop" continue ## throtlle by campaign go if not CI.go( wfh.request['Campaign'] ): print "No go for",wfh.request['Campaign'] if not options.go: sendEmail("no go for managing","No go for "+wfh.request['Campaign']) continue ## check if the batch is announced def check_mcm(wfn): announced=False is_real=False if not wfn.startswith('pdmvserv'): is_real = True try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: try: for b in mcm.getA('batches',query='contains=%s'% wfo.name): is_real = True if b['status']=='announced': announced=True break except: print "could not get mcm batch announcement, 
assuming not real" return announced,is_real if not use_mcm: announced,is_real = False,True else: announced,is_real = check_mcm( wfo.name ) if not announced: print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?" if not is_real: print wfo.name,"does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.) now = time.mktime(time.gmtime()) / (60.*60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time) continue if passing_along >= allowed_to_handle: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle else: print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along if not options.go: break if this_load and needs_transfer >= allowed_to_transfer: #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority: ## higher priority, and not only this priority being transfered print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_transfer else: print "Not allowed to transfer more than",max_to_transfer,"at a time. 
Currently",being_transfered,"transfering, and adding",needs_transfer if not options.go: continue (lheinput,primary,parent,secondary) = wfh.getIO() for dataset in list(primary)+list(parent)+list(secondary): ## lock everything flat NLI.lock( dataset ) if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) ) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist'] if 'SiteBlacklist' in CI.parameters(wfh.request['Campaign']): sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfh.request['Campaign'])['SiteBlacklist'])) ## reduce right away to sites in case of memory limitation memory_allowed = SI.sitesByMemory( wfh.request['Memory'] ) if memory_allowed!=None: print "sites allowing", wfh.request['Memory'],"are",memory_allowed sites_allowed = list(set(sites_allowed) & set(memory_allowed)) if not sites_allowed: print wfo.name,"has no possible sites to run at" print "available for",wfh.request['Memory'],"are",memory_allowed sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name )) continue blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']: ## should make the block selection here pass if 'LumiList' in wfh.request and wfh.request['LumiList']: ## same, we could be doing the white list here too pass if blocks: print "Reading",len(blocks),"in whitelist" can_go = True staging=False allowed=True if primary: copies_needed_from_CPUh,CPUh = wfh.getNCopies() if talk: print wfo.name,'reads',', '.join(primary),'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority'])) sites_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in 
SI.sites_veto_transfer])] print "Sites allowed minus the vetoed transfer" print sorted(sites_allowed) copies_needed_from_site = int(0.35*len(sites_allowed))+1 ## should just go for a fixed number based if the white list grows that big print "Would make",copies_needed_from_site,"copies from site white list" copies_needed = copies_needed_from_site print "Would make",copies_needed_from_CPUh,"from cpu requirement",CPUh copies_needed = copies_needed_from_CPUh if options.maxcopy>0: ## stop maxing things out ?? #copies_needed = min(options.maxcopy,copies_needed) #print "Maxed to",copies_needed if copies_needed_from_CPUh > options.maxcopy: sendEmail('An example of more than three copies','for %s it could have been beneficial to make %s copies'%( wfo.name, copies_needed_from_CPUh)) if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]: copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies'] copies_needed = min(copies_needed_from_campaign,copies_needed_from_site) print "Maxed to",copies_needed,"by campaign configuration",wfh.request['Campaign'] ## remove the sites that do not want transfers workflow_dependencies[prim].add( wfo.id ) ##################################### ###### JR 3/8/15 #### deprecating this """ presence = getDatasetPresence( url, prim , within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) prim_location = [site for site,pres in presence.items() if pres[0]==True] prim_parts = [site for site,pres in presence.items() if pres[0]==False] if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed subscriptions = listSubscriptions( url , prim , sites_allowed ) prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if 
decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) ## remove the subscription where the dataset is in parts at #prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']]) and not site in prim_parts])) ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place prim_destination = [site for site in prim_destination if not site in prim_location] ## add transfer dependencies latching_on_transfers = list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])])) print latching_on_transfers """ ###### JR 3/8/15 #### deprecating this ##################################### ### new ways of making the whole thing destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks ) #destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='DataOps') #anaops_destinations,anaops_all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='AnalysisOps' ) print json.dumps(destinations, indent=2) ## get where the dataset is in full and completed prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1] ## the rest is places it is going to be prim_destination = [site for site in destinations.keys() if not site in prim_location] ## need to take out the transfer veto prim_destination = [site for site in prim_destination if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] for dsite in prim_destination: needing_locks[dsite].append( prim ) 
if len(prim_location) >= copies_needed: print "The output is all fully in place at",len(prim_location),"sites",prim_location continue copies_needed = max(0,copies_needed - len(prim_location)) print "now need",copies_needed copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names] latching_on_transfers = set() [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination] latching_on_transfers = list(latching_on_transfers) #print latching_on_transfers ## figure out where all this is going to go prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location] prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination] ## take out the ones that cannot receive transfers prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]): ## means there is openings let me go print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute] for site in sites_allowed: #increment accross the board, regardless of real destination: could be changed transfers_per_sites[site] += 1 else: if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority: print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over transfer slots available" else: print "Not allowed to transfer more than",max_staging_per_site," per site at a time. 
Going overboard for",[site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site] if not options.go: allowed = False break for latching in latching_on_transfers: tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first() if not tfo: tfo = Transfer( phedexid = latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding",wfo.id,"to",tfo.id,"with phedexid",latching l = copy.deepcopy( tfo.workflows_id ) l.append( wfo.id ) tfo.workflows_id = l if not options.test: session.commit() else: session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? #copies_needed = max(0,copies_needed - len(prim_destination)) copies_needed = max(0,copies_needed - min(copies_being_made)) print "then need",copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with",latching_on_transfers can_go = True continue if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the if not options or options.chop: chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks) spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes) transfer_sizes[prim] = sum(sizes) else: spreading = {} for site in prim_to_distribute: if blocks: spreading[site]=blocks else: spreading[site]=[prim] transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified can_go = False print "selected CE destinations",spreading.keys() for (site,items) in spreading.items(): all_transfers[site].extend( items ) if not allowed: print "Not allowed to move on with",wfo.name continue if secondary: if talk: print 
wfo.name,'reads',', '.join(secondary),'in secondary' for sec in secondary: workflow_dependencies[sec].add( wfo.id ) if False: ## new style, failing on minbias if not sec in destination_cache: ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed]) destinations = destination_cache[sec] ## truncate location/destination to those making up for >90% of the dataset bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9] sec_location = [site for (site,info) in destinations.items() if info['completion']>=95] sec_destination = [site for site in destinations.keys() if not site in sec_location] else: ## old style presence = getDatasetPresence( url, sec ) sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites subscriptions = listSubscriptions( url ,sec ) sec_destination = [site for site in subscriptions] for site in sec_location: needing_locks[site].append( sec ) for site in sec_destination: needing_locks[site].append( sec ) sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])] sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])] if len( sec_to_distribute )>0: sec_size = dss.get( sec ) for site in sec_to_distribute: site_se =SI.CE_to_SE(site) if (SI.disk[site_se]*1024.) 
> sec_size: all_transfers[site].append( sec ) can_go = False else: print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024)) ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name,"latches on existing transfers, and nothing else" wfo.status = 'staging' needs_transfer+=1 else: print wfo.name,"should just be assigned NOW to",sites_allowed wfo.status = 'staged' passing_along+=1 print "setting status to",wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name,"latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to",wfo.status session.commit() print wfo.name,"needs a transfer" needs_transfer+=1 passing_along+=1 print "accumulated locks of dataset in place" print json.dumps(needing_locks, indent=2) for site,items in needing_locks.items(): for item in items: LI.lock( item, SI.CE_to_SE(site), 'usable input') print "accumulated transfers" print json.dumps(all_transfers, indent=2) fake_id=-1 wf_id_in_prestaging=set() for (site,items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site,"does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. 
## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to",site,"(CE)",site_se,"(SE) for" else: print "Would make a replica to",site,"(CE)",site_se,"(SE) for" print "\t",len(blocks),"blocks" ## remove blocks if full dataset is send out blocks = [block for block in blocks if not block.split('#')[0] in datasets] print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks])) print "\t",len(datasets),"datasets" print "\t",datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y','yes','go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ #for item in list(set([it.split('#')[0] for it in items_to_transfer])): for item in items_to_transfer: LI.lock( item, site_se, 'pre-staging') else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result= {'phedex':{'request_created' : []}} fake_id-=1 if not result: print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first() print phedexid,"transfer created" if not new_transfer: new_transfer = 
Transfer( phedexid = phedexid) session.add( new_transfer ) new_transfer.workflows_id = set() for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering] ) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status!='staging': if execute: tr_wf.status = 'staging' if talk: print "setting",tr_wf.name,"to staging" session.commit()
def transferor(url, specific=None, talk=True, options=None): if userLock('transferor'): return if options and options.test: execute = False else: execute = True SI = siteInfo() CI = campaignInfo() mcm = McMClient(dev=False) dss = DSS() print "counting all being handled..." being_handled = len( session.query(Workflow).filter(Workflow.status == 'away').all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('stag')).all()) being_handled += len( session.query(Workflow).filter( Workflow.status.startswith('assistance')).all()) max_to_handle = options.maxworkflows allowed_to_handle = max(0, max_to_handle - being_handled) wf_buffer = 5 if allowed_to_handle <= wf_buffer: ## buffer for having several wf per transfer print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer" else: print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer" print "... done" all_transfers = defaultdict(list) workflow_dependencies = defaultdict( set) ## list of wf.id per input dataset wfs_and_wfh = [] print "getting all wf to consider ..." cache = getWorkflows(url, 'assignment-approved', details=True) for wfo in session.query(Workflow).filter( Workflow.status == 'considered').all(): if specific and not specific in wfo.name: continue cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache) if len(cache_r): wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False, request=cache_r[0]))) else: wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False))) print "... done" input_sizes = {} ## list the size of those in transfer already in_transfer_priority = 0 min_transfer_priority = 100000000 print "getting all wf in staging ..." 
for wfo in session.query(Workflow).filter( Workflow.status == 'staging').all(): wfh = workflowInfo(url, wfo.name, spec=False) (_, primary, _, _) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get(prim) in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority'])) min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority'])) print "... done" print "Max priority in transfer already", in_transfer_priority print "Min priority in transfer already", min_transfer_priority in_transfer_already = sum(input_sizes.values()) #sort by priority higher first wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[ 'RequestPriority']), int(j[1].request['RequestPriority'])), reverse=True) ## list the size of all inputs print "getting all input sizes ..." for (wfo, wfh) in wfs_and_wfh: (_, primary, _, _) = wfh.getIO() for prim in primary: input_sizes[prim] = dss.get(prim) print "... done" grand_total = sum(input_sizes.values()) to_transfer = grand_total - in_transfer_already grand_transfer_limit = options.maxtransfer transfer_limit = grand_transfer_limit - in_transfer_already print "%15.4f GB already being transfered" % in_transfer_already print "%15.4f GB is the current requested transfer load" % to_transfer print "%15.4f GB is the global transfer limit" % grand_transfer_limit print "%15.4f GB is the available limit" % transfer_limit # the max priority value per dataset. max_priority = defaultdict(int) needs_transfer = 0 ## so that we can count'em passing_along = 0 transfer_sizes = {} went_over_budget = False for (wfo, wfh) in wfs_and_wfh: print wfh.request['RequestPriority'] print wfo.name, "to be transfered" #wfh = workflowInfo( url, wfo.name) (_, primary, _, _) = wfh.getIO() this_load = sum([input_sizes[prim] for prim in primary]) if (this_load and (sum(transfer_sizes.values()) + this_load > transfer_limit or went_over_budget)): if went_over_budget: print "Transfer has gone over bubget." 
else: print "Transfer will go over bubget." print "%15.4f GB this load" % this_load print "%15.4f GB already this round" % sum(transfer_sizes.values()) print "%15.4f GB is the available limit" % transfer_limit went_over_budget = True if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: print "Higher priority sample", wfh.request[ 'RequestPriority'], ">=", in_transfer_priority, "go-on over budget" else: if not options.go: print min_transfer_priority, "minimum priority", wfh.request[ 'RequestPriority'], "<", in_transfer_priority, "stop" continue ## throtlle by campaign go if not CI.go(wfh.request['Campaign']): print "No go for", wfh.request['Campaign'] if not options.go: continue ## check if the batch is announced announced = False is_real = False for b in mcm.getA('batches', query='contains=%s' % wfo.name): is_real = True if b['status'] == 'announced': announced = True break if not announced: print wfo.name, "does not look announced." # skipping?, rejecting?, reporting?" if not is_real: print wfo.name, "does not appear to be genuine." ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status continue ## check on a grace period injection_time = time.mktime( time.strptime('.'.join(map(str, wfh.request['RequestDate'])), "%Y.%m.%d.%H.%M.%S")) / (60. * 60.) now = time.mktime(time.gmtime()) / (60. * 60.) if float(now - injection_time) < 4.: if not options.go and not announced: print "It is too soon to start transfer: %3.2fH remaining" % ( now - injection_time) continue passing_along += 1 if passing_along >= allowed_to_handle: if int( wfh.request['RequestPriority'] ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority: print "Higher priority sample", wfh.request[ 'RequestPriority'], ">=", in_transfer_priority, "go-on over", max_to_handle else: print "Not allowed to pass more than", max_to_handle, "at a time. 
Currently", being_handled, "handled, and adding", passing_along break (lheinput, primary, parent, secondary) = wfh.getIO() if options and options.tosites: sites_allowed = options.tosites.split(',') else: sites_allowed = getSiteWhiteList( (lheinput, primary, parent, secondary)) if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']): sites_allowed = CI.parameters( wfh.request['Campaign'])['SiteWhitelist'] blocks = [] if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']: blocks = wfh.request['BlockWhitelist'] can_go = True staging = False if primary: if talk: print wfo.name, 'reads', ', '.join(primary), 'in primary' ## chope the primary dataset for prim in primary: max_priority[prim] = max(max_priority[prim], int(wfh.request['RequestPriority'])) sites_really_allowed = [ site for site in sites_allowed if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] print "Sites allowed minus the vetoed transfer" print sites_really_allowed copies_needed = int( 0.35 * len(sites_really_allowed) ) + 1 ## should just go for a fixed number based if the white list grows that big print "Would make", copies_needed, "copies" if options.maxcopy > 0: copies_needed = min(options.maxcopy, copies_needed) ## remove the sites that do not want transfers print "need", copies_needed workflow_dependencies[prim].add(wfo.id) presence = getDatasetPresence(url, prim) prim_location = [ site for site, pres in presence.items() if pres[0] == True ] if len(prim_location) >= copies_needed: print "The output is all fully in place at", len( prim_location), "sites" continue # reduce the number of copies required by existing full copies copies_needed = max(0, copies_needed - len(prim_location)) print "now need", copies_needed subscriptions = listSubscriptions(url, prim) prim_destination = list( set([ site for (site, (tid, decision)) in subscriptions.items() if decision and not any([ site.endswith(veto) for veto in ['MSS', 'Export', 'Buffer'] ]) ])) ## need to reject 
from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place prim_destination = [ site for site in prim_destination if not site in prim_location ] ## add transfer dependencies latching_on_transfers = list( set([ tid for (site, (tid, decision)) in subscriptions.items() if decision and site in prim_destination and not any([ site.endswith(veto) for veto in ['MSS', 'Export', 'Buffer'] ]) ])) print latching_on_transfers for latching in latching_on_transfers: tfo = session.query(Transfer).filter( Transfer.phedexid == latching).first() if not tfo: tfo = Transfer(phedexid=latching) tfo.workflows_id = [] session.add(tfo) if not wfo.id in tfo.workflows_id: print "adding", wfo.id, "to", tfo.id, "with phedexid", latching l = copy.deepcopy(tfo.workflows_id) l.append(wfo.id) tfo.workflows_id = l if not options.test: session.commit() else: session.flush( ) ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query can_go = False transfer_sizes[prim] = input_sizes[prim] staging = True # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ?? 
copies_needed = max(0, copies_needed - len(prim_destination)) print "then need", copies_needed if copies_needed == 0: print "The output is either fully in place or getting in full somewhere with", latching_on_transfers can_go = True continue prim_to_distribute = [ site for site in sites_allowed if not any( [osite.startswith(site) for osite in prim_location]) ] prim_to_distribute = [ site for site in prim_to_distribute if not any( [osite.startswith(site) for osite in prim_destination]) ] ## take out the ones that cannot receive transfers prim_to_distribute = [ site for site in prim_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if len( prim_to_distribute ) > 0: ## maybe that a parameter we can play with to limit the if not options or options.chop: spreading = distributeToSites(getDatasetChops(prim), prim_to_distribute, n_copies=copies_needed, weights=SI.cpu_pledges) else: spreading = {} for site in prim_to_distribute: spreading[site] = [prim] can_go = False transfer_sizes[prim] = input_sizes[prim] for (site, items) in spreading.items(): all_transfers[site].extend(items) if secondary: if talk: print wfo.name, 'reads', ', '.join(secondary), 'in secondary' for sec in secondary: workflow_dependencies[sec].add(wfo.id) presence = getDatasetPresence(url, sec) sec_location = [ site for site, pres in presence.items() if pres[1] > 90. 
] ## more than 90% of the minbias at sites subscriptions = listSubscriptions(url, sec) sec_destination = [site for site in subscriptions] sec_to_distribute = [ site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any( [osite.startswith(site) for osite in sec_destination]) ] sec_to_distribute = [ site for site in sec_to_distribute if not any([ osite.startswith(site) for osite in SI.sites_veto_transfer ]) ] if len(sec_to_distribute) > 0: for site in sec_to_distribute: all_transfers[site].append(sec) can_go = False ## is that possible to do something more if can_go: ## no explicit transfer required this time if staging: ## but using existing ones print wfo.name, "latches on existing transfers, and nothing else" wfo.status = 'staging' else: print wfo.name, "should just be assigned NOW to", sites_allowed wfo.status = 'staged' print "setting status to", wfo.status session.commit() continue else: ## there is an explicit transfer required if staging: ## and also using an existing one print wfo.name, "latches on existing transfers" if not options.test: wfo.status = 'staging' print "setting status to", wfo.status session.commit() print wfo.name, "needs a transfer" needs_transfer += 1 #print json.dumps(all_transfers) fake_id = -1 wf_id_in_prestaging = set() for (site, items_to_transfer) in all_transfers.iteritems(): items_to_transfer = list(set(items_to_transfer)) ## convert to storage element site_se = SI.CE_to_SE(site) ## site that do not want input datasets if site in SI.sites_veto_transfer: print site, "does not want transfers" continue ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured. 
## massage a bit the items blocks = [it for it in items_to_transfer if '#' in it] datasets = [it for it in items_to_transfer if not '#' in it] if execute: print "Making a replica to", site, "(CE)", site_se, "(SE) for" else: print "Would make a replica to", site, "(CE)", site_se, "(SE) for" print "\t", len(blocks), "blocks" ## remove blocks if full dataset is send out blocks = [ block for block in blocks if not block.split('#')[0] in datasets ] print "\t", len(blocks), "needed blocks for", list( set([block.split('#')[0] for block in blocks])) print "\t", len(datasets), "datasets" print "\t", datasets items_to_transfer = blocks + datasets ## operate the transfer if options and options.stop: ## ask to move-on answer = raw_input('Continue with that ?') if not answer.lower() in ['y', 'yes', 'go']: continue if execute: result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal') ## make use of max_priority dataset:priority to set the subscriptions priority """ ## does not function once = True for item in items_to_transfer: bds = item.split('#')[0] if max_priority[bds] >= 90000: if once: w=10 print "waiting",w,"s before raising priority" time.sleep(w) once=False ## raise it to high priority print item,"subscription priority raised to high at",site_se #print "This does not work yet properly it seems" print updateSubscription(url, site_se, item, priority='high') """ else: #result= {'phedex':{'request_created' : [{'id' : fake_id}]}} result = {'phedex': {'request_created': []}} fake_id -= 1 if not result: print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging" continue for phedexid in [o['id'] for o in result['phedex']['request_created']]: new_transfer = session.query(Transfer).filter( Transfer.phedexid == phedexid).first() print phedexid, "transfer created" if not new_transfer: new_transfer = Transfer(phedexid=phedexid) session.add(new_transfer) new_transfer.workflows_id = set() for transfering in list( 
set(map(lambda it: it.split('#')[0], items_to_transfer))): new_transfer.workflows_id.update( workflow_dependencies[transfering]) new_transfer.workflows_id = list(new_transfer.workflows_id) wf_id_in_prestaging.update(new_transfer.workflows_id) session.commit() ## auto approve it if execute: approved = approveSubscription(url, phedexid, [site_se]) for wfid in wf_id_in_prestaging: tr_wf = session.query(Workflow).get(wfid) if tr_wf and tr_wf.status != 'staging': if execute: tr_wf.status = 'staging' if talk: print "setting", tr_wf.name, "to staging" session.commit()