def getDatasetSize(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve the aggregated block summary for the whole dataset
    reply = dbsapi.listBlockSummaries(dataset=dataset)
    return reply[0]['file_size']
def dbs3_get_data(dataset, timestamps=1):
    #q = "/afs/cern.ch/user/s/spinoso/public/dbs3wrapper.sh /afs/cern.ch/user/c/cmst2/mc/scripts/datasetinfo.py --dataset %s --json" % dataset
    #output=os.popen(q).read()
    #s = json.loads(output)
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    try:
        reply = dbsapi.listDatasets(dataset=dataset, dataset_access_type='*', detail=True)
        #print reply
        if len(reply):
            status = reply[0]['dataset_access_type']
            reply = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
            cnt = 0
            for block in reply:
                cnt += int(block['num_event'])
            return [cnt, status, int(cnt / 100.)]
        else:
            print dataset, "not existing"
            return [0, '', 0]
    except:
        print "crash dbs3"
        return [0, '', 0]
def main():
    # args=sys.argv[1:]
    # data=args[0]
    sample_group = 'signal'  # signal, background, data, all
    sample_list = get_sample_list(sample_group)
    sample_list.sort()
    url = "https://cmsweb.cern.ch/dbs/prod/global/DBSReader"
    api = DbsApi(url=url)
    for samp in sample_list:
        outputDataSets = ''
        #print('Checking {0}'.format(samp.DAS))
        outputDataSets = api.listDatasets(dataset=samp.DAS, detail=True, dataset_access_type='VALID')
        if outputDataSets:
            for ds in outputDataSets:
                #print('{0}'.format(ds['dataset']))
                #print('{0}'.format(ds['primary_ds_name']))
                #print('{0}'.format(ds['xtcrosssection']))
                nevents = api.listBlockSummaries(dataset=ds['dataset'])
                #print(nevents[0]['num_event'])
                # this creates a table row for the paper with the dataset name and number of events
                print('verb@ {0} @ & {1:.2e} & XX \\\\ '.format(ds['primary_ds_name'], nevents[0]['num_event']))
    sys.exit(0)
def getFileCount(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
    cnt = 0
    for block in reply:
        cnt += int(block['num_file'])
    return cnt
def getEventCountDataSet(dataset):
    """
    Returns the number of events in a dataset using DBS3
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlockSummaries(dataset=dataset)
    return reply[0]['num_event']
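# A minimal usage sketch for the helpers above. The dbs3_url value and the
# dataset name are placeholders for illustration, and a valid grid proxy is
# assumed to be available for the DBS3 client.
from dbs.apis.dbsClient import DbsApi

dbs3_url = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'

if __name__ == '__main__':
    example_dataset = '/SomePrimaryDataset/SomeProcessedDataset/AODSIM'
    print getDatasetSize(example_dataset), "bytes"
    print getEventCountDataSet(example_dataset), "events"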
def getEventCountDataSetBlockList(dataset, blockList):
    """
    Counts and adds all the events for a given list of blocks inside a dataset
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # query all blocks at once; the reply is a single aggregated summary
    reply = dbsapi.listBlockSummaries(block_name=blockList)
    return reply[0]['num_event']
def getEventCountBlock(block):
    """
    Returns the number of events in a block using DBS3
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve block summary
    reply = dbsapi.listBlockSummaries(block_name=block)
    return reply[0]['num_event']
def getNumberOfEvents(dataset):
    # the old DBS2 command-line query is kept for reference only; its output is not used
    # query = 'find sum(block.numevents) where dataset = ' + dataset
    # dbs_cmd = ['dbs', 'search', '--query', query]
    # dbs_output = subprocess.Popen(dbs_cmd, stdout=subprocess.PIPE).communicate()[0]
    from dbs.apis.dbsClient import DbsApi
    dbsUrl = 'https://cmsweb.cern.ch/dbs/prod/global/DBSReader'
    dbsApi = DbsApi(url=dbsUrl)
    datasetBlocks = dbsApi.listBlockSummaries(dataset=dataset)
    numEvents = sum([block['num_event'] for block in datasetBlocks])
    return numEvents
def getEventCountDataSetBlockList(dataset, blockList):
    """
    Counts and adds all the events for a given list of blocks inside a dataset
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # transform from string to list
    if type(blockList) in (str, unicode):
        blockList = eval(blockList)
    total = 0
    # query one block at a time so the URI does not get too long
    for block in blockList:
        reply = dbsapi.listBlockSummaries(block_name=block)
        total += reply[0]['num_event']
    return total
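# A hedged alternative to the one-block-at-a-time loop above: query the block
# summaries in chunks of block names per call, so the URI stays short while
# reducing the number of round trips. The chunk size of 50 is an assumption,
# not a documented limit; dbs3_url is assumed to be defined as elsewhere in
# these examples. As in the getEventCountDataSetBlockList variant further up,
# passing a list of block names without detail returns one aggregated summary.
def getEventCountBlockListChunked(blockList, chunk_size=50):
    dbsapi = DbsApi(url=dbs3_url)
    total = 0
    for i in range(0, len(blockList), chunk_size):
        chunk = blockList[i:i + chunk_size]
        reply = dbsapi.listBlockSummaries(block_name=chunk)
        total += reply[0]['num_event']
    return total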
def getDatasetChops(dataset, chop_threshold=1000., talk=False):
    ## does a *flat* chopping of the input into chunks of size less than the chop threshold
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    blocks = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
    sum_all = 0
    ## put everything in terms of GB
    for block in blocks:
        block['file_size'] /= 1000000000.
    for block in blocks:
        sum_all += block['file_size']
    items = []
    if sum_all > chop_threshold:
        items.extend([[block['block_name']] for block in filter(lambda b: b['file_size'] > chop_threshold, blocks)])
        small_block = filter(lambda b: b['file_size'] <= chop_threshold, blocks)
        small_block.sort(lambda b1, b2: cmp(b1['file_size'], b2['file_size']), reverse=True)
        while len(small_block):
            first, small_block = small_block[0], small_block[1:]
            items.append([first['block_name']])
            size_chunk = first['file_size']
            while size_chunk < chop_threshold and small_block:
                last, small_block = small_block[-1], small_block[:-1]
                size_chunk += last['file_size']
                items[-1].append(last['block_name'])
            if talk:
                print len(items[-1]), "items below threshold", size_chunk
                print items[-1]
    else:
        if talk:
            print "one big", sum_all
        items = [[dataset]]
    if talk:
        print items
    ## a list of lists of blocks, or the dataset itself
    print "Chopped", dataset, "of size", sum_all, "GB (", chop_threshold, "GB) in", len(items), "pieces"
    return items
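# A short usage sketch for getDatasetChops (dataset name and threshold are
# placeholders): each returned item is a list of block names, or the dataset
# itself when everything fits under the threshold, e.g. for assigning work in pieces.
pieces = getDatasetChops('/SomePrimaryDataset/SomeProcessedDataset/RAW', chop_threshold=500., talk=True)
for piece in pieces:
    print len(piece), "block(s) in this piece"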
def getDatasetPresence(url, dataset, complete='y', only_blocks=None, group=None, vetoes=None):
    if vetoes == None:
        vetoes = ['MSS', 'Buffer', 'Export']
    #print "presence of",dataset
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    all_blocks = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
    all_block_names = set([block['block_name'] for block in all_blocks])
    if only_blocks:
        all_block_names = filter(lambda b: b in only_blocks, all_block_names)
        full_size = sum([block['file_size'] for block in all_blocks if (block['block_name'] in only_blocks)])
        #print all_block_names
        #print [block['block_name'] for block in all_blocks if block['block_name'] in only_blocks]
    else:
        full_size = sum([block['file_size'] for block in all_blocks])
    if not full_size:
        print dataset, "is nowhere"
        return {}
    #print full_size
    conn = httplib.HTTPSConnection(url, cert_file=os.getenv('X509_USER_PROXY'), key_file=os.getenv('X509_USER_PROXY'))
    r1 = conn.request("GET", '/phedex/datasvc/json/prod/blockreplicas?dataset=%s' % (dataset))
    r2 = conn.getresponse()
    result = json.loads(r2.read())
    items = result['phedex']['block']
    locations = defaultdict(set)
    for item in items:
        for replica in item['replica']:
            if not any(replica['node'].endswith(v) for v in vetoes):
                if replica['group'] == None:
                    replica['group'] = ""
                if complete and not replica['complete'] == complete:
                    continue
                #if group!=None and replica['group']==None: continue
                if group != None and not replica['group'].lower() == group.lower():
                    continue
                locations[replica['node']].add(item['name'])
    presence = {}
    for (site, blocks) in locations.items():
        site_size = sum([block['file_size'] for block in all_blocks if (block['block_name'] in blocks and block['block_name'] in all_block_names)])
        #print site,blocks,all_block_names
        #presence[site] = (set(blocks).issubset(set(all_block_names)), site_size/float(full_size)*100.)
        presence[site] = (set(all_block_names).issubset(set(blocks)), site_size / float(full_size) * 100.)
    #print json.dumps( presence , indent=2)
    return presence
def getEventCountDataSet(dataset, skipInvalid=False):
    """
    Returns the number of events in a dataset using DBS3.
    If skipInvalid=True, only valid files are counted. This is slower
    (especially on larger datasets).
    """
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary - faster
    if not skipInvalid:
        reply = dbsapi.listBlockSummaries(dataset=dataset)
        if not reply:
            return 0
        return reply[0]['num_event']
    # discard invalid files (only count valid ones) - slower
    else:
        # retrieve file list
        reply = dbsapi.listFiles(dataset=dataset, detail=True)
        # sum only valid files
        total = sum(f['event_count'] for f in reply if f['is_file_valid'] == 1)
        return total
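# Usage sketch for the two code paths above (dataset name is a placeholder):
# the default path reads the aggregated block summary, while skipInvalid=True
# walks the file list and keeps only files flagged as valid.
n_all = getEventCountDataSet('/SomePrimaryDataset/SomeProcessedDataset/AODSIM')
n_valid = getEventCountDataSet('/SomePrimaryDataset/SomeProcessedDataset/AODSIM', skipInvalid=True)
print n_all, n_valid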
if read == None:
    for category in datatiers.keys():
        if category not in dbs_query_results.keys():
            dbs_query_results[category] = {}
        for datatier in datatiers[category]:
            if datatier not in dbs_query_results[category].keys():
                dbs_query_results[category][datatier] = {}
            blocks = api3.listBlocks(data_tier_name=datatier, min_cdate=startdate.strftime("%s"), max_cdate=enddate.strftime("%s"))
            for block in blocks:
                exclude = False
                for exclusion_string in exclusion_strings[category]:
                    if exclusion_string.lower() in block['block_name'].lower():
                        if verbose == True:
                            print 'blockname was rejected:', block['block_name']
                        exclude = True
                        continue
                if exclude == True:
                    continue
                if verbose == True:
                    print 'Querying for the summary for block:', block['block_name'], '!'
                properties = api3.listBlockSummaries(block_name=block['block_name'])
                dbs_query_results[category][datatier][block['block_name']] = properties
    if persist != None:
        outputfile = open(persist, 'w')
        json.dump(dbs_query_results, outputfile)
        outputfile.close()
else:
    dbs_query_results = json.load(open(read))

for category in datatiers.keys():
    if category not in results.keys():
        results[category] = {}
    for datatier in datatiers[category]:
        if datatier not in results[category].keys():
            results[category][datatier] = {}
        for blockname in dbs_query_results[category][datatier]:
            triggered_separation = False
def main():
    usage = "%prog <options>"
    parser = OptionParser(usage=usage)
    parser.add_option("-u", "--url", dest="url",
                      help="DBS instance url. Default is https://cmsweb.cern.ch/dbs/prod/global/DBSReader",
                      metavar="<url>")
    parser.add_option("-l", "--length", dest="length",
                      help="Number of days over which to calculate the accumulated events. Optional, default is 30 days.",
                      metavar="<length>")
    parser.add_option("-d", "--dataset", dest="dataset",
                      help="The dataset name for which to calculate the events. Optional if --datatier is used.",
                      metavar="<dataset>")
    parser.add_option("-t", "--datatier", dest="datatier",
                      help="The datatier name for which to calculate the events. Optional if --dataset is used. In this version datatier is not supported yet.",
                      metavar="<data_tier_name>")
    parser.add_option("-a", "--access_type", dest="ds_access_type",
                      help="Dataset access types: VALID, PRODUCTION or ALL (VALID+PRODUCTION). Default is ALL",
                      metavar="<dataset_access_type>")
    parser.set_defaults(url="https://cmsweb.cern.ch/dbs/prod/global/DBSReader")
    parser.set_defaults(length=30)
    parser.set_defaults(ds_access_type="ALL")
    (opts, args) = parser.parse_args()
    if not (opts.dataset or opts.datatier):
        parser.print_help()
        parser.error('either --dataset or --datatier is required')
    dataset = opts.dataset
    # seconds per day
    sdays = 86400
    length = int(opts.length)
    now = time.time()
    #now = 1391353032
    then = now - sdays * length
    url = opts.url
    api = DbsApi(url=url)
    outputDataSets = []
    f = [0 for x in range(length)]
    min_cdate = int(then)
    max_cdate = int(now)
    if opts.ds_access_type == "ALL":
        outputDataSetsValid = api.listDatasets(dataset=dataset, min_cdate=min_cdate - 30 * sdays, max_cdate=max_cdate, dataset_access_type="VALID")
        outputDataSetsProd = api.listDatasets(dataset=dataset, min_cdate=min_cdate - 30 * sdays, max_cdate=max_cdate, dataset_access_type="PRODUCTION")
        outputDataSets = outputDataSetsValid + outputDataSetsProd
    elif opts.ds_access_type == "VALID":
        outputDataSets = api.listDatasets(dataset=dataset, min_cdate=min_cdate - 30 * sdays, max_cdate=max_cdate, dataset_access_type="VALID")
    elif opts.ds_access_type == "PRODUCTION":
        outputDataSets = api.listDatasets(dataset=dataset, min_cdate=min_cdate - 30 * sdays, max_cdate=max_cdate, dataset_access_type="PRODUCTION")
    for dataset in outputDataSets:
        outputBlocks = api.listBlocks(dataset=dataset["dataset"], detail=1, min_cdate=min_cdate, max_cdate=max_cdate)
        blockList = []
        blockCdate = {}
        for block in outputBlocks:
            blockList.append(block["block_name"])
            blockCdate[block["block_name"]] = block["creation_date"]
        blockSum = []
        if blockList:
            blockSum = api.listBlockSummaries(block_name=blockList, detail=1)
        for b in blockSum:
            cdate = blockCdate[b["block_name"]]
            day = int((now - cdate) / sdays)
            f[day] = f[day] + b["num_event"]
    for i in range(length):
        #print (length-1)-i, ": ", f[i], " ", sum(item['all'] for item in f[i:length])
        print i, ": ", f[(length - 1) - i], " ", sum(item for item in f[(length - 1) - i:length])
    sys.exit(0)
# size of provided dataset
#-------------------------
# instantiate an API
dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')

# first test whether dataset is valid
dbsList = dbsapi.listDatasets(dataset=dataset, dataset_access_type='VALID')
datasetInvalid = False
if dbsList == []:
    datasetInvalid = True
    print ' Dataset does not exist or is invalid. Exit now!\n'
    sys.exit(1)

# determine size and number of files
size = str(sum([block['file_size'] for block in dbsapi.listBlockSummaries(dataset=dataset)])) + 'UB'
sizeGb = convertSizeToGb(size)

# in case this is an open subscription we need to adjust sizeGb to the expected size
if expectedSizeGb > 0:
    sizeGb = expectedSizeGb

print ' SIZE: %.1f GB' % (sizeGb)

# prepare subscription list
datasets = []
datasets.append(dataset)

# first make sure this dataset is not owned by the DataOps group anymore at the Tier-1 site(s)
#-----------------------------------------------------------------------------------------
def getDatasetSize(dataset):
    dbsapi = DbsApi(url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader')
    blocks = dbsapi.listBlockSummaries(dataset=dataset, detail=True)
    ## put everything in terms of GB
    return sum([block['file_size'] / (1024.**3) for block in blocks])
def QueryForRquestedEventsPerDay(dbsurl, couchurl, outputdict, data_regexp):
    #
    # query couch DB and extract the list of requests per day
    #
    # these status values are for rejected workflows
    rejected_status = ['rejected', 'rejected-archived']
    basenames_to_print = ['SUS-Spring14miniaod-00017_00029_v0_']

    # load requests from json
    header = {'Content-type': 'application/json', 'Accept': 'application/json'}
    conn = httplib.HTTPConnection(couchurl)
    conn.request("GET", '/latency_analytics/_design/latency/_view/maria', headers=header)
    response = conn.getresponse()
    data = response.read()
    conn.close()
    myString = data.decode('utf-8')
    workflows = json.loads(myString)['rows']

    # first group workflows by workflow basename to identify actual requests in case of clones or other copies
    basenames = {}
    for entry in workflows:
        # extract information
        workflowname = entry['id']
        info = entry['value']
        workflow_dict = {
            'Campaign': info[0],
            'Tier': info[1],
            'Task type': info[2],
            'Status': info[3],
            'Priority': info[4],
            'Requested events': info[5],
            '% Complete': info[6],
            'Completed events': 0,
            'Request date': time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(info[7])),
            'Processing dataset name': '',
            'Input Dataset': info[8],
            'Output Datasets': info[9],
            'Filter efficiency': info[10],
            'Run white list': info[11],
        }
        if workflowname == 'pdmvserv_SUS-Spring14miniaod-00016_00029_v0__140728_120018_4477':
            print workflowname, workflow_dict
        # filter for data_regexp
        match = False
        try:
            for output_dataset in workflow_dict['Output Datasets']:
                if re.compile(data_regexp).match(output_dataset) is not None:
                    match = True
                    break
        except:
            for output_dataset in workflow_dict['Output Datasets']:
                if re.compile(data_regexp).match(output_dataset[0]) is not None:
                    match = True
                    break
        if match == False:
            continue
        # extract the workflow basename: split by '_', remove the first field (the username who
        # injected the workflow) and the last 3 fields (date, time and fractions of a second)
        workflowname_array = workflowname.split('_')
        basename_array = workflowname_array[1:-3]
        # continue if basename_array length == 0
        if len(basename_array) == 0:
            continue
        # filter out ACDC and tests
        if workflowname.lower().count('acdc') > 0:
            continue
        if workflowname.lower().count('test') > 0:
            continue
        # Jen's username is jen_a, so after the split above an 'a' field could remain; remove it
        if basename_array[0].lower() == 'a':
            basename_array = basename_array[1:]
        # if extension, remove EXT from the beginning of the basename
        if basename_array[0].lower() == 'ext':
            basename_array = basename_array[1:]
        basename = '_'.join(basename_array)
        requestdatetime = int(workflowname_array[-1]) + int(workflowname_array[-2]) * 1E4 + int(workflowname_array[-3]) * 1E10
        if basename not in basenames.keys():
            basenames[basename] = {}
        basenames[basename][requestdatetime] = [workflowname, workflow_dict]

    # select the original workflow, removing clones, etc.
    selected = {}
    rejected = {}
    for basename in basenames.keys():
        if basename in basenames_to_print:
            print 'selected basename:', basename
            for date in sorted(basenames[basename].keys()):
                print basenames[basename][date]
        if basename in selected.keys() or basename in rejected.keys():
            continue
        # look at all the workflow names of a basename ordered by injection time;
        # if the first workflow name of a basename ordered by injection time is not in a rejected status, select it
        if basenames[basename][sorted(basenames[basename].keys())[0]][1]['Status'] not in rejected_status:
            selected[basename] = basenames[basename][sorted(basenames[basename].keys())[0]]
        else:
            # if the last workflow is not in a rejected status (indication that the workflow never started to run), choose the first workflow as reference
            if basenames[basename][sorted(basenames[basename].keys())[-1]][1]['Status'] not in rejected_status:
                selected[basename] = basenames[basename][sorted(basenames[basename].keys())[0]]
            else:
                # if there is only one workflow for the basename and its status is rejected
                if len(basenames[basename]) == 1 and basenames[basename][basenames[basename].keys()[0]][1]['Status'] in rejected_status:
                    rejected[basename] = basenames[basename][basenames[basename].keys()[0]]
                else:
                    # go through the workflow names per basename ordered by injection time, select the first one whose status is not a rejected status
                    firstvalidentry = None
                    for entry in sorted(basenames[basename].keys()):
                        if basenames[basename][entry][1]['Status'] not in rejected_status:
                            firstvalidentry = entry
                            break
                    if firstvalidentry != None:
                        selected[basename] = basenames[basename][firstvalidentry]
                    else:
                        # check if there are only workflow names per basename that are in a rejected status
                        nonrejectedstatus = False
                        for entry in basenames[basename].keys():
                            if basenames[basename][entry][1]['Status'] not in rejected_status:
                                nonrejectedstatus = True
                                break
                        if nonrejectedstatus == False:
                            # select the last one
                            rejected[basename] = basenames[basename][sorted(basenames[basename].keys())[-1]]
        if basename in selected.keys() or basename in rejected.keys():
            continue
        print 'could not decide which workflow is the original workflow for basename:', basename
        for date in sorted(basenames[basename].keys()):
            print basenames[basename][date]
        sys.exit(1)

    # loop over selected workflows and fill requested events per day;
    # only fill a day if it is defined as a key of outputdict
    api = DbsApi(url=dbsurl)
    for basename in selected.keys():
        print 'selected basename:', basename
        for date in sorted(basenames[basename].keys()):
            print basenames[basename][date]
        workflowname = selected[basename][0]
        workflow_dict = selected[basename][1]
        # extract the unix time of the start of the day of the request date
        request_date = datetime.datetime.strptime(workflow_dict['Request date'], "%Y-%m-%d %H:%M:%S")
        request_date = request_date.replace(tzinfo=pytz.timezone('UTC'))
        request_day = int(datetime.datetime(request_date.year, request_date.month, request_date.day, 0, 0, 0, 0, tzinfo=pytz.timezone('UTC')).strftime("%s"))
        if str(request_day) not in outputdict.keys():
            continue
        if 'REQUESTED' not in outputdict[str(request_day)].keys():
            outputdict[str(request_day)]['REQUESTED'] = 0
        if 'WORKFLOWS' not in outputdict[str(request_day)].keys():
            outputdict[str(request_day)]['WORKFLOWS'] = []
        outputdict[str(request_day)]['WORKFLOWS'].append(workflowname)
        request_events = int(workflow_dict['Requested events'])
        if request_events == 0 and workflow_dict['Input Dataset'] != '':
            blocks = api.listBlocks(dataset=workflow_dict['Input Dataset'], detail=False)
            for block in blocks:
                reply = api.listBlockSummaries(block_name=block['block_name'])
                request_events += reply[0]['num_event']
        if workflow_dict['Filter efficiency'] == None:
            outputdict[str(request_day)]['REQUESTED'] += int(request_events)
        else:
            outputdict[str(request_day)]['REQUESTED'] += int(request_events) * float(workflow_dict['Filter efficiency'])
if options.era:
    datasets = api.listDatasets(acquisition_era_name=options.era, detail=True)
    nDatasetsToCheck = 0
    for ds in datasets:
        if datatiers and not ds['data_tier_name'] in datatiers:
            continue
        nDatasetsToCheck += 1
    print >>log, "Number of datasets to check: %d" % nDatasetsToCheck
    print "Number of datasets to check: %d" % nDatasetsToCheck
    for ds in datasets:
        if datatiers and not ds['data_tier_name'] in datatiers:
            continue
        print >>log, "\nDataset:", ds['dataset'],
        blocks = api.listBlockSummaries(dataset=ds['dataset'])
        ds_size = blocks[0]['file_size'] / pow(2, 30)
        print >>log, " \t %0.0f GB" % (ds_size)
        report = get_subscription_information(ds['dataset'])
        if options.ignore and report['firstSubscription'] != None and (time.time() - report['firstSubscription']) < 86400 * options.ignore:
            print >>log, "Skip the dataset availability check since the first subscription is very recent"
            continue
        if report['nComplete'] == 0:
            summary["NoCompleteCopyAnywhere"].append(ds['dataset'])
        if report['nIncomplete'] == 0:
            summary["Lost"].append(ds['dataset'])
        if report['nAnalysisOpsComplete'] == 0:
            summary["NoCompleteCopyAnalysisOps"].append(ds['dataset'])

pprint.pprint(summary)
def getBlockSizeDataSet(dataset):
    # initialize API to DBS3
    dbsapi = DbsApi(url=dbs3_url)
    # retrieve dataset summary
    reply = dbsapi.listBlockSummaries(dataset=dataset)
    # return the total size in TB (1e12 bytes)
    return int(reply[0]['file_size']) / 1000000000000.0