def test_xml_parser(self):
    """ Test functionality of xml_parser """
    xmldata = """<?xml version='1.0' encoding='ISO-8859-1'?>
<phedex attr="a">
<block bytes="1">
<file size="10">
</file>
</block>
</phedex>
"""
    fdesc = tempfile.NamedTemporaryFile()
    fname = fdesc.name
    stream = open(fname, 'w')
    stream.write(xmldata)
    stream.close()
    stream = open(fname, 'r')
    gen = xml_parser(stream, "block", [])
    result = next(gen)
    expect = {'block': {'bytes': 1, 'file': {'size': 10}}}
    self.assertEqual(expect, result)
    stream = open(fname, 'r')
    gen = xml_parser(stream, "file", ["block.bytes"])
    result = next(gen)
    expect = {'file': {'block': {'bytes': 1}, 'size': 10}}
    self.assertEqual(expect, result)
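# --- Illustrative sketch (not part of the test suite): the test above drives
# xml_parser through a temporary file, but any file-like object works, as
# files4site below demonstrates with StringIO. A minimal in-memory variant
# for Python 3; the import path for xml_parser is an assumption, not taken
# from this code:
import io
# from DAS.utils.xml_parser import xml_parser  # hypothetical import path

def demo_xml_parser(xml_parser):
    "Feed xml_parser from an in-memory stream instead of a temp file"
    xmldata = """<?xml version='1.0' encoding='ISO-8859-1'?>
<phedex attr="a"><block bytes="1"><file size="10"/></block></phedex>
"""
    stream = io.StringIO(xmldata)
    # yields one dict per <block> element, with numeric attributes coerced
    return next(xml_parser(stream, "block", []))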
def test_xml_parser(self):
    """ Test functionality of xml_parser """
    xmldata = """<?xml version='1.0' encoding='ISO-8859-1'?>
<phedex attr="a">
<block bytes="1">
<file size="10">
</file>
</block>
</phedex>
"""
    fdesc = tempfile.NamedTemporaryFile()
    fname = fdesc.name
    stream = file(fname, 'w')
    stream.write(xmldata)
    stream.close()
    stream = file(fname, 'r')
    gen = xml_parser(stream, "block", [])
    result = gen.next()
    expect = {'block': {'bytes': 1, 'file': {'size': 10}}}
    self.assertEqual(expect, result)
    stream = file(fname, 'r')
    gen = xml_parser(stream, "file", ["block.bytes"])
    result = gen.next()
    expect = {'file': {'block': {'bytes': 1}, 'size': 10}}
    self.assertEqual(expect, result)
def phedex_files(phedex_url, kwds):
    "Get file information from Phedex"
    params = dict(kwds) # parameters to be sent to Phedex
    site = kwds.get('site', None)
    if site and phedex_node_pattern.match(site):
        if not site.endswith('*'):
            # this accounts for look-up of site names w/o _Buffer or _MSS
            site += '*'
        params.update({'node': site})
        params.pop('site')
    elif site and se_pattern.match(site):
        params.update({'se': site})
        params.pop('site')
    else:
        return
    expire = 600 # set some expire since we're not going to use it
    headers = {'Accept': 'text/xml'}
    source, expire = \
        getdata(phedex_url, params, headers, expire, ckey=CKEY, cert=CERT,
                system='phedex')
    tags = 'block.file.name'
    prim_key = 'block'
    for rec in xml_parser(source, prim_key, tags):
        ddict = DotDict(rec)
        files = ddict.get('block.file')
        if not isinstance(files, list):
            files = [files]
        for row in files:
            yield row['name']
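# --- A hedged sketch of the site/SE dispatch used above. The real
# phedex_node_pattern and se_pattern are defined elsewhere in the codebase;
# the regexes below are illustrative stand-ins only: PhEDEx node names look
# like T1_US_FNAL, while storage elements (SE) are host names such as
# srm.cern.ch.
import re

demo_node_pattern = re.compile(r'^T[0-9]_[A-Z]+')        # assumption, not the DAS regex
demo_se_pattern = re.compile(r'^[a-z0-9-]+\.[a-z0-9-]+') # assumption, not the DAS regex

def demo_site_params(site):
    "Mirror the node/se branching of phedex_files for a given site value"
    params = {}
    if site and demo_node_pattern.match(site):
        if not site.endswith('*'):
            site += '*'  # match site names w/o _Buffer or _MSS suffixes
        params['node'] = site
    elif site and demo_se_pattern.match(site):
        params['se'] = site
    return params

# demo_site_params('T1_US_FNAL')  -> {'node': 'T1_US_FNAL*'}
# demo_site_params('srm.cern.ch') -> {'se': 'srm.cern.ch'}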
def test_xml_parser_2(self):
    """ Test functionality of xml_parser """
    xmldata = """<?xml version='1.0' encoding='ISO-8859-1'?>
<RUNS>
<RUN id="751084">
<LUMI>
<NUMBER>1</NUMBER>
<PROP>avx</PROP>
<TEST>
<FOO>1</FOO>
<BOO>2</BOO>
</TEST>
</LUMI>
</RUN>
</RUNS>
"""
    fdesc = tempfile.NamedTemporaryFile()
    fname = fdesc.name
    stream = file(fname, 'w')
    stream.write(xmldata)
    stream.close()
    stream = file(fname, 'r')
    gen = xml_parser(stream, "RUNS", [])
    result = gen.next()
    expect = {'RUNS': {'RUN': {'id': 751084.0,
                               'LUMI': {'TEST': {'FOO': 1, 'BOO': 2},
                                        'NUMBER': 1,
                                        'PROP': 'avx'}}}}
    self.assertEqual(expect, result)
def files4site(phedex_url, files, site):
    "Find site for given files"
    params = {}
    if site and phedex_node_pattern.match(site):
        if not site.endswith('*'):
            # this accounts for look-up of site names w/o _Buffer or _MSS
            site += '*'
        params.update({'node': site})
    elif site and se_pattern.match(site):
        params.update({'se': site})
    else:
        return
    sname = urllib.urlencode(params)
    urls = []
    for fname in files:
        url = '%s?lfn=%s&%s' % (phedex_url, fname, sname)
        urls.append(url)
    tags = 'block.replica.node'
    prim_key = 'block'
    gen = urlfetch_getdata(urls, CKEY, CERT)
    for rec in gen:
        if 'error' in rec:
            yield rec
        else:
            # convert record string into StringIO for xml_parser
            source = StringIO.StringIO(rec['data'])
            for row in xml_parser(source, prim_key, tags):
                fobj = row['block']['file']
                fname = fobj['name']
                replica = fobj['replica']
                for item in replica:
                    yield fname
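# --- Minimal, self-contained sketch of the per-file URL construction above.
# Note that files4site interpolates the LFN verbatim; urllib quoting (shown
# here, with a Python 2/3 compatible import) would be the safer choice if
# LFNs may contain characters needing escapes.
try:
    from urllib.parse import urlencode, quote  # Python 3
except ImportError:
    from urllib import urlencode, quote        # Python 2

def demo_replica_urls(phedex_url, files, params):
    "Build one fileReplicas URL per LFN, mirroring files4site"
    sname = urlencode(params)
    return ['%s?lfn=%s&%s' % (phedex_url, quote(fname), sname)
            for fname in files]

# demo_replica_urls('https://example.org/fileReplicas',  # hypothetical URL
#                   ['/store/a.root'], {'node': 'T1_US_FNAL*'})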
def site4dataset(dbs_url, phedex_api, args, expire):
    "Yield site information about given dataset"
    # DBS part
    dataset = args['dataset']
    try:
        totblocks, totfiles = dataset_summary(dbs_url, dataset)
    except Exception as err:
        error = str(err)
        reason = "Can't find #block, #files info in DBS for dataset=%s" \
                % dataset
        yield {'site': {'error': error, 'reason': reason}}
        return
    # Phedex part
    phedex_args = {'dataset': args['dataset']}
    headers = {'Accept': 'text/xml'}
    source, expire = \
        getdata(phedex_api, phedex_args, headers, expire, post=True,
                system='phedex')
    prim_key = 'block'
    tags = 'block.replica.node'
    site_info = {}
    for rec in xml_parser(source, prim_key, tags):
        ddict = DotDict(rec)
        replicas = ddict.get('block.replica')
        if not isinstance(replicas, list):
            replicas = [replicas]
        for row in replicas:
            if not row or 'node' not in row:
                continue
            node = row['node']
            files = int(row['files'])
            complete = 1 if row['complete'] == 'y' else 0
            if node in site_info:
                files = site_info[node]['files'] + files
                nblks = site_info[node]['blocks'] + 1
                bc_val = site_info[node]['blocks_complete']
                b_complete = bc_val + 1 if complete else bc_val
            else:
                b_complete = 1 if complete else 0
                nblks = 1
            site_info[node] = {'files': files, 'blocks': nblks,
                               'blocks_complete': b_complete}
    row = {}
    for key, val in site_info.iteritems():
        if totfiles:
            nfiles = '%5.2f%%' % (100 * float(val['files']) / totfiles)
        else:
            nfiles = 'N/A'
        if totblocks:
            nblks = '%5.2f%%' % (100 * float(val['blocks']) / totblocks)
        else:
            nblks = 'N/A'
        ratio = float(val['blocks_complete']) / val['blocks']
        b_completion = '%5.2f%%' % (100 * ratio)
        row = {'site': {'name': key,
                        'dataset_fraction': nfiles,
                        'block_fraction': nblks,
                        'block_completion': b_completion}}
        yield row
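# --- A self-contained rendition of the per-node accumulation performed in
# site4dataset, useful for seeing the bookkeeping in isolation. Input rows
# mimic PhEDEx block.replica records; the values are made up.
def demo_accumulate(replicas):
    "Count files, blocks and complete blocks per node, as site4dataset does"
    site_info = {}
    for row in replicas:
        node = row['node']
        files = int(row['files'])
        complete = 1 if row['complete'] == 'y' else 0
        if node in site_info:
            files += site_info[node]['files']
            nblks = site_info[node]['blocks'] + 1
            bc_val = site_info[node]['blocks_complete']
            b_complete = bc_val + 1 if complete else bc_val
        else:
            b_complete = 1 if complete else 0
            nblks = 1
        site_info[node] = {'files': files, 'blocks': nblks,
                           'blocks_complete': b_complete}
    return site_info

# demo_accumulate([
#     {'node': 'T1_US_FNAL_MSS', 'files': '10', 'complete': 'y'},
#     {'node': 'T1_US_FNAL_MSS', 'files': '5', 'complete': 'n'},
# ]) -> {'T1_US_FNAL_MSS': {'files': 15, 'blocks': 2, 'blocks_complete': 1}}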
def parser(self, dasquery, dformat, data, api):
    """
    DAS data parser. Input parameters:

    - *dasquery* input DAS query
    - *dformat* is a data format, e.g. XML, JSON
    - *data* is a data source, either file-like object or actual data
    - *api* is API name
    """
    prim_key = self.dasmapping.primary_key(self.name, api)
    apitag = self.dasmapping.apitag(self.name, api)
    counter = 0
    if dformat.lower() == 'xml':
        tags = self.dasmapping.api2daskey(self.name, api)
        gen = xml_parser(data, prim_key, tags)
        for row in gen:
            counter += 1
            yield row
    elif dformat.lower() == 'json' or dformat.lower() == 'dasjson':
        gen = json_parser(data, self.logger)
        das_dict = {}
        for row in gen:
            if dformat.lower() == 'dasjson':
                for key, val in row.iteritems():
                    if key != 'results':
                        das_dict[key] = val
                row = row['results']
                self.analytics.update_apicall(\
                        dasquery.mongo_query, das_dict)
            if apitag and row.has_key(apitag):
                row = row[apitag]
            if isinstance(row, list):
                for item in row:
                    if item.has_key(prim_key):
                        counter += 1
                        yield item
                    else:
                        counter += 1
                        yield {prim_key: item}
            else:
                if row.has_key(prim_key):
                    counter += 1
                    yield row
                else:
                    counter += 1
                    yield {prim_key: row}
    else:
        msg = 'Unsupported data format="%s", API="%s"' % (dformat, api)
        raise Exception(msg)
    msg = "api=%s, format=%s " % (api, dformat)
    msg += "prim_key=%s yield %s rows" % (prim_key, counter)
    self.logger.info(msg)
def parser(self, dasquery, dformat, data, api):
    """
    DAS data parser. Input parameters:

    - *dasquery* input DAS query
    - *dformat* is a data format, e.g. XML, JSON
    - *data* is a data source, either file-like object or actual data
    - *api* is API name
    """
    prim_key = self.dasmapping.primary_key(self.name, api)
    counter = 0
    if dformat.lower() == 'xml':
        tags = self.dasmapping.api2daskey(self.name, api)
        gen = xml_parser(data, prim_key, tags)
        for row in gen:
            counter += 1
            yield row
    elif dformat.lower() == 'json' or dformat.lower() == 'dasjson':
        gen = json_parser(data, self.logger)
        das_dict = {}
        for row in gen:
            if dformat.lower() == 'dasjson':
                for key, val in row.items():
                    if key != 'results':
                        das_dict[key] = val
                row = row['results']
            if isinstance(row, list):
                for item in row:
                    if item:
                        if prim_key in item:
                            counter += 1
                            yield item
                        else:
                            counter += 1
                            yield {prim_key: item}
            else:
                if prim_key in row:
                    counter += 1
                    yield row
                else:
                    counter += 1
                    yield {prim_key: row}
    else:
        msg = 'Unsupported data format="%s", API="%s"' % (dformat, api)
        raise Exception(msg)
    msg = "api=%s, format=%s " % (api, dformat)
    msg += "prim_key=%s yield %s rows" % (prim_key, counter)
    self.logger.info(msg)
def parser(self, dasquery, dformat, data, api):
    """
    DAS data parser. Input parameters:

    - *dasquery* input DAS query
    - *dformat* is a data format, e.g. XML, JSON
    - *data* is a data source, either file-like object or actual data
    - *api* is API name
    """
    prim_key = self.dasmapping.primary_key(self.name, api)
    counter = 0
    if dformat.lower() == "xml":
        tags = self.dasmapping.api2daskey(self.name, api)
        gen = xml_parser(data, prim_key, tags)
        for row in gen:
            counter += 1
            yield row
    elif dformat.lower() == "json" or dformat.lower() == "dasjson":
        gen = json_parser(data, self.logger)
        das_dict = {}
        for row in gen:
            if dformat.lower() == "dasjson":
                for key, val in row.iteritems():
                    if key != "results":
                        das_dict[key] = val
                row = row["results"]
            if isinstance(row, list):
                for item in row:
                    if prim_key in item:
                        counter += 1
                        yield item
                    else:
                        counter += 1
                        yield {prim_key: item}
            else:
                if prim_key in row:
                    counter += 1
                    yield row
                else:
                    counter += 1
                    yield {prim_key: row}
    else:
        msg = 'Unsupported data format="%s", API="%s"' % (dformat, api)
        raise Exception(msg)
    msg = "api=%s, format=%s " % (api, dformat)
    msg += "prim_key=%s yield %s rows" % (prim_key, counter)
    self.logger.info(msg)
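# --- The three parser variants above share one normalization rule for JSON
# rows: lists are unrolled, and any record that does not already carry the
# primary key gets wrapped under it. A standalone sketch of that rule:
def demo_normalize(rows, prim_key):
    "Unroll lists and wrap records under prim_key, as the parsers above do"
    for row in rows:
        items = row if isinstance(row, list) else [row]
        for item in items:
            if isinstance(item, dict) and prim_key in item:
                yield item
            else:
                yield {prim_key: item}

# list(demo_normalize([{'file': {'name': 'a'}}, [{'size': 1}]], 'file'))
# -> [{'file': {'name': 'a'}}, {'file': {'size': 1}}]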
def test_xml_parser_2(self):
    """ Test functionality of xml_parser """
    xmldata = """<?xml version='1.0' encoding='ISO-8859-1'?>
<RUNS>
<RUN id="751084">
<LUMI>
<NUMBER>1</NUMBER>
<PROP>avx</PROP>
<TEST>
<FOO>1</FOO>
<BOO>2</BOO>
</TEST>
</LUMI>
</RUN>
</RUNS>
"""
    fdesc = tempfile.NamedTemporaryFile()
    fname = fdesc.name
    stream = open(fname, 'w')
    stream.write(xmldata)
    stream.close()
    stream = open(fname, 'r')
    gen = xml_parser(stream, "RUNS", [])
    result = next(gen)
    expect = {
        'RUNS': {
            'RUN': {
                'id': 751084.0,
                'LUMI': {
                    'TEST': {
                        'FOO': 1,
                        'BOO': 2
                    },
                    'NUMBER': 1,
                    'PROP': 'avx'
                }
            }
        }
    }
    self.assertEqual(expect, result)
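# --- Worth noting from the expected dict above: xml_parser coerces numeric
# content, and the RUN id attribute comes back as the float 751084.0 while
# the <NUMBER> element text comes back as the int 1. Consumers comparing
# parsed values should therefore not assume a particular numeric type:
def demo_same_number(result):
    "Type-insensitive check of parsed RUN values (values from the test above)"
    run = result['RUNS']['RUN']
    return int(run['id']) == 751084 and run['LUMI']['NUMBER'] == 1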
def parser(self, query, dformat, source, api):
    """
    Phedex data-service parser.
    """
    tags = []
    if api == 'blockReplicas':
        prim_key = 'block'
    elif api == 'fileReplicas':
        prim_key = 'file'
        tags = 'block.name'
    elif api == 'fileReplicas4dataset':
        prim_key = 'file'
        tags = 'block.name'
    elif api == 'fileReplicas4file':
        prim_key = 'file'
        tags = 'block.name'
    elif api == 'dataset4site':
        prim_key = 'block'
        tags = 'block'
    elif api == 'dataset4se':
        prim_key = 'block'
        tags = 'block'
    elif api == 'dataset4site_group':
        prim_key = 'block'
        tags = 'block'
    elif api == 'dataset4se_group':
        prim_key = 'block'
        tags = 'block'
    elif api == 'site4dataset':
        prim_key = 'block'
        tags = 'block.replica.node'
    elif api == 'site4block':
        prim_key = 'block'
        tags = 'block.replica.node'
    elif api == 'site4file':
        prim_key = 'block'
        tags = 'block.replica.node'
    elif api == 'nodes':
        prim_key = 'node'
    elif api == 'nodeusage':
        prim_key = 'node'
    elif api == 'groups':
        prim_key = 'group'
    elif api == 'groupusage':
        prim_key = 'node'
    elif api == 'lfn2pfn':
        prim_key = 'mapping'
    elif api == 'tfc':
        prim_key = 'storage-mapping'
    else:
        msg = 'PhedexService::parser, unsupported %s API %s' \
                % (self.name, api)
        raise Exception(msg)
    gen = xml_parser(source, prim_key, tags)
    site_names = []
    seen = set()
    tot_files = 0
    site_info_dict = {}
    for row in gen:
        if api == 'nodeusage':
            if row.has_key('node') and row['node'].has_key('name'):
                row['name'] = row['node']['name']
        if row.has_key('block') and row['block'].has_key('name'):
            if not row['block'].has_key('dataset'):
                dataset = row['block']['name'].split('#')[0]
                row['block']['dataset'] = dataset
        if api == 'site4dataset' or api == 'site4block':
            item = row['block']['replica']
            if isinstance(item, list):
                for replica in item:
                    result = get_replica_info(replica)
                    site_info(site_info_dict, row['block'], replica)
                    if not replica['files']:
                        continue
                    if result not in site_names:
                        site_names.append(result)
            elif isinstance(item, dict):
                replica = item
                result = get_replica_info(replica)
                site_info(site_info_dict, row['block'], replica)
                if not replica['files']:
                    continue
                if result not in site_names:
                    site_names.append(result)
        elif api == 'site4file':
            item = row['block']['file']['replica']
            if isinstance(item, list):
                for replica in item:
                    result = get_replica_info(replica)
                    if result not in site_names:
                        site_names.append(result)
            elif isinstance(item, dict):
                replica = item
                result = get_replica_info(replica)
                if result not in site_names:
                    site_names.append(result)
        elif api == 'dataset4site' or api == 'dataset4se' or \
            api == 'dataset4site_group' or api == 'dataset4se_group':
            if row.has_key('block'):
                dataset = row['block']['name'].split('#')[0]
                seen.add(dataset)
        elif api == 'fileReplicas' or api == 'fileReplicas4file' or \
            api == 'fileReplicas4dataset':
            try:
                if row.has_key('file') and isinstance(row['file'], dict):
                    rec = row['file']
                    cksum = rec['checksum']
                    if cksum.find(',') != -1:
                        adler, cksum = cksum.split(',')
                        rec['adler32'] = adler.replace('adler32:', '')
                        rec['checksum'] = int(cksum.replace('cksum:', ''))
            except:
                pass
            yield row
        else:
            yield row
    if api == 'site4dataset' or api == 'site4block':
        for row in site_names:
            name = row['name']
            if site_info_dict.has_key(name):
                sdict = site_info_dict[name]
                sfiles = float(sdict['files'])
                tot_files = float(sdict['totfiles'])
                file_occ = '%5.2f%%' % (100 * sfiles / tot_files)
            else:
                file_occ = '0%'
            row['replica_fraction'] = file_occ.strip()
            yield row
    if api == 'site4file':
        for row in site_names:
            yield row
    del site_names
    del site_info_dict
    if seen:
        for dataset in seen:
            yield {'dataset': dict(name=dataset)}
    del seen
def helper(self, api, args, expire):
    """
    Class helper function which yields results for a given set
    of input parameters. It yields the data record which must
    contain combined attribute corresponding to systems used to
    produce record content.
    """
    dbs_url = self.map[api]['services'][self.dbs]
    phedex_url = self.map[api]['services']['phedex']
    # make phedex_api from url, but use xml version for processing
    phedex_api = phedex_url.replace('/json/', '/xml/') + '/blockReplicas'
    if api == 'dataset4site_release' or \
        api == 'dataset4site_release_parent' or \
        api == 'child4site_release_dataset':
        # DBS part
        datasets = set()
        release = args['release']
        parent = args.get('parent', None)
        for row in dbs_dataset4release_parent(dbs_url, release, parent):
            datasets.add(row)
        # Phedex part
        if args['site'].find('.') != -1: # it is SE
            phedex_args = {'dataset': list(datasets),
                           'se': '%s' % args['site']}
        else:
            phedex_args = {'dataset': list(datasets),
                           'node': '%s*' % args['site']}
        headers = {'Accept': 'text/xml'}
        source, expire = \
            getdata(phedex_api, phedex_args, headers, expire,
                    system='phedex')
        prim_key = 'block'
        tags = 'block.replica.node'
        found = {}
        for rec in xml_parser(source, prim_key, tags):
            ddict = DotDict(rec)
            block = ddict.get('block.name')
            bbytes = ddict.get('block.bytes')
            files = ddict.get('block.files')
            found_dataset = block.split('#')[0]
            if found_dataset in found:
                val = found[found_dataset]
                found[found_dataset] = {'bytes': val['bytes'] + bbytes,
                                        'files': val['files'] + files}
            else:
                found[found_dataset] = {'bytes': bbytes, 'files': files}
        for name, val in found.items():
            record = dict(name=name, size=val['bytes'], files=val['files'])
            if api == 'child4site_release_dataset':
                yield {'child': record}
            else:
                yield {'dataset': record}
        del datasets
        del found
    if api == 'site4block':
        pass
    if api == 'site4dataset':
        try:
            gen = site4dataset(dbs_url, phedex_api, args, expire)
            for row in gen:
                sname = row.get('site', {}).get('name', '')
                skind = self.site_info(phedex_url, sname)
                row['site'].update({'kind': skind})
                yield row
        except Exception as err:
            print_exc(err)
            tstamp = dastimestamp('')
            msg = tstamp + ' Exception while processing DBS/Phedex info:'
            msg += str(err)
            row = {'site': {'name': 'Fail to look-up site info',
                            'error': msg,
                            'dataset_fraction': 'N/A',
                            'block_fraction': 'N/A',
                            'block_completion': 'N/A'},
                   'error': msg}
            yield row
    if api == 'files4dataset_runs_site' or \
        api == 'files4block_runs_site':
        run_value = args.get('run', [])
        if isinstance(run_value, dict) and '$in' in run_value:
            runs = run_value['$in']
        elif isinstance(run_value, list):
            runs = run_value
        else:
            if int_number_pattern.match(str(run_value)):
                runs = [run_value]
            else:
                runs = []
        args.update({'runs': runs})
        files = dbs_find('file', dbs_url, args)
        site = args.get('site')
        phedex_api = phedex_url.replace('/json/', '/xml/') + '/fileReplicas'
        for fname in files4site(phedex_api, files, site):
            yield {'file': {'name': fname}}
def parser(self, query, dformat, source, api):
    """
    Phedex data-service parser.
    """
    tags = []
    if api == 'blockReplicas':
        prim_key = 'block'
    elif api == 'fileReplicas':
        prim_key = 'file'
        tags = 'block.name'
    elif api == 'fileReplicas4dataset':
        prim_key = 'file'
        tags = 'block.name'
    elif api == 'fileReplicas4file':
        prim_key = 'file'
        tags = 'block.name'
    elif api == 'dataset4site':
        prim_key = 'block'
        tags = 'block'
    elif api == 'dataset4se':
        prim_key = 'block'
        tags = 'block'
    elif api == 'dataset4site_group':
        prim_key = 'block'
        tags = 'block'
    elif api == 'dataset4se_group':
        prim_key = 'block'
        tags = 'block'
    elif api == 'site4dataset':
        prim_key = 'block'
        tags = 'block.replica.node'
    elif api == 'site4block':
        prim_key = 'block'
        tags = 'block.replica.node'
    elif api == 'site4file':
        prim_key = 'block'
        tags = 'block.replica.node'
    elif api == 'nodes':
        prim_key = 'node'
    elif api == 'nodeusage':
        prim_key = 'node'
    elif api == 'groups':
        prim_key = 'group'
    elif api == 'groupusage':
        prim_key = 'node'
    elif api == 'lfn2pfn':
        prim_key = 'mapping'
    elif api == 'tfc':
        prim_key = 'storage-mapping'
    else:
        msg = 'PhedexService::parser, unsupported %s API %s' \
                % (self.name, api)
        raise Exception(msg)
    gen = xml_parser(source, prim_key, tags)
    site_names = []
    seen = set()
    tot_files = 0
    site_info_dict = {}
    for row in gen:
        if api == 'nodeusage':
            if 'node' in row and 'name' in row['node']:
                row['name'] = row['node']['name']
        if 'block' in row and 'name' in row['block']:
            if 'dataset' not in row['block']:
                dataset = row['block']['name'].split('#')[0]
                row['block']['dataset'] = dataset
        if api == 'site4dataset' or api == 'site4block':
            item = row['block']['replica']
            if isinstance(item, list):
                for replica in item:
                    result = get_replica_info(replica)
                    site_info(site_info_dict, row['block'], replica)
                    if not replica['files']:
                        continue
                    if result not in site_names:
                        site_names.append(result)
            elif isinstance(item, dict):
                replica = item
                result = get_replica_info(replica)
                site_info(site_info_dict, row['block'], replica)
                if not replica['files']:
                    continue
                if result not in site_names:
                    site_names.append(result)
        elif api == 'site4file':
            item = row['block']['file']['replica']
            if isinstance(item, list):
                for replica in item:
                    result = get_replica_info(replica)
                    if result not in site_names:
                        site_names.append(result)
            elif isinstance(item, dict):
                replica = item
                result = get_replica_info(replica)
                if result not in site_names:
                    site_names.append(result)
        elif api == 'dataset4site' or api == 'dataset4se' or \
            api == 'dataset4site_group' or api == 'dataset4se_group':
            if 'block' in row:
                dataset = row['block']['name'].split('#')[0]
                seen.add(dataset)
        elif api == 'fileReplicas' or api == 'fileReplicas4file' or \
            api == 'fileReplicas4dataset':
            try:
                if 'file' in row and isinstance(row['file'], dict):
                    rec = row['file']
                    cksum = rec['checksum']
                    for item in cksum.split(','):
                        key, val = item.split(':')
                        if key == 'cksum':
                            rec['checksum'] = int(val)
                        else:
                            rec[key] = val
            except:
                pass
            yield row
        else:
            yield row
    if api == 'site4dataset' or api == 'site4block':
        for row in site_names:
            name = row['name']
            if name in site_info_dict:
                sdict = site_info_dict[name]
                sfiles = float(sdict['files'])
                tot_files = float(sdict['totfiles'])
                file_occ = '%5.2f%%' % (100 * sfiles / tot_files)
            else:
                file_occ = '0%'
            row['replica_fraction'] = file_occ.strip()
            yield row
    if api == 'site4file':
        for row in site_names:
            yield row
    del site_names
    del site_info_dict
    if seen:
        for dataset in seen:
            yield {'dataset': dict(name=dataset)}
    del seen
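# --- The two checksum-normalization variants above parse PhEDEx strings of
# the form 'adler32:abcd1234,cksum:2436638953'. A standalone version of the
# key:value loop used in the second variant:
def demo_parse_checksum(cksum):
    "Split a PhEDEx checksum string into adler32/checksum fields"
    rec = {}
    for item in cksum.split(','):
        key, val = item.split(':')
        if key == 'cksum':
            rec['checksum'] = int(val)
        else:
            rec[key] = val
    return rec

# demo_parse_checksum('adler32:abcd1234,cksum:2436638953')
# -> {'adler32': 'abcd1234', 'checksum': 2436638953}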
def helper(self, api, args, expire):
    """
    Class helper function which yields results for a given set
    of input parameters. It yields the data record which must
    contain combined attribute corresponding to systems used to
    produce record content.
    """
    dbs_url = self.map[api]['services'][self.dbs]
    phedex_url = self.map[api]['services']['phedex']
    # make phedex_api from url, but use xml version for processing
    phedex_api = phedex_url.replace('/json/', '/xml/') + '/blockReplicas'
    if api == 'dataset4site_release' or \
        api == 'dataset4site_release_parent' or \
        api == 'child4site_release_dataset':
        # DBS part
        datasets = set()
        release = args['release']
        parent = args.get('parent', None)
        for row in dbs_dataset4release_parent(dbs_url, release, parent):
            datasets.add(row)
        # Phedex part
        if args['site'].find('.') != -1: # it is SE
            phedex_args = {
                'dataset': list(datasets),
                'se': '%s' % args['site']
            }
        else:
            phedex_args = {
                'dataset': list(datasets),
                'node': '%s*' % args['site']
            }
        headers = {'Accept': 'text/xml'}
        source, expire = \
            getdata(phedex_api, phedex_args, headers, expire,
                    system='phedex')
        prim_key = 'block'
        tags = 'block.replica.node'
        found = {}
        for rec in xml_parser(source, prim_key, tags):
            ddict = DotDict(rec)
            block = ddict.get('block.name')
            bbytes = ddict.get('block.bytes')
            files = ddict.get('block.files')
            found_dataset = block.split('#')[0]
            if found_dataset in found:
                val = found[found_dataset]
                found[found_dataset] = {
                    'bytes': val['bytes'] + bbytes,
                    'files': val['files'] + files
                }
            else:
                found[found_dataset] = {'bytes': bbytes, 'files': files}
        for name, val in found.items():
            record = dict(name=name, size=val['bytes'], files=val['files'])
            if api == 'child4site_release_dataset':
                yield {'child': record}
            else:
                yield {'dataset': record}
        del datasets
        del found
    if api == 'site4dataset':
        try:
            gen = site4dataset(dbs_url, phedex_api, args, expire)
            for row in gen:
                sname = row.get('site', {}).get('name', '')
                skind = self.site_info(phedex_url, sname)
                row['site'].update({'kind': skind})
                yield row
        except Exception as err:
            print_exc(err)
            tstamp = dastimestamp('')
            msg = tstamp + ' Exception while processing DBS/Phedex info:'
            msg += str(err)
            row = {
                'site': {
                    'name': 'Fail to look-up site info',
                    'error': msg,
                    'dataset_fraction': 'N/A',
                    'block_fraction': 'N/A',
                    'block_completion': 'N/A'
                },
                'error': msg
            }
            yield row
    if api == 'files4dataset_runs_site' or \
        api == 'files4block_runs_site':
        run_value = args.get('run', [])
        if isinstance(run_value, dict) and '$in' in run_value:
            runs = run_value['$in']
        elif isinstance(run_value, list):
            runs = run_value
        else:
            if int_number_pattern.match(str(run_value)):
                runs = [run_value]
            else:
                runs = []
        args.update({'runs': runs})
        files = dbs_find('file', dbs_url, args)
        site = args.get('site')
        phedex_api = phedex_url.replace('/json/', '/xml/') + '/fileReplicas'
        for fname in files4site(phedex_api, files, site):
            yield {'file': {'name': fname}}
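# --- Both helper variants normalize the 'run' argument, which may arrive as
# a MongoDB-style {'$in': [...]} dict, a plain list, or a single value. A
# standalone sketch; demo_int_pattern is an approximation standing in for
# the int_number_pattern defined elsewhere in the codebase:
import re

demo_int_pattern = re.compile(r'^\d+$')  # assumption, not the DAS regex

def demo_normalize_runs(run_value):
    "Reduce the run argument to a flat list of runs, as helper() does"
    if isinstance(run_value, dict) and '$in' in run_value:
        return run_value['$in']
    if isinstance(run_value, list):
        return run_value
    if demo_int_pattern.match(str(run_value)):
        return [run_value]
    return []

# demo_normalize_runs({'$in': [1, 2]}) -> [1, 2]
# demo_normalize_runs(160915)          -> [160915]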
def site4dataset(dbs_url, phedex_api, args, expire):
    "Yield site information about given dataset"
    # DBS part
    dataset = args['dataset']
    try:
        totblocks, totfiles = dataset_summary(dbs_url, dataset)
    except Exception as err:
        error = 'combined service unable to process your request'
        reason = "Fail to parse #block, #files info, %s" % str(err)
        yield {
            'site': {
                'name': 'N/A',
                'se': 'N/A',
                'error': error,
                'reason': reason
            }
        }
        return
    # Phedex part
    phedex_args = {'dataset': args['dataset']}
    headers = {'Accept': 'text/xml'}
    source, expire = \
        getdata(phedex_api, phedex_args, headers, expire,
                system='phedex')
    prim_key = 'block'
    tags = 'block.replica.node'
    site_info = {}
    for rec in xml_parser(source, prim_key, tags):
        ddict = DotDict(rec)
        replicas = ddict.get('block.replica')
        if not isinstance(replicas, list):
            replicas = [replicas]
        for row in replicas:
            if not row or 'node' not in row:
                continue
            node = row['node']
            files = int(row['files'])
            complete = 1 if row['complete'] == 'y' else 0
            if node in site_info:
                files = site_info[node]['files'] + files
                nblks = site_info[node]['blocks'] + 1
                bc_val = site_info[node]['blocks_complete']
                b_complete = bc_val + 1 if complete else bc_val
            else:
                b_complete = 1 if complete else 0
                nblks = 1
            site_info[node] = {
                'files': files,
                'blocks': nblks,
                'blocks_complete': b_complete
            }
    row = {}
    for key, val in site_info.items():
        if totfiles:
            nfiles = '%5.2f%%' % (100 * float(val['files']) / totfiles)
        else:
            nfiles = 'N/A'
        if totblocks:
            nblks = '%5.2f%%' % (100 * float(val['blocks']) / totblocks)
        else:
            nblks = 'N/A'
        ratio = float(val['blocks_complete']) / val['blocks']
        b_completion = '%5.2f%%' % (100 * ratio)
        row = {
            'site': {
                'name': key,
                'dataset_fraction': nfiles,
                'block_fraction': nblks,
                'block_completion': b_completion
            }
        }
        yield row
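# --- The reporting half of site4dataset turns the accumulated counters into
# percentage strings. In isolation, with made-up totals:
def demo_fractions(val, totfiles, totblocks):
    "Format dataset/block fractions the way site4dataset reports them"
    nfiles = '%5.2f%%' % (100 * float(val['files']) / totfiles) \
        if totfiles else 'N/A'
    nblks = '%5.2f%%' % (100 * float(val['blocks']) / totblocks) \
        if totblocks else 'N/A'
    ratio = float(val['blocks_complete']) / val['blocks']
    return nfiles.strip(), nblks.strip(), ('%5.2f%%' % (100 * ratio)).strip()

# demo_fractions({'files': 15, 'blocks': 2, 'blocks_complete': 1}, 30, 4)
# -> ('50.00%', '50.00%', '50.00%')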
def parser(self, dasquery, dformat, source, api):
    """
    DBS data-service parser.
    """
    sitedb = SERVICES.get('sitedb2', None) # look-up SiteDB from global scope
    query = dasquery.mongo_query
    if api == 'listBlocks':
        prim_key = 'block'
    elif api == 'listBlocks4path':
        api = 'listBlocks'
        prim_key = 'block'
    elif api == 'listBlockProvenance':
        prim_key = 'block'
    elif api == 'listBlockProvenance4child':
        prim_key = 'block'
    elif api == 'listFiles':
        prim_key = 'file'
    elif api == 'listLFNs':
        prim_key = 'file_lfn'
    elif api == 'listFileLumis':
        prim_key = 'file_lumi_section'
    elif api == 'listFileProcQuality':
        prim_key = 'file_proc_quality'
    elif api == 'listFileParents':
        prim_key = 'file_parent'
    elif api == 'listTiers':
        prim_key = 'data_tier'
    elif api == 'listDatasetParents':
        prim_key = 'processed_dataset_parent'
    elif api == 'listPrimaryDatasets':
        prim_key = 'primary_dataset'
    elif api == 'listProcessedDatasets':
        prim_key = 'processed_dataset'
    elif api == 'fakeReleases':
        prim_key = 'release'
    elif api == 'listRuns':
        prim_key = 'run'
    elif api == 'fakeRelease4File':
        prim_key = 'release'
    elif api == 'fakeRelease4Dataset':
        prim_key = 'release'
    elif api == 'fakeGroup4Dataset':
        prim_key = 'group'
    elif api == 'fakeConfig':
        prim_key = 'config'
    elif api == 'fakeListDataset4Block':
        prim_key = 'dataset'
    elif api == 'fakeListDataset4File':
        prim_key = 'dataset'
    elif api == 'fakeListDatasetbyDate':
        prim_key = 'dataset'
    elif api == 'fakeDatasetSummary':
        prim_key = 'dataset'
    elif api == 'fakeDataset4Run':
        prim_key = 'dataset'
    elif api == 'fakeRun4File':
        prim_key = 'run'
    elif api == 'fakeRun4Run':
        prim_key = 'run'
    elif api == 'fakeChild4File':
        prim_key = 'child'
    elif api == 'fakeChild4Dataset':
        prim_key = 'child'
    elif api == 'fakeSite4Dataset':
        prim_key = 'site'
    elif api == 'fakeStatus':
        prim_key = 'status'
    elif api == 'fakeFiles4DatasetRunLumis':
        prim_key = 'file'
    elif api == 'fakeRun4Block':
        prim_key = 'run'
    elif api == 'fakeBlock4DatasetRun':
        prim_key = 'block'
    else:
        msg = 'DBSService::parser, unsupported %s API %s' \
                % (self.name, api)
        raise Exception(msg)
    if api.find('fake') != -1:
        gen = qlxml_parser(source, prim_key)
    else:
        gen = xml_parser(source, prim_key)
    useless_run_atts = ['number_of_events', 'number_of_lumi_sections', \
        'id', 'total_luminosity', 'store_number', 'end_of_run', \
        'start_of_run']
    config_attrs = ['config.name', 'config.content', 'config.version', \
        'config.type', 'config.annotation', 'config.createdate', \
        'config.createby', 'config.moddate', 'config.modby']
    for row in gen:
        if not row:
            continue
        if row.has_key('status') and \
            row['status'].has_key('dataset.status'):
            row['status']['name'] = row['status']['dataset.status']
            del row['status']['dataset.status']
        if row.has_key('file_lumi_section'):
            row['lumi'] = row['file_lumi_section']
            del row['file_lumi_section']
        if row.has_key('algorithm'):
            del row['algorithm']['ps_content']
        if row.has_key('processed_dataset') and \
            row['processed_dataset'].has_key('path'):
            if isinstance(row['processed_dataset']['path'], dict) \
                and row['processed_dataset']['path'].has_key('dataset_path'):
                path = row['processed_dataset']['path']['dataset_path']
                del row['processed_dataset']['path']
                row['processed_dataset']['name'] = path
        # case for fake apis
        # remove useless attribute from results
        if row.has_key('dataset'):
            if row['dataset'].has_key('count_file.size'):
                del row['dataset']['count_file.size']
            if row['dataset'].has_key('dataset'):
                name = row['dataset']['dataset']
                del row['dataset']['dataset']
                row['dataset']['name'] = name
        if row.has_key('child') and row['child'].has_key('dataset.child'):
            row['child']['name'] = row['child']['dataset.child']
            del row['child']['dataset.child']
        if row.has_key('child') and row['child'].has_key('file.child'):
            row['child']['name'] = row['child']['file.child']
            del row['child']['file.child']
        if row.has_key('block') and query.get('fields') == ['parent']:
            row['parent'] = row['block']
            del row['block']
        if row.has_key('block') and query.get('fields') == ['child']:
            row['child'] = row['block']
            del row['block']
        if row.has_key('run') and row['run'].has_key('run'):
            row['run']['run_number'] = row['run']['run']
            del row['run']['run']
        if row.has_key('release') and row['release'].has_key('release'):
            row['release']['name'] = row['release']['release']
            del row['release']['release']
        if row.has_key('site'):
            row['site']['se'] = row['site']['site']
            del row['site']['site']
        convert_dot(row, 'config', config_attrs)
        convert_dot(row, 'file', ['file.name'])
        convert_dot(row, 'block', ['block.name'])
        convert_dot(row, 'dataset', ['dataset.tag', 'dataset.status'])
        # remove DBS2 run attributes (to be consistent with DBS3 output)
        # and let people extract this info from CondDB/LumiDB.
        if row.has_key('run'):
            for att in useless_run_atts:
                try:
                    del row['run'][att]
                except:
                    pass
        if api == 'fakeSite4Dataset' and sitedb:
            site = row.get('site', None)
            if site and isinstance(site, dict):
                sename = site.get('se', None)
                info = sitedb.site_info(sename)
                if info:
                    row['site'].update(info)
        yield row
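# --- The run-attribute cleanup at the end of the DBS parser, shown on its
# own: it silently drops DBS2-only fields so output matches DBS3. Using
# dict.pop with a default in place of the try/except:
def demo_strip_run_atts(run):
    "Remove DBS2-era run attributes, as the parser above does"
    useless_run_atts = ['number_of_events', 'number_of_lumi_sections',
                        'id', 'total_luminosity', 'store_number',
                        'end_of_run', 'start_of_run']
    for att in useless_run_atts:
        run.pop(att, None)  # no-op when the attribute is absent
    return run

# demo_strip_run_atts({'run_number': 1, 'store_number': 7})
# -> {'run_number': 1}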
def helper(self, url, api, args, expire):
    """
    Class helper function which yields results for a given set
    of input parameters. It yields the data record which must
    contain combined attribute corresponding to systems used to
    produce record content.
    """
    dbs_url = url['dbs']
    phedex_url = url['phedex']
    if api == 'combined_dataset4site_release':
        # DBS part
        datasets = set()
        for row in dbs_dataset4site_release(dbs_url, self.getdata,
                                            args['release']):
            datasets.add(row)
        # Phedex part
        if args['site'].find('.') != -1: # it is SE
            phedex_args = {'dataset': list(datasets),
                           'se': '%s' % args['site']}
        else:
            phedex_args = {'dataset': list(datasets),
                           'node': '%s*' % args['site']}
        headers = {'Accept': 'text/xml'}
        source, expire = \
            self.getdata(phedex_url, phedex_args, expire, headers,
                         post=True)
        prim_key = 'block'
        tags = 'block.replica.node'
        found = {}
        for rec in xml_parser(source, prim_key, tags):
            ddict = DotDict(rec)
            block = ddict.get('block.name')
            bbytes = ddict.get('block.bytes')
            files = ddict.get('block.files')
            found_dataset = block.split('#')[0]
            if found.has_key(found_dataset):
                val = found[found_dataset]
                found[found_dataset] = {'bytes': val['bytes'] + bbytes,
                                        'files': val['files'] + files}
            else:
                found[found_dataset] = {'bytes': bbytes, 'files': files}
        for name, val in found.iteritems():
            record = dict(name=name, size=val['bytes'],
                          files=val['files'], combined=['dbs', 'phedex'])
            yield {'dataset': record}
        del datasets
        del found
    if api == 'combined_site4dataset':
        # DBS part
        dataset = args['dataset']
        totblocks, totfiles = \
            dataset_summary(dbs_url, self.getdata, dataset)
        # Phedex part
        phedex_args = {'dataset': args['dataset']}
        headers = {'Accept': 'text/xml'}
        source, expire = \
            self.getdata(phedex_url, phedex_args, expire, headers,
                         post=True)
        prim_key = 'block'
        tags = 'block.replica.node'
        found = {}
        site_info = {}
        for rec in xml_parser(source, prim_key, tags):
            ddict = DotDict(rec)
            replicas = ddict.get('block.replica')
            if not isinstance(replicas, list):
                replicas = [replicas]
            for row in replicas:
                if not row or not row.has_key('node'):
                    continue
                node = row['node']
                files = int(row['files'])
                complete = 1 if row['complete'] == 'y' else 0
                if site_info.has_key(node):
                    files = site_info[node]['files'] + files
                    nblks = site_info[node]['blocks'] + 1
                    bc_val = site_info[node]['blocks_complete']
                    b_complete = bc_val + 1 if complete else bc_val
                else:
                    b_complete = 1 if complete else 0
                    nblks = 1
                site_info[node] = {'files': files, 'blocks': nblks,
                                   'blocks_complete': b_complete}
        row = {}
        for key, val in site_info.iteritems():
            if totfiles:
                nfiles = '%5.2f%%' % (100 * float(val['files']) / totfiles)
            else:
                nfiles = 'N/A'
            if totblocks:
                nblks = '%5.2f%%' % (100 * float(val['blocks']) / totblocks)
            else:
                nblks = 'N/A'
            ratio = float(val['blocks_complete']) / val['blocks']
            b_completion = '%5.2f%%' % (100 * ratio)
            row = {'site': {'name': key,
                            'dataset_fraction': nfiles,
                            'block_fraction': nblks,
                            'block_completion': b_completion}}
            yield row
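# --- Several functions above recover the dataset name from a PhEDEx block
# name by cutting at the '#' separator. As a one-liner with sample data:
def demo_dataset_of(block_name):
    "Dataset part of a PhEDEx block name, e.g. '/a/b/c#uuid' -> '/a/b/c'"
    return block_name.split('#')[0]

# demo_dataset_of('/Cosmics/Run2010A-v1/RAW#8b0e6dca-d16c-11df') \
#     == '/Cosmics/Run2010A-v1/RAW'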