def test_site_tier(self):
    "Test site_tier method"
    # each case is (input site name, expected tier classification)
    cases = [
        ('T0_CERN', TIER0),
        ('T1_CERN', TIER1),
        ('T2_CERN', TIER2),
        ('T3_CERN', TIER3),
        ('CERN', TIER_NA),  # no tier prefix falls back to TIER_NA
    ]
    for site, expect in cases:
        self.assertEqual(expect, site_tier(site))
def dataset_info(self, timeframe, dataset, dtypes, stypes, rtypes, tiers,
                 dformat, target=0):
    """
    Yield common dataset info in the requested data format.

    :param timeframe: (start, end) pair passed to the dashboard look-up
    :param dataset: dataset path of the form /primary/processed/tier
    :param dtypes: mutable list of known primary dataset types (extended in place)
    :param stypes: dict of site-tier counters with keys 's_<tier>' (mutated in place)
    :param rtypes: dict with 'series'/'majors'/'minors'/'rtypes' sub-dicts whose
        keys define which release classifiers are counted
    :param tiers: mutable list of known data tiers (extended in place)
    :param dformat: one of 'headers', 'csv', 'vw'
    :param target: optional dict of target columns merged into the record;
        required to be a dict for the 'vw' format
    """
    dbsinst = self.dbs.dataset_dbsinst(dataset)
    if not dbsinst:
        return
    row = self.dbs.dataset_info(dataset, dbsinst)
    if not row:
        return
    if self.multitask:
        releases, sites, parents, summary, dashboard = \
                self.dataset_info_all(dataset, dbsinst, timeframe)
    else:
        releases = [rname for rname in
                    self.dbs.dataset_release_versions(dataset, dbsinst)]
        sites = [sname for sname in self.phedex.sites(dataset)]
        parents = [r for r in self.dbs.dataset_parents(dataset, dbsinst)]
        summary = self.dbs.dataset_summary(dataset, dbsinst)
        dashboard = self.dashboard.dataset_info(
            dataset, timeframe[0], timeframe[1])
    nrels = len(releases)
    # use fresh zeroed counters per call; aliasing the rtypes sub-dicts
    # directly would accumulate release counts across datasets
    series = dict.fromkeys(rtypes['series'], 0)
    majors = dict.fromkeys(rtypes['majors'], 0)
    minors = dict.fromkeys(rtypes['minors'], 0)
    relclf = dict.fromkeys(rtypes['rtypes'], 0)
    for rel in releases:
        rserie, rmajor, rminor = rel_ver(rel)
        if not cmssw_test(rserie, rmajor, rminor):
            continue
        rtype = rel_type(rel)
        # classifiers outside the configured key set are deliberately skipped
        try:
            series['rel1_%s' % rserie] += 1
        except KeyError:
            pass
        try:
            majors['rel2_%s' % rmajor] += 1
        except KeyError:
            pass
        try:
            minors['rel3_%s' % rminor] += 1
        except KeyError:
            pass
        try:
            relclf['relt_%s' % rtype] += 1
        except KeyError:
            pass
    nsites = len(sites)
    for site in sites:
        stier = site_tier(site)
        stypes['s_%s' % stier] += 1
    dataset_id = row['rid']
    era = genkey(row['acquisition_era_name'], self.salt, 5)
    create_dn = self.sitedb.dnid(row['create_by'])
    dbsinstid = row['dbs_instance']
    dtype = row['primary_ds_type']
    # number of data types should be small and simple
    # list look-up shouldn't be a problem
    if dtype not in dtypes:
        dtypes.append(dtype)
    dtype = dtypes.index(dtype)
    _, prim, proc, tier = dataset.split('/')
    prim = genkey(prim, self.salt, 5)
    proc = genkey(proc, self.salt, 5)
    if tier not in tiers:
        tiers.append(tier)
    tier = genkey(tier, self.salt, 5)
    parent = parents[0] if len(parents) else 0
    uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id)
    size_norm = 2**30  # normalization factor for file size
    if not summary:
        summary = {}  # guard against missing summary info
    if not dashboard:
        dashboard = {}  # guard against missing dashboard info
    rec = dict(id=uid, dataset=dataset_id, primds=prim, procds=proc,
               tier=tier, dtype=dtype, creator=create_dn, nrel=nrels,
               nsites=nsites, parent=parent, era=era, dbs=dbsinstid,
               nfiles=summary.get('num_file', 0),
               nlumis=summary.get('num_lumi', 0),
               nblk=summary.get('num_block', 0),
               nevt=summary.get('num_event', 0),
               size=summary.get('file_size', 0)/size_norm,
               cpu=dashboard.get('cpu', 0),
               wct=dashboard.get('wct', 0),
               proc_evts=dashboard.get('nevt', 0))
    if isinstance(target, dict):
        rec.update(target)
    rec.update(series)
    rec.update(majors)
    rec.update(minors)
    rec.update(relclf)
    rec.update(stypes)
    # sorted() works on both py2 and py3, unlike keys().sort()
    headers = sorted(rec)
    headers.remove('id')
    headers = ['id'] + headers  # let dataset id be the first column
    if dformat == 'headers':
        yield headers
    elif dformat == 'csv':
        yield ','.join(str(rec[h]) for h in headers)
    elif dformat == 'vw':
        target_str = target.get('rnaccess')
        vals = ' '.join(str(rec[h]) for h in headers)
        uid = genkey(vals, self.salt, 5)  # unique row identifier
        vwrow = "%s '%s |f %s" % (target_str, uid, vals)
        yield vwrow
def dataset_info(self, timeframe, dataset, dtypes, stypes, rtypes, tiers,
                 dformat, target=0):
    """
    Yield common dataset info in the requested data format.

    :param timeframe: (start, end) pair passed to the dashboard look-up
    :param dataset: dataset path of the form /primary/processed/tier
    :param dtypes: mutable list of known primary dataset types (extended in place)
    :param stypes: dict of site-tier counters with keys 's_<tier>' (mutated in place)
    :param rtypes: dict with 'series'/'majors'/'minors'/'rtypes' sub-dicts whose
        keys define which release classifiers are counted
    :param tiers: mutable list of known data tiers (extended in place)
    :param dformat: one of 'headers', 'csv', 'vw'
    :param target: optional dict of target columns merged into the record;
        required to be a dict for the 'vw' format
    """
    dbsinst = self.dbs.dataset_dbsinst(dataset)
    if not dbsinst:
        return
    row = self.dbs.dataset_info(dataset, dbsinst)
    if not row:
        return
    if self.multitask:
        releases, sites, parents, summary, dashboard = \
                self.dataset_info_all(dataset, dbsinst, timeframe)
    else:
        releases = [rname for rname in
                    self.dbs.dataset_release_versions(dataset, dbsinst)]
        sites = [sname for sname in self.phedex.sites(dataset)]
        parents = [r for r in self.dbs.dataset_parents(dataset, dbsinst)]
        summary = self.dbs.dataset_summary(dataset, dbsinst)
        dashboard = self.dashboard.dataset_info(
            dataset, timeframe[0], timeframe[1])
    nrels = len(releases)
    # fresh zeroed counters per call, keyed by the configured classifiers
    series = dict.fromkeys(rtypes['series'], 0)
    majors = dict.fromkeys(rtypes['majors'], 0)
    minors = dict.fromkeys(rtypes['minors'], 0)
    relclf = dict.fromkeys(rtypes['rtypes'], 0)
    for rel in releases:
        rserie, rmajor, rminor = rel_ver(rel)
        if not cmssw_test(rserie, rmajor, rminor):
            continue
        rtype = rel_type(rel)
        # classifiers outside the configured key set are deliberately skipped
        try:
            series['rel1_%s' % rserie] += 1
        except KeyError:
            pass
        try:
            majors['rel2_%s' % rmajor] += 1
        except KeyError:
            pass
        try:
            minors['rel3_%s' % rminor] += 1
        except KeyError:
            pass
        try:
            relclf['relt_%s' % rtype] += 1
        except KeyError:
            pass
    nsites = len(sites)
    for site in sites:
        stier = site_tier(site)
        stypes['s_%s' % stier] += 1
    dataset_id = row['rid']
    era = genkey(row['acquisition_era_name'], self.salt, 5)
    create_dn = self.sitedb.dnid(row['create_by'])
    dbsinstid = row['dbs_instance']
    dtype = row['primary_ds_type']
    # number of data types should be small and simple
    # list look-up shouldn't be a problem
    if dtype not in dtypes:
        dtypes.append(dtype)
    dtype = dtypes.index(dtype)
    _, prim, proc, tier = dataset.split('/')
    prim = genkey(prim, self.salt, 5)
    proc = genkey(proc, self.salt, 5)
    if tier not in tiers:
        tiers.append(tier)
    tier = genkey(tier, self.salt, 5)
    parent = parents[0] if len(parents) else 0
    uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id)
    size_norm = 2**30  # normalization factor for file size
    if not summary:
        summary = {}  # we need a dict type
    if not dashboard:
        dashboard = {}  # guard the .get calls below, same as summary
    rec = dict(id=uid, dataset=dataset_id, primds=prim, procds=proc,
               tier=tier, dtype=dtype, creator=create_dn, nrel=nrels,
               nsites=nsites, parent=parent, era=era, dbs=dbsinstid,
               nfiles=summary.get('num_file', 0),
               nlumis=summary.get('num_lumi', 0),
               nblk=summary.get('num_block', 0),
               nevt=summary.get('num_event', 0),
               size=summary.get('file_size', 0) / size_norm,
               cpu=dashboard.get('cpu', 0),
               wct=dashboard.get('wct', 0),
               proc_evts=dashboard.get('nevt', 0))
    if isinstance(target, dict):
        rec.update(target)
    rec.update(series)
    rec.update(majors)
    rec.update(minors)
    rec.update(relclf)
    rec.update(stypes)
    # sorted() works on both py2 and py3, unlike keys().sort()
    headers = sorted(rec)
    headers.remove('id')
    headers = ['id'] + headers  # let dataset id be the first column
    if dformat == 'headers':
        yield headers
    elif dformat == 'csv':
        yield ','.join(str(rec[h]) for h in headers)
    elif dformat == 'vw':
        target_str = target.get('rnaccess')
        vals = ' '.join(str(rec[h]) for h in headers)
        uid = genkey(vals, self.salt, 5)  # unique row identifier
        vwrow = "%s '%s |f %s" % (target_str, uid, vals)
        yield vwrow