def data_types(self):
    """Return data-type structures used to build dataset records.

    Returns a 4-tuple:
    - dtypes: list of data types, e.g. mc/data (small, extended by callers)
    - stypes: dict of site-tier counters, keyed 's_<tier>', all zero
    - rtypes: dict of release classifiers with zeroed counters:
      {'series': {...}, 'majors': {...}, 'minors': {...}, 'rtypes': {...}}
    - tiers: list of data tiers as reported by DBS
    """
    dtypes = ['mc', 'data'] # data types, should be small list
    tiers = self.dbs.data_tiers()
    # site types: one zeroed counter per known tier
    stypes = dict(('s_%s' % tier, 0)
                  for tier in (TIER0, TIER1, TIER2, TIER3, TIER_NA))
    releases = self.dbs.releases()
    series = set()
    majors = set()
    minors = set()
    for row in releases:
        rel = row['release']
        sval, major, minor = rel_ver(rel)
        # skip release strings which do not parse as valid CMSSW versions
        if not cmssw_test(sval, major, minor):
            continue
        series.add(sval)
        majors.add(major)
        minors.add(minor)
    serdict = dict(('rel1_%s' % val, 0) for val in series)
    majdict = dict(('rel2_%s' % val, 0) for val in majors)
    mindict = dict(('rel3_%s' % val, 0) for val in minors)
    # release types as defined in rel_type function
    typdict = dict(('relt_%s' % val, 0) for val in (RFULL, RPRE, RPATCH))
    rtypes = {'series': serdict, 'majors': majdict,
              'minors': mindict, 'rtypes': typdict}
    return dtypes, stypes, rtypes, tiers
def data_types(self):
    """Return list of data types dicts:
    - dtypes is data types, e.g. mc/data
    - stypes is site types, dict of Tier sites
    - rtypes is release types {'series':{'major':{'minor':}}
    """
    dtypes = ['mc', 'data'] # data types, should be small list
    tiers = self.dbs.data_tiers()
    # per-tier site counters, every one starting at zero
    stypes = dict.fromkeys(
        ['s_%s' % tier for tier in (TIER0, TIER1, TIER2, TIER3, TIER_NA)], 0)
    rtypes = {} # release types
    releases = self.dbs.releases()
    series, majors, minors = set(), set(), set()
    for record in releases:
        sval, major, minor = rel_ver(record['release'])
        # keep only releases recognized as valid CMSSW versions
        if cmssw_test(sval, major, minor):
            series.add(sval)
            majors.add(major)
            minors.add(minor)
    serdict = dict.fromkeys(['rel1_%s' % val for val in series], 0)
    majdict = dict.fromkeys(['rel2_%s' % val for val in majors], 0)
    mindict = dict.fromkeys(['rel3_%s' % val for val in minors], 0)
    # release types as defined in rel_type function
    typdict = dict.fromkeys(
        ['relt_%s' % val for val in (RFULL, RPRE, RPATCH)], 0)
    rtypes = {'series': serdict, 'majors': majdict,
              'minors': mindict, 'rtypes': typdict}
    return dtypes, stypes, rtypes, tiers
def dataset_info(self, timeframe, dataset, dtypes, stypes, rtypes,
                 tiers, dformat, target=0):
    """Yield common dataset info in specified data format.

    :param timeframe: (start, end) pair passed to dashboard look-ups
    :param dataset: dataset path '/primds/procds/tier'
    :param dtypes: list of known data types; extended in place
    :param stypes: dict of site-tier counters; incremented in place
    :param rtypes: release classifier dicts as built by data_types()
    :param tiers: list of known data tiers; extended in place
    :param dformat: output format, one of 'headers', 'csv', 'vw'
    :param target: optional dict of target attributes merged into the record

    NOTE(review): stypes and the rtypes sub-dicts are mutated in place, so
    their counters accumulate across successive calls - confirm intended.
    """
    dbsinst = self.dbs.dataset_dbsinst(dataset)
    if not dbsinst:
        return
    row = self.dbs.dataset_info(dataset, dbsinst)
    if row:
        if self.multitask:
            releases, sites, parents, summary, dashboard = \
                    self.dataset_info_all(dataset, dbsinst, timeframe)
        else:
            releases = [rname for rname in \
                    self.dbs.dataset_release_versions(dataset, dbsinst)]
            sites = [sname for sname in self.phedex.sites(dataset)]
            parents = [r for r in self.dbs.dataset_parents(dataset, dbsinst)]
            summary = self.dbs.dataset_summary(dataset, dbsinst)
            dashboard = self.dashboard.dataset_info(
                    dataset, timeframe[0], timeframe[1])
        if not summary:
            summary = {} # we need a dict type
        if not dashboard:
            dashboard = {} # we need a dict type
        nrels = len(releases)
        series = rtypes['series']
        majors = rtypes['majors']
        minors = rtypes['minors']
        relclf = rtypes['rtypes']
        for rel in releases:
            rserie, rmajor, rminor = rel_ver(rel)
            if not cmssw_test(rserie, rmajor, rminor):
                continue
            rtype = rel_type(rel)
            # count only keys pre-seeded by data_types(); ignore the rest
            try:
                series['rel1_%s' % rserie] += 1
            except KeyError:
                pass
            try:
                majors['rel2_%s' % rmajor] += 1
            except KeyError:
                pass
            try:
                minors['rel3_%s' % rminor] += 1
            except KeyError:
                pass
            try:
                relclf['relt_%s' % rtype] += 1
            except KeyError:
                pass
        nsites = len(sites)
        for site in sites:
            stier = site_tier(site)
            stypes['s_%s' % stier] += 1
        dataset_id = row['rid']
        era = genkey(row['acquisition_era_name'], self.salt, 5)
        create_dn = self.sitedb.dnid(row['create_by'])
        dbsinstid = row['dbs_instance']
        dtype = row['primary_ds_type']
        # number of data types should be small and simple
        # list look-up shouldn't be a problem
        if dtype not in dtypes:
            dtypes.append(dtype)
        dtype = dtypes.index(dtype)
        _, prim, proc, tier = dataset.split('/')
        prim = genkey(prim, self.salt, 5)
        proc = genkey(proc, self.salt, 5)
        if tier not in tiers:
            tiers.append(tier)
        tier = genkey(tier, self.salt, 5)
        parent = parents[0] if len(parents) else 0
        uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id)
        size_norm = 2**30 # normalization factor for file size
        rec = dict(id=uid, dataset=dataset_id, primds=prim, procds=proc,
                tier=tier, dtype=dtype, creator=create_dn, nrel=nrels,
                nsites=nsites, parent=parent, era=era, dbs=dbsinstid,
                nfiles=summary.get('num_file', 0),
                nlumis=summary.get('num_lumi', 0),
                nblk=summary.get('num_block', 0),
                nevt=summary.get('num_event', 0),
                size=summary.get('file_size', 0)/size_norm,
                cpu=dashboard.get('cpu', 0),
                wct=dashboard.get('wct', 0),
                proc_evts=dashboard.get('nevt', 0))
        if isinstance(target, dict):
            rec.update(target)
        rec.update(series)
        rec.update(majors)
        rec.update(minors)
        rec.update(relclf)
        rec.update(stypes)
        # sorted() works on both py2 key lists and py3 key views,
        # unlike rec.keys().sort() which fails on py3
        headers = sorted(rec)
        headers.remove('id')
        headers = ['id'] + headers # let dataset id be the first column
        if dformat == 'headers':
            yield headers
        elif dformat == 'csv':
            res = [str(rec[h]) for h in headers]
            yield ','.join(res)
        elif dformat == 'vw':
            # NOTE(review): this branch assumes target is a dict; with the
            # default target=0 it would raise AttributeError - confirm that
            # 'vw' is only requested together with a dict target
            target_str = target.get('rnaccess')
            vals = ' '.join([str(rec[h]) for h in headers])
            uid = genkey(vals, self.salt, 5) # unique row identifier
            vwrow = "%s '%s |f %s" % (target_str, uid, vals)
            yield vwrow
def test_rel_ver(self):
    "Test rel_ver method"
    # a standard CMSSW release string splits into (series, major, minor)
    self.assertEqual(('1', '0', '1'), rel_ver('CMSSW_1_0_1'))
def dataset_info(self, timeframe, dataset, dtypes, stypes, rtypes,
                 tiers, dformat, target=0):
    """Yield common dataset info in specified data format.

    :param timeframe: (start, end) pair passed to dashboard look-ups
    :param dataset: dataset path '/primds/procds/tier'
    :param dtypes: list of known data types; extended in place
    :param stypes: dict of site-tier counters; incremented in place
    :param rtypes: release classifier dicts as built by data_types();
        used here only as a key template - local zeroed copies are counted
    :param tiers: list of known data tiers; extended in place
    :param dformat: output format, one of 'headers', 'csv', 'vw'
    :param target: optional dict of target attributes merged into the record
    """
    dbsinst = self.dbs.dataset_dbsinst(dataset)
    if not dbsinst:
        return
    row = self.dbs.dataset_info(dataset, dbsinst)
    if row:
        if self.multitask:
            releases, sites, parents, summary, dashboard = \
                    self.dataset_info_all(dataset, dbsinst, timeframe)
        else:
            releases = [rname for rname in \
                    self.dbs.dataset_release_versions(dataset, dbsinst)]
            sites = [sname for sname in self.phedex.sites(dataset)]
            parents = [r for r in self.dbs.dataset_parents(dataset, dbsinst)]
            summary = self.dbs.dataset_summary(dataset, dbsinst)
            dashboard = self.dashboard.dataset_info(
                    dataset, timeframe[0], timeframe[1])
        nrels = len(releases)
        # fresh zeroed counters per dataset, keyed like the rtypes template
        series = dict.fromkeys(rtypes['series'], 0)
        majors = dict.fromkeys(rtypes['majors'], 0)
        minors = dict.fromkeys(rtypes['minors'], 0)
        relclf = dict.fromkeys(rtypes['rtypes'], 0)
        for rel in releases:
            rserie, rmajor, rminor = rel_ver(rel)
            if not cmssw_test(rserie, rmajor, rminor):
                continue
            rtype = rel_type(rel)
            # count only keys pre-seeded by data_types(); ignore the rest
            try:
                series['rel1_%s' % rserie] += 1
            except KeyError:
                pass
            try:
                majors['rel2_%s' % rmajor] += 1
            except KeyError:
                pass
            try:
                minors['rel3_%s' % rminor] += 1
            except KeyError:
                pass
            try:
                relclf['relt_%s' % rtype] += 1
            except KeyError:
                pass
        nsites = len(sites)
        for site in sites:
            stier = site_tier(site)
            stypes['s_%s' % stier] += 1
        dataset_id = row['rid']
        era = genkey(row['acquisition_era_name'], self.salt, 5)
        create_dn = self.sitedb.dnid(row['create_by'])
        dbsinstid = row['dbs_instance']
        dtype = row['primary_ds_type']
        # number of data types should be small and simple
        # list look-up shouldn't be a problem
        if dtype not in dtypes:
            dtypes.append(dtype)
        dtype = dtypes.index(dtype)
        _, prim, proc, tier = dataset.split('/')
        prim = genkey(prim, self.salt, 5)
        proc = genkey(proc, self.salt, 5)
        if tier not in tiers:
            tiers.append(tier)
        tier = genkey(tier, self.salt, 5)
        parent = parents[0] if len(parents) else 0
        uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id)
        size_norm = 2**30 # normalization factor for file size
        if not summary:
            summary = {} # we need a dict type
        if not dashboard:
            dashboard = {} # we need a dict type
        rec = dict(id=uid, dataset=dataset_id, primds=prim, procds=proc,
                   tier=tier, dtype=dtype, creator=create_dn, nrel=nrels,
                   nsites=nsites, parent=parent, era=era, dbs=dbsinstid,
                   nfiles=summary.get('num_file', 0),
                   nlumis=summary.get('num_lumi', 0),
                   nblk=summary.get('num_block', 0),
                   nevt=summary.get('num_event', 0),
                   size=summary.get('file_size', 0) / size_norm,
                   cpu=dashboard.get('cpu', 0),
                   wct=dashboard.get('wct', 0),
                   proc_evts=dashboard.get('nevt', 0))
        if isinstance(target, dict):
            rec.update(target)
        rec.update(series)
        rec.update(majors)
        rec.update(minors)
        rec.update(relclf)
        rec.update(stypes)
        # sorted() works on both py2 key lists and py3 key views,
        # unlike rec.keys().sort() which fails on py3
        headers = sorted(rec)
        headers.remove('id')
        headers = ['id'] + headers # let dataset id be the first column
        if dformat == 'headers':
            yield headers
        elif dformat == 'csv':
            res = [str(rec[h]) for h in headers]
            yield ','.join(res)
        elif dformat == 'vw':
            # NOTE(review): this branch assumes target is a dict; with the
            # default target=0 it would raise AttributeError - confirm that
            # 'vw' is only requested together with a dict target
            target_str = target.get('rnaccess')
            vals = ' '.join([str(rec[h]) for h in headers])
            uid = genkey(vals, self.salt, 5) # unique row identifier
            vwrow = "%s '%s |f %s" % (target_str, uid, vals)
            yield vwrow