Exemplo n.º 1
0
    def data_types(self):
        """Build baseline data/site/release type structures.

        Returns a 4-tuple (dtypes, stypes, rtypes, tiers):

            - dtypes: list of data types, e.g. mc/data
            - stypes: dict of zeroed site-Tier counters
            - rtypes: dict of zeroed release counters, keyed by
              'series'/'majors'/'minors'/'rtypes'
            - tiers: data tiers as reported by DBS
        """
        dtypes = ['mc', 'data']  # data types, should stay a small list
        tiers = self.dbs.data_tiers()
        # one zeroed counter per site-Tier level
        stypes = dict(('s_%s' % tier, 0)
                      for tier in (TIER0, TIER1, TIER2, TIER3, TIER_NA))
        series = set()
        majors = set()
        minors = set()
        for record in self.dbs.releases():
            sval, major, minor = rel_ver(record['release'])
            # keep only releases accepted by the CMSSW version filter
            if cmssw_test(sval, major, minor):
                series.add(sval)
                majors.add(major)
                minors.add(minor)
        serdict = dict(('rel1_%s' % val, 0) for val in series)
        majdict = dict(('rel2_%s' % val, 0) for val in majors)
        mindict = dict(('rel3_%s' % val, 0) for val in minors)
        # release types as defined in rel_type function
        typdict = dict(('relt_%s' % val, 0) for val in (RFULL, RPRE, RPATCH))
        rtypes = {
            'series': serdict,
            'majors': majdict,
            'minors': mindict,
            'rtypes': typdict
        }
        return dtypes, stypes, rtypes, tiers
Exemplo n.º 2
0
    def data_types(self):
        """Return list of data types dicts:

            - dtypes is data types, e.g. mc/data
            - stypes is site types, dict of Tier sites
            - rtypes is release types
              {'series':{'major':{'minor':}}
        """
        dtypes = ['mc', 'data']  # data types, should be small list
        tiers = self.dbs.data_tiers()
        # site types: zeroed counter per Tier level
        site_levels = [TIER0, TIER1, TIER2, TIER3, TIER_NA]
        stypes = dict([('s_%s' % lvl, 0) for lvl in site_levels])
        series, majors, minors = set(), set(), set()
        for row in self.dbs.releases():
            sval, major, minor = rel_ver(row['release'])
            if not cmssw_test(sval, major, minor):
                continue
            series.add(sval)
            majors.add(major)
            minors.add(minor)
        rtypes = {
            'series': dict([('rel1_%s' % val, 0) for val in series]),
            'majors': dict([('rel2_%s' % val, 0) for val in majors]),
            'minors': dict([('rel3_%s' % val, 0) for val in minors]),
            # release types as defined in rel_type function
            'rtypes': dict([('relt_%s' % val, 0)
                            for val in [RFULL, RPRE, RPATCH]]),
        }
        return dtypes, stypes, rtypes, tiers
Exemplo n.º 3
0
 def dataset_info(self, timeframe, dataset, dtypes, stypes, rtypes, tiers, dformat, target=0):
     """Yield common dataset info in the specified data format.

     :param timeframe: (start, end) timestamps used for dashboard stats
     :param dataset: dataset path string, e.g. /prim/proc/tier
     :param dtypes: mutable list of known data types (extended in place)
     :param stypes: dict of site-tier counters (incremented in place)
     :param rtypes: dict of 'series'/'majors'/'minors'/'rtypes' counter dicts
     :param tiers: mutable list of known data tiers (extended in place)
     :param dformat: output format, one of 'headers', 'csv', 'vw'
     :param target: optional dict of target attributes merged into the record
     :returns: generator yielding a header list, a CSV row, or a VW row

     Yields nothing when the dataset's DBS instance or info record is absent.
     """
     dbsinst = self.dbs.dataset_dbsinst(dataset)
     if not dbsinst:
         return
     row = self.dbs.dataset_info(dataset, dbsinst)
     if not row:
         return
     if self.multitask:
         releases, sites, parents, summary, dashboard = \
                 self.dataset_info_all(dataset, dbsinst, timeframe)
     else:
         releases = [rname for rname in self.dbs.dataset_release_versions(dataset, dbsinst)]
         sites = [sname for sname in self.phedex.sites(dataset)]
         parents = [r for r in self.dbs.dataset_parents(dataset, dbsinst)]
         summary = self.dbs.dataset_summary(dataset, dbsinst)
         dashboard = self.dashboard.dataset_info(dataset, timeframe[0], timeframe[1])
     nrels = len(releases)
     # NOTE(review): counters are incremented directly on the shared rtypes
     # dicts, so counts accumulate across calls — confirm this is intended
     # (a sibling version of this method resets them per call)
     series = rtypes['series']
     majors = rtypes['majors']
     minors = rtypes['minors']
     relclf = rtypes['rtypes']
     for rel in releases:
         rserie, rmajor, rminor = rel_ver(rel)
         if not cmssw_test(rserie, rmajor, rminor):
             continue
         rtype = rel_type(rel)
         # unseen keys are skipped on purpose (best-effort counting);
         # catch only KeyError instead of a bare except
         try:
             series['rel1_%s' % rserie] += 1
         except KeyError:
             pass
         try:
             majors['rel2_%s' % rmajor] += 1
         except KeyError:
             pass
         try:
             minors['rel3_%s' % rminor] += 1
         except KeyError:
             pass
         try:
             relclf['relt_%s' % rtype] += 1
         except KeyError:
             pass
     nsites = len(sites)
     for site in sites:
         stier = site_tier(site)
         stypes['s_%s' % stier] += 1
     dataset_id = row['rid']
     era = genkey(row['acquisition_era_name'], self.salt, 5)
     create_dn = self.sitedb.dnid(row['create_by'])
     dbsinstid = row['dbs_instance']
     dtype = row['primary_ds_type']
     # number of data types should be small and simple
     # list look-up shouldn't be a problem
     if dtype not in dtypes:
         dtypes.append(dtype)
     dtype = dtypes.index(dtype)
     _, prim, proc, tier = dataset.split('/')
     prim = genkey(prim, self.salt, 5)
     proc = genkey(proc, self.salt, 5)
     if tier not in tiers:
         tiers.append(tier)
     tier = genkey(tier, self.salt, 5)
     parent = parents[0] if len(parents) else 0
     uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id)
     size_norm = 2**30  # normalization factor for file size
     if not summary:
         summary = {}  # guard: summary may be None/empty, we need a dict
     rec = dict(id=uid, dataset=dataset_id, primds=prim, procds=proc, tier=tier,
             dtype=dtype, creator=create_dn, nrel=nrels, nsites=nsites,
             parent=parent, era=era, dbs=dbsinstid,
             nfiles=summary.get('num_file', 0),
             nlumis=summary.get('num_lumi', 0),
             nblk=summary.get('num_block', 0),
             nevt=summary.get('num_event', 0),
             size=summary.get('file_size', 0)/size_norm,
             cpu=dashboard.get('cpu', 0),
             wct=dashboard.get('wct', 0),
             proc_evts=dashboard.get('nevt', 0))
     if isinstance(target, dict):
         rec.update(target)
     # flatten all counter dicts into the record
     rec.update(series)
     rec.update(majors)
     rec.update(minors)
     rec.update(relclf)
     rec.update(stypes)
     # sorted() works on both py2 and py3; dict.keys() has no .sort() on py3
     headers = sorted(rec)
     headers.remove('id')
     headers = ['id'] + headers  # let dataset id be the first column
     if dformat == 'headers':
         yield headers
     elif dformat == 'csv':
         res = [str(rec[h]) for h in headers]
         yield ','.join(res)
     elif dformat == 'vw':
         target_str = target.get('rnaccess')
         vals = ' '.join([str(rec[h]) for h in headers])
         uid = genkey(vals, self.salt, 5)  # unique row identifier
         vwrow = "%s '%s |f %s" % (target_str, uid, vals)
         yield vwrow
Exemplo n.º 4
0
 def test_rel_ver(self):
     "Test rel_ver method"
     expected = ('1', '0', '1')
     observed = rel_ver('CMSSW_1_0_1')
     self.assertEqual(expected, observed)
Exemplo n.º 5
0
 def dataset_info(self,
                  timeframe,
                  dataset,
                  dtypes,
                  stypes,
                  rtypes,
                  tiers,
                  dformat,
                  target=0):
     """Yield common dataset info in the specified data format.

     :param timeframe: (start, end) timestamps used for dashboard stats
     :param dataset: dataset path string, e.g. /prim/proc/tier
     :param dtypes: mutable list of known data types (extended in place)
     :param stypes: dict of site-tier counters (incremented in place)
     :param rtypes: dict of 'series'/'majors'/'minors'/'rtypes' counter
         dicts; used as a key template, the shared dicts are not mutated
     :param tiers: mutable list of known data tiers (extended in place)
     :param dformat: output format, one of 'headers', 'csv', 'vw'
     :param target: optional dict of target attributes merged into the record
     :returns: generator yielding a header list, a CSV row, or a VW row

     Yields nothing when the dataset's DBS instance or info record is absent.
     """
     dbsinst = self.dbs.dataset_dbsinst(dataset)
     if not dbsinst:
         return
     row = self.dbs.dataset_info(dataset, dbsinst)
     if not row:
         return
     if self.multitask:
         releases, sites, parents, summary, dashboard = \
                 self.dataset_info_all(dataset, dbsinst, timeframe)
     else:
         releases = [
             rname for rname in self.dbs.dataset_release_versions(
                 dataset, dbsinst)
         ]
         sites = [sname for sname in self.phedex.sites(dataset)]
         parents = [
             r for r in self.dbs.dataset_parents(dataset, dbsinst)
         ]
         summary = self.dbs.dataset_summary(dataset, dbsinst)
         dashboard = self.dashboard.dataset_info(
             dataset, timeframe[0], timeframe[1])
     nrels = len(releases)
     # per-call zeroed copies of the release counter dicts, so counts do
     # not accumulate across datasets
     series = dict.fromkeys(rtypes['series'], 0)
     majors = dict.fromkeys(rtypes['majors'], 0)
     minors = dict.fromkeys(rtypes['minors'], 0)
     relclf = dict.fromkeys(rtypes['rtypes'], 0)
     for rel in releases:
         rserie, rmajor, rminor = rel_ver(rel)
         if not cmssw_test(rserie, rmajor, rminor):
             continue
         rtype = rel_type(rel)
         # unseen keys are skipped on purpose (best-effort counting);
         # catch only KeyError instead of a bare except
         try:
             series['rel1_%s' % rserie] += 1
         except KeyError:
             pass
         try:
             majors['rel2_%s' % rmajor] += 1
         except KeyError:
             pass
         try:
             minors['rel3_%s' % rminor] += 1
         except KeyError:
             pass
         try:
             relclf['relt_%s' % rtype] += 1
         except KeyError:
             pass
     nsites = len(sites)
     for site in sites:
         stier = site_tier(site)
         stypes['s_%s' % stier] += 1
     dataset_id = row['rid']
     era = genkey(row['acquisition_era_name'], self.salt, 5)
     create_dn = self.sitedb.dnid(row['create_by'])
     dbsinstid = row['dbs_instance']
     dtype = row['primary_ds_type']
     # number of data types should be small and simple
     # list look-up shouldn't be a problem
     if dtype not in dtypes:
         dtypes.append(dtype)
     dtype = dtypes.index(dtype)
     _, prim, proc, tier = dataset.split('/')
     prim = genkey(prim, self.salt, 5)
     proc = genkey(proc, self.salt, 5)
     if tier not in tiers:
         tiers.append(tier)
     tier = genkey(tier, self.salt, 5)
     parent = parents[0] if len(parents) else 0
     uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id)
     size_norm = 2**30  # normalization factor for file size
     if not summary:
         summary = {}  # we need a dict type
     rec = dict(id=uid,
                dataset=dataset_id,
                primds=prim,
                procds=proc,
                tier=tier,
                dtype=dtype,
                creator=create_dn,
                nrel=nrels,
                nsites=nsites,
                parent=parent,
                era=era,
                dbs=dbsinstid,
                nfiles=summary.get('num_file', 0),
                nlumis=summary.get('num_lumi', 0),
                nblk=summary.get('num_block', 0),
                nevt=summary.get('num_event', 0),
                size=summary.get('file_size', 0) / size_norm,
                cpu=dashboard.get('cpu', 0),
                wct=dashboard.get('wct', 0),
                proc_evts=dashboard.get('nevt', 0))
     if isinstance(target, dict):
         rec.update(target)
     # flatten all counter dicts into the record
     rec.update(series)
     rec.update(majors)
     rec.update(minors)
     rec.update(relclf)
     rec.update(stypes)
     # sorted() works on both py2 and py3; dict.keys() has no .sort() on py3
     headers = sorted(rec)
     headers.remove('id')
     headers = ['id'] + headers  # let dataset id be the first column
     if dformat == 'headers':
         yield headers
     elif dformat == 'csv':
         res = [str(rec[h]) for h in headers]
         yield ','.join(res)
     elif dformat == 'vw':
         target_str = target.get('rnaccess')
         vals = ' '.join([str(rec[h]) for h in headers])
         uid = genkey(vals, self.salt, 5)  # unique row identifier
         vwrow = "%s '%s |f %s" % (target_str, uid, vals)
         yield vwrow
Exemplo n.º 6
0
 def test_rel_ver(self):
     "Test rel_ver method"
     self.assertEqual(('1', '0', '1'), rel_ver('CMSSW_1_0_1'))