示例#1
0
    def dataframe(self, timeframe, seed, dformat, dbs_extra, newdata=None):
        """Form a dataframe from various CMS data-providers.

        This is a generator. Depending on the arguments it yields:

        - for ``dformat == 'csv'`` a comma-separated header line first,
          derived from the ``seed`` dataset;
        - when ``newdata`` is true, the rows of every dataset reported by
          ``self.dbs.new_datasets`` and nothing else;
        - otherwise the rows of every PopDB dataset whose name matches
          ``DATASET_PAT``, followed by the rows of up to ``dbs_extra``
          randomly chosen DBS datasets which were not seen in PopDB.

        Args:
            timeframe: two-element sequence; both items are passed to
                ``self.popdb.dataset_stat`` and to ``yyyymmdd``.
            seed: dataset name used only to work out the csv headers.
            dformat: output format handed over to ``self.dataset_info``.
            dbs_extra: number of additional DBS datasets to sample; it is
                capped at the number of datasets actually available.
            newdata: when true, only new DBS datasets are processed.

        Raises:
            IndexError: csv headers were requested but the seed dataset
                did not produce a header row.
        """
        dtypes, stypes, rtypes, tiers = self.data_types()
        pop_datasets = 0
        dbs_datasets = 0
        popdb_results = list(self.popdb.dataset_stat(timeframe[0], timeframe[1]))
        target_keys = ('naccess', 'nusers', 'totcpu',
                       'rnaccess', 'rnusers', 'rtotcpu')
        # target for datasets which carry no PopDB statistics
        zero_target = dict.fromkeys(target_keys, 0)
        if dformat == 'csv':
            # The header row is requested with the 'headers' format; only the
            # target *keys* are needed for it, so a zero-valued target is used
            # and empty PopDB output no longer breaks header generation.
            rows = self.dataset_info(timeframe, seed, dtypes, stypes, rtypes,
                    tiers, 'headers', zero_target)
            headers = next(iter(rows), None)
            if headers is None:
                raise IndexError(
                    "No header row produced for seed dataset %s" % seed)
            yield ','.join(headers)
        tstamp = time.strftime("%Y-%m-%d %H:%M:%S GMT", time.gmtime())
        if newdata:  # request new dataset
            if self.verbose:
                print("Generate dataframe for new datasets", tstamp)
            n_days = 7
            if timeframe:
                n_days = ndays(yyyymmdd(timeframe[0]), yyyymmdd(timeframe[1]))
            for row in self.dbs.new_datasets(n_days):
                rows = self.dataset_info(timeframe, row['dataset'], dtypes,
                        stypes, rtypes, tiers, dformat, zero_target)
                for rec in rows:
                    yield rec
            return
        # walk through the popular datasets of the given time frame
        popdb_datasets = set()
        for row in popdb_results:
            dataset = row['dataset']
            if not DATASET_PAT.match(dataset):
                continue
            if self.verbose:
                print("Generate dataframe for %s, timeframe: %s, %s" \
                        % (dataset, timeframe, tstamp))
            target = dict((key, row[key]) for key in target_keys)
            rows = self.dataset_info(timeframe, dataset, dtypes, stypes,
                    rtypes, tiers, dformat, target)
            popdb_datasets.add(dataset)
            for rec in rows:
                yield rec
                pop_datasets += 1

        # take the DBS datasets and drop those already presented in PopDB;
        # the set makes each membership test constant time
        dbsdatasets = [d for d in self.dbs.datasets() if d not in popdb_datasets]
        # random.sample raises ValueError if asked for more than available
        n_extra = min(dbs_extra, len(dbsdatasets))
        for dataset in random.sample(dbsdatasets, n_extra):
            rows = self.dataset_info(timeframe, dataset, dtypes, stypes,
                    rtypes, tiers, dformat, zero_target)
            for rec in rows:
                yield rec
                dbs_datasets += 1
        if self.verbose:
            print("DBS datasets  : %s" % dbs_datasets)
            print("PopDB datasets: %s out of %s" % (pop_datasets, len(popdb_results)))
示例#2
0
    def dataframe(self, timeframe, seed, dformat, dbs_extra, newdata=None):
        """Form a dataframe from various CMS data-providers.

        This is a generator. Depending on the arguments it yields:

        - for ``dformat == 'csv'`` a comma-separated header line first,
          derived from the ``seed`` dataset;
        - when ``newdata`` is true, the rows of every dataset reported by
          ``self.dbs.new_datasets`` and nothing else;
        - otherwise the rows of every PopDB dataset whose name matches
          ``DATASET_PAT``, followed by the rows of up to ``dbs_extra``
          randomly chosen DBS datasets which were not seen in PopDB.

        Args:
            timeframe: two-element sequence; both items are passed to
                ``self.popdb.dataset_stat`` and to ``yyyymmdd``.
            seed: dataset name used only to work out the csv headers.
            dformat: output format handed over to ``self.dataset_info``.
            dbs_extra: number of additional DBS datasets to sample; it is
                capped at the number of datasets actually available.
            newdata: when true, only new DBS datasets are processed.

        Raises:
            Exception: csv output was requested but PopDB returned no
                dataset name consisting of four '/'-separated parts.
            IndexError: csv headers were requested but the seed dataset
                did not produce a header row.
        """
        dtypes, stypes, rtypes, tiers = self.data_types()
        pop_datasets = 0
        dbs_datasets = 0
        popdb_results = list(
            self.popdb.dataset_stat(timeframe[0], timeframe[1]))
        target_keys = ('naccess', 'nusers', 'totcpu',
                       'rnaccess', 'rnusers', 'rtotcpu')
        # target for datasets which carry no PopDB statistics
        zero_target = dict.fromkeys(target_keys, 0)
        if dformat == 'csv':
            # look for the first PopDB record with a well-formed dataset
            # name, i.e. a path with three slashes
            seed_row = None
            for row in popdb_results:
                if len(row['dataset'].split('/')) == 4:
                    seed_row = row
                    break
            if seed_row is None:
                raise Exception(
                    "Unable to find valid dataset name in popdb output")
            target = dict((key, seed_row[key]) for key in target_keys)
            # seed dataset to determine headers of the dataframe
            rows = self.dataset_info(timeframe, seed, dtypes, stypes, rtypes,
                                     tiers, 'headers', target)
            headers = next(iter(rows), None)
            if headers is None:
                raise IndexError(
                    "No header row produced for seed dataset %s" % seed)
            yield ','.join(headers)
        tstamp = time.strftime("%Y-%m-%d %H:%M:%S GMT", time.gmtime())
        if newdata:  # request new dataset
            if self.verbose:
                print("Generate dataframe for new datasets", tstamp)
            n_days = 7
            if timeframe:
                n_days = ndays(yyyymmdd(timeframe[0]), yyyymmdd(timeframe[1]))
            for row in self.dbs.new_datasets(n_days):
                rows = self.dataset_info(timeframe, row['dataset'], dtypes,
                                         stypes, rtypes, tiers, dformat,
                                         zero_target)
                for rec in rows:
                    yield rec
            return
        # walk through the popular datasets of the given time frame
        popdb_datasets = set()
        for row in popdb_results:
            dataset = row['dataset']
            if not DATASET_PAT.match(dataset):
                continue
            if self.verbose:
                print("Generate dataframe for %s, timeframe: %s, %s" \
                        % (dataset, timeframe, tstamp))
            target = dict((key, row[key]) for key in target_keys)
            rows = self.dataset_info(timeframe, dataset, dtypes, stypes,
                                     rtypes, tiers, dformat, target)
            popdb_datasets.add(dataset)
            for rec in rows:
                yield rec
                pop_datasets += 1

        # take the DBS datasets and drop those already presented in PopDB;
        # the set makes each membership test constant time
        dbsdatasets = [
            d for d in self.dbs.datasets() if d not in popdb_datasets
        ]
        # random.sample raises ValueError if asked for more than available
        n_extra = min(dbs_extra, len(dbsdatasets))
        for dataset in random.sample(dbsdatasets, n_extra):
            rows = self.dataset_info(timeframe, dataset, dtypes, stypes,
                                     rtypes, tiers, dformat, zero_target)
            for rec in rows:
                yield rec
                dbs_datasets += 1
        if self.verbose:
            print("DBS datasets  : %s" % dbs_datasets)
            print("PopDB datasets: %s out of %s" %
                  (pop_datasets, len(popdb_results)))
示例#3
0
 def dataset_info(self, timeframe, dataset, dtypes, stypes, rtypes, tiers, dformat, target=0):
     """Return common dataset info in specified data format.

     This is a generator which yields at most one item. Nothing is yielded
     when no DBS instance or no DBS record is found for the dataset, or
     when ``dformat`` is not one of the formats listed below.

     Args:
         timeframe: two-element sequence; both items are passed to the
             dashboard look-up, the first one also to ``yyyymmdd`` to
             build the record id.
         dataset: dataset name; it must split into four parts on '/'.
         dtypes: list of known primary dataset types. An unseen type is
             appended (the list is modified in place) and the position in
             the list becomes the ``dtype`` value of the record.
         stypes: dict whose keys ('s_<tier>') define the site-tier
             columns. It is only read; counting is done on a zeroed copy.
         rtypes: dict holding the 'series', 'majors', 'minors' and
             'rtypes' dicts whose keys define the release columns. They
             are only read; counting is done on zeroed copies.
         tiers: list of known data tiers; an unseen tier is appended.
         dformat: 'headers' yields the list of column names, 'csv' a
             comma-separated string of values, 'vw' a row string of the
             form "<label> '<uid> |f <values>".
         target: dict of target values merged into the record; any other
             value is ignored for the record.

     Raises:
         KeyError: a site tier has no matching 's_<tier>' key in stypes.
     """
     dbsinst = self.dbs.dataset_dbsinst(dataset)
     if not dbsinst:
         return
     row = self.dbs.dataset_info(dataset, dbsinst)
     if not row:
         return
     if self.multitask:
         releases, sites, parents, summary, dashboard = \
                 self.dataset_info_all(dataset, dbsinst, timeframe)
     else:
         releases = list(self.dbs.dataset_release_versions(dataset, dbsinst))
         sites = list(self.phedex.sites(dataset))
         parents = list(self.dbs.dataset_parents(dataset, dbsinst))
         summary = self.dbs.dataset_summary(dataset, dbsinst)
         dashboard = self.dashboard.dataset_info(dataset, timeframe[0], timeframe[1])
     nrels = len(releases)
     # Count on fresh, zeroed dicts: the caller hands the same rtypes and
     # stypes objects to every call, so counting on them directly would
     # leak the counts of one dataset into the rows of the next ones.
     series = dict.fromkeys(rtypes['series'], 0)
     majors = dict.fromkeys(rtypes['majors'], 0)
     minors = dict.fromkeys(rtypes['minors'], 0)
     relclf = dict.fromkeys(rtypes['rtypes'], 0)
     site_counts = dict.fromkeys(stypes, 0)
     for rel in releases:
         rserie, rmajor, rminor = rel_ver(rel)
         if not cmssw_test(rserie, rmajor, rminor):
             continue
         rtype = rel_type(rel)
         counters = ((series, 'rel1_%s' % rserie),
                     (majors, 'rel2_%s' % rmajor),
                     (minors, 'rel3_%s' % rminor),
                     (relclf, 'relt_%s' % rtype))
         for counter, key in counters:
             # releases outside of the known columns are skipped on purpose
             if key in counter:
                 counter[key] += 1
     nsites = len(sites)
     for site in sites:
         site_counts['s_%s' % site_tier(site)] += 1
     dataset_id = row['rid']
     era = genkey(row['acquisition_era_name'], self.salt, 5)
     create_dn = self.sitedb.dnid(row['create_by'])
     dbsinstid = row['dbs_instance']
     dtype = row['primary_ds_type']
     # number of data types should be small and simple
     # list look-up shouldn't be a problem
     if dtype not in dtypes:
         dtypes.append(dtype)
     dtype = dtypes.index(dtype)
     _, prim, proc, tier = dataset.split('/')
     prim = genkey(prim, self.salt, 5)
     proc = genkey(proc, self.salt, 5)
     if tier not in tiers:
         tiers.append(tier)
     tier = genkey(tier, self.salt, 5)
     parent = parents[0] if len(parents) else 0
     uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id)
     size_norm = 2**30 # normalization factor for file size
     if not summary:
         summary = {} # the look-ups below need a dict
     rec = dict(id=uid, dataset=dataset_id, primds=prim, procds=proc, tier=tier,
             dtype=dtype, creator=create_dn, nrel=nrels, nsites=nsites,
             parent=parent, era=era, dbs=dbsinstid,
             nfiles=summary.get('num_file', 0),
             nlumis=summary.get('num_lumi', 0),
             nblk=summary.get('num_block', 0),
             nevt=summary.get('num_event', 0),
             size=summary.get('file_size', 0)/size_norm,
             cpu=dashboard.get('cpu', 0),
             wct=dashboard.get('wct', 0),
             proc_evts=dashboard.get('nevt', 0))
     if isinstance(target, dict):
         rec.update(target)
     for counter in (series, majors, minors, relclf, site_counts):
         rec.update(counter)
     # sorted() works on Python 2 lists and Python 3 dict views alike;
     # let dataset id be the first column
     headers = ['id'] + sorted(key for key in rec if key != 'id')
     if dformat == 'headers':
         yield headers
     elif dformat == 'csv':
         yield ','.join([str(rec[h]) for h in headers])
     elif dformat == 'vw':
         # the default target is not a dict; use it as the label as-is
         # NOTE(review): confirm a bare value is an acceptable label here
         if isinstance(target, dict):
             target_str = target.get('rnaccess')
         else:
             target_str = target
         vals = ' '.join([str(rec[h]) for h in headers])
         uid = genkey(vals, self.salt, 5) # unique row identifier
         yield "%s '%s |f %s" % (target_str, uid, vals)
示例#4
0
 def dataset_info(self,
                  timeframe,
                  dataset,
                  dtypes,
                  stypes,
                  rtypes,
                  tiers,
                  dformat,
                  target=0):
     """Return common dataset info in specified data format.

     This is a generator which yields at most one item. Nothing is yielded
     when no DBS instance or no DBS record is found for the dataset, or
     when ``dformat`` is not one of the formats listed below.

     Args:
         timeframe: two-element sequence; both items are passed to the
             dashboard look-up, the first one also to ``yyyymmdd`` to
             build the record id.
         dataset: dataset name; it must split into four parts on '/'.
         dtypes: list of known primary dataset types. An unseen type is
             appended (the list is modified in place) and the position in
             the list becomes the ``dtype`` value of the record.
         stypes: dict whose keys ('s_<tier>') define the site-tier
             columns. It is only read; counting is done on a zeroed copy.
         rtypes: dict holding the 'series', 'majors', 'minors' and
             'rtypes' dicts whose keys define the release columns.
         tiers: list of known data tiers; an unseen tier is appended.
         dformat: 'headers' yields the list of column names, 'csv' a
             comma-separated string of values, 'vw' a row string of the
             form "<label> '<uid> |f <values>".
         target: dict of target values merged into the record; any other
             value is ignored for the record.

     Raises:
         KeyError: a site tier has no matching 's_<tier>' key in stypes.
     """
     dbsinst = self.dbs.dataset_dbsinst(dataset)
     if not dbsinst:
         return
     row = self.dbs.dataset_info(dataset, dbsinst)
     if not row:
         return
     if self.multitask:
         releases, sites, parents, summary, dashboard = \
                 self.dataset_info_all(dataset, dbsinst, timeframe)
     else:
         releases = list(
             self.dbs.dataset_release_versions(dataset, dbsinst))
         sites = list(self.phedex.sites(dataset))
         parents = list(self.dbs.dataset_parents(dataset, dbsinst))
         summary = self.dbs.dataset_summary(dataset, dbsinst)
         dashboard = self.dashboard.dataset_info(
             dataset, timeframe[0], timeframe[1])
     nrels = len(releases)
     # every call counts on its own zeroed dicts, keyed like the templates
     series = dict.fromkeys(rtypes['series'], 0)
     majors = dict.fromkeys(rtypes['majors'], 0)
     minors = dict.fromkeys(rtypes['minors'], 0)
     relclf = dict.fromkeys(rtypes['rtypes'], 0)
     # The caller passes the same stypes dict to every call; counting on it
     # directly would carry the site counts of one dataset over to the
     # rows of all following ones, hence the zeroed copy.
     site_counts = dict.fromkeys(stypes, 0)
     for rel in releases:
         rserie, rmajor, rminor = rel_ver(rel)
         if not cmssw_test(rserie, rmajor, rminor):
             continue
         rtype = rel_type(rel)
         counters = (
             (series, 'rel1_%s' % rserie),
             (majors, 'rel2_%s' % rmajor),
             (minors, 'rel3_%s' % rminor),
             (relclf, 'relt_%s' % rtype),
         )
         for counter, key in counters:
             # releases outside of the known columns are skipped on purpose
             if key in counter:
                 counter[key] += 1
     nsites = len(sites)
     for site in sites:
         site_counts['s_%s' % site_tier(site)] += 1
     dataset_id = row['rid']
     era = genkey(row['acquisition_era_name'], self.salt, 5)
     create_dn = self.sitedb.dnid(row['create_by'])
     dbsinstid = row['dbs_instance']
     dtype = row['primary_ds_type']
     # number of data types should be small and simple
     # list look-up shouldn't be a problem
     if dtype not in dtypes:
         dtypes.append(dtype)
     dtype = dtypes.index(dtype)
     _, prim, proc, tier = dataset.split('/')
     prim = genkey(prim, self.salt, 5)
     proc = genkey(proc, self.salt, 5)
     if tier not in tiers:
         tiers.append(tier)
     tier = genkey(tier, self.salt, 5)
     parent = parents[0] if len(parents) else 0
     uid = genuid(yyyymmdd(timeframe[0]), dbsinstid, dataset_id)
     size_norm = 2**30  # normalization factor for file size
     if not summary:
         summary = {}  # we need a dict type
     rec = dict(id=uid,
                dataset=dataset_id,
                primds=prim,
                procds=proc,
                tier=tier,
                dtype=dtype,
                creator=create_dn,
                nrel=nrels,
                nsites=nsites,
                parent=parent,
                era=era,
                dbs=dbsinstid,
                nfiles=summary.get('num_file', 0),
                nlumis=summary.get('num_lumi', 0),
                nblk=summary.get('num_block', 0),
                nevt=summary.get('num_event', 0),
                size=summary.get('file_size', 0) / size_norm,
                cpu=dashboard.get('cpu', 0),
                wct=dashboard.get('wct', 0),
                proc_evts=dashboard.get('nevt', 0))
     if isinstance(target, dict):
         rec.update(target)
     for counter in (series, majors, minors, relclf, site_counts):
         rec.update(counter)
     # sorted() works on Python 2 lists and Python 3 dict views alike;
     # let dataset id be the first column
     headers = ['id'] + sorted(key for key in rec if key != 'id')
     if dformat == 'headers':
         yield headers
     elif dformat == 'csv':
         yield ','.join([str(rec[h]) for h in headers])
     elif dformat == 'vw':
         # the default target is not a dict; use it as the label as-is
         # NOTE(review): confirm a bare value is an acceptable label here
         if isinstance(target, dict):
             target_str = target.get('rnaccess')
         else:
             target_str = target
         vals = ' '.join([str(rec[h]) for h in headers])
         uid = genkey(vals, self.salt, 5)  # unique row identifier
         yield "%s '%s |f %s" % (target_str, uid, vals)