Example #1
 def test_pop_db(self):
     "Test pop_db functions"
     print ""
     pop_db = PopDBService(config=self.config)
     api = 'DSStatInTimeWindow/'
     params = {'tstart':'2015-04-18', 'tstop':'2015-04-18', 'sitename':'T2_US_Nebraska'}
     expected = 'T2_US_Nebraska'
     json_data = pop_db.fetch(api=api, params=params, cache=False)
     result = json_data['SITENAME']
     self.assertEqual(result, expected)
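The assertion relies on the service echoing the requested site name back at the top level of the decoded JSON. A hypothetical sketch of the response shape, using only the field names that appear in this test and in the PopularityManager code below (the actual payload depends on the Pop DB service):

    # hypothetical values; field names are the ones used in these examples
    json_data = {
        'SITENAME': 'T2_US_Nebraska',
        'DATA': [
            {'COLLNAME': '/Example/Dataset/AOD', 'NACC': 42, 'TOTCPU': 1337, 'NUSERS': 7},
        ],
    }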
Example #2
 def test_pop_db_memory(self):
     "Test pop_db data memory usage"
     print ""
     pop_db = PopDBService(self.config)
     api = 'getDSdata'
     sitename = 'summary'
     aggr = 'day'
     n = 200000
     orderby = 'totcpu'
     tstart = datetime_to_string(datetime_day(datetime.datetime.utcnow() - datetime.timedelta(days=10)))
     tstop = datetime_to_string(datetime_day(datetime.datetime.utcnow()))
     params = {'sitename':sitename, 'tstart':tstart, 'tstop':tstop, 'aggr':aggr, 'n':n, 'orderby':orderby}
     pop_db_data = pop_db.fetch(api=api, params=params, cache=False)
     total_size = total_size_of(pop_db_data)
     logger.info('Total size of Pop DB data in memory is %d bytes (%dMB)', total_size, total_size/10**6)
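`total_size_of` comes from the project's utilities and is not shown here. A common way to implement such a measurement is a recursive walk with `sys.getsizeof`; a minimal sketch, assuming the helper only needs to handle the dicts, lists, and strings produced by the JSON decoder:

    import sys

    def total_size_of(obj, seen=None):
        """Recursively sum sys.getsizeof over a nested container structure."""
        if seen is None:
            seen = set()
        if id(obj) in seen:  # count shared objects only once
            return 0
        seen.add(id(obj))
        size = sys.getsizeof(obj)
        if isinstance(obj, dict):
            size += sum(total_size_of(k, seen) + total_size_of(v, seen)
                        for k, v in obj.items())
        elif isinstance(obj, (list, tuple, set, frozenset)):
            size += sum(total_size_of(item, seen) for item in obj)
        return size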
Example #3
import datetime
import logging
import threading
import Queue  # Python 2 stdlib queue module

# PopDBService, SiteManager, DatasetManager, StorageManager and the helper
# functions (daterange, datetime_day, datetime_to_string, get_json,
# pop_db_timestamp_to_datetime) come from the project's own modules

class PopularityManager(object):
    """
    Generate popularity metrics for datasets and sites
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.pop_db = PopDBService(self.config)
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.storage = StorageManager(self.config)
        self.MAX_THREADS = int(config['threading']['max_threads'])
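        # only config['threading']['max_threads'] is read here directly; a
        # hypothetical minimal config is {'threading': {'max_threads': 4}},
        # with the wrapped services presumably reading their own sections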

    def initiate_db(self):
        """
        Collect the last 90 days of popularity data
        """
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_popularity_data, args=(i, q))
            worker.daemon = True
            worker.start()
        start_date = datetime_day(datetime.datetime.utcnow() - datetime.timedelta(days=90))
        end_date = datetime_day(datetime.datetime.utcnow())
        # fetch popularity data
        t1 = datetime.datetime.utcnow()
        for date in daterange(start_date, end_date):
            q.put(date)
        q.join()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Inserting Pop DB data took %s', str(td))

    def insert_popularity_data(self, i, q):
        """
        Worker: insert popularity data for one date into the database
        """
        coll = 'dataset_popularity'
        while True:
            date = q.get()
            api = 'DSStatInTimeWindow/'
            tstart = datetime_to_string(date)
            tstop = tstart
            params = {'sitename':'summary', 'tstart':tstart, 'tstop':tstop}
            json_data = self.pop_db.fetch(api=api, params=params)
            # restructure each record into a per-dataset document for storage
            for dataset in json_data['DATA']:
                dataset_name = dataset['COLLNAME']
                popularity_data = {'name':dataset_name, 'date':date}
                popularity_data['n_accesses'] = dataset['NACC']
                popularity_data['n_cpus'] = dataset['TOTCPU']
                popularity_data['n_users'] = dataset['NUSERS']
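                # one document per (dataset, day); values below are hypothetical:
                # {'name': '/Example/Dataset/AOD', 'date': datetime(2015, 4, 18),
                #  'n_accesses': 42, 'n_cpus': 1337, 'n_users': 7}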
                query = {'name':dataset_name, 'date':date}
                data = {'$set':popularity_data}
                self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
            q.task_done()

    def update_db(self):
        """
        Fetch latest popularity data not in database
        """
        # get dates
        coll = 'dataset_popularity'
        pipeline = list()
        sort = {'$sort':{'date':-1}}
        pipeline.append(sort)
        limit = {'$limit':1}
        pipeline.append(limit)
        project = {'$project':{'date':1, '_id':0}}
        pipeline.append(project)
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
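        # get_data presumably runs a MongoDB-style aggregation; the pipeline
        # returns at most the single most recent stored date, e.g.
        # [{'date': datetime.datetime(2015, 4, 18, 0, 0)}]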
        try:
            start_date = data[0]['date']
        except (IndexError, KeyError, TypeError):
            self.logger.warning('Popularity needs to be initiated')
            self.initiate_db()
            return
        q = Queue.Queue()
        for i in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.insert_popularity_data, args=(i, q))
            worker.daemon = True
            worker.start()
        end_date = datetime_day(datetime.datetime.utcnow())
        # fetch popularity data
        t1 = datetime.datetime.utcnow()
        for date in daterange(start_date, end_date):
            q.put(date)
        q.join()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Updating Pop DB data took %s', str(td))

    def insert_dataset(self, dataset_name):
        """
        Fetch and store the full popularity time series for one dataset
        """
        api = 'getSingleDSstat'
        sitename = 'summary'
        name = dataset_name
        aggr = 'day'
        orderbys = ['totcpu', 'naccess']
        coll = 'dataset_popularity'
        for orderby in orderbys:
            params = {'sitename':sitename, 'name':name, 'aggr':aggr, 'orderby':orderby}
            json_data = self.pop_db.fetch(api=api, params=params)
            series = get_json(json_data, 'data')
            for pop_data in get_json(series, 'data'):
                date = pop_db_timestamp_to_datetime(pop_data[0])
                query = {'name':dataset_name, 'date':date}
                popularity_data = {'name':dataset_name, 'date':date}
                popularity_data[orderby] = pop_data[1]
                data = {'$set':popularity_data}
                self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
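The examples rely on several small date utilities from the project that are not shown: `datetime_day`, `datetime_to_string`, `daterange`, and `pop_db_timestamp_to_datetime`. Minimal sketches consistent with how they are used above; the 'YYYY-MM-DD' format follows the '2015-04-18' strings in Example #1, while the inclusive date range and the millisecond timestamp unit are assumptions:

    import datetime

    def datetime_day(datetime_):
        """Truncate a datetime to midnight."""
        return datetime_.replace(hour=0, minute=0, second=0, microsecond=0)

    def datetime_to_string(datetime_):
        """Render a datetime in the YYYY-MM-DD form the Pop DB API expects."""
        return datetime_.strftime('%Y-%m-%d')

    def daterange(start_date, end_date):
        """Yield one datetime per day from start_date through end_date (assumed inclusive)."""
        for n in range((end_date - start_date).days + 1):
            yield start_date + datetime.timedelta(days=n)

    def pop_db_timestamp_to_datetime(timestamp):
        """Convert a Pop DB timestamp (assumed ms since the epoch, UTC) to a day-truncated datetime."""
        return datetime_day(datetime.datetime.utcfromtimestamp(timestamp // 1000))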