def test_pop_db_memory(self):
    "Test pop_db data memory usage"
    print ""
    pop_db = PopDBService(self.config)
    api = 'getDSdata'
    sitename = 'summary'
    aggr = 'day'
    n = 200000
    orderby = 'totcpu'
    tstart = datetime_to_string(datetime_day(datetime.datetime.utcnow() - datetime.timedelta(days=10)))
    tstop = datetime_to_string(datetime_day(datetime.datetime.utcnow()))
    params = {'sitename':sitename, 'tstart':tstart, 'tstop':tstop, 'aggr':aggr, 'n':n, 'orderby':orderby}
    pop_db_data = pop_db.fetch(api=api, params=params, cache=False)
    total_size = total_size_of(pop_db_data)
    logger.info('Total size of Pop DB data in memory is %d bytes (%dMB)', total_size, total_size/10**6)
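# The test above relies on a total_size_of() helper that is not shown in this
# section. A minimal sketch of such a helper, assuming it simply walks
# containers with sys.getsizeof (an illustration, not the project's actual
# implementation):
import sys

def total_size_of(obj, seen=None):
    """Recursively approximate the in-memory size of obj in bytes."""
    seen = seen if seen is not None else set()
    if id(obj) in seen:
        return 0
    seen.add(id(obj))
    size = sys.getsizeof(obj)
    if isinstance(obj, dict):
        size += sum(total_size_of(key, seen) + total_size_of(value, seen) for key, value in obj.items())
    elif isinstance(obj, (list, tuple, set, frozenset)):
        size += sum(total_size_of(item, seen) for item in obj)
    return size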
def get_site_popularity(self, site_name, date=datetime_day(datetime.datetime.utcnow())):
    """
    Get popularity for site
    """
    # get all datasets with a replica at the site
    coll = 'dataset_data'
    pipeline = list()
    match = {'$match':{'replicas':site_name}}
    pipeline.append(match)
    project = {'$project':{'name':1, '_id':0}}
    pipeline.append(project)
    data = self.storage.get_data(coll=coll, pipeline=pipeline)
    popularity = 0.0
    dataset_names = [dataset_data['name'] for dataset_data in data]
    # sum the popularity of all datasets with a replica at the site
    coll = 'dataset_rankings'
    pipeline = list()
    match = {'$match':{'date':date}}
    pipeline.append(match)
    match = {'$match':{'name':{'$in':dataset_names}}}
    pipeline.append(match)
    group = {'$group':{'_id':'$date', 'total_popularity':{'$sum':'$popularity'}}}
    pipeline.append(group)
    project = {'$project':{'total_popularity':1, '_id':0}}
    pipeline.append(project)
    data = self.storage.get_data(coll=coll, pipeline=pipeline)
    try:
        popularity = data[0]['total_popularity']
    except:
        popularity = 0.0
    return popularity
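# For reference, the pipeline assembled step by step above expands to a literal
# like the following (the date and dataset names are hypothetical placeholders):
import datetime

example_pipeline = [
    {'$match': {'date': datetime.datetime(2016, 5, 1)}},
    {'$match': {'name': {'$in': ['/PrimaryA/ProcB/AOD', '/PrimaryC/ProcD/AOD']}}},
    {'$group': {'_id': '$date', 'total_popularity': {'$sum': '$popularity'}}},
    {'$project': {'total_popularity': 1, '_id': 0}},
]
# i.e. keep only rankings for the given date and for datasets replicated at the
# site, then sum their popularity into a single total_popularity value.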
def update_db(self):
    """
    Fetch latest popularity data not in database
    """
    # get dates
    coll = 'dataset_popularity'
    pipeline = list()
    sort = {'$sort':{'date':-1}}
    pipeline.append(sort)
    limit = {'$limit':1}
    pipeline.append(limit)
    project = {'$project':{'date':1, '_id':0}}
    pipeline.append(project)
    data = self.storage.get_data(coll=coll, pipeline=pipeline)
    try:
        start_date = data[0]['date']
    except:
        self.logger.warning('Popularity needs to be initiated')
        self.initiate_db()
        return
    q = Queue.Queue()
    for i in range(self.MAX_THREADS):
        worker = threading.Thread(target=self.insert_popularity_data, args=(i, q))
        worker.daemon = True
        worker.start()
    start_date = start_date + datetime.timedelta(days=1)
    end_date = datetime_day(datetime.datetime.utcnow())
    # fetch popularity data
    t1 = datetime.datetime.utcnow()
    for date in daterange(start_date, end_date):
        q.put(date)
    q.join()
    t2 = datetime.datetime.utcnow()
    td = t2 - t1
    self.logger.info('Updating Pop DB data took %s', str(td))
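# update_db() and initiate_db() feed dates to a pool of daemon threads through a
# Queue and wait on q.join(). insert_popularity_data() is not shown in this
# section; a minimal sketch of the consumer loop such a worker would need so
# that q.join() can return (hypothetical, for illustration only):
def insert_popularity_data(self, thread_id, q):
    while True:
        date = q.get()
        try:
            pass  # fetch Pop DB data for this date and insert it into storage
        finally:
            q.task_done()  # every put() must be matched by task_done() for q.join() to unblock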
def get_site_rankings(self, date=datetime_day(datetime.datetime.utcnow())):
    """
    Generate site rankings
    """
    # get all sites which can be replicated to
    site_names = self.sites.get_available_sites()
    site_rankings = dict()
    for site_name in site_names:
        # get popularity
        popularity = self.get_site_popularity(site_name, date)
        # get cpu and storage (performance)
        performance = self.sites.get_performance(site_name)
        # get available storage
        available_storage_tb = self.sites.get_available_storage(site_name)/10**3
        if available_storage_tb <= 0:
            available_storage_tb = 0
        else:
            available_storage_tb = 1
        # calculate rank
        try:
            rank = (performance*available_storage_tb)/popularity
        except:
            rank = 0.0
        # store into dict
        site_rankings[site_name] = rank
        # insert into database
        coll = 'site_rankings'
        query = {'name':site_name, 'date':date}
        data = {'$set':{'name':site_name, 'date':date, 'rank':rank, 'popularity':popularity}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
    return site_rankings
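# Worked example of the rank formula above with illustrative numbers: the rank
# grows with the site's performance and shrinks with the popularity already
# hosted there, and collapses to 0 when no storage is free.
performance = 100.0        # site performance metric (cpu/storage)
available_storage_tb = 1   # binarised above: 1 if any storage is free, else 0
popularity = 250.0         # summed popularity of datasets already at the site
rank = (performance * available_storage_tb) / popularity   # 0.4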
def get_site_popularity(self, site_name):
    """
    Get delta popularity for site
    """
    date = datetime_day(datetime.datetime.utcnow())
    # get all datasets with a replica at the site
    coll = 'dataset_data'
    pipeline = list()
    match = {'$match':{'replicas':site_name}}
    pipeline.append(match)
    project = {'$project':{'name':1, '_id':0}}
    pipeline.append(project)
    data = self.storage.get_data(coll=coll, pipeline=pipeline)
    popularity = 0.0
    for dataset in data:
        dataset_name = dataset['name']
        # get the delta popularity of the dataset for today and add it to the site total
        coll = 'dataset_popularity'
        pipeline = list()
        match = {'$match':{'name':dataset_name, 'date':date}}
        pipeline.append(match)
        project = {'$project':{'delta_popularity':1, '_id':0}}
        pipeline.append(project)
        pop_data = self.storage.get_data(coll=coll, pipeline=pipeline)
        popularity += pop_data[0]['delta_popularity']
    return popularity
def insert_cache(self, coll, api, params=dict(), data=dict()):
    """
    Insert data into collection
    Collection should be the service it is caching data for
    Use update to have the possibility to force cache update
    """
    result = list()
    db_coll = self.db[coll]
    object_id = get_object_id(str(api)+str(params))
    data['_id'] = object_id
    data['datetime'] = datetime_day(datetime.datetime.utcnow())
    for i in range(2):
        try:
            result = db_coll.replace_one({'_id':object_id}, data, upsert=True)
        except DocumentTooLarge:
            self.logger.warning('DocumentTooLarge error for %s api %s', coll, api)
            break
        except AutoReconnect:
            call(["start_mongodb", self.OPT_PATH])
            continue
        else:
            if (result.modified_count == 0) and (not result.upserted_id):
                self.logger.debug('Failed to insert %s cache for api %s\n\tData: %s', coll, api, str(data))
            break
    else:
        self.logger.error("Couldn't establish connection to mongodb server %s", self.URI)
    return result
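# insert_cache() keys each cached response on a deterministic _id built from the
# api name and its parameters, so repeated fetches with identical arguments
# overwrite the same document. A standalone sketch of the pymongo upsert
# semantics it relies on (assumes a local MongoDB; names and values are
# illustrative, not the project's real cache keys):
import datetime
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
cache = client['test_cache']['pop_db']
doc = {'_id': 'getDSdata-example-key', 'datetime': datetime.datetime(2016, 5, 1), 'data': {}}
result = cache.replace_one({'_id': doc['_id']}, doc, upsert=True)
# first call: result.upserted_id is set; later calls: matched_count is 1 and
# modified_count is 0 when the stored document is already identical, which is
# exactly the condition insert_cache() logs at debug level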
def site_rankings(self):
    """
    Generate site rankings
    """
    date = datetime_day(datetime.datetime.utcnow())
    # get all sites which can be replicated to
    site_names = self.sites.get_available_sites()
    site_rankings = dict()
    for site_name in site_names:
        # get popularity
        popularity = float(self.get_site_popularity(site_name))
        # get cpu and storage (performance)
        performance = float(self.sites.get_performance(site_name))
        # get available storage
        available_storage = float(self.sites.get_available_storage(site_name))
        if available_storage <= 0:
            available_storage = 0.0
        # insert into database
        coll = 'site_popularity'
        query = {'name':site_name, 'date':date}
        data = {'$set':{'name':site_name, 'date':date, 'delta_popularity':popularity, 'performance':performance, 'available_storage':available_storage}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
        # calculate rank
        try:
            rank = (performance*available_storage)/popularity
        except:
            rank = 0.0
        # store into dict
        site_rankings[site_name] = rank
        # insert into database
        coll = 'site_rankings'
        query = {'name':site_name, 'date':date}
        data = {'$set':{'name':site_name, 'date':date, 'delta_rank':rank}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
    return site_rankings
def subscribe(self, subscriptions):
    """
    Make subscriptions to phedex
    subscriptions = [(dataset_name, site_name), ...]
    """
    new_subscriptions = dict()
    for subscription in subscriptions:
        dataset_name = subscription[0]
        site_name = subscription[1]
        try:
            new_subscriptions[site_name].append(dataset_name)
        except:
            new_subscriptions[site_name] = list()
            new_subscriptions[site_name].append(dataset_name)
    for site_name, dataset_names in new_subscriptions.items():
        data = self.phedex.generate_xml(dataset_names)
        comments = "This dataset is predicted to become popular and has therefore been automatically replicated by cuadrnt"
        api = "subscribe"
        params = [("node", site_name), ("data", data), ("level", "dataset"), ("move", "n"), ("custodial", "n"), ("group", "AnalysisOps"), ("request_only", "n"), ("no_mail", "n"), ("comments", comments)]
        json_data = self.phedex.fetch(api=api, params=params, method="post")
        # insert into db
        group_name = "AnalysisOps"
        request_id = 0
        request_type = 0
        try:
            request = json_data["phedex"]
            request_id = request["request_created"][0]["id"]
            request_created = timestamp_to_datetime(request["request_timestamp"])
        except:
            self.logger.warning("Subscription did not succeed\n\tSite:%s\n\tDatasets: %s", str(site_name), str(dataset_names))
            continue
        for dataset_name in dataset_names:
            coll = "dataset_rankings"
            date = datetime_day(datetime.datetime.utcnow())
            pipeline = list()
            match = {"$match": {"name": dataset_name, "date": date}}
            pipeline.append(match)
            project = {"$project": {"delta_rank": 1, "_id": 0}}
            pipeline.append(project)
            data = self.storage.get_data(coll=coll, pipeline=pipeline)
            dataset_rank = data[0]["delta_rank"]
            query = "INSERT INTO Requests(RequestId, RequestType, DatasetId, SiteId, GroupId, Rank, Date) SELECT %s, %s, Datasets.DatasetId, Sites.SiteId, Groups.GroupId, %s, %s FROM Datasets, Sites, Groups WHERE Datasets.DatasetName=%s AND Sites.SiteName=%s AND Groups.GroupName=%s"
            values = (request_id, request_type, dataset_rank, request_created, dataset_name, site_name, group_name)
            self.mit_db.query(query=query, values=values, cache=False)
def initiate_db(self):
    """
    Collect popularity data
    """
    q = Queue.Queue()
    for i in range(self.MAX_THREADS):
        worker = threading.Thread(target=self.insert_popularity_data, args=(i, q))
        worker.daemon = True
        worker.start()
    start_date = datetime_day(datetime.datetime.utcnow() - datetime.timedelta(days=90))
    end_date = datetime_day(datetime.datetime.utcnow())
    # fetch popularity data
    t1 = datetime.datetime.utcnow()
    for date in daterange(start_date, end_date):
        q.put(date)
    q.join()
    t2 = datetime.datetime.utcnow()
    td = t2 - t1
    self.logger.info('Inserting Pop DB data took %s', str(td))
def get_dataset_popularity(self, dataset_name):
    """
    Get delta popularity for dataset
    """
    coll = 'dataset_popularity'
    # popularity accumulated over the previous week (days -14 to -8)
    start_date = datetime_day(datetime.datetime.utcnow()) - datetime.timedelta(days=14)
    end_date = datetime_day(datetime.datetime.utcnow()) - datetime.timedelta(days=8)
    pipeline = list()
    match = {'$match':{'name':dataset_name}}
    pipeline.append(match)
    match = {'$match':{'date':{'$gte':start_date, '$lte':end_date}}}
    pipeline.append(match)
    group = {'$group':{'_id':'$name', 'old_popularity':{'$sum':{'$multiply':['$n_accesses', '$n_cpus']}}}}
    pipeline.append(group)
    data = self.storage.get_data(coll=coll, pipeline=pipeline)
    try:
        old_pop = float(data[0]['old_popularity'])
    except:
        old_pop = 0.0
    # popularity accumulated over the most recent week (days -7 to -1)
    start_date = datetime_day(datetime.datetime.utcnow()) - datetime.timedelta(days=7)
    end_date = datetime_day(datetime.datetime.utcnow()) - datetime.timedelta(days=1)
    pipeline = list()
    match = {'$match':{'name':dataset_name}}
    pipeline.append(match)
    match = {'$match':{'date':{'$gte':start_date, '$lte':end_date}}}
    pipeline.append(match)
    group = {'$group':{'_id':'$name', 'new_popularity':{'$sum':{'$multiply':['$n_accesses', '$n_cpus']}}}}
    pipeline.append(group)
    data = self.storage.get_data(coll=coll, pipeline=pipeline)
    try:
        new_pop = float(data[0]['new_popularity'])
    except:
        new_pop = 0.0
    delta_popularity = new_pop - old_pop
    if delta_popularity > 1:
        delta_popularity = log(delta_popularity)
    else:
        delta_popularity = 0.0
    size_gb = self.datasets.get_size(dataset_name)
    return delta_popularity/size_gb
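# Worked example of the delta popularity computed above (illustrative numbers;
# assumes log is math.log, as used in the function):
from math import log

old_pop = 1000.0                      # sum of n_accesses*n_cpus over days -14..-8
new_pop = 5000.0                      # sum of n_accesses*n_cpus over days -7..-1
delta_popularity = new_pop - old_pop  # 4000.0
delta_popularity = log(delta_popularity) if delta_popularity > 1 else 0.0   # ~8.29
size_gb = 2000.0
delta_popularity / size_gb            # ~0.0041: normalised per GB of dataset size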
def start(self, date=datetime_day(datetime.datetime.utcnow())):
    """
    Begin Rocker Board Algorithm
    """
    t1 = datetime.datetime.utcnow()
    # Get goals
    dataset_rankings = self.rankings.get_dataset_rankings(date)
    site_rankings = self.rankings.get_site_rankings(date)
    self.change_dataset_rankings(dataset_rankings)
    subscriptions = self.replicate(dataset_rankings, site_rankings)
    self.logger.info('SUBSCRIPTIONS')
    for subscription in subscriptions:
        self.logger.info('site: %s\tdataset: %s', subscription[1], subscription[0])
    # self.subscribe(subscriptions)
    t2 = datetime.datetime.utcnow()
    td = t2 - t1
    self.logger.info('Rocker Board took %s', str(td))
def dataset_rankings(self):
    """
    Generate dataset rankings
    """
    date = datetime_day(datetime.datetime.utcnow())
    dataset_names = self.datasets.get_db_datasets()
    dataset_rankings = dict()
    coll = 'dataset_popularity'
    for dataset_name in dataset_names:
        delta_popularity = self.get_dataset_popularity(dataset_name)
        # insert into database
        query = {'name':dataset_name, 'date':date}
        data = {'$set':{'delta_popularity':delta_popularity}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
        # store into dict
        dataset_rankings[dataset_name] = delta_popularity
    return dataset_rankings
def insert_dbs_data(self, dataset_name):
    """
    Fetch dbs data about dataset and insert into database
    """
    api = 'datasets'
    params = {'dataset':dataset_name, 'detail':True, 'dataset_access_type':'*'}
    dbs_data = self.dbs.fetch(api=api, params=params)
    dataset_data = get_json(dbs_data, 'data')[0]
    ds_name = get_json(dataset_data, 'primary_ds_name')
    physics_group = get_json(dataset_data, 'physics_group_name')
    data_tier = get_json(dataset_data, 'data_tier_name')
    creation_date = datetime_day(timestamp_to_datetime(get_json(dataset_data, 'creation_date')))
    ds_type = get_json(dataset_data, 'primary_ds_type')
    coll = 'dataset_data'
    query = {'name':dataset_name}
    data = {'$set':{'ds_name':ds_name, 'physics_group':physics_group, 'data_tier':data_tier, 'creation_date':creation_date, 'ds_type':ds_type}}
    self.storage.update_data(coll=coll, query=query, data=data, upsert=False)
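# insert_dbs_data() reads only a handful of fields from the first record of the
# DBS 'datasets' API response; an illustrative record shape (field values are
# made up, field names are the ones the code actually reads):
example_dbs_record = {
    'primary_ds_name': 'SingleMuon',
    'physics_group_name': 'Muon',
    'data_tier_name': 'AOD',
    'creation_date': 1443657600,   # unix timestamp; truncated to a day via datetime_day()
    'primary_ds_type': 'data',
}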
def get_dataset_rankings(self, date=datetime_day(datetime.datetime.utcnow())):
    """
    Generate dataset rankings
    """
    self.dataset_popularity = dict()
    dataset_names = self.datasets.get_db_datasets()
    q = Queue.Queue()
    for i in range(self.MAX_THREADS):
        worker = threading.Thread(target=self.get_dataset_popularity, args=(q,))
        worker.daemon = True
        worker.start()
    # self.dataset_features = self.popularity.get_features(dataset_names, date)
    # self.dataset_tiers = self.datasets.get_data_tiers(dataset_names)
    for dataset_name in dataset_names:
        q.put((dataset_name, date))
    q.join()
    dataset_rankings = self.normalize_popularity(date)
    return dataset_rankings
def subscribe(self, subscriptions):
    """
    Make subscriptions to phedex
    subscriptions = [(dataset_name, site_name), ...]
    """
    new_subscriptions = dict()
    for subscription in subscriptions:
        dataset_name = subscription[0]
        site_name = subscription[1]
        try:
            new_subscriptions[site_name].append(dataset_name)
        except:
            new_subscriptions[site_name] = list()
            new_subscriptions[site_name].append(dataset_name)
    for site_name, dataset_names in new_subscriptions.items():
        data = self.phedex.generate_xml(dataset_names)
        comments = 'This dataset is predicted to become popular and has therefore been automatically replicated by cuadrnt'
        api = 'subscribe'
        params = [('node', site_name), ('data', data), ('level', 'dataset'), ('move', 'n'), ('custodial', 'n'), ('group', 'AnalysisOps'), ('request_only', 'n'), ('no_mail', 'n'), ('comments', comments)]
        json_data = self.phedex.fetch(api=api, params=params, method='post')
        # insert into db
        group_name = 'AnalysisOps'
        request_id = 0
        request_type = 0
        try:
            request = json_data['phedex']
            request_id = request['request_created'][0]['id']
            request_created = timestamp_to_datetime(request['request_timestamp'])
        except:
            self.logger.warning('Subscription did not succeed\n\tSite:%s\n\tDatasets: %s', str(site_name), str(dataset_names))
            continue
        for dataset_name in dataset_names:
            coll = 'dataset_popularity'
            date = datetime_day(datetime.datetime.utcnow())
            pipeline = list()
            match = {'$match':{'name':dataset_name, 'date':date}}
            pipeline.append(match)
            project = {'$project':{'delta_popularity':1, '_id':0}}
            pipeline.append(project)
            data = self.storage.get_data(coll=coll, pipeline=pipeline)
            dataset_rank = data[0]['delta_popularity']
            query = "INSERT INTO Requests(RequestId, RequestType, DatasetId, SiteId, GroupId, Rank, Date) SELECT %s, %s, Datasets.DatasetId, Sites.SiteId, Groups.GroupId, %s, %s FROM Datasets, Sites, Groups WHERE Datasets.DatasetName=%s AND Sites.SiteName=%s AND Groups.GroupName=%s"
            values = (request_id, request_type, dataset_rank, request_created, dataset_name, site_name, group_name)
            self.mit_db.query(query=query, values=values, cache=False)
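# Both subscribe() variants first regroup the flat (dataset_name, site_name)
# pairs by destination site so that a single PhEDEx request is made per site.
# With hypothetical inputs:
subscriptions = [('/PrimA/ProcB/AOD', 'T2_US_Nebraska'),
                 ('/PrimC/ProcD/AOD', 'T2_US_Nebraska'),
                 ('/PrimE/ProcF/AOD', 'T1_US_FNAL_Disk')]
# the try/except grouping loop yields:
# {'T2_US_Nebraska': ['/PrimA/ProcB/AOD', '/PrimC/ProcD/AOD'],
#  'T1_US_FNAL_Disk': ['/PrimE/ProcF/AOD']}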
def start(self, date=datetime_day(datetime.datetime.utcnow())):
    """
    Begin Rocker Board Algorithm
    """
    t1 = datetime.datetime.utcnow()
    # Get goals
    dataset_rankings = self.rankings.get_dataset_rankings(date)
    site_rankings = self.rankings.get_site_rankings(date)
    self.change_dataset_rankings(dataset_rankings)
    subscriptions = self.replicate(dataset_rankings, site_rankings)
    self.logger.info("SUBSCRIPTIONS")
    for subscription in subscriptions:
        self.logger.info("site: %s\tdataset: %s", subscription[1], subscription[0])
    # site_storage = self.rankings.get_site_storage_rankings(subscriptions)
    # deletions = self.clean(dataset_rankings, site_storage)
    # self.logger.info('DELETIONS')
    # for deletion in deletions:
    #     self.logger.info('site: %s\tdataset: %s', deletion[1], deletion[0])
    # self.delete(deletions)
    self.subscribe(subscriptions)
    # self.datasets.update_replicas(subscriptions, deletions)
    t2 = datetime.datetime.utcnow()
    td = t2 - t1
    self.logger.info("Rocker Board took %s", str(td))
def test_datetime_day(self):
    "Test datetime_day function"
    datetime_ = datetime.datetime(1987, 10, 27, 3, 6, 9)
    expected = datetime.datetime(year=1987, month=10, day=27)
    result = datetime_day(datetime_)
    self.assertEqual(result, expected)
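# The test above pins down the expected behaviour of datetime_day(): truncate a
# datetime to midnight of the same day. A minimal sketch consistent with that
# expectation (the project's actual implementation may differ):
import datetime

def datetime_day(datetime_):
    """Return the given datetime truncated to 00:00:00 of the same day."""
    return datetime.datetime(year=datetime_.year, month=datetime_.month, day=datetime_.day)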