import datetime
import logging

# DatasetManager, SiteManager, StorageManager, PopularityManager, PhEDExService,
# MITDBService, DeltaRanking and the helper functions used below (export_csv,
# weighted_choice, timestamp_to_datetime, datetime_day) are provided by the
# project's own modules and are assumed to be imported here.


class DataAnalysis(object):
    """
    DataAnalysis collects dataset data and exports it for visualization software,
    to better understand access patterns
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.datasets = DatasetManager(self.config)
        self.sites = SiteManager(self.config)
        self.storage = StorageManager(self.config)
        self.popularity = PopularityManager(self.config)

    def start(self):
        """
        Begin Data Analysis
        """
        t1 = datetime.datetime.utcnow()
        dataset_name = '/PAHighPt/HIRun2013-PromptReco-v1/RECO'
        self.initiate_data(dataset_name)
        self.export_data(dataset_name)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Data Analysis took %s', str(td))

    def initiate_data(self, dataset_name):
        """
        Initiate data about dataset(s)
        """
        coll = 'dataset_data'
        query = {'name': dataset_name}
        data = {'$set': {'name': dataset_name}}
        self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
        self.datasets.insert_phedex_data(dataset_name)
        self.datasets.insert_dbs_data(dataset_name)
        self.popularity.insert_dataset(dataset_name)

    def export_data(self, dataset_name):
        """
        Get data from DB and export to file for usage in visualization
        """
        # get data from DB
        coll = 'dataset_popularity'
        pipeline = list()
        match = {'$match': {'name': dataset_name}}
        pipeline.append(match)
        db_data = self.storage.get_data(coll=coll, pipeline=pipeline)
        headers = ('dataset_name', 'date', 'popularity')
        data = list()
        for data_entry in db_data:
            # popularity for the day is the product of accesses, CPU hours and users
            data.append((data_entry['name'],
                         data_entry['date'],
                         data_entry['n_accesses']*data_entry['n_cpus']*data_entry['n_users']))
        export_csv(headers=headers, data=data, file_name='single_dataset')
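
# NOTE: export_csv() is a project utility that is not defined in this section.
# The sketch below is a hypothetical stand-in, assuming it writes the header
# tuple followed by the data rows to '<file_name>.csv' in the working
# directory; the real helper may use a different path or delimiter.
import csv

def export_csv(headers=tuple(), data=list(), file_name='data'):
    """Write one header row followed by the data rows to <file_name>.csv"""
    with open('%s.csv' % file_name, 'w') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(headers)
        for row in data:
            writer.writerow(row)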
class Initiate(object):
    """
    Initiate Database
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.popularity = PopularityManager(self.config)

    def start(self):
        """
        Begin Initiating Database
        """
        t1 = datetime.datetime.utcnow()
        self.sites.initiate_db()
        self.datasets.initiate_db()
        self.popularity.initiate_db()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Initiate took %s', str(td))
class UpdateDB(object):
    """
    Update DB with new dataset and site data
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.storage = StorageManager(self.config)
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.popularity = PopularityManager(self.config)

    def start(self):
        """
        Begin Database Update
        """
        t1 = datetime.datetime.utcnow()
        self.sites.update_db()
        self.datasets.update_db()
        self.popularity.update_db()
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Update DB took %s', str(td))
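
# NOTE: hypothetical usage sketch, not part of the project. It only calls the
# entry points defined above; `config` would normally be parsed from the
# project's configuration file rather than passed as an empty dict.
def run_db_setup(config=dict()):
    """One-time database initialisation followed by a data refresh"""
    Initiate(config).start()   # create the site/dataset/popularity storage
    UpdateDB(config).start()   # pull in the latest dataset and site data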
class RockerBoard(object):
    """
    RockerBoard is a system balancing algorithm that uses popularity metrics to
    predict future popularity and makes the appropriate replications to keep the
    system balanced
    """
    def __init__(self, config=dict()):
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.phedex = PhEDExService(self.config)
        self.mit_db = MITDBService(self.config)
        self.datasets = DatasetManager(self.config)
        self.sites = SiteManager(self.config)
        self.storage = StorageManager(self.config)
        self.rankings = DeltaRanking(self.config)
        self.max_gb = int(self.config['rocker_board']['max_gb'])
        self.min_rank = float(self.config['rocker_board']['min_rank'])

    def start(self):
        """
        Begin Rocker Board Algorithm
        """
        t1 = datetime.datetime.utcnow()
        subscriptions = self.balance()
        for subscription in subscriptions:
            self.logger.info('site: %s\tdataset: %s', subscription[1], subscription[0])
        self.subscribe(subscriptions)
        t2 = datetime.datetime.utcnow()
        td = t2 - t1
        self.logger.info('Rocker Board took %s', str(td))

    def balance(self):
        """
        Balance system by creating new replicas based on popularity
        """
        subscriptions = list()
        dataset_rankings = self.rankings.dataset_rankings()
        site_rankings = self.rankings.site_rankings()
        subscribed_gb = 0
        while subscribed_gb < self.max_gb:
            # work on a copy so sites removed for this dataset are still
            # considered for the next one
            tmp_site_rankings = dict(site_rankings)
            dataset_name = weighted_choice(dataset_rankings)
            if (not dataset_name) or (dataset_rankings[dataset_name] < self.min_rank):
                break
            size_gb = self.datasets.get_size(dataset_name)
            # exclude sites that already host the dataset, lack storage, or have no rank
            unavailable_sites = set(self.datasets.get_sites(dataset_name))
            for site_name in tmp_site_rankings.keys():
                if (self.sites.get_available_storage(site_name) < size_gb) or (tmp_site_rankings[site_name] <= 0):
                    unavailable_sites.add(site_name)
            for site_name in unavailable_sites:
                try:
                    del tmp_site_rankings[site_name]
                except KeyError:
                    continue
            if not tmp_site_rankings:
                break
            site_name = weighted_choice(tmp_site_rankings)
            subscription = (dataset_name, site_name)
            subscriptions.append(subscription)
            subscribed_gb += size_gb
            avail_storage = self.sites.get_available_storage(site_name)
            self.logger.info('rank: %s\tsize: %.2f\tdataset: %s', dataset_rankings[dataset_name], size_gb, dataset_name)
            self.logger.info('rank: %s\tstorage: %d\tsite: %s', site_rankings[site_name], avail_storage, site_name)
            # scale the site's rank down in proportion to the storage the new
            # replica consumes; a site with no storage left gets rank 0
            new_avail_storage = avail_storage - self.datasets.get_size(dataset_name)
            if new_avail_storage <= 0:
                new_rank = 0.0
            else:
                new_rank = (site_rankings[site_name]/avail_storage)*new_avail_storage
            site_rankings[site_name] = new_rank
            del dataset_rankings[dataset_name]
        self.logger.info('Subscribed %dGB', subscribed_gb)
        return subscriptions

    def subscribe(self, subscriptions):
        """
        Make subscriptions to phedex
        subscriptions = [(dataset_name, site_name), ...]
        """
        # group the requested datasets by destination site
        new_subscriptions = dict()
        for dataset_name, site_name in subscriptions:
            new_subscriptions.setdefault(site_name, list()).append(dataset_name)
        for site_name, dataset_names in new_subscriptions.items():
            data = self.phedex.generate_xml(dataset_names)
            comments = ('This dataset is predicted to become popular and has therefore '
                        'been automatically replicated by cuadrnt')
            api = 'subscribe'
            params = [('node', site_name), ('data', data), ('level', 'dataset'),
                      ('move', 'n'), ('custodial', 'n'), ('group', 'AnalysisOps'),
                      ('request_only', 'n'), ('no_mail', 'n'), ('comments', comments)]
            json_data = self.phedex.fetch(api=api, params=params, method='post')
            # insert into db
            group_name = 'AnalysisOps'
            request_id = 0
            request_type = 0
            try:
                request = json_data['phedex']
                request_id = request['request_created'][0]['id']
                request_created = timestamp_to_datetime(request['request_timestamp'])
            except (KeyError, IndexError, TypeError):
                self.logger.warning('Subscription did not succeed\n\tSite: %s\n\tDatasets: %s',
                                    str(site_name), str(dataset_names))
                continue
            for dataset_name in dataset_names:
                coll = 'dataset_popularity'
                date = datetime_day(datetime.datetime.utcnow())
                pipeline = list()
                match = {'$match': {'name': dataset_name, 'date': date}}
                pipeline.append(match)
                project = {'$project': {'delta_popularity': 1, '_id': 0}}
                pipeline.append(project)
                data = self.storage.get_data(coll=coll, pipeline=pipeline)
                dataset_rank = data[0]['delta_popularity']
                query = ("INSERT INTO Requests(RequestId, RequestType, DatasetId, SiteId, GroupId, Rank, Date) "
                         "SELECT %s, %s, Datasets.DatasetId, Sites.SiteId, Groups.GroupId, %s, %s "
                         "FROM Datasets, Sites, Groups "
                         "WHERE Datasets.DatasetName=%s AND Sites.SiteName=%s AND Groups.GroupName=%s")
                values = (request_id, request_type, dataset_rank, request_created, dataset_name, site_name, group_name)
                self.mit_db.query(query=query, values=values, cache=False)