def initiate_db(self):
    """
    Initiate dataset data in database
    Get general data and popularity data from beginning
    """
    # Work queue feeding the insert_dataset_data() worker threads.
    q = Queue.Queue()
    for i in range(self.MAX_THREADS):
        worker = threading.Thread(target=self.insert_dataset_data, args=(i, q))
        # Daemon threads: they loop forever on the queue and must not
        # block interpreter shutdown.
        worker.daemon = True
        worker.start()
    # Only fetch replicas hosted at currently active sites.
    active_sites = self.sites.get_active_sites()
    api = 'blockreplicas'
    # Ask PhEDEx for complete AnalysisOps block replicas, grouped per dataset.
    params = [('node', active_sites), ('create_since', 0.0), ('complete', 'y'), ('dist_complete', 'y'), ('group', 'AnalysisOps'), ('show_dataset', 'y')]
    t1 = datetime.datetime.utcnow()
    phedex_data = self.phedex.fetch(api=api, params=params)
    t2 = datetime.datetime.utcnow()
    td = t2 - t1
    self.logger.info('Call to PhEDEx took %s', str(td))
    count = 1
    t1 = datetime.datetime.utcnow()
    # Enqueue every dataset entry from the PhEDEx response; workers do
    # the actual database insertion concurrently.
    for dataset_data in get_json(get_json(phedex_data, 'phedex'), 'dataset'):
        q.put((dataset_data, count))
        count += 1
    # Block until every queued dataset has been processed (task_done()
    # is called by the workers).
    q.join()
    t2 = datetime.datetime.utcnow()
    td = t2 - t1
    self.logger.info('Inserting dataset data took %s', str(td))
    self.logger.info('Done inserting datasets into DB')
def test_get_json(self):
    """
    Test get_json function.

    Checks both the missing-field case (empty list expected) and the
    present-field case (stored value returned unchanged).
    """
    json_data = {'foo': [{'bar': 1}, {'bar': 2}]}
    # Missing field: get_json should return an empty list, not raise.
    field = 'foobar'
    expected = list()
    result = get_json(json_data, field)
    self.assertEqual(result, expected)
    # Present field: the stored value should come back unchanged.
    field = 'foo'
    expected = [{'bar': 1}, {'bar': 2}]
    result = get_json(json_data, field)
    # BUG FIX: the second result was previously computed but never
    # asserted, so this half of the test could not fail.
    self.assertEqual(result, expected)
def test_get_json(self):
    """
    Test get_json function.

    Checks both the missing-field case (empty list expected) and the
    present-field case (stored value returned unchanged).
    """
    json_data = {'foo':[{'bar':1}, {'bar':2}]}
    # Missing field: get_json should return an empty list, not raise.
    field = 'foobar'
    expected = list()
    result = get_json(json_data, field)
    self.assertEqual(result, expected)
    # Present field: the stored value should come back unchanged.
    field = 'foo'
    expected = [{'bar':1}, {'bar':2}]
    result = get_json(json_data, field)
    # BUG FIX: the second result was previously computed but never
    # asserted, so this half of the test could not fail.
    self.assertEqual(result, expected)
def get_replicas(self, dataset_data):
    """
    Return the list of site names hosting at least one file of any
    block of the dataset described by dataset_data.

    Note: the previous docstring called this a "generator function",
    which was wrong — it builds and returns a plain list. A site may
    appear more than once if it hosts files from several blocks
    (preserved behavior; callers may rely on it).
    """
    replicas = list()
    for block_data in get_json(dataset_data, 'block'):
        for replica_data in get_json(block_data, 'replica'):
            # Only count replicas that actually hold files.
            if get_json(replica_data, 'files') > 0:
                replicas.append(get_json(replica_data, 'node'))
    return replicas
def update_db(self): """ Get datasets currently in AnalysisOps and compare to database Deactivate removed datasets and insert new Update replicas """ # get all datasets in database dataset_names = self.get_db_datasets() dataset_names = set(dataset_names) # get all active sites, only fetch replicas from these active_sites = self.sites.get_active_sites() api = 'blockreplicas' params = [('node', active_sites), ('create_since', 0.0), ('complete', 'y'), ('group', 'AnalysisOps'), ('show_dataset', 'y')] t1 = datetime.datetime.utcnow() phedex_data = self.phedex.fetch(api=api, params=params) t2 = datetime.datetime.utcnow() td = t2 - t1 self.logger.info('Call to PhEDEx took %s', str(td)) current_datasets = set() q = Queue.Queue() for i in range(self.MAX_THREADS): worker = threading.Thread(target=self.insert_dataset_data, args=(i, q)) worker.daemon = True worker.start() count = 1 t1 = datetime.datetime.utcnow() for dataset_data in get_json(get_json(phedex_data, 'phedex'), 'dataset'): dataset_name = get_json(dataset_data, 'name') current_datasets.add(dataset_name) if dataset_name not in dataset_names: # this is a new dataset which need to be inserted into the database q.put((dataset_data, count)) count += 1 else: # update replicas replicas = self.get_replicas(dataset_data) coll = 'dataset_data' query = {'name':dataset_name} data = {'$set':{'replicas':replicas}} data = self.storage.update_data(coll=coll, query=query, data=data) q.join() deprecated_datasets = dataset_names - current_datasets for dataset_name in deprecated_datasets: self.remove_dataset(dataset_name) t2 = datetime.datetime.utcnow() td = t2 - t1 self.logger.info('Updating dataset data took %s', str(td)) self.logger.info('Done updating datasets in DB')
def insert_dataset_data(self, i, q):
    """
    Worker loop: insert a new dataset into the database and initiate
    all of its data.

    Runs forever as a daemon thread, pulling (dataset_data, count)
    tuples from the queue q. On failure the partially inserted dataset
    is deleted so the database is not left with incomplete entries.
    """
    while True:
        data = q.get()
        dataset_data = data[0]
        count = data[1]
        self.logger.debug('Inserting dataset number %d', count)
        dataset_name = get_json(dataset_data, 'name')
        coll = 'dataset_data'
        # Upsert a stub document first so subsequent updates can match it.
        query = {'name':dataset_name}
        data = {'$set':{'name':dataset_name}}
        data = self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
        try:
            self.insert_phedex_data(dataset_name)
            self.insert_dbs_data(dataset_name)
            replicas = self.get_replicas(dataset_data)
            query = {'name':dataset_name}
            data = {'$set':{'name':dataset_name, 'replicas':replicas}}
            data = self.storage.update_data(coll=coll, query=query, data=data)
        except Exception:
            # BUG FIX: was a bare "except:" with no logging, which
            # silently swallowed every error (including SystemExit).
            # Log with traceback and roll back the partial insert.
            self.logger.exception('Failed to insert data for dataset %s, removing it', dataset_name)
            coll = 'dataset_data'
            query = {'name':dataset_name}
            self.storage.delete_data(coll=coll, query=query)
        # Always acknowledge the task so q.join() in the producer returns.
        q.task_done()
def update_db(self):
    """
    Initiate site data in database

    Fetch the Detox SitesInfo.txt summary from IntelROCCS and insert
    one entry per site found in its 'data' field.
    """
    sites_info = self.intelroccs.fetch(api='Detox', params='SitesInfo.txt', secure=False)
    for site_entry in get_json(sites_info, 'data'):
        self.insert_site_data(site_entry)
def insert_dataset(self, dataset_name):
    """
    Fetch all popularity data for dataset.

    Queries the popularity DB once per ordering metric ('totcpu',
    'naccess') with daily aggregation and upserts one document per
    (dataset, date) into the dataset_popularity collection, adding the
    metric value as a field on that document.
    """
    api = 'getSingleDSstat'
    sitename = 'summary'
    name = dataset_name
    aggr = 'day'
    orderbys = ['totcpu', 'naccess']
    coll = 'dataset_popularity'
    for orderby in orderbys:
        params = {'sitename':sitename, 'name':name, 'aggr':aggr, 'orderby':orderby}
        json_data = self.pop_db.fetch(api=api, params=params)
        data = get_json(json_data, 'data')
        for pop_data in get_json(data, 'data'):
            # pop_data is a (timestamp, value) pair.
            date = pop_db_timestamp_to_datetime(pop_data[0])
            # BUG FIX: the query previously used the key 'data' while
            # the stored document uses 'date', so the upsert never
            # matched an existing document and duplicates accumulated.
            query = {'name':dataset_name, 'date':date}
            popularity_data = {'name':dataset_name, 'date':date}
            popularity_data[orderby] = pop_data[1]
            data = {'$set':popularity_data}
            self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
def get_replicas(self, dataset_data):
    """
    Return the list of sites hosting a complete replica of the dataset.

    Sums the per-site file counts over all blocks; a site qualifies
    only when its total equals the dataset's total file count as
    reported by get_n_files(). (The previous docstring called this a
    "generator function", which was wrong — it returns a plain list.)
    """
    replicas_check = dict()
    dataset_name = get_json(dataset_data, 'name')
    for block_data in get_json(dataset_data, 'block'):
        for replica_data in get_json(block_data, 'replica'):
            node = get_json(replica_data, 'node')
            # BUG FIX (idiom): was a bare "except:" handling the
            # first-insert KeyError, which would also mask unrelated
            # errors from get_json; dict.get() does the same job safely.
            replicas_check[node] = replicas_check.get(node, 0) + get_json(replica_data, 'files')
    replicas = list()
    n_files = self.get_n_files(dataset_name)
    for site, site_files in replicas_check.items():
        if site_files == n_files:
            replicas.append(site)
    return replicas
def insert_dbs_data(self, dataset_name):
    """
    Fetch dbs data about dataset and insert into database

    Extracts the primary dataset name, physics group, data tier,
    creation date (truncated to the day) and primary dataset type from
    the first DBS record and stores them on the dataset_data document.
    """
    dbs_data = self.dbs.fetch(api='datasets', params={'dataset':dataset_name, 'detail':True, 'dataset_access_type':'*'})
    # DBS returns a list of records; only the first one is relevant.
    record = get_json(dbs_data, 'data')[0]
    fields = {
        'ds_name': get_json(record, 'primary_ds_name'),
        'physics_group': get_json(record, 'physics_group_name'),
        'data_tier': get_json(record, 'data_tier_name'),
        'creation_date': datetime_day(timestamp_to_datetime(get_json(record, 'creation_date'))),
        'ds_type': get_json(record, 'primary_ds_type'),
    }
    # Update only — the dataset document must already exist (upsert=False).
    self.storage.update_data(coll='dataset_data', query={'name':dataset_name}, data={'$set':fields}, upsert=False)
def insert_phedex_data(self, dataset_name):
    """
    Fetch phedex data about dataset and insert into database

    Sums 'bytes' and 'files' over every block of the dataset as
    reported by the PhEDEx 'data' API and stores the totals on the
    dataset_data document.
    """
    params = {'dataset':dataset_name, 'level':'block', 'create_since':0.0}
    phedex_data = self.phedex.fetch(api='data', params=params)
    # Navigate phedex -> dbs[0] -> dataset[0] to reach the block list.
    dataset_record = get_json(get_json(get_json(phedex_data, 'phedex'), 'dbs')[0], 'dataset')[0]
    total_bytes, total_files = 0, 0
    for block in get_json(dataset_record, 'block'):
        total_bytes += get_json(block, 'bytes')
        total_files += get_json(block, 'files')
    # Update only — the dataset document must already exist (upsert=False).
    self.storage.update_data(coll='dataset_data', query={'name':dataset_name}, data={'$set':{'size_bytes':total_bytes, 'n_files':total_files}}, upsert=False)