Exemplo n.º 1
0
 def __init__(self, config=None):
     """
     Set up the logger and the storage, site, dataset and popularity managers

     config -- settings mapping handed unchanged to every manager; a new
               empty dict is used when it is omitted
     """
     # A dict() default is built once, when the function is defined, and
     # would be shared by every instance created without a config
     if config is None:
         config = dict()
     self.logger = logging.getLogger(__name__)
     self.config = config
     # every manager is constructed from the same config object
     self.storage = StorageManager(self.config)
     self.sites = SiteManager(self.config)
     self.datasets = DatasetManager(self.config)
     self.popularity = PopularityManager(self.config)
Exemplo n.º 2
0
 def __init__(self, config=None):
     """
     Set up the logger, the managers and the ranking limits

     config -- settings mapping handed unchanged to every manager. It must
               provide config['rocker_board']['max_replicas'] and
               config['threading']['max_threads'], both converted with int().
               A new empty dict is used when config is omitted, in which
               case those lookups fail exactly as they did with the old
               empty default.
     """
     # A dict() default is built once, when the function is defined, and
     # would be shared by every instance created without a config
     if config is None:
         config = dict()
     self.logger = logging.getLogger(__name__)
     self.config = config
     # every manager is constructed from the same config object
     self.sites = SiteManager(self.config)
     self.datasets = DatasetManager(self.config)
     self.popularity = PopularityManager(self.config)
     self.storage = StorageManager(self.config)
     self.max_replicas = int(self.config['rocker_board']['max_replicas'])
     self.MAX_THREADS = int(self.config['threading']['max_threads'])
     # starts empty; nothing in this constructor fills it
     self.dataset_popularity = dict()
Exemplo n.º 3
0
 def __init__(self, config=None):
     """
     Set up the logger and the storage, site, dataset and popularity managers

     config -- settings mapping handed unchanged to every manager; a new
               empty dict is used when it is omitted
     """
     # A dict() default is built once, when the function is defined, and
     # would be shared by every instance created without a config
     if config is None:
         config = dict()
     self.logger = logging.getLogger(__name__)
     self.config = config
     # every manager is constructed from the same config object
     self.storage = StorageManager(self.config)
     self.sites = SiteManager(self.config)
     self.datasets = DatasetManager(self.config)
     self.popularity = PopularityManager(self.config)
Exemplo n.º 4
0
 def __init__(self, config=None):
     """
     Set up the logger, the managers and the ranking limits

     config -- settings mapping handed unchanged to every manager. It must
               provide config['rocker_board']['max_replicas'] and
               config['threading']['max_threads'], both converted with int().
               A new empty dict is used when config is omitted, in which
               case those lookups fail exactly as they did with the old
               empty default.
     """
     # A dict() default is built once, when the function is defined, and
     # would be shared by every instance created without a config
     if config is None:
         config = dict()
     self.logger = logging.getLogger(__name__)
     self.config = config
     # every manager is constructed from the same config object
     self.sites = SiteManager(self.config)
     self.datasets = DatasetManager(self.config)
     self.popularity = PopularityManager(self.config)
     self.storage = StorageManager(self.config)
     self.max_replicas = int(self.config['rocker_board']['max_replicas'])
     self.MAX_THREADS = int(self.config['threading']['max_threads'])
     # starts empty; nothing in this constructor fills it
     self.dataset_popularity = dict()
Exemplo n.º 5
0
 def __init__(self, config=None):
     """
     Set up the logger, the services, the managers and the ranker

     config -- settings mapping handed unchanged to every service, manager
               and the Ranker. It must provide
               config['rocker_board']['max_gb'], converted with int().
               A new empty dict is used when config is omitted, in which
               case that lookup fails exactly as it did with the old empty
               default.
     """
     # A dict() default is built once, when the function is defined, and
     # would be shared by every instance created without a config
     if config is None:
         config = dict()
     self.logger = logging.getLogger(__name__)
     self.config = config
     # every collaborator is constructed from the same config object
     self.phedex = PhEDExService(self.config)
     self.mit_db = MITDBService(self.config)
     self.datasets = DatasetManager(self.config)
     self.sites = SiteManager(self.config)
     self.popularity = PopularityManager(self.config)
     self.storage = StorageManager(self.config)
     self.rankings = Ranker(self.config)
     self.max_gb = int(self.config['rocker_board']['max_gb'])
     # starts empty; nothing in this constructor fills it
     self.csv_data = list()
Exemplo n.º 6
0
class Initiate(object):
    """
    Initiate Database

    Runs initiate_db() on the site, dataset and popularity managers and logs
    how long the whole run took.
    """
    def __init__(self, config=None):
        """
        Set up the logger and the managers

        config -- settings mapping handed unchanged to every manager; a new
                  empty dict is used when it is omitted
        """
        # A dict() default is built once, when the function is defined, and
        # would be shared by every instance created without a config
        if config is None:
            config = dict()
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.popularity = PopularityManager(self.config)

    def start(self):
        """
        Begin Initiating Database

        Calls initiate_db() on sites, datasets and popularity, in that order,
        then logs the elapsed time at INFO level.
        """
        started = datetime.datetime.utcnow()
        self.sites.initiate_db()
        self.datasets.initiate_db()
        self.popularity.initiate_db()
        elapsed = datetime.datetime.utcnow() - started
        self.logger.info('Initiate took %s', str(elapsed))
Exemplo n.º 7
0
class UpdateDB(object):
    """
    Update DB with new dataset and site data

    Runs update_db() on the site, dataset and popularity managers and logs
    how long the whole run took.
    """
    def __init__(self, config=None):
        """
        Set up the logger and the managers

        config -- settings mapping handed unchanged to every manager; a new
                  empty dict is used when it is omitted
        """
        # A dict() default is built once, when the function is defined, and
        # would be shared by every instance created without a config
        if config is None:
            config = dict()
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.storage = StorageManager(self.config)
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.popularity = PopularityManager(self.config)

    def start(self):
        """
        Begin Database Update

        Calls update_db() on sites, datasets and popularity, in that order,
        then logs the elapsed time at INFO level.
        """
        started = datetime.datetime.utcnow()
        self.sites.update_db()
        self.datasets.update_db()
        self.popularity.update_db()
        elapsed = datetime.datetime.utcnow() - started
        self.logger.info('Update DB took %s', str(elapsed))
Exemplo n.º 8
0
 def __init__(self, config=None):
     """
     Set up the logger, the managers and the empty model state

     config -- settings mapping handed unchanged to every manager. It must
               provide config['rocker_board']['max_replicas'] (converted
               with int()), config['paths']['data'] and
               config['tools']['valid_tiers'] (a comma separated string).
               A new empty dict is used when config is omitted, in which
               case those lookups fail exactly as they did with the old
               empty default.
     """
     # A dict() default is built once, when the function is defined, and
     # would be shared by every instance created without a config
     if config is None:
         config = dict()
     self.logger = logging.getLogger(__name__)
     self.config = config
     # every manager is constructed from the same config object
     self.sites = SiteManager(self.config)
     self.datasets = DatasetManager(self.config)
     self.popularity = PopularityManager(self.config)
     self.storage = StorageManager(self.config)
     self.max_replicas = int(self.config['rocker_board']['max_replicas'])
     self.name = 'generic'
     self.data_path = self.config['paths']['data']
     # split on ',' only; surrounding whitespace is not stripped
     self.data_tiers = self.config['tools']['valid_tiers'].split(',')
     # the three dicts below start empty; nothing in this constructor fills them
     self.preprocessed_data = dict()
     self.clf_trend = dict()
     self.clf_avg = dict()
Exemplo n.º 9
0
class UpdateDB(object):
    """
    Update DB with new dataset and site data

    Runs update_db() on the site, dataset and popularity managers and logs
    how long the whole run took.
    """

    def __init__(self, config=None):
        """
        Set up the logger and the managers

        config -- settings mapping handed unchanged to every manager; a new
                  empty dict is used when it is omitted
        """
        # A dict() default is built once, when the function is defined, and
        # would be shared by every instance created without a config
        if config is None:
            config = dict()
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.storage = StorageManager(self.config)
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.popularity = PopularityManager(self.config)

    def start(self):
        """
        Begin Database Update

        Calls update_db() on sites, datasets and popularity, in that order,
        then logs the elapsed time at INFO level.
        """
        started = datetime.datetime.utcnow()
        self.sites.update_db()
        self.datasets.update_db()
        self.popularity.update_db()
        elapsed = datetime.datetime.utcnow() - started
        self.logger.info("Update DB took %s", str(elapsed))
Exemplo n.º 10
0
class Ranker(object):
    """
    Generic Ranking class

    Computes dataset and site rankings and upserts them into the
    'dataset_rankings' and 'site_rankings' collections through the storage
    manager.
    """
    def __init__(self, config=None):
        """
        Set up the logger, the managers and the ranking limits

        config -- settings mapping handed unchanged to every manager. It must
                  provide config['rocker_board']['max_replicas'] and
                  config['threading']['max_threads'], both converted with
                  int(). A new empty dict is used when config is omitted, in
                  which case those lookups fail exactly as they did with the
                  old empty default.
        """
        # A dict() default is built once, when the function is defined, and
        # would be shared by every instance created without a config
        if config is None:
            config = dict()
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.popularity = PopularityManager(self.config)
        self.storage = StorageManager(self.config)
        self.max_replicas = int(self.config['rocker_board']['max_replicas'])
        self.MAX_THREADS = int(self.config['threading']['max_threads'])
        # dataset name -> popularity, filled by the worker threads
        self.dataset_popularity = dict()

    def get_dataset_rankings(self, date=None):
        """
        Generate dataset rankings

        date -- day to rank for; when omitted, datetime_day() of the current
                UTC time is used, evaluated on every call
        Returns a dict mapping dataset name to its integer rank.
        """
        # Resolved here rather than in the signature: a default expression
        # runs only once, when the class is defined, and would go stale
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        self.dataset_popularity = dict()
        dataset_names = self.datasets.get_db_datasets()
        q = Queue.Queue()
        for _ in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.get_dataset_popularity,
                                      args=(q, ))
            worker.daemon = True
            worker.start()
        for dataset_name in dataset_names:
            q.put((dataset_name, date))
        q.join()
        # One stop marker per worker so the threads end instead of blocking
        # on the queue forever. Queued after join() so that nothing waits on
        # the markers themselves.
        for _ in range(self.MAX_THREADS):
            q.put(None)
        return self.normalize_popularity(date)

    def get_site_rankings(self, date=None):
        """
        Generate site rankings

        date -- day to rank for; when omitted, datetime_day() of the current
                UTC time is used, evaluated on every call
        Returns a dict mapping site name to rank and upserts one document per
        site into the 'site_rankings' collection.
        """
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        # get all sites which can be replicated to
        site_names = self.sites.get_available_sites()
        site_rankings = dict()
        coll = 'site_rankings'
        for site_name in site_names:
            popularity = self.get_site_popularity(site_name, date)
            # get cpu and storage (performance)
            performance = self.sites.get_performance(site_name)
            available_storage_tb = self.sites.get_available_storage(
                site_name) / 10**3
            # only whether any storage is left enters the rank: 1 or 0
            if available_storage_tb <= 0:
                storage_factor = 0
            else:
                storage_factor = 1
            try:
                rank = (performance * storage_factor) / popularity
            except (ZeroDivisionError, TypeError):
                # zero popularity or a non-numeric operand ranks the site 0
                rank = 0.0
            site_rankings[site_name] = rank
            # insert into database
            query = {'name': site_name, 'date': date}
            data = {
                '$set': {
                    'name': site_name,
                    'date': date,
                    'rank': rank,
                    'popularity': popularity
                }
            }
            self.storage.update_data(coll=coll,
                                     query=query,
                                     data=data,
                                     upsert=True)
        return site_rankings

    def get_dataset_popularity(self, q):
        """
        Get the estimated popularity for dataset

        Worker loop: takes (dataset_name, date) tuples from q and stores the
        average popularity in self.dataset_popularity. A None item ends the
        loop.
        """
        while True:
            data = q.get()
            try:
                if data is None:
                    # stop marker queued by get_dataset_rankings
                    return
                dataset_name = data[0]
                date = data[1]
                self.dataset_popularity[dataset_name] = \
                    self.popularity.get_average_popularity(dataset_name, date)
            except Exception:
                # keep the worker alive; the failed dataset is left out of
                # self.dataset_popularity
                self.logger.exception('Failed to get popularity for %s',
                                      data)
            finally:
                # always acknowledge the item, otherwise q.join() never
                # returns after a failure
                q.task_done()

    def get_site_popularity(self, site_name, date=None):
        """
        Get popularity for site

        site_name -- site whose replicas are looked up
        date      -- day to look up; when omitted, datetime_day() of the
                     current UTC time is used, evaluated on every call
        Returns the summed 'popularity' of the 'dataset_rankings' documents
        for that date whose dataset has a replica at the site, or 0.0 when
        the query gives no usable result.
        """
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        # names of all datasets with a replica at the site
        coll = 'dataset_data'
        pipeline = [
            {'$match': {'replicas': site_name}},
            {'$project': {'name': 1, '_id': 0}},
        ]
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        dataset_names = [dataset_data['name'] for dataset_data in data]
        # sum the popularity those datasets were given for this date
        coll = 'dataset_rankings'
        pipeline = [
            {'$match': {'date': date}},
            {'$match': {'name': {'$in': dataset_names}}},
            {
                '$group': {
                    '_id': '$date',
                    'total_popularity': {
                        '$sum': '$popularity'
                    }
                }
            },
            {'$project': {'total_popularity': 1, '_id': 0}},
        ]
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            popularity = data[0]['total_popularity']
        except (IndexError, KeyError, TypeError):
            # no matching documents, or a result that cannot be indexed
            popularity = 0.0
        return popularity

    def get_site_storage_rankings(self, subscriptions):
        """
        Return the amount over the soft limit sites are including new
        subscriptions

        subscriptions -- iterable of sequences where item 0 is a dataset name
                         and item 1 is a site name
        Sites whose total is below 0 are left out of the returned dict.
        Raises KeyError when a subscription names a site that is not among
        the available sites.
        """
        site_rankings = dict()
        for site_name in self.sites.get_available_sites():
            site_rankings[site_name] = self.sites.get_over_soft_limit(
                site_name)
        for subscription in subscriptions:
            site_rankings[subscription[1]] += self.datasets.get_size(
                subscription[0])
        # iterate over a copy of the keys because entries are deleted
        for site_name in list(site_rankings):
            if site_rankings[site_name] < 0:
                del site_rankings[site_name]
        return site_rankings

    def normalize_popularity(self, date):
        """
        Normalize popularity values to be between 1 and max_replicas

        The least popular dataset gets rank 1, the most popular one gets
        max_replicas, and the rest are scaled linearly in between and
        truncated with int(). When all popularities are equal every dataset
        gets rank 1. One document per dataset is upserted into the
        'dataset_rankings' collection.

        date -- day stored with every ranking document
        Returns a dict mapping dataset name to rank; empty when there is no
        popularity data.
        """
        dataset_rankings = dict()
        if not self.dataset_popularity:
            # max()/min() of an empty sequence would raise ValueError
            return dataset_rankings
        max_pop = max(self.dataset_popularity.values())
        min_pop = min(self.dataset_popularity.values())
        spread = float(max_pop - min_pop)
        coll = 'dataset_rankings'
        for dataset_name, popularity in self.dataset_popularity.items():
            if spread:
                # (popularity - min_pop) / spread is 0.0 for the minimum and
                # exactly 1.0 for the maximum
                share = (popularity - min_pop) / spread
                rank = int(1 + (self.max_replicas - 1) * share)
            else:
                rank = 1
            dataset_rankings[dataset_name] = rank
            query = {'name': dataset_name, 'date': date}
            data = {
                '$set': {
                    'name': dataset_name,
                    'date': date,
                    'rank': rank,
                    'popularity': popularity
                }
            }
            self.storage.update_data(coll=coll,
                                     query=query,
                                     data=data,
                                     upsert=True)
        return dataset_rankings
Exemplo n.º 11
0
class Ranker(object):
    """
    Generic Ranking class

    Computes dataset and site rankings and upserts them into the
    'dataset_rankings' and 'site_rankings' collections through the storage
    manager.
    """
    def __init__(self, config=None):
        """
        Set up the logger, the managers and the ranking limits

        config -- settings mapping handed unchanged to every manager. It must
                  provide config['rocker_board']['max_replicas'] and
                  config['threading']['max_threads'], both converted with
                  int(). A new empty dict is used when config is omitted, in
                  which case those lookups fail exactly as they did with the
                  old empty default.
        """
        # A dict() default is built once, when the function is defined, and
        # would be shared by every instance created without a config
        if config is None:
            config = dict()
        self.logger = logging.getLogger(__name__)
        self.config = config
        self.sites = SiteManager(self.config)
        self.datasets = DatasetManager(self.config)
        self.popularity = PopularityManager(self.config)
        self.storage = StorageManager(self.config)
        self.max_replicas = int(self.config['rocker_board']['max_replicas'])
        self.MAX_THREADS = int(self.config['threading']['max_threads'])
        # dataset name -> popularity, filled by the worker threads
        self.dataset_popularity = dict()

    def get_dataset_rankings(self, date=None):
        """
        Generate dataset rankings

        date -- day to rank for; when omitted, datetime_day() of the current
                UTC time is used, evaluated on every call
        Returns a dict mapping dataset name to its integer rank.
        """
        # Resolved here rather than in the signature: a default expression
        # runs only once, when the class is defined, and would go stale
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        self.dataset_popularity = dict()
        dataset_names = self.datasets.get_db_datasets()
        q = Queue.Queue()
        for _ in range(self.MAX_THREADS):
            worker = threading.Thread(target=self.get_dataset_popularity, args=(q,))
            worker.daemon = True
            worker.start()
        for dataset_name in dataset_names:
            q.put((dataset_name, date))
        q.join()
        # One stop marker per worker so the threads end instead of blocking
        # on the queue forever. Queued after join() so that nothing waits on
        # the markers themselves.
        for _ in range(self.MAX_THREADS):
            q.put(None)
        return self.normalize_popularity(date)

    def get_site_rankings(self, date=None):
        """
        Generate site rankings

        date -- day to rank for; when omitted, datetime_day() of the current
                UTC time is used, evaluated on every call
        Returns a dict mapping site name to rank and upserts one document per
        site into the 'site_rankings' collection.
        """
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        # get all sites which can be replicated to
        site_names = self.sites.get_available_sites()
        site_rankings = dict()
        coll = 'site_rankings'
        for site_name in site_names:
            popularity = self.get_site_popularity(site_name, date)
            # get cpu and storage (performance)
            performance = self.sites.get_performance(site_name)
            available_storage_tb = self.sites.get_available_storage(site_name)/10**3
            # only whether any storage is left enters the rank: 1 or 0
            if available_storage_tb <= 0:
                storage_factor = 0
            else:
                storage_factor = 1
            try:
                rank = (performance*storage_factor)/popularity
            except (ZeroDivisionError, TypeError):
                # zero popularity or a non-numeric operand ranks the site 0
                rank = 0.0
            site_rankings[site_name] = rank
            # insert into database
            query = {'name':site_name, 'date':date}
            data = {'$set':{'name':site_name, 'date':date, 'rank':rank, 'popularity':popularity}}
            self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
        return site_rankings

    def get_dataset_popularity(self, q):
        """
        Get the estimated popularity for dataset

        Worker loop: takes (dataset_name, date) tuples from q and stores the
        average popularity in self.dataset_popularity. A None item ends the
        loop.
        """
        while True:
            data = q.get()
            try:
                if data is None:
                    # stop marker queued by get_dataset_rankings
                    return
                dataset_name = data[0]
                date = data[1]
                popularity = self.popularity.get_average_popularity(dataset_name, date)
                self.dataset_popularity[dataset_name] = popularity
            except Exception:
                # keep the worker alive; the failed dataset is left out of
                # self.dataset_popularity
                self.logger.exception('Failed to get popularity for %s', data)
            finally:
                # always acknowledge the item, otherwise q.join() never
                # returns after a failure
                q.task_done()

    def get_site_popularity(self, site_name, date=None):
        """
        Get popularity for site

        site_name -- site whose replicas are looked up
        date      -- day to look up; when omitted, datetime_day() of the
                     current UTC time is used, evaluated on every call
        Returns the summed 'popularity' of the 'dataset_rankings' documents
        for that date whose dataset has a replica at the site, or 0.0 when
        the query gives no usable result.
        """
        if date is None:
            date = datetime_day(datetime.datetime.utcnow())
        # names of all datasets with a replica at the site
        coll = 'dataset_data'
        pipeline = [
            {'$match':{'replicas':site_name}},
            {'$project':{'name':1, '_id':0}},
        ]
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        dataset_names = [dataset_data['name'] for dataset_data in data]
        # sum the popularity those datasets were given for this date
        coll = 'dataset_rankings'
        pipeline = [
            {'$match':{'date':date}},
            {'$match':{'name':{'$in':dataset_names}}},
            {'$group':{'_id':'$date', 'total_popularity':{'$sum':'$popularity'}}},
            {'$project':{'total_popularity':1, '_id':0}},
        ]
        data = self.storage.get_data(coll=coll, pipeline=pipeline)
        try:
            popularity = data[0]['total_popularity']
        except (IndexError, KeyError, TypeError):
            # no matching documents, or a result that cannot be indexed
            popularity = 0.0
        return popularity

    def get_site_storage_rankings(self, subscriptions):
        """
        Return the amount over the soft limit sites are including new
        subscriptions

        subscriptions -- iterable of sequences where item 0 is a dataset name
                         and item 1 is a site name
        Sites whose total is below 0 are left out of the returned dict.
        Raises KeyError when a subscription names a site that is not among
        the available sites.
        """
        site_rankings = dict()
        for site_name in self.sites.get_available_sites():
            site_rankings[site_name] = self.sites.get_over_soft_limit(site_name)
        for subscription in subscriptions:
            site_rankings[subscription[1]] += self.datasets.get_size(subscription[0])
        # iterate over a copy of the keys because entries are deleted
        for site_name in list(site_rankings):
            if site_rankings[site_name] < 0:
                del site_rankings[site_name]
        return site_rankings

    def normalize_popularity(self, date):
        """
        Normalize popularity values to be between 1 and max_replicas

        The least popular dataset gets rank 1, the most popular one gets
        max_replicas, and the rest are scaled linearly in between and
        truncated with int(). When all popularities are equal every dataset
        gets rank 1. One document per dataset is upserted into the
        'dataset_rankings' collection.

        date -- day stored with every ranking document
        Returns a dict mapping dataset name to rank; empty when there is no
        popularity data.
        """
        dataset_rankings = dict()
        if not self.dataset_popularity:
            # max()/min() of an empty sequence would raise ValueError
            return dataset_rankings
        max_pop = max(self.dataset_popularity.values())
        min_pop = min(self.dataset_popularity.values())
        spread = float(max_pop - min_pop)
        coll = 'dataset_rankings'
        for dataset_name, popularity in self.dataset_popularity.items():
            if spread:
                # (popularity - min_pop)/spread is 0.0 for the minimum and
                # exactly 1.0 for the maximum
                share = (popularity - min_pop)/spread
                rank = int(1 + (self.max_replicas - 1)*share)
            else:
                rank = 1
            dataset_rankings[dataset_name] = rank
            query = {'name':dataset_name, 'date':date}
            data = {'$set':{'name':dataset_name, 'date':date, 'rank':rank, 'popularity':popularity}}
            self.storage.update_data(coll=coll, query=query, data=data, upsert=True)
        return dataset_rankings