Example #1
def __init__(self):
    self.logger = logging.getLogger('logger')
    self.database = Database()
    # Cache for DBS filesummaries calls
    self.dataset_filesummaries_cache = {}
    # Cache for DBS dataset info + filesummaries calls
    self.dataset_info_cache = {}
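These two dictionaries act as simple per-run memoization for DBS lookups; a minimal sketch of that pattern, using a stand-in fetch function rather than the project's DBS client:

def cached_filesummaries(cache, dataset_name, fetch_from_dbs):
    # Return the cached summary when available, otherwise fetch once and remember it
    if dataset_name not in cache:
        cache[dataset_name] = fetch_from_dbs(dataset_name)

    return cache[dataset_name]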
Example #2
def main():
    setup_console_logging()
    logger = logging.getLogger('logger')
    parser = argparse.ArgumentParser(description='Stats2 update')
    parser.add_argument('--action',
                        choices=['update', 'see', 'drop'],
                        required=True,
                        help='Action to be performed.')
    parser.add_argument('--name',
                        required=False,
                        help='Request to be updated.')
    args = vars(parser.parse_args())
    logger.info('Arguments %s', str(args))

    action = args.get('action', None)
    name = args.get('name', None)

    if action == 'update':
        stats_update = StatsUpdate()
        stats_update.perform_update(name)
    elif action == 'see':
        request = Database().get_request(name)
        print(json.dumps(request, indent=4))
    elif action == 'drop':
        Database().clear_database()
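A minimal entry-point sketch for running the snippet above (the file name in the comment is a placeholder):

if __name__ == '__main__':
    # e.g. python3 stats_update.py --action see --name <request_name>
    main()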
Example #3
File: main.py Project: andrius-k/Stats2
def get_nice_json(request_name):
    database = Database()
    request = database.get_request(request_name)
    if request is None:
        response = make_response("{}", 404)
    else:
        response = make_response(json.dumps(request, indent=4, sort_keys=True), 200)

    response.headers['Content-Type'] = 'application/json'
    return response
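get_nice_json relies on Flask's make_response; below is a self-contained sketch of the same JSON-response pattern (the app object and URL rule are assumptions, not taken from the project):

from flask import Flask, make_response
import json

app = Flask(__name__)

@app.route('/example/<name>')  # URL rule is hypothetical
def example_json(name):
    data = {'name': name}  # stand-in for a Database().get_request(name) lookup
    response = make_response(json.dumps(data, indent=4, sort_keys=True), 200)
    response.headers['Content-Type'] = 'application/json'
    return response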
Example #4
File: main.py Project: andrius-k/Stats2
def get_one(request_name):
    database = Database()
    request = database.get_request(request_name)
    if request is None:
        response = make_response("{}", 404)
    else:
        response = make_response(json.dumps(request), 200)

    response.headers['Content-Type'] = 'application/json'
    return response
Example #5
def html_search():
    """
    Perform search on given input and redirect to correct search URL
    """
    query = request.args.get('q', '').strip()
    if not query:
        return redirect('/stats', code=302)

    database = Database()
    if database.get_workflows_with_prepid(query, page_size=1):
        return redirect('/stats?prepid=' + query, code=302)

    if database.get_workflows_with_output_dataset(query, page_size=1):
        return redirect('/stats?output_dataset=' + query, code=302)

    if database.get_workflows_with_input_dataset(query, page_size=1):
        return redirect('/stats?input_dataset=' + query, code=302)

    if database.get_workflows_with_campaign(query, page_size=1):
        return redirect('/stats?campaign=' + query, code=302)

    if database.get_workflows_with_type(query, page_size=1):
        return redirect('/stats?type=' + query, code=302)

    if database.get_workflows_with_processing_string(query, page_size=1):
        return redirect('/stats?processing_string=' + query, code=302)

    if database.get_workflows_with_request(query, page_size=1):
        return redirect('/stats?request=' + query, code=302)

    return redirect('/stats?workflow_name=' + query, code=302)
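The same fall-through could also be written data-driven; a hedged alternative sketch using the lookup methods from the snippet above (not the project's code, behaviour intended to be identical):

from flask import redirect

def html_search_alternative(database, query):
    lookups = [('prepid', database.get_workflows_with_prepid),
               ('output_dataset', database.get_workflows_with_output_dataset),
               ('input_dataset', database.get_workflows_with_input_dataset),
               ('campaign', database.get_workflows_with_campaign),
               ('type', database.get_workflows_with_type),
               ('processing_string', database.get_workflows_with_processing_string),
               ('request', database.get_workflows_with_request)]
    for attribute, lookup in lookups:
        if lookup(query, page_size=1):
            return redirect('/stats?%s=%s' % (attribute, query), code=302)

    return redirect('/stats?workflow_name=' + query, code=302)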
Example #6
def html_view_json(workflow_name):
    """
    Return one workflow
    """
    database = Database()
    workflow = database.get_workflow(workflow_name)
    if workflow is None:
        response = make_response("{}", 404)
    else:
        response = make_response(
            json.dumps(workflow, indent=2, sort_keys=True), 200)

    response.headers['Content-Type'] = 'application/json'
    return response
Example #7
def main():
    """
    Main function that parses arguments and starts the update
    """
    setup_console_logging()
    logger = logging.getLogger('logger')
    parser = argparse.ArgumentParser(description='Stats2 update')
    parser.add_argument('--action',
                        choices=['update', 'see'],
                        required=True,
                        help='Action to be performed.')
    parser.add_argument('--name',
                        required=False,
                        help='Workflow to be updated.')
    parser.add_argument('--trigger-prod',
                        required=False,
                        action='store_true',
                        help='Trigger production McM to update')
    parser.add_argument('--trigger-dev',
                        required=False,
                        action='store_true',
                        help='Trigger development McM to update')
    args = vars(parser.parse_args())
    logger.info('Arguments %s', str(args))

    action = args.get('action', None)
    name = args.get('name', None)
    trigger_prod = args.get('trigger_prod', False)
    trigger_dev = args.get('trigger_dev', False)

    if action == 'update':
        if not os.environ.get('STATS_DB_AUTH_HEADER'):
            logger.error('STATS_DB_AUTH_HEADER is missing')
            return

        stats_update = StatsUpdate()
        stats_update.perform_update(name, trigger_prod, trigger_dev)
    elif action == 'see':
        workflow = Database().get_workflow(name)
        print(json.dumps(workflow, indent=4))
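For reference, a hedged sketch of invoking this main() programmatically with the required auth header present (the file name and header value are placeholders):

import os
import sys

os.environ.setdefault('STATS_DB_AUTH_HEADER', 'Basic <base64-credentials>')  # placeholder value
sys.argv = ['stats_update.py', '--action', 'update', '--trigger-prod']
main()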
Example #8
File: main.py Project: andrius-k/Stats2
def index(page=0):
    database = Database()
    prepid = request.args.get('prepid')
    dataset = request.args.get('dataset')
    campaign = request.args.get('campaign')
    request_type = request.args.get('type')
    request_name = request.args.get('request_name')
    check = request.args.get('check')
    if page < 0:
        page = 0

    if request_name is not None:
        req = database.get_request(request_name)
        if req is not None:
            requests = [req]
        else:
            requests = []

    else:
        if prepid is not None:
            requests = database.get_requests_with_prepid(prepid, page=page, include_docs=True)
        elif dataset is not None:
            requests = database.get_requests_with_dataset(dataset, page=page, include_docs=True)
        elif campaign is not None:
            requests = database.get_requests_with_campaign(campaign, page=page, include_docs=True)
        elif request_type is not None:
            requests = database.get_requests_with_type(request_type, page=page, include_docs=True)
        else:
            requests = database.get_requests(page=page, include_docs=True)

    if check is not None:
        check_with_old_stats(requests)

    # pages holds [current page number, has previous page, has next page]
    pages = [page, page > 0, database.PAGE_SIZE == len(requests)]
    requests = list(filter(lambda req: '_design' not in req['_id'], requests))
    for req in requests:
        req['DonePercent'] = '0.00'
        req['OpenPercent'] = '0.00'
        req['LastDatasetType'] = 'NONE'
        req['LastDataset'] = ''
        req['DoneEvents'] = '0'
        req['LastUpdate'] = time.strftime('%Y&#8209;%m&#8209;%d&nbsp;%H:%M:%S', time.localtime(req['LastUpdate']))

        if len(req['OutputDatasets']) == 0:
            continue

        if len(req['EventNumberHistory']) == 0:
            continue

        last_dataset = req['OutputDatasets'][-1]
        last_history = req['EventNumberHistory'][-1]
        if last_dataset not in last_history['Datasets']:
            continue

        calculated_dataset = last_history['Datasets'][last_dataset]
        dataset_type = calculated_dataset['Type']
        req['LastDatasetType'] = dataset_type
        req['LastDataset'] = last_dataset
        done_events = calculated_dataset['Events']
        req['DoneEvents'] = done_events
        if 'TotalEvents' not in req:
            continue

        if req['TotalEvents'] > 0:
            total_events = req['TotalEvents']
            req['DonePercent'] = '%.2f' % (done_events / total_events * 100.0)

    return render_template('index.html',
                           requests=requests,
                           total_requests=database.get_request_count(),
                           pages=pages,
                           query=request.query_string.decode('utf-8'))
Example #9
def get_page(page=0):
    """
    Return a list of workflows based on url query parameters (if any)
    """
    database = Database()
    prepid = request.args.get('prepid')
    output_dataset = request.args.get('output_dataset')
    input_dataset = request.args.get('input_dataset')
    campaign = request.args.get('campaign')
    workflow_type = request.args.get('type')
    workflow_name = request.args.get('workflow_name')
    processing_string = request.args.get('processing_string')
    request_name = request.args.get('request')
    if page < 0:
        page = 0

    if workflow_name is not None:
        req = database.get_workflow(workflow_name)
        if req is not None:
            workflows = [req]
        else:
            workflows = []

    else:
        if prepid is not None:
            workflows = database.get_workflows_with_prepid(prepid,
                                                           page=page,
                                                           include_docs=True)
        elif output_dataset is not None:
            workflows = database.get_workflows_with_output_dataset(
                output_dataset, page=page, include_docs=True)
        elif input_dataset is not None:
            workflows = database.get_workflows_with_input_dataset(
                input_dataset, page=page, include_docs=True)
        elif campaign is not None:
            workflows = database.get_workflows_with_campaign(campaign,
                                                             page=page,
                                                             include_docs=True)
        elif workflow_type is not None:
            workflows = database.get_workflows_with_type(workflow_type,
                                                         page=page,
                                                         include_docs=True)
        elif processing_string is not None:
            workflows = database.get_workflows_with_processing_string(
                processing_string, page=page, include_docs=True)
        elif request_name is not None:
            workflows = database.get_workflows_with_request(request_name,
                                                            page=page,
                                                            include_docs=True)
        else:
            workflows = database.get_workflows(page=page, include_docs=True)

    if prepid is not None or output_dataset is not None or input_dataset is not None or request_name is not None:
        workflows = sorted(
            workflows,
            key=lambda wf: '_'.join(wf.get('RequestName').split('_')[-3:-1]))

    return workflows
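The sort key above joins the third- and second-to-last underscore-separated tokens of RequestName, which for typical ReqMgr workflow names are the submission date and time, so the page is ordered roughly chronologically. A small illustration with a made-up name:

name = 'pdmvserv_task_EXO-RunIISummer20UL16-00001__v1_T_210723_162148_3080'
print('_'.join(name.split('_')[-3:-1]))  # prints: 210723_162148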
Example #10
def html_get(page=0):
    """
    Return HTML of selected page
    This method also prettifies some dates, makes campaigns and requests lists unique,
    and calculates completeness of output datasets
    """
    database = Database()
    workflows = get_page(page)
    # pages holds [current page number, has previous page, has next page]
    pages = [page, page > 0, database.PAGE_SIZE == len(workflows)]
    workflows = list(filter(lambda req: '_design' not in req['_id'],
                            workflows))
    datetime_format = '%Y&#8209;%m&#8209;%d&nbsp;%H:%M:%S'
    now = int(time.time())
    for req in workflows:
        if '_design' in req['_id']:
            continue

        req['FirstStatus'] = ''
        req['LastStatus'] = ''
        if req.get('RequestTransition', []):
            first_transition = req['RequestTransition'][0]
            last_transition = req['RequestTransition'][-1]
            if 'Status' in first_transition and 'UpdateTime' in first_transition:
                status = first_transition['Status']
                update_time = time.strftime(
                    datetime_format,
                    time.localtime(first_transition['UpdateTime']))
                req['FirstStatus'] = status
                req['FirstStatusTime'] = update_time
                req['FirstStatusAgo'] = get_time_diff(
                    first_transition['UpdateTime'], now)

            if 'Status' in last_transition and 'UpdateTime' in last_transition:
                status = last_transition['Status']
                update_time = time.strftime(
                    datetime_format,
                    time.localtime(last_transition['UpdateTime']))
                req['LastStatus'] = status
                req['LastStatusTime'] = update_time
                req['LastStatusAgo'] = get_time_diff(
                    last_transition['UpdateTime'], now)

        req['LastUpdateAgo'] = get_time_diff(req['LastUpdate'], now)
        req['LastUpdate'] = time.strftime(datetime_format,
                                          time.localtime(req['LastUpdate']))
        req['Requests'] = get_unique_list(req.get('Requests', []))
        req['Campaigns'] = get_unique_list(req.get('Campaigns', []))
        service_type, service_name = get_service_type_and_name(req)
        # Links to external pages - McM, ReReco, RelVal, pMp
        attribute = 'request'
        if len(req['Requests']) == 0 and req.get('PrepID'):
            attribute = 'prepid'
            req['Requests'] = [req['PrepID']]

        req['Campaigns'] = [{
            'name':
            x,
            'links':
            get_campaign_links(x, service_type, service_name)
        } for x in req['Campaigns']]
        req['Requests'] = [{
            'name':
            x,
            'attribute':
            attribute,
            'links':
            get_request_links(x, service_type, service_name)
        } for x in req['Requests']]

        calculated_datasets = []

        total_events = req.get('TotalEvents', 0)
        for dataset in req['OutputDatasets']:
            new_dataset = {
                'Name': dataset,
                'Events': 0,
                'Type': 'NONE',
                'CompletedPerc': '0.0',
                'Datatier': dataset.split('/')[-1],
                'Size': -1,
                'NiceSize': '0B'
            }
            for history_entry in reversed(req['EventNumberHistory']):
                history_entry = history_entry['Datasets']
                if dataset in history_entry:
                    new_dataset['Events'] = comma_separate_thousands(
                        history_entry[dataset]['Events'])
                    new_dataset['Type'] = history_entry[dataset]['Type']
                    new_dataset['Size'] = history_entry[dataset].get(
                        'Size', -1)
                    new_dataset['NiceSize'] = get_nice_size(
                        new_dataset['Size'])
                    if total_events > 0:
                        percentage = history_entry[dataset][
                            'Events'] / total_events * 100.0
                        new_dataset['CompletedPerc'] = '%.2f' % (percentage)

                    break

            calculated_datasets.append(new_dataset)

        req['OutputDatasets'] = calculated_datasets
        if 'TotalEvents' in req:
            req['TotalEvents'] = comma_separate_thousands(
                int(req['TotalEvents']))

        if 'RequestPriority' in req:
            req['RequestPriority'] = comma_separate_thousands(
                int(req['RequestPriority']))

    last_stats_update = database.get_setting('last_dbs_update_date', 0)
    last_stats_update = time.strftime(datetime_format,
                                      time.localtime(last_stats_update))

    return render_template('index.html',
                           last_stats_update=last_stats_update,
                           workflows=workflows,
                           total_workflows=database.get_workflow_count(),
                           pages=pages,
                           query=request.query_string.decode('utf-8'))
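get_unique_list is used above but not shown here; presumably it deduplicates while preserving order, along the lines of this sketch (an assumption, not the project's implementation):

def get_unique_list(items):
    # Keep only the first occurrence of each item, preserving order
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)

    return unique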
Example #11
class StatsUpdate():
    """
    Update workflows in Stats2 database.
    """

    def __init__(self):
        self.logger = logging.getLogger('logger')
        self.database = Database()
        # Cache for DBS filesummaries calls
        self.dataset_filesummaries_cache = {}
        # Cache for DBS dataset info + filesummaries calls
        self.dataset_info_cache = {}

    def perform_update(self, workflow_name=None, trigger_prod=False, trigger_dev=False):
        """
        Perform update for specific workflow if workflow name is given or for all changed
        workflows if no name is specified.
        """
        if workflow_name is not None:
            self.perform_update_one(workflow_name, trigger_prod, trigger_dev)
        else:
            self.perform_update_new(trigger_prod, trigger_dev)

        self.logger.info('Workflows after update %d', self.database.get_workflow_count())

    def perform_update_one(self, workflow_name, trigger_prod=False, trigger_dev=False):
        """
        Perform update for specific workflow: fetch new dictionary from RequestManager
        and update event recalculation
        """
        self.logger.info('Will update only one workflow: %s', workflow_name)
        self.update_one(workflow_name, trigger_prod, trigger_dev)
        self.recalculate_one(workflow_name)

    def perform_update_new(self, trigger_prod=False, trigger_dev=False):
        """
        Perform update for all workflows that changed since last update and recalculate
        events for files that changed since last update
        """
        update_start = time.time()
        changed_workflows, deleted_workflows, last_seq = self.get_list_of_changed_workflows()
        self.logger.info('Will delete %d workflows', len(deleted_workflows))
        for workflow_name in deleted_workflows:
            try:
                self.delete_one(workflow_name)
            except Exception as ex:
                self.logger.error('Exception while deleting %s:%s', workflow_name, str(ex))

        previously_crashed_workflows = self.get_list_of_previously_crashed_workflows()
        self.logger.info('Have %d workflows that crashed during last update',
                         len(previously_crashed_workflows))
        changed_workflows = set(changed_workflows).union(set(previously_crashed_workflows))
        self.logger.info('Will update %d workflows', len(changed_workflows))
        for index, workflow_name in enumerate(changed_workflows):
            try:
                self.logger.info('Will update %d/%d workflow', index + 1, len(changed_workflows))
                self.update_one(workflow_name, trigger_prod, trigger_dev)
                self.remove_from_list_of_crashed_workflows(workflow_name)
            except Exception as ex:
                self.add_to_list_of_crashed_workflows(workflow_name)
                self.logger.error('Exception while updating %s:%s\nTraceback:%s',
                                  workflow_name,
                                  str(ex),
                                  traceback.format_exc())

        update_end = time.time()
        self.logger.info('Finished updating workflows')
        self.logger.info('Will update event count')
        related_workflows = self.get_workflows_with_same_output(changed_workflows)
        self.logger.info('There are %s related workflows to %s changed workflows',
                         len(related_workflows),
                         len(changed_workflows))
        changed_datasets = self.get_list_of_workflows_with_changed_datasets()
        workflows_to_recalculate = set(changed_workflows).union(set(changed_datasets))
        workflows_to_recalculate.update(related_workflows)
        self.logger.info('Will update event count for %d workflows', len(workflows_to_recalculate))
        for index, workflow_name in enumerate(workflows_to_recalculate):
            try:
                self.logger.info('Will update event count for %d/%d',
                                 index + 1,
                                 len(workflows_to_recalculate))
                self.recalculate_one(workflow_name, trigger_prod, trigger_dev)
                self.remove_from_list_of_crashed_workflows(workflow_name)
            except Exception as ex:
                self.add_to_list_of_crashed_workflows(workflow_name)
                self.logger.error('Exception while updating event count %s:%s\nTraceback:%s',
                                  workflow_name,
                                  str(ex),
                                  traceback.format_exc())

        recalculation_end = time.time()
        self.database.set_setting('last_reqmgr_sequence', int(last_seq))
        self.database.set_setting('last_dbs_update_date', int(update_start))
        self.logger.info('Updated and deleted %d/%d workflows in %.3fs',
                         len(changed_workflows), len(deleted_workflows),
                         (update_end - update_start))
        self.logger.info('Updated event count for %d workflows in %.3fs',
                         len(workflows_to_recalculate),
                         (recalculation_end - update_end))

    def update_one(self, workflow_name, trigger_prod=False, trigger_dev=False):
        """
        Action to update one workflow's dictionary from RequestManager. If no such
        workflow exists in the database, a new one will be created.
        """
        self.logger.info('Updating %s', workflow_name)
        update_start = time.time()
        wf_dict = self.get_new_dict_from_reqmgr2(workflow_name)
        wf_dict_old = self.database.get_workflow(workflow_name)
        if wf_dict_old is None:
            wf_dict_old = {'_id': workflow_name}
            self.logger.info('Inserting %s', workflow_name)
            self.database.update_workflow(wf_dict_old)
            wf_dict_old = self.database.get_workflow(workflow_name)

        wf_dict['_rev'] = wf_dict_old['_rev']
        wf_dict['EventNumberHistory'] = wf_dict_old.get('EventNumberHistory', [])
        wf_dict['OutputDatasets'] = self.sort_datasets(wf_dict['OutputDatasets'])
        old_wf_dict_string = json.dumps(wf_dict_old, sort_keys=True)
        new_wf_dict_string = json.dumps(wf_dict, sort_keys=True)
        update_end = time.time()
        if old_wf_dict_string != new_wf_dict_string:
            self.database.update_workflow(wf_dict)
            self.logger.info('Updated %s in %.3fs', workflow_name, (update_end - update_start))
            self.trigger_outside(wf_dict, trigger_prod, trigger_dev)
        else:
            self.logger.info('Did not update %s because it did not change. Time: %.3fs',
                             workflow_name,
                             (update_end - update_start))

    def delete_one(self, workflow_name):
        """
        Action to delete one workflow from database.
        """
        self.logger.info('Deleting %s', workflow_name)
        self.database.delete_workflow(workflow_name)
        self.logger.info('Deleted %s', workflow_name)

    def recalculate_one(self, workflow_name, trigger_prod=False, trigger_dev=False):
        """
        Action to update event count for workflow.
        """
        recalc_start = time.time()
        self.logger.info('Updating event count for %s', workflow_name)
        workflow = self.database.get_workflow(workflow_name)
        if workflow is None:
            self.logger.warning('Will not update %s event count because it\'s no longer in database',
                                workflow_name)
            return

        history_entry = self.get_new_history_entry(workflow)
        added_history_entry = self.add_history_entry_to_workflow(workflow, history_entry)
        recalc_end = time.time()
        if added_history_entry:
            self.database.update_workflow(workflow)
            self.logger.info('Updated event count for %s in %.3fs',
                             workflow_name,
                             (recalc_end - recalc_start))
            self.trigger_outside(workflow, trigger_prod, trigger_dev)
        else:
            self.logger.info('Did not update event count for %s because it did not change. Time: %.3fs',
                             workflow_name,
                             (recalc_end - recalc_start))

    def get_new_dict_from_reqmgr2(self, workflow_name):
        """
        Get workflow dictionary from RequestManager.
        """
        url = f'/couchdb/reqmgr_workload_cache/{workflow_name}'
        wf_dict = make_cmsweb_request(url)
        expected_events = self.get_expected_events_with_dict(wf_dict)
        campaigns = self.get_campaigns_from_workflow(wf_dict)
        requests = self.get_requests_from_workflow(wf_dict)
        attributes = ['AcquisitionEra',
                      'CMSSWVersion',
                      'InputDataset',
                      'OutputDatasets',
                      'PrepID',
                      'ProcessingString',
                      'RequestName',
                      'RequestPriority',
                      'RequestTransition',
                      'RequestType',
                      'SizePerEvent',
                      'TimePerEvent']
        if 'Task1' in wf_dict and 'InputDataset' in wf_dict['Task1']:
            wf_dict['InputDataset'] = wf_dict['Task1']['InputDataset']
        elif 'Step1' in wf_dict and 'InputDataset' in wf_dict['Step1']:
            wf_dict['InputDataset'] = wf_dict['Step1']['InputDataset']

        if 'Task1' in wf_dict and 'ProcessingString' in wf_dict['Task1']:
            wf_dict['ProcessingString'] = wf_dict['Task1']['ProcessingString']
        elif 'Step1' in wf_dict and 'ProcessingString' in wf_dict['Step1']:
            wf_dict['ProcessingString'] = wf_dict['Step1']['ProcessingString']

        wf_dict = pick_attributes(wf_dict, attributes)
        wf_dict['RequestTransition'] = [{'Status': tr['Status'],
                                         'UpdateTime': tr['UpdateTime']} for tr in wf_dict.get('RequestTransition', [])]
        wf_dict['_id'] = workflow_name
        wf_dict['TotalEvents'] = expected_events
        wf_dict['Campaigns'] = campaigns
        wf_dict['Requests'] = requests
        wf_dict['OutputDatasets'] = self.sort_datasets(self.flat_list(wf_dict['OutputDatasets']))
        wf_dict['EventNumberHistory'] = []
        wf_dict['RequestPriority'] = int(wf_dict.get('RequestPriority', 0))
        if 'ProcessingString' in wf_dict and not isinstance(wf_dict['ProcessingString'], str):
            del wf_dict['ProcessingString']

        if 'PrepID' in wf_dict and wf_dict['PrepID'] is None:
            del wf_dict['PrepID']

        return wf_dict

    def flat_list(self, given_list):
        """
        Flatten a list of lists into a single flat list
        """
        new_list = []
        for element in given_list:
            if not isinstance(element, list):
                new_list.append(element)
            else:
                new_list += self.flat_list(element)

        return new_list

    def __get_filesummaries_from_dbs(self, dataset_name, dataset_access_type=None):
        """
        Get file summary from DBS for given dataset
        """
        query_url = f'/dbs/prod/global/DBSReader/filesummaries?dataset={dataset_name}'
        if dataset_access_type in ('PRODUCTION', 'VALID'):
            query_url += '&validFileOnly=1'

        filesummaries = make_cmsweb_prod_request(query_url)
        if filesummaries:
            return filesummaries[0]

        return {}

    def get_workflows_with_same_output(self, workflow_names):
        """
        Get list of workflow names that have the same output datasets as given workflows
        """
        datasets = set()
        for workflow_name in workflow_names:
            workflow = self.database.get_workflow(workflow_name)
            datasets.update(workflow.get('OutputDatasets', []))

        same_output_workflows = set()
        for dataset in datasets:
            dataset_workflows = self.database.get_workflows_with_output_dataset(dataset, page_size=1000)
            same_output_workflows.update(dataset_workflows)

        return same_output_workflows

    def get_event_count_from_dbs(self, dataset_name, dataset_access_type=None):
        """
        Get event count for specified dataset from DBS.
        """
        if dataset_name not in self.dataset_filesummaries_cache:
            file_summary = self.__get_filesummaries_from_dbs(dataset_name, dataset_access_type)
            self.dataset_filesummaries_cache[dataset_name] = file_summary
        else:
            file_summary = self.dataset_filesummaries_cache[dataset_name]

        num_event = int(file_summary.get('num_event', 0))
        return num_event

    def get_dataset_size_from_dbs(self, dataset_name):
        """
        Get size for specified dataset from DBS.
        """
        if dataset_name not in self.dataset_filesummaries_cache:
            file_summary = self.__get_filesummaries_from_dbs(dataset_name)
            self.dataset_filesummaries_cache[dataset_name] = file_summary
        else:
            file_summary = self.dataset_filesummaries_cache[dataset_name]

        file_size = int(file_summary.get('file_size', 0))
        return file_size

    def get_new_history_entry(self, wf_dict):
        """
        Form a new history entry dictionary for given workflow.
        """
        output_datasets = wf_dict.get('OutputDatasets')
        if not output_datasets:
            return None

        output_datasets_set = set(output_datasets)
        history_entry = {'Time': int(time.time()), 'Datasets': {}}
        dataset_list_url = '/dbs/prod/global/DBSReader/datasetlist'
        output_datasets_to_query = []
        for output_dataset in set(output_datasets):
            if output_dataset in self.dataset_info_cache:
                # Trying to find type, events and size in cache
                cache_entry = self.dataset_info_cache[output_dataset]
                self.logger.info('Found %s dataset info in cache. Type: %s, events: %s, size: %s',
                                 output_dataset,
                                 cache_entry['Type'],
                                 cache_entry['Events'],
                                 cache_entry['Size'])
                history_entry['Datasets'][output_dataset] = cache_entry
                output_datasets_set.remove(output_dataset)
            else:
                # Add dataset to list of datasets that are not in cache
                output_datasets_to_query.append(output_dataset)

        if output_datasets_to_query:
            # Get datasets that were not in cache
            dbs_dataset_list = make_cmsweb_prod_request(dataset_list_url,
                                                        {'dataset': output_datasets_to_query,
                                                         'detail': 1,
                                                         'dataset_access_type': '*'})
        else:
            self.logger.info('Not doing a request to %s because all datasets were in cache',
                             dataset_list_url)
            dbs_dataset_list = []

        for dbs_dataset in dbs_dataset_list:
            # Get events and size for newly queried datasets and add them to cache
            dataset_name = dbs_dataset['dataset']
            dataset_access_type = dbs_dataset['dataset_access_type']
            dataset_events = self.get_event_count_from_dbs(dataset_name, dataset_access_type)
            dataset_size = self.get_dataset_size_from_dbs(dataset_name)
            history_entry['Datasets'][dataset_name] = {'Type': dataset_access_type,
                                                       'Events': dataset_events,
                                                       'Size': dataset_size}
            # Put a copy to cache
            self.dataset_info_cache[dataset_name] = dict(history_entry['Datasets'][dataset_name])
            self.logger.info('Setting %s events, %s size and %s type for %s (%s)',
                             dataset_events,
                             dataset_size,
                             dataset_access_type,
                             dataset_name,
                             wf_dict.get('_id'))
            output_datasets_set.remove(dataset_name)

        for dataset_name in output_datasets_set:
            # Datasets that were neither in the cache nor in the query response get NONE type, 0 events and 0 size
            dataset_access_type = 'NONE'
            dataset_events = 0
            dataset_size = 0
            # Setting defaults
            history_entry['Datasets'][dataset_name] = {'Type': dataset_access_type,
                                                       'Events': dataset_events,
                                                       'Size': dataset_size}
            # Put a copy to cache
            self.dataset_info_cache[dataset_name] = dict(history_entry['Datasets'][dataset_name])
            self.logger.info('Setting %s events, %s size and %s type for %s (%s)',
                             dataset_events,
                             dataset_size,
                             dataset_access_type,
                             dataset_name,
                             wf_dict.get('_id'))

        if len(history_entry['Datasets']) != len(set(output_datasets)):
            self.logger.error('Wrong number of datasets for %s. '
                              'New history item - %s, '
                              'output datasets - %s, '
                              'returning None',
                              wf_dict['_id'],
                              len(history_entry['Datasets']),
                              len(output_datasets))
            return None

        return history_entry

    def add_history_entry_to_workflow(self, wf_dict, new_history_entry):
        """
        Add history entry to workflow if such entry does not exist.
        """
        if new_history_entry is None:
            return False

        if not new_history_entry.get('Datasets', []):
            # No datasets, no point in adding this entry
            return False

        new_dict_string = json.dumps(new_history_entry['Datasets'], sort_keys=True)
        history_entries = sorted(wf_dict['EventNumberHistory'],
                                 key=lambda entry: entry.get('Time', 0))
        if history_entries:
            last_dict_string = json.dumps(history_entries[-1]['Datasets'], sort_keys=True)
            if new_dict_string == last_dict_string:
                return False

        history_entries.append(new_history_entry)
        wf_dict['EventNumberHistory'] = history_entries
        # self.logger.info(json.dumps(history_entry, indent=2))
        return True

    def get_expected_events_with_dict(self, wf_dict):
        """
        Get number of expected events of a workflow.
        """
        if 'FilterEfficiency' in wf_dict:
            filter_eff = float(wf_dict['FilterEfficiency'])
        elif 'Task1' in wf_dict and 'FilterEfficiency' in wf_dict['Task1']:
            filter_eff = float(wf_dict['Task1']['FilterEfficiency'])
        elif 'Step1' in wf_dict and 'FilterEfficiency' in wf_dict['Step1']:
            filter_eff = float(wf_dict['Step1']['FilterEfficiency'])
        else:
            filter_eff = 1.

        wf_type = wf_dict.get('RequestType', '').lower()
        if wf_type != 'resubmission':
            if wf_dict.get('TotalInputFiles', 0) > 0:
                if 'TotalInputEvents' in wf_dict:
                    return int(filter_eff * wf_dict['TotalInputEvents'])

            if 'RequestNumEvents' in wf_dict and wf_dict['RequestNumEvents'] is not None:
                return int(wf_dict['RequestNumEvents'])

            if 'Task1' in wf_dict and 'RequestNumEvents' in wf_dict['Task1']:
                return int(wf_dict['Task1']['RequestNumEvents'])

            if 'Step1' in wf_dict and 'RequestNumEvents' in wf_dict['Step1']:
                return int(wf_dict['Step1']['RequestNumEvents'])

            if 'Task1' in wf_dict and 'InputDataset' in wf_dict['Task1']:
                return self.get_event_count_from_dbs(wf_dict['Task1']['InputDataset'])

            if 'Step1' in wf_dict and 'InputDataset' in wf_dict['Step1']:
                return self.get_event_count_from_dbs(wf_dict['Step1']['InputDataset'])

        else:
            prep_id = wf_dict['PrepID']
            url = f'/reqmgr2/data/request?mask=TotalInputEvents&mask=RequestType&prep_id={prep_id}'
            ret = make_cmsweb_request(url)
            ret = ret['result']
            if ret:
                ret = ret[0]
                for request_name in ret:
                    if ret[request_name]['RequestType'].lower() != 'resubmission' and ret[request_name]['TotalInputEvents'] is not None:
                        return int(filter_eff * ret[request_name]['TotalInputEvents'])

        self.logger.error('%s does not have total events!', wf_dict['_id'])
        return -1

    def get_campaigns_from_workflow(self, wf_dict):
        """
        Get list of campaigns or acquisition eras in tasks. If there are no tasks, the workflow's
        campaign or acquisition era will be used
        """
        task_number = 1
        # Prevent infinite loop
        max_tasks = 999
        campaigns = []
        # Check whether it's a TaskChain or a StepChain
        if 'StepChain' in wf_dict:
            task_format = 'Step%s'
        else:
            task_format = 'Task%s'

        while max_tasks > 0:
            max_tasks -= 1
            task_name = task_format % task_number
            if task_name not in wf_dict:
                break

            if wf_dict[task_name].get('Campaign'):
                campaigns.append(wf_dict[task_name]['Campaign'])
            elif wf_dict[task_name].get('AcquisitionEra'):
                campaigns.append(wf_dict[task_name]['AcquisitionEra'])

            task_number += 1

        if not campaigns:
            if wf_dict.get('Campaign'):
                campaigns.append(wf_dict['Campaign'])
            elif wf_dict.get('AcquisitionEra'):
                campaigns.append(wf_dict['AcquisitionEra'])

        return campaigns

    def get_requests_from_workflow(self, wf_dict):
        """
        Get list of request prepids
        """
        task_number = 1
        # Prevent infinite loop
        max_tasks = 999
        requests = []
        # Check whether it's a TaskChain or a StepChain
        if 'StepChain' in wf_dict:
            task_format = 'Step%s'
        else:
            task_format = 'Task%s'

        while max_tasks > 0:
            max_tasks -= 1
            task_name = task_format % task_number
            if task_name not in wf_dict:
                break

            if wf_dict[task_name].get('PrepID'):
                requests.append(wf_dict[task_name]['PrepID'])

            task_number += 1

        return requests

    def sort_datasets(self, dataset_list):
        """
        Sort dataset list by specific priority list.
        """
        if len(dataset_list) <= 1:
            return dataset_list

        def tier_priority(dataset):
            dataset_tier = dataset.split('/')[-1]
            # DQMIO priority is the lowest because it does not produce any events
            # and is used only for some statistical things
            tier_priority = ['USER',
                             'FEVT',
                             'RAW-HLT',
                             'ALCARECO',
                             'ALCAPROMPT',
                             'HLT',
                             'DQM',
                             'DQMIO',
                             'DQMROOT',
                             'GEN-SIM-RECODEBUG',
                             'GEN-SIM-DIGI-RECODEBUG',
                             'GEN-SIM-RAWDEBUG',
                             'GEN-SIM-RAW-HLTDEBUG',
                             'GEN-SIM-RAW-HLTDEBUG-RECO',
                             'GEN-SIM-RAW-HLTDEBUG-RECODEBUG',
                             'GEN-SIM-DIGI-RAW-HLTDEBUG-RECO',
                             'GEN-SIM-DIGI-RAW-HLTDEBUG',
                             'GEN-SIM-DIGI-HLTDEBUG-RECO',
                             'GEN-SIM-DIGI-HLTDEBUG',
                             'FEVTDEBUGHLT',
                             'GEN-RAWDEBUG',
                             'RAWDEBUG',
                             'RECODEBUG',
                             'HLTDEBUG',
                             'RAWRECOSIMHLT',
                             'RAW-RECOSIMHLT',
                             'RECOSIMHLT',
                             'FEVTHLTALL',
                             'PREMIXRAW',
                             'PREMIX-RAW',
                             'RAW',
                             'RAW-RECO',
                             'LHE',
                             'GEN',
                             'GEN-RAW',
                             'GEN-SIM',
                             'SIM',
                             'DIGI',
                             'DIGI-RECO',
                             'RECO',
                             'RAWAODSIM',
                             'GEN-SIM-RECO',
                             'GEN-SIM-RAW',
                             'GEN-SIM-RAW-HLT',
                             'GEN-SIM-RAW-RECO',
                             'GEN-SIM-DIGI',
                             'GEN-SIM-DIGI-RECO',
                             'GEN-SIM-DIGI-RAW',
                             'GEN-SIM-DIGI-RAW-RECO',
                             'AOD',
                             'AODSIM',
                             'MINIAOD',
                             'MINIAODSIM',
                             'NANOAOD',
                             'NANOAODSIM']

            for (priority, tier) in enumerate(tier_priority):
                if tier.upper() == dataset_tier:
                    return priority

            return -1

        dataset_list = sorted(dataset_list, key=tier_priority)
        return dataset_list

    def get_list_of_changed_workflows(self):
        """
        Get list of workflows that changed in RequestManager since last update.
        """
        last_seq = self.database.get_setting('last_reqmgr_sequence', 0)
        url = f'/couchdb/reqmgr_workload_cache/_changes?since={last_seq}'
        self.logger.info('Getting the list of all workflows since %d from %s', last_seq, url)
        response = make_cmsweb_request(url)
        last_seq = int(response['last_seq'])
        wf_list = response['results']
        changed_wf_list = list(filter(lambda x: not x.get('deleted', False), wf_list))
        changed_wf_list = [wf['id'] for wf in changed_wf_list]
        changed_wf_list = list(filter(lambda x: '_design' not in x, changed_wf_list))
        deleted_wf_list = list(filter(lambda x: x.get('deleted', False), wf_list))
        deleted_wf_list = [wf['id'] for wf in deleted_wf_list]
        deleted_wf_list = list(filter(lambda x: '_design' not in x, deleted_wf_list))
        self.logger.info('Got %d updated workflows. Got %d deleted workflows.',
                         len(changed_wf_list),
                         len(deleted_wf_list))
        return changed_wf_list, deleted_wf_list, last_seq

    def get_updated_dataset_list_from_dbs(self, since_timestamp=0):
        """
        Get list of datasets that changed since last update.
        """
        url = f'/dbs/prod/global/DBSReader/datasets?min_ldate={since_timestamp}&dataset_access_type=*'
        self.logger.info('Getting the list of modified datasets since %d from %s',
                         since_timestamp,
                         url)
        dataset_list = make_cmsweb_prod_request(url)
        if dataset_list is None:
            self.logger.error('Could not get list of modified datasets since %d from %s',
                              since_timestamp,
                              url)
            return []

        dataset_list = [dataset['dataset'] for dataset in dataset_list]
        self.logger.info('Got %d datasets', len(dataset_list))
        return dataset_list

    def get_list_of_workflows_with_changed_datasets(self):
        """
        Get list of workflows whose datasets changed since last update.
        """
        self.logger.info('Will get list of changed datasets')
        workflows = set()
        last_dataset_modification_date = max(0, self.database.get_setting('last_dbs_update_date', 0) - 300) # 300s margin
        updated_datasets = self.get_updated_dataset_list_from_dbs(since_timestamp=last_dataset_modification_date)
        self.logger.info('Will find if any of changed datasets belong to workflows in database')
        for dataset in updated_datasets:
            dataset_workflows = self.database.get_workflows_with_output_dataset(dataset, page_size=1000)
            self.logger.info('%d workflows contain %s', len(dataset_workflows), dataset)
            workflows.update(dataset_workflows)

        workflows_from_wmstats = self.get_active_workflows_from_wmstats()
        workflows.update(set(workflows_from_wmstats))

        self.logger.info('Found %d workflows for changed datasets', len(workflows))
        return workflows

    def get_active_workflows_from_wmstats(self):
        """
        Get list of workflows which are currently putting data to DBS.
        """
        self.logger.info('Will get list of workflows which are currently putting data to DBS')
        url = '/wmstatsserver/data/filtered_requests?mask=RequestName'
        try:
            workflow_list = make_cmsweb_request(url, timeout=600, keep_open=False)
        except AttributeError as ae:
            self.logger.error(ae)
            workflow_list = None

        if workflow_list is None:
            self.logger.error('Could not get list of workflows from wmstats')
            return []

        workflow_list = workflow_list.get('result', [])
        workflow_list = [workflow['RequestName'] for workflow in workflow_list]

        self.logger.info('Found %d workflows which are currently putting data to DBS',
                         len(workflow_list))
        return workflow_list

    def get_list_of_previously_crashed_workflows(self):
        """
        Return list of workflows that failed during previous update
        """
        workflows = self.database.get_setting('failed_workflows', [])
        return list(set(workflows))

    def remove_from_list_of_crashed_workflows(self, workflow_name):
        """
        Remove workflow from list of failed workflows that should be updated during next update
        """
        workflows = self.get_list_of_previously_crashed_workflows()
        if workflow_name in set(workflows):
            workflows = [x for x in workflows if x != workflow_name]
            self.database.set_setting('failed_workflows', workflows)

    def add_to_list_of_crashed_workflows(self, workflow_name):
        """
        Add workflow to list of failed workflows that should be updated during next update
        """
        workflows = self.get_list_of_previously_crashed_workflows()
        if workflow_name not in set(workflows):
            workflows.append(workflow_name)
            self.database.set_setting('failed_workflows', workflows)

    def trigger_outside(self, workflow, trigger_prod=False, trigger_dev=False):
        """
        Trigger something outside (McM) when workflow is updated
        """
        workflow_name = workflow['_id']
        workflow_type = workflow.get('RequestType')
        outside_urls = []
        self.logger.info('Trigger outside for %s (%s)', workflow_name, workflow_type)
        if trigger_prod:
            if workflow_type.lower() == 'rereco' or workflow.get('PrepID', '').startswith('ReReco-'):
                outside_urls.append({'url': 'https://cms-pdmv.cern.ch/rereco/api/requests/update_workflows',
                                     'cookie': 'prod_cookie.txt',
                                     'data': {'prepid': workflow.get('PrepID', '')},
                                     'method': 'POST'})
            elif 'RVCMSSW' in workflow_name:
                outside_urls.append({'url': 'https://cms-pdmv.cern.ch/relval/api/relvals/update_workflows',
                                     'cookie': 'prod_cookie.txt',
                                     'data': {'prepid': workflow.get('PrepID', '')},
                                     'method': 'POST'})
            else:
                outside_urls.append({'url': f'https://cms-pdmv.cern.ch/mcm/restapi/requests/fetch_stats_by_wf/{workflow_name}',
                                     'cookie': 'prod_cookie.txt'})

        if trigger_dev:
            if workflow_type.lower() == 'rereco' or workflow.get('PrepID', '').startswith('ReReco-'):
                outside_urls.append({'url': 'https://cms-pdmv-dev.cern.ch/rereco/api/requests/update_workflows',
                                     'cookie': 'dev_cookie.txt',
                                     'data': {'prepid': workflow.get('PrepID', '')},
                                     'method': 'POST'})
            elif 'RVCMSSW' in workflow_name:
                outside_urls.append({'url': 'https://cms-pdmv-dev.cern.ch/relval/api/relvals/update_workflows',
                                     'cookie': 'dev_cookie.txt',
                                     'data': {'prepid': workflow.get('PrepID', '')},
                                     'method': 'POST'})
            else:
                outside_urls.append({'url': f'https://cms-pdmv-dev.cern.ch/mcm/restapi/requests/fetch_stats_by_wf/{workflow_name}',
                                     'cookie': 'dev_cookie.txt'})

        for outside in outside_urls:
            try:
                self.logger.info('Triggering outside for %s', workflow_name)
                args = ['curl',
                        '-X',
                        outside.get('method', 'GET'),
                        outside['url'],
                        '-s',  # Silent
                        '-k',  # Ignore invalid https certificate
                        '-L',  # Follow 3xx codes
                        '-m 20',  # Timeout 20s
                        '-w %{http_code}',  # Return only HTTP code
                        '-o /dev/null']
                if outside.get('cookie'):
                    self.logger.info('Append cookie "%s" while making request for %s',
                                     outside['cookie'],
                                     workflow_name)
                    args += ['--cookie', outside['cookie']]

                if outside.get('data'):
                    self.logger.info('Adding data "%s" while making request for %s',
                                     outside['data'],
                                     workflow_name)
                    args += ['-d', '\'%s\'' % (json.dumps(outside['data']))]
                    args += ['-H', '"Content-Type: application/json"']

                args = ' '.join(args)
                proc = subprocess.Popen(args, stdout=subprocess.PIPE, shell=True)
                code = proc.communicate()[0]
                code = int(code)
                self.logger.info('HTTP code %s for %s', code, workflow_name)
            except Exception as ex:
                self.logger.error('Exception while triggering %s for %s. Exception: %s',
                                  outside['url'],
                                  workflow_name,
                                  str(ex))
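A hedged usage sketch for the class above (the import path is a placeholder and assumes CMSWEB credentials and database access are already configured):

from stats_update import StatsUpdate  # placeholder module path

updater = StatsUpdate()
# Update a single workflow without triggering McM/ReReco/RelVal callbacks
updater.perform_update('example_workflow_name', trigger_prod=False, trigger_dev=False)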
Example #12
def __init__(self):
    self.logger = logging.getLogger('logger')
    self.database = Database()
Example #13
class StatsUpdate():
    """
    Update request info in Stats2 database.
    """

    __SKIPPABLE_STATUS = set([
        'rejected', 'aborted', 'failed', 'rejected-archived',
        'aborted-archived', 'failed-archived', 'aborted-completed'
    ])

    def __init__(self):
        self.logger = logging.getLogger('logger')
        self.database = Database()

    def perform_update(self, request_name=None):
        """
        Perform update for specific request if request name is given or for all changed
        requests if no name is specified.
        """
        if request_name is not None:
            self.perform_update_one(request_name)
        else:
            self.perform_update_new()

        self.logger.info('Requests after update %d' %
                         (self.database.get_request_count()))

    def perform_update_one(self, request_name):
        """
        Perform update for specific request: fetch new dictionary from RequestManager
        and update event recalculation
        """
        self.logger.info('Will update only one request: %s' % (request_name))
        self.update_one(request_name)
        self.recalculate_one(request_name)

    def perform_update_new(self):
        """
        Perform update for all requests that changed since last update and recalculate
        events for files that changed since last update
        """
        update_start = time.time()
        changed_requests, deleted_requests, last_seq = self.get_list_of_changed_requests(
        )
        self.logger.info('Will delete %d requests' % (len(deleted_requests)))
        for request_name in deleted_requests:
            try:
                self.delete_one(request_name)
            except Exception as e:
                self.logger.error('Exception while deleting %s:%s' %
                                  (request_name, str(e)))

        self.logger.info('Will update %d requests' % (len(changed_requests)))
        for index, request_name in enumerate(changed_requests):
            try:
                self.logger.info('Will update %d/%d request' %
                                 (index + 1, len(changed_requests)))
                self.update_one(request_name)
            except Exception as e:
                self.logger.error(
                    'Exception while updating %s:%s\nTraceback:%s' %
                    (request_name, str(e), traceback.format_exc()))

        update_end = time.time()
        self.logger.info('Finished updating requests')
        self.logger.info('Will update event count')
        changed_datasets = self.get_list_of_requests_with_changed_datasets()
        requests_to_recalculate = set(changed_requests).union(
            set(changed_datasets))

        self.logger.info('Will update event count for %d requests' %
                         (len(requests_to_recalculate)))
        for index, request_name in enumerate(requests_to_recalculate):
            try:
                self.logger.info('Will update event count for %d/%d' %
                                 (index + 1, len(requests_to_recalculate)))
                self.recalculate_one(request_name)
            except Exception as e:
                self.logger.error(
                    'Exception while updating event count %s:%s\nTraceback:%s'
                    % (request_name, str(e), traceback.format_exc()))

        recalculation_end = time.time()
        self.database.set_setting('last_reqmgr_sequence', int(last_seq))
        self.database.set_setting('last_dbs_update_date', int(update_start))
        self.logger.info('Updated and deleted %d/%d requests in %.3fs' %
                         (len(changed_requests), len(deleted_requests),
                          (update_end - update_start)))
        self.logger.info('Updated event count for %d requests in %.3fs' %
                         (len(requests_to_recalculate),
                          (recalculation_end - update_end)))

    def update_one(self, request_name):
        """
        Action to update one request's dictionary from RequestManager. If no such
        request exists in the database, a new one will be created.
        """
        self.logger.info('Updating %s' % (request_name))
        update_start = time.time()
        req_dict = self.get_new_dict_from_reqmgr2(request_name)
        req_transitions = req_dict.get('RequestTransition', [])
        for req_transition in req_transitions:
            if req_transition['Status'] in self.__SKIPPABLE_STATUS:
                self.logger.info(
                    'Skipping and deleting %s because its status is %s' %
                    (request_name, req_transition['Status']))
                self.database.delete_request(request_name)
                return

        req_dict_old = self.database.get_request(request_name)
        if req_dict_old is None:
            req_dict_old = {'_id': request_name}
            self.logger.info('Inserting %s' % (request_name))
            self.database.update_request(req_dict_old)
            req_dict_old = self.database.get_request(request_name)
            # self.steal_history_from_old_stats(req_dict_old)

        req_dict['_rev'] = req_dict_old['_rev']
        req_dict['EventNumberHistory'] = req_dict_old.get(
            'EventNumberHistory', [])
        req_dict['OutputDatasets'] = self.sort_datasets(
            req_dict['OutputDatasets'])
        self.database.update_request(req_dict)
        update_end = time.time()
        self.logger.info('Updated %s in %.3fs' % (request_name,
                                                  (update_end - update_start)))

    def delete_one(self, request_name):
        """
        Action to delete one request from the database.
        """
        self.logger.info('Deleting %s' % (request_name))
        self.database.delete_request(request_name)
        self.logger.info('Deleted %s' % (request_name))

    def recalculate_one(self, request_name):
        """
        Action to update the event count for a request.
        """
        recalc_start = time.time()
        self.logger.info('Updating event count for %s' % (request_name))
        request = self.database.get_request(request_name)
        if request is None:
            self.logger.warning(
                'Will not update event count for %s because it is no longer in the database'
                % (request_name))
            return

        history_entry = self.get_new_history_entry(request)
        added_history_entry = self.add_history_entry_to_request(
            request, history_entry)
        recalc_end = time.time()
        if added_history_entry:
            self.database.update_request(request)
            self.logger.info('Updated event count for %s in %.3fs' %
                             (request_name, (recalc_end - recalc_start)))
        else:
            self.logger.info('Did not update event count for %s' %
                             (request_name))

    def get_new_dict_from_reqmgr2(self, request_name):
        """
        Get request dictionary from RequestManager.
        """
        url = '/couchdb/reqmgr_workload_cache/%s' % (request_name)
        req_dict = make_cmsweb_request(url)
        expected_events = self.get_expected_events_with_dict(req_dict)
        campaigns = self.get_campaigns_from_request(req_dict)
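        # Keep only the attributes that are stored in the local database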
        req_dict = pick_attributes(req_dict, [
            'AcquisitionEra', 'InputDataset', 'Memory', 'OutputDatasets',
            'PrepID', 'RequestName', 'RequestPriority', 'RequestTransition',
            'RequestType', 'SizePerEvent', 'TimePerEvent'
        ])
        req_dict['RequestTransition'] = [{
            'Status': tr['Status'],
            'UpdateTime': tr['UpdateTime']
        } for tr in req_dict.get('RequestTransition', [])]
        req_dict['_id'] = request_name
        req_dict['TotalEvents'] = expected_events
        req_dict['Campaigns'] = campaigns
        req_dict['OutputDatasets'] = self.sort_datasets(
            req_dict['OutputDatasets'])
        req_dict['EventNumberHistory'] = []
        req_dict['RequestPriority'] = int(req_dict.get('RequestPriority', 0))
        return req_dict

    def get_event_count_from_dbs(self, dataset_name):
        """
        Get event count for specified dataset from DBS.
        """
        query_url = '/dbs/prod/global/DBSReader/filesummaries?dataset=%s' % (
            dataset_name)
        filesummaries = make_cmsweb_request(query_url)
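        # filesummaries is a list with at most one summary dictionary;
        # an empty list is treated as zero events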
        if len(filesummaries) == 0:
            return 0

        return int(filesummaries[0]['num_event'])

    def get_new_history_entry(self, req_dict, depth=0):
        """
        Form a new history entry dictionary for given request.
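        The returned entry has the form:
        {'Time': <timestamp>, 'Datasets': {<name>: {'Type': ..., 'Events': ...}}}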
        """
        output_datasets = req_dict.get('OutputDatasets', [])
        output_datasets_set = set(output_datasets)
        if len(output_datasets) == 0:
            return None

        history_entry = {'Time': int(time.time()), 'Datasets': {}}
        dataset_list_url = '/dbs/prod/global/DBSReader/datasetlist'
        dbs_dataset_list = make_cmsweb_request(dataset_list_url, {
            'dataset': output_datasets,
            'detail': 1
        })
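        # Datasets known to DBS get their access type and event count,
        # anything left over gets a placeholder entry below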
        for dbs_dataset in dbs_dataset_list:
            dataset_name = dbs_dataset['dataset']
            history_entry['Datasets'][dataset_name] = {
                'Type': dbs_dataset['dataset_access_type'],
                'Events': self.get_event_count_from_dbs(dataset_name)
            }
            output_datasets_set.remove(dataset_name)

        for dataset in output_datasets_set:
            history_entry['Datasets'][dataset] = {'Type': 'NONE', 'Events': 0}

        if len(history_entry['Datasets']) != len(output_datasets):
            self.logger.error(
                'Wrong number of datasets for %s, returning None' %
                (req_dict['_id']))
            return None

        return history_entry

    def add_history_entry_to_request(self, req_dict, new_history_entry):
        """
        Add a history entry to the request if an identical entry does not already exist.
        """
        if new_history_entry is None:
            return False

        new_dict_string = json.dumps(new_history_entry['Datasets'],
                                     sort_keys=True)
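        # Compare snapshots by their serialized 'Datasets' dictionaries so that
        # an identical snapshot is not stored twice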
        history_entries = req_dict['EventNumberHistory']
        for history_entry in history_entries:
            old_dict_string = json.dumps(history_entry['Datasets'],
                                         sort_keys=True)
            if new_dict_string == old_dict_string:
                return False

        history_entries.append(new_history_entry)
        # self.logger.info(json.dumps(history_entry, indent=2))
        return True

    def get_expected_events_with_dict(self, req_dict):
        """
        Get number of expected events of a request.
        """
        if 'FilterEfficiency' in req_dict:
            f = float(req_dict['FilterEfficiency'])
        elif 'Task1' in req_dict and 'FilterEfficiency' in req_dict['Task1']:
            f = float(req_dict['Task1']['FilterEfficiency'])
        elif 'Step1' in req_dict and 'FilterEfficiency' in req_dict['Step1']:
            f = float(req_dict['Step1']['FilterEfficiency'])
        else:
            f = 1.

        req_type = req_dict.get('RequestType', '').lower()
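        # For regular requests the expected event count comes from TotalInputEvents
        # (scaled by the filter efficiency) when input files are present, otherwise
        # from RequestNumEvents or the input dataset of the first task/step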
        if req_type != 'resubmission':
            if req_dict.get('TotalInputFiles', 0) > 0:
                if 'TotalInputEvents' in req_dict:
                    return int(f * req_dict['TotalInputEvents'])

            if req_dict.get('RequestNumEvents') is not None:
                return int(req_dict['RequestNumEvents'])
            elif 'Task1' in req_dict and 'RequestNumEvents' in req_dict['Task1']:
                return int(req_dict['Task1']['RequestNumEvents'])
            elif 'Step1' in req_dict and 'RequestNumEvents' in req_dict['Step1']:
                return int(req_dict['Step1']['RequestNumEvents'])
            elif 'Task1' in req_dict and 'InputDataset' in req_dict['Task1']:
                return self.get_event_count_from_dbs(
                    req_dict['Task1']['InputDataset'])
            elif 'Step1' in req_dict and 'InputDataset' in req_dict['Step1']:
                return self.get_event_count_from_dbs(
                    req_dict['Step1']['InputDataset'])

        else:
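            # Resubmissions do not carry a total event count themselves, so it is
            # taken from the original (non-resubmission) request with the same PrepID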
            prep_id = req_dict['PrepID']
            url = '/reqmgr2/data/request?mask=TotalInputEvents&mask=RequestType&prep_id=%s' % (
                prep_id)
            ret = make_cmsweb_request(url)
            ret = ret['result']
            if len(ret) > 0:
                ret = ret[0]
                for r in ret:
                    if ret[r]['RequestType'].lower() != 'resubmission'\
                            and ret[r]['TotalInputEvents'] is not None:
                        return int(f * ret[r]['TotalInputEvents'])

        self.logger.error('%s does not have total events!' % (req_dict['_id']))
        return -1

    def get_campaigns_from_request(self, req_dict):
        """
        Get list of campaigns or acquisition eras in tasks. If there are no tasks,
        the request's campaign or acquisition era will be used.
        """
        task_number = 1
        campaigns = []
        # Check whether it's a TaskChain or a StepChain
        if 'StepChain' in req_dict:
            task_format = 'Step%s'
        else:
            task_format = 'Task%s'

        while True:
            task_name = task_format % task_number
            if task_name not in req_dict:
                break

            if 'Campaign' in req_dict[task_name]\
                    and req_dict[task_name]['Campaign'] is not None\
                    and len(req_dict[task_name]['Campaign']) > 0:
                campaigns.append(req_dict[task_name]['Campaign'])
            elif 'AcquisitionEra' in req_dict[task_name]\
                    and req_dict[task_name]['AcquisitionEra'] is not None\
                    and len(req_dict[task_name]['AcquisitionEra']) > 0:
                campaigns.append(req_dict[task_name]['AcquisitionEra'])

            task_number += 1

        if len(campaigns) == 0:
            if 'Campaign' in req_dict\
                    and req_dict['Campaign'] is not None\
                    and len(req_dict['Campaign']) > 0:
                campaigns.append(req_dict['Campaign'])
            elif 'AcquisitionEra' in req_dict\
                    and req_dict['AcquisitionEra'] is not None\
                    and len(req_dict['AcquisitionEra']) > 0:
                campaigns.append(req_dict['AcquisitionEra'])

        return campaigns

    def sort_datasets(self, dataset_list):
        """
        Sort dataset list by specific priority list.
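        Datasets whose tier appears earlier in the priority list (e.g. DQMIO)
        are placed before datasets whose tier appears later (e.g. NANOAODSIM).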
        """
        if len(dataset_list) <= 1:
            return dataset_list

        def tierLevel(dataset):
            tier = dataset.split('/')[-1]
            # DQMIO priority is the lowest because it does not produce any events
            # and is used only for some statistical things
            tier_priority = [
                'USER', 'FEVT', 'RAW-HLT', 'ALCARECO', 'ALCAPROMPT', 'HLT',
                'DQM', 'DQMIO', 'DQMROOT', 'GEN-SIM-RECODEBUG',
                'GEN-SIM-DIGI-RECODEBUG', 'GEN-SIM-RAWDEBUG',
                'GEN-SIM-RAW-HLTDEBUG', 'GEN-SIM-RAW-HLTDEBUG-RECO',
                'GEN-SIM-RAW-HLTDEBUG-RECODEBUG',
                'GEN-SIM-DIGI-RAW-HLTDEBUG-RECO', 'GEN-SIM-DIGI-RAW-HLTDEBUG',
                'GEN-SIM-DIGI-HLTDEBUG-RECO', 'GEN-SIM-DIGI-HLTDEBUG',
                'FEVTDEBUGHLT', 'GEN-RAWDEBUG', 'RAWDEBUG', 'RECODEBUG',
                'HLTDEBUG', 'RAWRECOSIMHLT', 'RAW-RECOSIMHLT', 'RECOSIMHLT',
                'FEVTHLTALL', 'PREMIXRAW', 'PREMIX-RAW', 'RAW', 'RAW-RECO',
                'LHE', 'GEN', 'GEN-RAW', 'GEN-SIM', 'SIM', 'DIGI', 'DIGI-RECO',
                'RECO', 'RAWAODSIM', 'GEN-SIM-RECO', 'GEN-SIM-RAW',
                'GEN-SIM-RAW-HLT', 'GEN-SIM-RAW-RECO', 'GEN-SIM-DIGI',
                'GEN-SIM-DIGI-RECO', 'GEN-SIM-DIGI-RAW',
                'GEN-SIM-DIGI-RAW-RECO', 'AOD', 'AODSIM', 'MINIAOD',
                'MINIAODSIM', 'NANOAOD', 'NANOAODSIM'
            ]

            for (p, t) in enumerate(tier_priority):
                if t.upper() == tier:
                    return p

            return -1

        dataset_list = sorted(dataset_list, key=tierLevel)
        return dataset_list

    def get_list_of_changed_requests(self):
        """
        Get list of requests that changed in RequestManager since last update.
        """
        last_seq = self.database.get_setting('last_reqmgr_sequence', 0)
        url = '/couchdb/reqmgr_workload_cache/_changes?since=%d' % (last_seq)
        self.logger.info('Getting the list of all requests since %d from %s' %
                         (last_seq, url))
        response = make_cmsweb_request(url)
        last_seq = int(response['last_seq'])
        req_list = response['results']
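        # Split the changes feed into updated and deleted documents and drop
        # CouchDB design documents from both lists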
        changed_req_list = list(
            filter(lambda x: not x.get('deleted', False), req_list))
        changed_req_list = [req['id'] for req in changed_req_list]
        changed_req_list = list(
            filter(lambda x: '_design' not in x, changed_req_list))
        deleted_req_list = list(
            filter(lambda x: x.get('deleted', False), req_list))
        deleted_req_list = [req['id'] for req in deleted_req_list]
        deleted_req_list = list(
            filter(lambda x: '_design' not in x, deleted_req_list))
        self.logger.info('Got %d updated requests. Got %d deleted requests.' %
                         (len(changed_req_list), len(deleted_req_list)))
        return changed_req_list, deleted_req_list, last_seq

    def get_updated_dataset_list_from_dbs(self, since_timestamp=0):
        """
        Get list of datasets that changed since last update.
        """
        url = '/dbs/prod/global/DBSReader/datasets?min_ldate=%d&dataset_access_type=*' % (
            since_timestamp)
        self.logger.info(
            'Getting the list of modified datasets since %d from %s' %
            (since_timestamp, url))
        dataset_list = make_cmsweb_request(url)
        dataset_list = [dataset['dataset'] for dataset in dataset_list]
        self.logger.info('Got %d datasets' % (len(dataset_list)))
        return dataset_list

    def get_list_of_requests_with_changed_datasets(self):
        """
        Get list of requests whose datasets changed since last update.
        """
        self.logger.info('Will get list of changed datasets')
        requests = set()
        last_dataset_modification_date = self.database.get_setting(
            'last_dbs_update_date', 0)
        updated_datasets = self.get_updated_dataset_list_from_dbs(
            since_timestamp=last_dataset_modification_date)
        self.logger.info(
            'Will check if any of the changed datasets belong to requests in the database'
        )
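        # Collect all stored requests that contain any of the changed datasets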
        for dataset in updated_datasets:
            dataset_requests = self.database.get_requests_with_dataset(
                dataset, page_size=1000)
            self.logger.info('%d requests contain %s' %
                             (len(dataset_requests), dataset))
            requests.update(dataset_requests)

        requests_from_wmstats = self.get_active_requests_from_wmstats()
        requests.update(set(requests_from_wmstats))

        self.logger.info('Found %d requests for changed datasets' %
                         (len(requests)))
        return requests

    def get_active_requests_from_wmstats(self):
        """
        Get list of requests which are currently putting data to DBS.
        """
        self.logger.info(
            'Will get list of requests which are currently putting data to DBS'
        )
        url = '/wmstatsserver/data/filtered_requests?mask=RequestName'
        request_list = make_cmsweb_request(url).get('result', [])
        request_list = [request['RequestName'] for request in request_list]

        self.logger.info(
            'Found %d requests which are currently putting data to DBS' %
            (len(request_list)))
        return request_list

    def steal_history_from_old_stats(self, req_dict):
        from time import strptime, mktime
        self.logger.info('Stealing history for %s from old Stats... ;)' %
                         (req_dict['_id']))
        if 'EventNumberHistory' not in req_dict:
            req_dict['EventNumberHistory'] = []

        try:
            stats_url = "http://vocms074:5984/stats/%s" % (req_dict['_id'])
            stats_req = make_simple_request(stats_url)
            stats_history = stats_req.get('pdmv_monitor_history', [])
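            # Convert each old Stats history point into the new EventNumberHistory format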
            for stats_history_entry in stats_history:
                timestamp = mktime(
                    strptime(stats_history_entry['pdmv_monitor_time']))
                new_history_entry = {'Time': int(timestamp), 'Datasets': {}}
                for dataset, events_dict in stats_history_entry.get(
                        'pdmv_dataset_statuses', {}).items():
                    type_in_stats = events_dict.get('pdmv_status_in_DAS',
                                                    'NONE')
                    if not type_in_stats:
                        type_in_stats = 'NONE'

                    events_in_stats = int(
                        events_dict.get('pdmv_evts_in_DAS', 0))
                    new_history_entry['Datasets'][dataset] = {
                        'Events': events_in_stats,
                        'Type': type_in_stats
                    }

                self.add_history_entry_to_request(req_dict, new_history_entry)

            def sort_by_time(history_entry):
                return history_entry['Time']

            req_dict['EventNumberHistory'].sort(key=sort_by_time)
        except Exception as ex:
            self.logger.error(ex)