예제 #1
0
    def __check_if_running(self, relmon, database):
        """
        Check if given RelMon is running in HTCondor and save its status

        Runs condor_q over SSH, looks for the row whose ClusterId is exactly
        the HTCondor id of the RelMon and saves the translated JobStatus to
        the database. If the command reported an error or no row with that
        exact id was found, the status is saved as '<unknown>'.

        :param relmon: RelMon object whose HTCondor job should be checked
        :param database: database object used to refetch and update the RelMon
        """
        relmon_condor_id = relmon.get_condor_id()
        self.logger.info('Will check if %s is running in HTCondor, id: %s',
                         relmon, relmon_condor_id)
        stdout, stderr = self.ssh_executor.execute_command(
            'module load lxbatch/tzero && condor_q -af:h ClusterId JobStatus | '
            'grep %s' % (relmon_condor_id))
        # Translation of HTCondor JobStatus numbers to names
        status_dict = {
            '0': 'UNEXPLAINED',
            '1': 'IDLE',
            '2': 'RUN',
            '3': 'REMOVED',
            '4': 'DONE',
            '5': 'HOLD',
            '6': 'SUBMISSION ERROR'
        }
        new_condor_status = '<unknown>'
        if stdout and not stderr:
            # grep matches substrings, so its output may contain rows of other
            # jobs (e.g. id 80134 also matches 801341). Accept only rows whose
            # first column is exactly this job's id. If there are several such
            # rows, the last one wins, as it did before
            wanted_id = str(relmon_condor_id)
            status_number = None
            for line in stdout.splitlines():
                columns = line.split()
                if len(columns) >= 2 and columns[0] == wanted_id:
                    status_number = columns[-1]

            if status_number is None:
                self.logger.error(
                    'Could not find HTCondor job %s of %s in output: %s',
                    relmon_condor_id, relmon, stdout)
            else:
                self.logger.info('Relmon %s status is %s', relmon,
                                 status_number)
                # Status numbers that are not in the dictionary are saved
                # as REMOVED
                new_condor_status = status_dict.get(status_number, 'REMOVED')
        else:
            self.logger.error('Error with HTCondor?\nOutput: %s.\nError %s',
                              stdout, stderr)

        # Refetch before saving, so the status is set on the stored version
        relmon = RelMon(database.get_relmon(relmon.get_id()))
        self.logger.info('Saving %s condor status as %s', relmon,
                         new_condor_status)
        relmon.set_condor_status(new_condor_status)
        database.update_relmon(relmon)
예제 #2
0
 def __delete_relmon(self, relmon_id, database):
     """
     Terminate RelMon, delete it from the database and remove its local
     directory ('relmons/<relmon_id>') if such directory exists

     :param relmon_id: id of the RelMon to delete
     :param database: database object used to fetch and delete the RelMon
     """
     relmon = RelMon(database.get_relmon(relmon_id))
     self.__terminate_relmon(relmon)
     database.delete_relmon(relmon)
     relmon_directory = 'relmons/%s' % (relmon_id)
     if not os.path.isdir(relmon_directory):
         # Nothing was created locally, nothing to clean up
         return

     shutil.rmtree(relmon_directory, ignore_errors=True)
예제 #3
0
def edit_relmon():
    """
    API for RelMon editing

    Responses:
    * 403 if user is not authorized
    * 409 if a RelMon with the same name but a different id exists
    * 404 if the RelMon has no id or is not in the database
    * 'OK' message after the edit was passed to controller and a
      controller tick was requested
    """
    if not is_user_authorized():
        return output_text({'message': 'Unauthorized'}, code=403)

    request_body = request.data.decode('utf-8')
    relmon = RelMon(json.loads(request_body))
    database = Database()
    same_name_relmons = database.get_relmons_with_name(relmon.get_name())
    # Same name is fine only if it is this very RelMon
    name_taken = any(other['id'] != relmon.get_id()
                     for other in same_name_relmons)
    if name_taken:
        return output_text({'message': 'RelMon with this name already exists'}, code=409)

    relmon_id = relmon.get_id()
    existing_relmon = database.get_relmon(relmon_id)
    if not (relmon_id and existing_relmon):
        return output_text({'message': 'RelMon does not exist'}, code=404)

    controller.edit_relmon(relmon, database, user_info_dict())
    controller_tick()
    return output_text({'message': 'OK'})
예제 #4
0
    def __reset_relmon(self, relmon_id, database, user_info):
        """
        Perform RelMon reset
        Terminate the RelMon, reset it, set given user info on it and save
        it to the database. If it was reset by a different user while its
        status was not 'done', send a reset notification

        :param relmon_id: id of the RelMon to reset
        :param database: database object used to fetch and update the RelMon
        :param user_info: dictionary of the user who performs the reset,
                          'login' key is used to compare users
        """
        relmon = RelMon(database.get_relmon(relmon_id))
        # Status is read before the termination call
        status_before_reset = relmon.get_status()
        self.__terminate_relmon(relmon)
        previous_login = relmon.get_user_info().get('login')
        resetting_login = user_info.get('login')
        reset_by_other_user = previous_login != resetting_login
        if reset_by_other_user and status_before_reset != 'done':
            self.logger.info('Reset by %s while not done, should inform %s',
                             resetting_login, previous_login)
            self.__send_reset_notification(relmon, user_info)

        relmon.reset()
        relmon.set_user_info(user_info)
        database.update_relmon(relmon)
예제 #5
0
    def tick(self):
        """
        Controller works by doing "ticks" every once in a while
        During a tick it should check all existing relmons and
        their status and, if necessary, perform actions like
        submission or output collection
        Actions go like this:
        * Delete relmons that are in deletion list
        * Reset relmons that are in reset list
        * Check running relmons
        * Submit new relmons

        SSH connections are closed at the end of the tick, also when one of
        the steps raises an exception
        """
        database = Database()
        self.logger.info('Controller will tick')
        tick_start = time.time()
        try:
            # Delete relmons
            # Iterate over a snapshot: entries are removed from the real list
            # while looping and removing from a list that is being iterated
            # makes the loop skip the entry that follows each removed one
            relmons_to_delete = list(self.relmons_to_delete)
            self.logger.info('Relmons to delete (%s): %s.',
                             len(relmons_to_delete),
                             ','.join([x['id'] for x in relmons_to_delete]))
            for relmon_dict in relmons_to_delete:
                relmon_id = relmon_dict['id']
                self.__delete_relmon(relmon_id, database)
                self.relmons_to_delete.remove(relmon_dict)

            # Reset relmons
            # Snapshot for the same reason as in deletion above
            relmons_to_reset = list(self.relmons_to_reset)
            self.logger.info('Relmons to reset (%s): %s.',
                             len(relmons_to_reset),
                             ', '.join([x['id'] for x in relmons_to_reset]))
            for relmon_dict in relmons_to_reset:
                relmon_id = relmon_dict['id']
                self.__reset_relmon(relmon_id, database,
                                    relmon_dict['user_info'])
                self.relmons_to_reset.remove(relmon_dict)

            # Check relmons
            relmons_to_check = database.get_relmons_with_status('submitted')
            relmons_to_check.extend(database.get_relmons_with_status('running'))
            relmons_to_check.extend(database.get_relmons_with_status('finishing'))
            # Add relmons with HTCondor status RUN to be checked,
            # skip the ones that are already in the list
            for relmon_dict in database.get_relmons_with_condor_status('RUN'):
                already_added = any(added['_id'] == relmon_dict['_id']
                                    for added in relmons_to_check)
                if not already_added:
                    relmons_to_check.append(relmon_dict)

            self.logger.info('Relmons to check (%s): %s.',
                             len(relmons_to_check),
                             ', '.join(r.get('id') for r in relmons_to_check))
            for relmon_json in relmons_to_check:
                relmon = RelMon(relmon_json)
                self.__check_if_running(relmon, database)
                # Refetch after check if running save
                relmon = RelMon(database.get_relmon(relmon.get_id()))
                condor_status = relmon.get_condor_status()
                if condor_status in ('DONE', 'REMOVED'):
                    self.__collect_output(relmon, database)

            # Submit relmons
            relmons_to_submit = database.get_relmons_with_status('new')
            self.logger.info('Relmons to submit (%s): %s.',
                             len(relmons_to_submit),
                             ', '.join(r.get('id') for r in relmons_to_submit))
            for relmon_json in relmons_to_submit:
                relmon = RelMon(relmon_json)
                status = relmon.get_status()
                if status == 'new':
                    # Double check and if it is new, submit it
                    self.__submit_to_condor(relmon, database)
        finally:
            # Do not leave SSH connections open if any of the steps failed
            self.ssh_executor.close_connections()

        tick_end = time.time()
        self.logger.info('Controller tick finished. Took %.2fs',
                         tick_end - tick_start)
예제 #6
0
    def __submit_to_condor(self, relmon, database):
        """
        Submit given RelMon to HTCondor

        Steps:
        * Make sure local directory of the RelMon exists
        * Save the RelMon to database and refetch it
        * Create RelMon json, HTCondor submit file and job script
        * Recreate remote directory and upload the three files
        * Run condor_submit over SSH and parse its output

        Status is set to 'submitted' if condor_submit reported a submitted
        job and to 'failed' if it did not or if an exception was raised
        during file creation, upload or submission. RelMon is saved to
        the database at the end in both cases

        :param relmon: RelMon object to submit
        :param database: database object used to save and refetch the RelMon
        """
        relmon_id = relmon.get_id()
        local_dir = 'relmons/%s' % (relmon_id)
        if not os.path.isdir(local_dir):
            os.mkdir(local_dir)

        remote_dir = '%s/%s' % (self.remote_directory, relmon_id)
        self.logger.info('Will submit %s to HTCondor', relmon)
        self.logger.info('Remote directory of %s is %s', relmon, remote_dir)
        self.logger.info('Saving %s to database', relmon)
        database.update_relmon(relmon)
        # Continue with the version that is stored in the database
        relmon = RelMon(database.get_relmon(relmon_id))
        self.logger.info('Resources for %s: CPU: %s, memory: %s, disk %s',
                         relmon, relmon.get_cpu(), relmon.get_memory(),
                         relmon.get_disk())
        try:
            self.logger.info('Will create files for %s', relmon)
            # RelMon json dump, HTCondor submit file and the job script
            self.file_creator.create_relmon_file(relmon)
            self.file_creator.create_condor_job_file(relmon)
            self.file_creator.create_job_script_file(relmon)

            self.logger.info('Will prepare remote directory for %s', relmon)
            # Start from a clean remote directory
            cleanup_commands = [
                'rm -rf %s' % (remote_dir),
                'mkdir -p %s' % (remote_dir)
            ]
            self.ssh_executor.execute_command(cleanup_commands)

            self.logger.info('Will upload files for %s', relmon)
            # Local and remote files share the RELMON_<id> prefix
            local_prefix = '%s/RELMON_%s' % (local_dir, relmon_id)
            remote_prefix = '%s/RELMON_%s' % (remote_dir, relmon_id)
            for extension in ('json', 'sub', 'sh'):
                self.ssh_executor.upload_file(
                    '%s.%s' % (local_prefix, extension),
                    '%s.%s' % (remote_prefix, extension))

            self.logger.info('Will try to submit %s', relmon)
            # condor_submit is run through lxplus over ssh because condor is
            # not available on the website machine and setting it up locally
            # would be harder
            submit_commands = [
                'cd %s' % (remote_dir),
                'voms-proxy-init -voms cms --valid 24:00 --out $(pwd)/proxy.txt',
                'module load lxbatch/tzero && condor_submit RELMON_%s.sub' %
                (relmon_id)
            ]
            stdout, stderr = self.ssh_executor.execute_command(submit_commands)
            # Successful output is "1 job(s) submitted to cluster 801341"
            submitted = stdout and '1 job(s) submitted to cluster' in stdout
            if not submitted:
                self.logger.error(
                    'Error submitting %s.\nOutput: %s.\nError %s', relmon,
                    stdout, stderr)
                relmon.set_status('failed')
            else:
                relmon.set_status('submitted')
                # Cluster id is the last word of the output
                condor_id = int(float(stdout.split()[-1]))
                relmon.set_condor_id(condor_id)
                relmon.set_condor_status('IDLE')
                self.logger.info('Submitted %s. Condor job id %s', relmon,
                                 condor_id)

        except Exception as ex:
            relmon.set_status('failed')
            self.logger.error('Exception while trying to submit %s: %s',
                              relmon, str(ex))

        self.logger.info('%s status is %s', relmon, relmon.get_status())
        database.update_relmon(relmon)
예제 #7
0
    def edit_relmon(self, new_relmon, database, user_info):
        """
        Update name and categories of an existing RelMon

        If the stored RelMon is not done, its name and categories are
        replaced with the new ones, it is reset, saved and added to the
        reset list.
        If the stored RelMon is done, a "smart edit" is done: only
        categories that differ from the new ones or have 'rerun' set are
        updated and reset. Then:
        * if any category changed, RelMon is reset without resetting all
          categories and saved with the new name and user info
        * if only the name changed, reports are renamed and RelMon is saved
          with the new name and user info
        * if nothing changed, nothing is saved

        :param new_relmon: RelMon object with the edited values
        :param database: database object used to fetch and update the RelMon
        :param user_info: user info of the user who made the edit
        """
        relmon_id = new_relmon.get_id()
        stored_data = database.get_relmon(relmon_id)
        old_relmon = RelMon(stored_data)
        if old_relmon.get_status() != 'done':
            self.logger.info('Relmon %s will be reset', old_relmon)
            # Update only name and categories, do not allow to update anything else
            edited_name = new_relmon.get_name()
            old_relmon.get_json()['name'] = edited_name
            edited_categories = new_relmon.get_json().get('categories', [])
            old_relmon.get_json()['categories'] = edited_categories
            old_relmon.reset()
            database.update_relmon(old_relmon)
            self.add_to_reset_list(relmon_id, user_info)
        else:
            self.logger.info('Relmon %s is done, will try to do a smart edit',
                             old_relmon)
            new_category_names = [
                category['name']
                for category in new_relmon.get_json().get('categories')
            ]
            old_category_names = [
                category['name']
                for category in old_relmon.get_json().get('categories')
            ]
            self.logger.info('Relmon %s had these categories: %s', old_relmon,
                             old_category_names)
            self.logger.info('Relmon %s have these categories: %s', new_relmon,
                             new_category_names)
            categories_changed = False
            # Go through every category name that appears in either version
            for category_name in set(new_category_names + old_category_names):
                old_category = old_relmon.get_bare_category(category_name)
                new_category = new_relmon.get_bare_category(category_name)
                # Bare categories are compared as JSON strings
                old_serialized = json.dumps(old_category)
                new_serialized = json.dumps(new_category)
                force_rerun = new_relmon.get_category(category_name).get(
                    'rerun', False)
                if not force_rerun and old_serialized == new_serialized:
                    continue

                self.logger.info('Category %s of %s changed',
                                 category_name, old_relmon)
                categories_changed = True
                old_relmon.get_category(category_name).update(new_category)
                old_relmon.reset_category(category_name)

            name_changed = stored_data['name'] != new_relmon.get_name()
            if not (name_changed or categories_changed):
                self.logger.info('Nothing changed for %s?', old_relmon)
            else:
                new_name = new_relmon.get_name()
                if categories_changed:
                    # Categories changed, will have to resubmit
                    # Reset relmon without resetting all categories
                    old_relmon.reset(False)
                else:
                    # Only name changed, categories did not change, just a rename
                    self.logger.info(
                        'Renaming %s to %s without changing categories',
                        old_relmon, new_name)
                    self.rename_relmon_reports(relmon_id, new_name)

                old_relmon.set_name(new_name)
                old_relmon.set_user_info(user_info)
                database.update_relmon(old_relmon)

        self.logger.info('Relmon %s was edited', old_relmon)