def __check_if_running(self, relmon, database):
    """
    Check if given RelMon is running in HTCondor and get its status there

    Runs condor_q through SSH executor and looks for a row whose ClusterId
    is exactly the HTCondor id of the RelMon. JobStatus number of that row
    is translated to a status name, numbers that are not in the table are
    treated as 'REMOVED'. If there is no such row or there is something in
    stderr, error is logged and status is saved as '<unknown>'.
    RelMon is refetched from the database before new HTCondor status is set
    and saved, nothing is returned
    """
    relmon_condor_id = relmon.get_condor_id()
    self.logger.info('Will check if %s is running in HTCondor, id: %s',
                     relmon,
                     relmon_condor_id)
    stdout, stderr = self.ssh_executor.execute_command(
        'module load lxbatch/tzero && condor_q -af:h ClusterId JobStatus | '
        'grep %s' % (relmon_condor_id))
    new_condor_status = '<unknown>'
    # grep does a substring match, so id 1234 would also return rows of jobs
    # like 12345 or 51234. Accept only rows where ClusterId column is exactly
    # the id of this RelMon. If there are multiple such rows, last one wins
    status_number = None
    if stdout and not stderr:
        expected_cluster_id = str(relmon_condor_id)
        for line in stdout.splitlines():
            columns = line.split()
            if len(columns) >= 2 and columns[0] == expected_cluster_id:
                status_number = columns[-1]

    if status_number is not None:
        self.logger.info('Relmon %s status is %s', relmon, status_number)
        status_dict = {
            '0': 'UNEXPLAINED',
            '1': 'IDLE',
            '2': 'RUN',
            '3': 'REMOVED',
            '4': 'DONE',
            '5': 'HOLD',
            '6': 'SUBMISSION ERROR',
        }
        new_condor_status = status_dict.get(status_number, 'REMOVED')
    else:
        self.logger.error('Error with HTCondor?\nOutput: %s.\nError %s',
                          stdout,
                          stderr)

    # Refetch RelMon so only HTCondor status is changed in the latest version
    relmon = RelMon(database.get_relmon(relmon.get_id()))
    self.logger.info('Saving %s condor status as %s', relmon, new_condor_status)
    relmon.set_condor_status(new_condor_status)
    database.update_relmon(relmon)
def __delete_relmon(self, relmon_id, database):
    """
    Terminate RelMon with given id, delete it from the database and
    remove its local directory if there is one
    """
    relmon = RelMon(database.get_relmon(relmon_id))
    self.__terminate_relmon(relmon)
    database.delete_relmon(relmon)
    # Local directory of this RelMon, nothing to clean up if it is not there
    relmon_directory = 'relmons/%s' % (relmon_id)
    if not os.path.isdir(relmon_directory):
        return

    shutil.rmtree(relmon_directory, ignore_errors=True)
def edit_relmon():
    """
    API for RelMon editing
    Responds with 403 if user is not authorized, 409 if another RelMon
    already has the same name and 404 if RelMon has no id or is not in
    the database. Otherwise edit is passed to the controller, controller
    tick is triggered and 'OK' is returned
    """
    if not is_user_authorized():
        return output_text({'message': 'Unauthorized'}, code=403)

    payload = json.loads(request.data.decode('utf-8'))
    relmon = RelMon(payload)
    database = Database()
    # Any RelMon with the same name, but a different id is a conflict
    same_name_relmons = database.get_relmons_with_name(relmon.get_name())
    if any(other['id'] != relmon.get_id() for other in same_name_relmons):
        return output_text({'message': 'RelMon with this name already exists'},
                           code=409)

    relmon_id = relmon.get_id()
    existing_relmon = database.get_relmon(relmon_id)
    if not (relmon_id and existing_relmon):
        return output_text({'message': 'RelMon does not exist'}, code=404)

    controller.edit_relmon(relmon, database, user_info_dict())
    controller_tick()
    return output_text({'message': 'OK'})
def __reset_relmon(self, relmon_id, database, user_info):
    """
    Perform RelMon reset
    Terminate it in HTCondor and set to new so it would be submitted again
    If RelMon was not done yet and reset is done by a different user than
    the one saved in the RelMon, reset notification is sent
    """
    relmon = RelMon(database.get_relmon(relmon_id))
    # Status has to be read before termination and reset
    status_before_reset = relmon.get_status()
    self.__terminate_relmon(relmon)
    previous_login = relmon.get_user_info().get('login')
    current_login = user_info.get('login')
    reset_by_other_user = previous_login != current_login
    if reset_by_other_user and status_before_reset != 'done':
        self.logger.info('Reset by %s while not done, should inform %s',
                         current_login,
                         previous_login)
        self.__send_reset_notification(relmon, user_info)

    relmon.reset()
    relmon.set_user_info(user_info)
    database.update_relmon(relmon)
def tick(self):
    """
    Controller works by doing "ticks" every once in a while
    During a tick it should check all existing relmons and their status
    and, if necessary, perform actions like submission or output collection
    Actions go like this:
    * Delete relmons that are in deletion list
    * Reset relmons that are in reset list
    * Check running relmons
    * Submit new relmons
    SSH connections are closed at the end of the tick
    """
    database = Database()
    self.logger.info('Controller will tick')
    tick_start = time.time()
    # Delete relmons
    self.logger.info('Relmons to delete (%s): %s.',
                     len(self.relmons_to_delete),
                     ','.join([x['id'] for x in self.relmons_to_delete]))
    # Iterate over a snapshot of the list because handled entries are removed
    # from it inside the loop. Removing from a list while iterating over the
    # same list makes the loop skip every entry that follows a removed one
    for relmon_dict in list(self.relmons_to_delete):
        relmon_id = relmon_dict['id']
        self.__delete_relmon(relmon_id, database)
        self.relmons_to_delete.remove(relmon_dict)

    # Reset relmons
    self.logger.info('Relmons to reset (%s): %s.',
                     len(self.relmons_to_reset),
                     ', '.join([x['id'] for x in self.relmons_to_reset]))
    # Snapshot for the same reason as in deletion loop
    for relmon_dict in list(self.relmons_to_reset):
        relmon_id = relmon_dict['id']
        self.__reset_relmon(relmon_id, database, relmon_dict['user_info'])
        self.relmons_to_reset.remove(relmon_dict)

    # Check relmons
    relmons_to_check = database.get_relmons_with_status('submitted')
    relmons_to_check.extend(database.get_relmons_with_status('running'))
    relmons_to_check.extend(database.get_relmons_with_status('finishing'))
    # Add relmons with HTCondor status RUN to be checked
    # Skip the ones that are already in the list
    for relmon_dict in database.get_relmons_with_condor_status('RUN'):
        already_added = any(added_relmon['_id'] == relmon_dict['_id']
                            for added_relmon in relmons_to_check)
        if not already_added:
            relmons_to_check.append(relmon_dict)

    self.logger.info('Relmons to check (%s): %s.',
                     len(relmons_to_check),
                     ', '.join(r.get('id') for r in relmons_to_check))
    for relmon_json in relmons_to_check:
        relmon = RelMon(relmon_json)
        self.__check_if_running(relmon, database)
        # Refetch after check if running save
        relmon = RelMon(database.get_relmon(relmon.get_id()))
        condor_status = relmon.get_condor_status()
        if condor_status in ('DONE', 'REMOVED'):
            self.__collect_output(relmon, database)

    # Submit relmons
    relmons_to_submit = database.get_relmons_with_status('new')
    self.logger.info('Relmons to submit (%s): %s.',
                     len(relmons_to_submit),
                     ', '.join(r.get('id') for r in relmons_to_submit))
    for relmon_json in relmons_to_submit:
        relmon = RelMon(relmon_json)
        status = relmon.get_status()
        if status == 'new':
            # Double check and if it is new, submit it
            self.__submit_to_condor(relmon, database)

    self.ssh_executor.close_connections()
    tick_end = time.time()
    self.logger.info('Controller tick finished. Took %.2fs', tick_end - tick_start)
def __submit_to_condor(self, relmon, database):
    """
    Take relmon object and submit it to HTCondor
    RelMon is saved and refetched from the database, job files are created
    locally, uploaded to a freshly created remote directory and
    condor_submit is run through SSH
    If submission succeeds, RelMon gets status 'submitted', HTCondor status
    'IDLE' and HTCondor job id. If output of condor_submit is not as
    expected or an exception is raised, RelMon gets status 'failed'
    After submission attempt RelMon is saved to the database
    """
    relmon_id = relmon.get_id()
    local_directory = 'relmons/%s' % (relmon_id)
    if not os.path.isdir(local_directory):
        os.mkdir(local_directory)

    remote_job_directory = '%s/%s' % (self.remote_directory, relmon_id)
    self.logger.info('Will submit %s to HTCondor', relmon)
    self.logger.info('Remote directory of %s is %s', relmon, remote_job_directory)
    self.logger.info('Saving %s to database', relmon)
    database.update_relmon(relmon)
    # Refetch after update
    relmon = RelMon(database.get_relmon(relmon_id))
    self.logger.info('Resources for %s: CPU: %s, memory: %s, disk %s',
                     relmon,
                     relmon.get_cpu(),
                     relmon.get_memory(),
                     relmon.get_disk())
    try:
        self.logger.info('Will create files for %s', relmon)
        # RelMon json dump, HTCondor submit file and actual job script
        self.file_creator.create_relmon_file(relmon)
        self.file_creator.create_condor_job_file(relmon)
        self.file_creator.create_job_script_file(relmon)

        self.logger.info('Will prepare remote directory for %s', relmon)
        # Prepare remote directory. Delete old one and create a new one
        prepare_commands = [
            'rm -rf %s' % (remote_job_directory),
            'mkdir -p %s' % (remote_job_directory),
        ]
        self.ssh_executor.execute_command(prepare_commands)

        self.logger.info('Will upload files for %s', relmon)
        # Upload relmon json, submit file and script to run
        local_name = '%s/RELMON_%s' % (local_directory, relmon_id)
        remote_name = '%s/RELMON_%s' % (remote_job_directory, relmon_id)
        for file_name_template in ('%s.json', '%s.sub', '%s.sh'):
            self.ssh_executor.upload_file(file_name_template % (local_name),
                                          file_name_template % (remote_name))

        self.logger.info('Will try to submit %s', relmon)
        # Run condor_submit
        # Submission happens through lxplus as condor is not available on website machine
        # It is easier to ssh to lxplus than set up condor locally
        submit_commands = [
            'cd %s' % (remote_job_directory),
            'voms-proxy-init -voms cms --valid 24:00 --out $(pwd)/proxy.txt',
            'module load lxbatch/tzero && condor_submit RELMON_%s.sub' % (relmon_id),
        ]
        stdout, stderr = self.ssh_executor.execute_command(submit_commands)
        # Parse result of condor_submit
        submitted = bool(stdout) and '1 job(s) submitted to cluster' in stdout
        if not submitted:
            self.logger.error('Error submitting %s.\nOutput: %s.\nError %s',
                              relmon,
                              stdout,
                              stderr)
            relmon.set_status('failed')
        else:
            # output is "1 job(s) submitted to cluster 801341"
            relmon.set_status('submitted')
            # Last word of the output is the job id
            condor_id = int(float(stdout.split()[-1]))
            relmon.set_condor_id(condor_id)
            relmon.set_condor_status('IDLE')
            self.logger.info('Submitted %s. Condor job id %s', relmon, condor_id)
    except Exception as ex:
        relmon.set_status('failed')
        self.logger.error('Exception while trying to submit %s: %s', relmon, str(ex))

    self.logger.info('%s status is %s', relmon, relmon.get_status())
    database.update_relmon(relmon)
def edit_relmon(self, new_relmon, database, user_info):
    """
    Update relmon name and categories

    Stored RelMon is fetched from the database using id of new_relmon
    If stored RelMon is 'done', a "smart edit" is done: each category is
    compared with its new version and only categories that differ or have
    'rerun' flag set in new_relmon are updated and reset. If only the name
    changed, reports are renamed, if any category changed, RelMon is reset
    with reset(False). In both cases name and user info are updated and
    RelMon is saved
    If stored RelMon is not 'done', its name and categories are replaced
    with the new ones, it is reset, saved and added to the reset list
    """
    relmon_id = new_relmon.get_id()
    old_relmon_data = database.get_relmon(relmon_id)
    old_relmon = RelMon(old_relmon_data)
    if old_relmon.get_status() == 'done':
        self.logger.info('Relmon %s is done, will try to do a smart edit', old_relmon)
        new_category_names = [
            x['name'] for x in new_relmon.get_json().get('categories')
        ]
        old_category_names = [
            x['name'] for x in old_relmon.get_json().get('categories')
        ]
        self.logger.info('Relmon %s had these categories: %s', old_relmon, old_category_names)
        self.logger.info('Relmon %s have these categories: %s', new_relmon, new_category_names)
        categories_changed = False
        # Go through union of old and new category names
        for category_name in set(new_category_names + old_category_names):
            old_category = old_relmon.get_bare_category(category_name)
            new_category = new_relmon.get_bare_category(category_name)
            # Categories are compared as JSON strings of their "bare" versions
            old_category_string = json.dumps(old_category)
            new_category_string = json.dumps(new_category)
            # 'rerun' flag in new category forces reset of that category
            # even if nothing else changed
            force_rerun = new_relmon.get_category(category_name).get(
                'rerun', False)
            if force_rerun or old_category_string != new_category_string:
                self.logger.info('Category %s of %s changed', category_name, old_relmon)
                categories_changed = True
                # Copy new bare category values into stored category and
                # reset only this category
                old_relmon.get_category(category_name).update(new_category)
                old_relmon.reset_category(category_name)

        name_changed = old_relmon_data['name'] != new_relmon.get_name()
        if name_changed or categories_changed:
            new_name = new_relmon.get_name()
            if not categories_changed:
                # Only name changed, categories did not change, just a rename
                self.logger.info(
                    'Renaming %s to %s without changing categories',
                    old_relmon, new_name)
                self.rename_relmon_reports(relmon_id, new_name)
            else:
                # Categories changed, will have to resubmit
                # Reset relmon without resetting all categories
                old_relmon.reset(False)

            # Name and user info are saved both after rename and after reset
            old_relmon.set_name(new_name)
            old_relmon.set_user_info(user_info)
            database.update_relmon(old_relmon)
        else:
            self.logger.info('Nothing changed for %s?', old_relmon)
    else:
        self.logger.info('Relmon %s will be reset', old_relmon)
        # Update only name and categories, do not allow to update anything else
        old_relmon.get_json()['name'] = new_relmon.get_name()
        old_relmon.get_json()['categories'] = new_relmon.get_json().get(
            'categories', [])
        old_relmon.reset()
        database.update_relmon(old_relmon)
        # Reset itself is queued in the reset list of the controller
        self.add_to_reset_list(relmon_id, user_info)

    self.logger.info('Relmon %s was edited', old_relmon)