def kill_jobs(kill_names, message1='Killing job: ', message2=' early'):
    """Kill the active jobs associated with a list of job names.

    Parameters
    ----------
    kill_names : list
        List of job names to kill.
    message1 : str
        Message prefix to report to stdout.
    message2 : str
        Message suffix to report to stdout.
    """
    if not isinstance(kill_names, list):
        kill_names = [kill_names]
    machine = tools.get_machine()
    active_jobs, active_ids = tools.list_active_jobs(ids=True)
    active_jobs = list(zip(active_jobs, active_ids))

    jobs_to_kill = [[name, id_] for name, id_ in active_jobs if name in kill_names]

    for name, id_ in jobs_to_kill:
        print(message1 + name + message2)
        if machine in ['gibraltar']:
            tools.call_bash('qdel ' + str(id_))
        elif machine in ['comet', 'bridges']:
            tools.call_bash('scancel ' + str(id_))
        else:
            raise ValueError('Sardines.')
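
# Example usage (illustrative sketch only; the job names below are hypothetical,
# and the scheduler command actually issued depends on tools.get_machine()):
#   kill_jobs('fe_oct_2_water_6_s_5')
#   kill_jobs(['co_oct_2_acac_1_s_1', 'mn_oct_2_cl_6_s_6'],
#             message1='Terminating job: ', message2='')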

def loop_convert_jobs(path=False, gene=False):
    # Collect every base job under path, convert each converged one into a run
    # object, and return them keyed by the joined job path.
    runs = dict()
    basejobs = collect_base_jobs(path=path)
    active_jobs = list_active_jobs()
    for job in basejobs:
        this_run = jobmanager2mAD(job, active_jobs, gene=gene)
        if this_run and this_run.converged:
            runs.update({'/'.join(job): this_run})
    return runs
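
# Illustrative call (a sketch; the directory below is a hypothetical jobs tree,
# not a path used by this module):
#   converged_runs = loop_convert_jobs(path='/home/user/project/jobs', gene=False)
#   for job_path, run in converged_runs.items():
#       print(job_path, run.converged)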

def kill_jobs(kill_names, message1='Killing job: ', message2=' early'):
    # This function takes a list of job names and kills the associated jobs, if they are active
    if not isinstance(kill_names, list):
        kill_names = [kill_names]
    active_jobs, active_ids = tools.list_active_jobs(ids=True)
    active_jobs = list(zip(active_jobs, active_ids))

    jobs_to_kill = [[name, id_] for name, id_ in active_jobs if name in kill_names]

    for name, id_ in jobs_to_kill:
        print(message1 + name + message2)
        tools.call_bash('qdel ' + str(id_))

def resub(directory='in place'):
    # Takes a directory; resubmits errors, scf failures, and spin contaminated cases.
    configure_dict = manager_io.read_configure(directory, None)
    max_resub = configure_dict['max_resub']
    max_jobs = configure_dict['max_jobs']
    hard_job_limit = configure_dict['hard_job_limit']
    hit_queue_limit = False  # True if this run has limited the number of jobs submitted to work well with the queue

    # Get the state of all jobs being managed by this instance of the job manager
    completeness = moltools.check_completeness(directory, max_resub, configure_dict=configure_dict)
    print("completeness: ", completeness)
    errors = completeness['Error']  # Calculations which failed to complete
    scf_errors = completeness['SCF_Error']  # Calculations which failed to complete, appear to have an scf error, and hit wall time
    oscillating_scf_errors = completeness['oscillating_scf_errors']  # Calculations which failed to complete and appear to have an oscillating scf error
    need_resub = completeness['Needs_resub']  # Calculations with level shifts changed or hfx exchange changed
    spin_contaminated = completeness['Spin_contaminated']  # Finished jobs with spin contaminated solutions
    active = completeness['Active']  # Jobs which are currently running
    thermo_grad_error = completeness['Thermo_grad_error']  # Thermo jobs encountering the thermo grad error
    waiting = completeness['Waiting']  # Jobs which are or were waiting for another job to finish before continuing
    bad_geos = completeness['Bad_geos']  # Jobs which finished, but converged to a bad geometry
    finished = completeness['Finished']
    molscontrol_kills = completeness['molscontrol_kills']
    nactive = tools.get_number_active()  # Number of active jobs, counting bundled jobs as a single job

    # Kill SCF errors in progress, which are wasting computational resources
    all_scf_errors = completeness['SCF_Errors_Including_Active']  # All jobs which appear to have an scf error, including active ones
    scf_errors_to_kill = [scf_err for scf_err in all_scf_errors if scf_err not in scf_errors]
    names_to_kill = [os.path.split(scf_err)[-1].rsplit('.', 1)[0] for scf_err in scf_errors_to_kill]
    kill_jobs(names_to_kill, message1='Job: ',
              message2=' appears to have an scf error. Killing this job early')

    # Prep derivative jobs such as thermo single points, vertical IP, and ligand dissociation energies
    needs_derivative_jobs = list(filter(tools.check_original, finished))
    print("needs_derivative_jobs: ", needs_derivative_jobs)
    prep_derivative_jobs(directory, needs_derivative_jobs)

    # Resubmitted gets True if a job is submitted and False if not. Contains booleans, not job identifiers.
    resubmitted = []
    for job in molscontrol_kills:
        print("killed by molscontrol: ", job)

    # Resub unidentified errors
    for error in errors:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        resub_tmp = recovery.simple_resub(error)
        if resub_tmp:
            print('Unidentified error in job: ' + os.path.split(error)[-1] + ' -Resubmitting')
            print('')
        resubmitted.append(resub_tmp)

    # Resub oscillating scf convergence errors
    for error in oscillating_scf_errors:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'scf' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_oscillating_scf(error)
            if resub_tmp:
                print('Oscillating SCF error identified in job: ' + os.path.split(error)[-1]
                      + ' -Resubmitting with adjusted precision and grid.')
                print('')
            resubmitted.append(resub_tmp)

    # Resub scf convergence errors
    for error in scf_errors:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'scf' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_scf(error)
            if resub_tmp:
                print('SCF error identified in job: ' + os.path.split(error)[-1]
                      + ' -Resubmitting with adjusted levelshifts')
                print('')
            resubmitted.append(resub_tmp)

    # Resub jobs which converged to bad geometries with additional constraints
    for error in bad_geos:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'bad_geo' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_bad_geo(error, directory)
            if resub_tmp:
                print('Bad final geometry in job: ' + os.path.split(error)[-1]
                      + ' -Resubmitting from initial structure with additional constraints')
                print('')
            resubmitted.append(resub_tmp)

    # Resub spin contaminated cases
    for error in spin_contaminated:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'spin_contaminated' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_spin(error)
            if resub_tmp:
                print('Spin contamination identified in job: ' + os.path.split(error)[-1]
                      + ' -Resubmitting with adjusted HFX')
                print('')
            resubmitted.append(resub_tmp)

    # Resub jobs with atypical parameters used to aid convergence
    for error in need_resub:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        resub_tmp = recovery.clean_resub(error)
        if resub_tmp:
            print('Job ' + os.path.split(error)[-1]
                  + ' needs to be rerun with typical parameters. -Resubmitting')
            print('')
        resubmitted.append(resub_tmp)

    # Create a job with a tighter convergence threshold for failed thermo jobs
    for error in thermo_grad_error:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'thermo_grad_error' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_tighter(error)
            if resub_tmp:
                print('Job ' + os.path.split(error)[-1]
                      + ' needs a better initial geo. Creating a geometry run with tighter convergence criteria')
                print('')
            resubmitted.append(resub_tmp)

    # Look at jobs in "waiting" and resume them if the job they were waiting for is finished.
    # Currently, this should only ever be thermo jobs waiting for an ultratight job.
    for waiting_dict in waiting:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        if len(list(waiting_dict.keys())) > 1:
            raise Exception('Waiting job list improperly constructed')
        job = list(waiting_dict.keys())[0]
        waiting_for = waiting_dict[job]
        if waiting_for in finished:
            history = recovery.load_history(job)
            history.waiting = None
            history.save()
            results_for_this_job = manager_io.read_outfile(job)
            if results_for_this_job['thermo_grad_error']:
                resubmitted.append(recovery.resub_thermo(job))
            else:
                raise Exception('A method for resuming job: ' + job + ' is not defined')
        else:
            resubmitted.append(False)

    # Submit jobs which haven't yet been submitted, provided neither the
    # per-manager job limit nor the hard queue limit has been reached
    if not (((nactive + np.sum(resubmitted)) >= max_jobs) or (
            (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit)):
        to_submit = []
        jobscripts = tools.find('*_jobscript')
        active_jobs = tools.list_active_jobs(home_directory=directory, parse_bundles=True)
        for job in jobscripts:
            if (not os.path.isfile(job.rsplit('_', 1)[0] + '.out')
                    and os.path.split(job.rsplit('_', 1)[0])[-1] not in active_jobs):
                to_submit.append(job)

        short_jobs_to_submit = [i for i in to_submit if tools.check_short_single_point(i)]
        long_jobs_to_submit = [i for i in to_submit if i not in short_jobs_to_submit]
        if len(short_jobs_to_submit) > 0:
            bundled_jobscripts = tools.bundle_jobscripts(os.getcwd(), short_jobs_to_submit)
        else:
            bundled_jobscripts = []
        to_submit = bundled_jobscripts + long_jobs_to_submit

        submitted = []
        for job in to_submit:
            if ((len(submitted) + nactive + np.sum(resubmitted)) >= max_jobs) or (
                    (tools.get_total_queue_usage() + len(submitted) + np.sum(resubmitted)) >= hard_job_limit):
                hit_queue_limit = True
                continue
            print('Initial submission for job: ' + os.path.split(job)[-1])
            tools.qsub(job)
            submitted.append(True)
    else:
        hit_queue_limit = True
        submitted = []

    number_resubmitted = np.sum(np.array(resubmitted + submitted))
    # print(str(number_resubmitted) + ' Jobs submitted')
    return int(number_resubmitted), int(len(completeness['Active'])), hit_queue_limit
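

# Illustrative driver (an assumption, not part of the original module): poll the
# queue by calling resub() until nothing is active, resubmitted, or blocked by
# the queue limit. The one-hour sleep interval is arbitrary.
if __name__ == '__main__':
    import time

    while True:
        number_resubmitted, number_active, hit_queue_limit = resub()
        if number_resubmitted == 0 and number_active == 0 and not hit_queue_limit:
            break  # nothing left to manage
        time.sleep(3600)  # wait before checking the queue again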