def launch_job(self, exp_dict, savedir, command, job=None):
    """Submit a job and save its job dict and ``exp_dict`` under ``savedir``.

    Parameters
    ----------
    exp_dict : dict
        Experiment configuration; written to ``<savedir>/exp_dict.json``.
    savedir : str
        Experiment folder. The code in ``self.workdir`` is copied to
        ``<savedir>/code`` and the job is run from that copy.
    command : str
        Command handed to ``hju.get_job_command`` to build the submission.
    job : object, optional
        Previous job of this experiment, if any. When given, it is checked
        with ``self._assert_no_duplicates`` before anything is written.

    Returns
    -------
    dict
        The saved job dict, with keys ``"job_id"``,
        ``"started at (Montreal)"`` and ``"command"``.

    Raises
    ------
    AssertionError
        If ``self._assert_no_duplicates(job)`` returns a falsy value.
    """
    add_job_utils()
    import haven_jobs_utils as hju

    # Check for duplicates. This is an explicit raise rather than an
    # ``assert`` statement so the check still runs under ``python -O``.
    if job is not None and not self._assert_no_duplicates(job):
        raise AssertionError("duplicate job check failed for job %r" % (job,))

    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)

    # Copy the experiment code into the experiment folder
    workdir_job = os.path.join(savedir, "code")
    hu.copy_code(self.workdir + "/", workdir_job)

    # Build and run the submission command; its output is used as the job id
    job_command = hju.get_job_command(self.job_config, command, savedir, workdir=workdir_job)
    job_id = hu.subprocess_call(job_command).replace("\n", "")

    if self.verbose:
        print("Job_id: %s command: %s" % (job_id, command))

    job_dict = {
        "job_id": job_id,
        "started at (Montreal)": hu.time_to_montreal(),
        "command": command,
    }
    hu.save_json(hju.get_job_fname(savedir), job_dict)

    return job_dict
def kill_jobs(self):
    """Kill the job of every experiment in ``self.exp_list`` that has a job file.

    Returns
    -------
    dict
        Maps each experiment id to ``'KILLED'`` when a job file was found
        (and a kill was scheduled for its job id) or ``'NoN-Existent'``
        when the experiment has no job file.
    """
    add_job_utils()
    import haven_jobs_utils as hju

    hu.check_duplicates(self.exp_list)

    pool = hu.Parallel()
    status_by_exp = {}

    for exp_dict in self.exp_list:
        exp_id = hu.hash_dict(exp_dict)
        job_file = hju.get_job_fname(os.path.join(self.savedir_base, exp_id))

        # Nothing to kill when the experiment never wrote a job file
        if not os.path.exists(job_file):
            status_by_exp[exp_id] = 'NoN-Existent'
            continue

        pool.add(hju.kill_job, self.api, hu.load_json(job_file)['job_id'])
        status_by_exp[exp_id] = 'KILLED'

    pool.run()
    pool.close()

    pprint.pprint(status_by_exp)
    n_killed = sum(1 for status in status_by_exp.values() if 'KILLED' in status)
    print("%d/%d experiments killed." % (n_killed, len(status_by_exp)))

    return status_by_exp
def get_job_fname(savedir, job_fname=None):
    """Return the path of the job file inside ``savedir``.

    Parameters
    ----------
    savedir : str
        Experiment folder.
    job_fname : str, optional
        Explicit job file name. When given, the result is simply
        ``os.path.join(savedir, job_fname)``. When ``None``, the path is
        delegated to ``haven_jobs_utils.get_job_fname(savedir)``.

    Returns
    -------
    str
        Path of the job file.
    """
    if job_fname is not None:
        return os.path.join(savedir, job_fname)

    # Imported lazily: only the default-name lookup needs haven_jobs_utils,
    # so an explicit ``job_fname`` works even when that module is unavailable.
    import haven_jobs_utils as hju

    return hju.get_job_fname(savedir)
def _submit_job(self, exp_dict, command, reset, submit_dict=None):
    """Submit one job, handling the state of any previous job of the experiment.

    The cases are:

    * no job file yet: launch a new job;
    * ``reset`` is truthy: kill the recorded job, call
      ``hc.delete_and_backup_experiment`` on the experiment folder and
      launch again;
    * previous job alive or ``SUCCEEDED``: do nothing;
    * previous job ``FAILED`` or ``CANCELLED``: launch again.

    Parameters
    ----------
    exp_dict : dict
        Experiment configuration; its hash names the experiment folder.
    command : str
        Command forwarded to ``self.launch_job``.
    reset : bool
        Whether to discard an existing experiment and relaunch it.
    submit_dict : dict, optional
        Mapping updated in place with ``job_id -> status message``.
        A new dict is created when omitted.

    Returns
    -------
    dict
        ``submit_dict`` (the one passed in, or the newly created one).

    Raises
    ------
    ValueError
        If the previous job is neither alive nor in one of the states
        ``SUCCEEDED``, ``FAILED`` or ``CANCELLED``.
    """
    add_job_utils()
    import haven_jobs_utils as hju

    # A fresh dict per call: a mutable default would be shared by every call
    # that omits ``submit_dict``.
    if submit_dict is None:
        submit_dict = {}

    # Define paths
    savedir = os.path.join(self.savedir_base, hu.hash_dict(exp_dict))
    fname = hju.get_job_fname(savedir)

    if not os.path.exists(fname):
        # No job file: the experiment was never launched
        job_dict = self.launch_job(exp_dict, savedir, command, job=None)
        job_id = job_dict['job_id']
        message = "SUBMITTED: Launching"

    elif reset:
        # Kill the recorded job and start the experiment over
        job_id = hu.load_json(fname).get("job_id")
        hju.kill_job(self.api, job_id)
        hc.delete_and_backup_experiment(savedir)

        job_dict = self.launch_job(exp_dict, savedir, command, job=None)
        job_id = job_dict['job_id']
        message = "SUBMITTED: Resetting"

    else:
        job_id = hu.load_json(fname).get("job_id")
        job = hju.get_job(self.api, job_id)

        if job.alive or job.state == 'SUCCEEDED':
            # Running or already done: leave it alone
            message = 'IGNORED: Job %s' % job.state

        elif job.state in ["FAILED", "CANCELLED"]:
            message = "SUBMITTED: Retrying %s Job" % job.state
            job_dict = self.launch_job(exp_dict, savedir, command, job=job)
            job_id = job_dict['job_id']

        else:
            # This shouldn't happen
            raise ValueError('Unexpected state %r for job %s' % (job.state, job_id))

    submit_dict[job_id] = message
    return submit_dict
def get_summary(self, failed_only=False, columns=None, max_lines=200):
    """Summarize the job state and logs of every experiment in ``self.exp_list``.

    Parameters
    ----------
    failed_only : bool, optional
        Currently unused; kept so existing callers keep working.
    columns : list of str, optional
        If given, the summary table is restricted to these columns. Names
        that are not in the table, and ``'err'``, are ignored. The status
        counts and the per-state tables are computed from the job states
        regardless of whether ``'job_state'`` is among ``columns``.
    max_lines : int, optional
        Only the last ``max_lines`` items of what ``hu.read_text`` returns
        for a log file are kept (presumably lines -- confirm against
        ``hu.read_text``).

    Returns
    -------
    dict
        With keys:

        * ``'table'``: DataFrame indexed by ``exp_id``, one row per
          experiment (experiments whose job id is missing from
          ``hju.get_jobs_dict`` are left out);
        * ``'status'``: list of ``{job_state: count}`` dicts;
        * ``'logs_failed'``: result dicts (with ``'command'`` and ``'logs'``)
          of ``FAILED`` jobs whose ``err.txt`` exists;
        * ``'logs'``: same for the other launched jobs whose ``logs.txt``
          exists;
        * ``'queuing'``, ``'running'``, ``'succeeded'``, ``'failed'``: rows
          of the table whose job state is ``QUEUING``, ``RUNNING``,
          ``SUCCEEDED`` and ``FAILED`` respectively.
    """
    add_job_utils()
    import haven_jobs_utils as hju

    # Gather the ids of all launched jobs so they are fetched in one call
    job_id_list = []
    for exp_dict in self.exp_list:
        savedir = os.path.join(self.savedir_base, hu.hash_dict(exp_dict))
        fname = hju.get_job_fname(savedir)
        if os.path.exists(fname):
            job_id_list.append(hu.load_json(fname)["job_id"])

    jobs_dict = hju.get_jobs_dict(self.api, job_id_list)

    # Fill summary
    summary_dict = {'table': [], 'status': [], 'logs_failed': [], 'logs': []}
    for exp_dict in self.exp_list:
        result_dict = copy.deepcopy(exp_dict)
        exp_id = hu.hash_dict(exp_dict)
        savedir = os.path.join(self.savedir_base, exp_id)

        result_dict["exp_id"] = exp_id
        result_dict["job_id"] = None
        result_dict["job_state"] = 'NEVER LAUNCHED'

        fname = hju.get_job_fname(savedir)
        if not os.path.exists(fname):
            # No job file: report the experiment as never launched
            summary_dict['table'].append(copy.deepcopy(result_dict))
            continue

        job_dict = hu.load_json(fname)
        job_id = job_dict["job_id"]
        if job_id not in jobs_dict:
            # No job information was returned for this id: skip the experiment
            continue

        job = jobs_dict[job_id]
        result_dict['started at (Montreal)'] = job_dict["started at (Montreal)"]
        result_dict["job_id"] = job_id
        result_dict["job_state"] = job.state

        # Snapshot the row now: the command and logs added below belong to
        # the log entries only, not to the table.
        summary_dict['table'].append(copy.deepcopy(result_dict))

        result_dict["command"] = job.command[2]

        # Failed jobs report err.txt, every other launched job logs.txt
        if job.state == "FAILED":
            log_name, log_key = "err.txt", 'logs_failed'
        else:
            log_name, log_key = "logs.txt", 'logs'

        log_path = os.path.join(savedir, log_name)
        if os.path.exists(log_path):
            result_dict["logs"] = hu.read_text(log_path)[-max_lines:]
            summary_dict[log_key].append(result_dict)
        elif self.verbose:
            print('%s: %s does not exist' % (exp_id, log_name))

    # Build the table. An empty table has no 'exp_id' column to index on.
    df = pd.DataFrame(summary_dict['table'])
    if 'exp_id' in df.columns:
        df = df.set_index('exp_id')

    # Read the job states before any column filtering so the status counts
    # and the per-state tables do not depend on the requested columns.
    if 'job_state' in df.columns:
        job_states = df['job_state']
        stats = np.vstack(np.unique(job_states.fillna("NaN"), return_counts=True)).T
        status = [{state: count} for (state, count) in stats]
    else:
        # Only reachable when the table is empty
        job_states = pd.Series([], index=df.index, dtype=object)
        status = []

    if columns:
        df = df[[c for c in columns if c in df.columns and c != 'err']]

    summary_dict['status'] = status
    summary_dict['table'] = df
    summary_dict['queuing'] = df[job_states == 'QUEUING']
    summary_dict['running'] = df[job_states == 'RUNNING']
    summary_dict['succeeded'] = df[job_states == 'SUCCEEDED']
    summary_dict['failed'] = df[job_states == 'FAILED']

    return summary_dict