def update(self, timeout=False):
    """Poll the tracked MPI runs and persist whatever state changes are found.

    Every run in ``self.mpi_runs`` is classified with ``self.check_state``
    and handled according to the returned state:

    * ``'RUN_DONE'``: jobs are batch-updated to ``RUN_DONE`` and released.
    * ``'RUN_ERROR'``: inside one atomic transaction each job is refreshed
      from the database and moved to ``RUN_ERROR`` with the run's
      ``err_msg``; the jobs are released after the transaction.
    * ``'RUNNING'``: see ``timeout`` below.

    Runs reported in any other state are not written to the database here;
    they are simply no longer tracked once this method returns.

    Args:
        timeout: When truthy, every still-running run is handed to
            ``self.timeout_kill``, its job is marked ``RUN_TIMEOUT`` and
            released, and ``self.mpi_runs`` is emptied. Otherwise only the
            running jobs that ``self.jobsource`` reports as ``USER_KILLED``
            are passed to ``self.timeout_kill`` and released; the rest
            remain in ``self.mpi_runs``.
    """
    grouped = defaultdict(list)
    for mpi_run in self.mpi_runs:
        grouped[self.check_state(mpi_run)].append(mpi_run)

    # Completed runs.
    finished_pks = [mpi_run.job.pk for mpi_run in grouped['RUN_DONE']]
    BalsamJob.batch_update_state(finished_pks, 'RUN_DONE')
    self.jobsource.release(finished_pks)

    # Failed runs: each job gets its own error message, so they are updated
    # one at a time, but all within a single transaction.
    failed_runs = grouped['RUN_ERROR']
    failed_pks = [mpi_run.job.pk for mpi_run in failed_runs]
    with db.transaction.atomic():
        # The return value is unused.
        # NOTE(review): presumably called to lock the selected rows - confirm.
        models.safe_select(BalsamJob.objects.filter(pk__in=failed_pks))
        for mpi_run in failed_runs:
            mpi_run.job.refresh_from_db()
            mpi_run.job.update_state('RUN_ERROR', mpi_run.err_msg)
    self.jobsource.release(failed_pks)

    # Runs that are still going.
    still_running = grouped['RUNNING']
    running_pks = [mpi_run.job.pk for mpi_run in still_running]

    if timeout:
        self.timeout_kill(still_running)
        BalsamJob.batch_update_state(running_pks, 'RUN_TIMEOUT')
        self.jobsource.release(running_pks)
        self.mpi_runs = []
        return

    user_killed = self.jobsource.filter(
        job_id__in=running_pks,
        state='USER_KILLED',
    )
    killed_pks = user_killed.values_list('job_id', flat=True)
    doomed = [
        mpi_run for mpi_run in still_running
        if mpi_run.job.pk in killed_pks
    ]
    self.timeout_kill(doomed)
    self.jobsource.release(killed_pks)
    for mpi_run in doomed:
        still_running.remove(mpi_run)
    self.mpi_runs = still_running
def _handle_errors(self, error_jobs):
    """Mark every failed job as ``RUN_ERROR`` and release the whole batch.

    Args:
        error_jobs: Re-iterable collection of ``(pk, retcode, tail)``
            entries. It is traversed twice: once to collect the primary
            keys and once to process each failure.

    For each entry the rank stored in ``self.running_locations`` for that
    pk is passed to ``self.revert_assign``, the job is fetched from the
    database, and its state is set to ``RUN_ERROR`` with a message built
    from ``retcode`` and ``tail``. After all entries are processed the
    collected primary keys are released through ``self.job_source``.
    """
    failed_pks = [entry[0] for entry in error_jobs]

    # The return value is unused.
    # NOTE(review): presumably called to lock the selected rows - confirm.
    safe_select(BalsamJob.objects.filter(pk__in=failed_pks))

    for job_pk, exit_code, output_tail in error_jobs:
        assigned_rank = self.running_locations[job_pk]
        self.revert_assign(assigned_rank, job_pk)

        failed_job = BalsamJob.objects.get(pk=job_pk)
        message = f"nonzero return {exit_code}: {output_tail}"
        failed_job.update_state('RUN_ERROR', message)

    self.job_source.release(failed_pks)
def _handle_errors(self, error_msgs):
    """Mark every job named in ``error_msgs`` as ``RUN_ERROR``.

    Args:
        error_msgs: Re-iterable collection of ``(pk, retcode, tail)``
            entries, where ``pk`` is a value accepted by ``uuid.UUID``.
            It is traversed twice: once to collect the primary keys and
            once to process each failure.

    The matching jobs are loaded in one query through ``safe_select`` and
    indexed by primary key. Each job is then moved to ``RUN_ERROR`` with a
    message built from ``retcode`` and ``tail``, passing ``release=True``
    to ``update_state``.

    Raises:
        KeyError: If an entry refers to a job the query did not return.
    """
    failed_ids = [uuid.UUID(entry[0]) for entry in error_msgs]

    selected = safe_select(BalsamJob.objects.filter(pk__in=failed_ids))
    jobs_by_pk = {}
    for record in selected:
        jobs_by_pk[record.pk] = record

    for raw_pk, exit_code, output_tail in error_msgs:
        failed_job = jobs_by_pk[uuid.UUID(raw_pk)]
        message = f"nonzero return {exit_code}: {output_tail}"
        failed_job.update_state('RUN_ERROR', message, release=True)