async def remove(self, param_ids, hash_ids):
    """Remove job output data that match given IDs."""
    async with self.lock:
        trash_dir = os.path.join(self.path, 'trash')
        if not os.path.exists(trash_dir):
            os.mkdir(trash_dir)

        queue_state = await self.queue.get_state()

        # Keep symlinks related to started/queued jobs
        jobs = queue_state['started_jobs'] + queue_state['queued_jobs']

        param_ids = set(param_ids)
        param_ids -= set([get_param_id(job['param']) for job in jobs])
        param_ids -= set([get_param_id(job['param']) + '_tmp' for job in jobs])

        hash_ids = set(hash_ids)
        hash_ids -= set([get_hash_id(job['param']) for job in jobs])
        hash_ids -= set([get_hash_id(job['param']) + '_tmp' for job in jobs])

        removed_output = self._remove_job_output(trash_dir, param_ids, hash_ids)
        removed_output += self._remove_dangling_noref(trash_dir)
        # Second pass to ensure deleting "last" if needed
        removed_output += self._remove_dangling_noref(trash_dir)

        return removed_output

def _get_state_fast(self):
    """Return the state. Parameter details are removed."""
    assert self.lock.locked()

    state = {
        'finished_jobs': None,
        'started_jobs': None,
        'queued_jobs': None,
        'concurrency': self._state['concurrency'],
        'next_job_id': self._state['next_job_id'],
    }

    for key in ['finished_jobs', 'started_jobs', 'queued_jobs']:
        jobs = []
        for job in self._state[key].values():
            job = dict(job)
            param = job['param']
            job['param'] = {
                '_': {
                    'param_id': get_param_id(param),
                    'hash_id': get_hash_id(param),
                },
                '_name': get_name(param),
            }
            jobs.append(job)
        state[key] = jobs

    return state

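# Illustration only (not part of the module): the stripped parameter shape
# produced by _get_state_fast above, assuming a job whose parameter has
# param_id 'p-0001', hash_id 'h-abcd', and name 'example'; these concrete ID
# values are made up for this sketch.
#
#   {'_': {'param_id': 'p-0001', 'hash_id': 'h-abcd'}, '_name': 'example'}
#
# Only these fields survive, so callers that need full parameter details
# must fetch them from the registry separately.
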
async def filter_augment(self, params):
    """Augment parameters with their history."""
    hash_ids = [get_hash_id(param) for param in params]
    history_list = await self.history.history_list(hash_ids)
    for param, history in zip(params, history_list):
        param['_'].update(history)
    return params

async def _paramset_migrate(self, old_paramset, new_paramset):
    old_param_ids = await self.client.registry.paramset(old_paramset)
    new_param_ids = await self.client.registry.paramset(new_paramset)

    old_hash_ids = [get_hash_id(param)
                    for param in await self.client.registry.params(old_param_ids)]
    new_hash_ids = [get_hash_id(param)
                    for param in await self.client.registry.params(new_param_ids)]

    if len(old_param_ids) != len(new_param_ids):
        raise RuntimeError('Two parameter sets must have the same number of parameters')

    param_id_pairs = list(zip(old_param_ids, new_param_ids))
    hash_id_pairs = list(zip(old_hash_ids, new_hash_ids))

    migrated_param_id_pairs, migrated_hash_id_pairs = (
        await self.client.output.migrate(param_id_pairs, hash_id_pairs))
    # Use a separate name so the output migration result is not overwritten
    migrated_history_hash_id_pairs = await self.client.history.migrate(hash_id_pairs)

    print('Migrated: ' +
          f'{len(migrated_param_id_pairs) + len(migrated_hash_id_pairs)} runner data, ' +
          f'{len(migrated_history_hash_id_pairs)} history data')

async def update(self, job):
    """Record the job execution result."""
    hash_id = get_hash_id(job['param'])

    hist_data = {}
    for key in HISTORY_STUB:
        hist_data[key] = job[key]

    async with self.lock:
        self._state[hash_id] = hist_data
        self.logger.info(f'Updated history for parameter {hash_id}')
        self.lock.notify_all()
        self._schedule_dump()

@staticmethod
def _construct_env(job, job_paths):
    """Construct environment variables."""
    param = job['param']

    env = dict(os.environ)
    env['EXPTOOLS_JOB_DIR'] = job_paths['job_dir']
    env['EXPTOOLS_JOB_ID'] = job['job_id']
    env['EXPTOOLS_PARAM_ID'] = get_param_id(param)
    env['EXPTOOLS_HASH_ID'] = get_hash_id(param)
    env['EXPTOOLS_NAME'] = get_name(param)
    env['EXPTOOLS_CWD'] = get_cwd(param) or os.getcwd()
    env['EXPTOOLS_RETRY'] = str(get_retry(param))
    env['EXPTOOLS_RETRY_DELAY'] = str(get_retry_delay(param))
    env['EXPTOOLS_TIME_LIMIT'] = str(get_time_limit(param))
    env['EXPTOOLS_JOB_JSON_PATH'] = job_paths['job.json']
    env['EXPTOOLS_PARAM_JSON_PATH'] = job_paths['param.json']
    env['EXPTOOLS_RESOURCES_JSON_PATH'] = job_paths['resources.json']
    env['EXPTOOLS_STATUS_JSON_PATH'] = job_paths['status.json']
    return env

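# Illustration only (hypothetical job script, not part of this project): a
# minimal sketch of how a launched command might consume the EXPTOOLS_*
# variables set by _construct_env above. The environment variable names match
# the code; the script itself, the progress value, and the assumption that
# the status file is plain JSON with a 'progress' field (based on how
# estimate_remaining_time reads job['status']['progress']) are illustrative.
import json
import os


def example_job_main():
    job_dir = os.environ['EXPTOOLS_JOB_DIR']
    name = os.environ['EXPTOOLS_NAME']
    status_path = os.environ['EXPTOOLS_STATUS_JSON_PATH']

    # ... do part of the work under job_dir ...
    print(f'{name}: working in {job_dir}')

    # Report progress so the runner's status watcher can refine its
    # remaining-time estimate.
    with open(status_path, 'w') as status_file:
        json.dump({'progress': 0.5}, status_file)


if __name__ == '__main__':
    example_job_main()
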
async def _remove(self, paramset, param_ids):
    """Remove parameters from a parameter set."""
    assert self.lock.locked()

    if paramset not in self._state['paramsets']:
        raise RuntimeError(f'Parameter set does not exist: {paramset}')

    # Ensure not to delete parameters in a different parameter set
    paramset_param_ids = set(self._state['paramsets'][paramset])
    for param_id in param_ids:
        if param_id not in paramset_param_ids:
            raise RuntimeError(f'Parameter {param_id} is not in {paramset}')

    for param_id in param_ids:
        param = self._state['params'][param_id]
        hash_id = get_hash_id(param)

        pos = self._hash_id_index[hash_id].index(param_id)
        assert pos != -1
        del self._hash_id_index[hash_id][pos]
        if not self._hash_id_index[hash_id]:
            del self._hash_id_index[hash_id]

        del self._state['params'][param_id]

    param_ids_set = set(param_ids)
    self._state['paramsets'][paramset] = [
        param_id for param_id in self._state['paramsets'][paramset]
        if param_id not in param_ids_set
    ]

    self.logger.info(f'Removed {len(param_ids)} parameters from {paramset}')
    self.lock.notify_all()
    self._schedule_dump()
    return param_ids

async def _make_fid_mapping(self, history_map, hash_ids):
    """Make a value to feature index mapping, and make a feature vector list
    for existing parameters."""
    param_ids_list = await self.registry.param_ids_by_hash_ids(hash_ids)
    param_ids = [ids[0] for ids in param_ids_list if ids]
    params = await self.registry.params(param_ids)

    fid_mapping = {}
    fvec_map = {}

    succeeded_params = []
    for param in params:
        hash_id = get_hash_id(param)
        # Only successfully finished parameters with a known duration
        # contribute features
        if hash_id not in history_map:
            continue
        if not history_map[hash_id]['succeeded']:
            continue
        if history_map[hash_id]['duration'] is None:
            continue

        for key, value in param.items():
            if key.startswith('_'):
                continue
            if isinstance(value, dict):
                continue
            value_hash = hash(key + '___' + str(value))
            if value_hash not in fid_mapping:
                fid_mapping[value_hash] = len(fid_mapping)

        succeeded_params.append((hash_id, param))

    for hash_id, param in succeeded_params:
        fvec_map[hash_id] = self._extract_fvec(fid_mapping, param)

    return fid_mapping, fvec_map

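# Illustration only: _extract_fvec is used above but not shown in this
# excerpt. A minimal standalone sketch, assuming a binary feature vector
# indexed by fid_mapping; the real implementation's exact shape is not
# confirmed by this excerpt.
def _extract_fvec(fid_mapping, param):
    """Build a 0/1 feature vector for a parameter using fid_mapping."""
    fvec = [0.] * len(fid_mapping)
    for key, value in param.items():
        # Skip internal fields and nested structures, mirroring the
        # filtering done when fid_mapping was built.
        if key.startswith('_') or isinstance(value, dict):
            continue
        value_hash = hash(key + '___' + str(value))
        fid = fid_mapping.get(value_hash)
        if fid is not None:
            fvec[fid] = 1.
    return fvec
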
async def get_hash_ids(self, params):
    """Return hash IDs."""
    return [get_hash_id(param) for param in params]

async def filter_omit(self, params, types):
    """Omit parameters of specified types."""
    valid_types = [
        'succeeded', 'failed', 'finished', 'started', 'queued',
        'identical', 'duplicate', 'has_output'
    ]
    for type_ in types:
        assert type_ in valid_types

    if 'succeeded' in types:
        params = [param for param in params
                  if param['_']['succeeded'] != True]

    if 'failed' in types:
        params = [param for param in params
                  if param['_']['succeeded'] != False]

    if 'finished' in types:
        params = [param for param in params
                  if param['_']['finished'] is None]

    if 'started' in types:
        started_jobs = await self.queue.jobs(await self.queue.job_ids(['started']))
        started_job_hash_ids = set(
            [get_hash_id(job['param']) for job in started_jobs])
        new_params = []
        for param in params:
            # Compute the hash ID per parameter so that earlier filters
            # cannot misalign parameters and hash IDs
            if get_hash_id(param) in started_job_hash_ids:
                continue
            new_params.append(param)
        params = new_params

    if 'queued' in types:
        queued_jobs = await self.queue.jobs(await self.queue.job_ids(['queued']))
        queued_job_hash_ids = set(
            [get_hash_id(job['param']) for job in queued_jobs])
        new_params = []
        for param in params:
            if get_hash_id(param) in queued_job_hash_ids:
                continue
            new_params.append(param)
        params = new_params

    if 'identical' in types:
        seen_unique_ids = set()
        new_params = []
        for param in params:
            unique_id = make_unique_id(param)
            if unique_id not in seen_unique_ids:
                seen_unique_ids.add(unique_id)
                new_params.append(param)
        params = new_params

    if 'duplicate' in types:
        seen_hash_ids = set()
        new_params = []
        for param in params:
            hash_id = get_hash_id(param)
            if hash_id not in seen_hash_ids:
                seen_hash_ids.add(hash_id)
                new_params.append(param)
        params = new_params

    if 'has_output' in types:
        output_hash_ids = await self.runner.hash_ids()
        new_params = []
        for param in params:
            hash_id = get_hash_id(param)
            if hash_id not in output_hash_ids:
                new_params.append(param)
        params = new_params

    return params

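# Illustration only: a hypothetical composition of the filters above; the
# 'client.filter' attribute path and the call site are assumptions for this
# sketch.
#
#   params = await client.filter.filter_omit(params, ['finished', 'started', 'queued'])
#   params = await client.filter.filter_omit(params, ['duplicate'])
#
# The first call drops parameters that already finished or are currently in
# the queue; the second keeps only one parameter per hash ID.
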
async def estimate_remaining_time(self, state, oneshot, use_similar):
    """Estimate the remaining time using the queue state."""
    now = utcnow()

    epsilon = 0.001  # Potential underestimation until the progress reaches 0.1%

    # Future parallelism cannot be higher than the remaining job count
    concurrency = max(1., min(
        state['concurrency'],
        len(state['started_jobs']) + len(state['queued_jobs'])))

    hash_ids = await self.history.hash_ids()
    history_list = await self.history.history_list(hash_ids)
    history_map = dict(zip(hash_ids, history_list))

    if use_similar:
        fid_mapping, fvec_map = await self._make_fid_mapping(history_map, hash_ids)

    # Estimate average per-job duration
    known_hash_ids = set()
    known_duration = 0.
    known_count = 0

    # Consider recent jobs first (in case some jobs have duplicate hash_id)
    for job in reversed(state['started_jobs']):
        hash_id = get_hash_id(job['param'])
        if hash_id in known_hash_ids:
            continue
        known_hash_ids.add(hash_id)

        if job['started'] is None:
            started = now
        else:
            started = parse_utc(job['started'])

        # Guard against a missing progress value before comparing
        if job.get('status', None) and (job['status'].get('progress') or 0.) >= epsilon:
            known_duration += diff_sec(now, started) / job['status']['progress']
            known_count += 1

    for hash_id, history in history_map.items():
        if hash_id in known_hash_ids:
            continue
        known_hash_ids.add(hash_id)

        if history['duration'] is not None:
            known_duration += history['duration']
            known_count += 1

    avg_duration = known_duration / max(known_count, 1)

    remaining_time_map = {}

    for job in state['finished_jobs']:
        remaining_time_map[job['job_id']] = 0.

    # Calculate started jobs' remaining time
    remaining_duration = 0.
    for job in state['started_jobs']:
        hash_id = get_hash_id(job['param'])
        history = history_map.get(hash_id, None)

        if job['started'] is None:
            started = now
        else:
            started = parse_utc(job['started'])

        if job.get('status', None) and (job['status'].get('progress') or 0.) >= epsilon:
            exp_duration = diff_sec(now, started) / job['status']['progress']
            remaining_duration += max(exp_duration - diff_sec(now, started), 0.)
        elif history and history['duration'] is not None:
            remaining_duration += max(history['duration'] - diff_sec(now, started), 0.)
        else:
            if use_similar:
                exp_duration = (await self._find_closest_duration(
                    history_map, fid_mapping, fvec_map, job['param']))
                if exp_duration is None:
                    exp_duration = avg_duration
            else:
                exp_duration = avg_duration
            remaining_duration += max(exp_duration - diff_sec(now, started), 0.)

        # Take into account concurrency
        remaining_time_map[job['job_id']] = remaining_duration / concurrency

    # Calculate queued jobs' remaining time
    if not oneshot:
        for job in state['queued_jobs']:
            hash_id = get_hash_id(job['param'])
            history = history_map.get(hash_id, None)

            if history and history['duration'] is not None:
                remaining_duration += history['duration']
            else:
                if use_similar:
                    exp_duration = (await self._find_closest_duration(
                        history_map, fid_mapping, fvec_map, job['param']))
                    if exp_duration is None:
                        exp_duration = avg_duration
                else:
                    exp_duration = avg_duration
                remaining_duration += exp_duration

            # Take into account concurrency
            remaining_time_map[job['job_id']] = remaining_duration / concurrency
    else:
        for job in state['queued_jobs']:
            remaining_time_map[job['job_id']] = remaining_duration / concurrency

    # Take into account concurrency
    remaining_time = remaining_duration / concurrency

    return remaining_time, remaining_time_map

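# Illustration only: a worked example of the progress-based extrapolation
# used above, with made-up numbers. If a started job reports progress 0.25
# after 300 s, its expected total duration is 300 / 0.25 = 1200 s, so its
# remaining duration is 1200 - 300 = 900 s. Each entry in remaining_time_map
# is the cumulative remaining duration accumulated so far divided by the
# effective concurrency (e.g. 900 / 3 = 300 s with concurrency 3), and
# remaining_time is the same quotient after all jobs have been accumulated.
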
async def _handle_status(self):
    """Show the queue state."""
    limit = self.args.limit
    use_similar = self.args.similar

    if self.args.job_types == 'all':
        job_types = {'finished', 'started', 'queued'}
    else:
        job_types = set(self.args.job_types)

    estimator = Estimator(self.client.registry, self.client.history)
    use_color = self.common_args.color == 'yes'

    if use_color:
        colored = termcolor.colored
    else:
        def colored(s, *args, **kwargs):  # pylint: disable=invalid-name,unused-argument
            """Use no color."""
            return s

    async for queue_state in self._get_queue_state():
        oneshot = await self.client.scheduler.is_oneshot()

        output = ''

        all_jobs = (queue_state['finished_jobs'] +
                    queue_state['started_jobs'] +
                    queue_state['queued_jobs'])
        all_params = [job['param'] for job in all_jobs]

        job_id_max_len = self._get_job_id_max_len(all_jobs)
        param_id_max_len = self._get_param_id_max_len(all_params)

        if 'finished' in job_types:
            succeeded_count = len(
                [job for job in queue_state['finished_jobs'] if job['succeeded']])
            failed_count = len(queue_state['finished_jobs']) - succeeded_count
            finished_jobs_color = 'red' if failed_count else 'green'

            output += colored(
                f"Finished jobs (S:{succeeded_count} / F:{failed_count})",
                finished_jobs_color, attrs=['reverse']) + '\n'

            if limit and len(queue_state['finished_jobs']) > limit:
                line = colored(' ', finished_jobs_color, attrs=['reverse'])
                output += line + ' ...\n'

            jobs = queue_state['finished_jobs']
            if limit:
                jobs = jobs[-limit:]

            for job in jobs:
                if job['succeeded']:
                    line = colored(' ', 'green', attrs=['reverse'])
                else:
                    line = colored(' ', 'red', attrs=['reverse'])

                param_id = get_param_id(job['param'])
                hash_id = get_hash_id(job['param'])
                name = get_name(job['param'])

                line += f" {job['job_id']:{job_id_max_len}} {param_id:{param_id_max_len}} {hash_id}"
                line += f' [{format_sec_short(job_elapsed_time(job)):>7}]'
                if job['succeeded']:
                    line += ' succeeded '
                else:
                    line += ' FAILED '
                line += f"{name}"
                output += line + '\n'

            output += '\n'

        remaining_time, rem_map = (
            await estimator.estimate_remaining_time(queue_state, False, use_similar))

        last_rem = 0.

        if 'started' in job_types:
            output += colored(
                f"Started jobs (A:{len(queue_state['started_jobs'])})",
                'cyan', attrs=['reverse']) + '\n'

            if limit and len(queue_state['started_jobs']) > limit:
                line = colored(' ', 'cyan', attrs=['reverse'])
                output += line + ' ...\n'

            jobs = queue_state['started_jobs']
            if limit:
                jobs = jobs[-limit:]

            for job in jobs:
                rem = rem_map[job['job_id']]

                param_id = get_param_id(job['param'])
                hash_id = get_hash_id(job['param'])
                name = get_name(job['param'])

                line = colored(' ', 'cyan', attrs=['reverse'])
                line += f" {job['job_id']:{job_id_max_len}} {param_id:{param_id_max_len}} {hash_id}"
                line += f' [{format_sec_short(job_elapsed_time(job)):>7}]'
                line += f'+[{format_sec_short(max(rem - last_rem, 0)):>7}]'
                line += ' '
                last_rem = rem
                line += f"{name}"
                output += line + '\n'

            output += '\n'

        if 'queued' in job_types:
            output += colored(
                f"Queued jobs (Q:{len(queue_state['queued_jobs'])})",
                'blue', attrs=['reverse']) + ' '

            output += 'Scheduler: '
            if oneshot:
                output += colored('Oneshot', 'blue')
            elif await self.client.scheduler.is_running():
                output += colored('Running', 'cyan')
            else:
                output += colored('Stopped', 'red')
            output += '\n'

            jobs = queue_state['queued_jobs']
            if limit:
                jobs = jobs[:limit]

            for job in jobs:
                rem = rem_map[job['job_id']]

                param_id = get_param_id(job['param'])
                hash_id = get_hash_id(job['param'])
                name = get_name(job['param'])

                line = colored(' ', 'blue', attrs=['reverse'])
                line += f" {job['job_id']:{job_id_max_len}} {param_id:{param_id_max_len}} {hash_id}"
                line += f' [{format_sec_short(max(rem - last_rem, 0)):>7}]'
                line += ' '
                last_rem = rem
                line += f"{name}"
                output += line + '\n'

            if limit and len(queue_state['queued_jobs']) > limit:
                line = colored(' ', 'blue', attrs=['reverse'])
                output += line + ' ...\n'

            output += '\n'

        # output += f"Concurrency: {queue_state['concurrency']}"

        if oneshot:
            remaining_time, _ = await estimator.estimate_remaining_time(
                queue_state, True, use_similar)
        # Otherwise, reuse remaining_time computed for the full queue above

        output += await format_estimated_time(remaining_time, queue_state, use_color) + '\n'

        if self.args.clear_screen:
            os.system('clear')
        print(output)

        if (self.args.stop_empty and
                not queue_state['started_jobs'] and
                (oneshot or not queue_state['queued_jobs'])):
            break

async def _try(self, job, job_id, param, current_retry):
    """Run a job."""
    param_id = get_param_id(param)
    hash_id = get_hash_id(param)
    name = get_name(param)
    expanded_command = [arg.format(**param) for arg in get_command(param)]
    cwd = get_cwd(param) or os.getcwd()
    time_limit = get_time_limit(param)
    succeeded = False

    try:
        self.logger.info(f'Launching job {job_id}: {name}')

        job_paths = await self.output.make_job_directory(job, current_retry)
        job_paths = await self.output.create_job_files(job, job_paths)

        env = self._construct_env(job, job_paths)

        with self.output.open_job_stdio(job_paths) as stdio:
            stdout, stderr = stdio

            await self.output.make_tmp_symlinks(param_id, hash_id, job_paths)

            # Launch process
            proc = await asyncio.create_subprocess_exec(
                *expanded_command,
                cwd=cwd,
                stdin=asyncio.subprocess.DEVNULL,
                stdout=stdout,
                stderr=stderr,
                env=env,
                loop=self.loop)

            await self.queue.set_pid(job_id, proc.pid)

            # Watch status changes
            status_task = asyncio.ensure_future(
                self._watch_status(job_id, job_paths), loop=self.loop)

            try:
                if time_limit <= 0:
                    await proc.communicate()
                else:
                    await asyncio.wait_for(proc.communicate(), time_limit, loop=self.loop)
            except asyncio.TimeoutError:
                self.logger.error(f'Timeout while waiting for job {job_id}')
            finally:
                status_task.cancel()
                try:
                    await status_task
                except concurrent.futures.CancelledError:
                    # Ignore CancelledError because we caused it
                    pass

                if proc.returncode is None:
                    try:
                        proc.send_signal(signal.SIGTERM)
                    except Exception:  # pylint: disable=broad-except
                        self.logger.exception('Exception while killing process')

                    try:
                        await asyncio.wait_for(proc.wait(), 10, loop=self.loop)
                    except Exception:  # pylint: disable=broad-except
                        self.logger.exception('Exception while waiting for process')

                if proc.returncode is None:
                    try:
                        proc.send_signal(signal.SIGKILL)
                    except Exception:  # pylint: disable=broad-except
                        self.logger.exception('Exception while killing process')

                    try:
                        await proc.wait()
                    except Exception:  # pylint: disable=broad-except
                        self.logger.exception('Exception while waiting for process')

            # Read status before making the job finished
            await self._read_status(job_id, job_paths)

            if proc.returncode == 0:
                await self.output.make_symlinks(param_id, hash_id, job_paths)
                succeeded = True

    except concurrent.futures.CancelledError:
        # Pass through
        raise
    except Exception:  # pylint: disable=broad-except
        self.logger.exception(f'Exception while running job {job_id}')
    finally:
        await self.output.remove_tmp_symlinks(param_id, hash_id)

    return succeeded
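
# Illustration only: _try above runs a single attempt; a hypothetical driver
# could retry it according to the parameter's retry settings. The method name
# and loop below are assumptions for this sketch, using only helpers that
# appear elsewhere in this excerpt (get_retry, get_retry_delay, self._try).
async def _run_with_retries(self, job, job_id, param):
    """Sketch: run a job, retrying up to get_retry(param) times on failure."""
    retry = get_retry(param)
    retry_delay = get_retry_delay(param)
    for current_retry in range(retry + 1):
        if await self._try(job, job_id, param, current_retry):
            return True
        # Wait before the next attempt, if any remain
        if current_retry < retry:
            await asyncio.sleep(retry_delay)
    return False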