def load_from_checkpoint(cfg):
    filename = cfg_file(cfg)
    if not os.path.isfile(filename):
        raise Exception(f'Could not load saved parameters for experiment {cfg.experiment}')

    with open(filename, 'r') as json_file:
        json_params = json.load(json_file)
        log.warning('Loading existing experiment configuration from %s', filename)
        loaded_cfg = AttrDict(json_params)

    # override the parameters in config file with values passed from command line
    for key, value in cfg.cli_args.items():
        if key in loaded_cfg and loaded_cfg[key] != value:
            log.debug('Overriding arg %r with value %r passed from command line', key, value)
            loaded_cfg[key] = value

    # incorporate extra CLI parameters that were not present in JSON file
    for key, value in vars(cfg).items():
        if key not in loaded_cfg:
            log.debug('Adding new argument %r=%r that is not in the saved config file!', key, value)
            loaded_cfg[key] = value

    return loaded_cfg
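# A minimal standalone sketch of the precedence implemented by load_from_checkpoint:
# the saved JSON is the base, explicitly passed CLI args win, and args absent from
# the JSON are appended. (Plain dicts here; the real code uses AttrDict.)
saved_json = {'learning_rate': 1e-4, 'gamma': 0.99}
cli_args = {'learning_rate': 3e-4}           # explicitly passed on the command line
merged = {**saved_json, **cli_args}          # CLI values override saved ones
merged.setdefault('seed', 42)                # new args not in the saved config get added
assert merged == {'learning_rate': 3e-4, 'gamma': 0.99, 'seed': 42}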
def add_new_level(self, level, seed, key, pk3_path):
    with self.locks[level]:
        num_used_seeds = self.num_seeds_used_in_current_run[level].value
        if num_used_seeds < len(self.available_seeds.get(level, [])):
            log.warning('We should only add new levels to cache if we ran out of pre-generated levels (seeds)')
            log.warning(
                'Num used seeds: %d, available seeds: %d, level: %s, seed %r, key %r',
                num_used_seeds, len(self.available_seeds.get(level, [])), level, seed, key,
            )

            # some DMLab-30 environments, e.g. language_select_located_object may require different levels even
            # for the same seed. This is most likely a bug in DeepMind Lab, because the same seed should generate
            # identical environments

        path = os.path.join(self.cache_dir, key)
        if not os.path.isfile(path):
            # copy the cached file DeepMind Lab has written to the cache directory
            shutil.copyfile(pk3_path, path)

        # add new map to the list of available seeds for this level
        # so it can be used next time we run the experiment
        lvl_seeds_filename = join(self.cache_dir, level_to_filename(level))
        safe_ensure_dir_exists(os.path.dirname(lvl_seeds_filename))
        with open(lvl_seeds_filename, 'a') as fobj:
            fobj.write(f'{seed} {key}\n')
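# The seeds file written above is line-oriented, one "<seed> <key>" pair per line.
# A hypothetical reader for that format (not part of the original code) could look
# like this:
def read_seeds_file_sketch(filename):
    seeds = []
    with open(filename) as fobj:
        for line in fobj:
            seed, cache_key = line.split()
            seeds.append((int(seed), cache_key))
    return seeds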
def safe_get(q, timeout=1e6, msg='Queue timeout'):
    """Using queue.get() with timeout is necessary, otherwise KeyboardInterrupt is not handled."""
    while True:
        try:
            return q.get(timeout=timeout)
        except Empty:
            log.warning(msg)
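# Usage sketch for safe_get: a consumer that survives a slow producer. The short
# timeout means the loop periodically wakes up, logs the warning, and retries,
# which also gives Python a chance to deliver KeyboardInterrupt.
import threading
import time
from queue import Queue

def _producer_sketch(q):
    time.sleep(2.0)  # simulate a slow producer
    q.put('payload')

q = Queue()
threading.Thread(target=_producer_sketch, args=(q,), daemon=True).start()
item = safe_get(q, timeout=0.5, msg='Still waiting for the producer...')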
def _game_init(self, with_locking=True, max_parallel=10):
    lock_file = lock = None
    if with_locking:
        lock_file = doom_lock_file(max_parallel)
        lock = FileLock(lock_file)

    init_attempt = 0
    while True:
        init_attempt += 1
        try:
            if with_locking:
                with lock.acquire(timeout=20):
                    self.game.init()
            else:
                self.game.init()
            break
        except Timeout:
            if with_locking:
                log.debug('Another process currently holds the lock %s, attempt: %d', lock_file, init_attempt)
        except Exception as exc:
            log.warning('VizDoom game.init() threw an exception %r. Terminate process...', exc)
            from sample_factory.envs.env_utils import EnvCriticalError
            raise EnvCriticalError()
def cat(self, dict_of_tensor_arrays, macro_batch_size, use_pinned_memory, timing):
    """
    Here 'macro_batch' is the overall size of experience per iteration.
    Macro-batch = mini-batch * num_batches_per_iteration
    """
    tensor_batch = self.batch_pool.get()

    if tensor_batch is not None:
        old_batch_size = tensor_batch_size(tensor_batch)
        if old_batch_size != macro_batch_size:
            # this can happen due to PBT changing batch size during the experiment
            log.warning('Tensor macro-batch size changed from %d to %d!', old_batch_size, macro_batch_size)
            log.warning('Discarding the cached tensor batch!')
            del tensor_batch
            tensor_batch = None

    if tensor_batch is None:
        tensor_batch = copy_dict_structure(dict_of_tensor_arrays)
        log.info('Allocating new CPU tensor batch (could not get from the pool)')

        for d1, cache_d, key, tensor_arr, _ in iter_dicts_recursively(dict_of_tensor_arrays, tensor_batch):
            cache_d[key] = torch.cat(tensor_arr, dim=0)
            if use_pinned_memory:
                cache_d[key] = cache_d[key].pin_memory()
    else:
        with timing.add_time('batcher_mem'):
            for d1, cache_d, key, tensor_arr, cache_t in iter_dicts_recursively(dict_of_tensor_arrays, tensor_batch):
                offset = 0
                for t in tensor_arr:
                    first_dim = t.shape[0]
                    cache_t[offset:offset + first_dim].copy_(t)
                    offset += first_dim

    return tensor_batch
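# The core idea of the else-branch above, as a standalone sketch: allocate the big
# macro-batch tensor once, then reuse it by copying each trajectory slice in place
# instead of re-allocating with torch.cat() every iteration.
import torch

chunks = [torch.randn(8, 4) for _ in range(4)]   # stand-ins for trajectory tensors
macro_batch = torch.empty(32, 4)                 # allocated once, reused afterwards
if torch.cuda.is_available():
    macro_batch = macro_batch.pin_memory()       # pinned memory speeds up CPU->GPU copies

offset = 0
for t in chunks:
    n = t.shape[0]
    macro_batch[offset:offset + n].copy_(t)      # in-place copy, no new allocation
    offset += n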
def resolve_env_name(self, full_env_name):
    """
    :param full_env_name: complete name of the environment, to be passed to the make_env_func, e.g. atari_breakout
    :return: env registry entry
    :rtype: EnvRegistryEntry
    """
    # we find a match with a registered env family prefix
    for env_prefix, registry_entry in self.registry.items():
        if not full_env_name.startswith(env_prefix):
            continue

        # We found a match. If it's a callable, we should first handle a deferred registry entry
        if callable(registry_entry):
            make_env_func, add_extra_params_func, override_default_params_func = registry_entry()
            self.register_env(env_prefix, make_env_func, add_extra_params_func, override_default_params_func)

        return self.registry[env_prefix]

    msg = (
        f'Could not resolve {full_env_name}. '
        'Did you register the family of environments in the registry? See sample_factory_examples for details.'
    )
    log.warning(msg)
    raise RuntimeError(msg)
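# Usage sketch (the `registry` object and env names are hypothetical): because the
# registry is keyed by an env-family prefix, registering 'myenv_' once lets
# 'myenv_easy', 'myenv_hard', etc. all resolve to the same entry.
def make_my_env(full_env_name, cfg=None, env_config=None):
    ...  # construct and return the gym env for this family

registry.register_env('myenv_', make_my_env, None, None)
entry = registry.resolve_env_name('myenv_easy')  # matches the 'myenv_' prefix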
def make_voxel_env(env_name, cfg=None, env_config=None, **kwargs):
    scenario_name = env_name.split('voxel_env_')[-1].casefold()
    log.debug('Using scenario %s', scenario_name)

    if 'multitask' in scenario_name:
        if env_config is not None and 'worker_index' in env_config:
            task_idx = env_config['worker_index']
        else:
            log.warning('Could not find information about task id. Use task_id=0. (It is okay if this message appears once)')
            task_idx = 0

        env = make_env_multitask(
            scenario_name,
            task_idx,
            num_envs=cfg.voxel_num_envs_per_instance,
            num_agents_per_env=cfg.voxel_num_agents_per_env,
            num_simulation_threads=cfg.voxel_num_simulation_threads,
            use_vulkan=cfg.voxel_use_vulkan,
        )
    else:
        env = VoxelEnv(
            scenario_name=scenario_name,
            num_envs=cfg.voxel_num_envs_per_instance,
            num_agents_per_env=cfg.voxel_num_agents_per_env,
            num_simulation_threads=cfg.voxel_num_simulation_threads,
            use_vulkan=cfg.voxel_use_vulkan,
        )

    env = Wrapper(env, cfg.voxel_increase_team_spirit, cfg.voxel_max_team_spirit_steps)
    return env
def get_trajectory_buffers(self, num_buffers: int, timing: Optional = None):
    """
    :param num_buffers: number of free buffer indices to obtain
    :param timing: for performance analysis
    :return: a list of indices of free buffers
    """
    indices: List[int] = []
    block = False

    while len(indices) < num_buffers:
        with timing.add_time('wait_buffers') if timing is not None else contextlib.suppress():
            try:
                indices.extend(self.free_buffers_queue.get_many(
                    max_messages_to_get=num_buffers - len(indices),
                    timeout=5,
                    block=block,
                ))
            except faster_fifo.Empty:
                log.warning('Waiting for %d trajectory buffers...', num_buffers - len(indices))

        if len(indices) < num_buffers:
            block = True

    return indices
def close(self):
    try:
        if self.game is not None:
            self.game.close()
    except RuntimeError as exc:
        log.warning('Runtime error in VizDoom game close(): %r', exc)

    if self.viewer is not None:
        self.viewer.close()
def is_udp_port_available(port):
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        sock.bind(('', port))
        sock.close()
    except OSError as exc:
        log.warning('UDP port %d cannot be used: %r', port, exc)
        return False
    else:
        return True
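# A hypothetical helper built on top of is_udp_port_available: scan a range and
# return the first free UDP port (the range bounds are arbitrary examples).
def find_free_udp_port_sketch(start_port=25000, end_port=25100):
    for port in range(start_port, end_port):
        if is_udp_port_available(port):
            return port
    raise RuntimeError(f'No free UDP port in [{start_port}, {end_port})')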
def maybe_load_from_checkpoint(cfg):
    filename = cfg_file(cfg)
    if not os.path.isfile(filename):
        log.warning('Saved parameter configuration for experiment %s not found!', cfg.experiment)
        log.warning('Starting experiment from scratch!')
        return AttrDict(vars(cfg))

    return load_from_checkpoint(cfg)
def register_custom_encoder(custom_encoder_name, encoder_cls):
    if custom_encoder_name in ENCODER_REGISTRY:
        log.warning('Encoder %s already registered', custom_encoder_name)

    assert issubclass(encoder_cls, EncoderBase), 'Custom encoders must be derived from EncoderBase'

    log.debug('Adding model class %r to registry (with name %s)', encoder_cls, custom_encoder_name)
    ENCODER_REGISTRY[custom_encoder_name] = encoder_cls
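# Registration sketch. The constructor signature of the EncoderBase subclass below
# is an assumption (check the actual base class); the point is only that any
# subclass registered this way becomes available by name.
class MyEncoderSketch(EncoderBase):
    def __init__(self, cfg, obs_space, timing):
        super().__init__(cfg, timing)
        ...  # build the model torso here

register_custom_encoder('my_encoder', MyEncoderSketch)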
def dmlab_env_by_name(name):
    for spec in DMLAB_ENVS:
        if spec.name == name:
            return spec

    # not a known "named" environment with a predefined spec
    log.warning('Level %s not found. Interpreting the level name as an unmodified DMLab-30 env name!', name)
    level = name.split('dmlab_')[1]
    spec = DmLabSpec(name, level)
    return spec
def get_algo_class(algo):
    algo_class = None

    if algo == 'APPO':
        from sample_factory.algorithms.appo.appo import APPO
        algo_class = APPO
    elif algo == 'DUMMY_SAMPLER':
        from sample_factory.algorithms.dummy_sampler import DummySampler
        algo_class = DummySampler
    else:
        log.warning('Algorithm %s is not supported', algo)

    return algo_class
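# Usage sketch: get_algo_class() returns None for unsupported names, so callers
# should check before instantiating.
algo_cls = get_algo_class('APPO')
if algo_cls is None:
    raise ValueError('Unsupported algorithm requested')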
def _actors_update_shaping_scheme(self, policy_id):
    log.debug('Sending latest reward scheme to actors for policy %d...', policy_id)
    for actor_worker in self.actor_workers:
        reward_scheme_task = (PbtTask.UPDATE_REWARD_SCHEME, (policy_id, self.policy_reward_shaping[policy_id]))
        task = (TaskType.PBT, reward_scheme_task)
        try:
            actor_worker.task_queue.put(task, timeout=0.1)
        except Full:
            log.warning('Could not add task %r to queue, it is likely that the worker died', task)
def fetch(self, key, pk3_path):
    """Environment object itself acts as a proxy to the global level cache."""
    if not self.env_uses_level_cache:
        self.env_uses_level_cache = True
        # log.debug('Env %s uses level cache!', self.level_name)

    path = join(self.level_cache_path, key)

    if os.path.isfile(path):
        # copy the cached file to the path expected by DeepMind Lab
        shutil.copyfile(path, pk3_path)
        return True
    else:
        log.warning('Cache miss in environment %s key: %s!', self.level_name, key)
        return False
def wait_for_traj_buffers(self):
    """
    In very rare cases the learner might not have freed the shared memory buffer by the time we need it.
    Here we wait until the learner is done with it.
    """
    print_warning = True
    while self.traj_tensors_available[:, :, self.traj_buffer_idx].min() == 0:
        if print_warning:
            log.warning(
                'Waiting for trajectory buffer %d on actor %d-%d',
                self.traj_buffer_idx, self.worker_idx, self.split_idx,
            )
            print_warning = False
        time.sleep(0.002)
def reset(self):
    self._ensure_initialized()

    if self.record_to is not None and not self.is_multiplayer:
        # does not work in multiplayer (uses a different mechanism)
        if not os.path.exists(self.record_to):
            os.makedirs(self.record_to)

        demo_path = self.demo_path(self._num_episodes)
        log.warning('Recording episode demo to %s', demo_path)
        self.game.new_episode(demo_path)
    else:
        if self._num_episodes > 0:
            # no demo recording (default)
            self.game.new_episode()

    self.state = self.game.get_state()
    img = None
    try:
        img = self.state.screen_buffer
    except AttributeError:
        # sometimes Doom does not return screen buffer at all??? Rare bug
        pass

    if img is None:
        log.error('Game returned None screen buffer! This is not supposed to happen!')
        img = self._black_screen()

    # swap current and previous histogram
    if self.current_histogram is not None and self.previous_histogram is not None:
        swap = self.current_histogram
        self.current_histogram = self.previous_histogram
        self.previous_histogram = swap
        self.current_histogram.fill(0)

    self._actions_flattened = None
    self._last_episode_info = copy.deepcopy(self._prev_info)
    self._prev_info = None

    self._num_episodes += 1

    return np.transpose(img, (1, 2, 0))
def generate_experiments(self, experiment_arg_name, customize_experiment_name, param_prefix):
    """Yields tuples of (cmd, experiment_name)."""
    num_experiments = 1 if len(self.params) == 0 else len(self.params)

    for experiment_idx in range(num_experiments):
        cmd_tokens = [self.cmd]
        experiment_name_tokens = [self.base_name]

        # abbreviations for parameter names that we've used
        param_shorthands = []

        if len(self.params) > 0:
            params = self.params[experiment_idx]
            for param, value in params.items():
                param_str = f'{param_prefix}{param}={value}'
                cmd_tokens.append(param_str)

                param_tokens = re.split('[._-]', param)
                shorthand_tokens = [t[0] for t in param_tokens[:-1]]

                last_token_l = min(3, len(param_tokens[-1]))
                shorthand = '.'.join(shorthand_tokens + [param_tokens[-1][:last_token_l]])
                while last_token_l <= len(param_tokens[-1]) and shorthand in param_shorthands:
                    last_token_l += 1
                    shorthand = '.'.join(shorthand_tokens + [param_tokens[-1][:last_token_l]])

                param_shorthands.append(shorthand)
                experiment_name_token = f'{shorthand}_{value}'
                experiment_name_tokens.append(experiment_name_token)

        if customize_experiment_name:
            experiment_name = f'{experiment_idx:02d}_' + '_'.join(experiment_name_tokens)
            if len(experiment_name) > 100:
                log.warning('Experiment name is extra long! (%d characters)', len(experiment_name))
        else:
            experiment_name = self.base_name

        cmd_tokens.append(f'{experiment_arg_name}={experiment_name}')
        param_str = ' '.join(cmd_tokens)
        yield param_str, experiment_name
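# Standalone sketch of the shorthand scheme above: 'ppo_clip_ratio' abbreviates to
# 'p.c.rat' (first letters of the leading tokens plus a 3-char prefix of the last
# token), and the prefix grows when it collides with an already-used shorthand.
import re

def shorthand_sketch(param, used):
    tokens = re.split('[._-]', param)
    head = [t[0] for t in tokens[:-1]]
    n = min(3, len(tokens[-1]))
    s = '.'.join(head + [tokens[-1][:n]])
    while n <= len(tokens[-1]) and s in used:
        n += 1
        s = '.'.join(head + [tokens[-1][:n]])
    used.append(s)
    return s

used = []
assert shorthand_sketch('ppo_clip_ratio', used) == 'p.c.rat'
assert shorthand_sketch('ppo_clip_rate', used) == 'p.c.rate'  # 'p.c.rat' taken, prefix grows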
def _run(self):
    """
    Main loop of the actor worker (rollout worker).
    Process tasks (mainly ROLLOUT_STEP) until we get the termination signal, which usually means end of training.
    Currently there is no mechanism to restart dead workers if something bad happens during training.
    We can only retry on the initial reset(). This is definitely something to work on.
    """
    log.info('Initializing vector env runner %d...', self.worker_idx)

    # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    if self.cfg.actor_worker_gpus:
        set_gpus_for_process(
            self.worker_idx,
            num_gpus_per_process=1,
            process_type='actor',
            gpu_mask=self.cfg.actor_worker_gpus,
        )

    torch.multiprocessing.set_sharing_strategy('file_system')

    timing = Timing()

    last_report = time.time()
    with torch.no_grad():
        while not self.terminate:
            try:
                try:
                    with timing.add_time('waiting'), timing.timeit('wait_actor'):
                        tasks = self.task_queue.get_many(timeout=0.1)
                except Empty:
                    tasks = []

                for task in tasks:
                    task_type, data = task

                    if task_type == TaskType.INIT:
                        self._init()
                        continue

                    if task_type == TaskType.TERMINATE:
                        self._terminate()
                        break

                    # handling actual workload
                    if task_type == TaskType.ROLLOUT_STEP:
                        if 'work' not in timing:
                            timing.waiting = 0  # measure waiting only after real work has started

                        with timing.add_time('work'), timing.timeit('one_step'):
                            self._advance_rollouts(data, timing)
                    elif task_type == TaskType.RESET:
                        with timing.add_time('reset'):
                            self._handle_reset()
                    elif task_type == TaskType.PBT:
                        self._process_pbt_task(data)
                    elif task_type == TaskType.UPDATE_ENV_STEPS:
                        for env in self.env_runners:
                            env.update_env_steps(data)

                if time.time() - last_report > 5.0 and 'one_step' in timing:
                    timing_stats = dict(wait_actor=timing.wait_actor, step_actor=timing.one_step)
                    memory_mb = memory_consumption_mb()
                    stats = dict(memory_actor=memory_mb)
                    safe_put(self.report_queue, dict(timing=timing_stats, stats=stats), queue_name='report')
                    last_report = time.time()

            except RuntimeError as exc:
                log.warning('Error while processing data w: %d, exception: %s', self.worker_idx, exc)
                log.warning('Terminate process...')
                self.terminate = True
                safe_put(self.report_queue, dict(critical_error=self.worker_idx), queue_name='report')
            except KeyboardInterrupt:
                self.terminate = True
            except:
                log.exception('Unknown exception in rollout worker')
                self.terminate = True

    if self.worker_idx <= 1:
        time.sleep(0.1)
        log.info(
            'Env runner %d, CPU aff. %r, rollouts %d: timing %s',
            self.worker_idx, psutil.Process().cpu_affinity(), self.num_complete_rollouts, timing,
        )
def run(self):
    for p in self.processes:
        time.sleep(0.3)
        p.start()

    finished_reset = np.zeros([self.cfg.num_workers], dtype=bool)
    while not all(finished_reset):
        try:
            msg = self.report_queue.get(timeout=0.1)
            if 'finished_reset' in msg:
                finished_reset[msg['proc_idx']] = True
                log.debug('Process %d finished reset! Status %r', msg['proc_idx'], finished_reset)
        except Empty:
            pass

    log.debug('All workers finished reset!')
    time.sleep(2)
    self.start_event.set()

    start = time.time()
    env_frames = 0
    last_process_report = [time.time() for _ in self.processes]

    while not self.terminate.value:
        try:
            try:
                msgs = self.report_queue.get_many(timeout=self.report_every_sec * 1.5)

                for msg in msgs:
                    last_process_report[msg['proc_idx']] = time.time()

                    if 'crash' in msg:
                        self.terminate.value = True
                        log.error('Terminating due to process %d crashing...', msg['proc_idx'])
                        break

                    env_frames += msg['env_frames']

                if env_frames >= self.cfg.sample_env_frames:
                    log.warning('Desired number of frames reached')
                    self.terminate.value = True

                if time.time() - start > self.cfg.timeout_seconds:
                    log.warning('Terminated by timer')
                    self.terminate.value = True
            except Empty:
                pass
        except KeyboardInterrupt:
            self.terminate.value = True
            log.error('KeyboardInterrupt in main loop! Terminating...')
            break

        if time.time() - self.last_report > self.report_every_sec:
            self.report(env_frames)

        for proc_idx, p in enumerate(self.processes):
            delay = time.time() - last_process_report[proc_idx]
            if delay > 600:
                # killing the whole script is the best way to know that some of the processes froze
                log.error('Process %d has not responded in %.1f s!!! Terminating...', proc_idx, delay)
                self.terminate.value = True

        for p in self.processes:
            if not p.is_alive():
                self.terminate.value = True
                log.error('Process %r died! Terminating...', p)

    total_time = time.time() - start
    log.info('Collected %d frames in %.1f s, avg FPS: %.1f', env_frames, total_time, env_frames / total_time)
    log.debug('Done sampling...')
def sample(self, proc_idx):
    # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    if self.cfg.sampler_worker_gpus:
        set_gpus_for_process(
            proc_idx,
            num_gpus_per_process=1,
            process_type='sampler_proc',
            gpu_mask=self.cfg.sampler_worker_gpus,
        )

    timing = Timing()

    from threadpoolctl import threadpool_limits
    with threadpool_limits(limits=1, user_api=None):
        if self.cfg.set_workers_cpu_affinity:
            set_process_cpu_affinity(proc_idx, self.cfg.num_workers)

        initial_cpu_affinity = psutil.Process().cpu_affinity() if platform != 'darwin' else None
        psutil.Process().nice(10)

        with timing.timeit('env_init'):
            envs = []
            env_key = ['env' for _ in range(self.cfg.num_envs_per_worker)]

            for env_idx in range(self.cfg.num_envs_per_worker):
                global_env_id = proc_idx * self.cfg.num_envs_per_worker + env_idx
                env_config = AttrDict(worker_index=proc_idx, vector_index=env_idx, env_id=global_env_id)

                env = make_env_func(cfg=self.cfg, env_config=env_config)
                log.debug(
                    'CPU affinity after create_env: %r',
                    psutil.Process().cpu_affinity() if platform != 'darwin' else 'MacOS - None',
                )
                env.seed(global_env_id)
                envs.append(env)

                # this is to track the performance for individual DMLab levels
                if hasattr(env.unwrapped, 'level_name'):
                    env_key[env_idx] = env.unwrapped.level_name

            episode_length = [0 for _ in envs]
            episode_lengths = [deque([], maxlen=20) for _ in envs]

        # sample a lot of random actions once, otherwise it is pretty slow in Python
        total_random_actions = 500
        actions = [
            [env.action_space.sample() for _ in range(env.num_agents)]
            for _ in range(total_random_actions)
        ]
        action_i = 0

        try:
            with timing.timeit('first_reset'):
                for env_idx, env in enumerate(envs):
                    env.reset()
                    log.info('Process %d finished resetting %d/%d envs', proc_idx, env_idx + 1, len(envs))

                self.report_queue.put(dict(proc_idx=proc_idx, finished_reset=True))

            self.start_event.wait()

            with timing.timeit('work'):
                last_report = last_report_frames = total_env_frames = 0
                while not self.terminate.value and total_env_frames < self.cfg.sample_env_frames_per_worker:
                    for env_idx, env in enumerate(envs):
                        with timing.add_time(f'{env_key[env_idx]}.step'):
                            obs, rewards, dones, infos = env.step(actions[action_i])
                            action_i = (action_i + 1) % total_random_actions

                        num_frames = sum([info.get('num_frames', 1) for info in infos])
                        total_env_frames += num_frames
                        episode_length[env_idx] += num_frames

                        if all(dones):
                            episode_lengths[env_idx].append(episode_length[env_idx] / env.num_agents)
                            episode_length[env_idx] = 0

                    with timing.add_time('report'):
                        now = time.time()
                        if now - last_report > self.report_every_sec:
                            last_report = now
                            frames_since_last_report = total_env_frames - last_report_frames
                            last_report_frames = total_env_frames
                            self.report_queue.put(dict(proc_idx=proc_idx, env_frames=frames_since_last_report))

                            if proc_idx == 0:
                                log.debug('Memory usage: %.4f Mb', memory_consumption_mb())

            # Extra check to make sure CPU affinity is preserved throughout the execution.
            # I observed a weird effect when some environments tried to alter the affinity of the current process,
            # leading to decreased performance.
            # This can be caused by some interactions between deep learning libs, OpenCV, MKL, OpenMP, etc.
            # At least the user should know about it if this is happening.
            cpu_affinity = psutil.Process().cpu_affinity() if platform != 'darwin' else None
            assert initial_cpu_affinity == cpu_affinity, \
                f'Worker CPU affinity was changed from {initial_cpu_affinity} to {cpu_affinity}! ' \
                f'This can significantly affect performance!'

        except:
            log.exception('Unknown exception')
            log.error('Unknown exception in worker %d, terminating...', proc_idx)
            self.report_queue.put(dict(proc_idx=proc_idx, crash=True))

        time.sleep(proc_idx * 0.01 + 0.01)
        log.info('Process %d finished sampling. Timing: %s', proc_idx, timing)

        for env_idx, env in enumerate(envs):
            if len(episode_lengths[env_idx]) > 0:
                log.warning('Level %s avg episode len %d', env_key[env_idx], np.mean(episode_lengths[env_idx]))

        for env in envs:
            env.close()
def run(self):
    """
    This function contains the main loop of the algorithm, as well as initialization/cleanup code.

    :return: ExperimentStatus (SUCCESS, FAILURE, INTERRUPTED). Useful in testing.
    """
    status = ExperimentStatus.SUCCESS

    if os.path.isfile(done_filename(self.cfg)):
        log.warning('Training already finished! Remove "done" file to continue training')
        return status

    self.init_workers()
    self.init_pbt()
    self.finish_initialization()

    log.info('Collecting experience...')

    timing = Timing()
    with timing.timeit('experience'):
        # noinspection PyBroadException
        try:
            while not self._should_end_training():
                try:
                    reports = self.report_queue.get_many(timeout=0.1)
                    for report in reports:
                        self.process_report(report)
                except Empty:
                    pass

                if time.time() - self.last_report > self.report_interval:
                    self.report()

                    now = time.time()
                    self.total_train_seconds += now - self.last_report
                    self.last_report = now

                    self.update_env_steps_actor()

                self.pbt.update(self.env_steps, self.policy_avg_stats)

        except Exception:
            log.exception('Exception in driver loop')
            status = ExperimentStatus.FAILURE
        except KeyboardInterrupt:
            log.warning('Keyboard interrupt detected in driver loop, exiting...')
            status = ExperimentStatus.INTERRUPTED

    for learner in self.learner_workers.values():
        # timeout is needed here because some environments may crash on KeyboardInterrupt (e.g. VizDoom).
        # In that case the learner train loop will never do another iteration and will never save the model.
        # This is not an issue with normal exit, e.g. due to desired number of frames reached.
        learner.save_model(timeout=5.0)

    all_workers = self.actor_workers
    for workers in self.policy_workers.values():
        all_workers.extend(workers)
    all_workers.extend(self.learner_workers.values())

    child_processes = list_child_processes()

    time.sleep(0.1)
    log.debug('Closing workers...')
    for i, w in enumerate(all_workers):
        w.close()
        time.sleep(0.01)
    for i, w in enumerate(all_workers):
        w.join()
    log.debug('Workers joined!')

    finish_wandb(self.cfg)

    # VizDoom processes often refuse to die for an unidentified reason, so we're force killing them with a hack
    kill_processes(child_processes)

    fps = self.total_env_steps_since_resume / timing.experience
    log.info('Collected %r, FPS: %.1f', self.env_steps, fps)
    log.info('Timing: %s', timing)

    if self._should_end_training():
        with open(done_filename(self.cfg), 'w') as fobj:
            fobj.write(f'{self.env_steps}')

    time.sleep(0.5)
    log.info('Done!')

    return status
def _run(self):
    # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    psutil.Process().nice(min(self.cfg.default_niceness + 2, 20))

    cuda_envvars_for_policy(self.policy_id, 'inference')
    torch.multiprocessing.set_sharing_strategy('file_system')

    timing = Timing()

    with timing.timeit('init'):
        # initialize the Torch modules
        log.info('Initializing model on the policy worker %d-%d...', self.policy_id, self.worker_idx)
        log.info(f'POLICY worker {self.policy_id}-{self.worker_idx}\tpid {os.getpid()}\tparent {os.getppid()}')

        torch.set_num_threads(1)

        if self.cfg.device == 'gpu':
            # we should already see only one CUDA device, because of env vars
            assert torch.cuda.device_count() == 1
            self.device = torch.device('cuda', index=0)
        else:
            self.device = torch.device('cpu')

        self.actor_critic = create_actor_critic(self.cfg, self.obs_space, self.action_space, timing)
        self.actor_critic.model_to_device(self.device)
        for p in self.actor_critic.parameters():
            p.requires_grad = False  # we don't train anything here

        log.info('Initialized model on the policy worker %d-%d!', self.policy_id, self.worker_idx)

    last_report = last_cache_cleanup = time.time()
    last_report_samples = 0
    request_count = deque(maxlen=50)

    # Very conservative limit on the minimum number of requests to wait for.
    # This will almost guarantee that the system will continue collecting experience
    # at max rate even when 2/3 of workers are stuck for some reason (e.g. doing a long env reset).
    # Although if your workflow involves very lengthy operations that often freeze workers, it can be beneficial
    # to set min_num_requests to 1 (at the cost of potential inefficiency, i.e. the policy worker will use very
    # small batches).
    min_num_requests = self.cfg.num_workers // (self.cfg.num_policies * self.cfg.policy_workers_per_policy)
    min_num_requests //= 3
    min_num_requests = max(1, min_num_requests)
    log.info('Min num requests: %d', min_num_requests)

    # Again, a very conservative timer. Only wait a little bit, then continue operation.
    wait_for_min_requests = 0.025

    while not self.terminate:
        try:
            while self.shared_buffers.stop_experience_collection[self.policy_id]:
                with self.resume_experience_collection_cv:
                    self.resume_experience_collection_cv.wait(timeout=0.05)

            waiting_started = time.time()
            while len(self.requests) < min_num_requests and time.time() - waiting_started < wait_for_min_requests:
                try:
                    with timing.timeit('wait_policy'), timing.add_time('wait_policy_total'):
                        policy_requests = self.policy_queue.get_many(timeout=0.005)
                    self.requests.extend(policy_requests)
                except Empty:
                    pass

            self._update_weights(timing)

            with timing.timeit('one_step'), timing.add_time('handle_policy_step'):
                if self.initialized:
                    if len(self.requests) > 0:
                        request_count.append(len(self.requests))
                        self._handle_policy_steps(timing)

            try:
                task_type, data = self.task_queue.get_nowait()

                # task from the task_queue
                if task_type == TaskType.INIT:
                    self._init()
                elif task_type == TaskType.TERMINATE:
                    self.terminate = True
                    break
                elif task_type == TaskType.INIT_MODEL:
                    self._init_model(data)

                self.task_queue.task_done()
            except Empty:
                pass

            if time.time() - last_report > 3.0 and 'one_step' in timing:
                timing_stats = dict(wait_policy=timing.wait_policy, step_policy=timing.one_step)
                samples_since_last_report = self.total_num_samples - last_report_samples

                stats = memory_stats('policy_worker', self.device)
                if len(request_count) > 0:
                    stats['avg_request_count'] = np.mean(request_count)

                self.report_queue.put(dict(
                    timing=timing_stats,
                    samples=samples_since_last_report,
                    policy_id=self.policy_id,
                    stats=stats,
                ))
                last_report = time.time()
                last_report_samples = self.total_num_samples

            if time.time() - last_cache_cleanup > 300.0 or (not self.cfg.benchmark and self.total_num_samples < 1000):
                if self.cfg.device == 'gpu':
                    torch.cuda.empty_cache()
                last_cache_cleanup = time.time()

        except KeyboardInterrupt:
            log.warning('Keyboard interrupt detected on worker %d-%d', self.policy_id, self.worker_idx)
            self.terminate = True
        except:
            log.exception('Unknown exception on policy worker')
            self.terminate = True

    time.sleep(0.2)
    log.info('Policy worker avg. requests %.2f, timing: %s', np.mean(request_count), timing)
def __init__(
        self, task_id, level, action_repeat, res_w, res_h, benchmark_mode, renderer,
        dataset_path, with_instructions, extended_action_set, use_level_cache,
        level_cache_path, gpu_index, extra_cfg=None,
):
    self.width = res_w
    self.height = res_h

    # self._main_observation = 'DEBUG.CAMERA_INTERLEAVED.PLAYER_VIEW_NO_RETICLE'
    self.main_observation = 'RGB_INTERLEAVED'
    self.instructions_observation = DMLAB_INSTRUCTIONS
    self.with_instructions = with_instructions and not benchmark_mode

    self.action_repeat = action_repeat

    self.random_state = None

    self.task_id = task_id
    self.level = level
    self.level_name = dmlab_level_to_level_name(self.level)

    # the policy index which currently acts in the environment
    self.curr_policy_idx = 0
    self.curr_cache = dmlab_level_cache.DMLAB_GLOBAL_LEVEL_CACHE[self.curr_policy_idx]

    self.instructions = np.zeros([DMLAB_MAX_INSTRUCTION_LEN], dtype=np.int32)

    observation_format = [self.main_observation]
    if self.with_instructions:
        observation_format += [self.instructions_observation]

    config = {
        'width': self.width,
        'height': self.height,
        'gpuDeviceIndex': str(gpu_index),
        'datasetPath': dataset_path,
    }
    if extra_cfg is not None:
        config.update(extra_cfg)
    config = {k: str(v) for k, v in config.items()}

    self.use_level_cache = use_level_cache
    self.level_cache_path = ensure_dir_exists(level_cache_path)

    env_level_cache = self if use_level_cache else None
    self.env_uses_level_cache = False  # will be set to True when this env instance queries the cache
    self.last_reset_seed = None

    if env_level_cache is not None:
        if not isinstance(self.curr_cache, dmlab_level_cache.DmlabLevelCacheGlobal):
            raise Exception(
                'DMLab global level cache object is not initialized! Make sure to call '
                'dmlab_ensure_global_cache_initialized() in the main thread before you fork any child processes '
                'or create any DMLab envs'
            )

    self.dmlab = deepmind_lab.Lab(
        level,
        observation_format,
        config=config,
        renderer=renderer,
        level_cache=env_level_cache,
    )

    self.action_set = EXTENDED_ACTION_SET if extended_action_set else ACTION_SET
    self.action_list = np.array(self.action_set, dtype=np.intc)  # DMLab requires intc type for actions

    self.last_observation = None

    self.render_scale = 5
    self.render_fps = 30
    self.last_frame = time.time()

    self.action_space = gym.spaces.Discrete(len(self.action_set))

    self.observation_space = gym.spaces.Dict(
        obs=gym.spaces.Box(low=0, high=255, shape=(self.height, self.width, 3), dtype=np.uint8)
    )
    if self.with_instructions:
        self.observation_space.spaces[self.instructions_observation] = gym.spaces.Box(
            low=0,
            high=DMLAB_VOCABULARY_SIZE,
            shape=[DMLAB_MAX_INSTRUCTION_LEN],
            dtype=np.int32,
        )

    self.benchmark_mode = benchmark_mode
    if self.benchmark_mode:
        log.warning('DmLab benchmark mode is true! Use this only for testing, not for actual training runs!')

    self.seed()
def multi_agent_match(policy_indices, max_num_episodes=int(1e9), max_num_frames=1e10):
    log.debug('Starting eval process with policies %r', policy_indices)
    for i, rival in enumerate(RIVALS):
        rival.policy_index = policy_indices[i]

    curr_dir = os.path.dirname(os.path.abspath(__file__))
    evaluation_filename = join(curr_dir, f'eval_{"vs".join([str(pi) for pi in policy_indices])}.txt')
    with open(evaluation_filename, 'w') as fobj:
        fobj.write('start\n')

    common_config = RIVALS[0].cfg

    render_action_repeat = common_config.render_action_repeat if common_config.render_action_repeat is not None else common_config.env_frameskip
    if render_action_repeat is None:
        log.warning('Not using action repeat!')
        render_action_repeat = 1
    log.debug('Using action repeat %d during evaluation', render_action_repeat)

    common_config.env_frameskip = 1  # for evaluation
    common_config.num_envs = 1
    common_config.timelimit = 4.0  # for faster evaluation

    def make_env_func(env_config):
        return create_env(ENV_NAME, cfg=common_config, env_config=env_config)

    env = make_env_func(AttrDict({'worker_index': 0, 'vector_index': 0}))
    env.seed(0)

    is_multiagent = is_multiagent_env(env)
    if not is_multiagent:
        env = MultiAgentWrapper(env)
    else:
        assert env.num_agents == len(RIVALS)

    device = torch.device('cuda')
    for rival in RIVALS:
        rival.actor_critic = create_actor_critic(rival.cfg, env.observation_space, env.action_space)
        rival.actor_critic.model_to_device(device)

        policy_id = rival.policy_index
        checkpoints = LearnerWorker.get_checkpoints(LearnerWorker.checkpoint_dir(rival.cfg, policy_id))
        checkpoint_dict = LearnerWorker.load_checkpoint(checkpoints, device)
        rival.actor_critic.load_state_dict(checkpoint_dict['model'])

    episode_rewards = []
    num_frames = 0

    last_render_start = time.time()

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    wins = [0 for _ in RIVALS]
    ties = 0
    frag_differences = []

    with torch.no_grad():
        for _ in range(max_num_episodes):
            obs = env.reset()
            obs_dict_torch = dict()

            done = [False] * len(obs)
            for rival in RIVALS:
                rival.rnn_states = torch.zeros([1, rival.cfg.hidden_size], dtype=torch.float32, device=device)

            episode_reward = 0
            prev_frame = time.time()

            while True:
                actions = []
                for i, obs_dict in enumerate(obs):
                    for key, x in obs_dict.items():
                        obs_dict_torch[key] = torch.from_numpy(x).to(device).float().view(1, *x.shape)

                    rival = RIVALS[i]
                    policy_outputs = rival.actor_critic(obs_dict_torch, rival.rnn_states)
                    rival.rnn_states = policy_outputs.rnn_states
                    actions.append(policy_outputs.actions[0].cpu().numpy())

                for _ in range(render_action_repeat):
                    if not NO_RENDER:
                        target_delay = 1.0 / FPS if FPS > 0 else 0
                        current_delay = time.time() - last_render_start
                        time_wait = target_delay - current_delay

                        if time_wait > 0:
                            # log.info('Wait time %.3f', time_wait)
                            time.sleep(time_wait)

                        last_render_start = time.time()
                        env.render()

                    obs, rew, done, infos = env.step(actions)

                    if all(done):
                        log.debug('Finished episode!')

                        frag_diff = infos[0]['PLAYER1_FRAGCOUNT'] - infos[0]['PLAYER2_FRAGCOUNT']
                        if frag_diff > 0:
                            wins[0] += 1
                        elif frag_diff < 0:
                            wins[1] += 1
                        else:
                            ties += 1

                        frag_differences.append(frag_diff)
                        avg_frag_diff = np.mean(frag_differences)

                        report = f'wins: {wins}, ties: {ties}, avg_frag_diff: {avg_frag_diff}'
                        with open(evaluation_filename, 'a') as fobj:
                            fobj.write(report + '\n')

                    # log.info('%d:%d', infos[0]['PLAYER1_FRAGCOUNT'], infos[0]['PLAYER2_FRAGCOUNT'])
                    episode_reward += np.mean(rew)
                    num_frames += 1

                    if num_frames % 100 == 0:
                        log.debug('%.1f', render_action_repeat / (time.time() - prev_frame))
                    prev_frame = time.time()

                    if all(done):
                        log.info('Episode finished at %d frames', num_frames)
                        break

                if all(done) or max_frames_reached(num_frames):
                    break

            if not NO_RENDER:
                env.render()
            time.sleep(0.01)

            episode_rewards.append(episode_reward)
            last_episodes = episode_rewards[-100:]
            avg_reward = sum(last_episodes) / len(last_episodes)
            log.info(
                'Episode reward: %f, avg reward for %d episodes: %f',
                episode_reward, len(last_episodes), avg_reward,
            )

            if max_frames_reached(num_frames):
                break

    env.close()
def enjoy(cfg, max_num_frames=1e9):
    cfg = load_from_checkpoint(cfg)

    render_action_repeat = cfg.render_action_repeat if cfg.render_action_repeat is not None else cfg.env_frameskip
    if render_action_repeat is None:
        log.warning('Not using action repeat!')
        render_action_repeat = 1
    log.debug('Using action repeat %d during evaluation', render_action_repeat)

    cfg.env_frameskip = 1  # for evaluation
    cfg.num_envs = 1

    def make_env_func(env_config):
        return create_env(cfg.env, cfg=cfg, env_config=env_config)

    env = make_env_func(AttrDict({'worker_index': 0, 'vector_index': 0}))
    # env.seed(0)

    is_multiagent = is_multiagent_env(env)
    if not is_multiagent:
        env = MultiAgentWrapper(env)

    if hasattr(env.unwrapped, 'reset_on_init'):
        # reset call ruins the demo recording for VizDoom
        env.unwrapped.reset_on_init = False

    actor_critic = create_actor_critic(cfg, env.observation_space, env.action_space)

    device = torch.device('cpu' if cfg.device == 'cpu' else 'cuda')
    actor_critic.model_to_device(device)

    policy_id = cfg.policy_index
    checkpoints = LearnerWorker.get_checkpoints(LearnerWorker.checkpoint_dir(cfg, policy_id))
    checkpoint_dict = LearnerWorker.load_checkpoint(checkpoints, device)
    actor_critic.load_state_dict(checkpoint_dict['model'])

    episode_rewards = [deque([], maxlen=100) for _ in range(env.num_agents)]
    true_rewards = [deque([], maxlen=100) for _ in range(env.num_agents)]
    num_frames = 0

    last_render_start = time.time()

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    obs = env.reset()
    rnn_states = torch.zeros([env.num_agents, get_hidden_size(cfg)], dtype=torch.float32, device=device)
    episode_reward = np.zeros(env.num_agents)
    finished_episode = [False] * env.num_agents

    with torch.no_grad():
        while not max_frames_reached(num_frames):
            obs_torch = AttrDict(transform_dict_observations(obs))
            for key, x in obs_torch.items():
                obs_torch[key] = torch.from_numpy(x).to(device).float()

            policy_outputs = actor_critic(obs_torch, rnn_states, with_action_distribution=True)

            # sample actions from the distribution by default
            actions = policy_outputs.actions

            action_distribution = policy_outputs.action_distribution
            if isinstance(action_distribution, ContinuousActionDistribution):
                if not cfg.continuous_actions_sample:  # TODO: add a similar option for discrete actions
                    actions = action_distribution.means

            actions = actions.cpu().numpy()

            rnn_states = policy_outputs.rnn_states

            for _ in range(render_action_repeat):
                if not cfg.no_render:
                    target_delay = 1.0 / cfg.fps if cfg.fps > 0 else 0
                    current_delay = time.time() - last_render_start
                    time_wait = target_delay - current_delay

                    if time_wait > 0:
                        # log.info('Wait time %.3f', time_wait)
                        time.sleep(time_wait)

                    last_render_start = time.time()
                    env.render()

                obs, rew, done, infos = env.step(actions)
                episode_reward += rew
                num_frames += 1

                for agent_i, done_flag in enumerate(done):
                    if done_flag:
                        finished_episode[agent_i] = True
                        episode_rewards[agent_i].append(episode_reward[agent_i])
                        true_rewards[agent_i].append(infos[agent_i].get('true_reward', episode_reward[agent_i]))
                        log.info(
                            'Episode finished for agent %d at %d frames. Reward: %.3f, true_reward: %.3f',
                            agent_i, num_frames, episode_reward[agent_i], true_rewards[agent_i][-1],
                        )
                        rnn_states[agent_i] = torch.zeros([get_hidden_size(cfg)], dtype=torch.float32, device=device)
                        episode_reward[agent_i] = 0

                # if episode terminated synchronously for all agents, pause a bit before starting a new one
                if all(done):
                    if not cfg.no_render:
                        env.render()
                    time.sleep(0.05)

                if all(finished_episode):
                    finished_episode = [False] * env.num_agents
                    avg_episode_rewards_str, avg_true_reward_str = '', ''
                    for agent_i in range(env.num_agents):
                        avg_rew = np.mean(episode_rewards[agent_i])
                        avg_true_rew = np.mean(true_rewards[agent_i])
                        if not np.isnan(avg_rew):
                            if avg_episode_rewards_str:
                                avg_episode_rewards_str += ', '
                            avg_episode_rewards_str += f'#{agent_i}: {avg_rew:.3f}'
                        if not np.isnan(avg_true_rew):
                            if avg_true_reward_str:
                                avg_true_reward_str += ', '
                            avg_true_reward_str += f'#{agent_i}: {avg_true_rew:.3f}'

                    log.info('Avg episode rewards: %s, true rewards: %s', avg_episode_rewards_str, avg_true_reward_str)
                    log.info(
                        'Avg episode reward: %.3f, avg true_reward: %.3f',
                        np.mean([np.mean(episode_rewards[i]) for i in range(env.num_agents)]),
                        np.mean([np.mean(true_rewards[i]) for i in range(env.num_agents)]),
                    )

                # VizDoom multiplayer stuff
                # for player in [1, 2, 3, 4, 5, 6, 7, 8]:
                #     key = f'PLAYER{player}_FRAGCOUNT'
                #     if key in infos[0]:
                #         log.debug('Score for player %d: %r', player, infos[0][key])

    env.close()

    return ExperimentStatus.SUCCESS, np.mean(episode_rewards)
def sample(self, proc_idx):
    # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    timing = Timing()

    psutil.Process().nice(10)

    num_envs = len(DMLAB30_LEVELS_THAT_USE_LEVEL_CACHE)
    assert self.cfg.num_workers % num_envs == 0, \
        f'should have an integer number of workers per env, e.g. {1 * num_envs}, {2 * num_envs}, etc...'
    assert self.cfg.num_envs_per_worker == 1, 'use populate_cache with 1 env per worker'

    with timing.timeit('env_init'):
        env_key = 'env'
        env_desired_num_levels = 0

        global_env_id = proc_idx * self.cfg.num_envs_per_worker
        env_config = AttrDict(worker_index=proc_idx, vector_index=0, env_id=global_env_id)
        env = create_env(self.cfg.env, cfg=self.cfg, env_config=env_config)
        env.seed(global_env_id)

        # this is to track the performance for individual DMLab levels
        if hasattr(env.unwrapped, 'level_name'):
            env_key = env.unwrapped.level_name
            env_level = env.unwrapped.level

            approx_num_episodes_per_1b_frames = DMLAB30_APPROX_NUM_EPISODES_PER_BILLION_FRAMES[env_key]
            num_billions = DESIRED_TRAINING_LENGTH / int(1e9)
            num_workers_for_env = self.cfg.num_workers // num_envs
            env_desired_num_levels = int((approx_num_episodes_per_1b_frames * num_billions) / num_workers_for_env)

            env_num_levels_generated = len(
                dmlab_level_cache.DMLAB_GLOBAL_LEVEL_CACHE[0].all_seeds[env_level]
            ) // num_workers_for_env

            log.warning('Worker %d (env %s) generated %d/%d levels!',
                        proc_idx, env_key, env_num_levels_generated, env_desired_num_levels)
            time.sleep(4)

        env.reset()
        env_uses_level_cache = env.unwrapped.env_uses_level_cache

        self.report_queue.put(dict(proc_idx=proc_idx, finished_reset=True))

    self.start_event.wait()

    try:
        with timing.timeit('work'):
            last_report = last_report_frames = total_env_frames = 0
            while not self.terminate.value and total_env_frames < self.cfg.sample_env_frames_per_worker:
                action = env.action_space.sample()
                with timing.add_time(f'{env_key}.step'):
                    env.step(action)

                total_env_frames += 1

                with timing.add_time(f'{env_key}.reset'):
                    env.reset()

                env_num_levels_generated += 1
                log.debug('Env %s done %d/%d resets', env_key, env_num_levels_generated, env_desired_num_levels)

                if env_num_levels_generated >= env_desired_num_levels:
                    log.debug('%s finished %d/%d resets, sleeping...', env_key, env_num_levels_generated, env_desired_num_levels)
                    time.sleep(30)  # free up CPU time for other envs

                # if the env does not use the level cache, there is no need to run it;
                # let other workers proceed
                if not env_uses_level_cache:
                    log.debug('Env %s does not require cache, sleeping...', env_key)
                    time.sleep(200)

                with timing.add_time('report'):
                    now = time.time()
                    if now - last_report > self.report_every_sec:
                        last_report = now
                        frames_since_last_report = total_env_frames - last_report_frames
                        last_report_frames = total_env_frames
                        self.report_queue.put(dict(proc_idx=proc_idx, env_frames=frames_since_last_report))

                if get_free_disk_space_mb(self.cfg) < 3 * 1024:
                    log.error('Not enough disk space! %d', get_free_disk_space_mb(self.cfg))
                    time.sleep(200)
    except:
        log.exception('Unknown exception')
        log.error('Unknown exception in worker %d, terminating...', proc_idx)
        self.report_queue.put(dict(proc_idx=proc_idx, crash=True))

    time.sleep(proc_idx * 0.1 + 0.1)
    log.info('Process %d finished sampling. Timing: %s', proc_idx, timing)

    env.close()
def __init__(
        self, action_space, config_file, coord_limits=None, max_histogram_length=200,
        show_automap=False, skip_frames=1, async_mode=False, record_to=None,
):
    self.initialized = False

    # essential game data
    self.game = None
    self.state = None
    self.curr_seed = 0
    self.rng = None
    self.skip_frames = skip_frames
    self.async_mode = async_mode

    # optional - for topdown view rendering and visitation heatmaps
    self.show_automap = show_automap
    self.coord_limits = coord_limits

    # can be adjusted after the environment is created (but before any reset() call) via observation space wrapper
    self.screen_w, self.screen_h, self.channels = 640, 480, 3
    self.screen_resolution = ScreenResolution.RES_640X480
    self.calc_observation_space()

    self.black_screen = None

    # provided as a part of environment definition, since these depend on the scenario and
    # can be quite complex multi-discrete spaces
    self.action_space = action_space
    self.composite_action_space = hasattr(self.action_space, 'spaces')

    self.delta_actions_scaling_factor = 7.5

    if os.path.isabs(config_file):
        self.config_path = config_file
    else:
        scenarios_dir = join(os.path.dirname(__file__), 'scenarios')
        self.config_path = join(scenarios_dir, config_file)
        if not os.path.isfile(self.config_path):
            log.warning(
                'File %s not found in scenarios dir %s. Consider providing an absolute path?',
                config_file, scenarios_dir,
            )

    self.variable_indices = self._parse_variable_indices(self.config_path)

    # only created if we call the render() method
    self.viewer = None

    # record full episodes using VizDoom recording functionality
    self.record_to = record_to
    self.is_multiplayer = False  # overridden in derived classes

    # (optional) histogram to track positional coverage
    # do not pass coord_limits if you don't need this, to avoid extra calculation
    self.max_histogram_length = max_histogram_length
    self.current_histogram, self.previous_histogram = None, None
    if self.coord_limits:
        x = self.coord_limits[2] - self.coord_limits[0]
        y = self.coord_limits[3] - self.coord_limits[1]
        if x > y:
            len_x = self.max_histogram_length
            len_y = int((y / x) * self.max_histogram_length)
        else:
            len_x = int((x / y) * self.max_histogram_length)
            len_y = self.max_histogram_length
        self.current_histogram = np.zeros((len_x, len_y), dtype=np.int32)
        self.previous_histogram = np.zeros_like(self.current_histogram)

    # helpers for human play with pynput keyboard input
    self._terminate = False
    self._current_actions = []
    self._actions_flattened = None

    self._prev_info = None
    self._last_episode_info = None

    self._num_episodes = 0

    self.mode = 'algo'

    self.seed()
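# Sketch of the histogram sizing above: the longer world dimension maps to
# max_histogram_length and the other scales proportionally, preserving the aspect
# ratio. (The coord_limits values are arbitrary examples: x0, y0, x1, y1.)
coord_limits = (0, 0, 1600, 800)
max_len = 200
x = coord_limits[2] - coord_limits[0]
y = coord_limits[3] - coord_limits[1]
if x > y:
    len_x, len_y = max_len, int((y / x) * max_len)
else:
    len_x, len_y = int((x / y) * max_len), max_len
assert (len_x, len_y) == (200, 100)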