def init_workers(self): """ Initialize all types of workers and start their worker processes. """ actor_queues = [MpQueue(2 * 1000 * 1000) for _ in range(self.cfg.num_workers)] policy_worker_queues = dict() for policy_id in range(self.cfg.num_policies): policy_worker_queues[policy_id] = [] for i in range(self.cfg.policy_workers_per_policy): policy_worker_queues[policy_id].append(TorchJoinableQueue()) log.info('Initializing learners...') policy_locks = [multiprocessing.Lock() for _ in range(self.cfg.num_policies)] resume_experience_collection_cv = [multiprocessing.Condition() for _ in range(self.cfg.num_policies)] learner_idx = 0 for policy_id in range(self.cfg.num_policies): learner_worker = LearnerWorker( learner_idx, policy_id, self.cfg, self.obs_space, self.action_space, self.report_queue, policy_worker_queues[policy_id], self.traj_buffers, policy_locks[policy_id], resume_experience_collection_cv[policy_id], ) learner_worker.start_process() learner_worker.init() self.learner_workers[policy_id] = learner_worker learner_idx += 1 log.info('Initializing policy workers...') for policy_id in range(self.cfg.num_policies): self.policy_workers[policy_id] = [] policy_queue = MpQueue() self.policy_queues[policy_id] = policy_queue for i in range(self.cfg.policy_workers_per_policy): policy_worker = PolicyWorker( i, policy_id, self.cfg, self.obs_space, self.action_space, self.traj_buffers, policy_queue, actor_queues, self.report_queue, policy_worker_queues[policy_id][i], policy_locks[policy_id], resume_experience_collection_cv[policy_id], ) self.policy_workers[policy_id].append(policy_worker) policy_worker.start_process() log.info('Initializing actors...') # We support actor worker initialization in groups, which can be useful for some envs that # e.g. crash when too many environments are being initialized in parallel. # Currently the limit is not used since it is not required for any envs supported out of the box, # so we parallelize initialization as hard as we can. # If this is required for your environment, perhaps a better solution would be to use global locks, # like FileLock (see doom_gym.py) self.actor_workers = [] max_parallel_init = int(1e9) # might be useful to limit this for some envs worker_indices = list(range(self.cfg.num_workers)) for i in range(0, self.cfg.num_workers, max_parallel_init): workers = self.init_subset(worker_indices[i:i + max_parallel_init], actor_queues) self.actor_workers.extend(workers)
def enjoy(cfg, max_num_frames=1e9):
    cfg = load_from_checkpoint(cfg)

    render_action_repeat = cfg.render_action_repeat if cfg.render_action_repeat is not None else cfg.env_frameskip
    if render_action_repeat is None:
        log.warning('Not using action repeat!')
        render_action_repeat = 1
    log.debug('Using action repeat %d during evaluation', render_action_repeat)

    cfg.env_frameskip = 1  # for evaluation
    cfg.num_envs = 1

    if cfg.record_to:
        tstamp = datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S')
        cfg.record_to = join(cfg.record_to, f'{cfg.experiment}', tstamp)
        if not os.path.isdir(cfg.record_to):
            os.makedirs(cfg.record_to)
    else:
        cfg.record_to = None

    def make_env_func(env_config):
        return create_env(cfg.env, cfg=cfg, env_config=env_config)

    env = make_env_func(AttrDict({'worker_index': 0, 'vector_index': 0}))
    # env.seed(0)

    is_multiagent = is_multiagent_env(env)
    if not is_multiagent:
        env = MultiAgentWrapper(env)

    if hasattr(env.unwrapped, 'reset_on_init'):
        # reset call ruins the demo recording for VizDoom
        env.unwrapped.reset_on_init = False

    actor_critic = create_actor_critic(cfg, env.observation_space, env.action_space)

    device = torch.device('cpu' if cfg.device == 'cpu' else 'cuda')
    actor_critic.model_to_device(device)

    policy_id = cfg.policy_index
    checkpoints = LearnerWorker.get_checkpoints(LearnerWorker.checkpoint_dir(cfg, policy_id))
    checkpoint_dict = LearnerWorker.load_checkpoint(checkpoints, device)
    actor_critic.load_state_dict(checkpoint_dict['model'])

    episode_rewards = [deque([], maxlen=100) for _ in range(env.num_agents)]
    true_rewards = [deque([], maxlen=100) for _ in range(env.num_agents)]
    num_frames = 0

    last_render_start = time.time()

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    obs = env.reset()
    rnn_states = torch.zeros([env.num_agents, get_hidden_size(cfg)], dtype=torch.float32, device=device)
    episode_reward = np.zeros(env.num_agents)
    finished_episode = [False] * env.num_agents

    with torch.no_grad():
        while not max_frames_reached(num_frames):
            obs_torch = AttrDict(transform_dict_observations(obs))
            for key, x in obs_torch.items():
                obs_torch[key] = torch.from_numpy(x).to(device).float()

            policy_outputs = actor_critic(obs_torch, rnn_states, with_action_distribution=True)

            # sample actions from the distribution by default
            actions = policy_outputs.actions

            action_distribution = policy_outputs.action_distribution
            if isinstance(action_distribution, ContinuousActionDistribution):
                if not cfg.continuous_actions_sample:  # TODO: add similar option for discrete actions
                    actions = action_distribution.means

            actions = actions.cpu().numpy()

            rnn_states = policy_outputs.rnn_states

            for _ in range(render_action_repeat):
                if not cfg.no_render:
                    target_delay = 1.0 / cfg.fps if cfg.fps > 0 else 0
                    current_delay = time.time() - last_render_start
                    time_wait = target_delay - current_delay
                    if time_wait > 0:
                        # log.info('Wait time %.3f', time_wait)
                        time.sleep(time_wait)
                    last_render_start = time.time()
                    env.render()

                obs, rew, done, infos = env.step(actions)
                episode_reward += rew
                num_frames += 1

                for agent_i, done_flag in enumerate(done):
                    if done_flag:
                        finished_episode[agent_i] = True
                        episode_rewards[agent_i].append(episode_reward[agent_i])
                        true_rewards[agent_i].append(infos[agent_i].get('true_reward', math.nan))
                        log.info(
                            'Episode finished for agent %d at %d frames. Reward: %.3f, true_reward: %.3f',
                            agent_i, num_frames, episode_reward[agent_i], true_rewards[agent_i][-1],
                        )
                        rnn_states[agent_i] = torch.zeros([get_hidden_size(cfg)], dtype=torch.float32, device=device)
                        episode_reward[agent_i] = 0

                # if episode terminated synchronously for all agents, pause a bit before starting a new one
                if all(done):
                    if not cfg.no_render:
                        env.render()
                    time.sleep(0.05)

                if all(finished_episode):
                    finished_episode = [False] * env.num_agents
                    avg_episode_rewards_str, avg_true_reward_str = '', ''
                    for agent_i in range(env.num_agents):
                        avg_rew = np.mean(episode_rewards[agent_i])
                        avg_true_rew = np.mean(true_rewards[agent_i])
                        if not np.isnan(avg_rew):
                            if avg_episode_rewards_str:
                                avg_episode_rewards_str += ', '
                            avg_episode_rewards_str += f'#{agent_i}: {avg_rew:.3f}'
                        if not np.isnan(avg_true_rew):
                            if avg_true_reward_str:
                                avg_true_reward_str += ', '
                            avg_true_reward_str += f'#{agent_i}: {avg_true_rew:.3f}'
                    log.info('Avg episode rewards: %s, true rewards: %s', avg_episode_rewards_str, avg_true_reward_str)
                    log.info(
                        'Avg episode reward: %.3f, avg true_reward: %.3f',
                        np.mean([np.mean(episode_rewards[i]) for i in range(env.num_agents)]),
                        np.mean([np.mean(true_rewards[i]) for i in range(env.num_agents)]),
                    )

                # VizDoom multiplayer stuff
                # for player in [1, 2, 3, 4, 5, 6, 7, 8]:
                #     key = f'PLAYER{player}_FRAGCOUNT'
                #     if key in infos[0]:
                #         log.debug('Score for player %d: %r', player, infos[0][key])

    env.close()

    return ExperimentStatus.SUCCESS, np.mean(episode_rewards)

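# A minimal sketch of how the (status, avg_reward) pair returned by enjoy() might be consumed
# from an evaluation entry point. `parse_eval_cfg()` is a hypothetical stand-in for whatever
# produces the cfg object that load_from_checkpoint() expects; it is not part of this codebase.
import sys


def main():
    cfg = parse_eval_cfg()  # hypothetical helper: experiment dir, policy_index, fps, render flags, ...
    status, avg_reward = enjoy(cfg)
    log.info('Evaluation finished with status %r, avg reward %.3f', status, avg_reward)
    return status


if __name__ == '__main__':
    sys.exit(main())
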
def multi_agent_match(policy_indices, max_num_episodes=int(1e9), max_num_frames=1e10):
    log.debug('Starting eval process with policies %r', policy_indices)
    for i, rival in enumerate(RIVALS):
        rival.policy_index = policy_indices[i]

    curr_dir = os.path.dirname(os.path.abspath(__file__))
    evaluation_filename = join(curr_dir, f'eval_{"vs".join([str(pi) for pi in policy_indices])}.txt')
    with open(evaluation_filename, 'w') as fobj:
        fobj.write('start\n')

    common_config = RIVALS[0].cfg

    render_action_repeat = common_config.render_action_repeat if common_config.render_action_repeat is not None else common_config.env_frameskip
    if render_action_repeat is None:
        log.warning('Not using action repeat!')
        render_action_repeat = 1
    log.debug('Using action repeat %d during evaluation', render_action_repeat)

    common_config.env_frameskip = 1  # for evaluation
    common_config.num_envs = 1
    common_config.timelimit = 4.0  # for faster evaluation

    def make_env_func(env_config):
        return create_env(ENV_NAME, cfg=common_config, env_config=env_config)

    env = make_env_func(AttrDict({'worker_index': 0, 'vector_index': 0}))
    env.seed(0)

    is_multiagent = is_multiagent_env(env)
    if not is_multiagent:
        env = MultiAgentWrapper(env)
    else:
        assert env.num_agents == len(RIVALS)

    device = torch.device('cuda')
    for rival in RIVALS:
        rival.actor_critic = create_actor_critic(rival.cfg, env.observation_space, env.action_space)
        rival.actor_critic.model_to_device(device)

        policy_id = rival.policy_index
        checkpoints = LearnerWorker.get_checkpoints(LearnerWorker.checkpoint_dir(rival.cfg, policy_id))
        checkpoint_dict = LearnerWorker.load_checkpoint(checkpoints, device)
        rival.actor_critic.load_state_dict(checkpoint_dict['model'])

    episode_rewards = []
    num_frames = 0

    last_render_start = time.time()

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    wins = [0 for _ in RIVALS]
    ties = 0
    frag_differences = []

    with torch.no_grad():
        for _ in range(max_num_episodes):
            obs = env.reset()
            obs_dict_torch = dict()

            done = [False] * len(obs)
            for rival in RIVALS:
                rival.rnn_states = torch.zeros([1, rival.cfg.hidden_size], dtype=torch.float32, device=device)

            episode_reward = 0
            prev_frame = time.time()

            while True:
                actions = []
                for i, obs_dict in enumerate(obs):
                    for key, x in obs_dict.items():
                        obs_dict_torch[key] = torch.from_numpy(x).to(device).float().view(1, *x.shape)

                    rival = RIVALS[i]
                    policy_outputs = rival.actor_critic(obs_dict_torch, rival.rnn_states)
                    rival.rnn_states = policy_outputs.rnn_states
                    actions.append(policy_outputs.actions[0].cpu().numpy())

                for _ in range(render_action_repeat):
                    if not NO_RENDER:
                        target_delay = 1.0 / FPS if FPS > 0 else 0
                        current_delay = time.time() - last_render_start
                        time_wait = target_delay - current_delay
                        if time_wait > 0:
                            # log.info('Wait time %.3f', time_wait)
                            time.sleep(time_wait)
                        last_render_start = time.time()
                        env.render()

                    obs, rew, done, infos = env.step(actions)
                    if all(done):
                        log.debug('Finished episode!')

                        frag_diff = infos[0]['PLAYER1_FRAGCOUNT'] - infos[0]['PLAYER2_FRAGCOUNT']
                        if frag_diff > 0:
                            wins[0] += 1
                        elif frag_diff < 0:
                            wins[1] += 1
                        else:
                            ties += 1

                        frag_differences.append(frag_diff)
                        avg_frag_diff = np.mean(frag_differences)
                        report = f'wins: {wins}, ties: {ties}, avg_frag_diff: {avg_frag_diff}'
                        with open(evaluation_filename, 'a') as fobj:
                            fobj.write(report + '\n')

                    # log.info('%d:%d', infos[0]['PLAYER1_FRAGCOUNT'], infos[0]['PLAYER2_FRAGCOUNT'])
                    episode_reward += np.mean(rew)
                    num_frames += 1

                    if num_frames % 100 == 0:
                        log.debug('%.1f', render_action_repeat / (time.time() - prev_frame))
                    prev_frame = time.time()

                    if all(done):
                        log.info('Episode finished at %d frames', num_frames)
                        break

                if all(done) or max_frames_reached(num_frames):
                    break

                if not NO_RENDER:
                    env.render()
                time.sleep(0.01)

            episode_rewards.append(episode_reward)
            last_episodes = episode_rewards[-100:]
            avg_reward = sum(last_episodes) / len(last_episodes)
            log.info(
                'Episode reward: %f, avg reward for %d episodes: %f',
                episode_reward, len(last_episodes), avg_reward,
            )

            if max_frames_reached(num_frames):
                break

    env.close()

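# multi_agent_match() evaluates a single fixed pairing of RIVALS; a simple round-robin driver
# would call it once per pair of policy indices. A sketch assuming `num_policies` trained
# policies exist and that each match is safe to run in its own process (the function mutates
# global RIVALS state and initializes CUDA); these assumptions are illustrative, not part of the
# original script.
from multiprocessing import Process


def run_round_robin(num_policies, episodes_per_match=10):
    for p1 in range(num_policies):
        for p2 in range(p1 + 1, num_policies):
            # one process per match keeps CUDA contexts and RIVALS state isolated between matches
            proc = Process(target=multi_agent_match, args=([p1, p2], episodes_per_match))
            proc.start()
            proc.join()
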
def enjoy(cfg, max_num_episodes=1000000, max_num_frames=1e9):
    cfg = load_from_checkpoint(cfg)

    render_action_repeat = cfg.render_action_repeat if cfg.render_action_repeat is not None else cfg.env_frameskip
    if render_action_repeat is None:
        log.warning('Not using action repeat!')
        render_action_repeat = 1
    log.debug('Using action repeat %d during evaluation', render_action_repeat)

    cfg.env_frameskip = 1  # for evaluation
    cfg.num_envs = 1

    if cfg.record_to:
        tstamp = datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S')
        cfg.record_to = join(cfg.record_to, f'{cfg.experiment}', tstamp)
        if not os.path.isdir(cfg.record_to):
            os.makedirs(cfg.record_to)
    else:
        cfg.record_to = None

    def make_env_func(env_config):
        return create_env(cfg.env, cfg=cfg, env_config=env_config)

    env = make_env_func(AttrDict({'worker_index': 0, 'vector_index': 0}))
    env.seed(0)

    is_multiagent = hasattr(env, 'num_agents') and env.num_agents > 1
    if not is_multiagent:
        env = MultiAgentWrapper(env)

    if hasattr(env.unwrapped, 'reset_on_init'):
        # reset call ruins the demo recording for VizDoom
        env.unwrapped.reset_on_init = False

    actor_critic = create_actor_critic(cfg, env.observation_space, env.action_space)

    device = torch.device('cpu' if cfg.device == 'cpu' else 'cuda')
    actor_critic.model_to_device(device)

    policy_id = cfg.policy_index
    checkpoints = LearnerWorker.get_checkpoints(LearnerWorker.checkpoint_dir(cfg, policy_id))
    checkpoint_dict = LearnerWorker.load_checkpoint(checkpoints, device)
    actor_critic.load_state_dict(checkpoint_dict['model'])

    episode_rewards = []
    true_rewards = deque([], maxlen=100)
    num_frames = 0

    last_render_start = time.time()

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    obs = env.reset()

    with torch.no_grad():
        for _ in range(max_num_episodes):
            done = [False] * len(obs)
            rnn_states = torch.zeros([env.num_agents, get_hidden_size(cfg)], dtype=torch.float32, device=device)
            episode_reward = 0

            while True:
                obs_torch = AttrDict(transform_dict_observations(obs))
                for key, x in obs_torch.items():
                    obs_torch[key] = torch.from_numpy(x).to(device).float()

                policy_outputs = actor_critic(obs_torch, rnn_states, with_action_distribution=True)

                action_distribution = policy_outputs.action_distribution

                # sample actions from the distribution by default
                actions = policy_outputs.actions
                if isinstance(action_distribution, ContinuousActionDistribution):
                    if not cfg.continuous_actions_sample:  # TODO: add similar option for discrete actions
                        actions = action_distribution.means

                actions = actions.cpu().numpy()

                rnn_states = policy_outputs.rnn_states

                for _ in range(render_action_repeat):
                    if not cfg.no_render:
                        target_delay = 1.0 / cfg.fps if cfg.fps > 0 else 0
                        current_delay = time.time() - last_render_start
                        time_wait = target_delay - current_delay
                        if time_wait > 0:
                            # log.info('Wait time %.3f', time_wait)
                            time.sleep(time_wait)
                        last_render_start = time.time()
                        env.render()

                    obs, rew, done, infos = env.step(actions)
                    episode_reward += np.mean(rew)
                    num_frames += 1

                    if all(done):
                        true_rewards.append(infos[0].get('true_reward', math.nan))
                        log.info('Episode finished at %d frames', num_frames)
                        if not math.isnan(np.mean(true_rewards)):
                            log.info('true rew %.3f avg true rew %.3f', true_rewards[-1], np.mean(true_rewards))

                        # VizDoom multiplayer stuff
                        # for player in [1, 2, 3, 4, 5, 6, 7, 8]:
                        #     key = f'PLAYER{player}_FRAGCOUNT'
                        #     if key in infos[0]:
                        #         log.debug('Score for player %d: %r', player, infos[0][key])
                        break

                if all(done) or max_frames_reached(num_frames):
                    break

                if not cfg.no_render:
                    env.render()
                time.sleep(0.01)

            episode_rewards.append(episode_reward)
            last_episodes = episode_rewards[-100:]
            avg_reward = sum(last_episodes) / len(last_episodes)
            log.info(
                'Episode reward: %f, avg reward for %d episodes: %f',
                episode_reward, len(last_episodes), avg_reward,
            )

            if max_frames_reached(num_frames):
                break

    env.close()

    return ExperimentStatus.SUCCESS, np.mean(episode_rewards)

def init_workers(self):
    actor_queues = [faster_fifo.Queue() for _ in range(self.cfg.num_workers)]

    policy_worker_queues = dict()
    for policy_id in range(self.cfg.num_policies):
        policy_worker_queues[policy_id] = []
        for i in range(self.cfg.policy_workers_per_policy):
            policy_worker_queues[policy_id].append(TorchJoinableQueue())

    log.info('Initializing learners...')
    policy_locks = [multiprocessing.Lock() for _ in range(self.cfg.num_policies)]
    resume_experience_collection_cv = [multiprocessing.Condition() for _ in range(self.cfg.num_policies)]

    learner_idx = 0
    for policy_id in range(self.cfg.num_policies):
        learner_worker = LearnerWorker(
            learner_idx, policy_id, self.cfg, self.obs_space, self.action_space, self.report_queue,
            policy_worker_queues[policy_id], self.traj_buffers,
            policy_locks[policy_id], resume_experience_collection_cv[policy_id],
        )
        learner_worker.start_process()
        learner_worker.init()
        self.learner_workers[policy_id] = learner_worker
        learner_idx += 1

    log.info('Initializing policy workers...')
    for policy_id in range(self.cfg.num_policies):
        self.policy_workers[policy_id] = []
        policy_queue = faster_fifo.Queue()
        self.policy_queues[policy_id] = policy_queue
        for i in range(self.cfg.policy_workers_per_policy):
            policy_worker = PolicyWorker(
                i, policy_id, self.cfg, self.obs_space, self.action_space, self.traj_buffers,
                policy_queue, actor_queues, self.report_queue, policy_worker_queues[policy_id][i],
                policy_locks[policy_id], resume_experience_collection_cv[policy_id],
            )
            self.policy_workers[policy_id].append(policy_worker)
            policy_worker.start_process()

    log.info('Initializing actors...')
    self.actor_workers = []
    max_parallel_init = int(1e9)  # might be useful to limit this for some envs
    worker_indices = list(range(self.cfg.num_workers))
    for i in range(0, self.cfg.num_workers, max_parallel_init):
        workers = self.init_subset(worker_indices[i:i + max_parallel_init], actor_queues)
        self.actor_workers.extend(workers)

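# This variant uses faster_fifo.Queue for the actor and policy queues, whose batched reads let a
# consumer drain many small messages per wakeup instead of paying one lock round-trip per item.
# A minimal consumer sketch, assuming the `faster-fifo` package; the timeout value and message
# handling are illustrative assumptions, not taken from this codebase.
import queue

import faster_fifo


def drain_reports(report_queue: faster_fifo.Queue):
    # get_many() returns a list of messages in a single call; queue.Empty is raised if nothing
    # arrives within the timeout.
    try:
        msgs = report_queue.get_many(timeout=0.1)
    except queue.Empty:
        msgs = []
    for msg in msgs:
        log.debug('report: %r', msg)
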