def play_with_car():
    maximum_steps_allowed = 250
    env = TimeLimit(MountainCarEnv(), max_episode_steps=maximum_steps_allowed + 1)
    actions = {'left': 0, 'stop': 1, 'right': 2}
    initial_state = env.reset()
    print('Initial state: ', initial_state)
    for t in range(maximum_steps_allowed):
        # need to modify policy
        if t < 50:
            s, r, done, _ = env.step(actions['left'])
        elif t < 70:
            s, r, done, _ = env.step(actions['right'])
        elif t < 120:
            s, r, done, _ = env.step(actions['left'])
        else:
            s, r, done, _ = env.step(actions['right'])
        print('State {}, Reward {}, Step {}'.format(s, r, t))
        env.render()
        if done:
            # The TimeLimit budget (251) exceeds the loop length (250), so
            # `done` here can only mean the car reached the goal.
            if s[0] > 0.47:
                print('Well done!')
            else:
                print('Please, try again.')
            break
    else:
        # for/else: runs only if the loop finished without `break`.
        print('Time is up. Please, try again.')
    env.close()
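# A minimal sketch (not from the original snippet) of the TimeLimit behavior
# `play_with_car` relies on: gym's TimeLimit forces `done=True` once
# `max_episode_steps` steps have elapsed, which is why the wrapper above is
# given one step more than the 250-step policy budget.
def _timelimit_truncation_demo():
    from gym.wrappers import TimeLimit
    from gym.envs.classic_control import MountainCarEnv

    env = TimeLimit(MountainCarEnv(), max_episode_steps=3)
    env.reset()
    done = False
    for _ in range(3):
        _, _, done, _ = env.step(0)  # keep pushing left
    assert done  # truncated by TimeLimit, not by reaching the goal
    env.close()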
def test_basics():
    env = TimeLimit(gym.make("CartPole-v0"), max_episode_steps=10)
    env = EnvDataset(env)
    env = EpisodeLimit(env, max_episodes=3)
    env.seed(123)
    for episode in range(3):
        obs = env.reset()
        done = False
        step = 0
        while not done:
            print(f"step {step}")
            obs, reward, done, info = env.step(env.action_space.sample())
            step += 1
    assert env.is_closed()
    with pytest.raises(gym.error.ClosedEnvironmentError):
        _ = env.reset()
    with pytest.raises(gym.error.ClosedEnvironmentError):
        _ = env.step(env.action_space.sample())
    with pytest.raises(gym.error.ClosedEnvironmentError):
        for _ in env:
            break
def test_task_schedule_monsterkong():
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"level": 0},
            100: {"level": 1},
            200: {"level": 2},
            300: {"level": 3},
            400: {"level": 4},
        },
        add_task_id_to_obs=True,
    )
    obs = env.reset()
    # img, task_labels = obs
    assert obs[1] == 0
    assert env.get_level() == 0
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == i // 100
        assert env.level == i // 100
        env.render()
        assert isinstance(done, bool)
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()
    assert obs[1] == 4
    assert env.level == 4
    # level stays the same even after reaching that objective.
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == 4
        assert env.level == 4
        env.render()
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()
    env.close()
def play_one_session(
    env: TimeLimit,
    max_size: int,
    action_chooser: Callable[[TimeLimit, Any], Any],
    render: bool = False,
    custom_actions: Callable[[int, TimeLimit, Any, Any, Any, bool, Any], None] = None,
    stop_when_done: bool = True,
) -> Tuple[float, List[Dict[str, Any]]]:
    observation = env.reset()
    score = 0
    history = []
    for i in range(max_size):
        if render:
            env.render()
        action = action_chooser(env, observation)
        current_iteration_history = {"observation": observation, "action": action}
        observation, reward, done, info = env.step(action.reshape((-1,)))
        score += reward
        history.append(current_iteration_history)
        if custom_actions is not None:
            custom_actions(i, env, action, observation, reward, done, info)
        if stop_when_done and done:
            break
    return score / max_size, history
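# Hypothetical usage of `play_one_session` (the helper below is illustrative,
# not from the original source). The action is returned as a numpy array so
# that the `action.reshape((-1,))` call in the loop above works.
import numpy as np

def random_action_chooser(env, observation):
    # ignore the observation and sample uniformly from the action space
    return np.asarray(env.action_space.sample())

# mean_reward, history = play_one_session(env, max_size=200,
#                                         action_chooser=random_action_chooser)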
def play(env_name: str, manual_control: bool, max_steps: int):
    # Make environment
    env = TimeLimit(gym.make(env_name, render=True), max_steps)
    observation = env.reset()

    if manual_control:
        # Create user debug interface
        import pybullet as p
        params = [
            p.addUserDebugParameter(
                p.getJointInfo(env.robot_id, j)[1].decode(), -1, 1, 0)
            for j in env.joint_list
        ]

    reward_sum = 0
    # NOTE: this loop runs until the process is interrupted, so the final
    # `env.close()` is never reached.
    while True:
        if manual_control:
            # Read user input and simulate motor
            a = [p.readUserDebugParameter(param) for param in params]
        else:
            a = env.action_space.sample()

        observation, reward, done, _ = env.step(a)
        reward_sum += reward
        print("\nobservation", observation)
        print("reward", reward)
        print("total reward", reward_sum)
        print("done", done)

        # Reset when done
        if done:
            observation = env.reset()
            reward_sum = 0
    env.close()
def test_random_task_on_each_episode_and_only_one_task_in_schedule():
    """BUG: When the goal is to have only one task, it instead keeps sampling a
    new task from the 'distribution', in the case of cartpole!
    """
    env = gym.make("CartPole-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"length": 0.1},
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
    task_labels = []
    lengths = []
    for i in range(10):
        obs = env.reset()
        task_labels.append(obs[1])
        lengths.append(env.length)
        done = False
        while not done:
            obs, reward, done, info = env.step(env.action_space.sample())
            task_labels.append(obs[1])
            lengths.append(env.length)
    assert set(task_labels) == {0}
    assert set(lengths) == {0.1}
def test_random_task_on_each_episode():
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"level": 0},
            5: {"level": 1},
            200: {"level": 2},
            300: {"level": 3},
            400: {"level": 4},
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
    task_labels = []
    for i in range(10):
        obs = env.reset()
        task_labels.append(obs["task_labels"])
    assert len(set(task_labels)) > 1

    # Episodes only last 10 steps. Tasks don't have anything to do with the
    # task schedule.
    obs = env.reset()
    start_task_label = obs["task_labels"]
    for i in range(10):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs["task_labels"] == start_task_label
        if i == 9:
            assert done
        else:
            assert not done
    env.close()
def test_task_schedule_with_callables():
    """Apply functions to the env at a given step."""
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    from operator import methodcaller
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: methodcaller("set_level", 0),
            100: methodcaller("set_level", 1),
            200: methodcaller("set_level", 2),
            300: methodcaller("set_level", 3),
            400: methodcaller("set_level", 4),
        },
        add_task_id_to_obs=True,
    )
    obs = env.reset()
    # img, task_labels = obs
    assert obs[1] == 0
    assert env.get_level() == 0
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == i // 100
        assert env.level == i // 100
        env.render()
        assert isinstance(done, bool)
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()
    assert obs[1] == 4
    assert env.level == 4
    # level stays the same even after reaching that objective.
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == 4
        assert env.level == 4
        env.render()
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()
def test_change_mass_each_step(self):
    # NOTE: renamed from `test_change_gravity_each_step`: this test varies the
    # mass of a body part, not gravity.
    env: ModifiedMassEnv = self.Environment()
    max_episode_steps = 500
    n_episodes = 5
    # NOTE: Interestingly, the renderer will show
    # `env.frame_skip * max_episode_steps` frames per episode, even when
    # "Ren[d]er every frame" is set to False.
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    env: ModifiedMassEnv
    total_steps = 0
    for episode in range(n_episodes):
        initial_state = env.reset()
        done = False
        episode_steps = 0
        start_y = initial_state[1]
        moved_up = 0
        previous_state = initial_state
        state = initial_state
        body_part = self.body_names[0]
        start_mass = env.get_mass(body_part)
        while not done:
            previous_state = state
            state, reward, done, info = env.step(env.action_space.sample())
            env.render("human")
            episode_steps += 1
            total_steps += 1
            # Increase the mass of the body part continually over time.
            env.set_mass(body_part=body_part,
                         mass=start_mass + 5 * total_steps / max_episode_steps)
            moved_up += (state[1] > previous_state[1])
            # print(f"Moving upward? {obs[1] > state[1]}")
        print(f"Mass of {body_part} at end of episode: {env.get_mass(body_part)}")
    # TODO: Check that the position (in the observation) is obeying gravity?
    # if env.gravity <= 0:
    #     # Downward force, so should not have any significant preference for
    #     # moving up vs moving down.
    #     assert 0.4 <= (moved_up / max_episode_steps) <= 0.6, env.gravity
    #     # if env.gravity == 0:
    #     #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0
    # if env.gravity > 0:
    #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0, env.gravity
    assert total_steps == n_episodes * max_episode_steps
    initial_z = env.init_qpos[1]
    final_z = env.sim.data.qpos[1]
    assert initial_z == 0
    # Check that the robot is high up in the sky! :D
    assert final_z > 20
class BaseTestRotMAB:
    """Base test class for RotMAB environment."""

    def __init__(self, winning_probs, max_steps):
        """Initialize test class."""
        self.winning_probs = winning_probs
        self.max_steps = max_steps
        self.env = TimeLimit(
            NonMarkovianRotatingMAB(winning_probs=self.winning_probs),
            max_episode_steps=self.max_steps,
        )

    def test_action_space(self):
        """Test action spaces."""
        assert self.env.action_space == Discrete(len(self.winning_probs))

    def test_observation_space(self):
        """Test observation spaces."""
        assert self.env.observation_space == Discrete(2)

    def test_interaction(self):
        """Test interaction with Rotating MAB."""
        self.env.seed()
        state = self.env.reset()
        assert state == 0

        def assert_consistency(obs, reward):
            """Assert obs == 1 iff reward == 1."""
            positive_reward = reward > 0.0
            positive_obs = obs == 1
            assert (positive_reward and positive_obs
                    or (not positive_reward and not positive_obs))

        for _i in range(self.max_steps - 1):
            action = self.env.action_space.sample()
            obs, reward, done, info = self.env.step(action)
            assert_consistency(obs, reward)
            assert not done

        # last action
        obs, reward, done, info = self.env.step(0)
        assert_consistency(obs, reward)
        assert done
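# A hedged sketch of how this base class might be parametrized; the arm
# probabilities and horizon below are illustrative, not from the source.
class TestRotMABTwoArms(BaseTestRotMAB):
    """RotMAB tests with two arms and a short horizon."""

    def __init__(self):
        super().__init__(winning_probs=[0.9, 0.2], max_steps=10)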
def test_noop_reset_env(self):
    # runnable test
    noop_max = 20
    env = gym.make(TEST_ENV_ID)
    env = TimeLimit(env, 3)
    env = atari.NoopResetEnv(env, noop_max=noop_max)
    env.reset()
    for i in range(20):
        obs, rew, done, info = env.step(env.action_space.sample())
        if done:
            break
def test_change_gravity_each_step(self):
    env: ModifiedGravityEnv = self.Environment()
    max_episode_steps = 50
    n_episodes = 3
    # NOTE: Interestingly, the renderer will show
    # `env.frame_skip * max_episode_steps` frames per episode, even when
    # "Ren[d]er every frame" is set to False.
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    total_steps = 0
    for episode in range(n_episodes):
        initial_state = env.reset()
        done = False
        episode_steps = 0
        start_y = initial_state[1]
        moved_up = 0
        previous_state = initial_state
        state = initial_state
        while not done:
            previous_state = state
            state, reward, done, info = env.step(env.action_space.sample())
            env.render("human")
            episode_steps += 1
            total_steps += 1
            # decrease the gravity continually over time.
            # By the end, things should be floating.
            env.set_gravity(-10 + 5 * total_steps / max_episode_steps)
            moved_up += (state[1] > previous_state[1])
            # print(f"Moving upward? {obs[1] > state[1]}")
        if episode_steps != max_episode_steps:
            print("Episode ended early?")
        print(f"Gravity at end of episode: {env.gravity}")
    # TODO: Check that the position (in the observation) is obeying gravity?
    # if env.gravity <= 0:
    #     # Downward force, so should not have any significant preference for
    #     # moving up vs moving down.
    #     assert 0.4 <= (moved_up / max_episode_steps) <= 0.6, env.gravity
    #     # if env.gravity == 0:
    #     #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0
    # if env.gravity > 0:
    #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0, env.gravity
    assert total_steps <= n_episodes * max_episode_steps
    initial_z = env.init_qpos[1]
    final_z = env.sim.data.qpos[1]
    if env.gravity > 0:
        assert final_z > initial_z
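# Small numeric check (not from the source) of the gravity schedule above:
# with max_episode_steps = 50 and 3 episodes, total_steps reaches 150, so
# gravity sweeps from -10 at the start up to -10 + 5 * 150 / 50 = +5 by the
# end, crossing zero (weightlessness) at total_steps = 100.
def _gravity_schedule_demo():
    max_episode_steps = 50
    schedule = [-10 + 5 * t / max_episode_steps for t in (1, 50, 100, 150)]
    assert abs(schedule[0] - (-9.9)) < 1e-9
    assert schedule[2] == 0.0
    assert schedule[-1] == 5.0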
def test(pkl_path, pth_path, env, attempts, display=False, video_dir=None):
    with open(pkl_path, 'rb') as f:
        logs = pickle.load(f)
    if logs['params']['max_episode_steps'] is not None:
        env = TimeLimit(env, max_episode_steps=logs['params']['max_episode_steps'])
    if video_dir:
        if not os.path.exists(video_dir):
            os.makedirs(video_dir)
        env = Monitor(env, video_dir, force=True)
    if logs['agent'] == 'dqn':
        agent = DQNAgent(env.observation_space, env.action_space, **logs['params'])
        agent.epsilon = 0
    elif logs['agent'] == 'a2c':
        agent = A2CAgent(env.observation_space, env.action_space, **logs['params'])
    elif logs['agent'] == 'td3':
        agent = TD3Agent(env.observation_space, env.action_space, **logs['params'])
    elif logs['agent'] == 'random':
        agent = RandomAgent(env.observation_space, env.action_space, **logs['params'])
    agent.load(pth_path)
    try:
        rewards = []
        for attempt in range(attempts):
            state = env.reset()
            sum_reward = 0
            t = 0
            done = False
            while not done:
                action = agent.get_action(state)
                next_state, reward, done, _ = env.step(action)
                state = next_state
                sum_reward += reward
                t += 1
                if display:
                    title = f'Attempt: {attempt+1} | Timestep: {t} | Reward: {reward} | Sum Reward: {sum_reward}'
                    render(env, title)
            rewards.append(sum_reward)
        env.close()
        return rewards
    except Exception:
        traceback.print_exc()
        breakpoint()
        env.close()
def test_max_and_skip_env(self):
    # runnable test
    skip = 4
    env = gym.make(TEST_ENV_ID)
    env = TimeLimit(env, 20)
    env = atari.MaxAndSkipEnv(env, skip=skip)
    env.seed(1)
    ub_utils.set_seed(1)
    env.reset()
    for i in range(20):
        obs, rew, done, info = env.step(env.action_space.sample())
        if done:
            break
    # 20 wrapped steps / skip of 4 = 5 skipped steps, so done fires at i == 4.
    self.assertEqual(4, i)
def test_monitor(n_episodes):
    steps = 15
    env = gym.make("CartPole-v1")
    # unwrap default TimeLimit and wrap with new one to simulate done=True
    # at step 5
    assert isinstance(env, TimeLimit)
    env = env.env  # unwrap
    env = TimeLimit(env, max_episode_steps=5)  # wrap
    tmpdir = tempfile.mkdtemp()
    try:
        env = pfrl.wrappers.Monitor(
            env, directory=tmpdir, video_callable=lambda episode_id: True
        )
        episode_idx = 0
        episode_len = 0
        t = 0
        _ = env.reset()
        while True:
            _, _, done, info = env.step(env.action_space.sample())
            episode_len += 1
            t += 1
            if episode_idx == 1 and episode_len >= 3:
                info["needs_reset"] = True  # simulate ContinuingTimeLimit
            if done or info.get("needs_reset", False) or t == steps:
                if episode_idx + 1 == n_episodes or t == steps:
                    break
                env.reset()
                episode_idx += 1
                episode_len = 0
        # `env.close()` is called when `env` is garbage-collected
        # (or explicitly deleted/closed).
        del env
        # check if videos & meta files were generated
        files = os.listdir(tmpdir)
        mp4s = [f for f in files if f.endswith(".mp4")]
        metas = [f for f in files if f.endswith(".meta.json")]
        stats = [f for f in files if f.endswith(".stats.json")]
        manifests = [f for f in files if f.endswith(".manifest.json")]
        assert len(mp4s) == n_episodes
        assert len(metas) == n_episodes
        assert len(stats) == 1
        assert len(manifests) == 1
    finally:
        shutil.rmtree(tmpdir)
def run_episodes(neps, seed):
    reward_fn = 'task1_reward'
    termination_fn = 'pos_and_rot_close_to_goal'
    # termination_fn = 'position_close_to_goal'
    initializer = 'task4_init'
    env = make_training_env(reward_fn, termination_fn, initializer,
                            action_space='torque_and_position',
                            init_joint_conf=True, visualization=True,
                            grasp='pinch', rank=seed)
    env = env.env  # HACK to remove FlatObservationWrapper
    # tmp_dir = '/tmp/video'
    # env = Monitor(RenderWrapper(TimeLimit(env, 1000)), tmp_dir,
    #               video_callable=lambda episode_id: True, mode='evaluation',
    #               force=True)
    env = TimeLimit(env, 1000)
    viz = Viz()
    for _ in range(neps):
        obs = env.reset()
        p.configureDebugVisualizer(p.COV_ENABLE_GUI, 0)
        p.resetDebugVisualizerCamera(cameraDistance=0.6, cameraYaw=0,
                                     cameraPitch=-40,
                                     cameraTargetPosition=[0, 0, 0])
        viz.reset(obs)
        # tip_pd = TipPD([10, 1], 0.7 * env.cube_tip_positions)
        tip_pd = None
        controller = ForceControlPolicy(env, True, tip_pd)
        # obs = grasp_force_control(env, obs, controller.get_grasp_torque)
        obs = grasp_tippos_control(env, obs)
        # Then move toward the goal positions
        env.unwrapped.action_space = TriFingerPlatform.spaces.robot_torque.gym
        env.unwrapped.action_type = cube_env.ActionType.TORQUE
        done = False
        while not done:
            # transform wrenches to base frame
            torque = controller(obs)
            obs, reward, done, info = env.step(torque)
            viz.update_cube_orientation(obs)
            time.sleep(0.01)
    env.close()
def main():
    env = make_cmdp(args.cmdp, episodic=True)
    env = TimeLimit(env, 10)

    agent_model_name = args.cmdp.split('/')[-1]
    agent_model = agent_models.get_agent_model(agent_model_name)

    values_df_index = 'E[G]', 'E[G | A=a]', 'E[G | do(A=a)]'
    values_df_columns = env.model.actions

    _, state = env.reset()
    for t in itt.count():
        print()
        print(f't: {t}')
        env.render()

        Qs_none = [
            infer_Q(env, action, 'none', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        Qs_condition = [
            infer_Q(env, action, 'condition', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        Qs_intervention = [
            infer_Q(env, action, 'intervention', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        values_df = pd.DataFrame(
            [Qs_none, Qs_condition, Qs_intervention],
            values_df_index,
            values_df_columns,
        )
        print(values_df)

        action = torch.tensor(Qs_intervention).argmax()
        state, _, done, _ = env.step(action)
        if done:
            print()
            print(f'final state: {state}')
            print(f'Episode finished after {t+1} timesteps')
            break
    env.close()
class Renderer:
    def __init__(self, args):
        self.env = TimeLimit(gym.make(args.env),
                             max_episode_steps=args.max_steps)

    def get_action(self, obs, ch):
        raise NotImplementedError

    def reset(self, init_obs):
        pass

    def main_loop(self, window):
        obs = self.env.reset()
        self.reset(obs)
        done = False
        action = None
        reward = None
        steps = 0
        ret = 0
        while not done:
            self.display(action, done, ret, reward, steps, window)
            ch = window.getch()
            action = self.get_action(obs, ch)
            obs, reward, done, _ = self.env.step(action)
            ret += reward
            steps += 1
        # Clear screen
        self.display(action, done, ret, reward, steps, window)
        window.getch()

    def display(self, action, done, ret, reward, steps, window):
        show(
            self.env, window, {
                'steps': steps,
                'action': gym_psketch.ID2ACTIONS[action]
                          if action is not None else action,
                'reward': reward,
                'return': ret,
                'done': done
            })
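# A hedged sketch of a concrete Renderer subclass (not from the original
# source): `get_action` is the only abstract hook, so a minimal subclass just
# maps the pressed key (or nothing at all) to an action.
class RandomRenderer(Renderer):
    """Ignores keyboard input and acts randomly."""

    def get_action(self, obs, ch):
        # any keypress advances one random step
        return self.env.action_space.sample()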
def main():
    env = make_mdp(args.mdp, episodic=True)
    env = TimeLimit(env, 10)

    env.reset()
    for t in itt.count():
        print('---')
        print(f't: {t}')
        print('state:')
        env.render()

        action = policy(env, log=True)
        _, reward, done, _ = env.step(action)
        print(f'reward: {reward}')

        if done:
            print('final state:')
            env.render()
            print(f'Episode finished after {t+1} timesteps')
            break
    env.close()
for step in range(args.episode_length):  # rollout over one episode
    # ALGO LOGIC: put action logic here
    logits, std = pg.forward(obs[step:step + 1])
    values[step] = vf.forward(obs[step:step + 1])

    # ALGO LOGIC: `env.action_space` specific logic
    probs = Normal(logits, std)
    action = probs.sample()
    clipped_action = torch.clamp(
        action,
        torch.min(torch.Tensor(env.action_space.low)),
        torch.max(torch.Tensor(env.action_space.high)))
    actions[step] = clipped_action.tolist()[0]
    neglogprobs[step] = -probs.log_prob(action).sum()
    entropys[step] = probs.entropy().sum()

    # TRY NOT TO MODIFY: execute the game and log data.
    next_obs, rewards[step], dones[step], _ = env.step(actions[step])
    next_obs = np.array(next_obs)
    if dones[step]:
        break

# ALGO LOGIC: training.
# calculate the discounted rewards, or namely, returns
returns = np.zeros_like(rewards)
for t in reversed(range(rewards.shape[0] - 1)):
    returns[t] = rewards[t] + args.gamma * returns[t + 1] * (1 - dones[t])
# advantages are returns - baseline, value estimates in our case
advantages = returns - values.detach().cpu().numpy()
vf_loss = loss_fn(torch.Tensor(returns).to(device), values) * args.vf_coef
pg_loss = torch.Tensor(advantages).to(device) * neglogprobs
loss = (pg_loss - entropys * args.ent_coef).mean() + vf_loss
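# A small worked example (not from the source) of the backward recursion used
# above: with gamma = 0.9, rewards [1, 0, 1] and no early termination, returns
# are computed right to left as G[t] = r[t] + gamma * G[t+1]. Note that here
# the last entry is seeded with its own reward, whereas the loop above leaves
# `returns[-1]` at zero.
def _discounted_returns_demo():
    import numpy as np

    gamma = 0.9
    rewards = np.array([1.0, 0.0, 1.0])
    dones = np.zeros_like(rewards)
    returns = np.zeros_like(rewards)
    returns[-1] = rewards[-1]
    for t in reversed(range(len(rewards) - 1)):
        returns[t] = rewards[t] + gamma * returns[t + 1] * (1 - dones[t])
    # [1 + 0.9 * 0.9, 0 + 0.9 * 1.0, 1.0] == [1.81, 0.9, 1.0]
    assert np.allclose(returns, [1.81, 0.9, 1.0])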
obs[step] = next_obs.copy()

# ALGO LOGIC: put action logic here
with torch.no_grad():
    values[step] = vf.forward(obs[step:step + 1])
    action, logproba, _ = pg.get_action(obs[step:step + 1])
actions[step] = action.data.cpu().numpy()[0]
logprobs[step] = logproba.data.cpu().numpy()[0]
# SUGGESTION: Find a better way to constrain policy actions to the action
# space's low and high bounds
clipped_action = np.clip(action.tolist(), env.action_space.low,
                         env.action_space.high)[0]

# TRY NOT TO MODIFY: execute the game and log data.
next_obs, rewards[step], dones[step], info = env.step(clipped_action)
real_rewards += [info['real_reward']]
next_obs = np.array(next_obs)

# Annealing the rate if instructed to do so.
if args.anneal_lr:
    pg_lr_scheduler.step()
    vf_lr_scheduler.step()

if dones[step]:
    # Computing the discounted returns:
    writer.add_scalar("charts/episode_reward", np.sum(real_rewards),
                      global_step)
    print(
        f"global_step={global_step}, episode_reward={np.sum(real_rewards)}"
    )
obs = np.empty((args.episode_length,) + env.observation_space.shape)
# ALGO LOGIC: put other storage logic here
entropys = torch.zeros((args.episode_length,), device=device)

# TRY NOT TO MODIFY: prepare the execution of the game.
for step in range(args.episode_length):
    global_step += 1
    obs[step] = next_obs.copy()

    # ALGO LOGIC: put action logic here
    action, _, _ = pg.get_action(obs[step:step + 1])
    actions[step] = action.tolist()[0]

    # TRY NOT TO MODIFY: execute the game and log data.
    next_obs, rewards[step], dones[step], _ = env.step(action.tolist()[0])
    rb.put((obs[step], actions[step], rewards[step], next_obs, dones[step]))
    next_obs = np.array(next_obs)

    # ALGO LOGIC: training.
    if len(rb.buffer) > 2000:
        s_obs, s_actions, s_rewards, s_next_obses, s_dones = rb.sample(
            args.batch_size)
        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = pg.get_action(
                s_next_obses)
            qf1_next_target = qf1_target.forward(s_next_obses,
                                                 next_state_action)
            qf2_next_target = qf2_target.forward(s_next_obses,
                                                 next_state_action)
            min_qf_next_target = torch.min(
def replay_memory(env: TimeLimit, memory: List[List[Any]]):
    for episode_memory in memory:
        env.reset()
        for action in episode_memory:
            env.step(action)
            env.render()
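# Hedged usage sketch for `replay_memory` (the recorder below is illustrative,
# not from the original source). Replaying recorded actions only reproduces a
# trajectory when the environment is deterministic or seeded identically.
def record_random_episodes(env, n_episodes=2):
    memory = []
    for _ in range(n_episodes):
        env.reset()
        episode_actions = []
        done = False
        while not done:
            action = env.action_space.sample()
            _, _, done, _ = env.step(action)
            episode_actions.append(action)
        memory.append(episode_actions)
    return memory

# replay_memory(env, record_random_episodes(env))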
def train(agent_type, env, verbose=True, save_freq=50, save_dir='./', **params):
    if verbose:
        print(params)
    if agent_type == 'dqn':
        agent = DQNAgent(env.observation_space, env.action_space, **params)
    elif agent_type == 'a2c':
        agent = A2CAgent(env.observation_space, env.action_space, **params)
    elif agent_type == 'td3':
        agent = TD3Agent(env.observation_space, env.action_space, **params)
    elif agent_type == 'random':
        agent = RandomAgent(env.observation_space, env.action_space, **params)
    if params['max_episode_steps'] is not None:
        env = TimeLimit(env, max_episode_steps=params['max_episode_steps'])
    log = {'agent': agent_type, 'params': params, 'episodes': []}
    if save_dir[-1] != '/':
        raise NotADirectoryError(save_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    try:
        ep = 0
        t_total = 0
        while t_total < params['max_steps']:
            state = env.reset()
            sum_reward = 0
            t_ep = 0
            done = False
            while not done:
                if t_total > params['start_at']:
                    action = agent.get_action(state)
                else:
                    action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                sum_reward += reward
                t_ep += 1
                # for agents using online training: learn at every step
                if agent.online and t_total > params['start_at']:
                    agent.learn()
            # for agents using offline training: learn at the end of the episode
            if not agent.online and t_total > params['start_at']:
                agent.learn()
            ep += 1
            t_total += t_ep
            ep_info = {'episode': ep, 't_ep': t_ep, 't_total': t_total,
                       'sum_reward': sum_reward,
                       'optim_steps': agent.optim_steps,
                       'memory': len(agent.memory)}
            log['episodes'].append(ep_info)
            if verbose:
                print(ep_info)
            if ep % save_freq == 0:
                agent.save(save_dir + params['file_name'] + '.pth')
                with open(save_dir + params['file_name'] + '.pkl', 'wb') as f:
                    pickle.dump(log, f)
                if verbose:
                    print('Episode ' + str(ep) + ': Saved model weights and log.')
        env.close()
    except Exception:
        traceback.print_exc()
        breakpoint()
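# Hypothetical invocation of `train` (all parameter values below are
# illustrative, not from the source): checkpoints and the pickled log are
# written every `save_freq` episodes under `save_dir`.
# train('dqn', gym.make('CartPole-v1'), save_dir='./runs/',
#       max_episode_steps=500, max_steps=100_000, start_at=1_000,
#       file_name='dqn_cartpole')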
    plt.show()
    return np.array(s_s).T

# s_s = play(agt, play_env)
# labels = ['x', 'v_x', 'cos(theta)', 'sin(theta)', 'thetadot']
# for label, line in zip(labels, s_s):
#     plt.plot(line, label=label)
# plt.legend()
# plt.show()

s = env.reset()
for step in range(100000):
    a = int(q.get_action(s))
    sp, r, done, _ = env.step(a)
    agt.handle_transition(s, a, r, sp, done)
    s_s.append(s.detach().numpy())
    s = sp
    if done:
        s = env.reset()
        done = False
    if (step % 1000) == 0:
        print(
            f'{step}: {adp.evaluate(eval_env, 10)} (adp) {q.evaluate(eval_env, 10)} (Q)'
        )
action, logproba, _, probs = pg.get_action(
    obs[step:step + 1],
    invalid_action_masks=invalid_action_masks[step:step + 1])
# CORE LOGIC: use the action generated by CategoricalMasked, but don't adjust
# the log probability accordingly. Instead, calculate the log probability
# using Categorical.
action, logproba, _, probs = pg.get_action(obs[step:step + 1], action=action)
actions[step] = action[:, 0].data.cpu().numpy()
logprobs[:, [step]] = logproba

# TRY NOT TO MODIFY: execute the game and log data.
next_obs, rewards[step], dones[step], info = env.step(
    action[:, 0].data.cpu().numpy())
raw_rewards[:, step] = info["rewards"]
real_rewards += [info['real_reward']]
invalid_action_stats += [info['invalid_action_stats']]
next_obs = np.array(next_obs)

# Annealing the rate if instructed to do so.
if args.anneal_lr:
    pg_lr_scheduler.step()
    vf_lr_scheduler.step()

if dones[step]:
    # Computing the discounted returns:
    writer.add_scalar("charts/episode_reward", np.sum(real_rewards),
                      global_step)
    print(
def get_cswm_data(env_name, seed, num_episodes=1000):
    logger.set_level(logger.INFO)
    env = gym.make(env_name)
    np.random.seed(seed)
    env.action_space.seed(seed)
    env.seed(seed)
    agent = RandomAgent(env.action_space)
    episode_count = num_episodes
    reward = 0
    done = False
    crop = None
    warmstart = None
    if env_name == 'PongDeterministic-v4':
        crop = (35, 190)
        warmstart = 58
    elif env_name == 'SpaceInvadersDeterministic-v4':
        crop = (30, 200)
        warmstart = 50
    else:
        # default to the Pong crop/warmstart settings
        crop = (35, 190)
        warmstart = 58
    max_episode_steps = warmstart + 11
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    env = AtariARIWrapper(env)
    replay_buffer = []
    for i in range(episode_count):
        replay_buffer.append({
            'obs': [],
            'action': [],
            'next_obs': [],
            'label': []
        })
        ob = env.reset()
        # Burn-in steps
        for _ in range(warmstart):
            action = agent.act(ob, reward, done)
            ob, _, _, _ = env.step(action)
        prev_ob = crop_normalize(ob, crop)
        ob, _, _, info = env.step(0)
        ob = crop_normalize(ob, crop)
        while True:
            replay_buffer[i]['obs'].append(
                np.concatenate((ob, prev_ob), axis=0))
            prev_ob = ob
            replay_buffer[i]["label"].append(info["labels"])
            action = agent.act(ob, reward, done)
            ob, reward, done, info = env.step(action)
            ob = crop_normalize(ob, crop)
            replay_buffer[i]['action'].append(action)
            replay_buffer[i]['next_obs'].append(
                np.concatenate((ob, prev_ob), axis=0))
            if done:
                break
        if i % 10 == 0:
            print("iter " + str(i))
    return replay_buffer
if args.use_levy:
    action = (action.tolist()[0] + sampleFromLevy(
        args.levy_mu, args.levy_scale, env.action_space)).clip(
            env.action_space.low, env.action_space.high)
else:
    action = (action.tolist()[0] + np.random.normal(
        0, max_action * args.exploration_noise,
        size=env.action_space.shape[0])).clip(
            env.action_space.low, env.action_space.high)

# TRY NOT TO MODIFY: execute the game and log data.
next_obs, reward, done, info = env.step(action)
episode_reward += reward

# ALGO LOGIC: training.
rb.put((obs, action, reward, next_obs, done))
if global_step > args.learning_starts:
    s_obs, s_actions, s_rewards, s_next_obses, s_dones = rb.sample(
        args.batch_size)
    with torch.no_grad():
        next_state_actions = (target_actor.forward(s_next_obses, device)).clamp(
            env.action_space.low[0], env.action_space.high[0])
        qf1_next_target = qf1_target.forward(s_next_obses, next_state_actions,
                                             device)
        next_q_value = torch.Tensor(s_rewards).to(device) + (
            1 - torch.Tensor(s_dones).to(device)) * args.gamma * (
def evaluate(
    env: TimeLimit,
    total_episodes: int,
    *,
    q_table: np.ndarray = None,
    winning_reward: float = None,
    is_random: bool = False,
    render: bool = False,
    display_result: bool = False,
) -> float:
    """Evaluate the performance of a Q-table at solving a gym environment.

    It may also act randomly instead of using a Q-table, in order to compare
    the performance of a Q-table against a random baseline.

    :param env: gym environment to solve
    :param total_episodes: number of times to repeat the evaluation. The
        bigger, the more statistically significant the output will be
    :param q_table: Q-table used to solve the problem if given; is_random must
        be False
    :param winning_reward: the reward given to the agent when it solves the
        problem. It is used to count how many times the agent solved the
        problem
    :param is_random: if True, acts randomly instead of using a Q-table. If
        True, q_table must not be given
    :param render: if True, calls env.render()
    :param display_result: if True, prints an evaluation summary to the
        console at the end of the evaluation
    """
    # Todo: rename and re-think is_random parameter into policy parameter
    # Todo: render only last evaluation
    # Todo: yield q-table, evaluate it and continue evaluation if it is not good enough
    if (q_table is not None) and is_random:
        raise RuntimeError("is_random and q_table given")
    elif q_table is None and not is_random:
        raise RuntimeError(
            "at least one of q_table and is_random must be given")

    total_epochs, total_reward, total_won_episodes = 0, 0, 0
    for _ in range(total_episodes):
        state = env.reset()
        if render:
            env.render()
        done = False
        while not done:
            if is_random:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])
            state, reward, done, info = env.step(action)
            total_epochs += 1
            total_reward += reward
            if render:
                env.render()
        # noinspection PyUnboundLocalVariable
        if reward == winning_reward:
            total_won_episodes += 1

    score = round(total_won_episodes * 100 / total_episodes, 2)
    if display_result:
        print("-" * 30)
        print(
            f"Results after {total_episodes} episodes using {'random' if is_random else 'q_table'}:"
        )
        print(f"Average steps per episode: {total_epochs / total_episodes}")
        print(f"Average reward per episode: {total_reward / total_episodes}")
        print(f"Percentage of won episodes: {score}%")
    return score
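# Hedged usage sketch for `evaluate` (the environment choice and table shape
# below are assumptions, not from the source): compare a Q-table against the
# random baseline on a discrete-state environment such as Taxi-v3, whose
# successful drop-off reward is +20.
# import gym
# import numpy as np
# from gym.wrappers import TimeLimit
#
# env = TimeLimit(gym.make("Taxi-v3").unwrapped, max_episode_steps=200)
# q_table = np.zeros((env.observation_space.n, env.action_space.n))
# random_score = evaluate(env, 100, is_random=True, winning_reward=20,
#                         display_result=True)
# trained_score = evaluate(env, 100, q_table=q_table, winning_reward=20,
#                          display_result=True)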