def play_with_car():
    maximum_steps_allowed = 250
    env = TimeLimit(MountainCarEnv(), max_episode_steps=maximum_steps_allowed + 1)
    actions = {'left': 0, 'stop': 1, 'right': 2}

    initial_state = env.reset()
    print('Initial state: ', initial_state)

    for t in range(maximum_steps_allowed):
        # need to modify policy
        if t < 50:
            s, r, done, _ = env.step(actions['left'])
        elif t < 70:
            s, r, done, _ = env.step(actions['right'])
        elif t < 120:
            s, r, done, _ = env.step(actions['left'])
        else:
            s, r, done, _ = env.step(actions['right'])
        print('State {}, Reward {}, Step {}'.format(s, r, t))
        env.render()
        if done:
            if s[0] > 0.47:
                print('Well done!')
            else:
                print('Please, try again.')
            break
    else:
        print('Time is up. Please, try again.')

    env.close()
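# A minimal, hedged sketch of how gym.wrappers.TimeLimit signals a timeout,
# assuming the classic-control "MountainCar-v0" env and the pre-0.26 gym step
# API (obs, reward, done, info). It illustrates why play_with_car() above sets
# max_episode_steps to maximum_steps_allowed + 1: the loop's own for/else
# timeout branch fires before the wrapper ever truncates the episode.
import gym
from gym.wrappers import TimeLimit

env = TimeLimit(gym.make("MountainCar-v0").unwrapped, max_episode_steps=5)
env.reset()
for _ in range(5):
    obs, reward, done, info = env.step(env.action_space.sample())
print(done)                             # True: the wrapper truncated the episode
print(info.get("TimeLimit.truncated"))  # set by TimeLimit in many gym releases
env.close()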
def test_random_task_on_each_episode():
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"level": 0},
            5: {"level": 1},
            200: {"level": 2},
            300: {"level": 3},
            400: {"level": 4},
        },
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )
    task_labels = []
    for i in range(10):
        obs = env.reset()
        task_labels.append(obs["task_labels"])
    assert len(set(task_labels)) > 1

    # Episodes only last 10 steps. Tasks don't have anything to do with the task
    # schedule.
    obs = env.reset()
    start_task_label = obs["task_labels"]
    for i in range(10):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs["task_labels"] == start_task_label
        if i == 9:
            assert done
        else:
            assert not done
    env.close()
def play(env_name: str, manual_control: bool, max_steps: int):
    # Make environment
    env = TimeLimit(gym.make(env_name, render=True), max_steps)
    observation = env.reset()

    if manual_control:
        # Create user debug interface
        import pybullet as p
        params = [
            p.addUserDebugParameter(
                p.getJointInfo(env.robot_id, j)[1].decode(), -1, 1, 0)
            for j in env.joint_list
        ]

    reward_sum = 0
    while True:
        if manual_control:
            # Read user input and simulate motor
            a = [p.readUserDebugParameter(param) for param in params]
        else:
            a = env.action_space.sample()

        observation, reward, done, _ = env.step(a)
        reward_sum += reward
        print("\nobservation", observation)
        print("reward", reward)
        print("total reward", reward_sum)
        print("done", done)

        # Reset when done
        if done:
            observation = env.reset()
            reward_sum = 0

    env.close()
def test_task_schedule_monsterkong():
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"level": 0},
            100: {"level": 1},
            200: {"level": 2},
            300: {"level": 3},
            400: {"level": 4},
        },
        add_task_id_to_obs=True,
    )
    obs = env.reset()
    # img, task_labels = obs
    assert obs[1] == 0
    assert env.get_level() == 0

    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == i // 100
        assert env.level == i // 100
        env.render()
        assert isinstance(done, bool)
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()

    assert obs[1] == 4
    assert env.level == 4

    # level stays the same even after reaching that objective.
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == 4
        assert env.level == 4
        env.render()
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()

    env.close()
def test(pkl_path, pth_path, env, attempts, display=False, video_dir=None):
    with open(pkl_path, 'rb') as f:
        logs = pickle.load(f)

    if logs['params']['max_episode_steps'] is not None:
        env = TimeLimit(env, max_episode_steps=logs['params']['max_episode_steps'])

    if video_dir:
        if not os.path.exists(video_dir):
            os.makedirs(video_dir)
        env = Monitor(env, video_dir, force=True)

    if logs['agent'] == 'dqn':
        agent = DQNAgent(env.observation_space, env.action_space, **logs['params'])
        agent.epsilon = 0
    elif logs['agent'] == 'a2c':
        agent = A2CAgent(env.observation_space, env.action_space, **logs['params'])
    elif logs['agent'] == 'td3':
        agent = TD3Agent(env.observation_space, env.action_space, **logs['params'])
    elif logs['agent'] == 'random':
        agent = RandomAgent(env.observation_space, env.action_space, **logs['params'])
    agent.load(pth_path)

    try:
        rewards = []
        for attempt in range(attempts):
            state = env.reset()
            sum_reward = 0
            t = 0
            done = False
            while not done:
                action = agent.get_action(state)
                next_state, reward, done, _ = env.step(action)
                state = next_state
                sum_reward += reward
                t += 1
                if display:
                    title = f'Attempt: {attempt+1} | Timestep: {t} | Reward: {reward} | Sum Reward: {sum_reward}'
                    render(env, title)
            rewards.append(sum_reward)
        env.close()
        return rewards
    except Exception:
        traceback.print_exc()
        breakpoint()
        env.close()
def main():
    env = make_cmdp(args.cmdp, episodic=True)
    env = TimeLimit(env, 10)

    agent_model_name = args.cmdp.split('/')[-1]
    agent_model = agent_models.get_agent_model(agent_model_name)

    values_df_index = 'E[G]', 'E[G | A=a]', 'E[G | do(A=a)]'
    values_df_columns = env.model.actions

    _, state = env.reset()
    for t in itt.count():
        print()
        print(f't: {t}')
        env.render()

        Qs_none = [
            infer_Q(env, action, 'none', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        Qs_condition = [
            infer_Q(env, action, 'condition', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        Qs_intervention = [
            infer_Q(env, action, 'intervention', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        values_df = pd.DataFrame(
            [Qs_none, Qs_condition, Qs_intervention],
            values_df_index,
            values_df_columns,
        )
        print(values_df)

        action = torch.tensor(Qs_intervention).argmax()
        state, _, done, _ = env.step(action)

        if done:
            print()
            print(f'final state: {state}')
            print(f'Episode finished after {t+1} timesteps')
            break

    env.close()
def run_episodes(neps, seed):
    reward_fn = 'task1_reward'
    termination_fn = 'pos_and_rot_close_to_goal'
    # termination_fn = 'position_close_to_goal'
    initializer = 'task4_init'
    env = make_training_env(reward_fn, termination_fn, initializer,
                            action_space='torque_and_position',
                            init_joint_conf=True,
                            visualization=True,
                            grasp='pinch',
                            rank=seed)
    env = env.env  # HACK to remove FlatObservationWrapper
    # tmp_dir = '/tmp/video'
    # env = Monitor(RenderWrapper(TimeLimit(env, 1000)), tmp_dir,
    #               video_callable=lambda episode_id: True, mode='evaluation',
    #               force=True)
    env = TimeLimit(env, 1000)
    viz = Viz()

    for _ in range(neps):
        obs = env.reset()
        p.configureDebugVisualizer(p.COV_ENABLE_GUI, 0)
        p.resetDebugVisualizerCamera(cameraDistance=0.6, cameraYaw=0,
                                     cameraPitch=-40,
                                     cameraTargetPosition=[0, 0, 0])
        viz.reset(obs)

        # tip_pd = TipPD([10, 1], 0.7 * env.cube_tip_positions)
        tip_pd = None
        controller = ForceControlPolicy(env, True, tip_pd)

        # obs = grasp_force_control(env, obs, controller.get_grasp_torque)
        obs = grasp_tippos_control(env, obs)

        # Then move toward the goal positions
        env.unwrapped.action_space = TriFingerPlatform.spaces.robot_torque.gym
        env.unwrapped.action_type = cube_env.ActionType.TORQUE
        done = False
        while not done:
            # transform wrenches to base frame
            torque = controller(obs)
            obs, reward, done, info = env.step(torque)
            viz.update_cube_orientation(obs)
            time.sleep(0.01)

    env.close()
def main():
    env = make_mdp(args.mdp, episodic=True)
    env = TimeLimit(env, 10)
    env.reset()

    for t in itt.count():
        print('---')
        print(f't: {t}')
        print('state:')
        env.render()

        action = policy(env, log=True)
        _, reward, done, _ = env.step(action)
        print(f'reward: {reward}')

        if done:
            print('final state:')
            env.render()
            print(f'Episode finished after {t+1} timesteps')
            break

    env.close()
nn.utils.clip_grad_norm_(list(actor.parameters()), args.max_grad_norm)
actor_optimizer.step()

# update the target network
for param, target_param in zip(actor.parameters(), target_actor.parameters()):
    target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)
for param, target_param in zip(qf1.parameters(), qf1_target.parameters()):
    target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data)

if global_step % 100 == 0:
    writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step)
    writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step)

# TRY NOT TO MODIFY: CRUCIAL step easy to overlook
obs = next_obs

if done:
    # TRY NOT TO MODIFY: record rewards for plotting purposes
    print(f"global_step={global_step}, episode_reward={episode_reward}")
    writer.add_scalar("charts/episode_reward", episode_reward, global_step)
    obs, episode_reward = env.reset(), 0

env.close()
writer.close()
def train(agent_type, env, verbose=True, save_freq=50, save_dir='./', **params):
    if verbose:
        print(params)

    if agent_type == 'dqn':
        agent = DQNAgent(env.observation_space, env.action_space, **params)
    elif agent_type == 'a2c':
        agent = A2CAgent(env.observation_space, env.action_space, **params)
    elif agent_type == 'td3':
        agent = TD3Agent(env.observation_space, env.action_space, **params)
    elif agent_type == 'random':
        agent = RandomAgent(env.observation_space, env.action_space, **params)

    if params['max_episode_steps'] is not None:
        env = TimeLimit(env, max_episode_steps=params['max_episode_steps'])

    log = {'agent': agent_type, 'params': params, 'episodes': []}

    if save_dir[-1] != '/':
        raise NotADirectoryError(save_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    try:
        ep = 0
        t_total = 0
        while t_total < params['max_steps']:
            state = env.reset()
            sum_reward = 0
            t_ep = 0
            done = False
            while not done:
                if t_total > params['start_at']:
                    action = agent.get_action(state)
                else:
                    action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                sum_reward += reward
                t_ep += 1
                # for agents using online training
                if agent.online and t_total > params['start_at']:
                    agent.learn()
            # for agents using offline training
            if not agent.online and t_total > params['start_at']:
                agent.learn()

            ep += 1
            t_total += t_ep
            ep_info = {'episode': ep, 't_ep': t_ep, 't_total': t_total,
                       'sum_reward': sum_reward, 'optim_steps': agent.optim_steps,
                       'memory': len(agent.memory)}
            log['episodes'].append(ep_info)
            if verbose:
                print(ep_info)

            if ep % save_freq == 0:
                agent.save(save_dir + params['file_name'] + '.pth')
                with open(save_dir + params['file_name'] + '.pkl', 'wb') as f:
                    pickle.dump(log, f)
                if verbose:
                    print('Episode ' + str(ep) + ': Saved model weights and log.')

        env.close()
    except Exception:
        traceback.print_exc()
        breakpoint()