import gym
import numpy as np
from tqdm import trange

# SubprocVecEnv (with the `context` and `in_series` arguments) and
# NonLinearDynamics are assumed to be importable from the surrounding project.


def make_env(env_id):
    def _f():
        env = gym.make(env_id)
        return env
    return _f


env_id = "Reach-v3"
n_envs = 16
envs = [make_env(env_id) for _ in range(n_envs)]
envs = SubprocVecEnv(envs, context='fork', in_series=4)
states = envs.reset()

# a single, non-vectorized copy of the environment is used only to read
# static attributes (episode length, observation/action dimensions)
env = gym.make(env_id)
max_steps = env._max_episode_steps
state_dim = env.observation_space['observation'].shape[0]
action_dim = env.action_space.shape[0]

dynamics = NonLinearDynamics(
    state_dim=envs.observation_space['observation'].shape[0],
    action_dim=envs.action_space.shape[0],
    device='cuda:0')

print("[INFO] Model learning")
for epoch in trange(50):
    # data collection stage
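    # A minimal illustrative sketch of a data-collection stage, assuming random
    # exploratory actions in the vectorized envs and a hypothetical
    # `dynamics.fit(...)` one-step-prediction update; the actual training
    # interface of NonLinearDynamics is an assumption, not confirmed API.
    ep_states, ep_actions, ep_next_states = [], [], []
    for _ in range(max_steps):
        actions = np.stack([env.action_space.sample() for _ in range(n_envs)])
        next_states, rewards, dones, info = envs.step(actions)
        ep_states.append(states['observation'])
        ep_actions.append(actions)
        ep_next_states.append(next_states['observation'])
        states = next_states
        if np.all(dones):
            states = envs.reset()
            break
    # hypothetical supervised update on the collected (s, a, s') transitions
    dynamics.fit(np.concatenate(ep_states),
                 np.concatenate(ep_actions),
                 np.concatenate(ep_next_states))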
import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from tqdm import trange

# SubprocVecEnv, ExperienceReplay and DDPGAgent are assumed to be importable
# from the surrounding project; SummaryWriter is assumed to be TensorBoard's.


def main(mode, device):
    # hyperparameters and shared setup for both modes
    n_epochs = 50000
    gamma = 0.999
    tau = 5e-3
    batch_size = 256
    model_name = "lander_1"
    writer_name = f"./runs/{model_name}"
    writer = SummaryWriter(writer_name)

    if mode == 'multi_env':
        def make_env(env_id):
            def _f():
                env = gym.make(env_id)
                return env
            return _f

        env_id = "LunarLanderContinuous-v2"
        n_envs = 48
        envs = [make_env(env_id) for _ in range(n_envs)]
        envs = SubprocVecEnv(envs, context='fork', in_series=6)
        states = envs.reset()
        test_env = gym.make(env_id)

        replay_buffer = ExperienceReplay(size=int(1e7 / n_envs))
        agent = DDPGAgent(
            observation_space_shape=envs.observation_space.shape[0],
            action_space_shape=envs.action_space.shape[0],
            action_ranges=(envs.action_space.low[0], envs.action_space.high[0]),
            gamma=gamma,
            tau=tau,
            q_lr=3e-4,
            policy_lr=3e-4,
            device=device,
        )

        pretrained = False
        if pretrained:
            agent.load_pretrained_models('reach_1')

        epoch_delay = 50
        for epoch in trange(n_epochs):
            for step in range(1000):
                # warm-up: random actions for the first `epoch_delay` epochs
                if epoch < epoch_delay:
                    actions = np.array(
                        [envs.action_space.sample() for _ in range(n_envs)])
                else:
                    actions = agent.select_action(states)

                next_states, rewards, dones, info = envs.step(actions)
                replay_buffer.put(states, actions, rewards, next_states, dones)

                # Training
                if epoch > epoch_delay:
                    batch = replay_buffer.sample(batch_size)
                    # entropy_loss, alpha
                    q_1_loss, policy_loss, mean_q = agent.train(batch)
                    writer.add_scalar(
                        "Q1_loss", q_1_loss,
                        epoch * test_env._max_episode_steps + step)
                    writer.add_scalar(
                        "Policy_loss", policy_loss,
                        epoch * test_env._max_episode_steps + step)
                    writer.add_scalar(
                        "Mean_Q", mean_q,
                        epoch * test_env._max_episode_steps + step)

                states = next_states
                if np.all(dones):
                    states = envs.reset()
                    break

            ep2log = 50
            if (epoch + 1) % ep2log == 0 and epoch > epoch_delay:
                agent.save_models(model_name)

                # testing
                state = test_env.reset()
                rewards_sum = 0
                for _ in range(1000):
                    action = agent.select_action(state, evaluate=True)
                    next_state, reward, done, info = test_env.step(action)
                    rewards_sum += reward
                    if done:
                        writer.add_scalar("Episode reward sum", rewards_sum,
                                          global_step=(epoch + 1) // ep2log)
                        break

    else:
        replay_buffer = ExperienceReplay(size=1000000, mode=mode, device=device)
        env = gym.make('Reach-v1')
        state = env.reset()
        agent = DDPGAgent(
            observation_space_shape=env.observation_space["observation"].shape[0],
            action_space_shape=env.action_space.shape[0],
            action_ranges=(env.action_space.low[0], env.action_space.high[0]),
            gamma=gamma,
            tau=tau,
            q_lr=1e-4,
            policy_lr=1e-4,
            device=device,
            mode='single_env',
        )

        for epoch in trange(n_epochs):
            for step in range(1000):
                action = agent.select_action(state)
                next_state, reward, done, info = env.step(action)
                replay_buffer.put(state, action, reward, next_state, done)
                # replay_buffer.collect_episodes(state, weights, rewards, next_states, dones)

                state = next_state
                if done:
                    # replay_buffer.store_episodes()
                    state = env.reset()

                    if len(replay_buffer) > batch_size:
                        # Training
                        batch = replay_buffer.sample(batch_size)
                        update_alpha = False
                        if update_alpha:
                            q_1_loss, q_2_loss, policy_loss, entropy_loss, alpha = agent.train(
                                batch)
                        else:
                            q_1_loss, q_2_loss, policy_loss = agent.train(batch)

                        writer.add_scalar("Q1_loss", q_1_loss, epoch)
                        writer.add_scalar("Q2_loss", q_2_loss, epoch)
                        writer.add_scalar("Policy_loss", policy_loss, epoch)

                    if (epoch + 1) % 500 == 0:
                        distance = np.linalg.norm(state['desired_goal'] -
                                                  state['achieved_goal'])
                        writer.add_scalar("Evaluation distance", distance,
                                          global_step=(epoch + 1) // 500)
                        writer.add_scalar("Success", info['is_success'],
                                          global_step=(epoch + 1) // 500)
                    break

            if (epoch + 1) % 10000 == 0:
                agent.save_models('sac_8')
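# A minimal sketch of the replay-buffer interface the scripts above rely on
# (put / sample / __len__); the project's ExperienceReplay class is assumed to
# behave roughly like this, and the `mode`/`device` arguments are ignored here.
import random
from collections import deque


class SimpleReplayBuffer:
    def __init__(self, size):
        self.storage = deque(maxlen=size)

    def put(self, state, action, reward, next_state, done):
        # stores one transition; the vectorized scripts pass batched arrays
        self.storage.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # uniform sampling without replacement
        batch = random.sample(self.storage, batch_size)
        states, actions, rewards, next_states, dones = map(list, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.storage)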
import os

import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from tqdm import trange

# SubprocVecEnv, ExperienceReplay, HindsightExperienceReplay and TD3Agent are
# assumed to be importable from the surrounding project.


def main(mode, device):
    # hyperparameters and shared setup for both modes
    n_epochs = 5000
    gamma = 0.999
    tau = 5e-3
    batch_size = 64
    model_name = "reachV2run_1"
    writer_name = f"./runs/{model_name}"
    writer = SummaryWriter(writer_name)

    if mode == 'multi_env':
        def make_env(env_id):
            def _f():
                env = gym.make(env_id)
                return env
            return _f

        env_id = "Reach-v2"
        n_envs = 32
        envs = [make_env(env_id) for _ in range(n_envs)]
        envs = SubprocVecEnv(envs, context='fork', in_series=4)
        states = envs.reset()
        test_env = gym.make(env_id)
        n_steps = test_env._max_episode_steps

        env_params = {
            'obs': test_env.observation_space['observation'].shape[0],
            'actions': test_env.action_space.shape[0],
            'goals': test_env.observation_space['achieved_goal'].shape[0],
            'reward_function': test_env.compute_reward,
            'max_episode_timesteps': n_steps
        }
        replay_buffer = HindsightExperienceReplay(env_params=env_params,
                                                  size=1000000,
                                                  n_envs=n_envs,
                                                  k=16,
                                                  use_achieved_goal=False)

        agent = TD3Agent(
            observation_dim=envs.observation_space["observation"].shape[0],
            goal_dim=envs.observation_space["achieved_goal"].shape[0],
            action_dim=envs.action_space.shape[0],
            action_ranges=(envs.action_space.low[0], envs.action_space.high[0]),
            gamma=gamma,
            tau=tau,
            q_lr=3e-4,
            policy_lr=3e-4,
            device=device,
            image_as_state=False)

        pretrained = False
        if pretrained:
            agent.load_pretrained_models('pick_td3_1')

        for epoch in trange(n_epochs):
            for step in range(n_steps):
                iteration = n_envs * (epoch * n_steps + step)
                actions = agent.select_action(states)
                next_states, rewards, dones, info = envs.step(actions)
                replay_buffer.collect_episodes(states, actions, rewards,
                                               next_states, dones)

                # Training (after a 200-epoch warm-up of pure data collection)
                if epoch > 200:
                    batch = replay_buffer.sample(batch_size)
                    agent.train(batch, iteration, writer)

                states = next_states
                if np.all(dones):
                    states = envs.reset()
                    replay_buffer.store_episodes()
                    writer.add_scalar(
                        "Success_rate",
                        round(sum([_info['is_success'] for _info in info]) / n_envs, 3),
                        iteration)
                    break

            ep2log = 100
            if (epoch + 1) % ep2log == 0:
                agent.save_models(model_name)
                if not os.path.exists('./figures'):
                    os.mkdir('./figures')

                # testing
                success = 0
                rewards_sum = 0
                for _ in range(10):
                    state = test_env.reset()
                    for _ in range(n_steps):
                        action = agent.select_action(state, evaluate=True)
                        next_state, reward, done, info = test_env.step(action)
                        rewards_sum += reward
                        if done:
                            if info['is_success']:
                                success += 1
                            break

                writer.add_scalar("Test_average_rewards", rewards_sum / 10,
                                  n_envs * epoch * n_steps)
                writer.add_scalar("Test_success_rate", round(success / 10, 5),
                                  n_envs * epoch * n_steps)

    else:
        replay_buffer = ExperienceReplay(size=1000000, mode=mode, device=device)
        env = gym.make('Reach-v1')
        state = env.reset()
        agent = TD3Agent(
            observation_dim=env.observation_space["observation"].shape[0],
            goal_dim=env.observation_space["achieved_goal"].shape[0],
            action_dim=env.action_space.shape[0],
            action_ranges=(env.action_space.low[0], env.action_space.high[0]),
            gamma=gamma,
            tau=tau,
            q_lr=1e-4,
            policy_lr=1e-4,
            device=device,
            mode='single_env',
            image_as_state=False)

        for epoch in trange(n_epochs):
            for step in range(1000):
                action = agent.select_action(state)
                next_state, reward, done, info = env.step(action)
                replay_buffer.put(state, action, reward, next_state, done)
                # replay_buffer.collect_episodes(state, actions, rewards, next_states, dones)

                state = next_state
                if done:
                    # replay_buffer.store_episodes()
                    state = env.reset()

                    if len(replay_buffer) > batch_size:
                        # Training
                        batch = replay_buffer.sample(batch_size)
                        q_1_loss, policy_loss = agent.train(batch)
                        writer.add_scalar("Q1_loss", q_1_loss, epoch)
                        writer.add_scalar("Policy_loss", policy_loss, epoch)

                    if (epoch + 1) % 500 == 0:
                        distance = np.linalg.norm(state['desired_goal'] -
                                                  state['achieved_goal'])
                        writer.add_scalar("Evaluation distance", distance,
                                          global_step=(epoch + 1) // 500)
                        writer.add_scalar("Success", info['is_success'],
                                          global_step=(epoch + 1) // 500)
                    break

            if (epoch + 1) % 10000 == 0:
                agent.save_models('sac_8')
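# A minimal sketch of the "future" goal-relabelling step that a hindsight
# replay buffer such as HindsightExperienceReplay (k relabelled goals per
# transition, rewards recomputed via env_params['reward_function']) typically
# performs; the exact project implementation and transition layout are
# assumptions here.
import numpy as np


def relabel_episode(episode, reward_function, k=16):
    # `episode` is assumed to be a list of dicts with keys 'obs', 'action',
    # 'achieved_goal' (goal achieved after the step) and 'desired_goal'
    relabelled = []
    horizon = len(episode)
    for t, transition in enumerate(episode):
        for _ in range(k):
            # pick the achieved goal of a future timestep of the same episode
            future_t = np.random.randint(t, horizon)
            new_goal = episode[future_t]['achieved_goal']
            # recompute the reward with respect to the substituted goal
            reward = reward_function(transition['achieved_goal'], new_goal, None)
            relabelled.append({**transition,
                               'desired_goal': new_goal,
                               'reward': reward})
    return relabelled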
import os

import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter
from tqdm import trange

# SubprocVecEnv, HindsightExperienceReplay, ImageBuffer and CURL_SACAgent are
# assumed to be importable from the surrounding project.


def main(device):
    # hyperparameters and shared setup
    n_epochs = 5000
    n_substeps = 10
    gamma = 0.999
    tau = 5e-3
    batch_size = 128
    hidden_dim = 10
    model_name = "reach_image_2"
    writer_name = f"./runs/{model_name}"
    writer = SummaryWriter(writer_name)

    def make_env(env_id):
        def _f():
            env = gym.make(env_id)
            return env
        return _f

    env_id = "Reach-v0"
    n_envs = 32
    envs = [make_env(env_id) for _ in range(n_envs)]
    envs = SubprocVecEnv(envs, context='fork', in_series=4)
    states = envs.reset()
    test_env = gym.make(env_id)
    n_steps = test_env._max_episode_steps

    env_params = {
        'obs': hidden_dim,
        'actions': test_env.action_space.shape[0],
        'goals': test_env.observation_space['achieved_goal'].shape[0],
        'reward_function': test_env.compute_reward,
        'max_episode_timesteps': n_steps
    }

    img_buf = ImageBuffer(size=10000, device=device)
    img_buf.put(states)

    agent = CURL_SACAgent(
        hidden_dim=hidden_dim,
        goal_dim=envs.observation_space["achieved_goal"].shape[0],
        action_dim=envs.action_space.shape[0],
        action_ranges=(envs.action_space.low[0], envs.action_space.high[0]),
        gamma=gamma,
        tau=tau,
        alpha=1,
        q_lr=3e-4,
        alpha_lr=3e-4,
        policy_lr=3e-4,
        device=device)

    replay_buffer = HindsightExperienceReplay(env_params=env_params,
                                              size=1000000,
                                              n_envs=n_envs,
                                              use_achieved_goal=True,
                                              k=8)

    pretrained = False
    if pretrained:
        agent.load_pretrained_models('reach_1')

    epoch_delay = 20
    for epoch in trange(n_epochs):
        for step in range(n_steps):
            # encode raw image observations into the latent space before acting
            encoded_states = agent.encode_obs(states, to_numpy=True)
            actions = agent.select_action(encoded_states)
            next_states, rewards, dones, info = envs.step(actions)
            encoded_next_states = agent.encode_obs(next_states, to_numpy=True)
            img_buf.put(next_states['observation'])
            replay_buffer.collect_episodes(encoded_states, actions, rewards,
                                           encoded_next_states, dones)

            # Training
            if epoch > epoch_delay:
                # CURL training
                for inner_step in range(n_substeps):
                    obs_batch = img_buf.sample(batch_size=256)
                    contrastive_loss = agent.train_encoder(obs_batch)
                    writer.add_scalar(
                        "Contrastive_loss", contrastive_loss,
                        n_envs * (epoch * n_steps * n_substeps +
                                  step * n_substeps + inner_step))

                # RL training
                batch = replay_buffer.sample(batch_size)
                q_1_loss, q_2_loss, policy_loss, mean_q, entropy_loss, alpha = agent.train(
                    batch, update_alpha=True)

                # logging
                writer.add_scalar("Q1_loss", q_1_loss,
                                  n_envs * (epoch * n_steps + step))
                writer.add_scalar("Q2_loss", q_2_loss,
                                  n_envs * (epoch * n_steps + step))
                writer.add_scalar("Policy_loss", policy_loss,
                                  n_envs * (epoch * n_steps + step))
                writer.add_scalar("Mean_Q", mean_q,
                                  n_envs * (epoch * n_steps + step))
                writer.add_scalar("Entropy loss", entropy_loss,
                                  n_envs * (epoch * n_steps + step))
                writer.add_scalar("Alpha", alpha,
                                  n_envs * (epoch * n_steps + step))
                writer.add_scalar(
                    "Success_rate",
                    round(sum([_info['is_success'] for _info in info]) / n_envs, 2),
                    n_envs * (epoch * n_steps + step))

            states = next_states
            if np.all(dones):
                states = envs.reset()
                replay_buffer.store_episodes()
                break

        ep2log = 20
        if (epoch + 1) % ep2log == 0 and epoch > epoch_delay:
            agent.save_models(model_name)
            if not os.path.exists('./figures'):
                os.mkdir('./figures')

            # testing
            success = 0
            rewards_sum = 0
            for _ in range(10):
                state = test_env.reset()
                for _ in range(n_steps):
                    action = agent.select_action(state, evaluate=True)
                    next_state, reward, done, info = test_env.step(action)
                    rewards_sum += reward
                    if done:
                        if info['is_success']:
                            success += 1
                        break

            writer.add_scalar("Test_average_rewards", rewards_sum / 10,
                              n_envs * epoch * n_steps)
            writer.add_scalar("Test_success_rate", round(success / 10, 5),
                              n_envs * epoch * n_steps)
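# A minimal sketch of the contrastive (InfoNCE) objective that
# agent.train_encoder presumably optimises in the CURL training loop above:
# two augmented views of the same observation batch act as query/key,
# similarity is bilinear (q W k^T), and the positive pair sits on the
# diagonal. The names `encoder`, `target_encoder` and `W` are assumptions,
# not project API.
import torch
import torch.nn.functional as F


def curl_infonce_loss(encoder, target_encoder, W, obs_anchor, obs_positive):
    z_a = encoder(obs_anchor)                 # queries, shape (B, D)
    with torch.no_grad():
        z_pos = target_encoder(obs_positive)  # keys from the momentum/target encoder
    logits = z_a @ W @ z_pos.T                # bilinear similarities, shape (B, B)
    logits = logits - logits.max(dim=1, keepdim=True).values  # numerical stability
    labels = torch.arange(logits.shape[0], device=logits.device)
    # each anchor's positive key is the same-index entry, i.e. the diagonal
    return F.cross_entropy(logits, labels)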