def build_policy():
    model = namedtuple('model', ['policy_net', 'value_net'])
    actor = ActorSAC(state_space, hidden_dim, action_space)
    critic = CriticModel(state_space, hidden_dim, action_space, use_dist=False)
    rl_agent = model(actor, critic)
    policy = SAC(rl_agent, **kwargs)
    return policy
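# --- editor sketch (assumption): build_policy() reads module-level configuration
# (state_space, hidden_dim, action_space, kwargs) that is defined elsewhere in the
# original file. The values below are only an illustration for a small
# continuous-control task; the exact keyword arguments SAC expects here are not
# shown, so the dict simply mirrors keys used elsewhere in these snippets.
state_space = 3      # e.g. env.observation_space.shape[0]
action_space = 1     # e.g. env.action_space.shape[0]
hidden_dim = 256
kwargs = {'buffer_size': int(1e6), 'batch_size': 256, 'tau': 0.005, 'discount': 0.99}

# Typical usage (assuming, as in the training loops below, that the SAC wrapper
# exposes choose_action()):
# policy = build_policy()
# action = policy.choose_action(state)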
def train():
    model = namedtuple('model', ['policy_net', 'value_net', 'v_net'])
    actor = ActorModel(state_space, hidden_dim, action_space)
    critic = CriticModel(state_space, hidden_dim, action_space)
    v_net = ValueModel(state_space)
    rl_agent = model(actor, critic, v_net)
    policy = SAC(rl_agent,
                 buffer_size=buffer_size,
                 actor_learn_freq=actor_learn_freq,
                 update_iteration=update_iteration,
                 target_update_freq=target_update_freq,
                 target_update_tau=target_update_tau,
                 batch_size=batch_size,
                 learning_rate=lr)
    writer = SummaryWriter(writer_path)

    if not TRAIN:
        policy.load_model(model_save_dir, save_file, load_actor=True)

    mean, std = [], []
    live_time = []
    # while policy.warm_up():
    #     sample(env, policy, max_step, warm_up=True)
    #     print(f'Warm up for buffer {policy.buffer.size()}', end='\r')

    for i_eps in range(episodes):
        rewards = sample(env, policy, max_step)
        reward_mean = np.mean(rewards)
        reward_std = np.std(rewards)
        mean.append(reward_mean)
        std.append(reward_std)

        if not TRAIN:
            print(f'EPS:{i_eps + 1}, reward:{round(reward_mean, 3)}')
        else:
            #==============learn==============
            pg_loss, q_loss, v_loss = policy.learn()
            if PLOT:
                live_time.append(reward_mean)
                plot(live_time, POLT_NAME, model_save_dir, 100)
            if WRITER:
                writer.add_scalar('reward', reward_mean, global_step=i_eps)
                writer.add_scalar('loss/pg_loss', pg_loss, global_step=i_eps)
                writer.add_scalar('loss/q_loss', q_loss, global_step=i_eps)
                writer.add_scalar('loss/v_loss', v_loss, global_step=i_eps)
            if i_eps % 5 == 0:
                print(f'EPS:{i_eps}, reward_mean:{round(reward_mean, 3)}, '
                      f'pg_loss:{round(pg_loss, 3)}, q_loss:{round(q_loss, 3)}, '
                      f'v_loss:{round(v_loss, 3)}')
            if i_eps % 200 == 0:
                policy.save_model(model_save_dir, save_file, save_actor=True, save_critic=True)

    writer.close()
    env.close()
    return mean, std
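# --- editor sketch (assumption): the plot() helper called from train() is not
# shown in this excerpt. A minimal stand-in with the same call signature is
# sketched below; treating the fourth argument as a moving-average window is a
# guess about the original's behaviour, not a documented fact.
import os
import numpy as np
import matplotlib.pyplot as plt

def plot(rewards, title, save_dir, window=100):
    """Save a reward curve (raw values plus a moving average) under save_dir."""
    plt.figure(1)
    plt.clf()
    plt.title(title)
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.plot(rewards, label='episode reward')
    if len(rewards) >= window:
        smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
        plt.plot(range(window - 1, len(rewards)), smoothed, label=f'{window}-episode mean')
    plt.legend()
    plt.savefig(os.path.join(save_dir, f'{title}.png'))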
            writer.add_scalar('loss/alpha_loss', a_loss, global_step=i_eps)
        if i_eps % 5 == 0:
            print(f'EPS:{i_eps}, reward_mean:{round(reward_mean, 3)}, '
                  f'pg_loss:{round(pg_loss, 3)}, q_loss:{round(q_loss, 3)}, '
                  f'alpha_loss:{round(a_loss, 3)}')
        if i_eps % 200 == 0:
            policy.save_model(model_save_dir, save_file, save_actor=True, save_critic=True)

    writer.close()
    env.close()
    return mean, std


if __name__ == '__main__':
    model = namedtuple('model', ['policy_net', 'value_net', 'v_net'])
    actor = ActorModel(state_space, hidden_dim, action_space)
    critic = CriticModel(state_space, hidden_dim, action_space)
    v_net = ValueModel(state_space)
    rl_agent = model(actor, critic, v_net)
    policy = SAC(rl_agent,
                 buffer_size=buffer_size,
                 actor_learn_freq=actor_learn_freq,
                 update_iteration=update_iteration,
                 target_update_freq=target_update_freq,
                 batch_size=batch_size,
                 use_priority=use_priority)
    writer = SummaryWriter(writer_path)
    train()
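# --- editor sketch (assumption): train() and the __main__ block above rely on
# module-level hyperparameters, paths, and an environment object that are defined
# elsewhere in the original file. The values below are illustrative defaults for a
# small continuous-control task, not the repository's actual settings.
import gym

env_name = 'Pendulum-v1'
env = gym.make(env_name)
state_space = env.observation_space.shape[0]
action_space = env.action_space.shape[0]
hidden_dim = 256

episodes = 1000
max_step = 200
buffer_size = int(1e6)
batch_size = 256
lr = 3e-4
actor_learn_freq = 1
update_iteration = 10
target_update_freq = 1
target_update_tau = 0.005
use_priority = False

model_save_dir = './save/sac'
save_file = 'sac_model'
writer_path = './logs/sac'
POLT_NAME = 'SAC'  # (sic) spelled as referenced by the plot() call in train()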
    # three-layer MLP forward pass (method of a network class defined earlier in
    # the original file)
    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x


model = namedtuple('model', ['policy_net', 'value_net', 'v_net'])
actor = ActorModel(state_space, hidden_dim, action_space)
critic = CriticModel(state_space, hidden_dim, action_space)
v_net = ValueModel(state_space)
model = model(actor, critic, v_net)
policy = SAC(model,
             buffer_size=buffer_size,
             actor_learn_freq=actor_learn_freq,
             target_update_freq=target_update_freq,
             batch_size=batch_size)
writer = SummaryWriter(writer_path)

TRAIN = True
PLOT = True
WRITER = False


def sample(env, policy, max_step):
    # rewards = 0
    rewards = []
    state = env.reset()
    for step in range(max_step):
        #==============choose_action==============
def main(seed):
    if not os.path.exists("./results"):
        os.makedirs("./results")

    env = gym.make(env_name)
    env.seed(seed)
    torch.manual_seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    hidden_dim = 256

    kwargs = {
        'buffer_size': int(1e6),
        'batch_size': 256,
        'policy_freq': 2,
        'tau': 0.005,
        'discount': 0.99,
        'policy_lr': 3e-4,
        'value_lr': 3e-4,
        'learn_iteration': 1,
        'verbose': False,
        'act_dim': action_dim,
        # 'alpha': 1.0,
        # 'use_priority': False,
        # 'use_munchausen': False,
        # 'use_PAL': False,
        # 'n_step': 1,
    }

    # file_name = f"MSAC_{env_name}_{seed}_{kwargs['use_priority']}_{kwargs['use_munchausen']}_{kwargs['use_PAL']}"
    file_name = f"SAC_{env_name}_{seed}"
    print("---------------------------------------")
    print(f"Settings: {file_name}")
    print("---------------------------------------")

    model = namedtuple('model', ['policy_net', 'value_net'])
    actor = ActorModel(state_dim, hidden_dim, action_dim)
    critic = CriticModelDist(state_dim, hidden_dim, action_dim, use_dist=False)
    rl_agent = model(actor, critic)
    policy = SAC(rl_agent, **kwargs)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, env_name, seed)]

    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    max_timesteps = 3e6
    start_timesteps = 25e3
    eval_freq = 5e3

    state = env.reset()
    for t in range(int(max_timesteps)):
        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.choose_action(state)
            action = map_action(env, action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        # env.render()

        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0
        mask = 0 if done_bool else 1
        policy.process(s=state, a=action, r=reward, m=mask, s_=next_state)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= start_timesteps:
            pg_loss, q_loss, a_loss = policy.learn()

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(f"Total T: {t+1} Episode Num: {episode_num+1} "
                  f"Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % eval_freq == 0:
            evaluations.append(eval_policy(policy, env_name, seed))
            np.save("./results/%s" % (file_name), evaluations)
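# --- editor sketch (assumption): eval_policy() and map_action() are imported from
# elsewhere in the original project and are not shown here. The versions below are
# minimal stand-ins: map_action rescales an action in [-1, 1] to the env's action
# bounds, and eval_policy averages episodic return over a few rollouts on a
# separate seed (the TD3-style convention this script appears to follow). The
# signatures match the call sites above, but the bodies are guesses.
import gym
import numpy as np

def map_action(env, action):
    """Rescale an action in [-1, 1] to the environment's action range."""
    low, high = env.action_space.low, env.action_space.high
    return low + (np.asarray(action) + 1.0) * 0.5 * (high - low)

def eval_policy(policy, env_name, seed, eval_episodes=10):
    """Average return of the current policy over eval_episodes rollouts."""
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)  # evaluate on a seed different from training

    avg_reward = 0.0
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            action = map_action(eval_env, policy.choose_action(state))
            state, reward, done, _ = eval_env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes

    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward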
    'policy_freq': 2,
    'tau': 0.005,
    'discount': 0.99,
    'policy_lr': 3e-4,
    'value_lr': 3e-4,
    'learn_iteration': 1,
    'verbose': False,
    'act_dim': action_dim,
}

model = namedtuple('model', ['policy_net', 'value_net', 'v_net'])
actor = ActorModel(state_space, hidden_dim, action_space)
critic = CriticModel(state_space, hidden_dim, action_space)
v_net = ValueModel(state_space)
rl_agent = model(actor, critic, v_net)
policy = SAC(rl_agent, **kwargs)
writer = SummaryWriter(writer_path)

TRAIN = True
PLOT = True
WRITER = False


def sample(env, policy, max_step):
    # rewards = 0
    rewards = []
    state = env.reset()
    for step in range(max_step):
        #==============choose_action==============
        action = policy.choose_action(state)
        next_state, reward, done, info = env.step(action)
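# --- editor sketch (assumption): sample() is truncated in this excerpt. A rollout
# helper of this shape typically stores each transition in the agent's buffer and
# returns the per-step rewards that train() averages. The standalone version below
# (hypothetical name sample_sketch) is a guess that reuses policy.process(...) with
# the same keyword arguments seen in main() above; it is not the author's code.
def sample_sketch(env, policy, max_step):
    rewards = []
    state = env.reset()
    for step in range(max_step):
        action = policy.choose_action(state)
        next_state, reward, done, info = env.step(action)
        mask = 0 if done else 1
        policy.process(s=state, a=action, r=reward, m=mask, s_=next_state)
        rewards.append(reward)
        state = next_state
        if done:
            break
    return rewards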