Example No. 1
def build_policy():
    model = namedtuple('model', ['policy_net', 'value_net'])
    actor = ActorSAC(state_space, hidden_dim, action_space)
    critic = CriticModel(state_space, hidden_dim, action_space, use_dist=False)
    rl_agent = model(actor, critic)
    policy = SAC(rl_agent, **kwargs)
    return policy
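The kwargs expanded into SAC() here are not defined in this snippet. Example No. 5 further down passes an explicit dictionary to the same constructor; a template along those lines, with key names taken from that example and purely illustrative values, might be:

kwargs = {
    'buffer_size': int(1e6),
    'batch_size': 256,
    'policy_freq': 2,
    'tau': 0.005,
    'discount': 0.99,
    'policy_lr': 3e-4,
    'value_lr': 3e-4,
    'learn_iteration': 1,
    'verbose': False,
    'act_dim': action_space,  # action dimensionality (called action_space in this example)
}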
Example No. 2
def train():
    model = namedtuple('model', ['policy_net', 'value_net', 'v_net'])
    actor = ActorModel(state_space, hidden_dim, action_space)
    critic = CriticModel(state_space, hidden_dim, action_space)
    v_net = ValueModel(state_space)
    rl_agent = model(actor, critic, v_net)
    policy = SAC(rl_agent,
                 buffer_size=buffer_size,
                 actor_learn_freq=actor_learn_freq,
                 update_iteration=update_iteration,
                 target_update_freq=target_update_freq,
                 target_update_tau=target_update_tau,
                 batch_size=batch_size,
                 learning_rate=lr)
    writer = SummaryWriter(writer_path)

    if not TRAIN:
        policy.load_model(model_save_dir, save_file, load_actor=True)
    mean, std = [], []
    live_time = []

    # Optional warm-up: pre-fill the replay buffer with sampled transitions before learning starts (left disabled here).
    # while policy.warm_up():
    #     sample(env, policy, max_step, warm_up=True)
    #     print(f'Warm up for buffer {policy.buffer.size()}', end='\r')

    for i_eps in range(episodes):
        rewards = sample(env, policy, max_step)
        reward_mean = np.mean(rewards)
        reward_std = np.std(rewards)

        mean.append(reward_mean)
        std.append(reward_std)
        if not TRAIN:
            print(f'EPS:{i_eps + 1}, reward:{round(reward_mean, 3)}')
        else:
            #==============learn==============
            pg_loss, q_loss, v_loss = policy.learn()
            if PLOT:
                live_time.append(reward_mean)
                plot(live_time, POLT_NAME, model_save_dir, 100)
            if WRITER:
                writer.add_scalar('reward', reward_mean, global_step=i_eps)
                writer.add_scalar('loss/pg_loss', pg_loss, global_step=i_eps)
                writer.add_scalar('loss/q_loss', q_loss, global_step=i_eps)
                writer.add_scalar('loss/v_loss', v_loss, global_step=i_eps)

            if i_eps % 5 == 0:
                print(
                    f'EPS:{i_eps}, reward_mean:{round(reward_mean, 3)}, pg_loss:{round(pg_loss, 3)}, q_loss:{round(q_loss, 3)}, v_loss:{round(v_loss, 3)}'
                )
            if i_eps % 200 == 0:
                policy.save_model(model_save_dir,
                                  save_file,
                                  save_actor=True,
                                  save_critic=True)
    writer.close()
    env.close()
    return mean, std
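train() relies on a sample() helper that appears only in truncated form in Examples No. 4 and No. 6. A plausible reconstruction, assuming the policy interface used elsewhere in these examples (choose_action and process) and returning the per-step rewards that np.mean/np.std expect; the optional warm_up flag from the commented-out warm-up loop is omitted:

def sample(env, policy, max_step):
    # Roll out one episode for at most max_step steps and collect the per-step rewards.
    rewards = []
    state = env.reset()
    for step in range(max_step):
        action = policy.choose_action(state)
        next_state, reward, done, info = env.step(action)
        mask = 0 if done else 1  # 0 marks a terminal transition in the replay buffer
        policy.process(s=state, a=action, r=reward, m=mask, s_=next_state)
        rewards.append(reward)
        state = next_state
        if done:
            break
    return rewards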
Example No. 3
                writer.add_scalar('loss/alpha_loss', a_loss, global_step=i_eps)

            if i_eps % 5 == 0:
                print(
                    f'EPS:{i_eps}, reward_mean:{round(reward_mean, 3)}, pg_loss:{round(pg_loss, 3)}, q_loss:{round(q_loss, 3)}, alpha_loss:{round(a_loss, 3)}'
                )
            if i_eps % 200 == 0:
                policy.save_model(model_save_dir,
                                  save_file,
                                  save_actor=True,
                                  save_critic=True)
    writer.close()
    env.close()
    return mean, std


if __name__ == '__main__':
    model = namedtuple('model', ['policy_net', 'value_net', 'v_net'])
    actor = ActorModel(state_space, hidden_dim, action_space)
    critic = CriticModel(state_space, hidden_dim, action_space)
    v_net = ValueModel(state_space)
    rl_agent = model(actor, critic, v_net)
    policy = SAC(rl_agent,
                 buffer_size=buffer_size,
                 actor_learn_freq=actor_learn_freq,
                 update_iteration=update_iteration,
                 target_update_freq=target_update_freq,
                 batch_size=batch_size,
                 use_priority=use_priority)
    writer = SummaryWriter(writer_path)
    train()
Example No. 4
    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x
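
This forward pass is the only part of the network class included in the snippet. Below is a minimal sketch of an enclosing module, under the assumption that it belongs to the ValueModel(state_space) constructed afterwards: a three-layer MLP mapping a state to a scalar value estimate.

import torch.nn as nn
import torch.nn.functional as F

class ValueModel(nn.Module):
    # Hypothetical reconstruction: state -> hidden -> hidden -> scalar value.
    def __init__(self, state_dim, hidden_dim=256):
        super().__init__()
        self.linear1 = nn.Linear(state_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)

    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        return self.linear3(x)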


model = namedtuple('model', ['policy_net', 'value_net', 'v_net'])
actor = ActorModel(state_space, hidden_dim, action_space)
critic = CriticModel(state_space, hidden_dim, action_space)
v_net = ValueModel(state_space)
model = model(actor, critic, v_net)
policy = SAC(model,
             buffer_size=buffer_size,
             actor_learn_freq=actor_learn_freq,
             target_update_freq=target_update_freq,
             batch_size=batch_size)
writer = SummaryWriter(writer_path)

TRAIN = True
PLOT = True
WRITER = False


def sample(env, policy, max_step):
    # rewards = 0
    rewards = []
    state = env.reset()
    for step in range(max_step):
        #==============choose_action==============
Example No. 5
def main(seed):
    if not os.path.exists("./results"):
        os.makedirs("./results")

    env = gym.make(env_name)
    env.seed(seed)
    torch.manual_seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    hidden_dim = 256

    kwargs = {
        'buffer_size': int(1e6),
        'batch_size': 256,
        'policy_freq': 2,
        'tau': 0.005,
        'discount': 0.99,
        'policy_lr': 3e-4,
        'value_lr': 3e-4,
        'learn_iteration': 1,
        'verbose': False,
        'act_dim': action_dim,
        # 'alpha': 1.0,
        # 'use_priority': False,
        # 'use_munchausen': False,
        # 'use_PAL': False,
        # 'n_step': 1,
    }
    
    # file_name = f"MSAC_{env_name}_{seed}_{kwargs['use_priority']}_{kwargs['use_munchausen']}_{kwargs['use_PAL']}"
    file_name = f"SAC_{env_name}_{seed}"
    print("---------------------------------------")
    print(f"Settings: {file_name}")
    print("---------------------------------------")

    model = namedtuple('model', ['policy_net', 'value_net'])
    actor = ActorModel(state_dim, hidden_dim, action_dim)
    critic = CriticModelDist(state_dim, hidden_dim, action_dim, use_dist=False)
    rl_agent = model(actor, critic)
    policy = SAC(rl_agent, **kwargs)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, env_name, seed)]

    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    max_timesteps = 3e6
    start_timesteps = 25e3
    eval_freq = 5e3

    state = env.reset()
    for t in range(int(max_timesteps)):
        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.choose_action(state)
            action = map_action(env, action)
        # Perform action
        next_state, reward, done, _ = env.step(action)
        # env.render()

        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0
        mask = 0 if done_bool else 1

        policy.process(s=state, a=action, r=reward, m=mask, s_=next_state)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= start_timesteps:
            pg_loss, q_loss, a_loss = policy.learn()

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % eval_freq == 0:
            evaluations.append(eval_policy(policy, env_name, seed))
            np.save("./results/%s" % (file_name), evaluations)
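map_action and eval_policy are used in main() but not defined in this excerpt. The sketches below are assumptions based on common practice: map_action rescales a tanh-squashed action from [-1, 1] to the environment's action bounds, and eval_policy averages the return of the current policy over a few episodes on a separately seeded environment.

import gym
import numpy as np

def map_action(env, action):
    # Assumed helper: rescale an action in [-1, 1] to the env's [low, high] box.
    low, high = env.action_space.low, env.action_space.high
    return low + (np.asarray(action) + 1.0) * 0.5 * (high - low)

def eval_policy(policy, env_name, seed, eval_episodes=10):
    # Assumed helper: average episodic return of the current policy on a fresh env.
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)
    avg_reward = 0.0
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            action = map_action(eval_env, policy.choose_action(state))
            state, reward, done, _ = eval_env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    return avg_reward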
Example No. 6
    'policy_freq': 2,
    'tau': 0.005,
    'discount': 0.99,
    'policy_lr': 3e-4,
    'value_lr': 3e-4,
    'learn_iteration': 1,
    'verbose': False,
    'act_dim': action_dim,
}

model = namedtuple('model', ['policy_net', 'value_net', 'v_net'])
actor = ActorModel(state_space, hidden_dim, action_space)
critic = CriticModel(state_space, hidden_dim, action_space)
v_net = ValueModel(state_space)
rl_agent = model(actor, critic, v_net)
policy = SAC(rl_agent, **kwargs)
writer = SummaryWriter(writer_path)

TRAIN = True
PLOT = True
WRITER = False


def sample(env, policy, max_step):
    # rewards = 0
    rewards = []
    state = env.reset()
    for step in range(max_step):
        #==============choose_action==============
        action = policy.choose_action(state)
        next_state, reward, done, info = env.step(action)