Example No. 1
import gym
from tqdm import trange

# SubprocVecEnv (accepting `context` and `in_series`) matches the OpenAI baselines
# vectorized-env implementation; NonLinearDynamics is a project-specific dynamics
# model. Neither import is shown in this snippet.


def make_env(env_id):
    def _f():
        env = gym.make(env_id)
        return env

    return _f


env_id = "Reach-v3"
n_envs = 16

envs = [make_env(env_id) for _ in range(n_envs)]
envs = SubprocVecEnv(envs, context='fork', in_series=4)
states = envs.reset()

env = gym.make(env_id)
max_steps = env._max_episode_steps
state_dim = env.observation_space['observation'].shape[0]
action_dim = env.action_space.shape[0]

dynamics = NonLinearDynamics(
    state_dim=envs.observation_space['observation'].shape[0],
    action_dim=envs.action_space.shape[0],
    device='cuda:0')

print("[INFO] Model learning")
for epoch in trange(50):
    # data collection stage
    pass
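
The body of the training loop is cut off here in the source. For orientation only, the following is a minimal, self-contained sketch of what one data-collection-and-fit epoch for a learned dynamics model can look like; a small PyTorch MLP stands in for the project's NonLinearDynamics class, and the environment, buffer handling, and hyperparameters are placeholder assumptions.

# Hypothetical sketch only -- not the original loop body.
import gym
import numpy as np
import torch
import torch.nn as nn

env = gym.make("Pendulum-v1")  # stand-in environment (Reach-v3 is project-specific)
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

# a plain MLP standing in for NonLinearDynamics: predicts the next state
model = nn.Sequential(nn.Linear(obs_dim + act_dim, 64), nn.ReLU(),
                      nn.Linear(64, obs_dim))
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

# data collection stage: roll out random actions (old gym reset/step API, as in the listing)
obs_buf, act_buf, next_buf = [], [], []
state = env.reset()
for _ in range(1000):
    action = env.action_space.sample()
    next_state, reward, done, info = env.step(action)
    obs_buf.append(state)
    act_buf.append(action)
    next_buf.append(next_state)
    state = env.reset() if done else next_state

# model fitting stage: one gradient step on a one-step prediction loss
x = torch.as_tensor(np.concatenate([obs_buf, act_buf], axis=1), dtype=torch.float32)
y = torch.as_tensor(np.asarray(next_buf), dtype=torch.float32)
loss = nn.functional.mse_loss(model(x), y)
opt.zero_grad()
loss.backward()
opt.step()
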
Example No. 2
def main(mode, device):
    # hyperparameters and code shared by both modes
    n_epochs = 50000
    gamma = 0.999
    tau = 5e-3
    batch_size = 256
    model_name = "lander_1"
    writer_name = f"./runs/{model_name}"

    writer = SummaryWriter(writer_name)

    if mode == 'multi_env':

        def make_env(env_id):
            def _f():
                env = gym.make(env_id)
                return env

            return _f

        env_id = "LunarLanderContinuous-v2"
        n_envs = 48

        envs = [make_env(env_id) for _ in range(n_envs)]
        envs = SubprocVecEnv(envs, context='fork', in_series=6)
        states = envs.reset()

        test_env = gym.make(env_id)

        replay_buffer = ExperienceReplay(size=int(1e7 / n_envs))

        agent = DDPGAgent(
            observation_space_shape=envs.observation_space.shape[0],
            action_space_shape=envs.action_space.shape[0],
            action_ranges=(envs.action_space.low[0],
                           envs.action_space.high[0]),
            gamma=gamma,
            tau=tau,
            q_lr=3e-4,
            policy_lr=3e-4,
            device=device,
        )

        pretrained = False
        if pretrained:
            agent.load_pretrained_models('reach_1')

        epoch_delay = 50

        for epoch in trange(n_epochs):
            for step in range(1000):
                if epoch < epoch_delay:
                    actions = np.array(
                        [envs.action_space.sample() for _ in range(n_envs)])
                else:
                    actions = agent.select_action(states)
                next_states, rewards, dones, info = envs.step(actions)
                replay_buffer.put(states, actions, rewards, next_states, dones)
                if epoch > epoch_delay:
                    # Training
                    batch = replay_buffer.sample(batch_size)
                    # entropy_loss, alpha
                    q_1_loss, policy_loss, mean_q = agent.train(batch)

                    writer.add_scalar(
                        "Q1_loss", q_1_loss,
                        epoch * test_env._max_episode_steps + step)
                    writer.add_scalar(
                        "Policy_loss", policy_loss,
                        epoch * test_env._max_episode_steps + step)
                    writer.add_scalar(
                        "Mean_Q", mean_q,
                        epoch * test_env._max_episode_steps + step)

                states = next_states
                if np.all(dones):
                    states = envs.reset()
                    break

            ep2log = 50
            if (epoch + 1) % ep2log == 0 and epoch > epoch_delay:
                agent.save_models(model_name)
                # testing
                state = test_env.reset()
                rewards_sum = 0
                for _ in range(1000):
                    action = agent.select_action(state, evaluate=True)
                    next_state, reward, done, info = test_env.step(action)
                    rewards_sum += reward
                    if done:
                        writer.add_scalar("Episode reward sum",
                                          rewards_sum,
                                          global_step=(epoch + 1) // ep2log)
                        break

    else:
        replay_buffer = ExperienceReplay(size=1000000,
                                         mode=mode,
                                         device=device)

        env = gym.make('Reach-v1')
        state = env.reset()

        agent = DDPGAgent(
            observation_space_shape=env.observation_space["observation"].shape[0],
            action_space_shape=env.action_space.shape[0],
            action_ranges=(env.action_space.low[0], env.action_space.high[0]),
            gamma=gamma,
            tau=tau,
            q_lr=1e-4,
            policy_lr=1e-4,
            device=device,
            mode='single_env',
        )

        for epoch in trange(n_epochs):
            for step in range(1000):
                action = agent.select_action(state)
                next_state, reward, done, info = env.step(action)
                replay_buffer.put(state, action, reward, next_state, done)
                # replay_buffer.collect_episodes(state, weights, rewards, next_states, dones)
                state = next_state
                if done:
                    # replay_buffer.store_episodes()
                    state = env.reset()
                    if len(replay_buffer) > batch_size:
                        # Training
                        batch = replay_buffer.sample(batch_size)
                        update_alpha = False
                        if update_alpha:
                            q_1_loss, q_2_loss, policy_loss, entropy_loss, alpha = agent.train(
                                batch)
                        else:
                            q_1_loss, q_2_loss, policy_loss = agent.train(
                                batch)
                        writer.add_scalar("Q1_loss", q_1_loss, epoch)
                        writer.add_scalar("Q2_loss", q_2_loss, epoch)
                        writer.add_scalar("Policy_loss", policy_loss, epoch)
                        if (epoch + 1) % 500 == 0:
                            distance = np.linalg.norm(state['desired_goal'] -
                                                      state['achieved_goal'])
                            writer.add_scalar("Evaluation distance",
                                              distance,
                                              global_step=(epoch + 1) // 500)
                            writer.add_scalar("Success",
                                              info['is_success'],
                                              global_step=(epoch + 1) // 500)

                    break

            if (epoch + 1) % 10000 == 0:
                agent.save_models('sac_8')
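
The listing does not show how main(mode, device) is invoked. A typical command-line entry point would look like the sketch below; the flag names and defaults are assumptions, not taken from the source.

# Hypothetical entry point for the script above.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', default='multi_env',
                        choices=['multi_env', 'single_env'])
    parser.add_argument('--device', default='cuda:0')
    args = parser.parse_args()
    main(args.mode, args.device)
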
Example No. 3
def main(mode, device):
    # hyperparameters and code shared by both modes
    n_epochs = 5000
    gamma = 0.999
    tau = 5e-3
    batch_size = 64
    model_name = "reachV2run_1"
    writer_name = f"./runs/{model_name}"

    writer = SummaryWriter(writer_name)

    if mode == 'multi_env':

        def make_env(env_id):
            def _f():
                env = gym.make(env_id)
                return env

            return _f

        env_id = "Reach-v2"
        n_envs = 32

        envs = [make_env(env_id) for _ in range(n_envs)]
        envs = SubprocVecEnv(envs, context='fork', in_series=4)
        states = envs.reset()

        test_env = gym.make(env_id)
        n_steps = test_env._max_episode_steps
        env_params = {
            'obs': test_env.observation_space['observation'].shape[0],
            'actions': test_env.action_space.shape[0],
            'goals': test_env.observation_space['achieved_goal'].shape[0],
            'reward_function': test_env.compute_reward,
            'max_episode_timesteps': n_steps
        }

        replay_buffer = HindsightExperienceReplay(env_params=env_params,
                                                  size=1000000,
                                                  n_envs=n_envs,
                                                  k=16,
                                                  use_achieved_goal=False)

        agent = TD3Agent(
            observation_dim=envs.observation_space["observation"].shape[0],
            goal_dim=envs.observation_space["achieved_goal"].shape[0],
            action_dim=envs.action_space.shape[0],
            action_ranges=(envs.action_space.low[0],
                           envs.action_space.high[0]),
            gamma=gamma,
            tau=tau,
            q_lr=3e-4,
            policy_lr=3e-4,
            device=device,
            image_as_state=False)

        pretrained = False
        if pretrained:
            agent.load_pretrained_models('pick_td3_1')

        for epoch in trange(n_epochs):
            for step in range(n_steps):
                iteration = n_envs * (epoch * n_steps + step)
                actions = agent.select_action(states)
                next_states, rewards, dones, info = envs.step(actions)
                replay_buffer.collect_episodes(states, actions, rewards,
                                               next_states, dones)
                if epoch > 200:
                    # Training
                    batch = replay_buffer.sample(batch_size)
                    agent.train(batch, iteration, writer)

                states = next_states
                if np.all(dones):
                    states = envs.reset()
                    replay_buffer.store_episodes()
                    writer.add_scalar(
                        "Success_rate",
                        round(
                            sum([_info['is_success']
                                 for _info in info]) / n_envs, 3), iteration)
                    break

            ep2log = 100
            if (epoch + 1) % ep2log == 0:
                agent.save_models(model_name)
                if not os.path.exists('./figures'):
                    os.mkdir('./figures')
                # testing
                success = 0
                rewards_sum = 0
                for _ in range(10):
                    state = test_env.reset()
                    for _ in range(n_steps):
                        action = agent.select_action(state, evaluate=True)
                        next_state, reward, done, info = test_env.step(action)
                        rewards_sum += reward
                        if done:
                            if info['is_success']:
                                success += 1
                            break

                writer.add_scalar("Test_average_rewards", rewards_sum / 10,
                                  n_envs * epoch * n_steps)
                writer.add_scalar("Test_success_rate", round(success / 10, 5),
                                  n_envs * epoch * n_steps)

    else:
        replay_buffer = ExperienceReplay(size=1000000,
                                         mode=mode,
                                         device=device)

        env = gym.make('Reach-v1')
        state = env.reset()

        agent = TD3Agent(
            observation_dim=env.observation_space["observation"].shape[0],
            goal_dim=env.observation_space["achieved_goal"].shape[0],
            action_dim=env.action_space.shape[0],
            action_ranges=(env.action_space.low[0], env.action_space.high[0]),
            gamma=gamma,
            tau=tau,
            q_lr=1e-4,
            policy_lr=1e-4,
            device=device,
            mode='single_env',
            image_as_state=False)

        for epoch in trange(n_epochs):
            for step in range(1000):
                action = agent.select_action(state)
                next_state, reward, done, info = env.step(action)
                replay_buffer.put(state, action, reward, next_state, done)
                # replay_buffer.collect_episodes(state, actions, rewards, next_states, dones)
                state = next_state
                if done:
                    # replay_buffer.store_episodes()
                    state = env.reset()
                    if len(replay_buffer) > batch_size:
                        # Training
                        batch = replay_buffer.sample(batch_size)
                        q_1_loss, policy_loss = agent.train(batch)
                        writer.add_scalar("Q1_loss", q_1_loss, epoch)
                        writer.add_scalar("Policy_loss", policy_loss, epoch)
                        if (epoch + 1) % 500 == 0:
                            distance = np.linalg.norm(state['desired_goal'] -
                                                      state['achieved_goal'])
                            writer.add_scalar("Evaluation distance",
                                              distance,
                                              global_step=(epoch + 1) // 500)
                            writer.add_scalar("Success",
                                              info['is_success'],
                                              global_step=(epoch + 1) // 500)

                    break

            if (epoch + 1) % 10000 == 0:
                agent.save_models('sac_8')
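
HindsightExperienceReplay is project code and its internals are not shown above. What it is configured to do here (k hindsight goals per transition, with rewards recomputed through the environment's compute_reward) is the "future" relabeling strategy from Hindsight Experience Replay; the sketch below illustrates that idea with generic, illustrative names and is not the project's implementation.

# Generic sketch of HER "future" goal relabeling (illustrative only).
import numpy as np


def relabel_episode(episode, compute_reward, k=4, rng=np.random):
    """episode: list of transition dicts with keys 'desired_goal',
    'achieved_goal' (the goal actually reached after the step), plus
    whatever else the agent stores (obs, action, next_obs, ...)."""
    relabeled = []
    T = len(episode)
    for t, tr in enumerate(episode):
        relabeled.append(dict(tr))  # keep the original transition
        for _ in range(k):          # add k hindsight copies
            future = rng.randint(t, T)                   # pick a future step
            new_goal = episode[future]['achieved_goal']  # pretend it was the goal
            new_tr = dict(tr)
            new_tr['desired_goal'] = new_goal
            new_tr['reward'] = compute_reward(tr['achieved_goal'], new_goal, None)
            relabeled.append(new_tr)
    return relabeled
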
Example No. 4
def main(device):
    # hyperparameters and common setup
    n_epochs = 5000
    n_substeps = 10
    gamma = 0.999
    tau = 5e-3
    batch_size = 128
    hidden_dim = 10
    model_name = "reach_image_2"
    writer_name = f"./runs/{model_name}"

    writer = SummaryWriter(writer_name)

    def make_env(env_id):
        def _f():
            env = gym.make(env_id)
            return env

        return _f

    env_id = "Reach-v0"
    n_envs = 32

    envs = [make_env(env_id) for _ in range(n_envs)]
    envs = SubprocVecEnv(envs, context='fork', in_series=4)
    states = envs.reset()

    test_env = gym.make(env_id)
    n_steps = test_env._max_episode_steps
    env_params = {
        'obs': hidden_dim,
        'actions': test_env.action_space.shape[0],
        'goals': test_env.observation_space['achieved_goal'].shape[0],
        'reward_function': test_env.compute_reward,
        'max_episode_timesteps': n_steps
    }

    img_buf = ImageBuffer(size=10000, device=device)
    img_buf.put(states)

    agent = CURL_SACAgent(
        hidden_dim=hidden_dim,
        goal_dim=envs.observation_space["achieved_goal"].shape[0],
        action_dim=envs.action_space.shape[0],
        action_ranges=(envs.action_space.low[0], envs.action_space.high[0]),
        gamma=gamma,
        tau=tau,
        alpha=1,
        q_lr=3e-4,
        alpha_lr=3e-4,
        policy_lr=3e-4,
        device=device)

    replay_buffer = HindsightExperienceReplay(env_params=env_params,
                                              size=1000000,
                                              n_envs=n_envs,
                                              use_achieved_goal=True,
                                              k=8)

    pretrained = False
    if pretrained:
        agent.load_pretrained_models('reach_1')

    epoch_delay = 20

    for epoch in trange(n_epochs):
        for step in range(n_steps):
            encoded_states = agent.encode_obs(states, to_numpy=True)
            actions = agent.select_action(encoded_states)
            next_states, rewards, dones, info = envs.step(actions)
            encoded_next_states = agent.encode_obs(next_states, to_numpy=True)
            img_buf.put(next_states['observation'])
            replay_buffer.collect_episodes(encoded_states, actions, rewards,
                                           encoded_next_states, dones)
            # Training
            if epoch > epoch_delay:
                # CURL training
                for inner_step in range(n_substeps):
                    obs_batch = img_buf.sample(batch_size=256)
                    contrastive_loss = agent.train_encoder(obs_batch)
                    writer.add_scalar(
                        "Contrastive_loss", contrastive_loss,
                        n_envs * (epoch * n_steps * n_substeps +
                                  step * n_substeps + inner_step))
                # RL training
                batch = replay_buffer.sample(batch_size)
                q_1_loss, q_2_loss, policy_loss, mean_q, entropy_loss, alpha = agent.train(
                    batch, update_alpha=True)
                # logging
                writer.add_scalar("Q1_loss", q_1_loss,
                                  n_envs * (epoch * n_steps + step))
                writer.add_scalar("Q2_loss", q_2_loss,
                                  n_envs * (epoch * n_steps + step))
                writer.add_scalar("Policy_loss", policy_loss,
                                  n_envs * (epoch * n_steps + step))
                writer.add_scalar("Mean_Q", mean_q,
                                  n_envs * (epoch * n_steps + step))
                writer.add_scalar("Entropy loss", entropy_loss,
                                  n_envs * (epoch * n_steps + step))
                writer.add_scalar("Alpha", alpha,
                                  n_envs * (epoch * n_steps + step))
                writer.add_scalar(
                    "Success_rate",
                    round(
                        sum([_info['is_success'] for _info in info]) / n_envs,
                        2), n_envs * (epoch * n_steps + step))

            states = next_states
            if np.all(dones):
                states = envs.reset()
                replay_buffer.store_episodes()
                break

        ep2log = 20
        if (epoch + 1) % ep2log == 0 and epoch > epoch_delay:
            agent.save_models(model_name)
            if not os.path.exists('./figures'):
                os.mkdir('./figures')
            # testing
            success = 0
            rewards_sum = 0
            for _ in range(10):
                state = test_env.reset()
                for _ in range(n_steps):
                    action = agent.select_action(state, evaluate=True)
                    next_state, reward, done, info = test_env.step(action)
                    rewards_sum += reward
                    if done:
                        if info['is_success']:
                            success += 1
                        break

            writer.add_scalar("Test_average_rewards", rewards_sum / 10,
                              n_envs * epoch * n_steps)
            writer.add_scalar("Test_success_rate", round(success / 10, 5),
                              n_envs * epoch * n_steps)
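
agent.train_encoder belongs to the project's CURL_SACAgent and is not shown in the listing. In CURL, the encoder is trained with an InfoNCE contrastive loss computed over two augmented crops of the same image batch; the sketch below illustrates that objective with generic names and should not be read as the project's implementation.

# Generic sketch of a CURL-style InfoNCE loss (illustrative only).
import torch
import torch.nn.functional as F


def curl_infonce_loss(query_encoder, key_encoder, W, obs_q, obs_k):
    """obs_q, obs_k: two random crops of the same image batch, shape (B, C, H, W).
    W is a learnable (D, D) bilinear matrix; key_encoder is typically a
    momentum copy of query_encoder."""
    z_q = query_encoder(obs_q)                 # anchors, shape (B, D)
    with torch.no_grad():
        z_k = key_encoder(obs_k)               # positives from the momentum encoder
    logits = z_q @ W @ z_k.t()                 # pairwise similarities, shape (B, B)
    logits = logits - logits.max(dim=1, keepdim=True).values  # numerical stability
    labels = torch.arange(logits.size(0), device=logits.device)
    return F.cross_entropy(logits, labels)     # the diagonal entries are positives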