Example #1
def rollout_sim_single_step_parallel(task_id, env_name, horizon, actor=None, env=None):
    time_1 = time.time()
    # initialize environment
    if actor is None: actor = gen_actor(env_name, 512)
    if env is None: env = gym.make(env_name)
    # env.seed(task_id)
    # initialize logger
    old_states, new_states, raw_actions, dones, rewards, log_probs, advantages, episode_reward = [], [], [], [], [], [], [], 0.
    # collect episode
    old_obs = env.reset()
    for step in range(horizon):
        # interact with environment
        action, log_prob, raw_action = actor.gen_action(torch.Tensor(old_obs).cuda())
        assert (env.action_space.low < np.array(action)).all() and (np.array(action) < env.action_space.high).all()
        new_obs, reward, done, info = env.step(action)
        # record trajectory step
        old_states.append(old_obs)
        new_states.append(new_obs)
        raw_actions.append(raw_action.view(-1))
        rewards.append(reward)
        dones.append(done)
        log_probs.append(log_prob)
        episode_reward += reward
        # update old observation
        old_obs = new_obs
        # if done:
        #     break
    dones[-1] = True
    time_2 = time.time()
    print("    id={}, reward: {}, episode_time: {:.3f}sec".format(task_id, episode_reward, time_2 - time_1))
    return [old_states, new_states, raw_actions, rewards, dones, log_probs, episode_reward]
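Note: Example #2 below invokes this worker with .remote(...), which requires it to be registered as a Ray task. The original source presumably decorates the definition with @ray.remote; an equivalent after-the-fact registration is sketched below.

import ray

# Register the plain function as a Ray task so that
# rollout_sim_single_step_parallel.remote(...) in Example #2 resolves;
# this is equivalent to decorating the definition above with @ray.remote.
rollout_sim_single_step_parallel = ray.remote(rollout_sim_single_step_parallel)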
Example #2
def parallel_rollout_sim(env_name, env_number, horizon):
    envs = [gym.make(env_name) for _ in range(env_number)]
    actor = gen_actor(env_name, 512)
    critic = gen_critic(env_name, 512)
    rolloutmem = RolloutMemory(env_number * horizon, env_name)
    time_start = time.time()
    episodes_rewards = []
    data = ray.get(
        [rollout_sim_single_step_parallel.remote(i, env_name, horizon, None, None) for i in range(env_number)])
    time_end = time.time()
    for episode in data:
        old_states, new_states, raw_actions, rewards, dones, log_probs, episode_reward = \
            torch.Tensor(episode[0]).cuda(), torch.Tensor(episode[1]).cuda(), torch.stack(episode[2]).detach().cuda(), \
            torch.Tensor(episode[3]).cuda(), torch.Tensor(episode[4]).cuda(), torch.stack(episode[5]).detach().cuda(), \
            torch.Tensor([episode[6]]).cuda()
        gae_deltas = critic.gae_delta(old_states, new_states, rewards, 0.99)
        advantages = torch.Tensor(get_advantage_new(gae_deltas, 0.99, 0.95)).cuda()
        values = get_values(rewards, 0.99).cuda()
        if len(advantages.shape) == 1: advantages = advantages[:, None]
        if len(values.shape) == 1: values = values[:, None]
        rolloutmem.append(old_states, new_states, raw_actions, rewards, dones, log_probs, advantages, values)
        episodes_rewards.append(episode_reward)
    time_reformat = time.time()
    print(
        "parallel_time: {}, reformat_time: {:.3f}\nrollout_time: {:.3f}\ndata_len: {}\navgR: {:.3f}\nsaved_step_num: {}\n\n"
            .format(time_end - time_start, time_reformat - time_end, time_reformat - time_start, len(data),
                    torch.mean(torch.Tensor(episodes_rewards)), rolloutmem.offset))
    return torch.mean(torch.Tensor(episodes_rewards)), time_end - time_start
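The helper functions used above (critic.gae_delta, get_advantage_new, get_values) are not part of this listing. Below is a minimal sketch of what the latter two plausibly compute, assuming standard GAE accumulation over per-step deltas and discounted reward-to-go value targets; the repository's actual implementations may differ in signature and return type.

import torch

def get_advantage_new(gae_deltas, gamma, lam):
    # Accumulate GAE backwards over per-step TD deltas
    # (delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)).
    advantages = torch.zeros_like(gae_deltas)
    running = 0.
    for t in reversed(range(len(gae_deltas))):
        running = gae_deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages

def get_values(rewards, gamma):
    # Discounted reward-to-go, used as regression targets for the critic.
    values = torch.zeros_like(rewards)
    running = 0.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        values[t] = running
    return values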
Example #3
def serial_rollout(env_name, env_number, horizon):
    envs = [gym.make(env_name) for _ in range(env_number)]
    actors = [gen_actor(env_name, 512) for _ in range(env_number)]
    time_start = time.time()
    data = [rollout_single_step(i, envs[i], actors[i], horizon) for i in range(env_number)]
    time_end = time.time()
    print("parallel_time: {}, data:{}".format(time_end - time_start, data))
Example #4
def parallel_rollout(env_name, env_number, horizon):
    actors = [gen_actor(env_name, 512) for _ in range(env_number)]
    time_start = time.time()
    data = ray.get(
        [rollout_single_step_parallel.remote(i, env_name, actors[i], horizon) for i in range(env_number)])
    time_end = time.time()
    print("parallel_time: {}, data:{}".format(time_end - time_start, data))
Example #5
def test_save(env_name):
    iteration = 1000
    actor = gen_actor(env_name, 64)
    critic = gen_critic(env_name, 64)
    rolloutmem = RolloutMemory(5 * 10, env_name)
    envs = [ParallelEnv.remote(env_name, i) for i in range(5)]
    optimizer = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()),
                                 lr=0.0001)
    seed = 123
    tb = SummaryWriter()
    for i in range(100): tb.add_scalar('loss', i, i)
    rollout_time, update_time = AverageMeter(), AverageMeter()
    rollout_time.update(100)
    update_time.update(100)

    save_path = os.path.join("../base/save/model", 'dev_Hopper_resume.tar')
    torch.save({
        'iteration': iteration,
        'seed': seed,
        'actor_state_dict': actor.state_dict(),
        'critic_state_dict': critic.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'rolloutmem': rolloutmem,
        'time_recorder': [rollout_time, update_time],
    }, save_path)
    print("Save Done!")
Example #6
def repeat_rollout(env_name, env_number, horizon, iter_num):
    # ingredients prepare
    time_start = time.time()
    envs = [ParallelEnv.remote(env_name, env_id) for env_id in range(env_number)]
    actor = gen_actor(env_name, 512)
    critic = gen_critic(env_name, 512)
    rolloutmem = RolloutMemory(env_number * horizon, env_name)
    print("    build_time: {}".format(time.time() - time_start))
    # repeat iteration
    for i in range(iter_num):
        print("iter_{}".format(i))
        parallel_rollout_env(envs, actor, critic, rolloutmem, horizon)
    print("Work Done!")
Example #7
def test_state_dict(env_name):
    env = gen_env(env_name)
    actor = gen_actor(env_name, 64)
    critic = gen_critic(env_name, 64)
    rolloutmem = RolloutMemory(50 * 200, env_name)
    optimizer = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=0.0001)
    tb = SummaryWriter()

    for param_tensor in actor.state_dict():
        print(param_tensor, "\t", actor.state_dict()[param_tensor].size())
    for param_tensor in critic.state_dict():
        print(param_tensor, "\t", critic.state_dict()[param_tensor].size())
    for param_tensor in optimizer.state_dict():
        print(param_tensor, "\t", optimizer.state_dict()[param_tensor])
Example #8
def loop_rollout(env_name, env_number, horizon):
    envs = [gym.make(env_name) for _ in range(env_number)]
    actor = gen_actor(env_name, 512)
    time_start = time.time()
    for env in envs:
        time_1 = time.time()
        obs = env.reset()
        total_reward = 0.
        for _ in range(horizon):
            obs, reward, _, _ = env.step(actor.gen_action(torch.Tensor(obs))[0])
            total_reward += reward
        time_2 = time.time()
        print('episode_time={}'.format(time_2 - time_1))
    time_end = time.time()
    print("parallel_time: {}".format(time_end - time_start))
Example #9
def test_load(env_name):
    actor = gen_actor(env_name, 64)
    critic = gen_critic(env_name, 64)
    optimizer = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=0.0001)

    load_path = os.path.join("../base/save/model", 'dev_Hopper_resume.tar')
    checkpoint = torch.load(load_path)

    actor.load_state_dict(checkpoint['actor_state_dict'])
    actor.train()
    critic.load_state_dict(checkpoint['critic_state_dict'])
    critic.train()
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    rolloutmem = checkpoint['rolloutmem']
    iteration = checkpoint['iteration']
    seed = checkpoint['seed']
    [rollout_time, update_time] = checkpoint['time_recorder']
    print("Load Done!")
    print('')
Example #10
def serial_rollout_sim(env_name, env_number, horizon):
    actor = gen_actor(env_name, 512)
    envs = [gym.make(env_name) for _ in range(env_number)]
    for i in range(env_number): envs[i].seed(seed=i)
    data = []
    time_start = time.time()
    for env_id in range(len(envs)):
        env = envs[env_id]
        time_1 = time.time()
        # initialize logger
        old_states, new_states, raw_actions, dones, rewards, log_probs, advantages, episode_reward = \
            [], [], [], [], [], [], [], 0.
        # collect episode
        old_obs = env.reset()
        for step in range(horizon):
            # interact with environment
            action, log_prob, raw_action = actor.gen_action(torch.Tensor(old_obs))
            new_obs, reward, done, info = env.step(action)
            # record trajectory step
            old_states.append(old_obs)
            new_states.append(new_obs)
            raw_actions.append(raw_action)
            rewards.append(reward)
            dones.append(done)
            log_probs.append(log_prob)
            episode_reward += reward
            # update old observation
            old_obs = new_obs
            # if done:
            #     break
        dones[-1] = True
        time_2 = time.time()
        data.append([old_states, new_states, raw_actions, rewards, dones, log_probs, episode_reward])
        print("    env_id={}, reward: {}, episode_time: {:.3f}sec".format(env_id, episode_reward, time_2 - time_1))
    time_end = time.time()
    print("parallel_time: {}\ndata_len:{}\n\n".format(time_end - time_start, len(data)))
Example #11
def train(params):
    # ============
    # Preparations
    # ============
    gc.collect()
    ray.init(log_to_driver=False, local_mode=False,
             num_gpus=1)  # or, ray.init()
    if not params.use_pretrain:
        # algorithm ingredients instantiation
        seed = params.seed
        actor = gen_actor(params.env_name, params.policy_params.hidden_dim)
        critic = gen_critic(params.env_name, params.policy_params.hidden_dim)
        optimizer = torch.optim.Adam(list(actor.parameters()) +
                                     list(critic.parameters()),
                                     lr=params.policy_params.learning_rate)
        rollout_time, update_time = AverageMeter(), AverageMeter()
        iteration_pretrain = 0
        # set random seed (for reproducing experiment)
        os.environ['PYTHONHASHSEED'] = str(seed)
        random.seed(seed)
        torch.manual_seed(seed)
        np.random.seed(seed)
    else:
        # build models
        actor = gen_actor(params.env_name,
                          params.policy_params.hidden_dim).cuda()
        critic = gen_critic(params.env_name,
                            params.policy_params.hidden_dim).cuda()
        optimizer = torch.optim.Adam(list(actor.parameters()) +
                                     list(critic.parameters()),
                                     lr=0.0001)
        # load models
        print("\n\nLoading training checkpoint...")
        print("------------------------------")
        load_path = os.path.join('./save/model', params.pretrain_file)
        checkpoint = torch.load(load_path)
        seed = checkpoint['seed']
        actor.load_state_dict(checkpoint['actor_state_dict'])
        actor.train()
        critic.load_state_dict(checkpoint['critic_state_dict'])
        critic.train()
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        [rollout_time, update_time] = checkpoint['time_recorder']
        iteration_pretrain = checkpoint['iteration']
        # >> set random seed (for reproducing experiment)
        os.environ['PYTHONHASHSEED'] = str(seed)
        random.seed(seed)
        torch.manual_seed(seed)
        np.random.seed(seed)
        print("Loading finished!")
        print("------------------------------\n\n")
    rolloutmem = RolloutMemory(
        params.policy_params.envs_num * params.policy_params.horizon,
        params.env_name)
    envs = [
        ParallelEnv.remote(params.env_name, i)
        for i in range(params.policy_params.envs_num)
    ]
    for i in range(len(envs)):
        envs[i].seed.remote(seed=seed + i)
    tb = SummaryWriter()
    # ============
    # Training
    # ============
    # >> training loop
    print("----------------------------------")
    print("Training model with {} parameters...".format(
        count_model_params(actor) + count_model_params(critic)))
    print("----------------------------------")
    time_start = time.time()
    for iteration in range(int(params.iter_num - iteration_pretrain)):
        # collect rollouts from current policy
        rolloutmem.reset()
        iter_start_time = time.time()
        mean_iter_reward = rollout(rolloutmem, envs, actor, critic, params)
        # optimize by gradient descent
        update_start_time = time.time()
        loss, policy_loss, critic_loss, entropy_loss, advantage, ratio, surr1, surr2, epochs_len = \
            None, None, None, None, None, None, None, None, None
        for epoch in range(params.policy_params.epochs_num):
            loss, policy_loss, critic_loss, entropy_loss, advantage, ratio, surr1, surr2, epochs_len = \
                optimize_step(optimizer, rolloutmem, actor, critic, params, iteration)
        iter_end_time = time.time()
        tb = logger_scalar(tb, iteration + iteration_pretrain, loss,
                           policy_loss, critic_loss, entropy_loss, advantage,
                           ratio, surr1, surr2, epochs_len, mean_iter_reward,
                           time_start)
        # tb = logger_histogram(tb, iteration + iteration_pretrain, actor, critic)
        rollout_time.update(update_start_time - iter_start_time)
        update_time.update(iter_end_time - update_start_time)
        tb.add_scalar('rollout_time', rollout_time.val,
                      iteration + iteration_pretrain)
        tb.add_scalar('update_time', update_time.val,
                      iteration + iteration_pretrain)
        print(
            'it {}: avgR: {:.3f} avgL: {:.3f} | rollout_time: {:.3f}sec update_time: {:.3f}sec'
            .format(iteration + iteration_pretrain, mean_iter_reward,
                    epochs_len, rollout_time.val, update_time.val))
        # save rollout video
        if (iteration + 1) % int(params.plotting_iters) == 0 \
                and iteration > 0 \
                and params.log_video \
                and params.env_name not in envnames_classiccontrol:
            log_policy_rollout(
                params, actor, params.env_name,
                'iter-{}'.format(iteration + iteration_pretrain))
        # save model
        if (iteration + 1) % int(
                params.checkpoint_iter
        ) == 0 and iteration > 0 and params.save_checkpoint:
            save_model(params.prefix, iteration, iteration_pretrain, seed,
                       actor, critic, optimizer, rollout_time, update_time)
    # save rollout videos
    if params.log_video:
        save_model(params.prefix, params.iter_num, iteration_pretrain, seed,
                   actor, critic, optimizer, rollout_time, update_time)
        if params.env_name not in envnames_classiccontrol:
            for i in range(3):
                log_policy_rollout(params, actor, params.env_name,
                                   'final-{}'.format(i))
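train() only reads attributes from the params object, so a lightweight namespace is enough to drive it. The sketch below is a hypothetical invocation with placeholder values, not the project's actual configuration.

from types import SimpleNamespace

if __name__ == '__main__':
    # Placeholder hyperparameters; the attribute names mirror those read in train().
    policy_params = SimpleNamespace(
        hidden_dim=64, learning_rate=3e-4,
        envs_num=4, horizon=200, epochs_num=10)
    params = SimpleNamespace(
        env_name='Hopper-v2', seed=123, use_pretrain=False,
        pretrain_file='', prefix='dev', iter_num=1000,
        plotting_iters=100, checkpoint_iter=100,
        log_video=False, save_checkpoint=True,
        policy_params=policy_params)
    train(params)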