Example #1
running_state = ZFilter((state_dim,), clip=5)  # online running mean/std normalizer for observations
# running_reward = ZFilter((1,), demean=False, clip=10)
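# The ZFilter observation normalizer used above is not defined in these snippets.
# Below is a minimal sketch of a running mean/std filter with clipping that matches
# the call ZFilter((state_dim,), clip=5); the class and attribute names here are
# assumptions, not the original implementation.
import numpy as np


class RunningStat:
    """Welford-style running mean/variance over observation vectors."""

    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.sum_sq = np.zeros(shape)

    def push(self, x):
        x = np.asarray(x)
        self.n += 1
        if self.n == 1:
            self.mean[...] = x
        else:
            old_mean = self.mean.copy()
            self.mean += (x - old_mean) / self.n
            self.sum_sq += (x - old_mean) * (x - self.mean)

    @property
    def std(self):
        var = self.sum_sq / (self.n - 1) if self.n > 1 else np.square(self.mean)
        return np.sqrt(var)


class ZFilter:
    """Normalize observations to zero mean / unit std, then clip them."""

    def __init__(self, shape, demean=True, destd=True, clip=10.0):
        self.demean, self.destd, self.clip = demean, destd, clip
        self.rs = RunningStat(shape)

    def __call__(self, x, update=True):
        if update:
            self.rs.push(x)
        if self.demean:
            x = x - self.rs.mean
        if self.destd:
            x = x / (self.rs.std + 1e-8)
        if self.clip:
            x = np.clip(x, -self.clip, self.clip)
        return x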
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim,
                            env_dummy.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=0.01)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=0.01)
"""create agent"""
agent = Agent(env_factory,
              policy_net,
              device,
              running_state=running_state,
              render=args.render,
              num_threads=args.num_threads)


def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    # (the rest of update_params is truncated in this example; Example #2 shows the full PPO update)
Example #2
def train(**kwargs):
    print('here')
    config = {
        "lr": kwargs['lr'],
        "gamma": kwargs['gamma']
    }
    dtype = torch.float64
    torch.set_default_dtype(dtype)
    device = torch.device('cuda', index=args.gpu_index) if torch.cuda.is_available() else torch.device('cpu')
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu_index)

    """environment"""
    env = gym.make(args.env_name)
    state_dim = env.observation_space.shape[0]
    is_disc_action = len(env.action_space.shape) == 0  # discrete action spaces have an empty shape
    running_state = ZFilter((state_dim,), clip=5)
    # running_reward = ZFilter((1,), demean=False, clip=10)

    """seeding"""
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env.seed(args.seed)

    # """define actor and critic"""
    if args.model_path is None:
        if is_disc_action:
            policy_net = DiscretePolicy(state_dim, env.action_space.n)
        else:
            policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
        value_net = Value(state_dim)
    else:
        policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
    policy_net.to(device)
    value_net.to(device)

    # optimization epoch number and batch size for PPO
    optim_epochs = 10
    optim_batch_size = 64

    """create agent"""
    agent = Agent(env, policy_net, device, running_state=running_state, render=args.render, num_threads=args.num_threads)

    def update_params(batch, i_iter, config, optimizer_policy, optimizer_value):
        states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)

        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values, config['gamma'], args.tau, device)

        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / optim_batch_size))
        for _ in range(optim_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = torch.LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(i * optim_batch_size, min((i + 1) * optim_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1, states_b, actions_b, returns_b,
                         advantages_b, fixed_log_probs_b, args.clip_epsilon, args.l2_reg)

    def main_loop(config):
        optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=config['lr'])
        optimizer_value = torch.optim.Adam(value_net.parameters(), lr=config['lr'])
        for i_iter in range(args.max_iter_num):
            """generate multiple trajectories that reach the minimum batch_size"""
            batch, log = agent.collect_samples(args.min_batch_size)
            t0 = time.time()
            update_params(batch, i_iter, config, optimizer_policy, optimizer_value)
            t1 = time.time()

            if i_iter % args.log_interval == 0:
                print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                    i_iter, log['sample_time'], t1-t0, log['min_reward'], log['max_reward'], log['avg_reward']))

            if args.save_model_interval > 0 and (i_iter+1) % args.save_model_interval == 0:
                to_device(torch.device('cpu'), policy_net, value_net)
                pickle.dump((policy_net, value_net, running_state),
                            open(os.path.join(assets_dir(), 'learned_models/{}_ppo.p'.format(args.env_name)), 'wb'))
                to_device(device, policy_net, value_net)

        #     """clean up gpu memory"""
            torch.cuda.empty_cache()
        return agent.evaluate()

    print('a')
    print(config)
    print(args)
    return main_loop(config)
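# The ppo_step helper called inside update_params is not shown in these snippets.
# Below is a minimal sketch of a clipped-surrogate PPO update that matches the call
# signature used above; it is an assumption about the helper, not its original
# implementation (the 5th positional argument is read here as the number of
# value-fitting steps, and the gradient-clipping threshold is an arbitrary choice).
import torch


def ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, value_iters,
             states, actions, returns, advantages, fixed_log_probs,
             clip_epsilon, l2_reg):
    # Critic: regress V(s) onto the returns, with an explicit L2 penalty.
    for _ in range(value_iters):
        values_pred = value_net(states)
        value_loss = (values_pred - returns).pow(2).mean()
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * l2_reg
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

    # Actor: maximize the clipped surrogate objective.
    log_probs = policy_net.get_log_prob(states, actions)
    ratio = torch.exp(log_probs - fixed_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 40)
    optimizer_policy.step()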
Example #3
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        # Hierarchical setup: a discrete manager policy that outputs one of 7 subgoals
        # and a continuous worker policy conditioned on the state concatenated with a
        # subgoal encoding of size subgoal_dim.
        policy_mgr = DiscretePolicy(state_dim, 7)
        policy_wrk = Policy(state_dim + subgoal_dim,
                            env.action_space.shape[0],
                            log_std=args.log_std)
    value_mgr = Value(state_dim)
    value_wrk = Value(state_dim + subgoal_dim)
else:
    # NOTE: this checkpoint stores (policy_net, value_net, running_state), which does
    # not match the manager/worker networks used below; see the hypothetical checkpoint
    # helpers sketched at the end of this example.
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

# optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
# optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
# optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
# optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)
"""create agent"""
agent = Agent(env,
              policy_mgr,
              policy_wrk,
              device,
              running_state=running_state,
              render=args.render,
              num_threads=args.num_threads)
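# The load branch above unpacks (policy_net, value_net, running_state), which does not
# match the four manager/worker networks the rest of this example uses. A pair of
# hypothetical checkpoint helpers that mirror the pickle pattern of the other examples
# (the function names and tuple layout are assumptions):
import pickle

import torch


def save_hrl_checkpoint(path, policy_mgr, policy_wrk, value_mgr, value_wrk, running_state):
    # Move the networks to the CPU before pickling so the checkpoint also loads on
    # machines without a GPU.
    cpu = torch.device('cpu')
    for net in (policy_mgr, policy_wrk, value_mgr, value_wrk):
        net.to(cpu)
    with open(path, 'wb') as f:
        pickle.dump((policy_mgr, policy_wrk, value_mgr, value_wrk, running_state), f)


def load_hrl_checkpoint(path, device):
    with open(path, 'rb') as f:
        policy_mgr, policy_wrk, value_mgr, value_wrk, running_state = pickle.load(f)
    for net in (policy_mgr, policy_wrk, value_mgr, value_wrk):
        net.to(device)
    return policy_mgr, policy_wrk, value_mgr, value_wrk, running_state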
Example #4
def learn_model(args):

    print("RL result will be saved at %s" % args.rl_filename)
    print("RL model will be saved at %s" % args.rl_model_filename)
    if use_gpu:
        print("Using CUDA.")

    torch.manual_seed(args.rl_seed)
    if use_gpu:
        torch.cuda.manual_seed_all(args.rl_seed)
        torch.backends.cudnn.deterministic = True
    np.random.seed(args.rl_seed)
    random.seed(args.rl_seed)

    env = gym.make(args.env_name)
    env.seed(args.rl_seed)

    env_test = gym.make(args.env_name)
    env_test.seed(args.rl_seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    a_bound = float(env.action_space.high[0])
    a_low = float(env.action_space.low[0])
    assert a_bound == -a_low, "expects a symmetric action range"

    # Binary flag: manually clip actions passed to the step function after adding Gaussian noise.
    clip = (args.env_name == "LunarLanderContinuous-v2"
            or args.env_name == "BipedalWalker-v2")

    print(env.observation_space)
    print(env.action_space)
    """define actor and critic"""
    policy_net = Policy(state_dim,
                        action_dim,
                        log_std=args.log_std,
                        a_bound=a_bound,
                        hidden_size=args.hidden_size,
                        activation=args.activation).to(device)
    value_net = Value(state_dim,
                      hidden_size=args.hidden_size,
                      activation=args.activation).to(device)

    optimizer_value = torch.optim.Adam(value_net.parameters(),
                                       lr=args.learning_rate_v)
    decayed_lambda_td = args.lambda_td

    def update_params_c(batch, i_iter):
        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(
            np.float32)).to(device)
        """get advantage estimation from the trajectories"""
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(
            rewards, masks, values, args.gamma, args.tau)

        if args.lamret:
            returns = lambda_returns
        else:
            returns = mc_returns
        """perform critic update"""
        #gae_step(value_net, optimizer_value, states, lambda_returns, args.l2_reg)  # full batch GD
        gae_step_epoch(value_net, optimizer_value, states, returns,
                       args.l2_reg)  # Stochastic GD

    """ Function to update the parameters of value and policy networks"""

    def update_params_p(batch, i_iter):

        nonlocal decayed_lambda_td

        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        next_states = torch.from_numpy(np.stack(
            batch.next_state)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(
            np.float32)).to(device)
        """get advantage estimation from the trajectories, this is done after gae_step update"""
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(
            rewards, masks, values, gamma=args.gamma, tau=args.tau)

        if args.method_name == "TRPO-RET-MC":
            # detach() does not matter here since we backprop through the policy network only.
            returns = mc_returns.detach()
        elif args.method_name == "TRPO-RET-GAE":
            returns = lambda_returns.detach()  # likewise, detach() does not matter here.
        else:
            returns = 0  # returns is not used for TRPO and TRPO-TD.

        # Standardize the advantages (mean-std) or only scale them by their std.
        if args.mgae:
            advantages = (advantages - advantages.mean()) / advantages.std()  # mean-std version
        else:
            advantages = advantages / advantages.std()  # std-only version

        trpo_step_td(policy_net=policy_net, value_net=value_net, states=states, actions=actions, next_states=next_states, rewards=rewards, masks=masks, gamma=args.gamma, advantages=advantages, \
            max_kl=args.max_kl, damping=args.damping, \
            lambda_td=decayed_lambda_td, method_name=args.method_name, returns=returns, mtd=args.mtd)
        """ decay the td_reg parameter after update """
        decayed_lambda_td = decayed_lambda_td * args.decay_td

    """create agent"""
    agent = Agent(env, policy_net, render=False)
    agent_test = Agent(env_test,
                       policy_net,
                       mean_action=True,
                       render=args.render)
    """ The actual learning loop"""
    for i_iter in range(args.rl_max_iter_num):
        """ Save the learned policy model """
        if (args.rl_save_model_interval > 0 and i_iter % args.rl_save_model_interval == 0) \
            or (i_iter + 1 == args.rl_max_iter_num) or i_iter == 0:

            policy_net = policy_net.to(device_cpu)
            value_net = value_net.to(device_cpu)

            pickle.dump((policy_net, value_net),
                        open(args.rl_model_filename + ("_I%d.p" % (i_iter)),
                             'wb'))

            policy_net = policy_net.to(device)
            value_net = value_net.to(device)
        """ Test the policy before update """
        if i_iter % args.log_interval == 0 or i_iter + 1 == args.rl_max_iter_num:
            _, log_test = agent_test.collect_samples_test(max_num_episodes=20,
                                                          render=args.render,
                                                          clip=clip)
        """generate multiple trajectories that reach the minimum batch_size"""
        t0 = time.time()
        batch, log = agent.collect_samples_train(
            args.min_batch_size, render=False,
            clip=clip)  # this is on-policy samples
        t1 = time.time()
        """ update parameters """
        t0_d = time.time()
        update_params_c(batch, i_iter)  #critic update
        update_params_p(batch, i_iter)  #actor update
        t1_d = time.time()
        """ Print out result to stdout and save it to a text file for later usage"""
        if i_iter % args.log_interval == 0:

            result_text = t_format("Iter %6d (%2.2fs)+(%2.2fs)" %
                                   (i_iter, t1 - t0, t1_d - t0_d))
            result_text += " | [R] " + t_format(
                "Avg: %.2f (%.2f)" % (log['avg_reward'], log['std_reward']), 2)
            result_text += " | [R_test] " + t_format("min: %.2f" % log_test['min_reward'], 1) + t_format("max: %.2f" % log_test['max_reward'], 1) \
                            + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2)
            print(result_text)

            with open(args.rl_filename, 'a') as f:
                print(result_text, file=f)
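# estimate_advantages is used in this example with a three-value return (advantages,
# lambda-returns, Monte-Carlo returns) and without a device argument, i.e. a different
# variant than the one called in Example #2. A minimal GAE(lambda) sketch consistent
# with this call is shown below; it is an assumption about the helper, not its
# original implementation.
import torch


def estimate_advantages(rewards, masks, values, gamma, tau):
    # rewards/masks/values cover one concatenated batch of trajectories;
    # masks[t] == 0 marks the last step of an episode.
    T = rewards.size(0)
    deltas = torch.zeros_like(values)        # TD residuals
    advantages = torch.zeros_like(values)    # GAE(lambda) advantages
    mc_returns = torch.zeros_like(values)    # discounted Monte-Carlo returns

    prev_value, prev_advantage, prev_return = 0.0, 0.0, 0.0
    for t in reversed(range(T)):
        deltas[t] = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = deltas[t] + gamma * tau * prev_advantage * masks[t]
        mc_returns[t] = rewards[t] + gamma * prev_return * masks[t]
        prev_value = values[t]
        prev_advantage = advantages[t]
        prev_return = mc_returns[t]

    lambda_returns = values + advantages     # GAE-based targets for the critic
    # Standardization of the advantages is left to the caller, as done above.
    return advantages, lambda_returns, mc_returns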