Example no. 1
def main(alg):
    env_fn = lambda: get_environment(ENV_NAME, reward_type='sparse')

    ac_kwargs = dict(hidden_sizes=[128, 128], activation=torch.nn.ReLU)

    logger_kwargs = dict(output_dir='./corl/{}'.format(alg),
                         exp_name=alg + '_reacher_sparse')

    if alg == 'PPO':
        # ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=4000, epochs=250,
        #     logger_kwargs=logger_kwargs)
        ppo(env_fn=env_fn,
            ac_kwargs=ac_kwargs,
            steps_per_epoch=10000,
            epochs=250,
            max_ep_len=H_total,
            logger_kwargs=logger_kwargs)
    elif alg == 'SAC':
        sac(env_fn=env_fn,
            ac_kwargs=ac_kwargs,
            steps_per_epoch=10000,
            epochs=250,
            max_ep_len=H_total,
            logger_kwargs=logger_kwargs)
    else:
        print('Invalid Algorithm. Exiting...')
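A minimal entry-point sketch for running main(); the --alg flag, its default, and the choices list below are hypothetical additions for illustration, not part of the original script:

# Hypothetical CLI wrapper for main(); the flag name and choices are assumptions.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--alg', default='PPO', choices=['PPO', 'SAC'],
                        help='which algorithm to train')
    args = parser.parse_args()
    main(args.alg)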
Example no. 2
def test_critic(critic, dim=14):
    e = get_environment(ENV_NAME, reward_type='sparse')
    set_goal = (0.0, 0.0, 0.0)
    e.reset_model(seed=None, goal=set_goal)
    mean = np.zeros(e.action_dim)
    sigma = 1.0 * np.ones(e.action_dim)
    filter_coefs = [sigma, 0.25, 0.8, 0.0]

    test_agent = MPPI(e,
                      H=H,
                      paths_per_cpu=40,
                      num_cpu=1,
                      kappa=25.0,
                      gamma=1.0,
                      mean=mean,
                      filter_coefs=filter_coefs,
                      default_act='mean',
                      seed=SEED,
                      reward_type='sparse')
    for t in tqdm(range(H_total)):
        test_agent.train_step(critic=critic,
                              niter=N_ITER,
                              goal=set_goal,
                              dim=dim)
    return test_agent
def test_goals(critic, seeds=None, goals=None, dim=14):
    print('=' * 20)

    if seeds is not None:
        iters = len(seeds)
    elif goals is not None:
        iters = len(goals)
    else:
        return

    for i in range(iters):
        e = get_environment(ENV_NAME, sparse_reward=True)
        if seeds is not None:
            e.reset_model(seed=seeds[i])
        else:
            e.reset_model(seed=None, goal=goals[i])
        goal = e.get_env_state()['target_pos']
        mean = np.zeros(e.action_dim)
        sigma = 1.0 * np.ones(e.action_dim)
        filter_coefs = [sigma, 0.25, 0.8, 0.0]

        agent_test = MPPI(e, H=H, paths_per_cpu=40, num_cpu=1,
                          kappa=25.0, gamma=1.0, mean=mean,
                          filter_coefs=filter_coefs,
                          default_act='mean', seed=SEED,
                          init_seq=None)

        for t in tqdm(range(H_total)):
            agent_test.train_step(critic=critic, niter=N_ITER, dim=dim, goal=goal)

        print("Trajectory reward = %f" % np.sum(agent_test.sol_reward))
        print("Custom reward = %f" % custom_reward_fn(agent_test.sol_reward))

    print('=' * 20)
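For context, a hedged usage sketch of the two helpers above; the checkpoint path, goal coordinates, and seed values are placeholders rather than values from the original code:

# Illustrative invocation; 'critic.pt', the goals, and the seeds are placeholders.
critic = Critic(input_dim=STATE_DIM)
critic.load_state_dict(torch.load('critic.pt'))
critic.eval()

test_goals(critic, goals=[(0.1, 0.2, 0.0), (-0.1, 0.0, 0.1)])  # fixed goal positions
test_goals(critic, seeds=[0, 1, 2])                            # random resets from seeds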
Example no. 4
def do_env_rollout(env_name, start_state, act_list, goal, reward_type,
                   reference, alpha):
    """
        1) Construct env with env_name and set it to start_state.
        2) Generate rollouts using act_list.
           act_list is a list with each element having size (H,m).
           Length of act_list is the number of desired rollouts.
    """
    e = get_environment(env_name, reward_type=reward_type, reference=reference)
    e.reset_model(goal=goal, alpha=alpha)
    e.real_step = False
    paths = []
    H = act_list[0].shape[0]
    N = len(act_list)
    for i in range(N):
        if isinstance(goal, int):  # humanoid pybullet save state
            e.set_env_state(goal)
        else:
            e.set_env_state(start_state)

        obs = []
        act = []
        rewards = []
        env_infos = []
        states = []
        next_obs = []

        for k in range(H):
            obs.append(e._get_obs())
            act.append(act_list[i][k])
            env_infos.append(e.get_env_infos())
            states.append(e.get_env_state())
            s, r, d, ifo = e.step(act[-1])
            rewards.append(r)
            next_obs.append(s)

        path = dict(observations=np.array(obs),
                    actions=np.array(act),
                    rewards=np.array(rewards),
                    env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
                    states=states,
                    next_observations=next_obs)
        paths.append(path)

    return paths
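The docstring fixes the shapes: each element of act_list is an (H, m) action sequence, and one path is returned per element. Below is a minimal sketch of a caller; the horizon, action dimension, rollout count, and Gaussian action sampling are illustrative choices, not values taken from the original code:

# Illustrative caller; H, m, N and the random sampling are arbitrary choices.
H, m, N = 16, 7, 40                              # horizon, action dim, rollouts
act_list = [np.random.randn(H, m) for _ in range(N)]

goal = (0.0, 0.0, 0.0)
e = get_environment(ENV_NAME, reward_type='sparse', reference=None)
e.reset_model(goal=goal, alpha=1.0)
start_state = e.get_env_state()

paths = do_env_rollout(ENV_NAME, start_state, act_list,
                       goal=goal, reward_type='sparse',
                       reference=None, alpha=1.0)
returns = [p['rewards'].sum() for p in paths]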
Example no. 5
def do_env_rollout(env_name, start_state, act_list):
    """
    Construct env with given name and set it to the start state
    The make rollouts using the act_list
    act_list is a list with each element having size (H, m). \
    len of act_list is number of rollouts
    """
    e = get_environment(env_name)
    e.reset_model()
    e.real_step = False
    paths = []
    H = act_list[0].shape[0]
    N = len(act_list)
    for i in range(N):
        e.set_env_state(start_state)
        obs = []
        act = []
        rewards = []
        env_infos = []
        states = []

        for k in range(H):
            obs.append(e._get_obs())
            act.append(act_list[i][k])
            env_infos.append(e.get_env_infos())
            states.append(e.get_env_state())
            s, r, d, ifo = e.step(act[-1])
            rewards.append(r)

        path = dict(observations=np.array(obs),
                    actions=np.array(act),
                    rewards=np.array(rewards),
                    env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
                    states=states)
        paths.append(path)

    return paths
Example no. 6
def do_env_rollout(env_id, start_state, act_list):
    """
        1) Construct env with env_id and set it to start_state.
        2) Generate rollouts using act_list.
           act_list is a list with each element having size (H,m).
           Length of act_list is the number of desired rollouts.
    """
    e = get_environment(env_id)
    e.reset()
    e.real_env_step(False)
    paths = []
    H = act_list[0].shape[0]
    N = len(act_list)
    for i in range(N):
        e.set_env_state(start_state)
        obs = []
        act = []
        rewards = []
        env_infos = []
        states = []

        for k in range(H):
            obs.append(e.get_obs())
            act.append(act_list[i][k])
            env_infos.append(e.get_env_infos())
            states.append(e.get_env_state())
            s, r, d, ifo = e.step(act[-1])
            rewards.append(r)

        path = dict(observations=np.array(obs),
                    actions=np.array(act),
                    rewards=np.array(rewards),
                    env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
                    states=states)
        paths.append(path)

    return paths
Example no. 7
    dsts = [np.linalg.norm(obs[-6:-3] - obs[-3:]) for obs in sol_obs]
    return int(min(dsts) < radius)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--critic',
                        default=None,
                        help='path to critic model (.pt) file')
    parser.add_argument('--iters',
                        default=10,
                        type=int,
                        help='number of random initializations to test')
    args = parser.parse_args()

    e = get_environment(ENV_NAME, reward_type='dense', reference=None)
    # e.reset_model(seed=SEED)

    mean = np.zeros(e.action_dim)
    sigma = 1.0 * np.ones(e.action_dim)
    filter_coefs = [sigma, 0.25, 0.8, 0.0]

    critic = None
    if args.critic is not None:
        critic = Critic(input_dim=STATE_DIM)
        critic.load_state_dict(torch.load(args.critic))
        critic.eval()
        critic.float()

    joint_limits = np.array([[-2.2854, 1.714602],
                             [-0.5236, 1.3963], [-1.5, 1.7], [-2.3213, 0],
Example no. 8
from trajopt.algos.mppi import MPPI
from trajopt.envs.utils import get_environment
from tqdm import tqdm
import time as timer
import numpy as np
import pickle

# =======================================
ENV_NAME = 'reacher_7dof'
PICKLE_FILE = ENV_NAME + '_mppi.pickle'
SEED = 12345
N_ITER = 5
H_total = 100
# =======================================

e = get_environment(ENV_NAME)
e.reset_model(seed=SEED)
mean = np.zeros(e.action_dim)
sigma = 1.0 * np.ones(e.action_dim)
filter_coefs = [sigma, 0.25, 0.8, 0.0]

agent = MPPI(e,
             H=16,
             paths_per_cpu=40,
             num_cpu=1,
             kappa=25.0,
             gamma=1.0,
             mean=mean,
             filter_coefs=filter_coefs,
             default_act='mean',
             seed=SEED)
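The excerpt stops after constructing the agent. A minimal sketch of the receding-horizon loop that typically follows, assuming MPPI.train_step accepts a bare niter keyword (the other examples only show it called together with critic, goal, and dim):

# Sketch of the optimization loop; the exact train_step signature is an assumption.
ts = timer.time()
for t in tqdm(range(H_total)):
    agent.train_step(niter=N_ITER)
print("Trajectory reward = %f" % np.sum(agent.sol_reward))
print("Time taken = %f" % (timer.time() - ts))
pickle.dump(agent, open(PICKLE_FILE, 'wb'))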
Example no. 9
# Load reference trajectory
reference_agent = pickle.load(open('dense_agent.pickle', 'rb'))
reference_pos = []
for i in range(len(reference_agent.sol_state)):
    reference_agent.env.set_env_state(reference_agent.sol_state[i])
    reference_pos.append(
        reference_agent.env.data.site_xpos[reference_agent.env.hand_sid])
reference_pos = np.array(reference_pos)

goal = (0, 0, 0)
reward_type = 'tracking'
reference = reference_pos

replay_buffer = ReplayBuffer(max_size=10000)

e = get_environment(ENV_NAME, reward_type=reward_type, reference=reference)
# e = get_environment(ENV_NAME, reward_type='sparse', reference=None)
e.reset_model(seed=SEED, goal=goal, alpha=1.0)
mean = np.zeros(e.action_dim)
sigma = 1.0 * np.ones(e.action_dim)
filter_coefs = [sigma, 0.25, 0.8, 0.0]

agent = MPPI(e,
             H=16,
             paths_per_cpu=40,
             num_cpu=1,
             kappa=25.0,
             gamma=1.0,
             mean=mean,
             filter_coefs=filter_coefs,
             default_act='mean',
Example no. 10
    # Writer will output to ./runs/ directory by default
    writer = SummaryWriter()

    # Load reference trajectory
    reference_agent = pickle.load(open('dense_agent.pickle', 'rb'))
    reference_pos = []
    for i in range(len(reference_agent.sol_state)):
        reference_agent.env.set_env_state(reference_agent.sol_state[i])
        reference_pos.append(
            reference_agent.env.data.site_xpos[reference_agent.env.hand_sid])
    reference_pos = np.array(reference_pos)
    reward_type = 'cooling'
    reference = reference_pos

    e = get_environment(ENV_NAME,
                        reward_type='sparse')  # need sparse for sol_reward
    mean = np.zeros(e.action_dim)
    sigma = 1.0 * np.ones(e.action_dim)
    filter_coefs = [sigma, 0.25, 0.8, 0.0]

    replay_buffer = ReplayBuffer(max_size=10000)

    critic = Critic(num_iters=args.iters,
                    input_dim=STATE_DIM,
                    inner_layer=128,
                    batch_size=128,
                    gamma=0.9)
    if args.critic is not None:
        critic.load_state_dict(torch.load(args.critic))
    critic.eval()
    critic.float()
Example no. 11
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--critic',
                        default=None,
                        help='path to critic model (.pt) file')
    parser.add_argument('--iters',
                        default=10,
                        type=int,
                        help='number of random initializations to test')
    parser.add_argument('--goals', action='store_true')
    args = parser.parse_args()

    if args.goals:
        STATE_DIM += 1

    e = get_environment(ENV_NAME, reward_type='sparse')
    goal = np.zeros(3)
    # e.reset_model(seed=SEED, goal=goal)
    seed = 11

    mean = np.zeros(e.action_dim)
    sigma = 1.0 * np.ones(e.action_dim)
    filter_coefs = [sigma, 0.25, 0.8, 0.0]

    critic = None
    if args.critic is not None:
        critic = Critic(input_dim=STATE_DIM)
        critic.load_state_dict(torch.load(args.critic))
        critic.eval()
        critic.float()
                        help='save the replay buffer?')
    parser.add_argument('--target', action='store_true')
    parser.add_argument('--eta', default=0.9, type=float)
    parser.add_argument('--goals', action='store_true')
    parser.add_argument('--lr', default=1e-2, type=float)
    parser.add_argument('--iters', default=2000, type=int)
    args = parser.parse_args()

    # Check to add goal position to state space
    if args.goals:
        STATE_DIM += 3

    # Writer will output to ./runs/ directory by default
    writer = SummaryWriter()

    e = get_environment(ENV_NAME, sparse_reward=True)
    # e.sparse_reward = True
    # e.reset_model(seed=SEED)
    mean = np.zeros(e.action_dim)
    sigma = 1.0 * np.ones(e.action_dim)
    filter_coefs = [sigma, 0.25, 0.8, 0.0]

    good_agent = pickle.load(open('116_agent.pickle', 'rb'))
    sol_actions = np.array(good_agent.sol_act)  # should be (100, 7)
    init_seq = sol_actions[:H]

    replay_buffer = ReplayBuffer(max_size=1000000)

    critic = Critic(num_iters=args.iters,
                    input_dim=STATE_DIM,
                    inner_layer=128,
                    batch_size=128,
                    gamma=0.9)
    if args.critic is not None:
        critic.load_state_dict(torch.load(args.critic))