def main(alg):
    env_fn = lambda: get_environment(ENV_NAME, reward_type='sparse')
    ac_kwargs = dict(hidden_sizes=[128, 128], activation=torch.nn.ReLU)
    logger_kwargs = dict(output_dir='./corl/{}'.format(alg),
                         exp_name=alg + '_reacher_sparse')
    if alg == 'PPO':
        # ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=4000,
        #     epochs=250, logger_kwargs=logger_kwargs)
        ppo(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=10000,
            epochs=250, max_ep_len=H_total, logger_kwargs=logger_kwargs)
    elif alg == 'SAC':
        sac(env_fn=env_fn, ac_kwargs=ac_kwargs, steps_per_epoch=10000,
            epochs=250, max_ep_len=H_total, logger_kwargs=logger_kwargs)
    else:
        print('Invalid Algorithm. Exiting...')
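# Minimal invocation sketch (an assumption; this file's actual entry point is
# not shown in this excerpt). Runs the model-free baseline selected by `alg`:
if __name__ == '__main__':
    main('PPO')  # or main('SAC')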
def test_critic(critic, dim=14):
    # Evaluate a trained critic by running MPPI from a fixed goal for
    # H_total steps, guiding each train_step with the critic's value estimates.
    e = get_environment(ENV_NAME, reward_type='sparse')
    set_goal = (0.0, 0.0, 0.0)
    e.reset_model(seed=None, goal=set_goal)
    mean = np.zeros(e.action_dim)
    sigma = 1.0 * np.ones(e.action_dim)
    filter_coefs = [sigma, 0.25, 0.8, 0.0]
    test_agent = MPPI(e, H=H, paths_per_cpu=40, num_cpu=1,
                      kappa=25.0, gamma=1.0, mean=mean, filter_coefs=filter_coefs,
                      default_act='mean', seed=SEED, reward_type='sparse')
    for t in tqdm(range(H_total)):
        test_agent.train_step(critic=critic, niter=N_ITER, goal=set_goal, dim=dim)
    return test_agent
def test_goals(critic, seeds=None, goals=None, dim=14):
    print('=' * 20)
    if seeds is not None:
        iters = len(seeds)
    elif goals is not None:
        iters = len(goals)
    else:
        return
    for i in range(iters):
        e = get_environment(ENV_NAME, sparse_reward=True)
        if seeds is not None:
            e.reset_model(seed=seeds[i])
        else:
            e.reset_model(seed=None, goal=goals[i])
        goal = e.get_env_state()['target_pos']
        mean = np.zeros(e.action_dim)
        sigma = 1.0 * np.ones(e.action_dim)
        filter_coefs = [sigma, 0.25, 0.8, 0.0]
        agent_test = MPPI(e, H=H, paths_per_cpu=40, num_cpu=1,
                          kappa=25.0, gamma=1.0, mean=mean, filter_coefs=filter_coefs,
                          default_act='mean', seed=SEED, init_seq=None)
        for t in tqdm(range(H_total)):
            agent_test.train_step(critic=critic, niter=N_ITER, dim=dim, goal=goal)
        print("Trajectory reward = %f" % np.sum(agent_test.sol_reward))
        print("Custom reward = %f" % custom_reward_fn(agent_test.sol_reward))
        print('=' * 20)
def do_env_rollout(env_name, start_state, act_list, goal, reward_type,
                   reference, alpha):
    """
    1) Construct env with env_name and set it to start_state.
    2) Generate rollouts using act_list.
       act_list is a list with each element having size (H, m).
       Length of act_list is the number of desired rollouts.
    """
    e = get_environment(env_name, reward_type=reward_type, reference=reference)
    e.reset_model(goal=goal, alpha=alpha)
    e.real_step = False
    paths = []
    H = act_list[0].shape[0]
    N = len(act_list)
    for i in range(N):
        if isinstance(goal, int):  # goal is a pybullet save-state id (humanoid)
            e.set_env_state(goal)
        else:
            e.set_env_state(start_state)
        obs = []
        act = []
        rewards = []
        env_infos = []
        states = []
        next_obs = []
        for k in range(H):
            obs.append(e._get_obs())
            act.append(act_list[i][k])
            env_infos.append(e.get_env_infos())
            states.append(e.get_env_state())
            s, r, d, ifo = e.step(act[-1])
            rewards.append(r)
            next_obs.append(s)
        path = dict(observations=np.array(obs),
                    actions=np.array(act),
                    rewards=np.array(rewards),
                    env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
                    states=states,
                    next_observations=next_obs)
        paths.append(path)
    return paths
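# Usage sketch for do_env_rollout (illustrative, not part of the original
# repo): sample N random action sequences of shape (H, m) and roll them out
# from a common start state. The env name, horizon, and rollout count below
# are assumptions for the example.
def _example_rollout_usage():
    e0 = get_environment('reacher_7dof', reward_type='dense', reference=None)
    e0.reset_model(goal=(0.0, 0.0, 0.0), alpha=1.0)
    start_state = e0.get_env_state()
    act_list = [np.random.uniform(-1.0, 1.0, size=(16, e0.action_dim))
                for _ in range(40)]
    paths = do_env_rollout('reacher_7dof', start_state, act_list,
                           goal=(0.0, 0.0, 0.0), reward_type='dense',
                           reference=None, alpha=1.0)
    # Each path holds per-step observations, actions, rewards, env infos,
    # states, and next observations for one candidate action sequence.
    return np.mean([p['rewards'].sum() for p in paths])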
def do_env_rollout(env_name, start_state, act_list):
    """
    Construct env with the given name and set it to start_state.
    Then generate rollouts using act_list.
    act_list is a list with each element having size (H, m).
    Length of act_list is the number of desired rollouts.
    """
    e = get_environment(env_name)
    e.reset_model()
    e.real_step = False
    paths = []
    H = act_list[0].shape[0]
    N = len(act_list)
    for i in range(N):
        e.set_env_state(start_state)
        obs = []
        act = []
        rewards = []
        env_infos = []
        states = []
        for k in range(H):
            obs.append(e._get_obs())
            act.append(act_list[i][k])
            env_infos.append(e.get_env_infos())
            states.append(e.get_env_state())
            s, r, d, ifo = e.step(act[-1])
            rewards.append(r)
        path = dict(observations=np.array(obs),
                    actions=np.array(act),
                    rewards=np.array(rewards),
                    env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
                    states=states)
        paths.append(path)
    return paths
def do_env_rollout(env_id, start_state, act_list):
    """
    1) Construct env with env_id and set it to start_state.
    2) Generate rollouts using act_list.
       act_list is a list with each element having size (H, m).
       Length of act_list is the number of desired rollouts.
    """
    e = get_environment(env_id)
    e.reset()
    e.real_env_step(False)
    paths = []
    H = act_list[0].shape[0]
    N = len(act_list)
    for i in range(N):
        e.set_env_state(start_state)
        obs = []
        act = []
        rewards = []
        env_infos = []
        states = []
        for k in range(H):
            obs.append(e.get_obs())
            act.append(act_list[i][k])
            env_infos.append(e.get_env_infos())
            states.append(e.get_env_state())
            s, r, d, ifo = e.step(act[-1])
            rewards.append(r)
        path = dict(observations=np.array(obs),
                    actions=np.array(act),
                    rewards=np.array(rewards),
                    env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
                    states=states)
        paths.append(path)
    return paths
def success(sol_obs, radius=0.1):
    # Success if the hand comes within `radius` of the target at any step.
    # (Header reconstructed from the body; the name and default radius are
    # assumptions, as the excerpt starts mid-function.)
    dsts = [np.linalg.norm(obs[-6:-3] - obs[-3:]) for obs in sol_obs]
    return int(min(dsts) < radius)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--critic', default=None,
                        help='path to critic model (.pt) file')
    parser.add_argument('--iters', default=10, type=int,
                        help='number of random initializations to test')
    args = parser.parse_args()

    e = get_environment(ENV_NAME, reward_type='dense', reference=None)
    # e.reset_model(seed=SEED)
    mean = np.zeros(e.action_dim)
    sigma = 1.0 * np.ones(e.action_dim)
    filter_coefs = [sigma, 0.25, 0.8, 0.0]

    critic = None
    if args.critic is not None:
        critic = Critic(input_dim=STATE_DIM)
        critic.load_state_dict(torch.load(args.critic))
        critic.eval()
        critic.float()

    joint_limits = np.array([[-2.2854, 1.714602],
                             [-0.5236, 1.3963],
                             [-1.5, 1.7],
                             [-2.3213, 0],
from trajopt.algos.mppi import MPPI
from trajopt.envs.utils import get_environment
from tqdm import tqdm
import time as timer
import numpy as np
import pickle

# =======================================
ENV_NAME = 'reacher_7dof'
PICKLE_FILE = ENV_NAME + '_mppi.pickle'
SEED = 12345
N_ITER = 5
H_total = 100
# =======================================

e = get_environment(ENV_NAME)
e.reset_model(seed=SEED)
mean = np.zeros(e.action_dim)
sigma = 1.0 * np.ones(e.action_dim)
filter_coefs = [sigma, 0.25, 0.8, 0.0]

agent = MPPI(e, H=16, paths_per_cpu=40, num_cpu=1,
             kappa=25.0, gamma=1.0, mean=mean, filter_coefs=filter_coefs,
             default_act='mean', seed=SEED)
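# Sketch of the receding-horizon loop that would follow this setup (an
# assumption: the original file is truncated here). It mirrors the
# train_step loops in test_critic/test_goals above and saves the result.
ts = timer.time()
for t in tqdm(range(H_total)):
    agent.train_step(niter=N_ITER)
print("Trajectory reward = %f" % np.sum(agent.sol_reward))
print("Time taken = %f" % (timer.time() - ts))
pickle.dump(agent, open(PICKLE_FILE, 'wb'))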
# Load reference trajectory
reference_agent = pickle.load(open('dense_agent.pickle', 'rb'))
reference_pos = []
for i in range(len(reference_agent.sol_state)):
    reference_agent.env.set_env_state(reference_agent.sol_state[i])
    reference_pos.append(
        reference_agent.env.data.site_xpos[reference_agent.env.hand_sid])
reference_pos = np.array(reference_pos)

goal = (0, 0, 0)
reward_type = 'tracking'
reference = reference_pos

replay_buffer = ReplayBuffer(max_size=10000)

e = get_environment(ENV_NAME, reward_type=reward_type, reference=reference)
# e = get_environment(ENV_NAME, reward_type='sparse', reference=None)
e.reset_model(seed=SEED, goal=goal, alpha=1.0)
mean = np.zeros(e.action_dim)
sigma = 1.0 * np.ones(e.action_dim)
filter_coefs = [sigma, 0.25, 0.8, 0.0]

agent = MPPI(e, H=16, paths_per_cpu=40, num_cpu=1,
             kappa=25.0, gamma=1.0, mean=mean, filter_coefs=filter_coefs,
             default_act='mean',
# Writer will output to ./runs/ directory by default
writer = SummaryWriter()

# Load reference trajectory
reference_agent = pickle.load(open('dense_agent.pickle', 'rb'))
reference_pos = []
for i in range(len(reference_agent.sol_state)):
    reference_agent.env.set_env_state(reference_agent.sol_state[i])
    reference_pos.append(
        reference_agent.env.data.site_xpos[reference_agent.env.hand_sid])
reference_pos = np.array(reference_pos)

reward_type = 'cooling'
reference = reference_pos

e = get_environment(ENV_NAME, reward_type='sparse')  # need sparse for sol_reward
mean = np.zeros(e.action_dim)
sigma = 1.0 * np.ones(e.action_dim)
filter_coefs = [sigma, 0.25, 0.8, 0.0]

replay_buffer = ReplayBuffer(max_size=10000)
critic = Critic(num_iters=args.iters, input_dim=STATE_DIM, inner_layer=128,
                batch_size=128, gamma=0.9)
if args.critic is not None:
    critic.load_state_dict(torch.load(args.critic))
    critic.eval()
    critic.float()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--critic', default=None,
                        help='path to critic model (.pt) file')
    parser.add_argument('--iters', default=10, type=int,
                        help='number of random initializations to test')
    parser.add_argument('--goals', action='store_true')
    args = parser.parse_args()

    if args.goals:
        STATE_DIM += 1

    e = get_environment(ENV_NAME, reward_type='sparse')
    goal = np.zeros(3)
    # e.reset_model(seed=SEED, goal=goal)
    seed = 11
    mean = np.zeros(e.action_dim)
    sigma = 1.0 * np.ones(e.action_dim)
    filter_coefs = [sigma, 0.25, 0.8, 0.0]

    critic = None
    if args.critic is not None:
        critic = Critic(input_dim=STATE_DIM)
        critic.load_state_dict(torch.load(args.critic))
        critic.eval()
        critic.float()
                    help='save the replay buffer?')
parser.add_argument('--target', action='store_true')
parser.add_argument('--eta', default=0.9, type=float)
parser.add_argument('--goals', action='store_true')
parser.add_argument('--lr', default=1e-2, type=float)
parser.add_argument('--iters', default=2000, type=int)
args = parser.parse_args()

# Optionally append the goal position to the state space
if args.goals:
    STATE_DIM += 3

# Writer will output to ./runs/ directory by default
writer = SummaryWriter()

e = get_environment(ENV_NAME, sparse_reward=True)
# e.sparse_reward = True
# e.reset_model(seed=SEED)
mean = np.zeros(e.action_dim)
sigma = 1.0 * np.ones(e.action_dim)
filter_coefs = [sigma, 0.25, 0.8, 0.0]

good_agent = pickle.load(open('116_agent.pickle', 'rb'))
sol_actions = np.array(good_agent.sol_act)  # should be (100, 7)
init_seq = sol_actions[:H]

replay_buffer = ReplayBuffer(max_size=1000000)
critic = Critic(num_iters=args.iters, input_dim=STATE_DIM, inner_layer=128,
                batch_size=128, gamma=0.9)
if args.critic is not None:
    critic.load_state_dict(torch.load(args.critic))