def test_learning(self):
    """Smoke-test a single AIRL update that uses the TRPO policy step.

    Builds a Gaussian policy, a state-value function and the two
    discriminator networks (reward and shaping), loads the pickled expert
    episodes, samples a short agent rollout and calls ``airl.train`` once.
    The returned dictionary is not inspected: the test passes as long as
    nothing raises.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    # Policy and its state-value function.
    policy_net = PolNet(ob_space, ac_space, h1=32, h2=32)
    pol = GaussianPol(ob_space, ac_space, policy_net)
    value_net = VNet(ob_space)
    vf = DeterministicSVfunc(ob_space, value_net)

    # Discriminator parts: a reward function and a shaping value function.
    reward_net = VNet(ob_space, h1=32, h2=32)
    rewf = DeterministicSVfunc(ob_space, reward_net)
    shaping_net = VNet(ob_space, h1=32, h2=32)
    shaping_vf = DeterministicSVfunc(ob_space, shaping_net)

    sampler = EpiSampler(self.env, pol, num_parallel=1)

    optim_vf = torch.optim.Adam(value_net.parameters(), 3e-4)
    # One optimizer drives both discriminator networks.
    discrim_params = (list(reward_net.parameters())
                      + list(shaping_net.parameters()))
    optim_discrim = torch.optim.Adam(discrim_params, 3e-4)

    # Expert demonstrations shipped with the test data.
    expert_path = os.path.join('data/expert_epis', 'Pendulum-v0_2epis.pkl')
    with open(expert_path, 'rb') as f:
        expert_epis = pickle.load(f)
    expert_traj = Traj()
    expert_traj.add_epis(expert_epis)
    expert_traj = ef.add_next_obs(expert_traj)
    expert_traj.register_epis()

    # Agent rollout, annotated with everything the update reads.
    gamma = 0.99
    agent_epis = sampler.sample(pol, max_steps=32)
    agent_traj = Traj()
    agent_traj.add_epis(agent_epis)
    agent_traj = ef.add_next_obs(agent_traj)
    agent_traj = ef.compute_pseudo_rews(
        agent_traj, rew_giver=rewf, state_only=True)
    agent_traj = ef.compute_vs(agent_traj, vf)
    agent_traj = ef.compute_rets(agent_traj, gamma)
    agent_traj = ef.compute_advs(agent_traj, gamma, 0.95)
    agent_traj = ef.centerize_advs(agent_traj)
    agent_traj = ef.compute_h_masks(agent_traj)
    agent_traj.register_epis()

    result_dict = airl.train(
        agent_traj, expert_traj, pol, vf, optim_vf, optim_discrim,
        rewf=rewf, shaping_vf=shaping_vf,
        rl_type='trpo',
        epoch=1, batch_size=32,
        discrim_batch_size=32, discrim_step=1,
        pol_ent_beta=1e-3, gamma=gamma)

    # Drop the sampler reference explicitly once training is done.
    del sampler
def test_learning(self):
    """Smoke-test DIAYN: one SAC update, then a discriminator update.

    Builds the skill-conditioned policy, twin Q-functions with target
    copies and the skill discriminator, samples up to 200 steps, relabels
    the rewards through ``diayn_sac.calc_rewards`` and runs
    ``diayn_sac.train`` followed by ``diayn.train``. Neither return value
    is checked; the test passes when nothing raises.
    """
    env = self.env
    ob_space = env.real_observation_space
    skill_space = env.skill_space
    ob_skill_space = env.observation_space
    ac_space = env.action_space
    # presumably 4 is the number of skills appended to the raw
    # observation -- TODO confirm against the env wrapper
    ob_dim = ob_skill_space.shape[0] - 4
    f_dim = ob_dim

    def discrim_f(x):
        # Identity feature map handed to the discriminator network.
        return x

    pol_net = PolNet(ob_skill_space, ac_space)
    pol = GaussianPol(ob_skill_space, ac_space, pol_net)

    # Twin critics; every target net starts from its online net's weights.
    qf_nets, qfs, targ_qfs = [], [], []
    for _ in range(2):
        online_net = QNet(ob_skill_space, ac_space)
        online_qf = DeterministicSAVfunc(
            ob_skill_space, ac_space, online_net)
        target_net = QNet(ob_skill_space, ac_space)
        target_net.load_state_dict(online_net.state_dict())
        target_qf = DeterministicSAVfunc(
            ob_skill_space, ac_space, target_net)
        qf_nets.append(online_net)
        qfs.append(online_qf)
        targ_qfs.append(target_qf)

    log_alpha = nn.Parameter(torch.ones(()))

    # Box over the discriminator features, bounded by the float32 range.
    high = np.array([np.finfo(np.float32).max]*f_dim)
    f_space = gym.spaces.Box(-high, high, dtype=np.float32)
    discrim_net = DiaynDiscrimNet(
        f_space, skill_space, h_size=100, discrim_f=discrim_f)
    discrim = DeterministicSVfunc(f_space, discrim_net)

    optim_pol = torch.optim.Adam(pol_net.parameters(), 1e-4)
    optim_qfs = [torch.optim.Adam(net.parameters(), 3e-4)
                 for net in qf_nets]
    optim_alpha = torch.optim.Adam([log_alpha], 1e-4)
    optim_discrim = torch.optim.SGD(
        discrim.parameters(), lr=0.001, momentum=0.9)

    off_traj = Traj()
    sampler = EpiSampler(self.env, pol, num_parallel=1)

    sampled_epis = sampler.sample(pol, max_steps=200)
    on_traj = Traj()
    on_traj.add_epis(sampled_epis)
    on_traj = ef.add_next_obs(on_traj)
    on_traj = ef.compute_diayn_rews(
        on_traj, lambda x: diayn_sac.calc_rewards(x, 4, discrim))
    on_traj.register_epis()
    off_traj.add_traj(on_traj)
    step = on_traj.num_step

    # NOTE(review): log_alpha is rebound here, after optim_alpha was built
    # around the earlier parameter, so optim_alpha does not reference this
    # tensor -- confirm that is the intended way to "fix alpha".
    log_alpha = nn.Parameter(np.log(0.1)*torch.ones(()))  # fix alpha

    result_dict = diayn_sac.train(
        off_traj, pol, qfs, targ_qfs, log_alpha,
        optim_pol, optim_qfs, optim_alpha,
        step, 128, 5e-3, 0.99, 1, discrim, 4, True)
    discrim_losses = diayn.train(
        discrim, optim_discrim, on_traj, 32, 100, 4)

    # Drop the sampler reference explicitly once training is done.
    del sampler
def test_learning(self):
    """Smoke-test one SVG training call on a short rollout.

    Builds a Gaussian policy and a Q-function, each with a target copy
    initialised from the online network's weights, samples up to 32 steps
    with the policy and calls ``svg.train`` once. The returned dictionary
    is not inspected; the test passes when nothing raises.
    """
    pol_net = PolNet(self.env.ob_space, self.env.ac_space, h1=32, h2=32)
    pol = GaussianPol(self.env.ob_space, self.env.ac_space, pol_net)

    targ_pol_net = PolNet(self.env.ob_space, self.env.ac_space, 32, 32)
    targ_pol_net.load_state_dict(pol_net.state_dict())
    targ_pol = GaussianPol(
        self.env.ob_space, self.env.ac_space, targ_pol_net)

    qf_net = QNet(self.env.ob_space, self.env.ac_space, h1=32, h2=32)
    qf = DeterministicSAVfunc(self.env.ob_space, self.env.ac_space, qf_net)

    targ_qf_net = QNet(self.env.ob_space, self.env.ac_space, 32, 32)
    # The target critic must start as a copy of the online critic. The
    # previous code loaded targ_qf_net's own state dict, which is a no-op
    # and left the target randomly initialised.
    targ_qf_net.load_state_dict(qf_net.state_dict())
    targ_qf = DeterministicSAVfunc(
        self.env.ob_space, self.env.ac_space, targ_qf_net)

    sampler = EpiSampler(self.env, pol, num_parallel=1)

    optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
    optim_qf = torch.optim.Adam(qf_net.parameters(), 3e-4)

    epis = sampler.sample(pol, max_steps=32)

    traj = Traj()
    traj.add_epis(epis)
    traj = ef.add_next_obs(traj)
    traj.register_epis()

    result_dict = svg.train(
        traj, pol, targ_pol, qf, targ_qf, optim_pol, optim_qf,
        1, 32, 0.01, 0.9, 1)

    # Drop the sampler reference explicitly once training is done.
    del sampler
def test_learning(self):
    """Smoke-test one SAC training call on a short rollout.

    Builds a Gaussian policy, twin Q-functions with target copies and a
    learnable ``log_alpha``, samples up to 32 steps and calls
    ``sac.train`` once. The returned dictionary is not inspected; the
    test passes when nothing raises.
    """
    ob_space = self.env.ob_space
    ac_space = self.env.ac_space

    pol_net = PolNet(ob_space, ac_space, h1=32, h2=32)
    pol = GaussianPol(ob_space, ac_space, pol_net)

    # Twin critics; every target net starts from its online net's weights.
    qf_nets, qfs, targ_qfs = [], [], []
    for _ in range(2):
        online_net = QNet(ob_space, ac_space)
        online_qf = DeterministicSAVfunc(ob_space, ac_space, online_net)
        target_net = QNet(ob_space, ac_space)
        target_net.load_state_dict(online_net.state_dict())
        target_qf = DeterministicSAVfunc(ob_space, ac_space, target_net)
        qf_nets.append(online_net)
        qfs.append(online_qf)
        targ_qfs.append(target_qf)

    log_alpha = nn.Parameter(torch.zeros(()))

    sampler = EpiSampler(self.env, pol, num_parallel=1)

    optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
    optim_qfs = [torch.optim.Adam(net.parameters(), 3e-4)
                 for net in qf_nets]
    optim_alpha = torch.optim.Adam([log_alpha], 3e-4)

    sampled_epis = sampler.sample(pol, max_steps=32)
    traj = Traj()
    traj.add_epis(sampled_epis)
    traj = ef.add_next_obs(traj)
    traj.register_epis()

    result_dict = sac.train(
        traj, pol, qfs, targ_qfs, log_alpha,
        optim_pol, optim_qfs, optim_alpha,
        2, 32, 0.01, 0.99, 2,
    )

    # Drop the sampler reference explicitly once training is done.
    del sampler
def test_learning_rnn(self):
    """Smoke-test dynamics-model training with the recurrent MPC setup.

    Builds an LSTM dynamics network, wraps it as a model and as an MPC
    policy, samples one episode with that policy and calls
    ``mpc.train_dm`` for one epoch. The returned dictionary is not
    inspected; the test passes when nothing raises.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    def rew_func(next_obs, acs, mean_obs=0., std_obs=1.,
                 mean_acs=0., std_acs=1.):
        # Undo the normalisation before evaluating the reward.
        next_obs = next_obs * std_obs + mean_obs
        acs = acs * std_acs + mean_acs
        # Pendulum
        # presumably column 0 is cos(theta) and column 2 the angular
        # velocity -- TODO confirm against the env's observation layout
        first_term = torch.acos(next_obs[:, 0].clamp(min=-1, max=1))**2
        second_term = 0.1 * (next_obs[:, 2].clamp(min=-8, max=8)**2)
        action_term = 0.001 * acs.squeeze(-1)**2
        rews = -(first_term + second_term + action_term)
        return rews.squeeze(0)

    # init models
    dm_net = ModelNetLSTM(ob_space, ac_space)
    dm = DeterministicSModel(
        ob_space, ac_space, dm_net,
        rnn=True, data_parallel=False, parallel_dim=0)
    mpc_pol = MPCPol(
        ob_space, ac_space, dm_net, rew_func, 1, 1,
        mean_obs=0., std_obs=1., mean_acs=0., std_acs=1., rnn=True)
    optim_dm = torch.optim.Adam(dm_net.parameters(), 1e-3)

    # sample with mpc policy
    sampler = EpiSampler(self.env, mpc_pol, num_parallel=1)
    sampled_epis = sampler.sample(mpc_pol, max_epis=1)

    traj = Traj()
    traj.add_epis(sampled_epis)
    traj = ef.add_next_obs(traj)
    traj = ef.compute_h_masks(traj)
    traj.register_epis()
    # NOTE(review): this adds the trajectory to itself -- confirm that
    # duplicating the data is intended.
    traj.add_traj(traj)

    # train
    result_dict = mpc.train_dm(traj, dm, optim_dm, epoch=1, batch_size=1)

    # Drop the sampler reference explicitly once training is done.
    del sampler
def test_learning(self):
    """Smoke-test one DDPG training call on a short rollout.

    Builds a deterministic policy with OU action noise and a Q-function,
    each with a target copy initialised from the online network's
    weights, samples up to 32 steps with the policy and calls
    ``ddpg.train`` once. The returned dictionary is not inspected; the
    test passes when nothing raises.
    """
    pol_net = PolNet(self.env.observation_space, self.env.action_space,
                     h1=32, h2=32, deterministic=True)
    noise = OUActionNoise(self.env.action_space)
    pol = DeterministicActionNoisePol(
        self.env.observation_space, self.env.action_space, pol_net, noise)

    targ_pol_net = PolNet(
        self.env.observation_space, self.env.action_space, 32, 32,
        deterministic=True)
    targ_pol_net.load_state_dict(pol_net.state_dict())
    targ_noise = OUActionNoise(self.env.action_space)
    targ_pol = DeterministicActionNoisePol(
        self.env.observation_space, self.env.action_space,
        targ_pol_net, targ_noise)

    qf_net = QNet(self.env.observation_space, self.env.action_space,
                  h1=32, h2=32)
    qf = DeterministicSAVfunc(
        self.env.observation_space, self.env.action_space, qf_net)

    targ_qf_net = QNet(self.env.observation_space, self.env.action_space,
                       32, 32)
    # The target critic must start as a copy of the online critic. The
    # previous code loaded targ_qf_net's own state dict, which is a no-op
    # and left the target randomly initialised.
    targ_qf_net.load_state_dict(qf_net.state_dict())
    targ_qf = DeterministicSAVfunc(
        self.env.observation_space, self.env.action_space, targ_qf_net)

    sampler = EpiSampler(self.env, pol, num_parallel=1)

    optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
    optim_qf = torch.optim.Adam(qf_net.parameters(), 3e-4)

    epis = sampler.sample(pol, max_steps=32)

    traj = Traj()
    traj.add_epis(epis)
    traj = ef.add_next_obs(traj)
    traj.register_epis()

    result_dict = ddpg.train(
        traj, pol, targ_pol, qf, targ_qf, optim_pol, optim_qf,
        1, 32, 0.01, 0.9)

    # Drop the sampler reference explicitly once training is done.
    del sampler
def test_learning(self):
    """Smoke-test one QT-Opt training call on a short rollout.

    Builds the online, lagged and two target Q-networks (the lagged and
    first target nets copy the online weights, the second target copies
    the lagged net), samples up to 32 steps with an arg-max policy over
    the CEM target and calls ``qtopt.train`` once. The returned
    dictionary is not inspected; the test passes when nothing raises.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    def build_qnet(init_from=None):
        # New 32x32 Q-network, optionally starting from another's weights.
        net = QNet(ob_space, ac_space, 32, 32)
        if init_from is not None:
            net.load_state_dict(init_from.state_dict())
        return net

    qf_net = build_qnet()
    lagged_qf_net = build_qnet(qf_net)
    targ_qf1_net = build_qnet(qf_net)
    targ_qf2_net = build_qnet(lagged_qf_net)

    qf = DeterministicSAVfunc(ob_space, ac_space, qf_net)
    lagged_qf = DeterministicSAVfunc(ob_space, ac_space, lagged_qf_net)
    targ_qf1 = CEMDeterministicSAVfunc(
        ob_space, ac_space, targ_qf1_net,
        num_sampling=60, num_best_sampling=6, num_iter=2,
        multivari=False)
    targ_qf2 = DeterministicSAVfunc(ob_space, ac_space, targ_qf2_net)

    # The behaviour policy acts through the CEM-based target Q-function.
    pol = ArgmaxQfPol(ob_space, ac_space, targ_qf1, eps=0.2)

    sampler = EpiSampler(self.env, pol, num_parallel=1)

    optim_qf = torch.optim.Adam(qf_net.parameters(), 3e-4)

    sampled_epis = sampler.sample(pol, max_steps=32)
    traj = Traj()
    traj.add_epis(sampled_epis)
    traj = ef.add_next_obs(traj)
    traj.register_epis()

    result_dict = qtopt.train(
        traj, qf, lagged_qf, targ_qf1, targ_qf2, optim_qf,
        1000, 32, 0.9999, 0.995, 'mse')

    # Drop the sampler reference explicitly once training is done.
    del sampler
num_update_lagged = 0 # lagged netの更新回数 max_rew = -1000 print('start') while args.max_epis > total_epi: with measure('sample'): print('sampling') # policyにしたがって行動し、経験を貯める(env.stepをone_epiの__init__内で行っている) # off-policy epis = sampler.sample(pol, max_steps=args.max_steps_per_iter) with measure('train'): # on-policyのサンプリング print('on-policy') on_traj = Traj(traj_device='cpu') on_traj.add_epis(epis) on_traj = epi_functional.add_next_obs(on_traj) on_traj.register_epis() off_traj.add_traj(on_traj) # off-policyに加える # episodeとstepのカウント total_epi += on_traj.num_epi step = on_traj.num_step total_step += step epoch = step if args.data_parallel: qf.dp_run = True lagged_qf.dp_run = True targ_qf1.dp_run = True targ_qf2.dp_run = True # train
vf_net = VNet(observation_space) vf = DeterministicSVfunc(observation_space, vf_net, data_parallel=args.data_parallel) sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f: expert_epis = pickle.load(f) expert_traj = Traj() expert_traj.add_epis(expert_epis) expert_traj = ef.add_next_obs(expert_traj) expert_traj.register_epis() expert_rewards = [np.sum(epi['rews']) for epi in expert_epis] expert_mean_rew = np.mean(expert_rewards) logger.log('expert_score={}'.format(expert_mean_rew)) logger.log('expert_num_epi={}'.format(expert_traj.num_epi)) total_epi = 0 total_step = 0 max_rew = -1e6 kl_beta = args.init_kl_beta if args.pretrain: with measure('bc pretrain'): for _ in range(args.bc_epoch): _ = behavior_clone.train(expert_traj, pol, optim_pol,
# Head of the main training loop: sample with the current policy, annotate
# the new trajectory (priorities, hidden-state masks, recurrent hidden
# states of every Q-function and its target) and add it to off_traj.
# The loop body continues past this fragment.
# NOTE(review): the nesting below is reconstructed from a
# whitespace-collapsed source -- confirm against the original file.
# presumably args.max_steps_off caps the number of stored steps -- TODO
# confirm against Traj's signature
off_traj = Traj(args.max_steps_off, traj_device='cpu')

total_epi = 0
total_step = 0
max_rew = -1e6

while args.max_epis > total_epi:
    with measure('sample'):
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train'):
        on_traj = Traj(traj_device='cpu')
        on_traj.add_epis(epis)

        on_traj = ef.add_next_obs(on_traj)
        # Give every new step the trajectory's current maximum priority.
        max_pri = on_traj.get_max_pri()
        on_traj = ef.set_all_pris(on_traj, max_pri)
        on_traj = ef.compute_seq_pris(on_traj, args.seq_length)
        on_traj = ef.compute_h_masks(on_traj)
        # Store hidden states for each Q-function and its target under
        # per-index keys ('q_hs0', 'targ_q_hs0', ...).
        for i in range(len(qfs)):
            on_traj = ef.compute_hs(
                on_traj, qfs[i], hs_name='q_hs'+str(i), input_acs=True)
            on_traj = ef.compute_hs(
                on_traj, targ_qfs[i], hs_name='targ_q_hs'+str(i), input_acs=True)
        on_traj.register_epis()

        off_traj.add_traj(on_traj)

        total_epi += on_traj.num_epi
        step = on_traj.num_step
optim_discrim = torch.optim.Adam(advf_net.parameters(), args.discrim_lr) rewf = None shaping_vf = None else: raise ValueError('Only rew and adv are supported') sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f: expert_epis = pickle.load(f) expert_traj = Traj() expert_traj.add_epis(expert_epis) expert_traj = ef.add_next_obs(expert_traj) expert_traj.register_epis() expert_rewards = [np.sum(epi['rews']) for epi in expert_epis] expert_mean_rew = np.mean(expert_rewards) logger.log('expert_score={}'.format(expert_mean_rew)) logger.log('expert_num_epi={}'.format(expert_traj.num_epi)) total_epi = 0 total_step = 0 max_rew = -1e6 if args.rl_type == 'ppo_kl': kl_beta = args.init_kl_beta if args.pretrain: with measure('bc pretrain'):
### Model-Based RL ### ###################### ### Prepare the dataset D_RAND ### # Performing rollouts to collect training data rand_sampler = EpiSampler(env, random_pol, num_parallel=args.num_parallel, seed=args.seed) epis = rand_sampler.sample(random_pol, max_epis=args.num_random_rollouts) epis = add_noise_to_init_obs(epis, args.noise_to_init_obs) traj = Traj(traj_device='cpu') traj.add_epis(epis) traj = ef.add_next_obs(traj) traj = ef.compute_h_masks(traj) # obs, next_obs, and acs should become mean 0, std 1 traj, mean_obs, std_obs, mean_acs, std_acs = ef.normalize_obs_and_acs(traj) traj.register_epis() del rand_sampler ### Train Dynamics Model ### # initialize dynamics model and mpc policy if args.rnn: dm_net = ModelNetLSTM(ob_space, ac_space) else: dm_net = ModelNet(ob_space, ac_space) dm = DeterministicSModel(ob_space,
def test_learning(self):
    """Smoke-test one R2D2-SAC training call with LSTM networks.

    Builds a recurrent Gaussian policy and twin recurrent Q-functions
    with target copies, samples up to 32 steps, annotates the trajectory
    with priorities, hidden-state masks and per-critic hidden states, and
    calls ``r2d2_sac.train`` once. The returned dictionary is not
    inspected; the test passes when nothing raises.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    pol_net = PolNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
    pol = GaussianPol(ob_space, ac_space, pol_net, rnn=True)

    # Twin critics; every target net starts from its online net's weights.
    qf_nets, qfs, targ_qfs = [], [], []
    for _ in range(2):
        online_net = QNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
        online_qf = DeterministicSAVfunc(
            ob_space, ac_space, online_net, rnn=True)
        target_net = QNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
        target_net.load_state_dict(online_net.state_dict())
        target_qf = DeterministicSAVfunc(
            ob_space, ac_space, target_net, rnn=True)
        qf_nets.append(online_net)
        qfs.append(online_qf)
        targ_qfs.append(target_qf)

    log_alpha = nn.Parameter(torch.zeros(()))

    sampler = EpiSampler(self.env, pol, num_parallel=1)

    optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
    optim_qfs = [torch.optim.Adam(net.parameters(), 3e-4)
                 for net in qf_nets]
    optim_alpha = torch.optim.Adam([log_alpha], 3e-4)

    sampled_epis = sampler.sample(pol, max_steps=32)
    traj = Traj()
    traj.add_epis(sampled_epis)
    traj = ef.add_next_obs(traj)

    # Give every step the trajectory's current maximum priority, then
    # derive the sequence priorities from them.
    max_pri = traj.get_max_pri()
    traj = ef.set_all_pris(traj, max_pri)
    traj = ef.compute_seq_pris(traj, 4)
    traj = ef.compute_h_masks(traj)

    # Hidden states of each critic and its target, stored under
    # per-index keys.
    for idx, (online_qf, target_qf) in enumerate(zip(qfs, targ_qfs)):
        traj = ef.compute_hs(
            traj, online_qf, hs_name='q_hs'+str(idx), input_acs=True)
        traj = ef.compute_hs(
            traj, target_qf, hs_name='targ_q_hs'+str(idx), input_acs=True)
    traj.register_epis()

    result_dict = r2d2_sac.train(
        traj, pol, qfs, targ_qfs, log_alpha,
        optim_pol, optim_qfs, optim_alpha,
        2, 32, 4, 2, 0.01, 0.99, 2,
    )

    # Drop the sampler reference explicitly once training is done.
    del sampler