def test_learning_rnn(self):
    """Smoke-test PPO training with LSTM policy and value networks.

    Builds a Gaussian policy on ``PolNetLSTM`` and a state-value function
    on ``VNetLSTM`` for ``self.env``, samples episodes, preprocesses the
    trajectory and runs one epoch of ``ppo_clip.train`` followed by one
    epoch of ``ppo_kl.train``. The test passes when nothing raises; the
    values returned by the trainers are not inspected.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    pol_net = PolNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
    pol = GaussianPol(ob_space, ac_space, pol_net, rnn=True)

    vf_net = VNetLSTM(ob_space, h_size=32, cell_size=32)
    vf = DeterministicSVfunc(ob_space, vf_net, rnn=True)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    # Drop the sampler reference even when a step below raises; a plain
    # trailing `del` is skipped on failure.
    # NOTE(review): presumably EpiSampler frees its workers on deletion -
    # confirm against its implementation.
    try:
        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=400)

        traj = Traj()
        traj.add_epis(epis)

        # Preprocessing pipeline; every helper returns the trajectory.
        # NOTE(review): 0.99 / 0.95 are presumably the discount factor and
        # the GAE lambda - confirm against the `ef` helper signatures.
        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, 0.99)
        traj = ef.compute_advs(traj, 0.99, 0.95)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        ppo_clip.train(traj=traj, pol=pol, vf=vf, clip_param=0.2,
                       optim_pol=optim_pol, optim_vf=optim_vf,
                       epoch=1, batch_size=2)
        ppo_kl.train(traj=traj, pol=pol, vf=vf, kl_beta=0.1, kl_targ=0.2,
                     optim_pol=optim_pol, optim_vf=optim_vf,
                     epoch=1, batch_size=2, max_grad_norm=20)
    finally:
        del sampler
def test_learning_rnn(self):
    """Smoke-test TRPO training with LSTM policy and value networks.

    Builds a Gaussian policy on ``PolNetLSTM`` and a state-value function
    on ``VNetLSTM`` for ``self.env``, samples episodes, preprocesses the
    trajectory and makes a single ``trpo.train`` call. The test passes
    when nothing raises; the value returned by the trainer is not
    inspected.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    pol_net = PolNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
    pol = GaussianPol(ob_space, ac_space, pol_net, rnn=True)

    vf_net = VNetLSTM(ob_space, h_size=32, cell_size=32)
    vf = DeterministicSVfunc(ob_space, vf_net, rnn=True)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    # Drop the sampler reference even when a step below raises; a plain
    # trailing `del` is skipped on failure.
    # NOTE(review): presumably EpiSampler frees its workers on deletion -
    # confirm against its implementation.
    try:
        # Both optimizers are created as in the original; only optim_vf is
        # handed to trpo.train below.
        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)

        epis = sampler.sample(pol, max_steps=400)

        traj = Traj()
        traj.add_epis(epis)

        # Preprocessing pipeline; every helper returns the trajectory.
        # NOTE(review): 0.99 / 0.95 are presumably the discount factor and
        # the GAE lambda - confirm against the `ef` helper signatures.
        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, 0.99)
        traj = ef.compute_advs(traj, 0.99, 0.95)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        # NOTE(review): the trailing positionals (1, 2) are presumably
        # epoch and batch_size - confirm against trpo.train's signature.
        trpo.train(traj, pol, vf, optim_vf, 1, 2)
    finally:
        del sampler
log_dir=os.path.join( args.log, 'movie'), record_video=args.record) env.env.seed(args.seed) if args.c2d: env = C2DEnv(env) observation_space = env.observation_space action_space = env.action_space # Generate teacher (t) policy and student (s) policy and load teacher policy # Please note that the two policies do not have to have the same hidden architecture if args.rnn: t_pol_net = PolNetLSTM(observation_space, action_space, h_size=256, cell_size=256) s_pol_net = PolNetLSTM(observation_space, action_space, h_size=256, cell_size=256) else: t_pol_net = PolNet(observation_space, action_space) s_pol_net = PolNet(observation_space, action_space, h1=190, h2=90) if isinstance(action_space, gym.spaces.Box): t_pol = GaussianPol(observation_space, action_space, t_pol_net, args.rnn) s_pol = GaussianPol(observation_space, action_space, s_pol_net, args.rnn) elif isinstance(action_space, gym.spaces.Discrete): t_pol = CategoricalPol( observation_space, action_space, t_pol_net, args.rnn) s_pol = CategoricalPol( observation_space, action_space, s_pol_net, args.rnn) elif isinstance(action_space, gym.spaces.MultiDiscrete): t_pol = MultiCategoricalPol(
# Pick the torch device: a negative --cuda index selects the CPU.
if args.cuda < 0:
    device_name = 'cpu'
else:
    device_name = "cuda:{}".format(args.cuda)
device = torch.device(device_name)
set_device(device)

# Route logger output to a CSV file and to TensorBoard under args.log.
score_file = os.path.join(args.log, 'progress.csv')
logger.add_tabular_output(score_file)
logger.add_tensorboard_output(args.log)

# Environment; its log_dir is the 'movie' sub-directory of args.log.
movie_dir = os.path.join(args.log, 'movie')
env = GymEnv(args.env_name, log_dir=movie_dir, record_video=args.record)
env.env.seed(args.seed)

observation_space = env.observation_space
action_space = env.action_space

# Recurrent Gaussian policy.
pol_net = PolNetLSTM(observation_space, action_space)
pol = GaussianPol(observation_space, action_space, pol_net, rnn=True)


def _build_q_pair(ob_space, ac_space):
    """Build one LSTM Q-function and a target copy with equal weights.

    Returns ``(qf_net, qf, targ_qf_net, targ_qf)``. The target network is
    a separate ``QNetLSTM`` initialised from the online network's
    ``state_dict``.
    """
    net = QNetLSTM(ob_space, ac_space)
    qf = DeterministicSAVfunc(ob_space, ac_space, net, rnn=True)
    targ_net = QNetLSTM(ob_space, ac_space)
    targ_net.load_state_dict(net.state_dict())
    targ_qf = DeterministicSAVfunc(ob_space, ac_space, targ_net, rnn=True)
    return net, qf, targ_net, targ_qf


# Two independent Q-functions, each paired with its own target network.
qf_net1, qf1, targ_qf_net1, targ_qf1 = _build_q_pair(
    observation_space, action_space)
qf_net2, qf2, targ_qf_net2, targ_qf2 = _build_q_pair(
    observation_space, action_space)
score_file = os.path.join(args.log, 'progress.csv') logger.add_tabular_output(score_file) env = GymEnv(args.env_name, log_dir=os.path.join(args.log, 'movie'), record_video=args.record) env.env.seed(args.seed) if args.c2d: env = C2DEnv(env) ob_space = env.observation_space ac_space = env.action_space if args.rnn: pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256) else: pol_net = PolNet(ob_space, ac_space) if isinstance(ac_space, gym.spaces.Box): pol = GaussianPol(ob_space, ac_space, pol_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) elif isinstance(ac_space, gym.spaces.Discrete): pol = CategoricalPol(ob_space, ac_space, pol_net, args.rnn, data_parallel=args.data_parallel,
# Tabular (CSV) logging goes to <args.log>/progress.csv.
score_file = os.path.join(args.log, 'progress.csv')
logger.add_tabular_output(score_file)

# Environment; its log_dir is the 'movie' sub-directory of args.log.
movie_dir = os.path.join(args.log, 'movie')
env = GymEnv(args.env_name, log_dir=movie_dir, record_video=args.record)
env.env.seed(args.seed)
if args.c2d:
    # Optional wrapper requested on the command line.
    env = C2DEnv(env)

ob_space = env.observation_space
ac_space = env.action_space

# Policy network: LSTM variant when --rnn is set, plain PolNet otherwise.
pol_net = (
    PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
    if args.rnn
    else PolNet(ob_space, ac_space)
)

# Choose the policy class from the action-space type, then build it once.
if isinstance(ac_space, gym.spaces.Box):
    pol_class = GaussianPol
elif isinstance(ac_space, gym.spaces.Discrete):
    pol_class = CategoricalPol
elif isinstance(ac_space, gym.spaces.MultiDiscrete):
    pol_class = MultiCategoricalPol
else:
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')
pol = pol_class(ob_space, ac_space, pol_net, args.rnn)

# Value network mirrors the policy's recurrent / feed-forward choice.
vf_net = (
    VNetLSTM(ob_space, h_size=256, cell_size=256)
    if args.rnn
    else VNet(ob_space)
)
observation_space = env.observation_space
action_space = env.action_space

if not args.ddpg:
    # Stochastic policy: LSTM network when --rnn is set, PolNet otherwise.
    pol_net = (
        PolNetLSTM(observation_space, action_space,
                   h_size=256, cell_size=256)
        if args.rnn
        else PolNet(observation_space, action_space)
    )

    # Choose the policy class from the action-space type.
    if isinstance(action_space, gym.spaces.Box):
        pol_class = GaussianPol
    elif isinstance(action_space, gym.spaces.Discrete):
        pol_class = CategoricalPol
    elif isinstance(action_space, gym.spaces.MultiDiscrete):
        pol_class = MultiCategoricalPol
    else:
        raise ValueError(
            'Only Box, Discrete, and MultiDiscrete are supported')
    pol = pol_class(observation_space, action_space, pol_net, args.rnn)
else:
    # --ddpg: deterministic policy network combined with OUActionNoise.
    pol_net = PolNet(observation_space, action_space,
                     args.pol_h1, args.pol_h2, deterministic=True)
    noise = OUActionNoise(action_space)
    pol = DeterministicActionNoisePol(
        observation_space, action_space, pol_net, noise)

# Single-worker episode sampler seeded from the command line.
sampler = EpiSampler(env, pol, num_parallel=1, seed=args.seed)
def test_learning(self):
    """Smoke-test one ``r2d2_sac.train`` call with LSTM networks.

    Builds a Gaussian policy on ``PolNetLSTM`` plus two ``QNetLSTM``
    Q-functions, each with a target copy initialised from the online
    network's weights. It then samples episodes, attaches priorities,
    hidden-state masks and per-Q-function hidden states to the trajectory,
    and runs the trainer once. The test passes when nothing raises; the
    value returned by the trainer is not inspected.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    pol_net = PolNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
    pol = GaussianPol(ob_space, ac_space, pol_net, rnn=True)

    # Twin Q-functions. Construction order per pair matches the original:
    # online net, online wrapper, target net, weight copy, target wrapper.
    qf_nets, qfs, targ_qfs = [], [], []
    for _ in range(2):
        qf_net = QNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
        qf = DeterministicSAVfunc(ob_space, ac_space, qf_net, rnn=True)
        targ_qf_net = QNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
        targ_qf_net.load_state_dict(qf_net.state_dict())
        targ_qf = DeterministicSAVfunc(
            ob_space, ac_space, targ_qf_net, rnn=True)
        qf_nets.append(qf_net)
        qfs.append(qf)
        targ_qfs.append(targ_qf)

    # Scalar (0-dim) trainable parameter initialised to zero.
    log_alpha = nn.Parameter(torch.zeros(()))

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    # Drop the sampler reference even when a step below raises; a plain
    # trailing `del` is skipped on failure.
    # NOTE(review): presumably EpiSampler frees its workers on deletion -
    # confirm against its implementation.
    try:
        optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
        optim_qfs = [torch.optim.Adam(net.parameters(), 3e-4)
                     for net in qf_nets]
        optim_alpha = torch.optim.Adam([log_alpha], 3e-4)

        epis = sampler.sample(pol, max_steps=32)

        traj = Traj()
        traj.add_epis(epis)

        traj = ef.add_next_obs(traj)
        max_pri = traj.get_max_pri()
        traj = ef.set_all_pris(traj, max_pri)
        traj = ef.compute_seq_pris(traj, 4)
        traj = ef.compute_h_masks(traj)
        # Store hidden states for every online / target Q-function under
        # index-suffixed keys ('q_hs0', 'targ_q_hs0', ...).
        for i, (qf, targ_qf) in enumerate(zip(qfs, targ_qfs)):
            traj = ef.compute_hs(
                traj, qf, hs_name='q_hs' + str(i), input_acs=True)
            traj = ef.compute_hs(
                traj, targ_qf, hs_name='targ_q_hs' + str(i), input_acs=True)
        traj.register_epis()

        # NOTE(review): the trailing positionals are passed through
        # unchanged; their meaning is defined by r2d2_sac.train's
        # signature, which is not visible here.
        r2d2_sac.train(
            traj,
            pol, qfs, targ_qfs, log_alpha,
            optim_pol, optim_qfs, optim_alpha,
            2, 32, 4, 2,
            0.01, 0.99, 2,
        )
    finally:
        del sampler
def setup_nets(self):
    """Create the policy and value function and store them on ``self``.

    Network selection, in priority order, for both policy and value net:
    ``args.mirror is True`` -> symmetric network; ``args.rnn`` -> LSTM
    network; ``args.net_version == 1`` -> ``PolNet`` / ``VNet``;
    otherwise ``PolNetB`` / ``VNetB``. When ``args.mirror == "new"`` the
    chosen network is additionally wrapped in ``SymNet`` / ``SymVNet``.

    Sets ``self.pol`` and ``self.vf``.

    Raises:
        ValueError: if the action space is not Box, Discrete or
            MultiDiscrete.
    """
    args = self.args
    env = self.env
    ob_space = env.observation_space
    ac_space = env.action_space

    # `mirror` is either the boolean True or the string "new"; the two
    # modes are distinguished by identity and equality respectively.
    mirrored = args.mirror is True
    wrap_symmetric = args.mirror == "new"

    # ---- policy network ----
    if mirrored:
        print("Initiating a symmetric network")
        pol_net = SymmetricNet(
            *env.unwrapped.mirror_sizes,
            hidden_size=int(args.hidden_size / 4),
            num_layers=args.num_layers,
            varying_std=args.varying_std,
            tanh_finish=args.tanh_finish,
            log_std=args.log_stdev,
        )
    elif args.rnn:
        pol_net = PolNetLSTM(ob_space, ac_space, h_size=256, cell_size=256)
    elif args.net_version == 1:
        pol_net = PolNet(ob_space, ac_space, log_std=args.log_stdev)
    else:
        pol_net = PolNetB(
            ob_space,
            ac_space,
            hidden_size=args.hidden_size,
            num_layers=args.num_layers,
            varying_std=args.varying_std,
            tanh_finish=args.tanh_finish,
            log_std=args.log_stdev,
        )

    if wrap_symmetric:
        print("Initiating a new symmetric network")
        # TODO: the action_space given to the wrapped pol_net is incorrect
        # in this case and is not easy to fix; for now this simply ignores
        # some of the final indices.
        pol_net = SymNet(
            pol_net,
            ob_space.shape[0],
            *env.unwrapped.sym_act_inds,
            varying_std=args.varying_std,
            log_std=args.log_stdev,
            deterministic=False,
        )

    # ---- policy ----
    if isinstance(ac_space, gym.spaces.Box):
        pol_class = GaussianPol
    elif isinstance(ac_space, gym.spaces.Discrete):
        pol_class = CategoricalPol
    elif isinstance(ac_space, gym.spaces.MultiDiscrete):
        pol_class = MultiCategoricalPol
    else:
        raise ValueError(
            "Only Box, Discrete, and MultiDiscrete are supported")

    # Policy and value function share the same data-parallel settings;
    # the parallel dimension is 1 for recurrent nets and 0 otherwise.
    parallel_kwargs = {
        "data_parallel": args.data_parallel,
        "parallel_dim": 1 if args.rnn else 0,
    }
    policy = pol_class(ob_space, ac_space, pol_net, args.rnn,
                       **parallel_kwargs)

    # ---- value network ----
    if mirrored:
        vf_net = SymmetricValue(
            *env.unwrapped.mirror_sizes[:3],
            hidden_size=args.hidden_size,
            num_layers=args.num_layers,
        )
    elif args.rnn:
        vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256)
    elif args.net_version == 1:
        vf_net = VNet(ob_space)
    else:
        vf_net = VNetB(
            ob_space,
            hidden_size=args.hidden_size,
            num_layers=args.num_layers,
        )

    if wrap_symmetric:
        print("Initiating a new symmetric value network")
        vf_net = SymVNet(vf_net, ob_space.shape[0])

    vf = DeterministicSVfunc(ob_space, vf_net, args.rnn, **parallel_kwargs)

    # Publish only after both objects were built successfully.
    self.pol, self.vf = policy, vf