Python compute_h_masks示例，machina.traj.epi_functional.compute_h_masks Python示例

示例#1

0

显示文件

    def test_learning_rnn(self):
        """Run a single ``trpo.train`` call with LSTM policy and value nets.

        Builds a categorical policy and a state-value function (both with
        ``rnn=True``) for ``self.env``, samples episodes, preprocesses them
        into a trajectory and hands the result to ``trpo.train``.
        """
        ob_space = self.env.observation_space
        ac_space = self.env.action_space
        gamma, lam = 0.99, 0.95

        # LSTM-backed policy.
        policy_net = PolNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
        policy = CategoricalPol(ob_space, ac_space, policy_net, rnn=True)

        # LSTM-backed state-value function.
        value_net = VNetLSTM(ob_space, h_size=32, cell_size=32)
        value_fn = DeterministicSVfunc(ob_space, value_net, rnn=True)

        epi_sampler = EpiSampler(self.env, policy, num_parallel=1)

        value_optim = torch.optim.Adam(value_net.parameters(), 3e-4)

        episodes = epi_sampler.sample(policy, max_steps=400)

        # Values -> returns -> advantages -> centering -> hidden-state masks.
        batch = Traj()
        batch.add_epis(episodes)
        batch = ef.compute_vs(batch, value_fn)
        batch = ef.compute_rets(batch, gamma)
        batch = ef.compute_advs(batch, gamma, lam)
        batch = ef.centerize_advs(batch)
        batch = ef.compute_h_masks(batch)
        batch.register_epis()

        # NOTE(review): the trailing positional 1 and 2 are presumably the
        # epoch count and batch size -- confirm against trpo.train.
        result_dict = trpo.train(batch, policy, value_fn, value_optim, 1, 2)

        # Drop the sampler reference explicitly once training has run.
        del epi_sampler

示例#2

0

显示文件

文件： test_algos.py 项目： takerfume/machina

    def test_learning(self):
        """Run ``ppo_clip.train`` and then ``ppo_kl.train`` on one trajectory.

        A categorical policy and a state-value function are built for
        ``self.env``; episodes are sampled once and the same preprocessed
        trajectory and optimizers are passed to both trainers.
        """
        ob_space = self.env.ob_space
        ac_space = self.env.ac_space
        gamma, lam = 0.99, 0.95
        lr = 3e-4

        policy_net = PolNet(ob_space, ac_space, h1=32, h2=32)
        policy = CategoricalPol(ob_space, ac_space, policy_net)

        value_net = VNet(ob_space, h1=32, h2=32)
        value_fn = DeterministicSVfunc(ob_space, value_net)

        epi_sampler = EpiSampler(self.env, policy, num_parallel=1)

        policy_optim = torch.optim.Adam(policy_net.parameters(), lr)
        value_optim = torch.optim.Adam(value_net.parameters(), lr)

        episodes = epi_sampler.sample(policy, max_steps=32)

        # Values -> returns -> advantages -> centering -> hidden-state masks.
        batch = Traj()
        batch.add_epis(episodes)
        batch = ef.compute_vs(batch, value_fn)
        batch = ef.compute_rets(batch, gamma)
        batch = ef.compute_advs(batch, gamma, lam)
        batch = ef.centerize_advs(batch)
        batch = ef.compute_h_masks(batch)
        batch.register_epis()

        # Keyword arguments common to both PPO variants.
        shared_kwargs = dict(
            traj=batch,
            pol=policy,
            vf=value_fn,
            optim_pol=policy_optim,
            optim_vf=value_optim,
            epoch=1,
            batch_size=32,
        )
        result_dict = ppo_clip.train(clip_param=0.2, **shared_kwargs)
        result_dict = ppo_kl.train(
            kl_beta=0.1, kl_targ=0.2, max_grad_norm=10, **shared_kwargs)

        # Drop the sampler reference explicitly once training has run.
        del epi_sampler

示例#3

0

显示文件

    def test_learning_rnn(self):
        """Run ``ppo_clip.train`` then ``ppo_kl.train`` with LSTM networks.

        A Gaussian policy and a state-value function (both with
        ``rnn=True``) are built for ``self.env``; episodes are sampled once
        and the same preprocessed trajectory is passed to both trainers.
        """
        ob_space = self.env.observation_space
        ac_space = self.env.action_space
        gamma, lam = 0.99, 0.95
        lr = 3e-4

        # LSTM-backed policy.
        policy_net = PolNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
        policy = GaussianPol(ob_space, ac_space, policy_net, rnn=True)

        # LSTM-backed state-value function.
        value_net = VNetLSTM(ob_space, h_size=32, cell_size=32)
        value_fn = DeterministicSVfunc(ob_space, value_net, rnn=True)

        epi_sampler = EpiSampler(self.env, policy, num_parallel=1)

        policy_optim = torch.optim.Adam(policy_net.parameters(), lr)
        value_optim = torch.optim.Adam(value_net.parameters(), lr)

        episodes = epi_sampler.sample(policy, max_steps=400)

        # Values -> returns -> advantages -> centering -> hidden-state masks.
        batch = Traj()
        batch.add_epis(episodes)
        batch = ef.compute_vs(batch, value_fn)
        batch = ef.compute_rets(batch, gamma)
        batch = ef.compute_advs(batch, gamma, lam)
        batch = ef.centerize_advs(batch)
        batch = ef.compute_h_masks(batch)
        batch.register_epis()

        # Keyword arguments common to both PPO variants.
        shared_kwargs = dict(
            traj=batch,
            pol=policy,
            vf=value_fn,
            optim_pol=policy_optim,
            optim_vf=value_optim,
            epoch=1,
            batch_size=2,
        )
        result_dict = ppo_clip.train(clip_param=0.2, **shared_kwargs)
        result_dict = ppo_kl.train(
            kl_beta=0.1, kl_targ=0.2, max_grad_norm=20, **shared_kwargs)

        # Drop the sampler reference explicitly once training has run.
        del epi_sampler

示例#4

0

显示文件

    def train(self, epis):
        """Preprocess ``epis`` into a trajectory and run ``ppo_clip.train``.

        Args:
            epis: Episodes handed to ``Traj.add_epis``.

        Returns:
            The result of ``ppo_clip.train`` with two extra entries,
            ``"traj_num_step"`` and ``"traj_num_epi"``, taken from the
            trajectory's ``num_step`` and ``num_epi``.
        """
        cfg = self.args

        batch = Traj(ddp=True, traj_device=self.device)
        batch.add_epis(epis)

        # NOTE(review): gamma and lam are read from a bare `args` name while
        # every other hyper-parameter below comes from `self.args` -- confirm
        # that both refer to the same object.
        batch = ef.compute_vs(batch, self.vf)
        batch = ef.compute_rets(batch, args.gamma)
        batch = ef.compute_advs(batch, args.gamma, args.lam)
        batch = ef.centerize_advs(batch)
        batch = ef.compute_h_masks(batch)
        batch.register_epis()

        # Logging is enabled only when this instance's rank is 0.
        stats = ppo_clip.train(
            traj=batch,
            pol=self.ddp_pol,
            vf=self.ddp_vf,
            clip_param=cfg.clip_param,
            optim_pol=self.optim_pol,
            optim_vf=self.optim_vf,
            epoch=cfg.epoch_per_iter,
            batch_size=cfg.batch_size,
            max_grad_norm=cfg.max_grad_norm,
            log_enable=self.rank == 0,
        )

        stats["traj_num_step"] = batch.num_step
        stats["traj_num_epi"] = batch.num_epi
        return stats

示例#5

0

显示文件

    def test_learning(self):
        """Run a single ``trpo.train`` call with a Gaussian policy.

        Builds the policy and a state-value function for ``self.env``,
        samples episodes, preprocesses them into a trajectory and hands the
        result to ``trpo.train``.
        """
        ob_space = self.env.observation_space
        ac_space = self.env.action_space
        gamma, lam = 0.99, 0.95

        policy_net = PolNet(ob_space, ac_space, h1=32, h2=32)
        policy = GaussianPol(ob_space, ac_space, policy_net)

        value_net = VNet(ob_space, h1=32, h2=32)
        value_fn = DeterministicSVfunc(ob_space, value_net)

        epi_sampler = EpiSampler(self.env, policy, num_parallel=1)

        value_optim = torch.optim.Adam(value_net.parameters(), 3e-4)

        episodes = epi_sampler.sample(policy, max_steps=32)

        # Values -> returns -> advantages -> centering -> hidden-state masks.
        batch = Traj()
        batch.add_epis(episodes)
        batch = ef.compute_vs(batch, value_fn)
        batch = ef.compute_rets(batch, gamma)
        batch = ef.compute_advs(batch, gamma, lam)
        batch = ef.centerize_advs(batch)
        batch = ef.compute_h_masks(batch)
        batch.register_epis()

        # NOTE(review): the trailing positional 1 and 24 are presumably the
        # epoch count and batch size -- confirm against trpo.train.
        result_dict = trpo.train(batch, policy, value_fn, value_optim, 1, 24)

        # Drop the sampler reference explicitly once training has run.
        del epi_sampler

示例#6

0

显示文件

    def test_learning(self):
        """Run one ``on_pol_teacher_distill.train`` call.

        Teacher and student Gaussian policies are built with different
        hidden sizes; episodes are sampled with the student policy and only
        the hidden-state masks are computed before training.
        """
        ob_space = self.env.observation_space
        ac_space = self.env.action_space

        # Teacher is created first, then the (smaller) student.
        teacher_net = PolNet(ob_space, ac_space, h1=200, h2=100)
        student_net = PolNet(ob_space, ac_space, h1=190, h2=90)

        teacher_pol = GaussianPol(ob_space, ac_space, teacher_net)
        student_pol = GaussianPol(ob_space, ac_space, student_net)

        student_sampler = EpiSampler(self.env, student_pol, num_parallel=1)

        # The optimizer is built from the policy object's parameters.
        student_optim = torch.optim.Adam(student_pol.parameters(), 3e-4)

        episodes = student_sampler.sample(student_pol, max_steps=32)

        batch = Traj()
        batch.add_epis(episodes)
        batch = ef.compute_h_masks(batch)
        batch.register_epis()

        result_dict = on_pol_teacher_distill.train(
            traj=batch,
            student_pol=student_pol,
            teacher_pol=teacher_pol,
            student_optim=student_optim,
            epoch=1,
            batchsize=32,
        )

        # Drop the sampler reference explicitly once training has run.
        del student_sampler

示例#7

0

显示文件

文件： test_algos.py 项目： yumion/machina

    def test_learning(self):
        """Run one ``gail.train`` call (``rl_type='trpo'``).

        Expert episodes are unpickled from ``data/expert_epis``; agent
        episodes are sampled from ``self.env`` and receive pseudo rewards
        computed with the discriminator before the value / return /
        advantage preprocessing.
        """
        ob_space = self.env.observation_space
        ac_space = self.env.action_space
        gamma, lam = 0.99, 0.95
        lr = 3e-4

        policy_net = PolNet(ob_space, ac_space, h1=32, h2=32)
        policy = GaussianPol(ob_space, ac_space, policy_net)

        value_net = VNet(ob_space)
        value_fn = DeterministicSVfunc(ob_space, value_net)

        discrim_net = DiscrimNet(ob_space, ac_space, h1=32, h2=32)
        discrim = DeterministicSAVfunc(ob_space, ac_space, discrim_net)

        epi_sampler = EpiSampler(self.env, policy, num_parallel=1)

        value_optim = torch.optim.Adam(value_net.parameters(), lr)
        discrim_optim = torch.optim.Adam(discrim_net.parameters(), lr)

        # Expert demonstrations come from a pickle file addressed relative
        # to the current working directory.
        expert_path = os.path.join('data/expert_epis', 'Pendulum-v0_2epis.pkl')
        with open(expert_path, 'rb') as f:
            expert_epis = pickle.load(f)
        expert_traj = Traj()
        expert_traj.add_epis(expert_epis)
        expert_traj.register_epis()

        episodes = epi_sampler.sample(policy, max_steps=32)

        # Pseudo rewards first, then the usual preprocessing chain.
        agent_traj = Traj()
        agent_traj.add_epis(episodes)
        agent_traj = ef.compute_pseudo_rews(agent_traj, discrim)
        agent_traj = ef.compute_vs(agent_traj, value_fn)
        agent_traj = ef.compute_rets(agent_traj, gamma)
        agent_traj = ef.compute_advs(agent_traj, gamma, lam)
        agent_traj = ef.centerize_advs(agent_traj)
        agent_traj = ef.compute_h_masks(agent_traj)
        agent_traj.register_epis()

        result_dict = gail.train(
            agent_traj, expert_traj, policy, value_fn, discrim,
            value_optim, discrim_optim,
            rl_type='trpo',
            epoch=1,
            batch_size=32,
            discrim_batch_size=32,
            discrim_step=1,
            pol_ent_beta=1e-3,
            discrim_ent_beta=1e-5,
        )

        # Drop the sampler reference explicitly once training has run.
        del epi_sampler

示例#8

0

显示文件

文件： test_algos.py 项目： yumion/machina

    def test_learning_rnn(self):
        """Run one ``mpc.train_dm`` call with an LSTM dynamics model.

        Episodes are sampled with an ``MPCPol`` that uses the local
        ``rew_func``; the trajectory gets next observations and
        hidden-state masks before the dynamics model is trained.
        """
        def rew_func(next_obs,
                     acs,
                     mean_obs=0.,
                     std_obs=1.,
                     mean_acs=0.,
                     std_acs=1.):
            """Return rewards from de-normalized next observations and actions."""
            raw_obs = next_obs * std_obs + mean_obs
            raw_acs = acs * std_acs + mean_acs
            # Pendulum
            obs0_cost = torch.acos(raw_obs[:, 0].clamp(min=-1, max=1))**2
            obs2_cost = 0.1 * (raw_obs[:, 2].clamp(min=-8, max=8)**2)
            acs_cost = 0.001 * raw_acs.squeeze(-1)**2
            rews = -(obs0_cost + obs2_cost + acs_cost)
            return rews.squeeze(0)

        ob_space = self.env.observation_space
        ac_space = self.env.action_space

        # init models
        model_net = ModelNetLSTM(ob_space, ac_space)
        dyn_model = DeterministicSModel(
            ob_space, ac_space, model_net,
            rnn=True, data_parallel=False, parallel_dim=0)

        # NOTE(review): the two positional 1s after rew_func are presumably
        # sample count and horizon -- confirm against MPCPol's signature.
        mpc_policy = MPCPol(
            ob_space, ac_space, model_net, rew_func, 1, 1,
            mean_obs=0., std_obs=1., mean_acs=0., std_acs=1.,
            rnn=True)
        model_optim = torch.optim.Adam(model_net.parameters(), 1e-3)

        # sample with mpc policy
        epi_sampler = EpiSampler(self.env, mpc_policy, num_parallel=1)
        episodes = epi_sampler.sample(mpc_policy, max_epis=1)

        batch = Traj()
        batch.add_epis(episodes)
        batch = ef.add_next_obs(batch)
        batch = ef.compute_h_masks(batch)
        batch.register_epis()
        # NOTE(review): the trajectory is added to itself here -- confirm
        # this is intentional rather than a leftover.
        batch.add_traj(batch)

        # train
        result_dict = mpc.train_dm(
            batch, dyn_model, model_optim, epoch=1, batch_size=1)

        # Drop the sampler reference explicitly once training has run.
        del epi_sampler

示例#9

0

显示文件

文件： test_algos.py 项目： takerfume/machina

    def test_learning(self):
        """Run one ``airl.train`` call (``rl_type='trpo'``).

        Expert episodes are unpickled from ``data/expert_epis``; agent
        episodes are sampled from ``self.env`` and receive state-only pseudo
        rewards from ``rewf`` before the value / return / advantage
        preprocessing. A single optimizer covers the reward and shaping
        networks.
        """
        ob_space = self.env.ob_space
        ac_space = self.env.ac_space
        gamma, lam = 0.99, 0.95
        lr = 3e-4

        policy_net = PolNet(ob_space, ac_space, h1=32, h2=32)
        policy = GaussianPol(ob_space, ac_space, policy_net)

        value_net = VNet(ob_space)
        value_fn = DeterministicSVfunc(ob_space, value_net)

        reward_net = VNet(ob_space, h1=32, h2=32)
        reward_fn = DeterministicSVfunc(ob_space, reward_net)
        shaping_net = VNet(ob_space, h1=32, h2=32)
        shaping_fn = DeterministicSVfunc(ob_space, shaping_net)

        epi_sampler = EpiSampler(self.env, policy, num_parallel=1)

        value_optim = torch.optim.Adam(value_net.parameters(), lr)
        # One optimizer over the parameters of both discriminator parts.
        discrim_params = (
            list(reward_net.parameters()) + list(shaping_net.parameters()))
        discrim_optim = torch.optim.Adam(discrim_params, lr)

        # Expert demonstrations come from a pickle file addressed relative
        # to the current working directory.
        expert_path = os.path.join('data/expert_epis', 'Pendulum-v0_2epis.pkl')
        with open(expert_path, 'rb') as f:
            expert_epis = pickle.load(f)
        expert_traj = Traj()
        expert_traj.add_epis(expert_epis)
        expert_traj = ef.add_next_obs(expert_traj)
        expert_traj.register_epis()

        episodes = epi_sampler.sample(policy, max_steps=32)

        # Next observations and pseudo rewards first, then the usual chain.
        agent_traj = Traj()
        agent_traj.add_epis(episodes)
        agent_traj = ef.add_next_obs(agent_traj)
        agent_traj = ef.compute_pseudo_rews(
            agent_traj, rew_giver=reward_fn, state_only=True)
        agent_traj = ef.compute_vs(agent_traj, value_fn)
        agent_traj = ef.compute_rets(agent_traj, gamma)
        agent_traj = ef.compute_advs(agent_traj, gamma, lam)
        agent_traj = ef.centerize_advs(agent_traj)
        agent_traj = ef.compute_h_masks(agent_traj)
        agent_traj.register_epis()

        result_dict = airl.train(
            agent_traj, expert_traj, policy, value_fn,
            value_optim, discrim_optim,
            rewf=reward_fn,
            shaping_vf=shaping_fn,
            rl_type='trpo',
            epoch=1,
            batch_size=32,
            discrim_batch_size=32,
            discrim_step=1,
            pol_ent_beta=1e-3,
            gamma=0.99,
        )

        # Drop the sampler reference explicitly once training has run.
        del epi_sampler

示例#10

0

显示文件

文件： run_bc_ppo.py 项目： syundo0730/rl-robo-book-examples

                                     args.bc_batch_size)
    torch.save(pol.state_dict(), os.path.join(args.log, 'models',
                                              'pol_bc.pkl'))

while args.max_epis > total_epi:
    with measure('sample'):
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train'):
        traj = Traj()
        traj.add_epis(epis)

        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, args.gamma)
        traj = ef.compute_advs(traj, args.gamma, args.lam)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        if args.data_parallel:
            pol.dp_run = True
            vf.dp_run = True

        if args.ppo_type == 'clip':
            result_dict = ppo_clip.train(traj=traj,
                                         pol=pol,
                                         vf=vf,
                                         clip_param=args.clip_param,
                                         optim_pol=optim_pol,
                                         optim_vf=optim_vf,
                                         epoch=args.epoch_per_iter,
                                         batch_size=args.batch_size if

示例#11

0

显示文件

文件： run_teacher_distill.py 项目： yuishihara/machina

total_epi = 0
total_step = 0
max_rew = -1e6

while args.max_epis > total_epi:
    with measure('sample'):
        if args.sampling_policy == 'teacher':
            epis = teacher_sampler.sample(
                t_pol, max_epis=args.max_epis_per_iter)
        else:
            epis = student_sampler.sample(
                s_pol, max_epis=args.max_epis_per_iter)
    with measure('train'):
        traj = Traj()
        traj.add_epis(epis)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()
        result_dict = on_pol_teacher_distill.train(
            traj=traj,
            student_pol=s_pol,
            teacher_pol=t_pol,
            student_optim=optim_pol,
            epoch=args.epoch_per_iter,
            batchsize=args.batch_size)

    logger.log('Testing Student-policy')
    with measure('sample'):
        epis_measure = student_sampler.sample(
            s_pol, max_epis=args.max_epis_per_iter)

    with measure('measure'):

示例#12

0

显示文件

文件： run_r2d2_sac.py 项目： yuishihara/machina

total_step = 0
max_rew = -1e6

while args.max_epis > total_epi:
    with measure('sample'):
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)

    with measure('train'):
        on_traj = Traj(traj_device='cpu')
        on_traj.add_epis(epis)

        on_traj = ef.add_next_obs(on_traj)
        max_pri = on_traj.get_max_pri()
        on_traj = ef.set_all_pris(on_traj, max_pri)
        on_traj = ef.compute_seq_pris(on_traj, args.seq_length)
        on_traj = ef.compute_h_masks(on_traj)
        for i in range(len(qfs)):
            on_traj = ef.compute_hs(
                on_traj, qfs[i], hs_name='q_hs'+str(i), input_acs=True)
            on_traj = ef.compute_hs(
                on_traj, targ_qfs[i], hs_name='targ_q_hs'+str(i), input_acs=True)
        on_traj.register_epis()

        off_traj.add_traj(on_traj)

        total_epi += on_traj.num_epi
        step = on_traj.num_step
        total_step += step

        result_dict = r2d2_sac.train(
            off_traj,

示例#13

0

显示文件

    with measure('bc pretrain'):
        _ = behavior_clone.train(expert_traj, pol, optim_pol,
                                 args.bc_batch_size, args.bc_epoch)

while args.max_epis > total_epi:
    with measure('sample'):
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train'):
        agent_traj = Traj()
        agent_traj.add_epis(epis)
        agent_traj = ef.compute_pseudo_rews(agent_traj, discrim)
        agent_traj = ef.compute_vs(agent_traj, vf)
        agent_traj = ef.compute_rets(agent_traj, args.gamma)
        agent_traj = ef.compute_advs(agent_traj, args.gamma, args.lam)
        agent_traj = ef.centerize_advs(agent_traj)
        agent_traj = ef.compute_h_masks(agent_traj)
        agent_traj.register_epis()

        if args.data_parallel:
            pol.dp_run = True
            vf.dp_run = True
            discrim.dp_run = True

        if args.rl_type == 'trpo':
            result_dict = gail.train(
                agent_traj,
                expert_traj,
                pol,
                vf,
                discrim,
                optim_vf,

示例#14

0

显示文件

文件： main.py 项目： farzadab/walking-benchmark

    def train(self):
        """Run the sample/train loop until ``num_total_frames`` steps are seen.

        Each iteration samples episodes with an ``EpiSampler``, preprocesses
        them into a ``Traj`` (optionally appending the mirrored trajectory),
        calls ``ppo_clip.train``, records the results through ``logger``,
        saves the "max" and "last" models and steps both schedulers.
        """
        args = self.args

        # TODO: cuda seems to be broken, I don't care about it right now
        # if args.cuda:
        #     # current_obs = current_obs.cuda()
        #     rollouts.cuda()

        self.train_start_time = time.time()
        total_epi = 0
        total_step = 0
        # Best mean episode reward seen so far; starts very low so the first
        # iteration always saves a "max" checkpoint.
        max_rew = -1e6
        sampler = None

        score_file = os.path.join(self.logger.get_logdir(), "progress.csv")
        logger.add_tabular_output(score_file)

        num_total_frames = args.num_total_frames

        # Mirroring is enabled only when requested AND the unwrapped env
        # exposes `mirror_indices`.
        mirror_function = None
        if args.mirror_tuples and hasattr(self.env.unwrapped,
                                          "mirror_indices"):
            mirror_function = get_mirror_function(
                **self.env.unwrapped.mirror_indices)
            # The local step budget is doubled when mirroring is active.
            num_total_frames *= 2
            if not args.tanh_finish:
                warnings.warn(
                    "When `mirror_tuples` is `True`,"
                    " `tanh_finish` should be set to `True` as well."
                    " Otherwise there is a chance of the training blowing up.")

        while num_total_frames > total_step:
            # setup the correct curriculum learning environment/parameters
            # NOTE(review): the progress fraction divides by the undoubled
            # `args.num_total_frames`, while the loop bound uses the local
            # (possibly doubled) `num_total_frames` -- confirm intended.
            new_curriculum = self.curriculum_handler(total_step /
                                                     args.num_total_frames)

            # (Re)create the sampler on the first iteration and whenever the
            # curriculum handler returns a truthy value.
            if total_step == 0 or new_curriculum:
                if sampler is not None:
                    del sampler
                sampler = EpiSampler(
                    self.env,
                    self.pol,
                    num_parallel=self.args.num_processes,
                    seed=self.args.seed + total_step,  # TODO: better fix?
                )

            with measure("sample"):
                epis = sampler.sample(self.pol,
                                      max_steps=args.num_steps *
                                      args.num_processes)

            with measure("train"):
                with measure("epis"):
                    traj = Traj()
                    traj.add_epis(epis)

                    # Values -> returns -> advantages -> centering ->
                    # hidden-state masks.
                    traj = ef.compute_vs(traj, self.vf)
                    traj = ef.compute_rets(traj, args.decay_gamma)
                    traj = ef.compute_advs(traj, args.decay_gamma,
                                           args.gae_lambda)
                    traj = ef.centerize_advs(traj)
                    traj = ef.compute_h_masks(traj)
                    traj.register_epis()

                    # Append the mirrored copy of the trajectory, if enabled.
                    if mirror_function:
                        traj.add_traj(mirror_function(traj))

                # if args.data_parallel:
                #     self.pol.dp_run = True
                #     self.vf.dp_run = True

                # The batch size depends on whether `args.rnn` is set.
                result_dict = ppo_clip.train(
                    traj=traj,
                    pol=self.pol,
                    vf=self.vf,
                    clip_param=args.clip_eps,
                    optim_pol=self.optim_pol,
                    optim_vf=self.optim_vf,
                    epoch=args.epoch_per_iter,
                    batch_size=args.batch_size
                    if not args.rnn else args.rnn_batch_size,
                    max_grad_norm=args.max_grad_norm,
                )

                # if args.data_parallel:
                #     self.pol.dp_run = False
                #     self.vf.dp_run = False

            ## append the metrics to the `results_dict` (reported in the progress.csv)
            result_dict.update(self.get_extra_metrics(epis))

            # Counters come from `traj`, i.e. after any mirrored trajectory
            # was appended; rewards come from the originally sampled `epis`.
            total_epi += traj.num_epi
            step = traj.num_step
            total_step += step
            rewards = [np.sum(epi["rews"]) for epi in epis]
            mean_rew = np.mean(rewards)
            logger.record_results(
                self.logger.get_logdir(),
                result_dict,
                score_file,
                total_epi,
                step,
                total_step,
                rewards,
                plot_title=args.env,
            )

            # Save a "max" checkpoint whenever the mean reward improves.
            if mean_rew > max_rew:
                self.save_models("max")
                max_rew = mean_rew

            self.save_models("last")

            self.scheduler_pol.step()
            self.scheduler_vf.step()

            # NOTE(review): the last sampler is not deleted inside this
            # block when the loop exits -- confirm cleanup happens elsewhere.
            del traj

示例#15

0

显示文件

文件： run_mpc.py 项目： takerfume/machina

######################

### Prepare the dataset D_RAND ###

# Performing rollouts to collect training data
rand_sampler = EpiSampler(env,
                          random_pol,
                          num_parallel=args.num_parallel,
                          seed=args.seed)

epis = rand_sampler.sample(random_pol, max_epis=args.num_random_rollouts)
epis = add_noise_to_init_obs(epis, args.noise_to_init_obs)
traj = Traj(traj_device='cpu')
traj.add_epis(epis)
traj = ef.add_next_obs(traj)
traj = ef.compute_h_masks(traj)
# obs, next_obs, and acs should become mean 0, std 1
traj, mean_obs, std_obs, mean_acs, std_acs = ef.normalize_obs_and_acs(traj)
traj.register_epis()

del rand_sampler

### Train Dynamics Model ###

# initialize dynamics model and mpc policy
if args.rnn:
    dm_net = ModelNetLSTM(ob_space, ac_space)
else:
    dm_net = ModelNet(ob_space, ac_space)
dm = DeterministicSModel(ob_space,
                         ac_space,

示例#16

0

显示文件

    def test_learning(self):
        """Run one ``r2d2_sac.train`` call with LSTM policy and Q-networks.

        Two Q-functions are built, each with a target whose network starts
        from the same state dict. Sampled episodes get next observations,
        priorities, sequence priorities, hidden-state masks and per-network
        hidden states before training.
        """
        ob_space = self.env.observation_space
        ac_space = self.env.action_space
        lr = 3e-4

        def build_q_pair():
            """Return ``(net, qf, targ_qf)``; the target net copies ``net``."""
            net = QNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
            qf = DeterministicSAVfunc(ob_space, ac_space, net, rnn=True)
            targ_net = QNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
            targ_net.load_state_dict(net.state_dict())
            targ_qf = DeterministicSAVfunc(
                ob_space, ac_space, targ_net, rnn=True)
            return net, qf, targ_qf

        policy_net = PolNetLSTM(ob_space, ac_space, h_size=32, cell_size=32)
        policy = GaussianPol(ob_space, ac_space, policy_net, rnn=True)

        # Networks are created in the order: q1, target q1, q2, target q2.
        qf_net1, qf1, targ_qf1 = build_q_pair()
        qf_net2, qf2, targ_qf2 = build_q_pair()

        qfs = [qf1, qf2]
        targ_qfs = [targ_qf1, targ_qf2]

        log_alpha = nn.Parameter(torch.zeros(()))

        epi_sampler = EpiSampler(self.env, policy, num_parallel=1)

        policy_optim = torch.optim.Adam(policy_net.parameters(), lr)
        qf_optims = [
            torch.optim.Adam(qf_net1.parameters(), lr),
            torch.optim.Adam(qf_net2.parameters(), lr),
        ]
        alpha_optim = torch.optim.Adam([log_alpha], lr)

        episodes = epi_sampler.sample(policy, max_steps=32)

        batch = Traj()
        batch.add_epis(episodes)

        batch = ef.add_next_obs(batch)
        # Every priority is set to the trajectory's current maximum.
        max_pri = batch.get_max_pri()
        batch = ef.set_all_pris(batch, max_pri)
        batch = ef.compute_seq_pris(batch, 4)
        batch = ef.compute_h_masks(batch)
        # Hidden states are stored per Q-function under indexed names.
        for i, (qf, targ_qf) in enumerate(zip(qfs, targ_qfs)):
            batch = ef.compute_hs(
                batch, qf, hs_name='q_hs' + str(i), input_acs=True)
            batch = ef.compute_hs(
                batch, targ_qf, hs_name='targ_q_hs' + str(i), input_acs=True)
        batch.register_epis()

        # NOTE(review): the positional numeric arguments are passed exactly
        # as in the original call; their meaning is defined by
        # r2d2_sac.train's signature -- confirm there.
        result_dict = r2d2_sac.train(
            batch,
            policy, qfs, targ_qfs, log_alpha,
            policy_optim, qf_optims, alpha_optim,
            2, 32, 4, 2,
            0.01, 0.99, 2,
        )

        # Drop the sampler reference explicitly once training has run.
        del epi_sampler

示例#17

0

显示文件

文件： run_mixed_env.py 项目： takerfume/machina

total_step = 0
max_rew = -1e6
while args.max_epis > total_epi:
    with measure('sample'):
        epis1 = sampler1.sample(pol, max_epis=args.max_epis_per_iter)
        epis2 = sampler2.sample(pol, max_epis=args.max_epis_per_iter)
    with measure('train'):
        traj1 = Traj()
        traj2 = Traj()

        traj1.add_epis(epis1)
        traj1 = ef.compute_vs(traj1, vf)
        traj1 = ef.compute_rets(traj1, args.gamma)
        traj1 = ef.compute_advs(traj1, args.gamma, args.lam)
        traj1 = ef.centerize_advs(traj1)
        traj1 = ef.compute_h_masks(traj1)
        traj1.register_epis()

        traj2.add_epis(epis2)
        traj2 = ef.compute_vs(traj2, vf)
        traj2 = ef.compute_rets(traj2, args.gamma)
        traj2 = ef.compute_advs(traj2, args.gamma, args.lam)
        traj2 = ef.centerize_advs(traj2)
        traj2 = ef.compute_h_masks(traj2)
        traj2.register_epis()

        traj1.add_traj(traj2)

        if args.data_parallel:
            pol.dp_run = True
            vf.dp_run = True