Exemplo n.º 1
0
    def test_learning(self):
        ob_space = self.env.real_observation_space
        skill_space = self.env.skill_space
        ob_skill_space = self.env.observation_space
        ac_space = self.env.action_space
        ob_dim = ob_skill_space.shape[0] - 4
        f_dim = ob_dim
        def discrim_f(x): return x

        pol_net = PolNet(ob_skill_space, ac_space)
        pol = GaussianPol(ob_skill_space, ac_space, pol_net)
        qf_net1 = QNet(ob_skill_space, ac_space)
        qf1 = DeterministicSAVfunc(ob_skill_space, ac_space, qf_net1)
        targ_qf_net1 = QNet(ob_skill_space, ac_space)
        targ_qf_net1.load_state_dict(qf_net1.state_dict())
        targ_qf1 = DeterministicSAVfunc(ob_skill_space, ac_space, targ_qf_net1)
        qf_net2 = QNet(ob_skill_space, ac_space)
        qf2 = DeterministicSAVfunc(ob_skill_space, ac_space, qf_net2)
        targ_qf_net2 = QNet(ob_skill_space, ac_space)
        targ_qf_net2.load_state_dict(qf_net2.state_dict())
        targ_qf2 = DeterministicSAVfunc(ob_skill_space, ac_space, targ_qf_net2)
        qfs = [qf1, qf2]
        targ_qfs = [targ_qf1, targ_qf2]
        log_alpha = nn.Parameter(torch.ones(()))

        high = np.array([np.finfo(np.float32).max]*f_dim)
        f_space = gym.spaces.Box(-high, high, dtype=np.float32)
        discrim_net = DiaynDiscrimNet(
            f_space, skill_space, h_size=100, discrim_f=discrim_f)
        discrim = DeterministicSVfunc(f_space, discrim_net)

        optim_pol = torch.optim.Adam(pol_net.parameters(), 1e-4)
        optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
        optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
        optim_qfs = [optim_qf1, optim_qf2]
        optim_alpha = torch.optim.Adam([log_alpha], 1e-4)
        optim_discrim = torch.optim.SGD(discrim.parameters(),
                                        lr=0.001, momentum=0.9)

        off_traj = Traj()
        sampler = EpiSampler(self.env, pol, num_parallel=1)

        epis = sampler.sample(pol, max_steps=200)
        on_traj = Traj()
        on_traj.add_epis(epis)
        on_traj = ef.add_next_obs(on_traj)
        on_traj = ef.compute_diayn_rews(
            on_traj, lambda x: diayn_sac.calc_rewards(x, 4, discrim))
        on_traj.register_epis()
        off_traj.add_traj(on_traj)
        step = on_traj.num_step
        log_alpha = nn.Parameter(np.log(0.1)*torch.ones(()))  # fix alpha
        result_dict = diayn_sac.train(
            off_traj, pol, qfs, targ_qfs, log_alpha,
            optim_pol, optim_qfs, optim_alpha,
            step, 128, 5e-3, 0.99, 1, discrim, 4, True)
        discrim_losses = diayn.train(
            discrim, optim_discrim, on_traj, 32, 100, 4)

        del sampler
Exemplo n.º 2
0
    def test_learning_rnn(self):
        def rew_func(next_obs,
                     acs,
                     mean_obs=0.,
                     std_obs=1.,
                     mean_acs=0.,
                     std_acs=1.):
            next_obs = next_obs * std_obs + mean_obs
            acs = acs * std_acs + mean_acs
            # Pendulum
            rews = -(torch.acos(next_obs[:, 0].clamp(min=-1, max=1))**2 + 0.1 *
                     (next_obs[:, 2].clamp(min=-8, max=8)**2) +
                     0.001 * acs.squeeze(-1)**2)
            rews = rews.squeeze(0)

            return rews

        # init models
        dm_net = ModelNetLSTM(self.env.observation_space,
                              self.env.action_space)
        dm = DeterministicSModel(self.env.observation_space,
                                 self.env.action_space,
                                 dm_net,
                                 rnn=True,
                                 data_parallel=False,
                                 parallel_dim=0)

        mpc_pol = MPCPol(self.env.observation_space,
                         self.env.action_space,
                         dm_net,
                         rew_func,
                         1,
                         1,
                         mean_obs=0.,
                         std_obs=1.,
                         mean_acs=0.,
                         std_acs=1.,
                         rnn=True)
        optim_dm = torch.optim.Adam(dm_net.parameters(), 1e-3)

        # sample with mpc policy
        sampler = EpiSampler(self.env, mpc_pol, num_parallel=1)
        epis = sampler.sample(mpc_pol, max_epis=1)

        traj = Traj()
        traj.add_epis(epis)
        traj = ef.add_next_obs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()
        traj.add_traj(traj)

        # train
        result_dict = mpc.train_dm(traj, dm, optim_dm, epoch=1, batch_size=1)

        del sampler
Exemplo n.º 3
0
print('start')
while args.max_epis > total_epi:
    with measure('sample'):
        print('sampling')
        # policyにしたがって行動し、経験を貯める(env.stepをone_epiの__init__内で行っている)
        # off-policy
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train'):
        # on-policyのサンプリング
        print('on-policy')
        on_traj = Traj(traj_device='cpu')
        on_traj.add_epis(epis)
        on_traj = epi_functional.add_next_obs(on_traj)
        on_traj.register_epis()
        off_traj.add_traj(on_traj)  # off-policyに加える

        # episodeとstepのカウント
        total_epi += on_traj.num_epi
        step = on_traj.num_step
        total_step += step
        epoch = step

        if args.data_parallel:
            qf.dp_run = True
            lagged_qf.dp_run = True
            targ_qf1.dp_run = True
            targ_qf2.dp_run = True
        # train
        print('train')
        result_dict = qtopt.train(off_traj,
Exemplo n.º 4
0
        on_traj = Traj(traj_device='cpu')
        on_traj.add_epis(epis)

        on_traj = ef.add_next_obs(on_traj)
        max_pri = on_traj.get_max_pri()
        on_traj = ef.set_all_pris(on_traj, max_pri)
        on_traj = ef.compute_seq_pris(on_traj, args.seq_length)
        on_traj = ef.compute_h_masks(on_traj)
        for i in range(len(qfs)):
            on_traj = ef.compute_hs(
                on_traj, qfs[i], hs_name='q_hs'+str(i), input_acs=True)
            on_traj = ef.compute_hs(
                on_traj, targ_qfs[i], hs_name='targ_q_hs'+str(i), input_acs=True)
        on_traj.register_epis()

        off_traj.add_traj(on_traj)

        total_epi += on_traj.num_epi
        step = on_traj.num_step
        total_step += step

        result_dict = r2d2_sac.train(
            off_traj,
            pol, qfs, targ_qfs, log_alpha,
            optim_pol, optim_qfs, optim_alpha,
            step//50, args.rnn_batch_size, args.seq_length, args.burn_in_length,
            args.tau, args.gamma, args.sampling, not args.no_reparam
        )

    rewards = [np.sum(epi['rews']) for epi in epis]
    mean_rew = np.mean(rewards)
Exemplo n.º 5
0
                         std_acs, args.rnn)
        epis = rl_sampler.sample(mpc_pol, max_epis=args.max_epis_per_iter)

        curr_traj = Traj(traj_device='cpu')
        curr_traj.add_epis(epis)

        curr_traj = ef.add_next_obs(curr_traj)
        curr_traj = ef.compute_h_masks(curr_traj)
        traj = ef.normalize_obs_and_acs(curr_traj,
                                        mean_obs,
                                        std_obs,
                                        mean_acs,
                                        std_acs,
                                        return_statistic=False)
        curr_traj.register_epis()
        traj.add_traj(curr_traj)

    total_epi += curr_traj.num_epi
    step = curr_traj.num_step
    total_step += step
    rewards = [np.sum(epi['rews']) for epi in epis]
    mean_rew = np.mean(rewards)
    logger.record_results(args.log,
                          result_dict,
                          score_file,
                          total_epi,
                          step,
                          total_step,
                          rewards,
                          plot_title=args.env_name)
Exemplo n.º 6
0
 def test_add_traj(self):
     new_traj = Traj()
     new_traj.add_traj(self.traj)
     assert new_traj.num_epi == self.traj.num_epi
     assert new_traj.num_step == self.traj.num_step
Exemplo n.º 7
0
        traj1 = ef.compute_vs(traj1, vf)
        traj1 = ef.compute_rets(traj1, args.gamma)
        traj1 = ef.compute_advs(traj1, args.gamma, args.lam)
        traj1 = ef.centerize_advs(traj1)
        traj1 = ef.compute_h_masks(traj1)
        traj1.register_epis()

        traj2.add_epis(epis2)
        traj2 = ef.compute_vs(traj2, vf)
        traj2 = ef.compute_rets(traj2, args.gamma)
        traj2 = ef.compute_advs(traj2, args.gamma, args.lam)
        traj2 = ef.centerize_advs(traj2)
        traj2 = ef.compute_h_masks(traj2)
        traj2.register_epis()

        traj1.add_traj(traj2)

        if args.data_parallel:
            pol.dp_run = True
            vf.dp_run = True

        result_dict = ppo_clip.train(traj=traj1,
                                     pol=pol,
                                     vf=vf,
                                     clip_param=args.clip_param,
                                     optim_pol=optim_pol,
                                     optim_vf=optim_vf,
                                     epoch=args.epoch_per_iter,
                                     batch_size=args.batch_size
                                     if not args.rnn else args.rnn_batch_size,
                                     max_grad_norm=args.max_grad_norm)
Exemplo n.º 8
0
    def train(self):
        args = self.args

        # TODO: cuda seems to be broken, I don't care about it right now
        # if args.cuda:
        #     # current_obs = current_obs.cuda()
        #     rollouts.cuda()

        self.train_start_time = time.time()
        total_epi = 0
        total_step = 0
        max_rew = -1e6
        sampler = None

        score_file = os.path.join(self.logger.get_logdir(), "progress.csv")
        logger.add_tabular_output(score_file)

        num_total_frames = args.num_total_frames

        mirror_function = None
        if args.mirror_tuples and hasattr(self.env.unwrapped,
                                          "mirror_indices"):
            mirror_function = get_mirror_function(
                **self.env.unwrapped.mirror_indices)
            num_total_frames *= 2
            if not args.tanh_finish:
                warnings.warn(
                    "When `mirror_tuples` is `True`,"
                    " `tanh_finish` should be set to `True` as well."
                    " Otherwise there is a chance of the training blowing up.")

        while num_total_frames > total_step:
            # setup the correct curriculum learning environment/parameters
            new_curriculum = self.curriculum_handler(total_step /
                                                     args.num_total_frames)

            if total_step == 0 or new_curriculum:
                if sampler is not None:
                    del sampler
                sampler = EpiSampler(
                    self.env,
                    self.pol,
                    num_parallel=self.args.num_processes,
                    seed=self.args.seed + total_step,  # TODO: better fix?
                )

            with measure("sample"):
                epis = sampler.sample(self.pol,
                                      max_steps=args.num_steps *
                                      args.num_processes)

            with measure("train"):
                with measure("epis"):
                    traj = Traj()
                    traj.add_epis(epis)

                    traj = ef.compute_vs(traj, self.vf)
                    traj = ef.compute_rets(traj, args.decay_gamma)
                    traj = ef.compute_advs(traj, args.decay_gamma,
                                           args.gae_lambda)
                    traj = ef.centerize_advs(traj)
                    traj = ef.compute_h_masks(traj)
                    traj.register_epis()

                    if mirror_function:
                        traj.add_traj(mirror_function(traj))

                # if args.data_parallel:
                #     self.pol.dp_run = True
                #     self.vf.dp_run = True

                result_dict = ppo_clip.train(
                    traj=traj,
                    pol=self.pol,
                    vf=self.vf,
                    clip_param=args.clip_eps,
                    optim_pol=self.optim_pol,
                    optim_vf=self.optim_vf,
                    epoch=args.epoch_per_iter,
                    batch_size=args.batch_size
                    if not args.rnn else args.rnn_batch_size,
                    max_grad_norm=args.max_grad_norm,
                )

                # if args.data_parallel:
                #     self.pol.dp_run = False
                #     self.vf.dp_run = False

            ## append the metrics to the `results_dict` (reported in the progress.csv)
            result_dict.update(self.get_extra_metrics(epis))

            total_epi += traj.num_epi
            step = traj.num_step
            total_step += step
            rewards = [np.sum(epi["rews"]) for epi in epis]
            mean_rew = np.mean(rewards)
            logger.record_results(
                self.logger.get_logdir(),
                result_dict,
                score_file,
                total_epi,
                step,
                total_step,
                rewards,
                plot_title=args.env,
            )

            if mean_rew > max_rew:
                self.save_models("max")
                max_rew = mean_rew

            self.save_models("last")

            self.scheduler_pol.step()
            self.scheduler_vf.step()

            del traj