Example #1
        # train policy
        def on_iteration(i, loss, states, actions, rewards, discount):
            writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it,
                              loss, i)

        print("Policy search iteration %d" % (ps_it + 1))
        algorithms.mc_pilco(x0,
                            dyn,
                            pol,
                            args.pred_H,
                            opt2,
                            exp,
                            args.pol_opt_iters,
                            discount=args.discount_factor,
                            pegasus=True,
                            mm_states=True,
                            mm_rewards=True,
                            mm_groups=args.mm_groups,
                            maximize=True,
                            clip_grad=args.pol_clip,
                            resampling_period=args.resampling_period,
                            step_idx_to_sample=args.timesteps_to_sample,
                            init_state_noise=1e-2 * x0.std(0),
                            prioritized_replay=args.prioritized_replay,
                            on_iteration=on_iteration,
                            debug=args.debug)
        torch.save(pol.state_dict(),
                   os.path.join(results_folder, 'latest_policy.pth.tar'))
        if args.plot_level > 0:
            utils.plot_rollout(x0[:25], dyn, pol, args.pred_H * 2)
        writer.add_scalar('robot/evaluation_loss',
                          torch.tensor(ret[2]).sum(), ps_it + 1)
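Every snippet in this listing logs through a `writer` object via `add_scalar` and `flush`, which matches the `torch.utils.tensorboard.SummaryWriter` API, but none of the snippets shows its construction. The lines below are only a minimal sketch of how it could be created; `results_folder` is the same directory the examples save checkpoints to, while the `'tb_logs'` subdirectory name is illustrative and not taken from the source.

import os
from torch.utils.tensorboard import SummaryWriter

# TensorBoard writer assumed by the on_iteration callbacks in these examples;
# 'tb_logs' is an illustrative subdirectory name.
writer = SummaryWriter(log_dir=os.path.join(results_folder, 'tb_logs'))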
Example #2
        def on_iteration(i, loss, states, actions, rewards, discount):
            writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it,
                              loss, i)

        print("Policy search iteration %d" % (ps_it + 1))
        algorithms.mc_pilco(x0,
                            dyn,
                            pol,
                            args.pred_H,
                            opt2,
                            exp,
                            args.pol_opt_iters,
                            value_func=V,
                            discount=args.discount_factor,
                            pegasus=True,
                            mm_states=False,
                            mm_rewards=False,
                            mm_groups=args.mm_groups,
                            maximize=True,
                            clip_grad=args.pol_clip,
                            step_idx_to_sample=args.timesteps_to_sample,
                            init_state_noise=1e-1 * x0.std(0),
                            prioritized_replay=args.prioritized_replay,
                            on_iteration=on_iteration,
                            on_rollout=update_V_fn,
                            debug=args.debug)
        torch.save(pol.state_dict(),
                   os.path.join(results_folder, 'latest_policy.pth.tar'))
        torch.save(V.state_dict(),
                   os.path.join(results_folder, 'latest_critic.pth.tar'))
        if args.plot_level > 0:
            utils.plot_rollout(x0[:25], dyn, pol, args.pred_H * 2)
Example #3
        x0 = x0 + 1e-2 * x0.std(0) * torch.randn_like(x0)
        x0 = x0.detach()

        utils.plot_rollout(x0[:25], dyn, pol, pred_H * 2)

        # train policy
        def on_iteration(i, loss, states, actions, rewards, discount):
            writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it,
                              loss, i)
            if i % 100 == 0:
                writer.flush()

        print("Policy search iteration %d" % (ps_it + 1))
        algorithms.mc_pilco(x0,
                            dyn,
                            pol,
                            max(10, min(pred_H, ps_it)),
                            opt2,
                            exp,
                            N_polopt,
                            pegasus=True,
                            mm_states=False,
                            mm_rewards=False,
                            maximize=True,
                            clip_grad=1.0,
                            step_idx_to_sample=None,
                            on_iteration=on_iteration)
        utils.plot_rollout(x0[:25], dyn, pol, pred_H * 2)
        writer.add_scalar('robot/evaluation_loss',
                          torch.tensor(ret[2]).sum(), ps_it + 1)
Example #4
        def on_iteration(i, loss, states, actions, rewards, discount):
            writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it,
                              loss, i)
            if i % 100 == 0:
                '''
                states = states.transpose(0, 1).cpu().detach().numpy()
                actions = actions.transpose(0, 1).cpu().detach().numpy()
                rewards = rewards.transpose(0, 1).cpu().detach().numpy()
                utils.plot_trajectories(states,
                                        actions,
                                        rewards,
                                        plot_samples=True)
                '''
                writer.flush()

        print("Policy search iteration %d" % (ps_it + 1))
        algorithms.mc_pilco(x0,
                            dyn,
                            pol,
                            pred_H,
                            opt2,
                            exp,
                            N_polopt,
                            pegasus=True,
                            mm_states=False,
                            mm_rewards=False,
                            maximize=True,
                            clip_grad=1.0,
                            on_iteration=on_iteration)
        utils.plot_rollout(x0[:25], dyn, pol, pred_H * 2)
        writer.add_scalar('robot/evaluation_loss',
                          torch.tensor(ret[2]).sum(), ps_it + 1)
Example #5
        # train policy
        def on_iteration(i, loss, states, actions, rewards, discount):
            writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it,
                              loss, i)

        print("Policy search iteration %d" % (ps_it + 1))
        algorithms.mc_pilco(x0,
                            dyn,
                            pol,
                            args.pred_H,
                            opt2,
                            exp,
                            args.pol_opt_iters,
                            value_func=V,
                            discount=args.discount_factor,
                            pegasus=True,
                            mm_states=False,
                            mm_rewards=False,
                            maximize=True,
                            clip_grad=args.pol_clip,
                            step_idx_to_sample=None,
                            init_state_noise=0.0,
                            prioritized_replay=True,
                            on_iteration=on_iteration,
                            on_rollout=update_V_fn)
        torch.save(pol.state_dict(),
                   os.path.join(results_folder, 'latest_policy.pth.tar'))
        torch.save(V.state_dict(),
                   os.path.join(results_folder, 'latest_critic.pth.tar'))
        if args.plot_level > 0:
            utils.plot_rollout(x0[:25], dyn, pol, args.pred_H * 2)
Example #6
            utils.plot_rollout(x0[:25], dyn, pol, args.pred_H * 2)

        # train policy
        def on_iteration(i, loss, states, actions, rewards, discount):
            writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it,
                              loss, i)

        print("Policy search iteration %d" % (ps_it + 1))
        algorithms.mc_pilco(x0,
                            dyn,
                            pol,
                            args.pred_H,
                            opt2,
                            exp,
                            args.pol_opt_iters,
                            discount=args.discount_factor,
                            pegasus=True,
                            mm_states=True,
                            mm_rewards=True,
                            maximize=True,
                            clip_grad=args.pol_clip,
                            step_idx_to_sample=0,
                            init_state_noise=1e-1 * x0.std(0),
                            on_iteration=on_iteration)
        torch.save(pol.state_dict(),
                   os.path.join(results_folder, 'latest_policy.pth.tar'))
        if args.plot_level > 0:
            utils.plot_rollout(x0[:25], dyn, pol, args.pred_H * 2)
        writer.add_scalar('robot/evaluation_loss',
                          torch.tensor(ret[2]).sum(), ps_it + 1)
Example #7
                           H,
                           callback=lambda *args, **kwargs: env.render())
    exp.append_episode(*ret)

    # train dynamics
    X, Y = exp.get_dynmodel_dataset(deltas=True, return_costs=learn_reward)
    dyn.set_dataset(
        torch.tensor(X).to(dyn.X.device).float(),
        torch.tensor(Y).to(dyn.X.device).float())
    train_regressor(dyn, 1000, N_particles, True)
    x0 = torch.tensor(exp.sample_initial_state(N_particles)).to(
        dyn.X.device).float()
    x0 += 1e-2 * x0.std(0) * torch.randn_like(x0)
    utils.plot_rollout(x0, forward_fn, pol, H)

    # train policy
    print "Policy search iteration %d" % (ps_it + 1)
    algorithms.mc_pilco(x0,
                        forward_fn,
                        dyn,
                        pol,
                        H,
                        opt,
                        exp=exp,
                        maximize=False,
                        pegasus=True,
                        mm_states=True,
                        mm_rewards=True,
                        angle_dims=angle_dims)
    utils.plot_rollout(x0, forward_fn, pol, H)
Example #8
        def on_iteration(i, loss, states, actions, rewards, discount):
            writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it,
                              loss, i)
            if i % 100 == 0:
                '''
                states = states.transpose(0, 1).cpu().detach().numpy()
                actions = actions.transpose(0, 1).cpu().detach().numpy()
                rewards = rewards.transpose(0, 1).cpu().detach().numpy()
                utils.plot_trajectories(states,
                                        actions,
                                        rewards,
                                        plot_samples=True)
                '''
                writer.flush()

        print("Policy search iteration %d" % (ps_it + 1))
        algorithms.mc_pilco(x0,
                            dyn,
                            pol,
                            pred_H,
                            opt2,
                            exp,
                            N_polopt,
                            value_func=None if ps_it < N_val_warmup else V,
                            discount=0.001**(1.0 / control_H),
                            pegasus=True,
                            mm_states=False,
                            mm_rewards=False,
                            maximize=True,
                            clip_grad=1.0,
                            step_idx_to_sample=None,
                            on_iteration=on_iteration)
        utils.plot_rollout(x0[:25], dyn, pol, pred_H * 2)
        writer.add_scalar('robot/evaluation_loss',
                          torch.tensor(ret[2]).sum(), ps_it + 1)
Example #9
        x0 = x0.detach()

        utils.plot_rollout(x0, dyn, pol, control_H)

        # train policy
        def on_iteration(i, loss, states, actions, rewards, discount):
            writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it,
                              loss, i)
            if i % 100 == 0:
                writer.flush()

        print("Policy search iteration %d" % (ps_it + 1))
        algorithms.mc_pilco(x0,
                            dyn,
                            pol,
                            pred_H,
                            opt2,
                            exp,
                            N_polopt,
                            pegasus=True,
                            mm_states=True,
                            mm_rewards=True,
                            maximize=True,
                            clip_grad=1.0,
                            on_iteration=on_iteration,
                            step_idx_to_sample=0,
                            init_state_noise=1e-1 * x0.std(0))
        utils.plot_rollout(x0, dyn, pol, control_H)
        writer.add_scalar('robot/evaluation_loss',
                          torch.tensor(ret[2]).sum(), ps_it + 1)
Example #10
        utils.train_regressor(dyn,
                              2000,
                              N_particles,
                              True,
                              opt1,
                              log_likelihood=log_likelihood_loss)

        # sample initial states for policy optimization
        x0 = torch.tensor(exp.sample_states(N_particles, timestep=0)).to(
            dyn.X.device).float()
        x0 = x0 + 1e-1 * x0.std(0) * torch.randn_like(x0)
        x0 = x0.detach()
        utils.plot_rollout(x0, dyn, pol, H)

        # train policy
        print("Policy search iteration %d" % (ps_it + 1))
        algorithms.mc_pilco(x0,
                            dyn,
                            pol,
                            H,
                            opt2,
                            exp,
                            1000,
                            pegasus=True,
                            mm_states=True,
                            mm_rewards=True,
                            maximize=True,
                            clip_grad=1.0)
        utils.plot_rollout(x0, dyn, pol, H)
        writer.add_scalar('robot/evaluation_loss',
                          torch.tensor(ret[2]).sum(), ps_it + 1)
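Taken together, the examples share one inner-loop structure: define an `on_iteration` logging callback, run `algorithms.mc_pilco` on a batch of sampled initial states with the learned dynamics model, the policy, an optimizer and the experience dataset, then checkpoint the policy and plot predicted rollouts. The consolidated sketch below makes that shared pattern explicit. It assumes the same `algorithms`, `utils`, `dyn`, `pol`, `exp`, `opt2`, `x0`, `writer`, `results_folder` and `args` objects the examples use; the loop bound `args.n_policy_searches` is a hypothetical name introduced here, and the keyword arguments are only the common subset seen above, not the full `mc_pilco` signature.

import os
import torch

# Consolidated policy-search loop assembled from the examples above. All
# objects (algorithms, utils, dyn, pol, exp, opt2, x0, writer, args,
# results_folder) are assumed to be set up as in the snippets; the loop
# bound args.n_policy_searches is an illustrative placeholder.
for ps_it in range(args.n_policy_searches):

    # log the policy-optimization loss for this search iteration
    def on_iteration(i, loss, states, actions, rewards, discount):
        writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it,
                          loss, i)
        if i % 100 == 0:
            writer.flush()

    print("Policy search iteration %d" % (ps_it + 1))
    algorithms.mc_pilco(x0,
                        dyn,
                        pol,
                        args.pred_H,
                        opt2,
                        exp,
                        args.pol_opt_iters,
                        discount=args.discount_factor,
                        pegasus=True,
                        mm_states=True,
                        mm_rewards=True,
                        maximize=True,
                        clip_grad=args.pol_clip,
                        on_iteration=on_iteration)

    # checkpoint the policy and visualize a batch of predicted rollouts
    torch.save(pol.state_dict(),
               os.path.join(results_folder, 'latest_policy.pth.tar'))
    utils.plot_rollout(x0[:25], dyn, pol, args.pred_H * 2)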