    opt1,
    log_likelihood=dyn.output_density.log_prob,
    prioritized_sampling=args.prioritized_replay,
    summary_writer=writer,
    summary_scope='model_learning/episode_%d' % ps_it)
torch.save(dyn.state_dict(),
           os.path.join(results_folder, 'latest_dynamics.pth.tar'))

# sample initial states for policy optimization
x0 = exp.sample_states(args.pol_batch_size,
                       timestep=0).to(dyn.X.device, dyn.X.dtype).detach()

if args.plot_level > 0:
    utils.plot_rollout(x0[:25], dyn, pol, args.pred_H * 2)

# train policy
def on_iteration(i, loss, states, actions, rewards, discount):
    writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it, loss, i)

print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0,
                    dyn,
                    pol,
                    args.pred_H,
                    opt2,
                    exp,
                    args.pol_opt_iters,
                    discount=args.discount_factor,
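# The algorithms.mc_pilco call above optimizes the policy by backpropagating
# through simulated particle rollouts. A minimal sketch of the underlying
# discounted Monte Carlo objective, assuming hypothetical callables
# `dyn_model`, `policy` and `reward_fn` (tensor in, tensor out) and a
# delta-predicting dynamics model; this is an illustration, not the library's
# implementation:
import torch


def discounted_return(x0, dyn_model, policy, reward_fn, H, discount=1.0):
    """Mean discounted return of H-step particle rollouts (differentiable)."""
    x, total = x0, 0.0
    for t in range(H):
        u = policy(x)                                 # actions for every particle
        x = x + dyn_model(torch.cat([x, u], dim=-1))  # next state = state + predicted delta
        total = total + (discount**t) * reward_fn(x).mean()
    return total


# Ascending this objective with an optimizer such as `opt2` gives the pathwise
# Monte Carlo policy gradient; the library call layers the options visible in
# these scripts (pegasus sampling, mm_states/mm_rewards moment matching,
# TensorBoard logging via on_iteration) on top of this idea.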
exp.save(results_filename)

if it < n_rnd - 1:
    continue

ps_it = it - n_rnd + 1

def on_iteration(i, loss, states, actions, rewards, opt, policy, dynamics):
    writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it, loss, i)
    if i % 100 == 0:
        states = states.transpose(0, 1).cpu().detach().numpy()
        actions = actions.transpose(0, 1).cpu().detach().numpy()
        rewards = rewards.transpose(0, 1).cpu().detach().numpy()
        utils.plot_trajectories(states, actions, rewards, plot_samples=False)

# train agent
agent.fit(exp, H, 120, batch_size=N_particles)

# plot rollout
x0 = torch.tensor(exp.sample_states(N_particles, timestep=0)).to(
    agent.dyn.X.device).float()
x0 = x0 + 1e-1 * x0.std(0) * torch.randn_like(x0)
x0 = x0.detach()
utils.plot_rollout(x0, agent.dyn, agent.actor_target, H)
writer.add_scalar('robot/evaluation_loss',
                  torch.tensor(ret[2]).sum(), ps_it + 1)
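# `ret` holds the data gathered by running the current policy on the real
# system, and `ret[2]` is treated as the per-step reward (or cost) sequence
# when logging 'robot/evaluation_loss' above. A minimal rollout helper with
# that return convention (an assumption; the library's apply_controller may
# differ), using the classic gym step API:
import numpy as np


def rollout(env, policy, H):
    """Run `policy` on `env` for up to H steps; return (states, actions, rewards)."""
    states, actions, rewards = [], [], []
    x = env.reset()
    for _ in range(H):
        u = policy(x)
        states.append(x)
        actions.append(u)
        x, r, done, _ = env.step(u)
        rewards.append(r)
        if done:
            break
    return np.array(states), np.array(actions), np.array(rewards)


# Under this convention, np.sum(rollout(env, policy, H)[2]) is the episode
# total being logged once per policy-search iteration.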
    N_dynopt,
    N_particles,
    True,
    opt1,
    log_likelihood=dyn.output_density.log_prob,
    summary_writer=writer,
    summary_scope='model_learning/episode_%d' % ps_it)

# sample initial states for policy optimization
x0 = exp.sample_states(N_particles, timestep=0).to(dyn.X.device).float()
x0 = x0 + 1e-1 * torch.randn_like(x0)
x0 = x0.detach()
utils.plot_rollout(x0, dyn, pol, control_H)

# train policy
def on_iteration(i, loss, states, actions, rewards, discount):
    writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it, loss, i)
    if i % 100 == 0:
        writer.flush()

print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0, dyn, pol, pred_H, opt2, exp,
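# The model-fitting call whose trailing arguments open this fragment trains the
# probabilistic dynamics model by maximizing the log-likelihood of its stored
# (input, target) pairs in minibatches. A stripped-down sketch of such a loop,
# assuming a hypothetical `model` that holds tensors `model.X` and `model.Y`
# and returns a predictive distribution with a log_prob method (the scripts
# instead pass dyn.output_density.log_prob explicitly):
import torch


def fit_regressor(model, optimizer, iters, batch_size):
    """Minibatch maximum-likelihood training of a probabilistic regressor."""
    N = model.X.shape[0]
    for _ in range(iters):
        idx = torch.randint(N, (batch_size, ))
        x, y = model.X[idx], model.Y[idx]
        optimizer.zero_grad()
        loss = -model(x).log_prob(y).mean()  # negative log-likelihood of targets
        loss.backward()
        optimizer.step()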
ret = apply_controller(env,
                       pol,
                       H,
                       callback=lambda *args, **kwargs: env.render())
exp.append_episode(*ret)

# train dynamics
X, Y = exp.get_dynmodel_dataset(deltas=True, return_costs=learn_reward)
dyn.set_dataset(
    torch.tensor(X).to(dyn.X.device).float(),
    torch.tensor(Y).to(dyn.X.device).float())
train_regressor(dyn, 1000, N_particles, True)

x0 = torch.tensor(exp.sample_initial_state(N_particles)).to(
    dyn.X.device).float()
x0 += 1e-2 * x0.std(0) * torch.randn_like(x0)
utils.plot_rollout(x0, forward_fn, pol, H)

# train policy
print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0,
                    forward_fn,
                    dyn,
                    pol,
                    H,
                    opt,
                    exp=exp,
                    maximize=False,
                    pegasus=True,
                    mm_states=True,
                    mm_rewards=True,
                    angle_dims=angle_dims)
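# get_dynmodel_dataset(deltas=True) above turns the recorded episodes into a
# regression dataset whose inputs are state-action pairs and whose targets are
# state differences. A sketch of that construction for a single episode, with
# hypothetical numpy arrays `states` ([T + 1, state_dim]) and `actions`
# ([T, action_dim]); the return_costs=learn_reward flag above suggests the
# library call can additionally include observed costs as targets:
import numpy as np


def delta_dataset(states, actions):
    """Build (X, Y) pairs for a dynamics model that predicts state deltas."""
    X = np.concatenate([states[:-1], actions], axis=-1)  # inputs  [x_t, u_t]
    Y = states[1:] - states[:-1]                         # targets x_{t+1} - x_t
    return X, Y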
dyn.set_dataset(
    torch.tensor(X).to(dyn.X.device).float(),
    torch.tensor(Y).to(dyn.X.device).float())
utils.train_regressor(dyn,
                      2000,
                      N_particles,
                      True,
                      opt1,
                      log_likelihood=log_likelihood_loss)

# sample initial states for policy optimization
x0 = torch.tensor(exp.sample_states(N_particles, timestep=0)).to(
    dyn.X.device).float()
x0 = x0 + 1e-1 * x0.std(0) * torch.randn_like(x0)
x0 = x0.detach()
utils.plot_rollout(x0, dyn, pol, H)

# train policy
print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0,
                    dyn,
                    pol,
                    H,
                    opt2,
                    exp,
                    1000,
                    pegasus=True,
                    mm_states=True,
                    mm_rewards=True,
                    maximize=True,
                    clip_grad=1.0)
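# The mm_states / mm_rewards flags enable moment matching: after each
# prediction step the particle set is summarized by its mean and covariance
# and resampled from the fitted Gaussian, keeping the propagated state
# distribution Gaussian between steps. A sketch of that operation for a
# [n_particles, state_dim] tensor, not the library's implementation:
import torch


def moment_match(particles, jitter=1e-9):
    """Fit a Gaussian to the particles and resample it with reparameterization."""
    mean = particles.mean(0)
    centered = particles - mean
    cov = centered.t().mm(centered) / (particles.shape[0] - 1)
    cov = cov + jitter * torch.eye(
        cov.shape[0], dtype=cov.dtype, device=cov.device)
    L = torch.linalg.cholesky(cov)
    z = torch.randn_like(particles)
    return mean + z.mm(L.t())


# Resampling via mean + z @ L.T (the reparameterization trick) keeps the
# operation differentiable, so gradients still flow back to the policy
# parameters during the mc_pilco update.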