# train policy
def on_iteration(i, loss, states, actions, rewards, discount):
    writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it, loss, i)

print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0,
                    dyn,
                    pol,
                    args.pred_H,
                    opt2,
                    exp,
                    args.pol_opt_iters,
                    discount=args.discount_factor,
                    pegasus=True,
                    mm_states=True,
                    mm_rewards=True,
                    mm_groups=args.mm_groups,
                    maximize=True,
                    clip_grad=args.pol_clip,
                    resampling_period=args.resampling_period,
                    step_idx_to_sample=args.timesteps_to_sample,
                    init_state_noise=1e-2 * x0.std(0),
                    prioritized_replay=args.prioritized_replay,
                    on_iteration=on_iteration,
                    debug=args.debug)
torch.save(pol.state_dict(),
           os.path.join(results_folder, 'latest_policy.pth.tar'))
if args.plot_level > 0:
    utils.plot_rollout(x0[:25], dyn, pol, args.pred_H * 2)
writer.add_scalar('robot/evaluation_loss',
                  torch.tensor(ret[2]).sum(), ps_it + 1)
def on_iteration(i, loss, states, actions, rewards, discount):
    writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it, loss, i)

print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0,
                    dyn,
                    pol,
                    args.pred_H,
                    opt2,
                    exp,
                    args.pol_opt_iters,
                    value_func=V,
                    discount=args.discount_factor,
                    pegasus=True,
                    mm_states=False,
                    mm_rewards=False,
                    mm_groups=args.mm_groups,
                    maximize=True,
                    clip_grad=args.pol_clip,
                    step_idx_to_sample=args.timesteps_to_sample,
                    init_state_noise=1e-1 * x0.std(0),
                    prioritized_replay=args.prioritized_replay,
                    on_iteration=on_iteration,
                    on_rollout=update_V_fn,
                    debug=args.debug)
torch.save(pol.state_dict(),
           os.path.join(results_folder, 'latest_policy.pth.tar'))
torch.save(V.state_dict(),
           os.path.join(results_folder, 'latest_critic.pth.tar'))
if args.plot_level > 0:
    utils.plot_rollout(x0[:25], dyn, pol, args.pred_H * 2)
x0 = x0 + 1e-2 * x0.std(0) * torch.randn_like(x0)
x0 = x0.detach()
utils.plot_rollout(x0[:25], dyn, pol, pred_H * 2)

# train policy
def on_iteration(i, loss, states, actions, rewards, discount):
    writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it, loss, i)
    if i % 100 == 0:
        writer.flush()

print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0,
                    dyn,
                    pol,
                    max(10, min(pred_H, ps_it)),
                    opt2,
                    exp,
                    N_polopt,
                    pegasus=True,
                    mm_states=False,
                    mm_rewards=False,
                    maximize=True,
                    clip_grad=1.0,
                    step_idx_to_sample=None,
                    on_iteration=on_iteration)
utils.plot_rollout(x0[:25], dyn, pol, pred_H * 2)
writer.add_scalar('robot/evaluation_loss',
                  torch.tensor(ret[2]).sum(), ps_it + 1)
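# A hedged, standalone sketch (not part of the script above) of the horizon
# schedule passed to mc_pilco in the previous call: the rollout horizon warms
# up with the policy-search iteration ps_it, starting at 10 steps and capped
# at the full prediction horizon pred_H. horizon_schedule is a hypothetical
# helper name introduced here for illustration.
def horizon_schedule(ps_it, pred_H, min_H=10):
    # clamp ps_it into [min_H, pred_H]
    return max(min_H, min(pred_H, ps_it))

# e.g. with pred_H = 40:
assert horizon_schedule(0, 40) == 10    # early iterations: short rollouts
assert horizon_schedule(25, 40) == 25   # grows one step per iteration
assert horizon_schedule(100, 40) == 40  # saturates at pred_H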
def on_iteration(i, loss, states, actions, rewards, discount):
    writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it, loss, i)
    if i % 100 == 0:
        '''
        states = states.transpose(0, 1).cpu().detach().numpy()
        actions = actions.transpose(0, 1).cpu().detach().numpy()
        rewards = rewards.transpose(0, 1).cpu().detach().numpy()
        utils.plot_trajectories(states, actions, rewards, plot_samples=True)
        '''
        writer.flush()

print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0,
                    dyn,
                    pol,
                    pred_H,
                    opt2,
                    exp,
                    N_polopt,
                    pegasus=True,
                    mm_states=False,
                    mm_rewards=False,
                    maximize=True,
                    clip_grad=1.0,
                    on_iteration=on_iteration)
utils.plot_rollout(x0[:25], dyn, pol, pred_H * 2)
writer.add_scalar('robot/evaluation_loss',
                  torch.tensor(ret[2]).sum(), ps_it + 1)
# train policy
def on_iteration(i, loss, states, actions, rewards, discount):
    writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it, loss, i)

print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0,
                    dyn,
                    pol,
                    args.pred_H,
                    opt2,
                    exp,
                    args.pol_opt_iters,
                    value_func=V,
                    discount=args.discount_factor,
                    pegasus=True,
                    mm_states=False,
                    mm_rewards=False,
                    maximize=True,
                    clip_grad=args.pol_clip,
                    step_idx_to_sample=None,
                    init_state_noise=0.0,
                    prioritized_replay=True,
                    on_iteration=on_iteration,
                    on_rollout=update_V_fn)
torch.save(pol.state_dict(),
           os.path.join(results_folder, 'latest_policy.pth.tar'))
torch.save(V.state_dict(),
           os.path.join(results_folder, 'latest_critic.pth.tar'))
if args.plot_level > 0:
    utils.plot_rollout(x0[:25], dyn, pol, args.pred_H * 2)
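# The snippets above pass value_func=V and on_rollout=update_V_fn, but neither
# the critic V nor update_V_fn is defined in this excerpt. Below is a
# hypothetical, self-contained sketch of what such an on_rollout hook could
# look like: one regression step fitting V to the discounted Monte Carlo
# returns of the imagined rollouts. The network shape, optimizer, callback
# signature and tensor shapes ([T, N, D] states, [T, N] rewards) are all
# assumptions, not the library's actual API.
import torch

state_dim = 4  # stand-in; the real dimensionality comes from the environment
V = torch.nn.Sequential(torch.nn.Linear(state_dim, 64), torch.nn.ReLU(),
                        torch.nn.Linear(64, 1))
opt_V = torch.optim.Adam(V.parameters(), lr=1e-3)


def update_V_fn(states, actions, rewards, discount):
    # discounted returns per particle, accumulated backwards over the horizon
    returns = torch.zeros_like(rewards)
    running = torch.zeros_like(rewards[-1])
    for t in range(rewards.shape[0] - 1, -1, -1):
        running = rewards[t] + discount * running
        returns[t] = running
    # one mean-squared-error step: V(s_t) -> return-to-go from s_t
    opt_V.zero_grad()
    pred = V(states.detach().flatten(0, 1)).squeeze(-1)
    loss = ((pred - returns.detach().flatten(0, 1))**2).mean()
    loss.backward()
    opt_V.step()


# example call with dummy rollout data: horizon 25, 100 particles
T, N = 25, 100
update_V_fn(torch.randn(T, N, state_dim), None, torch.randn(T, N), 0.97)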
utils.plot_rollout(x0[:25], dyn, pol, args.pred_H * 2)

# train policy
def on_iteration(i, loss, states, actions, rewards, discount):
    writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it, loss, i)

print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0,
                    dyn,
                    pol,
                    args.pred_H,
                    opt2,
                    exp,
                    args.pol_opt_iters,
                    discount=args.discount_factor,
                    pegasus=True,
                    mm_states=True,
                    mm_rewards=True,
                    maximize=True,
                    clip_grad=args.pol_clip,
                    step_idx_to_sample=0,
                    init_state_noise=1e-1 * x0.std(0),
                    on_iteration=on_iteration)
torch.save(pol.state_dict(),
           os.path.join(results_folder, 'latest_policy.pth.tar'))
if args.plot_level > 0:
    utils.plot_rollout(x0[:25], dyn, pol, args.pred_H * 2)
writer.add_scalar('robot/evaluation_loss',
                  torch.tensor(ret[2]).sum(), ps_it + 1)
                     H, callback=lambda *args, **kwargs: env.render())
exp.append_episode(*ret)

# train dynamics
X, Y = exp.get_dynmodel_dataset(deltas=True, return_costs=learn_reward)
dyn.set_dataset(
    torch.tensor(X).to(dyn.X.device).float(),
    torch.tensor(Y).to(dyn.X.device).float())
train_regressor(dyn, 1000, N_particles, True)
x0 = torch.tensor(exp.sample_initial_state(N_particles)).to(
    dyn.X.device).float()
x0 += 1e-2 * x0.std(0) * torch.randn_like(x0)
utils.plot_rollout(x0, forward_fn, pol, H)

# train policy
print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0,
                    forward_fn,
                    dyn,
                    pol,
                    H,
                    opt,
                    exp=exp,
                    maximize=False,
                    pegasus=True,
                    mm_states=True,
                    mm_rewards=True,
                    angle_dims=angle_dims)
utils.plot_rollout(x0, forward_fn, pol, H)
def on_iteration(i, loss, states, actions, rewards, discount):
    writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it, loss, i)
    if i % 100 == 0:
        '''
        states = states.transpose(0, 1).cpu().detach().numpy()
        actions = actions.transpose(0, 1).cpu().detach().numpy()
        rewards = rewards.transpose(0, 1).cpu().detach().numpy()
        utils.plot_trajectories(states, actions, rewards, plot_samples=True)
        '''
        writer.flush()

print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0,
                    dyn,
                    pol,
                    pred_H,
                    opt2,
                    exp,
                    N_polopt,
                    value_func=None if ps_it < N_val_warmup else V,
                    discount=0.001**(1.0 / control_H),
                    pegasus=True,
                    mm_states=False,
                    mm_rewards=False,
                    maximize=True,
                    clip_grad=1.0,
                    step_idx_to_sample=None,
                    on_iteration=on_iteration)
utils.plot_rollout(x0[:25], dyn, pol, pred_H * 2)
writer.add_scalar('robot/evaluation_loss',
                  torch.tensor(ret[2]).sum(), ps_it + 1)
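# A hedged note on the discount used above: gamma = 0.001 ** (1.0 / control_H)
# is exactly the discount factor whose control_H-th power is 0.001, so a
# reward control_H steps ahead is weighted at 0.1% of an immediate reward.
# Standalone check (the example control_H value is an assumption):
control_H = 100
gamma = 0.001**(1.0 / control_H)
print(gamma)             # ~0.9333
print(gamma**control_H)  # 0.001, up to floating-point error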
x0 = x0.detach()
utils.plot_rollout(x0, dyn, pol, control_H)

# train policy
def on_iteration(i, loss, states, actions, rewards, discount):
    writer.add_scalar('mc_pilco/episode_%d/training loss' % ps_it, loss, i)
    if i % 100 == 0:
        writer.flush()

print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0,
                    dyn,
                    pol,
                    pred_H,
                    opt2,
                    exp,
                    N_polopt,
                    pegasus=True,
                    mm_states=True,
                    mm_rewards=True,
                    maximize=True,
                    clip_grad=1.0,
                    on_iteration=on_iteration,
                    step_idx_to_sample=0,
                    init_state_noise=1e-1 * x0.std(0))
utils.plot_rollout(x0, dyn, pol, control_H)
writer.add_scalar('robot/evaluation_loss',
                  torch.tensor(ret[2]).sum(), ps_it + 1)
train_regressor(dyn,
                2000,
                N_particles,
                True,
                opt1,
                log_likelihood=log_likelihood_loss)

# sample initial states for policy optimization
x0 = torch.tensor(exp.sample_states(N_particles, timestep=0)).to(
    dyn.X.device).float()
x0 = x0 + 1e-1 * x0.std(0) * torch.randn_like(x0)
x0 = x0.detach()
utils.plot_rollout(x0, dyn, pol, H)

# train policy
print("Policy search iteration %d" % (ps_it + 1))
algorithms.mc_pilco(x0,
                    dyn,
                    pol,
                    H,
                    opt2,
                    exp,
                    1000,
                    pegasus=True,
                    mm_states=True,
                    mm_rewards=True,
                    maximize=True,
                    clip_grad=1.0)
utils.plot_rollout(x0, dyn, pol, H)
writer.add_scalar('robot/evaluation_loss',
                  torch.tensor(ret[2]).sum(), ps_it + 1)
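# A hedged, standalone illustration of the initial-state jitter applied above:
# zero-mean Gaussian noise scaled per dimension to 10% of the empirical std of
# the sampled starting states, so the optimization particles cover a slightly
# wider initial-state distribution than the replayed experience. The tensor
# below is a stand-in for exp.sample_states(...).
import torch

x0 = torch.randn(100, 4)                           # 100 particles, 4 state dims
x0 = x0 + 1e-1 * x0.std(0) * torch.randn_like(x0)  # per-dimension 10%-std noise
x0 = x0.detach()                                   # block gradients into the data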