high = np.inf * np.ones(4) observation_space = Box(low=-high, high=high, dtype=np.float32) high = np.ones(1) action_space = Box(low=-high, high=high) pol_net = PolNetLSTM(observation_space, action_space, h_size=args.h_size, cell_size=args.cell_size) pol = GaussianPol(observation_space, action_space, pol_net, data_parallel=args.data_parallel, rnn=True) if args.pol: pol.load_state_dict(torch.load(args.pol, map_location=lambda storage, loc: storage)) else: raise Exception pol.to(device) pol.dp_run = False pol.reset() r.set('start', 'false') while True: if r.get('start').decode('utf-8') == 'true': break class Process(object): def run(self): joint_pendulum = float(r.get('joint_info')) joint_pendulum_vel = 0 self.action_input = 0 first_ob = [np.cos(joint_pendulum), np.sin(joint_pendulum), joint_pendulum_vel,
while args.max_epis > total_epi: with measure('sample'): epis = sampler.sample(pol, max_steps=args.max_steps_per_iter) with measure('train'): traj = Traj() traj.add_epis(epis) traj = ef.compute_vs(traj, vf) traj = ef.compute_rets(traj, args.gamma) traj = ef.compute_advs(traj, args.gamma, args.lam) traj = ef.centerize_advs(traj) traj = ef.compute_h_masks(traj) traj.register_epis() if args.data_parallel: pol.dp_run = True vf.dp_run = True if args.ppo_type == 'clip': result_dict = ppo_clip.train(traj=traj, pol=pol, vf=vf, clip_param=args.clip_param, optim_pol=optim_pol, optim_vf=optim_vf, epoch=args.epoch_per_iter, batch_size=args.batch_size if not args.rnn else args.rnn_batch_size, max_grad_norm=args.max_grad_norm) else: result_dict = ppo_kl.train(traj=traj,
# PPO-clip training loop: alternate between sampling episodes and updating
# the policy / value function until `total_epi` reaches `args.max_epis`.
# NOTE(review): block nesting was reconstructed from a whitespace-collapsed
# source; confirm that the bookkeeping after the 'train' block really sits
# outside of it.
while total_epi < args.max_epis:
    # Collect a fresh batch of episodes with the current policy.
    with measure('sample'):
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)

    with measure('train'):
        # Load the episodes into a trajectory container, then pass it
        # through the `ef` helpers; each one takes and returns the trajectory.
        traj = Traj()
        traj.add_epis(epis)
        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, args.gamma)
        traj = ef.compute_advs(traj, args.gamma, args.lam)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()

        # Turn on the data-parallel flag of both networks for the update.
        # NOTE(review): nothing in this loop switches `dp_run` back off, so
        # the next sampling pass runs with it still enabled -- confirm that
        # this is intended.
        if args.data_parallel:
            pol.dp_run = True
            vf.dp_run = True

        # Recurrent runs use their own batch size.
        result_dict = ppo_clip.train(
            traj=traj,
            pol=pol,
            vf=vf,
            clip_param=args.clip_param,
            optim_pol=optim_pol,
            optim_vf=optim_vf,
            epoch=args.epoch_per_iter,
            batch_size=args.rnn_batch_size if args.rnn else args.batch_size,
            max_grad_norm=args.max_grad_norm,
        )

    # Update the running step / episode counters with this iteration's batch.
    step = traj.num_step
    total_step += step
    total_epi += traj.num_epi

    # Per-episode return (sum of rewards) and its mean over the batch.
    rewards = [np.sum(epi['rews']) for epi in epis]
    mean_rew = np.mean(rewards)

    logger.record_results(
        args.log,
        result_dict,
        score_file,
        total_epi,
        step,
        total_step,
        rewards,
        plot_title=args.env_name,
    )