device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda) device = torch.device(device_name) set_device(device) r = redis.StrictRedis() high = np.inf * np.ones(4) observation_space = Box(low=-high, high=high, dtype=np.float32) high = np.ones(1) action_space = Box(low=-high, high=high) pol_net = PolNetLSTM(observation_space, action_space, h_size=args.h_size, cell_size=args.cell_size) pol = GaussianPol(observation_space, action_space, pol_net, data_parallel=args.data_parallel, rnn=True) if args.pol: pol.load_state_dict(torch.load(args.pol, map_location=lambda storage, loc: storage)) else: raise Exception pol.to(device) pol.dp_run = False pol.reset() r.set('start', 'false') while True: if r.get('start').decode('utf-8') == 'true': break class Process(object):
s_pol = GaussianPol(observation_space, action_space, s_pol_net, args.rnn) elif isinstance(action_space, gym.spaces.Discrete): t_pol = CategoricalPol( observation_space, action_space, t_pol_net, args.rnn) s_pol = CategoricalPol( observation_space, action_space, s_pol_net, args.rnn) elif isinstance(action_space, gym.spaces.MultiDiscrete): t_pol = MultiCategoricalPol( observation_space, action_space, t_pol_net, args.rnn) s_pol = MultiCategoricalPol( observation_space, action_space, s_pol_net, args.rnn) else: raise ValueError('Only Box, Discrete and Multidiscrete are supported') if args.teacher_pol: t_pol.load_state_dict(torch.load( os.path.join(args.teacher_dir, args.teacher_fname))) if args.rnn: s_vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256) else: s_vf_net = VNet(observation_space) if args.sampling_policy == 'teacher': teacher_sampler = EpiSampler( env, t_pol, num_parallel=args.num_parallel, seed=args.seed) student_sampler = EpiSampler( env,
# Replay the best saved policy of a finished training run in a rendered
# environment.
log_dir_name = 'garbage_airl_20190915_2'
env_name = 'RoboschoolPremaidAIWalker-v0'
env = gym.make(env_name)
env.seed(seed)

# check dimension of observation space and action space
observation_space = env.observation_space
action_space = env.action_space

# policy
pol_net = PolNet(observation_space, action_space)

# load best policy
best_path = f'{log_dir_name}/models/pol_max.pkl'
best_pol = GaussianPol(observation_space, action_space, pol_net)
# The policy stays on the CPU in this script, so map the checkpoint to the
# CPU explicitly; without map_location a checkpoint saved from a CUDA device
# cannot be loaded on a machine that has no GPU.
best_pol.load_state_dict(torch.load(best_path, map_location='cpu'))

# show trained policy's behavior
done = False
o = env.reset()
# Pure inference: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    for _ in range(1000):  # show 16.5 sec (0.0165 * 1000)
        if done:
            time.sleep(1)  # pause at the episode boundary
            o = env.reset()
        ac_real, ac, a_i = best_pol.deterministic_ac_real(
            torch.tensor(o, dtype=torch.float))
        # Match the action layout declared by the policy's action space.
        ac_real = ac_real.reshape(best_pol.action_space.shape)
        next_o, r, done, e_i = env.step(np.array(ac_real))
        o = next_o
        env.render()
        # time.sleep(0.0165)