'cpu', torch.uint8).numpy() im = Image.fromarray(ndarr) return im episodes = 2 gru_size = 32 bhx_size = 64 ox_size = 100 input_c_features = 8 * 5 * 5 eps = (0, 0) # hx_ae_model = HxQBNet(gru_size, bhx_size) ox_ae_best_path = "./resources/pongD_deconv_obs_model_v1.p" env_name = "PongDeterministic-v4" env = atari_wrapper(env_name) obs = env.reset() ox_ae_model = ConvObsQBNet(len(obs), ox_size) # initialize visualization app _, _, obs_data, _ = pickle.loads( open("./resources/pongD_bottleneck_data.p", "rb").read()) vis_board = visboard() vis_board.add_ae(ox_ae_model, obs_data, latent_options={ 'n': ox_ae_model.latent_size, 'min': -1, 'max': 1,
if not os.path.exists('results/'): os.mkdir('results/') if not os.path.exists('results/' + args.model_type): os.mkdir('results/' + args.model_type) # start to create models... if args.model_type == 'inception': model = models.inception_v3(pretrained=True) elif args.model_type == 'resnet152': model = models.resnet152(pretrained=True) elif args.model_type == 'resnet18': model = models.resnet18(pretrained=True) elif args.model_type == 'vgg19': model = models.vgg19_bn(pretrained=True) elif args.model_type == 'atari': env = atari_wrapper("PongDeterministic-v4") env.seed(1) obs = env.reset() gru_net = GRUNetConv(len(obs), 32, int(env.action_space.n)) ox_net = ObsQBNet(gru_net.input_c_features, 100) model = MMNet(gru_net, None, ox_net) model_path = "./pongD_bgru_model.p" pretrained_ox_dict = { k[8:]: v for k, v in torch.load(model_path, map_location='cpu').items() if k.startswith("obx_net") } model.obx_net.load_state_dict(pretrained_ox_dict) pretrained_conv_dict = { k[8:]: v for k, v in torch.load(model_path, map_location='cpu').items()
def gather_observations(env_name, gru_size, bhx_size, ox_size, bgru_net_path,
                        device, episodes=1, env_type='atari'):
    """Roll out a trained network greedily and collect the observations fed to it.

    Results are cached at ``./inputs/<env_name>/observations.pt``. When that
    file already exists it is loaded (mapped onto ``device``) and returned
    immediately, without creating the environment or the network.

    Args:
        env_name: Environment id handed to ``atari_wrapper`` or ``gym.make``;
            also names the cache directory.
        gru_size: Size forwarded to the GRU network and the hidden-state
            bottleneck constructors.
        bhx_size: Size forwarded to the hidden-state bottleneck constructor.
        ox_size: Size forwarded to the observation bottleneck constructor.
        bgru_net_path: Checkpoint path whose state dict is loaded into the
            assembled network.
        device: ``map_location`` used when loading cached observations.
        episodes: Number of episodes to roll out.
        env_type: ``'atari'`` or ``'classic_control'``.

    Returns:
        A list with one tensor per environment step (the observation with a
        leading dimension added via ``unsqueeze(0)``), or whatever object the
        cache file contains when it already exists.

    Raises:
        ValueError: If ``env_type`` is not one of the supported values.
    """
    cache_path = './inputs/' + str(env_name) + '/observations.pt'
    if os.path.exists(cache_path):
        return torch.load(cache_path, map_location=device)

    env, bgru_net = _build_env_and_bgru_net(env_type, env_name, gru_size,
                                            bhx_size, ox_size)

    # NOTE(review): `cuda` is not defined inside this function; presumably a
    # module-level flag -- confirm against the rest of the file.
    if cuda:
        bgru_net = bgru_net.cuda()
    bgru_net.load_state_dict(torch.load(bgru_net_path, map_location='cpu'))
    bgru_net.eval()

    max_actions = 10000      # hard cap on steps per episode
    max_same_action = 5000   # abort when this many identical actions repeat
    random.seed(0)
    observations = []

    with torch.no_grad():
        for _ in range(episodes):
            done = False
            obs = env.reset()
            curr_state = bgru_net.init_hidden()
            if cuda:
                curr_state = curr_state.cuda()

            steps = 0
            last_action = None
            same_action_run = 0

            while not done:
                obs = torch.Tensor(obs).unsqueeze(0)
                if cuda:
                    obs = obs.cuda()
                _, logit, next_state, _, _ = bgru_net((obs, curr_state),
                                                      inspect=True)
                observations.append(obs)

                # Greedy policy: take the most probable action.
                prob = F.softmax(logit, dim=1)
                next_action = int(prob.max(1)[1].item())
                obs, _, done, _ = env.step(next_action)

                # Track step count and the current run of identical actions so
                # the two safety limits below can actually trigger.
                steps += 1
                if next_action == last_action:
                    same_action_run += 1
                else:
                    last_action = next_action
                    same_action_run = 1

                if steps > max_actions:
                    done = True
                # A quick hack to prevent the agent from getting stuck.
                if steps > max_same_action and same_action_run >= max_same_action:
                    done = True

                curr_state = next_state

    # Create the cache directory first so a long rollout is not lost to a
    # missing-directory error at save time.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    torch.save(observations, cache_path)
    return observations


def _build_env_and_bgru_net(env_type, env_name, gru_size, bhx_size, ox_size):
    """Create the seeded environment and the untrained network for ``env_type``."""
    if env_type not in ('atari', 'classic_control'):
        raise ValueError(
            "unsupported env_type {!r}; expected 'atari' or "
            "'classic_control'".format(env_type))

    if env_type == 'atari':
        env = atari_wrapper(env_name)
        env.seed(0)
        obs = env.reset()
        gru_net = GRUNet(len(obs), gru_size, int(env.action_space.n))
        bhx_net = HxQBNet(gru_size, bhx_size)
        ox_net = ObsQBNet(gru_net.input_c_features, ox_size)
        bgru_net = MMNet(gru_net, bhx_net, ox_net)
    else:
        env = gym.make(env_name)
        env.seed(0)
        obs = env.reset()
        gru_net = ControlGRUNet(len(obs), gru_size, int(env.action_space.n))
        bhx_net = ControlHxQBNet(gru_size, bhx_size)
        ox_net = ControlObsQBNet(gru_net.input_c_features, ox_size)
        bgru_net = ControlMMNet(gru_net, bhx_net, ox_net)
    return env, bgru_net
def state_encode(self, state):
    """Return ``state`` encoded by ``self.bhx_net.encode``."""
    return self.bhx_net.encode(state)


def obs_encode(self, obs, hx=None):
    """Return the encoded observation produced by ``self.gru_net``.

    Args:
        obs: Observation passed to ``self.gru_net`` unchanged.
        hx: Optional hidden state. When omitted, a zero tensor of shape
            ``(1, self.gru_units)`` is used.

    Returns:
        The last element of the inspection tuple returned by ``self.gru_net``
        when called with ``inspect=True``.
    """
    # Place the hidden state on the device the parameters actually live on;
    # a bare ``.cuda()`` would pick the current default CUDA device instead.
    param_device = next(self.parameters()).device
    if hx is None:
        hx = torch.zeros(1, self.gru_units, device=param_device)
    else:
        hx = hx.to(param_device)

    _, _, _, (_, _, _, input_x) = self.gru_net((obs, hx),
                                               input_fn=self.obx_net,
                                               hx_fn=self.bhx_net,
                                               inspect=True)
    return input_x


if __name__ == '__main__':
    args = tl.get_args()
    env = atari_wrapper(args.env)
    env.seed(args.env_seed)
    obs = env.reset()

    # Create directories to store results.
    # NOTE(review): ensure_directory_exits presumably creates the directory
    # and returns its path (the result is joined as a path below) -- confirm.
    result_dir = tl.ensure_directory_exits(
        os.path.join(args.result_dir, 'Atari'))
    env_dir = tl.ensure_directory_exits(os.path.join(result_dir, args.env))

    gru_dir_name = 'gru_{}'.format(args.gru_size)
    gru_dir = tl.ensure_directory_exits(os.path.join(env_dir, gru_dir_name))
    gru_net_path = os.path.join(gru_dir, 'model.p')
    gru_plot_dir = tl.ensure_directory_exits(os.path.join(gru_dir, 'Plots'))

    bhx_dir_name = 'gru_{}_bhx_{}{}'.format(args.gru_size, args.bhx_size,
                                            args.bhx_suffix)
    bhx_dir = tl.ensure_directory_exits(os.path.join(env_dir, bhx_dir_name))
    bhx_net_path = os.path.join(bhx_dir, 'model.p')
    bhx_plot_dir = tl.ensure_directory_exits(os.path.join(bhx_dir, 'Plots'))