# NOTE: the project-local classes and helpers used below (MiniPacman,
# SubprocVecEnv, ActorCritic, RolloutStorage, EnvModel, pix_to_target,
# rewards_to_target, target_to_pix, pixels, num_pixels, mode_rewards, plot,
# displayImage) are assumed to come from the accompanying modules of this repo.
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import autograd


def _thunk():
    env = MiniPacman(mode, 1000)
    return env
plt.show()

# Interactive play: map keyboard characters to MiniPacman actions.
keys = {'w': 2, 'd': 1, 'a': 3, 's': 4, ' ': 0}

MODES = ('regular', 'avoid', 'hunt', 'ambush', 'rush')
frame_cap = 1000

mode = 'rush'
env = MiniPacman(mode, 1000)

state = env.reset()
done = False
total_reward = 0
step = 1

displayImage(state.transpose(1, 2, 0), step, total_reward)

while not done:
    x = input()  # raw_input() on Python 2
    # clear_output()
    try:
        keys[x]
    except KeyError:
        # Assumed continuation of the truncated loop: ignore keys that are not
        # in the mapping, otherwise step the environment and redraw the frame.
        print("unknown key; use 'w', 'a', 's', 'd' or space")
        continue
    action = keys[x]
    state, reward, done, _ = env.step(action)
    total_reward += reward
    step += 1
    displayImage(state.transpose(1, 2, 0), step, total_reward)
def main():
    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env
        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape

    # a2c hyperparams:
    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e3)  # number of A2C parameter updates

    # rmsprop hyperparams:
    lr = 7e-4
    eps = 1e-5
    alpha = 0.99

    # Init a2c and rmsprop
    actor_critic = ActorCritic(envs.observation_space.shape, envs.action_space.n)
    optimizer = optim.RMSprop(actor_critic.parameters(), lr, eps=eps, alpha=alpha)

    # if USE_CUDA:
    #     actor_critic = actor_critic.cuda()

    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    # rollout.cuda()

    all_rewards = []
    all_losses = []

    state = envs.reset()
    state = torch.FloatTensor(np.float32(state))
    rollout.states[0].copy_(state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards = torch.zeros(num_envs, 1)

    for i_update in tqdm(range(num_frames)):
        # Collect a short n-step rollout from all environments in parallel.
        for step in range(num_steps):
            action = actor_critic.act(autograd.Variable(state))

            next_state, reward, done, _ = envs.step(
                action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            # if USE_CUDA:
            #     masks = masks.cuda()

            state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, state, action.data, reward, masks)

        # Bootstrap the n-step return from the value of the last state.
        _, next_value = actor_critic(
            autograd.Variable(rollout.states[-1], volatile=True))
        next_value = next_value.data

        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
        advantages = autograd.Variable(returns) - values

        value_loss = advantages.pow(2).mean()
        action_loss = -(autograd.Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        if i_update % 100 == 0:  # plot training progress every 100 updates
            all_rewards.append(final_rewards.mean().item())
            all_losses.append(loss.item())

            # clear_output(True)
            plt.figure(figsize=(20, 5))
            plt.subplot(131)
            plt.title('epoch %s. reward: %s' % (i_update, np.mean(all_rewards[-10:])))
            plt.plot(all_rewards)
            plt.subplot(132)
            plt.title('loss %s' % all_losses[-1])
            plt.plot(all_losses)
            plt.show()

        rollout.after_update()

    torch.save(actor_critic.state_dict(), "actor_critic_" + mode)

    # Watch the trained policy play one episode.
    import time

    def displayImage(image, step, reward):
        # clear_output(True)
        s = "step: " + str(step) + " reward: " + str(reward)
        plt.figure(figsize=(10, 3))
        plt.title(s)
        plt.imshow(image)
        plt.show()
        time.sleep(0.1)

    env = MiniPacman(mode, 1000)

    done = False
    state = env.reset()
    total_reward = 0
    step = 1

    while not done:
        current_state = torch.FloatTensor(state).unsqueeze(0)
        # if USE_CUDA:
        #     current_state = current_state.cuda()

        action = actor_critic.act(autograd.Variable(current_state))

        next_state, reward, done, _ = env.step(action.data[0, 0])
        total_reward += reward
        state = next_state

        image = torch.FloatTensor(state).permute(1, 2, 0).cpu().numpy()
        displayImage(image, step, total_reward)
        step += 1
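# --- Illustration (not part of the original code) ---
# The training loop above relies on `rollout.compute_returns(next_value, gamma)`
# to turn the stored rewards into n-step targets for the critic. The actual
# RolloutStorage lives in the project modules; the function below is only a
# minimal sketch of the discounted-return recursion it is assumed to perform,
#     R[t] = reward[t] + gamma * mask[t] * R[t+1],   R[num_steps] = next_value,
# where mask[t] is 0 if the episode ended at step t (cutting the bootstrap).
import torch


def compute_returns_sketch(rewards, masks, next_value, gamma):
    """rewards, masks: tensors of shape (num_steps, num_envs, 1);
    next_value: (num_envs, 1). Returns (num_steps, num_envs, 1) targets."""
    num_steps = rewards.size(0)
    returns = torch.zeros(num_steps + 1, *next_value.size())
    returns[-1] = next_value  # bootstrap from the critic's value of the last state
    for t in reversed(range(num_steps)):
        returns[t] = rewards[t] + gamma * masks[t] * returns[t + 1]
    return returns[:-1]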
def main():
    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env
        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n

    env_model = EnvModel(envs.observation_space.shape, num_pixels,
                         len(mode_rewards["regular"]))
    actor_critic = ActorCritic(envs.observation_space.shape, envs.action_space.n)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(env_model.parameters())

    # Reuse the policy trained by the A2C script above to generate experience.
    actor_critic.load_state_dict(torch.load("actor_critic_" + mode))

    def get_action(state):
        if state.ndim == 4:
            state = torch.FloatTensor(np.float32(state))
        else:
            state = torch.FloatTensor(np.float32(state)).unsqueeze(0)
        action = actor_critic.act(autograd.Variable(state, volatile=True))
        action = action.data.cpu().squeeze(1).numpy()
        return action

    def play_games(envs, frames):
        states = envs.reset()
        for frame_idx in range(frames):
            actions = get_action(states)
            next_states, rewards, dones, _ = envs.step(actions)
            yield frame_idx, states, actions, rewards, next_states, dones
            states = next_states

    reward_coef = 0.1
    num_updates = 5000

    losses = []
    all_rewards = []

    for frame_idx, states, actions, rewards, next_states, dones in tqdm(
            play_games(envs, num_updates), total=num_updates):
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)

        batch_size = states.size(0)

        # Broadcast each action to a one-hot spatial plane and stack it onto
        # the observation channels as the environment model's input.
        onehot_actions = torch.zeros(batch_size, num_actions, *state_shape[1:])
        onehot_actions[range(batch_size), actions] = 1
        inputs = autograd.Variable(torch.cat([states, onehot_actions], 1))

        # if USE_CUDA:
        #     inputs = inputs.cuda()

        imagined_state, imagined_reward = env_model(inputs)

        target_state = pix_to_target(next_states)
        target_state = autograd.Variable(torch.LongTensor(target_state))

        target_reward = rewards_to_target(mode, rewards)
        target_reward = autograd.Variable(torch.LongTensor(target_reward))

        optimizer.zero_grad()
        image_loss = criterion(imagined_state, target_state)
        reward_loss = criterion(imagined_reward, target_reward)
        loss = image_loss + reward_coef * reward_loss
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        all_rewards.append(np.mean(rewards))

        if frame_idx % 100 == 0:  # plot training progress every 100 frames
            plot(frame_idx, all_rewards, losses)

    torch.save(env_model.state_dict(), "env_model_" + mode)

    # Compare the environment model's imagined frames with the real ones.
    import time

    env = MiniPacman(mode, 1000)
    batch_size = 1

    done = False
    state = env.reset()
    iss = []
    ss = []

    steps = 0

    while not done:
        steps += 1
        actions = get_action(state)

        onehot_actions = torch.zeros(batch_size, num_actions, *state_shape[1:])
        onehot_actions[range(batch_size), actions] = 1
        state = torch.FloatTensor(state).unsqueeze(0)

        inputs = autograd.Variable(torch.cat([state, onehot_actions], 1))
        # if USE_CUDA:
        #     inputs = inputs.cuda()

        imagined_state, imagined_reward = env_model(inputs)
        imagined_state = F.softmax(imagined_state, dim=1)
        iss.append(imagined_state)

        next_state, reward, done, _ = env.step(actions[0])
        ss.append(state)
        state = next_state

        imagined_image = target_to_pix(
            imagined_state.view(batch_size, -1, len(pixels))[0].max(1)[1].data.cpu().numpy())
        imagined_image = imagined_image.reshape(15, 19, 3)

        state_image = torch.FloatTensor(next_state).permute(1, 2, 0).cpu().numpy()

        # clear_output()
        plt.figure(figsize=(10, 3))
        plt.subplot(131)
        plt.title("Imagined")
        plt.imshow(imagined_image)
        plt.subplot(132)
        plt.title("Actual")
        plt.imshow(state_image)
        plt.show()
        time.sleep(0.3)

        if steps > 30:
            break
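# --- Illustration (not part of the original code) ---
# The environment model above is trained as a per-pixel classifier: the
# project's `pix_to_target` maps every RGB pixel of the next frame to an index
# into the MiniPacman palette `pixels` (so nn.CrossEntropyLoss applies), and
# `target_to_pix` maps predicted class indices back to RGB values for display.
# Below is only a minimal sketch consistent with how those helpers are called
# here; the real implementations and the palette live in the project modules.
import numpy as np


def pix_to_target_sketch(next_states, pixels):
    """next_states: (batch, 3, H, W) array; returns one palette index per pixel,
    flattened in the same order the frame-prediction logits are laid out."""
    targets = []
    for frame in np.asarray(next_states):
        for pixel in frame.transpose(1, 2, 0).reshape(-1, 3):
            # assign each pixel to the closest palette entry
            targets.append(int(np.argmin([np.sum(np.abs(pixel - np.asarray(p)))
                                          for p in pixels])))
    return targets


def target_to_pix_sketch(indices, pixels):
    """Inverse mapping: palette indices -> (num_pixels, 3) array of RGB values,
    ready to be reshaped into an (H, W, 3) image."""
    return np.array([pixels[i] for i in indices])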