def play_random_custom(env, steps):
    """Play ``env`` with random joypad actions for ``steps`` frames.

    The environment is wrapped in ``JoypadSpace`` with a fixed list of button
    combinations. A random action is sampled every 6 frames and held in
    between; every frame is stepped, rendered and throttled to roughly 60
    frames per second. When an episode ends the environment is reset so play
    continues for the full ``steps`` frames. The wrapped environment is
    closed on exit (also when an error interrupts play) and the elapsed
    wall-clock time is printed after a normal run.

    Note:
        In the file as reviewed, a later ``def play_random_custom`` with the
        same name replaces this definition at import time.

    Args:
        env: Environment to wrap; must be accepted by ``JoypadSpace``.
        steps: Number of frames to play.

    Raises:
        ValueError: If neither the wrapped environment nor its ``unwrapped``
            base exposes ``get_keys_to_action``.
    """
    actions = [
        ['start'],
        ['NOOP'],
        ['right', 'A'],
        ['left', 'A'],
        ['left', 'B'],
        ['right', 'B'],
        ['up'],
        ['down'],
        ['A'],
        ['B'],
    ]
    # Index of ['start'] in ``actions``; it is excluded after frame 500.
    start_action = 0

    env = JoypadSpace(env, actions)
    try:
        # Precondition carried over from the interactive player. It does not
        # depend on the frame, so it is checked once instead of per step.
        if not (hasattr(env, 'get_keys_to_action')
                or hasattr(env.unwrapped, 'get_keys_to_action')):
            raise ValueError('env has no get_keys_to_action method')

        env.reset()
        action = start_action
        start = time.time()
        for t in range(steps):
            # change action every 6 frames
            if t % 6 == 0:
                action = env.action_space.sample()
            # after 500 timesteps, stop pressing start button
            if t > 500:
                while action == start_action:
                    action = env.action_space.sample()
            _, _, done, _ = env.step(action)
            if done:
                # Stepping a finished episode is invalid; start a new one.
                env.reset()
            # runs game at about 60fps
            time.sleep(0.016667)
            env.render()
        end = time.time()
    finally:
        # Release the emulator/window even if play was interrupted.
        env.close()
    print("time: ", (end - start), " seconds for ", steps, "steps")
def play_random_custom(env, steps):
    """Train a policy on ``env`` or play it randomly, depending on ``SHOULD_TRAIN``.

    The environment is wrapped in ``JoypadSpace`` using the module-level
    ``actions`` list and reset once.

    * ``SHOULD_TRAIN`` truthy: a ``Policy`` sized from ``get_screen().shape``
      is created (optionally restored from ``DATA_PATH``), then episodes of
      at most ``num_steps_per_episode - 1`` steps are run until
      ``running_reward`` exceeds ``reward_threshold``. The model state is
      saved to ``DATA_PATH`` after every episode.
    * otherwise: a random action is sampled every 6 frames for ``steps``
      frames, with per-frame debug output, rendering and a ~60 fps throttle.
      Finished episodes are reset so play continues.

    The wrapped environment is closed on exit (also when an error or a
    keyboard interrupt stops the run); the elapsed time is printed after a
    normal run.

    Args:
        env: Environment to wrap; must be accepted by ``JoypadSpace``.
        steps: Number of frames to play in random mode (also echoed in the
            final timing message).

    Raises:
        ValueError: In random mode, if neither the wrapped environment nor
            its ``unwrapped`` base exposes ``get_keys_to_action``.
    """
    env = JoypadSpace(env, actions)
    try:
        env.reset()
        action = 0
        start = time.time()
        if SHOULD_TRAIN:
            init_screen = get_screen()
            # Only the last two dimensions of the screen shape are used.
            _, _, screen_height, screen_width = init_screen.shape

            # INIT Neural Network
            policy = Policy(screen_height, screen_width, len(actions))
            if SHOULD_LOAD_STATE:
                print("Loading model from: ", DATA_PATH)
                policy.load_state_dict(torch.load(DATA_PATH))
            optimizer = optim.Adam(policy.parameters(), lr=1e-2)
            # Smallest float32 increment; keeps the normalisation below
            # from dividing by zero.
            eps = np.finfo(np.float32).eps.item()

            # Helper functions
            def select_action(state):
                """Pick the policy's best action, or a random one to explore."""
                global steps_done
                sample = random.random()
                # NOTE(review): the exploration threshold is tied to
                # reward_threshold while the decaying schedule below is
                # disabled -- confirm this is intended.
                eps_threshold = reward_threshold
                # eps_threshold = EPS_END + (EPS_START - EPS_END) * \
                #     math.exp(-1. * steps_done / EPS_DECAY)
                steps_done += 1
                if sample > eps_threshold:
                    with torch.no_grad():
                        # t.max(1) will return largest column value of each
                        # row. second column on max result is index of where
                        # max element was found, so we pick action with the
                        # larger expected reward.
                        return policy(state).max(1)[1].view(1, 1)
                else:
                    return torch.tensor([[random.randrange(len(actions))]],
                                        device=device, dtype=torch.long)

            def finish_episode():
                """Compute discounted returns, save the model, clear buffers."""
                R = 0
                policy_loss = []
                returns = []
                # Accumulate back-to-front, then reverse once (linear time
                # instead of inserting at the head on every step).
                for r in policy.rewards[::-1]:
                    R = r + GAMMA * R
                    returns.append(R)
                returns.reverse()
                returns = torch.tensor(returns)
                # std() of a single value is NaN, so only normalise when
                # the episode produced more than one return.
                if returns.numel() > 1:
                    returns = (returns - returns.mean()) / \
                        (returns.std() + eps)
                for log_prob, R in zip(policy.saved_log_probs, returns):
                    policy_loss.append(-log_prob * R)
                optimizer.zero_grad()
                print("POLICY LOSS: ", policy_loss)
                # NOTE(review): the backward pass is disabled, so the
                # optimizer step below has no gradients to apply.
                # policy_loss = torch.cat(policy_loss).sum()
                # policy_loss.backward()
                optimizer.step()
                torch.save(policy.state_dict(), DATA_PATH)
                del policy.rewards[:]
                del policy.saved_log_probs[:]

            running_reward = 10
            for i_episode in count(1):
                print("Episode: ", i_episode)
                state, ep_reward = env.reset(), 0
                # Don't infinite loop while learning
                for t in range(1, num_steps_per_episode):
                    action = select_action(state).data.cpu().numpy()[0][0]
                    # print("ACTION:", action)
                    state, reward, done, info = env.step(action)
                    if SHOULD_RENDER:
                        env.render()
                    policy.rewards.append(reward)
                    ep_reward += reward
                    if done:
                        break

                # Exponential moving average of the episode rewards.
                running_reward = 0.05 * ep_reward + \
                    (1 - 0.05) * running_reward
                finish_episode()
                if i_episode % log_interval == 0:
                    print(
                        'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'
                        .format(i_episode, ep_reward, running_reward))
                print("Running reward: ", running_reward)
                if running_reward > reward_threshold:
                    print("Solved! Running reward is now {} and "
                          "the last episode runs to {} time steps!".format(
                              running_reward, t))
                    break
        else:
            # PLAY RANDOMLY
            # Precondition carried over from the interactive player. It does
            # not depend on the frame, so it is checked once, not per step.
            if not (hasattr(env, 'get_keys_to_action')
                    or hasattr(env.unwrapped, 'get_keys_to_action')):
                raise ValueError(
                    'env has no get_keys_to_action method')
            for t in range(steps):
                # change action every 6 frames
                if t % 6 == 0:
                    action = env.action_space.sample()
                # after 500 timesteps, stop pressing start button
                # (presumably action 0 is 'start' -- confirm against the
                # module-level ``actions`` list)
                if t > 500:
                    while action == 0:
                        action = env.action_space.sample()
                observation, reward, done, info = env.step(action)
                print("---------------------------t: ", t)
                print("action space: ", action, env.action_space)
                print("obs: ", observation.shape)
                print("reward: ", reward)
                print("info: ", info)
                if done:
                    # Stepping a finished episode is invalid; start a new one.
                    env.reset()
                # runs game at about 60fps
                time.sleep(0.016667)
                env.render()
        end = time.time()
    finally:
        # Release the emulator/window even if the run was interrupted.
        env.close()
    print("time: ", (end - start), " seconds for ", steps, "steps")