Example No. 1
epsilon_decay = 30000  # decay constant for the epsilon_by_frame schedule below
epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)
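# Quick sanity check of the schedule (illustrative values only; assumes the usual
# epsilon_start = 1.0 and the epsilon_final = 0.01 used elsewhere in these examples):
#   epsilon_by_frame(0)     -> 1.0
#   epsilon_by_frame(30000) -> 0.01 + 0.99 * exp(-1) ~= 0.37
#   epsilon_by_frame(90000) -> 0.01 + 0.99 * exp(-3) ~= 0.06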

losses = []
all_rewards = []
episode_reward = 0

state = env.reset()  # initial state

for frame_idx in range(1, num_frames + 1):  # run for num_frames frames; each Pong game ends when one side reaches 21
    #print("Frame: " + str(frame_idx))      #uncomment to look at frames

    epsilon = epsilon_by_frame(frame_idx)
    action = model.act(state, epsilon)  # epsilon-greedy action selection (see the act() sketch after this example)

    next_state, reward, done, _ = env.step(action)  #get next state
    replay_buffer.push(state, action, reward, next_state, done)  # push the transition to the replay buffer

    state = next_state
    episode_reward += reward

    if done:  # game over: record the episode reward and reset
        state = env.reset()
        all_rewards.append((frame_idx, episode_reward))
        episode_reward = 0

    if len(replay_buffer) > replay_initial:  # train once the buffer holds enough transitions
        loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
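model.act() is referenced but not defined in this snippet. A minimal epsilon-greedy sketch, assuming a PyTorch DQN whose forward pass returns one Q-value per action and that exposes a num_actions attribute (both are assumptions, not taken from the original code):

import random
import torch

def act(self, state, epsilon):
    # Explore with probability epsilon, otherwise exploit the current Q-estimates.
    if random.random() > epsilon:
        state_t = torch.FloatTensor(state).unsqueeze(0)  # add a batch dimension
        q_values = self.forward(state_t)                 # shape: [1, num_actions]
        action = q_values.max(1)[1].item()               # greedy: argmax over actions
    else:
        action = random.randrange(self.num_actions)      # random exploratory action
    return action

In this sketch, epsilon = 0 (evaluation) or epsilon = -1 (the extraction loop later) effectively never takes the random branch, so act() is purely greedy.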
Example No. 2
losses = []
all_rewards = []
episode_reward = 0

state = env.reset()

for frame_idx in range(1, num_frames + 1):
    # NOTE: num_frames is larger than the replay_buffer capacity, so the buffer
    # fills up and then simply discards its oldest transitions as new ones are
    # pushed; it does not grow. np.expand_dims only adds a batch dimension to
    # each stored state, it does not resize the buffer (see the buffer sketch
    # after this example).

    #print("Frame: " + str(frame_idx))

    epsilon = epsilon_by_frame(frame_idx)  # get the epsilon value
    action = model.act(state, epsilon)  # This is where act function is used

    next_state, reward, done, _ = env.step(action)  # take the action and observe the next state, reward, and done flag
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    # if the game is over
    if done:
        state = env.reset()
        all_rewards.append(
            (frame_idx, episode_reward))  # record reward for that game
        episode_reward = 0  # reset
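On the capacity question in the comment above: a replay buffer of this kind is typically a fixed-size deque, so pushing past capacity drops the oldest transitions instead of growing the buffer, and np.expand_dims only adds a batch dimension to each stored state. A minimal sketch under that assumption (not necessarily the exact class used here):

import random
from collections import deque
import numpy as np

class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)   # oldest entries are dropped once full

    def push(self, state, action, reward, next_state, done):
        state = np.expand_dims(state, 0)       # add batch dim so states concatenate cleanly
        next_state = np.expand_dims(next_state, 0)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*batch)
        return np.concatenate(state), action, reward, np.concatenate(next_state), done

    def __len__(self):
        return len(self.buffer)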
Example No. 3
frame_list = random.sample(range(2000, num_frames), 8000)  # sample 8000 frame indices beyond the first 2000
frame_list.sort()

hiddenLayers = []
state_list = []
action_list = []
reward_frame_list = []
accumulated_reward = []  # running episode reward at each sampled frame
frame_order = []

epsilon = -1  # epsilon below 0 means act() never explores: actions are always greedy

for frame_idx in range(1, num_frames + 1):

    action = model.act(state, epsilon)

    next_state, reward, done, _ = env.step(action)

    if (frame_idx in frame_list) or (frame_idx < 2000):
        hiddenTensor = model.get_hidden_layer(state)
        temp = hiddenTensor.data.cpu().numpy()
        hiddenLayers.append(temp[0])
        #hiddenLayers.append(hiddenTensor.data.cpu().numpy())
        #hiddenLayers = np.concatenate((hiddenLayers, hiddenTensor.data.cpu().numpy()), axis=0)
        state_list.append(state.squeeze(0))
        action_list.append(action)
        reward_frame_list.append(reward)
        accumulated_reward.append(episode_reward)  # reward accumulated so far in this episode
        frame_order.append(frame_idx)
        #env.env.ale.saveScreenPNG('test_image.png')
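model.get_hidden_layer() is not a standard PyTorch method. A plausible sketch, assuming the DQN keeps its convolutional stack in a self.features attribute (an assumption, not the original code), returns the flattened activations that feed the fully connected head:

import torch

def get_hidden_layer(self, state):
    # Run the feature extractor only and return the flattened hidden activations.
    with torch.no_grad():
        x = torch.FloatTensor(state).unsqueeze(0)    # add a batch dimension
        x = x.to(next(self.parameters()).device)     # match the model's device (CPU/GPU)
        x = self.features(x)                         # convolutional stack (assumed attribute)
        x = x.view(x.size(0), -1)                    # flatten to [1, hidden_size]
    return x

The caller above then strips the batch dimension with temp[0] before appending.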
Example No. 4
if USE_CUDA:
    model = model.cuda()
    print("Using cuda")

model.load_state_dict(torch.load(pthname, map_location='cpu'))

env.seed(1)
state = env.reset()
done = False

games_won = 0

while not done:
    if use_gui:
        env.render()

    action = model.act(state, 0)  # epsilon = 0: always act greedily

    state, reward, done, _ = env.step(action)

    if reward != 0:
        print(reward)  # in Pong: +1 when the agent scores a point, -1 when the opponent does
    if reward == 1:
        games_won += 1  # counts points won by the agent

print("Games Won: {}".format(games_won))
try:
    sys.exit(0)
except SystemExit:
    pass  # swallow SystemExit so the script also finishes cleanly inside a notebook
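The loop above plays a single episode, and in Pong a +1 reward marks a point rather than a whole game. A hedged sketch of evaluating over several full games (num_eval_games is an assumed parameter, not in the original):

num_eval_games = 10                             # assumed parameter
games_won = 0

for _ in range(num_eval_games):
    state = env.reset()
    done = False
    episode_reward = 0
    while not done:
        action = model.act(state, 0)            # epsilon = 0: always greedy
        state, reward, done, _ = env.step(action)
        episode_reward += reward
    if episode_reward > 0:                      # positive total score = the agent won the game
        games_won += 1

print("Games Won: {}/{}".format(games_won, num_eval_games))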
Example No. 5
if USE_CUDA:
    model = model.cuda()
    target_model = target_model.cuda()
    print("Using cuda")

# Negative-exponential schedule: start by exploring, then shift toward exploiting as frame_idx grows
epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

losses = []
all_rewards = []
episode_reward = 0
state = env.reset()  # Initial state

best_mean_reward = float('-inf')
for frame_idx in range(starting_frame, num_frames + 1):  # one iteration per frame played
    epsilon = epsilon_by_frame(frame_idx)   # Epsilon decreases as frames played
    action = model.act(state, epsilon)      # if (rand < e) explore. Else action w max(Q-val). action: int

    next_state, reward, done, _ = env.step(action)  # Get env info after taking action. next_state: 2d int. reward: float. done: bool.
    replay_buffer.push(state, action, reward, next_state, done)  # Save state info onto buffer (note: every frame)

    state = next_state                      # Change to next state
    episode_reward += reward                # Keep adding rewards until goal state

    if done:                                # Episode finished (game over)
        state = env.reset()                 # Restart game
        all_rewards.append((frame_idx, episode_reward))  # Store episode_reward w frame it ended
        episode_reward = 0

    if len(replay_buffer) > replay_initial:     # If enough frames in replay_buffer (10000)
        loss = compute_td_loss(model, target_model, batch_size, gamma, replay_buffer)
        optimizer.zero_grad()                   # Clear gradients accumulated from the previous mini-batch
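compute_td_loss() is called but not shown. A minimal sketch of the standard DQN temporal-difference loss with a target network, assuming replay_buffer.sample() returns NumPy batches as in the buffer sketch above (add .cuda()/device moves if training on GPU):

import numpy as np
import torch

def compute_td_loss(model, target_model, batch_size, gamma, replay_buffer):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state = torch.FloatTensor(np.float32(state))
    next_state = torch.FloatTensor(np.float32(next_state))
    action = torch.LongTensor(action)
    reward = torch.FloatTensor(reward)
    done = torch.FloatTensor(done)

    # Q(s, a) for the actions actually taken
    q_values = model(state)
    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

    # Bootstrapped target: r + gamma * max_a' Q_target(s', a'), zeroed at episode end
    next_q_values = target_model(next_state)
    next_q_value = next_q_values.max(1)[0]
    expected_q_value = reward + gamma * next_q_value * (1 - done)

    # Mean squared TD error; detach the target so gradients flow only through Q(s, a)
    loss = (q_value - expected_q_value.detach()).pow(2).mean()
    return loss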
Example No. 6
epsilon_final = 0.01  # epsilon decays toward this value from epsilon_start
epsilon_decay = 30000
epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

losses = []
all_rewards = []
episode_reward = 0

state = env.reset()

for frame_idx in range(1, num_frames + 1):  # run for num_frames frames; each game ends at 21 points
    #print("Frame: " + str(frame_idx))

    epsilon = epsilon_by_frame(frame_idx)
    action = model.act(state, epsilon)  # epsilon-greedy action selection

    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:  # game over: record the episode reward and reset
        state = env.reset()
        all_rewards.append((frame_idx, episode_reward))
        episode_reward = 0

    if len(replay_buffer) > replay_initial:  # start training once the buffer holds more than replay_initial transitions
        loss = compute_td_loss(
            model, target_model, batch_size, gamma, replay_buffer)
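The snippet cuts off inside the training step. The usual continuation (a sketch, not necessarily the original code) backpropagates the loss, records it, and periodically syncs the target network with the online network:

        optimizer.zero_grad()                   # clear gradients from the previous mini-batch
        loss.backward()                         # backpropagate the TD loss
        optimizer.step()                        # update the online network
        losses.append((frame_idx, loss.item())) # keep a loss history for plotting

    if frame_idx % 1000 == 0:                   # sync interval is an assumption
        target_model.load_state_dict(model.state_dict())  # copy online weights into the target net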