Example #1
def populate_memory():
    memory = Memory(max_size=memory_size)
    state = World.player

    # Make a bunch of random actions and store the experiences
    for ii in range(pretrain_length):
        if ii % 1000 == 0:
            print("Pre-training...")

        # Make a random action
        action = random.choice(actions)
        state, action, reward, next_state = move(action)

        if World.has_restarted():

            # The simulation ends so no next state
            next_state = np.zeros(state.shape)
            # Add experience to memory
            memory.add((state, action, reward, next_state))

            World.restart_game(board_rs=False)

        else:
            # Add experience to memory
            memory.add((state, action, reward, next_state))
            state = next_state
    return memory, state
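
The `Memory` class itself is not part of this excerpt; a minimal deque-based replay buffer with the `add`/`sample` interface used above might look like the sketch below (the `max_size` keyword matches the call in `populate_memory`; everything else is an assumption, not the original implementation).

from collections import deque

import numpy as np


class Memory:
    """Minimal experience-replay buffer (assumed implementation, not the original)."""

    def __init__(self, max_size=1000):
        # Oldest experiences fall out once the buffer is full.
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        # experience is a (state, action, reward, next_state) tuple.
        self.buffer.append(experience)

    def sample(self, batch_size):
        # Uniform sampling without replacement, as in a standard DQN replay buffer.
        idx = np.random.choice(len(self.buffer), size=batch_size, replace=False)
        return [self.buffer[i] for i in idx]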
Example #2
def run(saver):
    time.sleep(0.1)
    test_episodes = 10
    test_max_steps = 600
    with tf.Session(graph=graph1) as sess:
        saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
        for ep in range(1, test_episodes):
            t = 0
            state = World.player
            while t < test_max_steps:
                time.sleep(0.1)
                # Get action from Q-network
                feed = {mainQN.inputs_: state.reshape((1, *state.shape))}
                Qs = sess.run(mainQN.output, feed_dict=feed)
                # np.argmax gives the index of the best action; map it back to an
                # entry of `actions` so move() receives the same type of value as
                # random.choice(actions) does during training
                action = actions[np.argmax(Qs)]
                print(action)

                # Take action, get new state and reward
                state, action, reward, next_state = move(action)

                if World.has_restarted():
                    t = test_max_steps
                    print("Game restart, score: ", reward)
                    World.restart_game()
                    time.sleep(0.01)

                else:
                    state = next_state
                    t += 1
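
Both this test loop and the training loop in Example #6 assume a TensorFlow 1.x Q-network object exposing `inputs_`, `actions_`, `targetQs_`, `output`, `loss` and `opt`. A minimal sketch with those attribute names follows; the layer sizes, the state/action dimensions and the assumption that actions are fed as integer indices are guesses, not the original network.

import tensorflow as tf


class QNetwork:
    """Sketch of a fully connected Q-network with the attributes used above (assumed)."""

    def __init__(self, state_size=2, action_size=4, hidden_size=64,
                 learning_rate=1e-3, name='QNetwork'):
        with tf.variable_scope(name):
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')
            self.actions_ = tf.placeholder(tf.int32, [None], name='actions')
            self.targetQs_ = tf.placeholder(tf.float32, [None], name='target')

            h1 = tf.layers.dense(self.inputs_, hidden_size, activation=tf.nn.relu)
            h2 = tf.layers.dense(h1, hidden_size, activation=tf.nn.relu)
            # One Q-value per action.
            self.output = tf.layers.dense(h2, action_size, activation=None)

            # Q-value of the action actually taken in each sampled transition.
            one_hot = tf.one_hot(self.actions_, action_size)
            Q = tf.reduce_sum(self.output * one_hot, axis=1)

            # Mean squared TD error against the externally computed targets.
            self.loss = tf.reduce_mean(tf.square(self.targetQs_ - Q))
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)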
Example #3
def main():
    memory, state = populate_memory()
    rewards_list, saver = train(memory)
    #plot(rewards_list)

    t = threading.Thread(target=run, args=(saver,))
    #t = threading.Thread(target=train, args=(memory))
    t.daemon = True
    t.start()
    World.start_game()
    plot(rewards_list)
Example #4
def run():
    global discount, epsilon
    time.sleep(0.1)
    alpha = .08  # learning rate
    t = 1  # time
    ep = 0
    p = 0.001
    total_r = 0
    while True:
        #board = World.board_image
        #board.show()
        s = World.player
        s = (s[0], s[1])

        # choose an action:
        max_act, _ = max_Q(s)
        #max_act = policy(max_act)
        # perform action and get new state and received reward
        (s, a, r, s_2) = move(max_act)

        _, maxQ = max_Q(s_2)
        update_Q(s, a, alpha, r, discount, s_2, maxQ)
        t += 1

        total_r += r
        if World.has_restarted():
            print("World restart, score: ", total_r)
            rewards_list.append((ep, total_r))
            total_r = 0
            print(World.player)
            World.restart_game()
            #board = World.board_image
            time.sleep(0.1)
            t = 1
            epsilon *= 0.995
            epsilon = max(0.05, epsilon)
            ep += 1

        # p would be the exploration probability; it is not used further in this excerpt
        if epsilon < 0.3:
            p = 0.1
        # update learning rate
        #alpha = pow(t, -0.1)

        time.sleep(0.01)
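
This loop relies on a `max_Q(s)` helper that is not shown in the excerpt; it returns the greedy action and its value from the Q table built in Example #8. A plausible sketch, assuming that dict-of-dicts layout, is given below. Note that `epsilon` is decayed here but the exploration step that would use it is not part of this excerpt.

def max_Q(s):
    # Return (best_action, best_value) for state s from the tabular Q dict
    # built in Example #8 (assumed helper, not the original).
    best_action, best_value = None, float('-inf')
    for a, v in Q[s].items():
        if v > best_value:
            best_action, best_value = a, v
    return best_action, best_value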
Example #5
def move(act):
    s_1 = World.player
    # The reward for this move is the change in World.score it produces
    reward = -World.score
    # Up, down, left, right
    if act == actions[0]:
        World.try_move(0, -1)
    if act == actions[1]:
        World.try_move(0, 1)
    if act == actions[2]:
        World.try_move(-1, 0)
    if act == actions[3]:
        World.try_move(1, 0)
    s_2 = World.player
    reward += World.score
    return s_1, act, reward, s_2
Example #6
def train(memory):
    time.sleep(0.1)
    # Now train with experiences
    saver = tf.train.Saver()
    rewards_list = []
    with tf.Session(graph=graph1) as sess:
        # Initialize variables
        sess.run(tf.global_variables_initializer())

        tau = 0

        update_target = update_target_graph()
        sess.run(update_target)

        step = 0
        loss = 0  # so the episode-end print below works even before the first training step
        for ep in range(1, train_episodes):
            total_reward = 0
            t = 0
            state = World.player
            while t < max_steps:
                step += 1
                tau += 1

                # Explore or Exploit
                explore_p = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * step)
                if explore_p > np.random.rand():
                    # Make a random action
                    action = random.choice(actions)
                else:
                    # Get action from Q-network
                    feed = {mainQN.inputs_: state.reshape((1, *state.shape))}
                    Qs = sess.run(mainQN.output, feed_dict=feed)
                    # Map the argmax index back to an entry of `actions`,
                    # matching the random.choice(actions) branch above
                    action = actions[np.argmax(Qs)]

                # Take action, get new state and reward
                state, action, reward, next_state = move(action)

                total_reward += reward

                if World.has_restarted():
                    # the episode ends so no next state
                    next_state = np.zeros(state.shape)
                    t = max_steps

                    print('Episode: {}'.format(ep),
                          'Total reward: {:.4f}'.format(total_reward),
                          'Training loss: {:.4f}'.format(loss),
                          'Explore P: {:.4f}'.format(explore_p))
                    rewards_list.append((ep, total_reward))

                    # Add experience to memory
                    memory.add((state, action, reward, next_state))

                    # Start new episode
                    World.restart_game(board_rs=False)

                else:
                    # Add experience to memory
                    memory.add((state, action, reward, next_state))
                    state = next_state
                    t += 1

                # Sample mini-batch from memory
                batch = memory.sample(batch_size)
                states = np.array([each[0] for each in batch])
                acts = np.array([each[1] for each in batch])
                rewards = np.array([each[2] for each in batch])
                next_states = np.array([each[3] for each in batch])

                # DOUBLE DQN logic:
                # - use the main network (mainQN) to select the action a' with the
                #   highest Q-value at next_state
                # - use TargetNetwork to evaluate Q(s', a')

                # Target-network Q-values for all actions at next_state
                Qs_target_next_state = sess.run(TargetNetwork.output, feed_dict={TargetNetwork.inputs_: next_states})
                # Main-network Q-values at next_state, used to select a'
                Qs_next_state = sess.run(mainQN.output, feed_dict={mainQN.inputs_: next_states})

                target_Qs_batch = []

                # Set target_Qs to r for states where episode ends
                episode_ends = (next_states == np.zeros(states[0].shape)).all(axis=1)
                for i in range(0, len(batch)):
                    terminal = episode_ends[i]
                    action = np.argmax(Qs_next_state[i])
                    if terminal:
                        target_Qs_batch.append(rewards[i])
                    else:
                        target = rewards[i] + gamma * Qs_target_next_state[i][action]
                        target_Qs_batch.append(target)

                #targets = rewards + gamma * np.max(target_Qs, axis=1)

                targets_mb = np.array(target_Qs_batch)

                loss, _ = sess.run([mainQN.loss, mainQN.opt],
                                   feed_dict={mainQN.inputs_: states,
                                              mainQN.targetQs_: targets_mb,
                                              mainQN.actions_: acts})

                if tau > max_tau:
                    print("Model updated")
                    tau = 0
                    update_target = update_target_graph()
                    sess.run(update_target)

        save_path = saver.save(sess, "checkpoints/model_dqn.ckpt")
        print("Model saved in path: %s" % save_path)
    return rewards_list, saver
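
`update_target_graph()` is also outside this excerpt. In TF 1.x double-DQN code it is typically a list of assign ops that hard-copies the main network's weights into the target network every `max_tau` steps; a sketch under that assumption (the scope names 'main' and 'target' are placeholders, not taken from the excerpt):

def update_target_graph():
    # Collect the trainable variables of the two networks by scope name
    # ('main' and 'target' are assumed scope names).
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'main')
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'target')
    # One assign op per variable pair; sess.run() on the list performs a hard update.
    return [t_var.assign(f_var) for f_var, t_var in zip(from_vars, to_vars)]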
Example #7
def update_Q(s, a, alpha, reward, gamma, s_2, maxQ):
    Q[s][a] = (1 - alpha) * Q[s][a] + alpha * (reward + gamma * maxQ)
    World.set_cell_score(s, a, Q[s][a])
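
The blend above is the tabular Q-learning update written as a weighted average of the old estimate and the TD target reward + gamma * maxQ; it is algebraically identical to the incremental form Q += alpha * (target - Q). A quick numeric check, with values chosen arbitrarily:

# Equivalence check for the update rule (illustrative numbers only).
alpha, gamma = 0.08, 0.8
q_old, reward, max_next = 0.1, 1.0, 0.5

blended = (1 - alpha) * q_old + alpha * (reward + gamma * max_next)
incremental = q_old + alpha * (reward + gamma * max_next - q_old)
assert abs(blended - incremental) < 1e-12  # both equal 0.204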
Example #8
discount = .8
actions = World.actions  # ["up", "down", "left", "right"]
states = []  # states = (x, y) location
Q = {}

# Set state-coordinates
for i in range(World.x):
    for j in range(World.y):
        states.append((i, j))

# Init Q matrix and set colors for cells
for state in states:
    temp = {}
    for action in actions:
        temp[action] = 0.1
        World.set_cell_score(state, action, temp[action])
    Q[state] = temp

# Initialize special cells (i, j) to their reward value w for every action
for (i, j, c, w) in World.specials:
    for action in actions:
        Q[(i, j)][action] = w
        World.set_cell_score((i, j), action, w)


def move(action):
    s_1 = World.player
    s_1 = (s_1[0], s_1[1])
    reward = -World.score
    # Up, down, left, right
    if action == actions[0]:
        World.try_move(0, -1)