Example #1
def respond(self, env):
    # Build a mask of the actions that are legal given the current hand and the last played cards.
    mask = get_mask(to_char(self.env.get_curr_cards()), self.action_space, to_char(self.env.get_last_cards()))
    s = env.get_state()
    s = np.reshape(s, [1, -1])
    # Run the policy/value network on the current state, feeding the action mask alongside it.
    policy, val = self.sess.run([
        self.agents[0].network.valid_policy,
        self.agents[0].network.val_pred],
        feed_dict={
            self.agents[0].network.input: s,
            self.agents[0].network.mask: np.reshape(mask, [1, -1])
        })
    policy = policy[0]
    # Indices of the actions allowed by the mask; valid_policy is defined over exactly these.
    valid_actions = np.take(np.arange(self.a_dim), mask.nonzero())
    valid_actions = valid_actions.reshape(-1)
    # a = np.random.choice(valid_actions, p=policy)
    a = valid_actions[np.argmax(policy)]  # greedy choice among the valid actions
    # print("taking action: ", self.action_space[a])
    return env.step(self.action_space[a])
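
Note on the selection step: valid_policy is produced only over the actions allowed by mask, so the argmax over policy indexes into valid_actions rather than into the full action space. Below is a minimal, self-contained sketch of that masked greedy selection; a_dim, mask and the policy values are made-up stand-ins, not taken from the project:

import numpy as np

a_dim = 6                              # assumed total number of actions
mask = np.array([0, 1, 0, 1, 1, 0])    # 1 = action is currently legal
policy = np.array([0.2, 0.5, 0.3])     # assumed network output over the valid actions only

valid_actions = np.take(np.arange(a_dim), mask.nonzero()).reshape(-1)  # -> [1, 3, 4]
a = valid_actions[np.argmax(policy)]   # greedy: position 1 of valid_actions -> action 3
print(a)                               # 3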
Example #2
File: train.py  Project: moon44432/mario-ai
from collections import deque

import numpy as np

from network import set_network, create_network
from process_image import img_width
# load_model, Adam, huber_loss, Memory, get_state, start_of_episode and the training
# hyperparameters come from the project's other modules and are not shown here.

if __name__ == '__main__':
    set_network()
    main_qn = load_model('./model/model.h5')
    main_qn.compile(loss=huber_loss, optimizer=Adam(lr=0.0005))
    target_qn = create_network()

    memory = Memory(memory_size)  # experience replay buffer

    total_step = 0

    for episode in range(1, num_episodes + 1):
        # Wait until the game signals the start of a new episode.
        while True:
            state = get_state()
            if start_of_episode(state) == 1:
                break

        step, action, value = 0, 1, 0
        do_learn, dead = True, False
        state_deque = deque(maxlen=state_deque_size)

        # Sync the target network with the main network at the start of each episode.
        target_qn.model.set_weights(main_qn.model.get_weights())

        for _ in range(1, max_steps + 1):
            step += 1
            total_step += 1

            # epsilon decay
            epsilon = epsilon_stop + (epsilon_start - epsilon_stop) * np.exp(
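
The listing is truncated in the middle of the epsilon computation. The pattern epsilon_stop + (epsilon_start - epsilon_stop) * np.exp(...) is the usual exponential annealing of the exploration rate; a minimal sketch follows, where the name epsilon_decay and all numeric values are assumptions rather than the project's actual hyperparameters:

import numpy as np

epsilon_start, epsilon_stop, epsilon_decay = 1.0, 0.01, 0.0001  # assumed hyperparameters

def epsilon_at(total_step):
    # Exponentially anneal exploration from epsilon_start down toward epsilon_stop.
    return epsilon_stop + (epsilon_start - epsilon_stop) * np.exp(-epsilon_decay * total_step)

print(epsilon_at(0))        # 1.0
print(epsilon_at(50000))    # ~0.0167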
Example #3
def rl(q_table):

    for episode in range(MAX_EPISODES):

        print("episode", episode)
        environment = pd.DataFrame(np.zeros((3, 3)))
        result = "continue"

        # 玩家1操作
        state1 = env.get_state(environment)
        action1 = choose_action(state1, q_table, environment)
        env.get_result(environment, 1, action1)

        while True:
            # Player 2 moves
            state2 = env.get_state(environment)
            action2 = choose_action(state2, q_table, environment)
            result = env.get_result(environment, 2, action2)
            state1_ = env.get_state(environment)  # after player 2 moves, this is player 1's successor state
            if result == 'win':
                R1, R2 = -1, 1  # player 2 won: penalize player 1, reward player 2
                q_table.loc[state1, action1] += ALPHA * (
                    R1 + LAMBDA * q_table.iloc[state1_, :].max() -
                    q_table.loc[state1, action1])
                q_table.loc[state2,
                            action2] += ALPHA * (R2 -
                                                 q_table.loc[state2, action2])
                break
            elif result == 'continue':
                R1 = 0
                q_table.loc[state1, action1] += ALPHA * (
                    R1 + LAMBDA * q_table.iloc[state1_, :].max() -
                    q_table.loc[state1, action1])
            else:
                R1, R2 = 0.1, 0.1  # draw: small positive reward for both players
                q_table.loc[state1, action1] += ALPHA * (
                    R1 + LAMBDA * q_table.iloc[state1_, :].max() -
                    q_table.loc[state1, action1])
                q_table.loc[state2,
                            action2] += ALPHA * (R2 -
                                                 q_table.loc[state2, action2])
                break

            # Player 1 moves
            state1 = env.get_state(environment)
            action1 = choose_action(state1, q_table, environment)
            result = env.get_result(environment, 1, action1)
            state2_ = env.get_state(environment)  # after player 1 moves, this is player 2's successor state
            if result == 'win':
                R1, R2 = 1, -1  # player 1 won: reward player 1, penalize player 2
                q_table.loc[state2, action2] += ALPHA * (
                    R2 + LAMBDA * q_table.iloc[state2_, :].max() -
                    q_table.loc[state2, action2])
                q_table.loc[state1,
                            action1] += ALPHA * (R1 -
                                                 q_table.loc[state1, action1])
                break
            elif result == 'continue':
                R2 = 0
                q_table.loc[state2, action2] += ALPHA * (
                    R2 + LAMBDA * q_table.iloc[state2_, :].max() -
                    q_table.loc[state2, action2])
            else:
                R1, R2 = 0.1, 0.1  # draw: small positive reward for both players
                q_table.loc[state2, action2] += ALPHA * (
                    R2 + LAMBDA * q_table.iloc[state2_, :].max() -
                    q_table.loc[state2, action2])
                q_table.loc[state1,
                            action1] += ALPHA * (R1 -
                                                 q_table.loc[state1, action1])
                break
    return q_table
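
All of the q_table updates above are instances of the tabular Q-learning rule Q(s, a) += ALPHA * (R + LAMBDA * max_a' Q(s', a') - Q(s, a)); the updates for the player whose move ends the game drop the bootstrap term because there is no successor state for them. Below is a minimal dictionary-based sketch of the same rule, with hypothetical names (q, alpha, gamma) in place of the project's q_table, ALPHA and LAMBDA:

def q_update(q, state, action, reward, next_state, alpha=0.1, gamma=0.9, terminal=False):
    # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    target = reward if terminal else reward + gamma * max(q[next_state].values())
    q[state][action] += alpha * (target - q[state][action])

# Example: two states, two actions, stored in plain dicts.
q = {"s0": {"a0": 0.0, "a1": 0.0}, "s1": {"a0": 0.5, "a1": 0.2}}
q_update(q, "s0", "a1", reward=0.0, next_state="s1")                  # non-terminal step
q_update(q, "s1", "a0", reward=1.0, next_state=None, terminal=True)   # terminal step
print(q["s0"]["a1"], q["s1"]["a0"])   # ~0.045, 0.55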