Example #1
def deep_q_learning_step(epsilon, player):
    global loss_for_one_episode
    index = epsilon_greedy(epsilon, player)
    q_value = (model(torch.FloatTensor(game.board))[(player + 2) % 3])[index]
    a_p, reward = game.step(index, player)
    if abs(a_p) == 10 or game.full_board():
        loss = ((reward - q_value)**2)
    else:
        while a_p != player and abs(a_p) != 10 and not game.full_board():
            index = epsilon_greedy(epsilon, a_p)  # other players keep moving epsilon-greedily until it is `player`'s turn again
            a_p, _ = game.step(index, a_p)
        if abs(a_p) == 10:
            loss = ((reward - 17 - q_value)**2)
        elif game.full_board():
            loss = ((reward - 5 - q_value)**2)
        else:
            q_value_max = (model(torch.FloatTensor(game.board) *
                                 player)[(a_p + 2) % 3]).max()
            loss = ((reward + GAMMA * q_value_max - q_value)**2)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    loss_for_one_episode = loss_for_one_episode + loss

    return a_p
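Example #1 calls an epsilon_greedy helper that is not part of the snippet. A minimal sketch of what it could look like, assuming game.board is a flat sequence in which 0 marks an empty cell and that model exposes one Q-vector per player, indexed the same way as above:

import random
import torch

def epsilon_greedy(epsilon, player):
    # Hypothetical helper: random empty cell with probability epsilon,
    # otherwise the empty cell with the highest predicted Q-value.
    empty = [i for i, v in enumerate(game.board) if v == 0]
    if random.random() < epsilon:
        return random.choice(empty)
    with torch.no_grad():
        q = model(torch.FloatTensor(game.board))[(player + 2) % 3]
    return max(empty, key=lambda i: q[i].item())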
Example #2
File: run.py Project: 20ChaituR/tetris-bot
def update_frame(x):
    global state, score, high_score, last_move, bot_mode, down_press

    if bot_mode:
        # a = policy(Variable(torch.from_numpy(state).type(torch.FloatTensor)))
        # _, ac = a.max(0)
        # action = ac.item()

        a = policy(Variable(torch.from_numpy(state).type(torch.FloatTensor)))
        # a = F.softmax(a, dim=-1)
        c = Categorical(a)
        action = c.sample()

        # action = train.select_action(state).item()

        state, reward, done = game.step(action)

        last_move = action
    else:
        state, reward, done = game.step(0)
        if down_press:
            game.active_piece, game.grid, _ = game.move_down(game.active_piece, game.grid)
            last_move = 4

    score += reward

    if done:
        game.reset()
        high_score = max(high_score, score)
        score = 0
Example #3
def play_with():
    game.new_game()
    player = 1
    print(game.board)
    while abs(player) != 10 and not game.full_board():
        index = epsilon_greedy(0.0, player)
        player, _ = game.step(index, player)
        print(game.board)
        if not (abs(player) != 10 and not game.full_board()):
            continue
        my_index = -1 + int(input("index: "))
        player, _ = game.step(my_index, player)
        print(game.board)
Example #4
    def run() -> int:

        E = DefaultDict(float)  # type: Dict[StateAction, float]
        state = random_state()
        action = choose_action(state)


        while state is not None:
            next_state, reward = step(state, action)

            if next_state is None:
                next_action = None
                q_next = 0.0
            else:
                next_action = choose_action(next_state)
                q_next = value[(next_state, next_action)]

            delta = reward + q_next - value[(state, action)] 
            N_s[state] += 1
            N_sa[(state, action)] += 1
            E[(state, action)] += 1

            for (s, a) in E:
                alpha = 1.0 / N_sa[(s, a)]
                value[(s, a)] += alpha * E[(s, a)] * delta
                E[(s, a)] *= lamb

            (state, action) = (next_state, next_action)

        if plot:
            X.append(X[-1]+1 if X else 1)
            Y.append(calc_err())

        return reward
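The inner loop of Example #4 is the backward-view SARSA(λ) update with accumulating eligibility traces and a per-pair step size of 1/N(s,a); with γ = 1, the update applied after each step is:

\delta = r + Q(s', a') - Q(s, a), \qquad E(s, a) \leftarrow E(s, a) + 1

Q(\tilde{s}, \tilde{a}) \leftarrow Q(\tilde{s}, \tilde{a}) + \frac{E(\tilde{s}, \tilde{a})}{N(\tilde{s}, \tilde{a})}\,\delta, \qquad E(\tilde{s}, \tilde{a}) \leftarrow \lambda\, E(\tilde{s}, \tilde{a}) \quad \text{for every pair in } E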
Example #5
def Q_learning(episodes, w):
    # action_value = np.random.random(size=(22, 22,2))
    action_value = np.zeros(shape=(22, 11, 2))
    for i in range(episodes):
        print("==============", i, "==============")
        ps = random.randint(1, 11)
        ds = random.randint(1, 11)

        r = 0
        while r == 0:
            a = policy(0.5, action_value[ps, ds])
            # print(a)
            r, s = game.step([ps, ds], a)
            # print("r",r,a,ps,ds)
            if r == 0 and a != 0:
                action_value[
                    ps, ds, a] = (1 - w) * action_value[ps, ds, a] + w * (
                        r +
                        action_value[s[0], s[1],
                                     optimal_policy(action_value[s[0], s[1]])])
                ps = s[0]
                ds = s[1]
            else:
                print(r)
                action_value[ps, ds,
                             a] = (1 - w) * action_value[ps, ds, a] + w * (r)
                ps = s[0]
                ds = s[1]
                break

    return action_value
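Example #5 depends on policy and optimal_policy helpers that are not shown. A minimal sketch, assuming there are two actions encoded as 0 and 1 and that the first argument of policy is the exploration rate:

import random
import numpy as np

def policy(epsilon, q_values):
    # Hypothetical epsilon-greedy choice over the two actions.
    if random.random() < epsilon:
        return random.randint(0, 1)
    return int(np.argmax(q_values))

def optimal_policy(q_values):
    # Greedy action for one state's action values.
    return int(np.argmax(q_values))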
Example #6
File: q2.py Project: axelahmer/easy21
def sarsa(lamb: int, num_episodes: int, Qstar, record=False):
    Q = state_action_map(plus=True)
    N = state_action_map()
    N_s = state_map(plus=True)
    mses = []
    for k in range(num_episodes):
        E = state_action_map()
        s = State(deal=True)
        a = get_e_greedy_action(Q, N_s, s)
        while not s.terminal():
            N_s[s.get_state()] += 1
            N[s.get_state(), a] += 1
            s_dash, r = step(s, a)
            a_dash = get_e_greedy_action(Q, N_s, s_dash)
            delta = r + Q[s_dash.get_state(), a_dash] - Q[s.get_state(), a]
            E[s.get_state(), a] += 1

            for d in DEALER_RANGE:
                for p in PLAYER_RANGE:
                    for action in ACTIONS:
                        Q[(d, p),
                          action] += (1 /
                                      (N[(d, p), action] + 1e-9)) * delta * E[
                                          (d, p), action]
                        E[(d, p), action] *= lamb
            s = s_dash
            a = a_dash
        if record:
            mses.append(calc_mse(Q, Qstar))
    return Q, mses
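A typical way to drive this function, sweeping λ from 0 to 1 as in the Easy21 assignment (a sketch; Qstar is assumed to come from a separate Monte Carlo run):

lambdas = [l / 10 for l in range(11)]  # 0.0, 0.1, ..., 1.0
results = {}
for lam in lambdas:
    Q, mses = sarsa(lam, num_episodes=1000, Qstar=Qstar, record=True)
    results[lam] = (Q, mses)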
Example #7
    def train(self):
        for i in range(self.episodes):
            if i % 10000 == 0:
                print("==============", i, "==============")
            episode = []
            s = game.init()

            ps = s[0]
            ds = s[1]

            r = 0
            while r == 0:
                a = self.policy(self.get_e(s), self.Q[ps, ds])
                # print(a)
                r, s = game.step([ps, ds], a)
                # print("r",r,a,ps,ds)
                episode.append([[ps, ds], a, r])
                if a == 0:
                    # print(r)
                    break
                else:
                    ps = s[0]
                    ds = s[1]

            self.control(episode)

        return self.Q
Example #8
def test():
    game.new_game()
    player = 1
    print(game.board)
    while abs(player) != 10 and not game.full_board():
        index = epsilon_greedy(0.0, player)
        player, _ = game.step(index, player)
        print(game.board)
Example #9
File: utils.py Project: axelahmer/easy21
def sample_episode(pi):
    history = []
    s = State(deal=True)

    while not s.terminal():
        a = pi[s.get_state()]
        # rewards need not be appended to the history: a reward is only received when the terminal state is entered.
        history.append([s.get_state(), a])
        s, r = step(s, a)

    return history, r
Example #10
File: main.py Project: Aurel37/TDLog_jeu
def main():
    """
    Code the entiere game: we display every moove and while we don't reach the
    door or while we aren't stuck, it we continue to ask you what moves
    you want to put
    """
    grid, players, end = grids.grid_init()
    if (end):
        grid.display()
        grid.display()
        val = input("Select your moves:")
        while (not game.step(grid, val, players)):
            val = input("Select your moves: (Press s if you are stuck)")
        return "Victory"
Example #11
def play(state=None) -> Tuple[List[StateAction], int]:
    if state is None:
        state = init_state()
    r_sum = 0
    history = []  # type: List[StateAction]

    while state is not None:
        action = choose_action(state)
        history.append((state, action))

        state, r = step(state, action)
        r_sum += r

    return (history, r_sum)
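The (history, r_sum) pair returned by play feeds directly into a Monte Carlo update. A minimal every-visit sketch; value and counts are hypothetical tables, not part of the snippet:

from collections import defaultdict

value = defaultdict(float)   # hypothetical Q estimates keyed by (state, action)
counts = defaultdict(int)

def mc_update(num_episodes=10000):
    # Move every visited (state, action) pair toward the episode return.
    for _ in range(num_episodes):
        history, r_sum = play()
        for state_action in history:
            counts[state_action] += 1
            value[state_action] += (r_sum - value[state_action]) / counts[state_action]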
Example #12
def one_episode(epsilon, player):
    game.new_game()
    global loss_for_one_episode, loss_for_sever_episodes
    loss_for_one_episode = 0
    if player == 1:
        while abs(player) != 10 and not game.full_board():
            player = deep_q_learning_step(epsilon, player)
    else:
        index = epsilon_greedy(0.0, 1)
        player, _ = game.step(index, player)
        while abs(player) != 10 and not game.full_board():
            player = deep_q_learning_step(epsilon, player)
    print(loss_for_one_episode)
    loss_for_sever_episodes += loss_for_one_episode
Example #13
def train(num_episodes, save_rate=0, starting_episode=0):
    global f
    import time

    if starting_episode > 0:
        model = 'models/tetris_policy_' + str(starting_episode) + '.pth'
        policy.load_state_dict(torch.load(model))

    start_time = time.time()
    total_time = 0

    running_reward = 1
    episode = starting_episode
    while episode != num_episodes:
        state = game.reset()  # Reset environment and record the starting state
        f = True

        game_reward = 0

        for _ in range(max_time):
            action = select_action(state)
            f = False
            # Step through environment using chosen action
            state, reward, done = game.step(action.item())

            # Save reward
            policy.reward_episode.append(reward)
            game_reward += reward
            if done:
                break

        # Used to determine when the environment is solved.
        running_reward = (running_reward * 0.99) + (game_reward * 0.01)

        update_policy()

        if episode % 50 == 0:
            cur_time = time.time()
            total_time += cur_time - start_time
            start_time = cur_time
            print(
                'Episode {}\tLast reward: {:5d}\tAverage reward: {:.2f}\tTime: {:.2f}'
                .format(episode, game_reward, running_reward, total_time))

        if save_rate != 0 and (episode + 1) % save_rate == 0:
            PATH = 'models/tetris_policy_' + str(episode + 1) + '.pth'
            torch.save(policy.state_dict(), PATH)

        episode += 1
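Example #13 calls select_action and update_policy without showing them. A minimal REINFORCE-style sketch of update_policy, assuming select_action appends each chosen action's log-probability to a hypothetical policy.saved_log_probs list and that a module-level optimizer exists, as in the other PyTorch snippets:

import torch

GAMMA_PG = 0.99  # hypothetical discount factor

def update_policy():
    # Compute discounted returns for the finished episode, normalize them,
    # and take one policy-gradient step on -log pi(a|s) * return.
    returns, running = [], 0.0
    for r in reversed(policy.reward_episode):
        running = r + GAMMA_PG * running
        returns.insert(0, running)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    loss = torch.stack([-log_p * ret
                        for log_p, ret in zip(policy.saved_log_probs, returns)]).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    policy.reward_episode.clear()
    policy.saved_log_probs.clear()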
Example #14
File: DQN.py Project: al-este/snake-DQN
    def create_set(game, epsilon):
        pre = model.predict(matrix_to_array(game.get_move_matrix()))
        if np.random.random() < epsilon:
            a = randint(0, 3)
        else:
            a = np.argmax(pre)

        pre_set.append(pre[0])
        a_set.append(a)
        state = game.get_move_matrix()
        state_set.append(image.img_to_array(state))

        r, done = game.step(get_movement(a))

        r_set.append(r)

        if done == 'playing':
            done_set.append(1)
        else:
            done_set.append(0)

        post_set.append(game.get_move_matrix())
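Example #14 only collects the transition sets. A sketch of how they could be turned into DQN regression targets, assuming the usual one-step target r + γ·max Q(s') and reusing matrix_to_array from the snippet (GAMMA_DQN is a hypothetical discount factor):

import numpy as np

GAMMA_DQN = 0.95  # hypothetical discount factor

def build_targets():
    # Start from the Q-values predicted when each action was taken, then
    # overwrite the taken action's entry with r + gamma * max_a' Q(s', a').
    # done_set stores 1 while the game is still 'playing', 0 on termination.
    targets = np.array(pre_set)
    for i, (a, r, alive) in enumerate(zip(a_set, r_set, done_set)):
        next_q = model.predict(matrix_to_array(post_set[i]))
        targets[i][a] = r + alive * GAMMA_DQN * np.max(next_q)
    return targets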
Example #15
def sarsa(lamb: int, num_episodes: int, Qstar, record=False):
    alpha = ALPHA
    w = np.zeros(36)
    # w = np.random.uniform(-1, 1, 36)
    mses = []
    for k in range(num_episodes):
        E = np.zeros(36)
        s = State(deal=True)
        a = get_e_greedy_action(s, w)
        while not s.terminal():
            x = phi(s, a)
            s_dash, r = step(s, a)
            a_dash = get_e_greedy_action(s_dash, w)

            delta = r + q_hat(s_dash, a_dash, w) - q_hat(s, a, w)
            E = np.add(np.multiply(E, lamb), x)
            dw = np.multiply(E, alpha * delta)
            w += dw

            s = s_dash
            a = a_dash
        if record:
            mses.append(calc_mse_linear(w, Qstar))
    return w, mses
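Example #15 uses phi and q_hat without showing them. A minimal sketch assuming the standard Easy21 coarse coding (3 dealer × 6 player overlapping cuboids × 2 actions = 36 binary features), that s.get_state() returns (dealer, player), and that actions are encoded as 0 and 1:

import numpy as np

DEALER_CUBOIDS = [(1, 4), (4, 7), (7, 10)]
PLAYER_CUBOIDS = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]

def phi(s, a):
    # One binary feature per (dealer cuboid, player cuboid, action) triple.
    dealer, player = s.get_state()
    x = np.zeros(36)
    i = 0
    for lo_d, hi_d in DEALER_CUBOIDS:
        for lo_p, hi_p in PLAYER_CUBOIDS:
            for action in (0, 1):
                if lo_d <= dealer <= hi_d and lo_p <= player <= hi_p and action == a:
                    x[i] = 1.0
                i += 1
    return x

def q_hat(s, a, w):
    # Linear action-value estimate.
    return float(np.dot(phi(s, a), w))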
Example #16
    def run() -> int:

        E = DefaultDict(float)  # type: Dict[StateAction, float]
        state = random_state()
        action = choose_action(state)
        nonlocal theta

        while state is not None:
            next_state, reward = step(state, action)

            if next_state is None:
                next_action = None
                q_next = 0.0
            else:
                next_action = choose_action(next_state)
                q_next = get_value(next_state, next_action)

            delta = reward + q_next - get_value(state, action) 

            N_s[state] += 1
            N_sa[(state, action)] += 1
            E[(state, action)] += 1

            for (s, a) in E:
                alpha = 0.01
                theta += alpha * E[(s, a)] * delta * get_feature(s, a)
                E[(s, a)] *= lamb

            (state, action) = (next_state, next_action)

        if plot:
            X.append(X[-1]+1 if X else 1)
            Y.append(calc_err())


        return reward
Example #17
                save_path = mainDQN.saver.save(mainDQN.session,
                                               model_path,
                                               global_step=episode)
                print("Model(episode : ", episode, ") saved in file : ",
                      save_path)
            last_100_game_reward.append(step_count)

            if len(last_100_game_reward) > 50:
                last_100_game_reward.popleft()
                avg_reward = np.mean(last_100_game_reward)
                if avg_reward > 100:
                    print(
                        "Game Cleared in {} episodes with avg reward {}".format(
                            episode, avg_reward))
                    break


map_data = [[1 for j in range(game.y_res)] for i in range(game.x_res)]
user_loc = {'x': int(game.x_res / 2), 'y': int(game.y_res / 2)}
ball_list = game.ball_list_init(game.ball_list)
map_data, reward, done = game.mapping2map(user_loc, ball_list, map_data, 1)
next_roi_data = game.roi_calculation(map_data, user_loc, game.input_size)
os.system('clear')
game.game_print(map_data, next_roi_data)
while True:
    next_roi_data = game.roi_calculation(map_data, user_loc, game.input_size)
    o_dqn(next_roi_data, map_data, user_loc, ball_list)
    action = np.argmax(mainDQN.predict(next_roi_data))
    game.step(action, game.input_size, map_data, user_loc, ball_list, 1, 0)
    time.sleep(delay_time)
Example #18
    else:
        return bool(np.argmax(q))


if __name__ == "__main__":

    for k in range(1, ITERATIONS):
        terminal = False

        E_matrix = np.zeros_like(Q_matrix)

        state = game.initialise_state()
        action = epsilon_greedy(allQ(state), allN(state))

        while not terminal:
            next_state, reward = game.step(state, action)

            terminal = state.terminal

            if not terminal:
                next_action = epsilon_greedy(allQ(state), allN(state))
                delta = reward + Q(next_state, next_action) - Q(state, action)
            else:
                delta = reward - Q(state, action)

            allE(state)[int(action)] += 1
            allN(state)[int(action)] += 1

            alpha = 1 / N(state, action)

            Q_matrix += alpha * delta * E_matrix
Example #19
import game

game = game.Game()

while not game.gameState.isEndGame:
    game.gameState.display_console()
    action = int(input())
    game.step(action)

game.gameState.display_console()
Example #20
        for t in range(MAX_STEPS):
            action = agent.act(state)
            key = action2key[game.key][action]
            if int(e / 100) * 100 == e:
                game.render()
                print "key:", key2str[key], "    action:", action2str[
                    action], "   time:", t
                quality = score_sum / (score_cnt + 1)
                msg_str = "episode: {}/{}, epsilon: {:.2}, q: {:0.2f}, mem: {}, mem_done: {}, time: {}"\
                 .format(e, EPISODES, agent.epsilon, quality, len(agent.memory), len(agent.memory_done), time_sum/100.0)
                print msg_str
                #	print "----------------"
                #	game.render_dxy_state()
                #	print "----------------"
                time.sleep(0.05)
            next_state, reward = game.step(key)

            #if reward == 0:
            #	steps_wo_r += 1
            #else:
            #	steps_wo_r = 0

            #if int(e/100)*100 == e:
            #	game.render_dxy_state()
            #	print "----------------"
            #	time.sleep(0.15)
            reward = reward if not game.done else -100.0
            score_sum += game.score
            score_cnt += 1
            #print "reward", reward
            agent.remember(state, action, reward, next_state, game.done)
Example #21
     i2 = 0
     for state_t, action, reward, next_state_t, done in seq_list:
         for k in range(roi_width):
             for m in range(roi_width):
                 state_temp[k][m][i2] = state_t[k][m]
         i2 += 1
     QQ = mainDQN.predict(state_temp)
     action = np.argmax(QQ)
     f3 = open("predict_log.txt", "a")
     f3.write(
         'episode : %3d, step_count : %3d, i = %3d, max_step = %d \n' %
         (episode, step_count, i, max_step))
     f3.close()
     print "DQN:" + str(action) + " Q:" + str(QQ)
     next_state, reward, done, ball_list, user_loc = game.step(
         action, game.input_size, map_data, user_loc, ball_list, 1,
         episode)
 elif i < 2:
     print "-------------   no  choice " + str(
         i) + " ----------------\n"
     next_state, reward, done, ball_list, user_loc = game.step(
         action, game.input_size, map_data, user_loc, ball_list, 0,
         episode)
     seq_list[i] = (state, action, reward, next_state, done)
 print "----- episode : " + str(episode) + " reward : " + str(
     reward) + "  step : " + str(step_count) + " avg_reward : " + str(
         avg_reward) + "  max_step : " + str(max_step) + "----"
 state = next_state
 if done == True:
     map_data = [[5 for j in range(game.y_res)]
                 for i in range(game.x_res)]
Example #22
import game
import random

print("Solving")
it = 10000

mov = ["w", "a", "s", "d"]

steps = 800

bs = 0
bp = []
for i in range(it):
    game.init()
    for s in range(steps):
        c = mov[random.randint(0, len(mov) - 1)]
        print(str(i) + ", " + str(s) + "(" + c + ")", end="\r")
        game.step(c)
        if game.co > bs:
            bs = game.co
            print("\n" + str(bs))
            if bs == 4:
                bp = game.path
                break
        #else:
        #print(".", end="")
for i in bp:
    print(i)
Example #23
def o_dqn(roi_data, map_data, user_loc, ball_list):
    loss = 0
    avg_reward = 0
    f = open("train_log.txt", "w")
    f3 = open("predict_log.txt", "w")
    f3.close()
    action_list = [0, 1, 2, 3, 4, 5, 6, 7, 8]
    max_episodes = 10000
    replay_buffer = deque()
    replay_buffer_recent = deque()

    last_100_game_reward = deque()

    with tf.Session() as sess:
        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")
        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")
        tf.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name="target",
                                    src_scope_name="main")
        mainDQN.saver.restore(sess, model_path_in)
        sess.run(copy_ops)
        state = roi_data

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0
            flag = 0
            action = 0
            while not flag:
                reward_flag = 1
                seq_list = [(), (), ()]
                next_seq_list = [(), (), ()]
                for i in range(6):
                    if i == 2:
                        step_count += 1
                        seq_list[i] = (state, action, reward, next_state, done)
                        if np.random.rand(1) < e:
                            action = random.choice(action_list)
                            print "------------- random choice :" + str(
                                action) + " ----------------\n"
                        else:
                            state_temp = [[[5, 5, 5] for j in range(roi_width)]
                                          for i3 in range(roi_width)]
                            i2 = 0
                            for state_t, action, reward, next_state_t, done in seq_list:
                                for k in range(roi_width):
                                    for m in range(roi_width):
                                        state_temp[k][m][i2] = state_t[k][m]
                                i2 += 1
                            QQ = mainDQN.predict(state_temp)
                            action = np.argmax(QQ)
                            f3 = open("predict_log.txt", "a")
                            f3.write(
                                'episode : %3d, step_count : %3d, i = %3d  ' %
                                (episode, step_count, i))
                            f3.close()
                            print "--- DQN choice :" + str(
                                action) + "  Q : " + str(QQ) + " Q_sum = "
                        next_state, reward, done, ball_list, user_loc = game.step(
                            action, game.input_size, map_data, user_loc,
                            ball_list, 1, episode)
                        ball_list_temp = copy.deepcopy(ball_list)
                        user_loc_temp = copy.deepcopy(user_loc)
                        state_temp = copy.deepcopy(next_state)
                        seq_list[i] = (state, action, reward, next_state, done)
                    elif i < 2:
                        print "-------------   no  choice" + str(
                            i) + "  " + str(action) + " ----------------\n"
                        next_state, reward, done, ball_list, user_loc = game.step(
                            action, game.input_size, map_data, user_loc,
                            ball_list, 0, episode)
                        seq_list[i] = (state, action, reward, next_state, done)
                    elif i > 2 and i < 5:
                        print "-------------   no  choice" + str(
                            i) + "  " + str(action) + " ----------------\n"
                        next_state, reward, done, ball_list, user_loc = game.step(
                            action, game.input_size, map_data, user_loc,
                            ball_list, 0, -1)
                        next_seq_list[i - 3] = (state, action, reward,
                                                next_state, done)
                    elif i == 5:
                        reward = reward_flag
                        next_seq_list[i - 3] = (state, action, reward,
                                                next_state, done)
                        next_state = state_temp
                        if flag == 0:
                            ball_list = ball_list_temp
                            user_loc = user_loc_temp
                        elif flag == 1:
                            replay_buffer_recent.append(
                                (seq_list + next_seq_list))
                            if len(replay_buffer_recent
                                   ) > REPLAY_MEMORY_RECENT:
                                replay_buffer_recent.popleft()
                            map_data = [[5 for j in range(game.y_res)]
                                        for i in range(game.x_res)]
                            user_loc = {
                                'x': int(game.x_res / 2),
                                'y': int(game.y_res / 2)
                            }
                            ball_list = game.ball_list_init(ball_list)
                            map_data, reward, done = game.mapping2map(
                                user_loc, ball_list, map_data, 1)
                            roi_data = game.roi_calculation(
                                map_data, user_loc, game.input_size)
                            os.system('clear')
                    if i < 5:
                        print "----- episode : " + str(
                            episode) + " reward : " + str(
                                reward) + " done : " + str(
                                    done) + "  step : " + str(
                                        step_count) + "  Loss : " + str(
                                            loss) + " avg_reward : " + str(
                                                avg_reward) + " ----"

                    state = next_state
                    if done:
                        flag = 1
                    if i >= 2 and reward_flag > reward:
                        reward_flag = reward

                replay_buffer.append((seq_list + next_seq_list))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

            f.write("Episode: {}	steps: {}\n".format(episode, step_count))
            if episode % 50 == 0 and episode > 499:
                for train in range(50):
                    t_num = 0
                    b_s = 24
                    max_t_num = 8
                    max_t = 1
                    minibatch = random.sample(replay_buffer, b_s)
                    minibatch2 = random.sample(replay_buffer_recent, 8)
                    minibatch = minibatch + minibatch2
                    loss, _ = ddqn_replay_train(mainDQN, targetDQN, minibatch)
                    print 'training... loss = %f --%d' % (loss, train)
                    f.write("Loss: {}\n".format(loss))
                sess.run(copy_ops)
            if episode % 100 == 0:
                save_path = mainDQN.saver.save(mainDQN.session,
                                               model_path,
                                               global_step=episode)
                print("Model(episode : ", episode, ") saved in file : ",
                      save_path)
            last_100_game_reward.append(step_count)

            if len(last_100_game_reward) > 50:
                last_100_game_reward.popleft()
                avg_reward = np.mean(last_100_game_reward)
                if avg_reward > 100:
                    print(
                        "Game Cleared in {} episodes with avg reward {}".format(
                            episode, avg_reward))
                    break