Example No. 1
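These examples are excerpts from larger training scripts, so their imports are elided. A typical preamble they assume looks roughly like the sketch below; the module paths for Tetris, DQNAgent, and CustomTensorBoard are project-specific assumptions, not part of the original code.

from datetime import datetime
from statistics import mean, stdev
from time import sleep, time

import matplotlib.pyplot as plt
from tqdm import tqdm

from tetris import Tetris                # assumed project module
from dqn_agent import DQNAgent           # assumed project module
from logs import CustomTensorBoard       # assumed project module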
def dqn():
    env = Tetris()
    episodes = 2000
    max_steps = None
    epsilon_stop_episode = 1500
    mem_size = 20000
    discount = 0.95
    batch_size = 512
    epochs = 1
    render_every = 50
    log_every = 50
    replay_start_size = 2000
    train_every = 1
    n_neurons = [32, 32]
    render_delay = None
    activations = ['relu', 'relu', 'linear']

    agent = DQNAgent(env.get_state_size(),
                     n_neurons=n_neurons,
                     activations=activations,
                     epsilon_stop_episode=epsilon_stop_episode,
                     mem_size=mem_size,
                     discount=discount,
                     replay_start_size=replay_start_size)

    log_dir = f'logs/tetris-nn={str(n_neurons)}-mem={mem_size}-bs={batch_size}-e={epochs}-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
    log = CustomTensorBoard(log_dir=log_dir)

    scores = []

    for episode in tqdm(range(episodes)):
        current_state = env.reset()
        done = False
        steps = 0

        # Rendering stays off during training; the demo loop after training
        # (the while True block below) runs with render = True.
        render = False
        actions = []

        # Game
        while not done and (not max_steps or steps < max_steps):
            next_states = env.get_next_states()
            best_state = agent.best_state(next_states.values())

            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break
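            # best_action is assumed to be the (x position, rotation) pair whose
            # resulting board state the network scored highest.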

            reward, done = env.play(best_action[0],
                                    best_action[1],
                                    render=render,
                                    render_delay=render_delay)
            agent.add_to_memory(current_state, next_states[best_action],
                                reward, done)
            current_state = next_states[best_action]
            actions.append(best_action)
            steps += 1
        scores.append(env.get_game_score())

        # Train
        if episode % train_every == 0:
            agent.train(batch_size=batch_size, epochs=epochs)

        # Logs
        if log_every and episode and episode % log_every == 0:
            avg_score = mean(scores[-log_every:])
            min_score = min(scores[-log_every:])
            max_score = max(scores[-log_every:])

            log.log(episode,
                    avg_score=avg_score,
                    min_score=min_score,
                    max_score=max_score)

    print(agent.model.evaluate(current_state))
    agent.model.save_weights("ia_tetris_weights.h5")
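    # Demo: replay the learned greedy policy indefinitely with rendering enabled.
    # The weights saved above could later be restored with
    # agent.model.load_weights("ia_tetris_weights.h5"), assuming a Keras model.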
    while True:
        current_state = env.reset()
        done = False
        steps = 0
        render = True

        # Game
        while not done and (not max_steps or steps < max_steps):
            next_states = env.get_next_states()
            best_state = agent.best_state(next_states.values())

            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            reward, done = env.play(best_action[0],
                                    best_action[1],
                                    render=render,
                                    render_delay=render_delay)

            agent.add_to_memory(current_state, next_states[best_action],
                                reward, done)
            current_state = next_states[best_action]
            actions.append(best_action)
            steps += 1

        scores.append(env.get_game_score())
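Each example recovers the chosen action by scanning next_states for the entry that matches best_state. A more compact equivalent, shown here only as a sketch (it is not part of the original scripts), uses a generator expression:

    best_action = next(
        (action for action, state in next_states.items() if state == best_state),
        None,  # fall back to None if best_state is somehow absent
    )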
Example No. 2
def dqn():
    env = Tetris()
    episodes = 2000
    max_steps = 1000000000
    epsilon_stop_episode = 1750
    mem_size = 20000
    discount = 0.95
    batch_size = 1024
    epochs = 1
    render_every = 1
    log_every = 1
    replay_start_size = 2000
    train_every = 1
    n_neurons = [32, 32]
    render_delay = 0.01
    activations = ['relu', 'relu', 'linear']
    m = 0

    agent = DQNAgent(env.get_state_size(),
                     n_neurons=n_neurons, activations=activations,
                     epsilon_stop_episode=epsilon_stop_episode, mem_size=mem_size,
                     discount=discount, replay_start_size=replay_start_size)

    log_dir = f'logs/tetris-eps={episodes}-e-stop={epsilon_stop_episode}-e={epochs}-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
    log = CustomTensorBoard(log_dir=log_dir)

    scores = []
    steps_list = []

    for episode in tqdm(range(episodes)):
        current_state = env.reset()
        done = False
        steps = 0
        
        if (render_every and episode % render_every == 0) or episode == (episodes - 1):
            render = True
            record = True
        else:
            render = False
            record = False

        # Game
        while not done and (not max_steps or steps < max_steps):
            next_states = env.get_next_states()
            best_state = agent.best_state(next_states.values())
            
            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            reward, done = env.play(best_action[0], best_action[1], episode, render=render,
                                    render_delay=render_delay, record=record)
            
            agent.add_to_memory(current_state, next_states[best_action], reward, done)
            current_state = next_states[best_action]
            steps += 1

        scores.append(env.get_game_score())
        steps_list.append(steps)

        # Train
        if episode % train_every == 0:
            agent.train(batch_size=batch_size, epochs=epochs)

        # Logs
        if log_every and episode and episode % log_every == 0:
            # with log_every = 1 this is simply the latest episode's score and step count
            score = scores[-log_every]
            steps = steps_list[-log_every]

            log.log(episode, score=score, steps=steps)
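If log_every were larger than 1, reporting a single episode would drop information; the other examples aggregate over the last window instead, e.g. (a sketch reusing their pattern):

    avg_score = mean(scores[-log_every:])
    avg_steps = mean(steps_list[-log_every:])
    log.log(episode, avg_score=avg_score, avg_steps=avg_steps)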
Example No. 3
update_target_every = None

agent = DQNAgent(env.get_state_size(),
                 n_neurons=n_neurons,
                 activations=activations,
                 add_batch_norm=add_batch_norm,
                 epsilon=epsilon,
                 epsilon_min=epsilon_min,
                 use_target_model=use_target_model,
                 update_target_every=update_target_every,
                 epsilon_stop_episode=epsilon_stop_episode,
                 mem_size=mem_size,
                 discount=discount,
                 replay_start_size=replay_start_size)

log_dir = f'logs/tetris-epsilon={epsilon}-epsilon_min={epsilon_min}-epsilon_stop_episode={epsilon_stop_episode}-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
log = CustomTensorBoard(log_dir=log_dir)

start_time = time()
wall_time = []
scores = []
val_scores = []
val_steps = []

dqn()

#%% Analyze

plt.plot(range(len(val_scores)), val_scores)
plt.plot(range(len(scores)), scores)
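The two curves are easier to tell apart with labels and an explicit show() call; a small addition assuming matplotlib.pyplot is imported as plt:

plt.legend(['validation scores', 'training scores'])
plt.xlabel('episode')
plt.ylabel('score')
plt.show()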
Example No. 4
def dqn():
    env = Tetris()
    episodes = 2000
    max_steps = None
    epsilon_stop_episode = 500
    mem_size = 20000
    discount = 0.95
    batch_size = 512
    epochs = 1
    render_every = 50
    log_every = 50
    replay_start_size = 2000
    train_every = 1
    n_neurons = [32, 32]
    render_delay = None
    activations = ['relu', 'relu', 'linear']

    agent = DQNAgent(env.get_state_size(),
                     n_neurons=n_neurons,
                     activations=activations,
                     epsilon_stop_episode=epsilon_stop_episode,
                     mem_size=mem_size,
                     discount=discount,
                     replay_start_size=replay_start_size)

    log_dir = f'logs/tetris-nn={str(n_neurons)}-mem={mem_size}-bs={batch_size}-e={epochs}-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
    log = CustomTensorBoard(log_dir=log_dir)

    scores = []
    _max_height = True
    _min_height = True
    _current_piece = False
    _next_piece = False
    _max_bumpiness = False
    _lines = False
    _holes = True
    _total_bumpiness = True
    _sum_height = False

    for episode in tqdm(range(episodes)):
        current_state = env.reset(_max_height, _min_height, _current_piece,
                                  _next_piece, _max_bumpiness, _lines, _holes,
                                  _total_bumpiness, _sum_height)
        done = False
        steps = 0

        if render_every and episode % render_every == 0:
            render = True
        else:
            render = False

        # Game
        while not done and (not max_steps or steps < max_steps):
            # get_next_states() with no flags would use the environment's defaults;
            # here the selected features are passed explicitly.
            next_states = env.get_next_states(_max_height, _min_height,
                                              _current_piece, _next_piece,
                                              _max_bumpiness, _lines, _holes,
                                              _total_bumpiness, _sum_height)
            best_state = agent.best_state(next_states.values())

            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            reward, done = env.play(best_action[0],
                                    best_action[1],
                                    render=render,
                                    render_delay=render_delay)

            agent.add_to_memory(current_state, next_states[best_action],
                                reward, done)
            current_state = next_states[best_action]
            steps += 1

        scores.append(env.get_game_score())

        # Train
        if episode % train_every == 0:
            agent.train(batch_size=batch_size, epochs=epochs)

        # Logs
        if log_every and episode and episode % log_every == 0:
            avg_score = mean(scores[-log_every:])
            min_score = min(scores[-log_every:])
            max_score = max(scores[-log_every:])
            cleared_lines = env.get_lines()

            log.log(episode,
                    avg_score=avg_score,
                    min_score=min_score,
                    max_score=max_score,
                    cleared_lines=cleared_lines)
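Because the same nine boolean feature flags must be passed, in the same order, to both env.reset and env.get_next_states, collecting them once and unpacking them positionally keeps the two calls in sync. A sketch only; it assumes the environment accepts the flags positionally exactly as above:

    feature_flags = (_max_height, _min_height, _current_piece, _next_piece,
                     _max_bumpiness, _lines, _holes, _total_bumpiness, _sum_height)
    current_state = env.reset(*feature_flags)
    # ...
    next_states = env.get_next_states(*feature_flags)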
Example No. 5
def dqn():
    env = Tetris()
    episodes = 2000
    max_steps = None
    epsilon_stop_episode = 1500
    mem_size = 20000
    discount = 0.95
    batch_size = 512
    epochs = 1
    render_every = 50
    log_every = 50
    replay_start_size = 2000
    train_every = 1
    n_neurons = [64, 32, 16]
    render_delay = None
    activations = ['relu', 'relu', 'relu', 'linear']

    agent = DQNAgent(
        env.get_state_size(),
        epsilon=0,
        n_neurons=n_neurons,
        activations=activations,
        epsilon_stop_episode=epsilon_stop_episode,
        mem_size=mem_size,
        discount=discount,
        replay_start_size=replay_start_size,
    )

    log_dir = f'logs/tetris-nn={str(n_neurons)}-mem={mem_size}-bs={batch_size}-e={epochs}-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
    log = CustomTensorBoard(log_dir=log_dir)

    scores = []
    scores_sum = 0
    score_max = 0

    for episode in tqdm(range(episodes)):
        current_state = env.reset()
        done = False
        steps = 0

        if render_every and episode % render_every == 0:
            render = True
        else:
            render = False

        # Game
        while not done and (not max_steps or steps < max_steps):
            next_states = env.get_next_states()
            # print('\n\n', next_states)
            best_state = agent.best_state(next_states.values())

            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            reward, done = env.play(best_action[0],
                                    best_action[1],
                                    render=render,
                                    render_delay=render_delay)

            agent.add_to_memory(current_state, next_states[best_action],
                                reward, done)
            current_state = next_states[best_action]
            steps += 1

        score = env.get_game_score()
        scores.append(score)
        scores_sum += score
        if score > score_max:
            score_max = score

        if episode != 0 and episode % render_every == 0:
            # print('SCORES SUM:', scores_sum, 'AVG:', scores_sum / render_every, 'MAX:', score_max)
            scores_sum = 0
            score_max = 0

        # Train (disabled: with epsilon = 0 the agent plays greedily and is never updated)
        # if episode % train_every == 0:
        #     agent.train(batch_size=batch_size, epochs=epochs)

        # Note: this runs after every episode, pausing 30 s between games
        print('Done!')
        sleep(30)

        # Logs
        if log_every and episode and episode % log_every == 0:
            avg_score = mean(scores[-log_every:])
            min_score = min(scores[-log_every:])
            max_score = max(scores[-log_every:])

            log.log(episode,
                    avg_score=avg_score,
                    min_score=min_score,
                    max_score=max_score)
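Since training is commented out and the agent is built with epsilon=0, this example plays purely greedily from whatever the network was initialised with; to evaluate a previously trained net, weights would have to be restored first. A minimal sketch, assuming agent.model is a Keras model and using a hypothetical file name:

    agent.model.load_weights("tetris_weights.h5")  # hypothetical weights file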
Example No. 6
def dqn(conf: AgentConf):
    env = Tetris()

    agent = DQNAgent(env.get_state_size(),
                     n_neurons=conf.n_neurons,
                     activations=conf.activations,
                     epsilon=conf.epsilon,
                     epsilon_min=conf.epsilon_min,
                     epsilon_stop_episode=conf.epsilon_stop_episode,
                     mem_size=conf.mem_size,
                     discount=conf.discount,
                     replay_start_size=conf.replay_start_size)

    timestamp_str = datetime.now().strftime("%Y%m%d-%H%M%S")
    # conf.mem_size = mem_size
    # conf.epochs = epochs
    # conf.epsilon_stop_episode = epsilon_stop_episode
    # conf.discount = discount
    log_dir = f'logs/tetris-{timestamp_str}-ms{conf.mem_size}-e{conf.epochs}-ese{conf.epsilon_stop_episode}-d{conf.discount}'
    log = CustomTensorBoard(log_dir=log_dir)

    print(f"AGENT_CONF = {log_dir}")

    scores = []

    episodes_wrapped: Iterable[int] = tqdm(range(conf.episodes))
    for episode in episodes_wrapped:
        current_state = env.reset()
        done = False
        steps = 0

        # update render flag
        render = bool(conf.render_every and episode % conf.render_every == 0)

        # game
        while not done and (not conf.max_steps or steps < conf.max_steps):
            next_states = env.get_next_states()
            best_state = agent.best_state(next_states.values())

            # find the action, that corresponds to the best state
            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            reward, done = env.hard_drop([best_action[0], 0],
                                         best_action[1],
                                         render=render)

            agent.add_to_memory(current_state, next_states[best_action],
                                reward, done)
            current_state = next_states[best_action]
            steps += 1

        # just return score
        scores.append(env.get_game_score())

        # train
        if episode % conf.train_every == 0:
            # n = len(agent.memory)
            # print(f" agent.memory.len: {n}")
            agent.train(batch_size=conf.batch_size, epochs=conf.epochs)

        # logs
        if conf.log_every and episode and episode % conf.log_every == 0:
            avg_score = mean(scores[-conf.log_every:])
            min_score = min(scores[-conf.log_every:])
            max_score = max(scores[-conf.log_every:])
            log.log(episode,
                    avg_score=avg_score,
                    min_score=min_score,
                    max_score=max_score)
    # save_model
    save_model(agent.model,
               f'{log_dir}/model.hdf',
               overwrite=True,
               include_optimizer=True)
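To reload the saved network later, Keras can restore it from the same path; a sketch assuming the TensorFlow/Keras save_model used above:

from tensorflow.keras.models import load_model

model = load_model(f'{log_dir}/model.hdf')  # same log_dir as used by save_model above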
Example No. 7
def dqn():
    episodes = 10000
    max_steps = None
    epsilon_stop_episode = 7000
    mem_size = 20000
    discount = 0.95
    batch_size = 512
    epochs = 1
    render_every = 1000
    log_every = 20
    replay_start_size = 2000
    train_every = 1
    n_neurons = [32, 32]
    render_delay = None
    activations = ['relu', 'relu', 'linear']

    env = Tetris()
    '''
    with open(r"saved_agents/pickled_new_agent_10000_7000", "rb") as input_file:
        agent = pickle.load(input_file)
        agent.epsilon = 0
    
    '''
    agent = DQNAgent(env.get_state_size(),
                     n_neurons=n_neurons,
                     activations=activations,
                     epsilon_stop_episode=epsilon_stop_episode,
                     mem_size=mem_size,
                     discount=discount,
                     replay_start_size=replay_start_size)
    agent.epsilon = 0
    '''
    hateris = DQNAgent(env.get_state_size()+1,
                     n_neurons=n_neurons, activations=activations,
                     epsilon_stop_episode=epsilon_stop_episode, mem_size=mem_size,
                     discount=discount, replay_start_size=replay_start_size)
    #env.hater = hateris
    '''

    log_dir = f'logs/tetris-nn={str(n_neurons)}-mem={mem_size}-bs={batch_size}-e={epochs}-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
    log = CustomTensorBoard(log_dir=log_dir)

    scores = []
    tot_max_score = 0
    for episode in tqdm(range(episodes)):
        current_state = env.reset()
        done = False
        steps = 0

        if render_every and episode % render_every == 0:
            render = True
        else:
            render = False

        # Game
        while not done and (not max_steps or steps < max_steps):
            next_states = env.get_next_states(env.current_piece)
            best_state = agent.best_state(next_states.values())

            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            reward, done = env.play(best_action[0],
                                    best_action[1],
                                    render=render,
                                    render_delay=render_delay)
            #agent.add_to_memory(current_state, next_states[best_action], reward, done)
            #hateris.add_to_memory(current_state+[env.current_piece], next_states[best_action]+[env.current_piece], -reward, done)
            current_state = next_states[best_action]
            steps += 1

        scores.append(env.get_game_score())

        # Train (disabled: this run only evaluates the greedy policy, epsilon = 0)
        # if episode % train_every == 0:
        #     agent.train(batch_size=batch_size, epochs=epochs)
        #     hateris.train(batch_size=batch_size, epochs=epochs)

        # Logs
        #if log_every and episode and episode % log_every == 0 and episode>101:
        if log_every and episode and episode % log_every == 0:
            avg_score = mean(scores[-log_every:])
            min_score = min(scores[-log_every:])
            max_score = max(scores[-log_every:])
            print(f"{episode} {avg_score} {min_score} {max_score}")
            '''if (tot_max_score < max_score):
                agent.save("dqnAgentMax10000.h5", episode)
                tot_max_score = max_score'''

    #agent.save("dqnAgent10000.h5", episode)

    # with open("saved_agents/pickled_new_agent_10000_7000", "wb") as output_file:
    #     pickle.dump(agent, output_file)
    plt.plot(scores)
    plt.show()
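The commented-out pickle calls show how a trained agent was meant to be persisted between runs; a cleaned-up sketch, assuming the DQNAgent object (including its Keras model) is picklable as those comments imply:

    import pickle

    # save after training
    with open("saved_agents/pickled_new_agent_10000_7000", "wb") as f:
        pickle.dump(agent, f)

    # reload for greedy evaluation
    with open("saved_agents/pickled_new_agent_10000_7000", "rb") as f:
        agent = pickle.load(f)
        agent.epsilon = 0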
Example No. 8
def dqn():
    trainingAgent = False
    trainingHater = False
    env = Tetris(trainingAgent or trainingHater)
    episodes = 2000
    max_steps = None
    epsilon_stop_episode = 1500
    mem_size = 20000
    discount = 0.95
    batch_size = 512
    epochs = 1
    render_every = 200 if (trainingAgent or trainingHater) else 10
    log_every = 50
    replay_start_size = 2000
    train_every = 1
    n_neurons = [32, 32]
    render_delay = None
    activations = ['relu', 'relu', 'linear']
    agent_save_filepath = "keras_saved_maxbump.h5"
    # hater_save_filepath = "hater_changed_reward.h5"
    hater_save_filepath = "hater_best.h5"

    # Avg 135 || reward function = 1 + (lines_cleared ** 2)*self.BOARD_WIDTH - (.1)*self._bumpiness(self.board)[0]/self.BOARD_WIDTH
    # 200 death penalty
    # agent_save_filepath = "keras_saved_maxbump.h5"

    # Avg 25 || reward function = 1 + (lines_cleared ** 2)*self.BOARD_WIDTH
    # 2 death penalty
    # agent_save_filepath = "keras_saved.h5"

    agent = DQNAgent(env.get_state_size(),
                     n_neurons=n_neurons,
                     activations=activations,
                     epsilon_stop_episode=epsilon_stop_episode,
                     mem_size=mem_size,
                     discount=discount,
                     replay_start_size=replay_start_size,
                     training=trainingAgent,
                     agent_save_filepath=agent_save_filepath)

    hateris = DQNAgent(env.get_state_size(),
                       n_neurons=n_neurons,
                       activations=activations,
                       epsilon_stop_episode=epsilon_stop_episode,
                       mem_size=mem_size,
                       discount=discount,
                       replay_start_size=replay_start_size,
                       training=trainingHater,
                       agent_save_filepath=hater_save_filepath)
    env.hater = hateris
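    # Adversarial "hateris" agent: same architecture as the main agent, but (when
    # training) its replay memory receives the negated reward (see add_to_memory below).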

    log_dir = f'logs/tetris-nn={str(n_neurons)}-mem={mem_size}-bs={batch_size}-e={epochs}-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
    log = CustomTensorBoard(log_dir=log_dir)

    scores = []

    for episode in tqdm(range(episodes)):
        current_state = env.reset()
        done = False
        steps = 0

        if render_every and episode % render_every == 0:
            render = True
        else:
            render = False

        # Game
        while not done and (not max_steps or steps < max_steps):
            next_states = env.get_next_states()
            best_state = agent.best_state(next_states.values())

            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            reward, done = env.play(best_action[0],
                                    best_action[1],
                                    render=render,
                                    render_delay=render_delay)
            if len(current_state) == env.get_state_size() - 1 and trainingAgent:
                toBeAdded = current_state + [env.next_piece]
            elif len(current_state) == env.get_state_size() - 1 and trainingHater:
                toBeAdded = current_state + [env.current_piece]
            else:
                toBeAdded = current_state
            if trainingAgent:
                agent.add_to_memory(toBeAdded, next_states[best_action],
                                    reward, done)
            if trainingHater:
                hateris.add_to_memory(toBeAdded, next_states[best_action],
                                      -reward, done)
            current_state = next_states[best_action]
            steps += 1

        scores.append(env.get_game_score())

        # Train
        if episode % train_every == 0 and trainingAgent:
            agent.train(batch_size=batch_size, epochs=epochs)
        if episode % train_every == 0 and trainingHater:
            hateris.train(batch_size=batch_size, epochs=epochs)

        # Logs
        if log_every and episode and episode % log_every == 0:
            avg_score = mean(scores[-log_every:])
            min_score = min(scores[-log_every:])
            max_score = max(scores[-log_every:])
            std_score = stdev(scores[-log_every:])
            print(f"{episode} Avg: {avg_score}   Min: {min_score}   Max: {max_score}   Std: {round(std_score, 2)}")

        if episode == epsilon_stop_episode and trainingAgent:
            agent.save_agent("agent_stopEps.h5")
        if episode == epsilon_stop_episode and trainingHater:
            hateris.save_agent("hater_stopEps.h5")

    if trainingAgent: agent.save_agent("real_agent.h5")
    if trainingHater: hateris.save_agent("real_hater.h5")
    plt.plot(scores)
    plt.show()
Example No. 9
def dqn():
    training = False
    env = Tetris(training)
    episodes = 2000
    max_steps = None
    epsilon_stop_episode = 1500
    mem_size = 20000
    discount = 0.95
    batch_size = 512
    epochs = 1
    render_every = 200 if training else 10
    log_every = 50
    replay_start_size = 2000
    train_every = 1
    n_neurons = [32, 32]
    render_delay = None
    activations = ['relu', 'relu', 'linear']
    agent_save_filepath = "keras_saved_maxbump.h5"  # referenced when constructing the agent below

    # with open("saved_agent", "rb") as input_file:
    #     agent = pickle.load(input_file)
    #     agent.epsilon = 0

    agent = DQNAgent(env.get_state_size(),
                     n_neurons=n_neurons,
                     activations=activations,
                     epsilon_stop_episode=epsilon_stop_episode,
                     mem_size=mem_size,
                     discount=discount,
                     replay_start_size=replay_start_size,
                     training=training,
                     agent_save_filepath=agent_save_filepath)

    log_dir = f'logs/tetris-nn={str(n_neurons)}-mem={mem_size}-bs={batch_size}-e={epochs}-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
    log = CustomTensorBoard(log_dir=log_dir)

    scores = []

    for episode in tqdm(range(episodes)):
        current_state = env.reset()
        done = False
        steps = 0

        if render_every and episode % render_every == 0:
            render = True
        else:
            render = False

        # Game
        while not done and (not max_steps or steps < max_steps):
            next_states = env.get_next_states()
            best_state = agent.best_state(next_states.values())

            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            reward, done = env.play(best_action[0],
                                    best_action[1],
                                    render=render,
                                    render_delay=render_delay)
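            # The network expects one extra feature (the upcoming piece), so the
            # raw state is padded with env.next_piece before being stored in memory.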
            if len(current_state) == env.get_state_size() - 1:
                toBeAdded = current_state + [env.next_piece]
            else:
                toBeAdded = current_state
            if training:
                agent.add_to_memory(toBeAdded, next_states[best_action],
                                    reward, done)
            current_state = next_states[best_action]
            steps += 1

        scores.append(env.get_game_score())

        # Train
        if episode % train_every == 0 and training:
            agent.train(batch_size=batch_size, epochs=epochs)

        # Logs
        if log_every and episode and episode % log_every == 0:
            avg_score = mean(scores[-log_every:])
            min_score = min(scores[-log_every:])
            max_score = max(scores[-log_every:])
            std_score = stdev(scores[-log_every:])
            print(f"{episode} Avg: {avg_score}   Min: {min_score}   Max: {max_score}   Std: {round(std_score, 2)}")

        if episode == epsilon_stop_episode:
            agent.save_agent("keras_saved_stopEps.h5")

    if training: agent.save_agent("keras_saved.h5")
    plt.plot(scores)
    plt.show()
Example No. 10
def dqn():
    env = Tetris()
    episodes = 4000
    max_steps = None
    batch_size = 512
    epochs = 1
    render_every = 50
    log_every = 50
    replay_start_size = 2000
    train_every = 1
    render_delay = None

    algo = DQNAlgorithm(env.get_state_size())

    log_dir = f'logs/tetris-nn={str([32, 32])}-mem={20000}-bs={batch_size}-e={epochs}-{datetime.now().strftime("%Y%m%d-%H%M%S")}'
    log = CustomTensorBoard(log_dir=log_dir)

    scores = []
    times = []

    for episode in tqdm(range(episodes)):
        current_state = env.reset()
        done = False
        steps = 0

        if render_every and episode % render_every == 0:
            render = True
        else:
            render = False

        # Game
        while not done and (not max_steps or steps < max_steps):
            next_states = env.get_next_states()
            best_state = algo.best_state(next_states.values())
            
            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            reward, done = env.play(best_action[0], best_action[1], render=render,
                                    render_delay=render_delay)
            
            algo.add_to_memory(current_state, next_states[best_action], reward, done)
            current_state = next_states[best_action]
            steps += 1

        scores.append(env.get_game_score())
        times.append(env.get_game_time())


        # Train
        if episode % train_every == 0:
            algo.train(batch_size=batch_size, epochs=epochs)

        # Logs
        if log_every and episode and episode % log_every == 0:
            recent_scores = scores[-log_every:]
            recent_times = times[-log_every:]

            avg_score = mean(recent_scores)
            min_score = min(recent_scores)
            max_score = max(recent_scores)

            avg_time = mean(recent_times)
            min_time = min(recent_times)
            max_time = max(recent_times)

            log.log(episode, avg_score=avg_score, min_score=min_score, max_score=max_score,
                    avg_time=avg_time, min_time=min_time, max_time=max_time)
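Unlike examples 7-9, this variant never plots its results; a matching postscript for the end of dqn(), assuming matplotlib.pyplot is imported as plt:

    plt.plot(scores, label='score')
    plt.plot(times, label='game time')
    plt.legend()
    plt.show()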