Example #1
def run_game():
    env = Tetris()
    episodes = 2000                  # number of training episodes (games)
    max_steps = None                 # per-episode step limit (None = unlimited)
    discount = 0.98                  # reward discount factor (gamma)
    replay_mem_size = 20000          # maximum replay memory size
    minibatch_size = 512             # size of each training minibatch
    epsilon = 1                      # initial exploration rate
    epsilon_min = 0                  # final exploration rate
    epsilon_stop_episode = 1500      # episode at which epsilon reaches epsilon_min
    learning_rate = 5e-3             # optimizer learning rate
    epochs = 1                       # training epochs per call to agent.train()
    show_every = 50                  # render every N episodes
    log_every = 50                   # log aggregate stats every N episodes
    replay_start_size = 2000         # transitions required before training starts
    train_every = 1                  # train after every episode
    hidden_dims = [64, 64]           # hidden layer sizes
    activations = ['relu', 'relu', 'linear']  # per-layer activations (linear output)

    agent = DQNAgent(env.get_state_size(), discount=discount,
                     replay_mem_size=replay_mem_size,
                     minibatch_size=minibatch_size, epsilon=epsilon,
                     # epsilon_decay=epsilon_decay,
                     epsilon_min=epsilon_min,
                     epsilon_stop_episode=epsilon_stop_episode,
                     learning_rate=learning_rate, hidden_dims=hidden_dims,
                     activations=activations,
                     replay_start_size=replay_start_size)

    log_dir = f'log/tetris-{datetime.now().strftime("%Y%m%d-%H%M%S")}-nn={str(hidden_dims)}-mem={replay_mem_size}-bs={minibatch_size}-discount={discount}'
    log = ModifiedTensorBoard(log_dir=log_dir)

    scores = []
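    # Main training loop: play one full game per episode, then train the agent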
    for episode in tqdm(range(episodes)):
        current_state = env.reset_game()
        done = False
        step = 0
        log.step = episode

        show = bool(show_every and episode % show_every == 0)

        # Run the game until either game over or we've hit max number of steps
        while not done and (not max_steps or step < max_steps):
            next_states = env.get_next_states()
            best_state = agent.best_state(next_states.values())

            best_action = None
            # action is (x,i), state is [lines_cleared, holes, total_bumpiness, sum_height]
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            # reward is the score, done is gameover status
            reward, done = env.play_game(best_action[0],
                                         best_action[1],
                                         show=show)
            if show:
                env.show()
            agent.update_replay_memory(current_state, best_action,
                                       next_states[best_action], reward, done)

            # move to next timestep
            current_state = next_states[best_action]
            step += 1
        if show:
            # After the game is over, report the episode's score and epsilon
            print("Episode %d  score: %d  epsilon: %.2f" %
                  (episode, env.get_game_score(), agent.epsilon))
        scores.append(env.get_game_score())

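        # Fit the agent's network on the collected transitions (the agent
        # presumably samples minibatches from replay memory once it holds
        # at least replay_start_size entries)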
        agent.train(epochs=epochs)

        if log_every and episode % log_every == 0:
            avg_score = mean(scores[-log_every:])
            min_score = min(scores[-log_every:])
            max_score = max(scores[-log_every:])

            log.update_stats(avg_score=avg_score,
                             min_score=min_score,
                             max_score=max_score)

        if env.get_game_score() >= MIN_SCORE:
            if not os.path.exists('models/'):
                os.makedirs('models/')
            agent.model.save(
                f'models/eps_{episode}__nn_{hidden_dims}__bs_{minibatch_size}__score_{env.get_game_score()}__{int(time.time())}.h5'
            )
Example #2
        if np.random.random() > epsilon:
            # Exploit: take the action with the highest predicted Q-value
            action = np.argmax(agent.get_qs(current_state))
        else:
            # Get random action
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)

        new_state, reward, done = env.step(action)

        # Accumulate the reward for this episode
        episode_reward += reward

        if SHOW_PREVIEW:  # and not episode % AGGREGATE_STATS_EVERY:
            env.render()

        # Every step we update replay memory and train main network
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done, step)

        current_state = new_state
        step += 1

    # Append episode reward to a list and log stats (every given number of episodes)
    ep_rewards.append(episode_reward)
    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        recent_rewards = ep_rewards[-AGGREGATE_STATS_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        min_reward = min(recent_rewards)
        max_reward = max(recent_rewards)
        agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

    # Save a model checkpoint every 50 episodes
    if episode % 50 == 0:
        agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')
Example #3
        env = FireResetEnv(env)
    env = ProcessFrame84(env)
    env = FrameStack(env, 4)
    env = ClippedRewardsWrapper(env)
    return env


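# Build the Pong environment with the preprocessing wrappers defined above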
env = wrap_dqn(gym.make('PongNoFrameskip-v4'))

agent = DQNAgent(env=env, num_actions=NUM_ACTIONS, lr=LR, discount=GAMMA)

# Load model
# agent.load_model(weights_file="snaps/model")

# Train agent
agent.train(TRAIN_STEPS, weights_file="snaps/model")

# Evaluate
success = 0
for tr in range(TRIALS):
    state = env.reset()
    t = 0
    acc_r = 0
    while True:
        env.render()
        action = agent.act(state)
        state, reward, done, _ = env.step(action)
        acc_r += reward
        t += 1
        if done:
            print("Trial {} finished after {} timesteps".format(tr, t))
            break
Example #4
                             "epsilon_min": epsilon_min,
                             "batch_size": batch_size,
                             "memory_size": memory_size,
                             "name": name
                         },
                         name=name,
                         allow_val_change=True)

        # Utilize the hyperparameters of the model like this: config.parameter
        config = wandb.config

        model = DQNAgent(env, config, epsilon, training_episodes,
                         testing_episodes, frames)

        hyper_param_counter += 1
        model.train()
        print("Run {} of {}.".format(hyper_param_counter, total_runs))
        model_dir = "saved_models"
        model_save_name = model_dir + "/LR_{}_LS_{}_BS_{}_MS_{}_Timestamp_{}sb.h5".format(
            learning_rate, layer_size, batch_size, memory_size,
            int(time.time()))
        model.save(model_save_name)

        #---------------------------------------------------------------------------------------

        # This configuration previously finished training in 242 episodes
        learning_rate = 0.001
        layer_size = 256
        batch_size = 64
        memory_size = 50_000
Example #5
            premio = 0
            informacion = env.get_info()
            antiguo_statistics = informacion['statistics']
            board = env.board()
            mejor_estado = board_prop(board)[:]
            if lineas_completadas < informacion['number_of_lines'] and not terminado:
                premio += 40 * (informacion['number_of_lines'] - lineas_completadas)

                lineas_completadas = informacion['number_of_lines']
                state, reward, terminado, info = env.step(0)
                agent.add_memoria(estado, mejor_estado, premio, terminado)

            agent.add_memoria(estado, mejor_estado, premio, terminado)
            estado = mejor_estado[:]
            pieza_colocada = True
            eliminado = False

    puntuacion = informacion['score']
    file.write(str(puntuacion) + ",")

    if puntuacion > puntuacion_max:
        puntuacion_max = puntuacion
        agent.modelo.save('modelo_max.modelo')
    if episodio % entrenar_cada == 0:
        agent.train(batch_size=128, epochs=1)

file.close()
env.close()
Example #6
                                        informacion['number_of_lines'])
                lineas_completadas = informacion['number_of_lines']
                state, reward, terminado, info = env.step(0)
                agent.add_memoria(estado, mejor_estado, premio, terminado)
                #movimientos.append([estado,mejor_estado,premio,terminado,valor])

            agent.add_memoria(estado, mejor_estado, premio, terminado)
            #ant_casillas_para_completar=casillas_para_completar
            #movimientos.append([estado,mejor_estado,premio,terminado,valor])
            estado = mejor_estado[:]
            pieza_colocada = True

    puntuacion = informacion['score']
    file.write(str(puntuacion) + ",")
    #if puntuacion>puntuacion_max*0.5:
    if puntuacion > puntuacion_max:
        puntuacion_max = puntuacion
        agent.modelo.save('modelo_max.modelo')

    # Train the agent on the data collected from the game
    if episodio % entrenar_cada == 0:
        agent.train(batch_size=500,
                    epochs=1,
                    puntuacion=lineas_completadas,
                    q_actual=valor)
    fin = time.perf_counter()
    print(fin - inicio)  # elapsed wall-clock time for this episode

file.close()
env.close()