Example #1
    # return stdscr


if __name__ == '__main__':
    # Curses standard screen
    stdscr = curses.initscr()

    # Init environment
    width, height = 10, 20  # standard tetris friends rules
    env = TetrisEngine(width, height)

    # Play games on repeat
    while True:
        init()
        stdscr.clear()
        env.clear()
        db = play_game()

        # Return to terminal
        terminate()
        # Should the game info be saved?
        if save_game():
            try:
                fr = open('training_data.npy', 'rb')
                x = np.load(fr)
                fr.close()
                fw = open('training_data.npy', 'wb')
                x = np.concatenate((x, db))
                # print('Saving {0} moves...'.format(len(db)))
                np.save(fw, x)
                print('{0} data points in the training set'.format(len(x)))
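
The snippet above appends each finished game's records to training_data.npy. As a rough sketch (assuming numpy is available as np, and that play_game() packs the move records into the db array as shown), the saved file could be read back for later training like this:

import numpy as np

# Load the accumulated game records written by the loop above.
# The exact record layout depends on what play_game() stores in `db`.
with open('training_data.npy', 'rb') as fr:
    data = np.load(fr, allow_pickle=True)
print('{0} data points available for training'.format(len(data)))
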
Example #2
def main(episode, load, learn, debug, random_rate, session):
    load_model = load
    print("load model", load_model, "learn", learn, "debug", debug, "episode",
          episode)

    width, height = 7, 14  # smaller than the standard 10x20 Tetris Friends board
    env = TetrisEngine(width, height)
    action_count = 7
    agent = Agent(lr=1e-4,
                  input_dims=width * height,
                  gamma=0.5,
                  n_actions=action_count,
                  l1_size=512,
                  l2_size=128)
    if session:
        model_filename = "%s-trained_model.torch" % session
    else:
        model_filename = "trained_model.torch"
    parameter_size = sum(p.numel() for p in agent.policy.parameters())
    print("network parameter size:", parameter_size)

    action_idx = 0

    if load_model:
        agent.policy.load_state_dict(T.load(model_filename))
    for i in range(episode):
        done = False
        score = 0
        state = env.clear()
        counter = 0
        while not done:
            counter += 1
            action, probs = agent.choose_action(state)
            prob = probs[action].item()
            state, reward, done = env.step(action)
            agent.store_rewards(reward)
            score += reward
            if debug:
                stdscr = curses.initscr()
                stdscr.clear()
                stdscr.addstr(str(env))
                stdscr.addstr('\ncumulative reward: ' + str(score))
                stdscr.addstr('\nreward: ' + str(reward))
                stdscr.refresh()  # flush the curses buffer so the board is actually drawn
                time.sleep(.2)
                continue

            if not debug and i % 100 == 0 and counter % 100 == 1:
                idx2direction = {
                    0: "left",
                    1: "right",
                    2: "hard_drop",
                    3: "soft_drop",
                    4: "rotate_left",
                    5: "rotate_right",
                    6: "idle"
                }
                probs_str = ""
                for z, item in enumerate(probs):
                    probs_str += "%s:%0.2f, " % (idx2direction[z], item.item())
                print(probs_str)
                print('episode: ', i, 'counter: ', counter,
                      'reward %0.3f' % reward,
                      'action: %s (%0.2f)' % (action, prob))
            writer.add_scalar("action prob", prob, action_idx)
            action_idx += 1

        if not debug and i % 100 == 0:
            print('episode: ', i, 'score %0.3f' % score)
        writer.add_scalar("final score", score, i)
        if learn:
            agent.learn()
            if i % 1000 == 0:
                T.save(agent.policy.state_dict(), model_filename)
    writer.close()
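
Example #2 drives the environment through env.clear(), which returns the initial state, and env.step(action), which returns a (state, reward, done) tuple. A minimal random-policy sketch of that same loop (assuming TetrisEngine is importable as in the examples and the same 7 discrete actions) might look like:

import random

# Minimal interaction loop against the environment API used above.
env = TetrisEngine(7, 14)
state = env.clear()                        # reset and get the initial board state
done, score = False, 0.0
while not done:
    action = random.randrange(7)           # 0..6: left/right/drops/rotates/idle
    state, reward, done = env.step(action)
    score += reward
print('random-policy score:', score)
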
Example #3
            print("=> loaded checkpoint '{}' (epoch {})".format(
                CHECKPOINT_FILE, start_epoch))
        else:
            print("=> no checkpoint found at '{}'".format(CHECKPOINT_FILE))

    ######################################################################
    #
    # Below, you can find the main training loop. At the beginning we reset
    # the environment and initialize the ``state`` variable. Then, we sample
    # an action, execute it, observe the next board state and the reward
    # returned by the engine, and optimize our model once. When the episode
    # ends (the board tops out), we restart the loop.

    f = open('log.out', 'w+')
    for i_episode in count(start_epoch):
        # Initialize the environment and state
        state = FloatTensor(engine.clear()[None, None, :, :])

        score = 0
        for t in count():
            # Select and perform an action
            action = select_action(state).type(LongTensor)

            # Observations
            last_state = state
            state, reward, done = engine.step(action[0, 0])
            state = FloatTensor(state[None, None, :, :])

            # Accumulate reward
            score += int(reward)

            reward = FloatTensor([float(reward)])
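
The state[None, None, :, :] indexing in Example #3 only adds batch and channel dimensions so the board can be fed to a convolutional network. A quick illustration with a dummy array standing in for the real engine output:

import numpy as np

board = np.zeros((20, 10))            # dummy board instead of engine.clear()/step() output
batched = board[None, None, :, :]     # -> shape (1, 1, 20, 10): (batch, channel, H, W)
print(batched.shape)
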
Example #4

def print_placement(state):
    s = np.asarray(state)
    s = np.swapaxes(s, 1, 0)
    print(s)


agent = FixedPolicyAgent()
if __name__ == '__main__':
    # Check if user specified to resume from a checkpoint
    start_epoch = 0
    best_score = float('-inf')
    for i_episode in count(start_epoch):
        # Initialize the environment and state
        state = engine.clear()
        last_state = None
        score = 0
        cl = 0
        for t in count():
            # Select and perform an action
            actions_name, placement, actions = agent.select_action(
                engine, engine.shape, engine.anchor, engine.board)
            # Observations
            state, reward, done, cleared_lines, sent_lines = engine.step_to_final(
                actions)
            if not done:
                last_state = state
            # Accumulate reward
            score += reward
            cl += cleared_lines
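
print_placement in Example #4 swaps the axes before printing so the board is rendered row by row. A quick check with a dummy array (not the real engine state) shows the effect of the swap:

import numpy as np

dummy = np.arange(6).reshape(2, 3)      # stands in for a (width, height) board array
print(dummy.shape)                      # (2, 3)
print(np.swapaxes(dummy, 1, 0).shape)   # (3, 2): columns become rows for display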