Example #1
def train_ql(size, lr, rd, eps_start=1.0, eps_end=0.05, eps_decay=0.999):
    env = gym.make('game2048-v0', size=size)
    agent = model.QLearning(env.action_space, learning_rate=lr, reward_decay=rd)
    total_steps = 0
    total_scores = 0
    highest_score = 0
    #  trials = 1 * 100000 * (size ** 2)
    trials = 400000
    rewards_window = deque(maxlen=100)
    scores_window = deque(maxlen=100)
    eps = eps_start

    for trial in range(1, trials+1):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            total_steps += 1
            action = agent.choose_action(obs, eps)  # obs is already a stringified board
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_)
            obs = obs_
            rewards += reward
            if done:
                break

        #env.render()
        eps = max(eps_end, eps_decay * eps)
        rewards_window.append(rewards)
        scores_window.append(env.get_score())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
        msg = ('\rEpisode {}\t total_steps: {}\t Average Rewards: {:.2f}\t '
               'Average Scores: {:.2f}\t eps: {:.4f}'.format(
                   trial, total_steps, np.mean(rewards_window),
                   np.mean(scores_window), eps))
        print(msg, end="")
        if trial % 100 == 0:
            print(msg)  # keep one line of history every 100 episodes

    eval(env, agent, 1000, render=False)
    print(f'table_len: {len(agent.q_table)} steps: {total_steps} avg_score: {total_scores / trials} \
highest_score: {highest_score} at size: {size} lr: {lr} reward_decay: {rd}')
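
The `model.QLearning` class used above is not shown in these listings. A minimal sketch of a tabular agent that would fit this training loop, assuming the standard Q-learning update Q(s,a) += lr * (r + gamma * max_a' Q(s',a') - Q(s,a)) and zero future value for the 'terminal' marker (the repository's actual class may differ):

# Hypothetical sketch of model.QLearning's interface, not its actual code.
import random
from collections import defaultdict

import numpy as np

class QLearning:
    def __init__(self, action_space, learning_rate=0.1, reward_decay=0.9):
        self.actions = list(range(action_space.n))
        self.lr = learning_rate
        self.gamma = reward_decay
        self.q_table = defaultdict(lambda: np.zeros(len(self.actions)))

    def choose_action(self, obs, eps=0.0):
        # Epsilon-greedy over the Q-values of the stringified board state.
        if random.random() < eps:
            return random.choice(self.actions)
        return int(np.argmax(self.q_table[obs]))

    def learn(self, obs, action, reward, obs_):
        # Q(s,a) += lr * (r + gamma * max_a' Q(s',a') - Q(s,a));
        # 'terminal' ends the episode, so its future value is zero.
        q_next = 0.0 if obs_ == 'terminal' else np.max(self.q_table[obs_])
        target = reward + self.gamma * q_next
        self.q_table[obs][action] += self.lr * (target - self.q_table[obs][action])
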
Example #2
def train_sarsa(size, lr, rd):
    env = gym.make('game2048-v0', size=size)
    agent = model.Sarsa(env.action_space, learning_rate=lr, reward_decay=rd)
    total_steps = 0
    total_scores = 0
    highest_score = 0
    trials = 1 * 1000 * (size ** 2)

    for trial in range(trials):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        action = agent.choose_action(obs)
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            total_steps += 1
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            action_ = agent.choose_action(obs_, True)
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_, action_)
            obs = obs_
            action = action_
            rewards += reward
            if done:
                break

        #env.render()
        print(f'\rTrial {trial} finished in {stepno} steps highest: '
              f'{env.highest()} rewards: {rewards}', end="")
        # `highest` and `targets` are presumably module-level dicts recording
        # the trials that reached the near-target and target tiles.
        if env.highest() >= 2 ** (size ** 2 - 1):
            highest[trial] = env.highest()
            if env.highest() >= 2 ** (size ** 2):
                targets[trial] = env.highest()
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()

    eval(env, agent, render=False)
    print(f'table_len: {len(agent.q_table)} steps: {total_steps} avg_score: {total_scores / trials} \
highest_score: {highest_score} at size: {size} lr: {lr} reward_decay: {rd}')
    print(f'highest len: {len(highest)} prob: {len(highest) * 1.0 / trials} \
target len: {len(targets)} prob: {len(targets) * 1.0 / trials}')
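
SARSA differs from the Q-learning loop above only in its update target: it bootstraps from the next action the agent will actually take rather than the greedy maximum, which is why `action_` is selected before `learn` is called. A sketch of what `model.Sarsa.learn` presumably computes, reusing the hypothetical QLearning sketch above (again an assumption, not the repository's code):

# Hypothetical on-policy SARSA variant of the QLearning sketch above.
class Sarsa(QLearning):
    def learn(self, obs, action, reward, obs_, action_):
        # Q(s,a) += lr * (r + gamma * Q(s',a') - Q(s,a)), where a' is the
        # action actually chosen for s' (pre-selected by choose_action).
        q_next = 0.0 if obs_ == 'terminal' else self.q_table[obs_][action_]
        target = reward + self.gamma * q_next
        self.q_table[obs][action] += self.lr * (target - self.q_table[obs][action])
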
Example #3
def eval(env, agent, times=1000, render=False):
    # Disabled debug hook: dump the agent's explored states before evaluation.
    if False:
        write_explore(agent, 'explore_old.file')

    highest_score = 0
    total_scores = 0
    size = env.get_size()
    scores = []
    max_tiles = []

    for i in range(times):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())

        while True:
            action = agent.choose_action(obs)
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            if render:
                print(f'action is: {action} {obs} {obs_}')
                env.render()
            if obs_ == obs:
                # The move left the board unchanged (an illegal/no-op action);
                # keep learning during evaluation so the agent gets unstuck.
                agent.learn(obs, action, reward, obs_)
            obs = obs_
            if done:
                break

        env.render()  # show the final board of each episode
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()

    if times > 0:
        plot_score(scores, max_tiles)
        print(f'eval avg_score: {total_scores / times} highest_score: {highest_score}')

    # Disabled debug hook: dump the explored states again after evaluation.
    if False:
        write_explore(agent, 'explore_new.file')
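
`plot_score` is called by the eval and training functions but is not included in these listings. A plausible matplotlib implementation, assuming it charts per-episode scores alongside the max tile reached (a hypothetical helper, not the repository's code):

# Hypothetical plot_score helper: scores on the left axis, max tiles on the right.
import matplotlib.pyplot as plt

def plot_score(scores, max_tiles):
    fig, ax = plt.subplots()
    ax.plot(scores, label='score')
    ax.set_xlabel('episode')
    ax.set_ylabel('score')
    if max_tiles:
        ax2 = ax.twinx()
        ax2.plot(max_tiles, 'r.', label='max tile')
        ax2.set_ylabel('max tile')
    fig.legend()
    plt.show()
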
Example #4
def test_env(model, vis=False):
    # Relies on module-level `env` and `device`; `model` is an actor-critic
    # network that returns (action distribution, state value).
    state = env.reset()
    if vis: env.render()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)  # the critic's value estimate is unused here
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        if vis: env.render()
        total_reward += reward
    return total_reward, env.get_score()
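
`test_env` expects `model(state)` to return a `(distribution, value)` pair, so `dist.sample()` yields a discrete action; this is the usual actor-critic interface in PPO/A2C code. A minimal sketch of such a network (an assumption about the unshown model, which in practice is likely convolutional):

# Hypothetical actor-critic module matching test_env's expectations.
import torch.nn as nn
from torch.distributions import Categorical

class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden=128):
        super().__init__()
        self.shared = nn.Sequential(nn.Linear(num_inputs, hidden), nn.ReLU())
        self.actor = nn.Linear(hidden, num_actions)  # action logits
        self.critic = nn.Linear(hidden, 1)           # state-value estimate

    def forward(self, x):
        h = self.shared(x)
        return Categorical(logits=self.actor(h)), self.critic(h)
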
Example #5
def eval(env, agent, times=1000, render=False):
    highest_score = 0
    scores = []
    max_tiles = []
    eps = 0.0  # act greedily during evaluation

    random = False  # when True, force a random action on the next step
    for i in range(times):
        obs = env.reset()
        while True:
            action, action_values = agent.choose_action(obs, eps, rand=random)
            obs_, reward, done, _ = env.step(action)
            if render:
                env.render()
            if str(obs_) == str(obs):
                # The move left the board unchanged; force a random action on
                # the next step so evaluation cannot deadlock on a greedy no-op.
                random = True
                print(
                    f'action is: {action} {reward} {action_values} {obs} {obs_}'
                )
            else:
                random = False
            obs = obs_
            if done:
                break

        env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()

    if times > 0:
        plot_score(scores, max_tiles)
        print(
            f'eval avg_score: {np.mean(scores)} highest_score: {highest_score}'
        )
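
The `rand` flag above guards against evaluation deadlock: a purely greedy policy can keep choosing a move that leaves the board unchanged, so the step after a no-op forces a random action. A sketch of a `choose_action` supporting that flag; the attribute names (`qnetwork_local`, `device`, `action_size`) are placeholders, not necessarily the repository's:

# Hypothetical epsilon-greedy choose_action with a forced-random escape hatch.
import random

import numpy as np
import torch

def choose_action(self, obs, eps=0.0, rand=False):
    state = torch.from_numpy(np.asarray(obs)).float().unsqueeze(0).to(self.device)
    with torch.no_grad():
        action_values = self.qnetwork_local(state)
    if rand or random.random() < eps:
        # Forced (after a no-op move) or epsilon-triggered exploration.
        action = random.randrange(self.action_size)
    else:
        action = int(action_values.argmax(dim=1).item())
    return action, action_values.cpu().numpy()
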
Example #6
def train_dqn(size, agt, eps_start=1.0, eps_end=0.05, eps_decay=0.999):
    # NOTE: `agt` is currently unused; the DQN agent is constructed below.
    env = gym.make('game2048-v0', size=size, norm=FLAGS.norm)
    env.seed(1)

    if FLAGS.norm:
        # One-hot encoding: one plane per tile exponent plus an empty plane.
        channels = size * size + 2
    else:
        channels = 1
    # 4 discrete actions (up/down/left/right); the 0 is presumably a seed.
    agent = model.DQNAgent(size, channels, 4, 0, FLAGS.double_q, FLAGS.dueling)
    if FLAGS.model_file:
        print(f'load {FLAGS.model_file}')
        agent.load(FLAGS.model_file)
    total_steps = 0
    total_scores = 0
    highest_score = 0
    trials = 10000
    eps = eps_start
    scores_window = deque(maxlen=WINDOWS_SIZE)
    rewards_window = deque(maxlen=WINDOWS_SIZE)
    scores = []
    sd_name = f'model_{size}x{size}.checkpoint'

    random = False
    for trial in range(1, trials + 1):
        obs = env.reset()
        stepno = 0
        rewards = 0
        loss = 0
        while True:
            stepno += 1
            total_steps += 1
            action, _ = agent.choose_action(obs, eps, rand=random)
            obs_, reward, done, _ = env.step(action)
            random = np.all(obs == obs_)  # board unchanged -> explore next step
            loss = agent.step(obs, action, reward, obs_, done)
            obs = obs_
            rewards += reward
            if done:
                break

        eps = max(eps_end, eps * eps_decay)
        rewards_window.append(rewards)
        scores_window.append(env.get_score())
        scores.append(rewards)
        #  env.render()
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
        msg = ('\rEpisode {}\t Steps: {}\t\t Average Reward: {:.2f}\t\t '
               'Average Scores: {:.2f}\t loss: {:.2f}\t highest: {}\t '
               'eps: {:.4f}'.format(trial, total_steps, np.mean(rewards_window),
                                    np.mean(scores_window), loss,
                                    highest_score, eps))
        print(msg, end="")
        if trial % WINDOWS_SIZE == 0:
            print(msg)  # persist one line of history per window
        if trial % 1000 == 0:
            agent.save(sd_name)

    eval(env, agent, 1000, render=False)
    print(f'steps: {total_steps} avg_score: {total_scores / trials} \
highest_score: {highest_score} at size: {size}')
    plot_score(scores, [])
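
The `channels = size * size + 2` branch suggests how `norm=True` encodes the board: one one-hot plane per reachable tile exponent plus a plane for empty cells (on a size x size board the largest tile is 2 ** (size ** 2 + 1) if 4-tiles can spawn). A sketch of such an encoding, offered as an assumption about the env's `norm` option rather than its actual code:

# Hypothetical one-hot board encoding consistent with channels = size*size + 2.
# Plane 0 marks empty cells; plane k marks tiles of value 2**k.
import numpy as np

def encode_board(board, size):
    channels = size * size + 2
    planes = np.zeros((channels, size, size), dtype=np.float32)
    for r in range(size):
        for c in range(size):
            v = int(board[r, c])
            k = 0 if v == 0 else int(np.log2(v))
            planes[k, r, c] = 1.0
    return planes
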