Example No. 1
def mc_rollout(state, game: Game_2048, rollouts=1, policy=random_policy):
    """Estimate the value of `state` as the average return of full playouts under `policy`."""
    cum_rews = []
    for _ in range(rollouts):
        game.state = state.copy()  # restart every rollout from the original state
        done = False
        cum_rew = 0
        while not done:
            action = policy(game)  # use the policy that was passed in
            _, reward, done = game.step(action)
            cum_rew += reward
        cum_rews.append(cum_rew)
    return sum(cum_rews) / len(cum_rews)
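A minimal usage sketch (hypothetical call site: it assumes Game_2048.reset() returns the starting state, as in the later training loops, and reuses the random_policy default):

game = Game_2048()
start_state = game.reset()
# Average return of 10 random playouts from the starting position.
value_estimate = mc_rollout(start_state, game, rollouts=10, policy=random_policy)
print(value_estimate)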
Example No. 2
 def __init__(self,
              game=Game_2048(),
              playout_policy=get_policy_func("Random_agent")):
     self.game = game
     self.exploration_constant = 200
     self.playout_policy = playout_policy
     self.root = probabilistic_Node(self.game.state, 1)
Example No. 3
 def rollout_batch(self, batch_size):
     games = [Game_2048() for _ in range(batch_size)]
     states = [[game.state] for game in games]
     still_running = list(range(batch_size))
     rewardlist = [[] for _ in games]
     while len(still_running) > 0:
         """eval_batch = []
         evaluations = np.zeros((len(still_running),self.game.action_space.n))
         for sn in range(len(still_running)):
             i = still_running[sn]
             for a in range(games[i].action_space.n):
                 new_state,reward,done = games[i].check_update(games[i].state,a)
                 if done:
                     evaluations[sn,a]=reward
                 else:
                     nn_input = self.game.convert_to_nn_input(new_state)
                     eval_batch.append(nn_input)
                     evaluations[sn,a] = reward
         if len(eval_batch)>0:
             predictions = self.model.predict(np.array(eval_batch)).reshape(len(eval_batch))
         pindex = 0
         for e in range(evaluations.shape[0]):
             for a in range(evaluations.shape[1]):
                 if evaluations[e,a]!=self.game.done_reward:
                     #if e==0 and 0 in still_running:
                     #    log_file.write(f"action {BACKMAP[a]}, reward: {evaluations[e,a]}, value: {predictions[pindex]}\n")
                     evaluations[e,a] += predictions[pindex]
                     pindex+=1
                 #else:
                     #if e==0 and 0 in still_running:
                     #    log_file.write(f"action {BACKMAP[a]}, reward: {evaluations[e,a]}, value: done\n")"""
         for sn in range(len(still_running) - 1, -1, -1):
             i = still_running[sn]
             action = None
             best_val = -np.inf
             for a in range(games[i].action_space.n):
                 _, reward, _ = games[i].check_update(games[i].state, a)
                 if reward > best_val:
                     best_val = reward
                     action = a
             new_state, reward, done = games[i].check_update(
                 games[i].state, action)
             if done:
                 del still_running[sn]
                 continue
             games[i].state = new_state
             states[i].append(new_state.copy())
             games[i].spawn_number()
             rewardlist[i].append(reward)
     cum_rewards = [sum(x) for x in rewardlist]
     for i in range(len(states)):
         for j in range(len(states[i])):
             if j < len(states[i]) - 1:
                 self.memory.append(
                     (states[i][j], rewardlist[i][j], states[i][j + 1]))
     return cum_rewards
 def __init__(self, game=Game_2048()):
     self.game = game
     self.memory = deque(maxlen=100000)
     self.learning_rate = 0.01
     self.batch_size = 4096
     self.rollout_batch_size = 400
     self.train_per_it = 100
     self.train_start = 10000
     self.model = mlp(self.game.space_1d.n,
                      5,
                      256,
                      1,
                      lr=self.learning_rate)
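The mlp factory itself does not appear in these snippets; a plausible Keras sketch matching the call mlp(input_size, n_layers, width, output_size, lr=...) above is given below. The meaning of the positional arguments is an assumption.

from tensorflow import keras

def mlp(input_size, n_layers, width, output_size, lr=0.01):
    # Hypothetical reconstruction: a plain fully connected network trained with MSE.
    model = keras.Sequential()
    model.add(keras.layers.Dense(width, activation="relu", input_shape=(input_size,)))
    for _ in range(n_layers - 1):
        model.add(keras.layers.Dense(width, activation="relu"))
    model.add(keras.layers.Dense(output_size, activation="linear"))
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr), loss="mse")
    return model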
Example No. 5
    def __init__(self,game=Game_2048()):
        #Hyperparams
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 0
        self.epsilon_decay = 0
        self.epsilon_min = 0.05
        self.batch_size = 64
        self.soft_update_rate = 0.01
        self.train_start = 1000
        #most simple replay memory
        self.memory = deque(maxlen=10000)

        self.state_size = game.observation_space.n
        self.action_size = game.action_space.n
        self.game = game
        self.model = self.build_model()
        self.target_model = clone_model(self.model)
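build_model is not shown in this snippet; under the assumption that it is a standard DQN-style network mapping the observation vector to one Q-value per action, a minimal Keras sketch could look like this:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

def build_model(self):
    # Hypothetical reconstruction, not necessarily the author's exact architecture.
    model = Sequential([
        Dense(256, activation="relu", input_shape=(self.state_size,)),
        Dense(256, activation="relu"),
        Dense(self.action_size, activation="linear"),
    ])
    model.compile(optimizer=Adam(learning_rate=self.learning_rate), loss="mse")
    return model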
Example No. 6
 def evaluate_net(self,
                  new_net,
                  game_batch_len,
                  time_per_move,
                  batch_num=1):
     score_sum = 0
      for _ in range(batch_num):  # batch index unused; avoids shadowing the per-game index i below
         games = [Game_2048() for _ in range(game_batch_len)]
         scores = [0] * game_batch_len
         paths = [None] * game_batch_len
         nn_inputs = [None] * game_batch_len
          while not all(game.done for game in games):  # run until every game in the batch is finished
             roots = [probabilistic_Node(game.state, 1) for game in games]
             next_move_time = time.time() + time_per_move
             while time.time() < next_move_time:
                 for i, game in enumerate(games):
                     if game.done:
                         continue
                     path = self.select_most_promising(roots[i])
                     node = path[-1]
                     nn_input = self.game.convert_to_nn_input(node.state)
                     paths[i] = path
                     nn_inputs[i] = nn_input
                 prediction = new_net.predict(np.array(nn_inputs))
                 for i, game in enumerate(games):
                     if game.done:
                         continue
                     value = prediction[i][0]
                     move_props = prediction[i][1:]
                     back_path = paths[i]
                     node = back_path[-1]
                     self.expand(node, move_props)
                     self.backtrack(back_path, value, True)
             for i, game in enumerate(games):
                 if game.done:
                     continue
                 move_props = self.get_move_props(roots[i])
                 action = np.argmax(move_props)
                 _, reward, _done = game.step(action)
                 scores[i] += reward
         score_sum += sum(scores) / len(scores)
     return score_sum / batch_num
Example No. 7
 def __init__(self, game=Game_2048(), batch_size=1024, lr=0.01):
     self.game = game
     self.memory = deque(maxlen=100000)
     self.learning_rate = lr
     self.batch_size = batch_size
     self.model = mlp(self.game.space_1d.n,
                      5,
                      256,
                      self.game.action_space.n + 1,
                      lr=self.learning_rate)
     self.experimental_model = mlp(self.game.space_1d.n,
                                   5,
                                   256,
                                   self.game.action_space.n + 1,
                                   lr=self.learning_rate)
     rmtree(os.path.abspath(os.path.dirname(__file__)) +
            "/tensorboard_logs",
            ignore_errors=True)
     self.tensorboard_callback = keras.callbacks.TensorBoard(
         log_dir="./tensorboard_logs")
     self.generator = RL_sequence(self.memory, self.batch_size)
     self.best_net_avg_score = 0
     self.workers = MCTS_workers(game)
Example No. 8
        train_writer = tf.summary.create_file_writer(
            os.path.abspath(os.path.dirname(__file__)) + "/logdir")
        state = self.game.reset()
        print("Filling up memory to get started")
        while len(self.memory) < self.train_start:
            self.rollout_batch(500)
        print("Min samples stored, starting training now")
        for i in range(iterations):
            self.greedy_rollout_value_func(i)
            print(f"Performing {self.train_per_it} batch trainings")
            loss_avg = 0
            for _ in range(self.train_per_it):
                loss = self.train_one_batch()
                loss_avg += loss
            loss_avg = loss_avg / self.train_per_it
            print(f"Rolling out policy {self.rollout_batch_size} times")
            rewards = self.rollout_batch(self.rollout_batch_size)
            rew_avg = sum(rewards) / len(rewards)
            print(
                f"Iteration: {i}, Reward avg: {rew_avg}, avg loss: {loss_avg:.3f}"
            )
            with train_writer.as_default():
                tf.summary.scalar('reward', rew_avg, step=i)
                tf.summary.scalar('avg loss', loss_avg, step=i)


if __name__ == "__main__":
    game = Game_2048()
    learner = DeepTD0(game)
    learner.train_iterations(100000000)
Example No. 9
 def __init__(self, game=Game_2048()):
     self.game = game
     self.rollout_game = Game_2048()
Example No. 10
 def __init__(self,game=Game_2048()):
     self.game = game
Example No. 11
 def __init__(self, game=Game_2048(), heuristic=human_heuristic):
     self.game = game
     self.heuristic = heuristic
Example No. 12
        cnt = 0
        for i in range(iterations):
            action = self.epsilon_greedy(state)
            print(self.game)
            new_state, reward, done = self.game.step(action)
            rew_sum += reward
            self.append_sample(new_state)
            loss = self.train_one_batch()
            self.soft_target_update()
            loss_sum += loss
            cnt += 1
            state = new_state  # keep the acting state in sync with the environment
            if done:
                state = self.game.reset()
                predict_mod = self._get_state_expected_value(state, self.model)
                predict_targ = self._get_state_expected_value(state, self.target_model)
                self.append_sample(state)
                if len(self.memory) > self.train_start:
                    self.decay_epsilon()
                    print(f"Iteration: {i}, Reward sum: {rew_sum}, cnt: {cnt}, avg loss: {loss_sum/cnt:.3f}, eps: {self.epsilon:.3f}")
                    with train_writer.as_default():
                        tf.summary.scalar('reward', rew_sum, step=i)
                        tf.summary.scalar('avg loss', loss_sum/cnt, step=i)
                        tf.summary.scalar('predict_mod', predict_mod, step=i)
                        tf.summary.scalar('predict_targ', predict_targ, step=i)
                rew_sum = 0
                loss_sum = 0
                cnt = 0
if __name__ == '__main__':
    learner = DeepTD0(Game_2048())
    learner.train_iterations(100000)
 def rollout_batch(self, batch_size):
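      # Greedy one-step lookahead: for every still-running game, score each action as the
      # immediate reward plus the value network's estimate of the resulting board, then
      # play the argmax until the game ends.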
     games = [Game_2048() for _ in range(batch_size)]
     states = [[game.state] for game in games]
     still_running = list(range(batch_size))
     rewardlist = [[] for _ in games]
     while len(still_running) > 0:
         eval_batch = []
         evaluations = np.zeros(
             (len(still_running), self.game.action_space.n))
         for sn in range(len(still_running)):
             i = still_running[sn]
             for a in range(games[i].action_space.n):
                 new_state, reward, done = games[i].check_update(
                     games[i].state, a)
                 if done:
                     evaluations[sn, a] = reward
                 else:
                     nn_input = self.game.convert_to_nn_input(new_state)
                     eval_batch.append(nn_input)
                     evaluations[sn, a] = reward
         if len(eval_batch) > 0:
             predictions = self.model.predict(np.array(eval_batch)).reshape(
                 len(eval_batch))
         pindex = 0
         for e in range(evaluations.shape[0]):
             for a in range(evaluations.shape[1]):
                 if evaluations[e, a] != self.game.done_reward:
                     evaluations[e, a] += predictions[pindex]
                     pindex += 1
         for sn in range(len(still_running) - 1, -1, -1):
             i = still_running[sn]
             action_vals = evaluations[sn]
             action = np.argmax(action_vals)
             """action = None
             best_val = -np.inf
             for a in range(games[i].action_space.n):
                 _,reward,_ = games[i].check_update(games[i].state,a)
                 if reward>best_val:
                     best_val = reward
                     action = a"""
             new_state, reward, done = games[i].check_update(
                 games[i].state, action)
             if done:
                 del still_running[sn]
                 continue
             games[i].state = new_state
             states[i].append(new_state.copy())
             games[i].spawn_number()
             rewardlist[i].append(reward)
     cum_rewards = [sum(x) for x in rewardlist]
     samples = []
     for i in range(len(states)):
         store_reward = cum_rewards[i]
         for j in range(len(states[i])):
             state = states[i][j]
             samples.append((store_reward, state))
             if j < len(states[i]) - 1:
                 store_reward -= rewardlist[i][j]
     return cum_rewards, samples
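train_one_batch, which the training loop in Example No. 8 calls, is not included; a minimal sketch consistent with the (cumulative_reward, state) samples produced above is given here. The body is an assumption; only the method name and the sample layout come from the snippets.

import random
import numpy as np

def train_one_batch(self):
    # Hypothetical sketch: regress the value network onto stored Monte-Carlo returns,
    # assuming self.memory holds (cumulative_reward, state) pairs.
    batch = random.sample(self.memory, min(self.batch_size, len(self.memory)))
    targets = np.array([ret for ret, _ in batch], dtype=np.float32)
    inputs = np.array([self.game.convert_to_nn_input(state) for _, state in batch])
    history = self.model.fit(inputs, targets, batch_size=len(batch), epochs=1, verbose=0)
    return history.history["loss"][0]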