def mc_rollout(state, game: Game_2048, rollouts=1, policy=random_policy):
    """Estimate the value of `state` by averaging the cumulative reward
    of `rollouts` playouts under `policy`."""
    cum_rews = []
    for _ in range(rollouts):
        # Copy so in-place steps cannot mutate the caller's state.
        game.state = state.copy()
        done = False
        cum_rew = 0
        while not done:
            action = policy(game)
            _, reward, done = game.step(action)
            cum_rew += reward
        cum_rews.append(cum_rew)
    return sum(cum_rews) / len(cum_rews)
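# Hedged usage sketch (not part of the original module): ranking the legal
# moves of a position by averaged Monte Carlo playouts. Assumes, as above,
# that Game_2048 exposes .check_update(state, action) and .action_space.n,
# and that states support .copy(). Returns None if every move ends the game.
def mc_best_action_sketch(game: Game_2048, rollouts=10):
    scratch = Game_2048()  # separate instance so playouts cannot touch `game`
    best_action, best_value = None, -float("inf")
    for a in range(game.action_space.n):
        new_state, reward, done = game.check_update(game.state, a)
        if done:
            continue
        value = reward + mc_rollout(new_state.copy(), scratch, rollouts)
        if value > best_value:
            best_action, best_value = a, value
    return best_action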
def __init__(self, game=Game_2048(),
             playout_policy=get_policy_func("Random_agent")):
    self.game = game
    self.exploration_constant = 200
    self.playout_policy = playout_policy
    self.root = probabilistic_Node(self.game.state, 1)
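# Hedged sketch of the standard UCT selection score that an exploration
# constant like the one above typically feeds into; select_most_promising
# itself is not shown in this excerpt, so this is illustrative only.
import math

def uct_score_sketch(child_value_sum, child_visits, parent_visits, c=200):
    if child_visits == 0:
        return math.inf  # always try unvisited children first
    exploitation = child_value_sum / child_visits
    exploration = c * math.sqrt(math.log(parent_visits) / child_visits)
    return exploitation + exploration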
def rollout_batch(self, batch_size):
    """Play `batch_size` games to completion with a greedy one-step policy
    (pick the action with the highest immediate reward) and store the
    resulting (state, reward, next_state) transitions in memory."""
    games = [Game_2048() for _ in range(batch_size)]
    states = [[game.state] for game in games]
    still_running = list(range(batch_size))
    rewardlist = [[] for _ in games]
    while len(still_running) > 0:
        # Iterate backwards so finished games can be removed in place.
        for sn in range(len(still_running) - 1, -1, -1):
            i = still_running[sn]
            # Greedy one-step lookahead on the immediate reward.
            action = None
            best_val = -np.inf
            for a in range(games[i].action_space.n):
                _, reward, _ = games[i].check_update(games[i].state, a)
                if reward > best_val:
                    best_val = reward
                    action = a
            new_state, reward, done = games[i].check_update(
                games[i].state, action)
            if done:
                del still_running[sn]
                continue
            games[i].state = new_state
            states[i].append(new_state.copy())
            games[i].spawn_number()
            rewardlist[i].append(reward)
    cum_rewards = [sum(x) for x in rewardlist]
    # Store one-step (s, r, s') transitions for TD(0) training.
    for i in range(len(states)):
        for j in range(len(states[i]) - 1):
            self.memory.append(
                (states[i][j], rewardlist[i][j], states[i][j + 1]))
    return cum_rewards
def __init__(self, game=Game_2048()):
    self.game = game
    self.memory = deque(maxlen=100000)
    self.learning_rate = 0.01
    self.batch_size = 4096
    self.rollout_batch_size = 400
    self.train_per_it = 100
    self.train_start = 10000
    # Scalar state-value network over the flattened board.
    self.model = mlp(self.game.space_1d.n, 5, 256, 1, lr=self.learning_rate)
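# Hedged sketch of a TD(0) update over the (s, r, s') triples that
# rollout_batch stores in self.memory; the repo's actual train_one_batch
# is not shown in this excerpt. Assumes the mlp maps
# convert_to_nn_input(state) to a scalar value V(s).
import random
import numpy as np

def td0_train_batch_sketch(model, game, memory, batch_size, discount=1.0):
    batch = random.sample(memory, min(batch_size, len(memory)))
    states = np.array([game.convert_to_nn_input(s) for s, _, _ in batch])
    successors = np.array([game.convert_to_nn_input(s2) for _, _, s2 in batch])
    rewards = np.array([r for _, r, _ in batch], dtype=np.float32)
    # Bootstrapped TD(0) target: r + gamma * V(s').
    targets = rewards + discount * model.predict(successors).reshape(-1)
    history = model.fit(states, targets, verbose=0)
    return history.history["loss"][0]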
def __init__(self, game=Game_2048()):
    # Hyperparameters
    self.discount_factor = 0.99
    self.learning_rate = 0.001
    self.epsilon = 0
    self.epsilon_decay = 0
    self.epsilon_min = 0.05
    self.batch_size = 64
    self.soft_update_rate = 0.01
    self.train_start = 1000
    # Simplest possible replay memory: a bounded FIFO buffer.
    self.memory = deque(maxlen=10000)
    self.state_size = game.observation_space.n
    self.action_size = game.action_space.n
    self.game = game
    self.model = self.build_model()
    self.target_model = clone_model(self.model)
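# build_model is called above but not shown in this excerpt. A minimal
# sketch of one plausible implementation, assuming a plain dense Keras
# Q-network over the flattened observation; the layer sizes here are
# illustrative, not the author's actual architecture.
from tensorflow import keras

def build_model_sketch(state_size, action_size, learning_rate):
    model = keras.Sequential([
        keras.layers.Dense(256, activation="relu", input_shape=(state_size,)),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dense(action_size, activation="linear"),
    ])
    model.compile(loss="mse",
                  optimizer=keras.optimizers.Adam(learning_rate))
    return model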
def evaluate_net(self, new_net, game_batch_len, time_per_move, batch_num=1):
    """Score `new_net` by playing `batch_num` batches of `game_batch_len`
    games, giving MCTS a fixed time budget per move."""
    score_sum = 0
    for _ in range(batch_num):
        games = [Game_2048() for _ in range(game_batch_len)]
        scores = [0] * game_batch_len
        paths = [None] * game_batch_len
        nn_inputs = [None] * game_batch_len
        while not all(game.done for game in games):
            roots = [probabilistic_Node(game.state, 1) for game in games]
            next_move_time = time.time() + time_per_move
            while time.time() < next_move_time:
                # Selection: walk each tree to a promising leaf.
                for i, game in enumerate(games):
                    if game.done:
                        continue
                    path = self.select_most_promising(roots[i])
                    node = path[-1]
                    paths[i] = path
                    nn_inputs[i] = self.game.convert_to_nn_input(node.state)
                # Evaluate all leaves in one batched forward pass
                # (stale rows for finished games are ignored below).
                prediction = new_net.predict(np.array(nn_inputs))
                # Expansion and backpropagation.
                for i, game in enumerate(games):
                    if game.done:
                        continue
                    value = prediction[i][0]
                    move_props = prediction[i][1:]
                    back_path = paths[i]
                    node = back_path[-1]
                    self.expand(node, move_props)
                    self.backtrack(back_path, value, True)
            # Commit the best move in every still-running game.
            for i, game in enumerate(games):
                if game.done:
                    continue
                move_props = self.get_move_props(roots[i])
                action = np.argmax(move_props)
                _, reward, _ = game.step(action)
                scores[i] += reward
        score_sum += sum(scores) / len(scores)
    return score_sum / batch_num
def __init__(self, game=Game_2048(), batch_size=1024, lr=0.01):
    self.game = game
    self.memory = deque(maxlen=100000)
    self.learning_rate = lr
    self.batch_size = batch_size
    # Networks output action_space.n + 1 values: a scalar value head
    # followed by per-action move probabilities (see evaluate_net).
    self.model = mlp(self.game.space_1d.n, 5, 256,
                     self.game.action_space.n + 1, lr=self.learning_rate)
    self.experimental_model = mlp(self.game.space_1d.n, 5, 256,
                                  self.game.action_space.n + 1,
                                  lr=self.learning_rate)
    # Start each run with a clean TensorBoard log directory.
    rmtree(os.path.abspath(os.path.dirname(__file__)) + "/tensorboard_logs",
           ignore_errors=True)
    self.tensorboard_callback = keras.callbacks.TensorBoard(
        log_dir="./tensorboard_logs")
    self.generator = RL_sequence(self.memory, self.batch_size)
    self.best_net_avg_score = 0
    self.workers = MCTS_workers(game)
def train_iterations(self, iterations):
    train_writer = tf.summary.create_file_writer(
        os.path.abspath(os.path.dirname(__file__)) + "/logdir")
    state = self.game.reset()
    print("Filling up memory to get started")
    while len(self.memory) < self.train_start:
        self.rollout_batch(500)
    print("Min samples stored, starting training now")
    for i in range(iterations):
        self.greedy_rollout_value_func(i)
        print(f"Performing {self.train_per_it} batch trainings")
        loss_avg = 0
        for _ in range(self.train_per_it):
            loss_avg += self.train_one_batch()
        loss_avg /= self.train_per_it
        print(f"Rolling out policy {self.rollout_batch_size} times")
        rewards = self.rollout_batch(self.rollout_batch_size)
        rew_avg = sum(rewards) / len(rewards)
        print(f"Iteration: {i}, Reward avg: {rew_avg}, "
              f"avg loss: {loss_avg:.3f}")
        with train_writer.as_default():
            tf.summary.scalar('reward', rew_avg, step=i)
            tf.summary.scalar('avg loss', loss_avg, step=i)


if __name__ == "__main__":
    game = Game_2048()
    learner = DeepTD0(game)
    learner.train_iterations(100000000)
def __init__(self, game=Game_2048()):
    self.game = game
    # Separate instance reused for simulated playouts.
    self.rollout_game = Game_2048()
def __init__(self, game=Game_2048()):
    self.game = game
def __init__(self, game=Game_2048(), heuristic=human_heuristic):
    self.game = game
    self.heuristic = heuristic
def train_iterations(self, iterations):
    train_writer = tf.summary.create_file_writer(
        os.path.abspath(os.path.dirname(__file__)) + "/logdir")
    state = self.game.reset()
    rew_sum = 0
    loss_sum = 0
    cnt = 0
    for i in range(iterations):
        action = self.epsilon_greedy(state)
        print(self.game)
        new_state, reward, done = self.game.step(action)
        rew_sum += reward
        self.append_sample(new_state)
        loss = self.train_one_batch()
        self.soft_target_update()
        loss_sum += loss
        cnt += 1
        if done:
            state = self.game.reset()
            # Track the value estimate of the fresh start state under
            # both the online and the target network.
            predict_mod = self._get_state_expected_value(state, self.model)
            predict_targ = self._get_state_expected_value(
                state, self.target_model)
            self.append_sample(state)
            if len(self.memory) > self.train_start:
                self.decay_epsilon()
            print(f"Iteration: {i}, Reward sum: {rew_sum}, cnt: {cnt}, "
                  f"avg loss: {loss_sum/cnt:.3f}, eps: {self.epsilon:.3f}")
            with train_writer.as_default():
                tf.summary.scalar('reward', rew_sum, step=i)
                tf.summary.scalar('avg loss', loss_sum / cnt, step=i)
                tf.summary.scalar('predict_mod', predict_mod, step=i)
                tf.summary.scalar('predict_targ', predict_targ, step=i)
            rew_sum = 0
            loss_sum = 0
            cnt = 0
        else:
            state = new_state


if __name__ == '__main__':
    learner = DeepTD0(Game_2048())
    learner.train_iterations(100000)
def rollout_batch(self, batch_size):
    """Play `batch_size` games greedily with respect to the current value
    net (one-step lookahead: immediate reward plus predicted value of the
    successor) and return the cumulative rewards together with Monte Carlo
    (return-to-go, state) training samples."""
    games = [Game_2048() for _ in range(batch_size)]
    states = [[game.state] for game in games]
    still_running = list(range(batch_size))
    rewardlist = [[] for _ in games]
    while len(still_running) > 0:
        # Evaluate every successor of every running game in one
        # batched forward pass.
        eval_batch = []
        evaluations = np.zeros(
            (len(still_running), self.game.action_space.n))
        for sn in range(len(still_running)):
            i = still_running[sn]
            for a in range(games[i].action_space.n):
                new_state, reward, done = games[i].check_update(
                    games[i].state, a)
                evaluations[sn, a] = reward
                if not done:
                    eval_batch.append(
                        self.game.convert_to_nn_input(new_state))
        if len(eval_batch) > 0:
            predictions = self.model.predict(
                np.array(eval_batch)).reshape(len(eval_batch))
            # Add the predicted successor values to the immediate rewards;
            # terminal moves are identified by the done_reward sentinel
            # and keep their bare reward.
            pindex = 0
            for e in range(evaluations.shape[0]):
                for a in range(evaluations.shape[1]):
                    if evaluations[e, a] != self.game.done_reward:
                        evaluations[e, a] += predictions[pindex]
                        pindex += 1
        # Apply the best-scoring action; iterate backwards so finished
        # games can be removed in place.
        for sn in range(len(still_running) - 1, -1, -1):
            i = still_running[sn]
            action = np.argmax(evaluations[sn])
            new_state, reward, done = games[i].check_update(
                games[i].state, action)
            if done:
                del still_running[sn]
                continue
            games[i].state = new_state
            states[i].append(new_state.copy())
            games[i].spawn_number()
            rewardlist[i].append(reward)
    cum_rewards = [sum(x) for x in rewardlist]
    # Turn each trajectory into (return-to-go, state) samples.
    samples = []
    for i in range(len(states)):
        store_reward = cum_rewards[i]
        for j in range(len(states[i])):
            samples.append((store_reward, states[i][j]))
            if j < len(states[i]) - 1:
                store_reward -= rewardlist[i][j]
    return cum_rewards, samples
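# Hedged sketch (not the author's code) of fitting the value net on the
# (return-to-go, state) samples returned by rollout_batch above. Assumes
# the model regresses convert_to_nn_input(state) onto a scalar return.
import random
import numpy as np

def fit_value_net_sketch(model, game, samples, batch_size=1024):
    batch = random.sample(samples, min(batch_size, len(samples)))
    x = np.array([game.convert_to_nn_input(state) for _, state in batch])
    y = np.array([ret for ret, _ in batch], dtype=np.float32)
    history = model.fit(x, y, verbose=0)
    return history.history["loss"][0]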