def train(self):
    tf.train.write_graph(self.sess.graph_def, self.model_path, 'td_gammon.pb', as_text=False)
    summary_writer = tf.summary.FileWriter('{0}{1}'.format(self.summary_path, int(time.time())), self.sess.graph_def)

    # the agent plays against itself, making the best move for each player
    # players = [TDAgent(Game.TOKENS[0], self, p=np.random.rand() / 10), TDAgent(Game.TOKENS[1], self, p=np.random.rand() / 10)]
    # players = [TDAgent(Game.TOKENS[0], self), TDAgent(Game.TOKENS[1], self)]
    validation_interval = 10000
    episodes = 200000

    t = trange(episodes, desc='Bar desc', leave=True)
    for episode in t:
        # re-sample each agent's exploration rate at the start of every episode
        players = [TDAgent(Game.TOKENS[0], self, p=np.random.rand() / 3),
                   TDAgent(Game.TOKENS[1], self, p=np.random.rand() / 3)]

        if episode != 0 and episode % validation_interval == 0:
            self.test(episodes=200)
            np.random.seed()
            # self.test(episodes=200, mode=1)
            t.refresh()

        game = Game.new()
        player_num = random.randint(0, 1)
        if player_num == 0:
            game.reverse()
        x = game.extract_features(players[player_num].player)
        # print(self.xy.eval())

        game_step = 0
        while not game.is_over():
            game.next_step(players[player_num], player_num)
            player_num = (player_num + 1) % 2

            x_next = game.extract_features(players[player_num].player)
            V_next = self.get_output(x_next)
            self.sess.run(self.train_op, feed_dict={self.x: x, self.V_next: V_next})

            x = x_next
            game_step += 1

        winner = game.winner()

        _, global_step, summaries, _ = self.sess.run([
            self.train_op,
            self.global_step,
            self.summaries_op,
            self.reset_op
        ], feed_dict={self.x: x, self.V_next: np.array([[winner]], dtype='float')})
        summary_writer.add_summary(summaries, global_step=global_step)

        # tqdm.write("Game %d/%d (Winner: %s) in %d turns" % (episode, episodes, players[winner].player, game_step))
        self.saver.save(self.sess, self.checkpoint_path + 'checkpoint', global_step=global_step)

    # tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)  # no-op: the result was never used
    summary_writer.close()
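# Sketch of the TD(lambda) update that `self.train_op` is assumed to apply in
# the loop above: one eligibility trace per weight, pushed toward the TD error
# V(s') - V(s). The names `lamda`, `alpha`, and `self.V` are assumptions for
# illustration, not identifiers confirmed by this file.
def build_td_train_op(self, lamda=0.7, alpha=0.1):
    tvars = tf.trainable_variables()
    grads = tf.gradients(self.V, tvars)          # dV/dtheta for the current state
    delta = tf.reduce_sum(self.V_next - self.V)  # TD error

    updates = []
    for grad, var in zip(grads, tvars):
        # e <- lambda * e + dV/dtheta
        trace = tf.Variable(tf.zeros(grad.get_shape()), trainable=False)
        trace_op = trace.assign((lamda * trace) + grad)
        # theta <- theta + alpha * delta * e
        updates.append(var.assign_add(alpha * delta * trace_op))
    return tf.group(*updates)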
def test(self, episodes=100, draw=False):
    players = [TDAgent(Game.TOKENS[0], self), RandomAgent(Game.TOKENS[1])]
    winners = [0, 0]
    for episode in range(episodes):
        game = Game.new()

        winner = game.play(players, draw=draw)
        winners[winner] += 1

        winners_total = sum(winners)
        print("[Episode %d] %s (%s) vs %s (%s) %d:%d of %d games (%.2f%%)" % (episode,
            players[0].name, players[0].player,
            players[1].name, players[1].player,
            winners[0], winners[1], winners_total,
            (winners[0] / winners_total) * 100.0))
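# The uniform-random baseline that test() above plays against. The exact
# Game <-> agent interface is not shown in this file; this sketch assumes a
# `get_action(actions, game)` hook, mirroring how TDAgent is driven by
# Game.next_step / Game.play.
class RandomAgent(object):
    def __init__(self, player):
        self.player = player
        self.name = 'Random'

    def get_action(self, actions, game=None):
        # pick uniformly among the legal moves; None when there is no move
        actions = list(actions)
        return random.choice(actions) if actions else None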
def train(self):
    # NOTE: tf.device only affects ops *created* inside the block; the graph
    # is already built by the time train() runs, so this pin does not move any
    # existing ops to the GPU.
    with tf.device('/gpu:0'):
        tf.train.write_graph(self.sess.graph_def, self.model_path, 'td_gammon.pb', as_text=False)
        summary_writer = tf.summary.FileWriter('{0}{1}'.format(self.summary_path, int(time.time())), self.sess.graph_def)

        # the agent plays against itself, making the best move for each player
        players = [TDAgent(Game.TOKENS[0], self), TDAgent(Game.TOKENS[1], self)]

        validation_interval = 1000
        episodes = 5000

        for episode in range(episodes):
            if episode != 0 and episode % validation_interval == 0:
                self.test(episodes=100)

            game = Game.new()
            player_num = random.randint(0, 1)

            x = game.extract_features(players[player_num].player)

            game_step = 0
            while not game.is_over():
                game.next_step(players[player_num], player_num)
                player_num = (player_num + 1) % 2

                x_next = game.extract_features(players[player_num].player)
                V_next = self.get_output(x_next)
                self.sess.run(self.train_op, feed_dict={self.x: x, self.V_next: V_next})

                x = x_next
                game_step += 1

            winner = game.winner()

            _, global_step, summaries, _ = self.sess.run([
                self.train_op,
                self.global_step,
                self.summaries_op,
                self.reset_op
            ], feed_dict={self.x: x, self.V_next: np.array([[winner]], dtype='float')})
            summary_writer.add_summary(summaries, global_step=global_step)

            print("Game %d/%d (Winner: %s) in %d turns" % (episode, episodes, players[winner].player, game_step))
            self.saver.save(self.sess, self.checkpoint_path + 'checkpoint', global_step=global_step)

        summary_writer.close()

        self.test(episodes=1000)
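# To actually place the network on the GPU, the pin has to wrap graph
# construction, and soft placement lets TF fall back to CPU for ops without a
# GPU kernel. A sketch; `build_graph` is a hypothetical stand-in for wherever
# this model's variables and ops are created.
import tensorflow as tf

def make_session_and_graph(build_graph):
    with tf.device('/gpu:0'):
        build_graph()  # variables/ops created here inherit the GPU pin
    config = tf.ConfigProto(allow_soft_placement=True)
    return tf.Session(config=config)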
def train(self):
    tf.train.write_graph(self.sess.graph_def, self.model_path, 'td_gammon.pb', as_text=False)
    summary_writer = tf.train.SummaryWriter('{0}{1}'.format(self.summary_path, int(time.time())), self.sess.graph_def)

    # the agent plays against itself, making the best move for each player
    players = [TDAgent(Game.TOKENS[0], self), TDAgent(Game.TOKENS[1], self)]

    validation_interval = 1000
    episodes = 5000

    for episode in range(episodes):
        if episode != 0 and episode % validation_interval == 0:
            self.test(episodes=100)

        game = Game.new()
        player_num = random.randint(0, 1)

        x = game.extract_features(players[player_num].player)

        game_step = 0
        while not game.is_over():
            game.next_step(players[player_num], player_num)
            player_num = (player_num + 1) % 2

            x_next = game.extract_features(players[player_num].player)
            V_next = self.get_output(x_next)
            self.sess.run(self.train_op, feed_dict={self.x: x, self.V_next: V_next})

            x = x_next
            game_step += 1

        winner = game.winner()

        _, global_step, summaries, _ = self.sess.run([
            self.train_op,
            self.global_step,
            self.summaries_op,
            self.reset_op
        ], feed_dict={self.x: x, self.V_next: np.array([[winner]], dtype='float')})
        summary_writer.add_summary(summaries, global_step=global_step)

        print("Game %d/%d (Winner: %s) in %d turns" % (episode, episodes, players[winner].player, game_step))
        self.saver.save(self.sess, self.checkpoint_path + 'checkpoint', global_step=global_step)

    summary_writer.close()

    self.test(episodes=1000)
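# This variant uses the pre-1.0 name tf.train.SummaryWriter, while the
# variants above use tf.summary.FileWriter, its replacement from TF 1.0 on. A
# small shim keeps the same code running on either side of the rename:
try:
    SummaryWriter = tf.summary.FileWriter   # TensorFlow >= 1.0
except AttributeError:
    SummaryWriter = tf.train.SummaryWriter  # TensorFlow < 1.0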
def test(self, episodes=100, draw=False, save=None):
    players = [TDAgent(Game.TOKENS[0], self), TDAgent(Game.TOKENS[1], self)]
    winners = [0, 0]
    for episode in range(episodes):
        game = Game.new()

        winner = game.play(players, draw=draw)
        if save:
            game.save_tmg(os.path.join(save, str(episode) + '.tmg'))
        winners[winner] += 1

        winners_total = sum(winners)
        print("[Episode %d] %s (%s) vs %s (%s) %d:%d of %d games (%.2f%%)" % (episode,
            players[0].name, players[0].player,
            players[1].name, players[1].player,
            winners[0], winners[1], winners_total,
            (winners[0] / winners_total) * 100.0))
def test(self, episodes=100, draw=False, mode=0):
    # mode 0: TDAgent as 'x' vs Today_bot; mode 1: Today_bot vs TDAgent as 'o';
    # mode 3: TDAgent self-play. For modes 0 and 1, `winners[mode]` below is
    # the TD agent's index.
    if mode == 0:
        players = [TDAgent(Game.TOKENS[0], self), Today_bot(Game.TOKENS[1])]
    elif mode == 1:
        players = [Today_bot(Game.TOKENS[0]), TDAgent(Game.TOKENS[1], self)]
    elif mode == 3:
        players = [TDAgent(Game.TOKENS[0], self), TDAgent(Game.TOKENS[1], self)]
    else:
        raise ValueError('unsupported mode: %d' % mode)

    winners = [0, 0]
    for episode in range(episodes):
        np.random.seed(episode)  # fixed seeds so every validation run plays the same games
        game = Game.new()

        winner = game.play(players, draw=draw)
        winners[winner] += 1

        winners_total = sum(winners)
        if mode < 3:
            print("[Episode %d] %s (%s) vs %s (%s) %d:%d of %d games (%.2f%%)" % (episode,
                players[0].name, players[0].player,
                players[1].name, players[1].player,
                winners[0], winners[1], winners_total,
                (winners[mode] / winners_total) * 100.0))

            # snapshot the weights whenever the TD agent reaches a new best win rate
            if (winners[mode] / winners_total) * 100.0 > self.max_wr:
                self.max_wr = (winners[mode] / winners_total) * 100.0
                w1 = self.l1_W.eval()
                b1 = self.l1_b.eval()
                w2 = self.l2_W.eval()
                b2 = self.l2_b.eval()
                np.savetxt("w1.txt", w1)
                np.savetxt("w2.txt", w2)
                np.savetxt("b1.txt", b1)
                np.savetxt("b2.txt", b2)
                with open("max_wr.txt", "w") as text_file:
                    text_file.write(str(self.max_wr))
        else:
            print("[Episode %d] %s (%s) vs %s (%s) %d:%d of %d games (%.2f%%)" % (episode,
                players[0].name, players[0].player,
                players[1].name, players[1].player,
                winners[0], winners[1], winners_total,
                (winners[0] / winners_total) * 100.0))
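# Counterpart to the np.savetxt snapshot above: a hypothetical helper that
# restores the best-so-far weights into the live graph. Assumes each array's
# shape round-trips through savetxt/loadtxt, which holds for the 1-D and 2-D
# arrays saved here as long as no dimension is 1.
def load_snapshot(self):
    for var, fname in [(self.l1_W, "w1.txt"), (self.l1_b, "b1.txt"),
                       (self.l2_W, "w2.txt"), (self.l2_b, "b2.txt")]:
        value = np.loadtxt(fname).reshape(var.get_shape().as_list())
        self.sess.run(tf.assign(var, value))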
def play(self):
    game = Game.new()
    game.play([TDAgent(Game.TOKENS[0], self), Today_bot(Game.TOKENS[1])], draw=True)
def play(self):
    game = Game.new()
    game.play([TDAgent(Game.TOKENS[0], self), HumanAgent(Game.TOKENS[1])], draw=True)
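# Hypothetical interactive opponent for play() above; as with the RandomAgent
# sketch earlier, the `get_action(actions, game)` hook is an assumed
# interface, not one confirmed by this file.
class HumanAgent(object):
    def __init__(self, player):
        self.player = player
        self.name = 'Human'

    def get_action(self, actions, game=None):
        actions = sorted(actions, key=str)
        if not actions:
            print("No legal moves; passing.")
            return None
        for i, action in enumerate(actions):
            print("%d: %s" % (i, action))
        choice = int(input("Choose a move: "))
        return actions[choice]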
        elif tup[0] == 'x':
            pos[i + 1] = len(tup)
        else:
            pos[i + 1] = -len(tup)
    return pos


if __name__ == '__main__':
    print(pubeval(False, [0] +
                  [-2, 0, 0, 0, 0, 5] +
                  [0, 3, 0, 0, 0, -5] +
                  [5, 0, 0, 0, -3, 0] +
                  [-5, 0, 0, 0, 0, 2] +
                  [0] +
                  [0, 0]))

    from backgammon.game import Game
    g = Game.new()
    print(pubeval(False, game_to_pos(g)))

    actions = g.get_actions((5, 6), 'x', nodups=True)
    for a in sorted([str(foo) for foo in actions]):
        print(a)
    print()

    actions = g.get_actions((5, 6), 'o', nodups=True)
    for a in sorted([str(foo) for foo in actions]):
        print(a)
def random_selfplay(self):
    players = [RandomAgent(Game.TOKENS[0]), RandomAgent(Game.TOKENS[1])]
    game = Game.new()
    game.SLEEP = 0
    winner = game.play(players, draw=True)
def play(self, ts=False):
    game = Game.new()
    game.play([TDAgent(Game.TOKENS[0], self), HumanAgent(Game.TOKENS[1])], draw=True, ts=ts)
def train(self, episodes=5000):
    tf.train.write_graph(self.sess.graph_def, self.model_path, 'td_gammon.pb', as_text=False)
    summary_writer = tf.train.SummaryWriter('{0}{1}'.format(self.summary_path, int(time.time())), graph_def=self.sess.graph_def)

    # the agent plays against itself, making the best move for each player
    players = [TDAgent(Game.TOKENS[0], self), TDAgent(Game.TOKENS[1], self)]

    validation_interval = 1000
    report_freq = 10

    prev_time = time.time()
    prev_step = self.sess.run(self.global_step)
    plies_per_batch = 0

    for episode in range(episodes):
        if episode != 0 and episode % validation_interval == 0:
            self.test(episodes=100)

        game = Game.new()
        player_num = random.randint(0, 1)

        x = game.extract_features(players[player_num].player)

        game_step = 0
        while not game.is_over():
            game.next_step(players[player_num], player_num)
            player_num = (player_num + 1) % 2

            x_next = game.extract_features(players[player_num].player)
            V_next = self.get_output(x_next)
            self.sess.run(self.train_op, feed_dict={self.x: x, self.V_next: V_next})

            x = x_next
            game_step += 1

        winner = game.winner()

        _, global_step, summaries, _ = self.sess.run([
            self.train_op,
            self.global_step,
            self.summaries_op,
            self.reset_op
        ], feed_dict={self.x: x, self.V_next: np.array([[winner]], dtype='float')})

        print("Game %d/%d (Winner: %s) in %d turns" % (episode, episodes, players[winner].player, game_step))
        plies_per_batch += game_step

        if episode != 0 and episode % report_freq == 0:
            # report and log throughput every report_freq games
            now = time.time()
            elapsed_time = now - prev_time
            steps_per_sec = (global_step - prev_step) / elapsed_time
            games_per_sec = report_freq / elapsed_time
            plies_per_game = plies_per_batch / report_freq
            print('e=%.2f sps=%.2f gps=%.2f ppg=%.1f global=%d prev=%d' % (
                elapsed_time, steps_per_sec, games_per_sec, plies_per_game, global_step, prev_step))

            summary_writer.add_summary(summaries, global_step=global_step)
            s1 = tf.Summary(value=[tf.Summary.Value(tag='rate/global_steps_sec', simple_value=steps_per_sec)])
            summary_writer.add_summary(s1, global_step)
            s2 = tf.Summary(value=[tf.Summary.Value(tag='rate/games_sec', simple_value=games_per_sec)])
            summary_writer.add_summary(s2, global_step)
            s3 = tf.Summary(value=[tf.Summary.Value(tag='rate/plies_per_game', simple_value=plies_per_game)])
            summary_writer.add_summary(s3, global_step)

            self.saver.save(self.sess, self.checkpoint_path + 'checkpoint', global_step=global_step)

            prev_time = now
            prev_step = global_step
            plies_per_batch = 0

    summary_writer.close()

    self.test(episodes=1000)
    pos[25] = len(game.bar_pieces['x'])
    pos[26] = len(game.off_pieces['x'])
    pos[27] = -len(game.off_pieces['o'])
    for i, tup in enumerate(game.grid):
        if len(tup) == 0:
            pos[i + 1] = 0
        elif tup[0] == 'x':
            pos[i + 1] = len(tup)
        else:
            pos[i + 1] = -len(tup)
    return pos


if __name__ == '__main__':
    print(pubeval(False, [0] +
                  [-2, 0, 0, 0, 0, 5] +
                  [0, 3, 0, 0, 0, -5] +
                  [5, 0, 0, 0, -3, 0] +
                  [-5, 0, 0, 0, 0, 2] +
                  [0] +
                  [0, 0]))

    from backgammon.game import Game
    g = Game.new()
    print(pubeval(False, game_to_pos(g)))

    actions = g.get_actions((5, 6), 'x', nodups=True)
    for a in sorted([str(foo) for foo in actions]):
        print(a)
    print()

    actions = g.get_actions((5, 6), 'o', nodups=True)
    for a in sorted([str(foo) for foo in actions]):
        print(a)
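# Sanity check for the 28-slot pubeval vector built by game_to_pos above:
# pos[1..24] hold the board points (positive for 'x', negative for 'o'),
# pos[25]/pos[26] hold 'x' barmen and borne-off men, and pos[27] holds 'o'
# borne-off men, negated. This sketch assumes pos[0] holds 'o' barmen as a
# negative count, by symmetry with pos[25]; the 15-checker totals assume a
# standard game.
def check_pos(pos):
    assert len(pos) == 28
    x_total = sum(v for v in pos[1:25] if v > 0) + pos[25] + pos[26]
    o_total = -(sum(v for v in pos[1:25] if v < 0) + pos[0] + pos[27])
    assert x_total == 15 and o_total == 15, (x_total, o_total)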
def train(self):
    tf.train.write_graph(self.sess.graph_def, self.model_path, 'td_gammon.pb', as_text=False)
    summary_writer = tf.summary.FileWriter('{0}{1}'.format(self.summary_path, int(time.time())), self.sess.graph_def)

    # the agent plays against itself, making the best move for each player
    players = [TDAgent(Game.TOKENS[0], self), TDAgent(Game.TOKENS[1], self)]

    # validation_interval = 1000
    # episodes = 5000
    validation_interval = 500
    episodes = 5000

    train_start_ts = time.time()
    for episode in range(episodes):
        start_ts = time.time()
        if episode != 0 and episode % validation_interval == 0:
            print('Episode:', episode)
            write('Episode: %d' % episode)
            self.test(episodes=100)

        game = Game.new()
        player_num = random.randint(0, 1)

        x = game.extract_features(players[player_num].player)

        game_step = 0
        while not game.is_over():
            game.next_step(players[player_num], player_num)
            player_num = (player_num + 1) % 2

            x_next = game.extract_features(players[player_num].player)
            V_next = self.get_output(x_next)
            self.sess.run(self.train_op, feed_dict={self.x: x, self.V_next: V_next})

            x = x_next
            game_step += 1

        winner = game.winner()

        _, global_step, summaries, _ = self.sess.run([
            self.train_op,
            self.global_step,
            self.summaries_op,
            self.reset_op,
        ], feed_dict={self.x: x, self.V_next: np.array([[winner]], dtype='float')})
        summary_writer.add_summary(summaries, global_step=global_step)

        end_ts = time.time()
        print("%.2f - Game %d/%d (Winner: %s) in %d turns (%.2f secs)" %
              (self.k, episode, episodes, players[winner].player, game_step, end_ts - start_ts))
        # if episode in [9, 99, 999, 9999, 99999]:
        #     print("%d games avg time: %.2f secs" % (episode + 1, (end_ts - train_start_ts) / (episode + 1)))

        self.saver.save(self.sess, self.checkpoint_path + 'checkpoint', global_step=global_step)

    summary_writer.close()

    write('Episode: 5000')
    self.test(episodes=100)
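# `write` is called above but not defined in this file; a minimal append-only
# logging helper along these lines would satisfy those calls (the default log
# path is an assumption):
def write(line, path='train.log'):
    with open(path, 'a') as f:
        f.write(line + '\n')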