def main(_):
    if FLAGS.symbolic:
        params = {'seed': FLAGS.seed, 'level_name': FLAGS.level_name}
        env_generator = symbolic_alchemy.get_symbolic_alchemy_level
    else:
        env_settings = dm_alchemy.EnvironmentSettings(
            seed=FLAGS.seed, level_name=FLAGS.level_name)
        params = {'name': FLAGS.docker_image_name, 'settings': env_settings}
        env_generator = dm_alchemy.load_from_docker

    with env_generator(**params) as env:
        agent = RandomAgent(env.action_spec())

        timestep = env.reset()
        score = 0
        while not timestep.last():
            action = agent.act(timestep)
            timestep = env.step(action)
            if timestep.reward:
                score += timestep.reward
                print('Total score: {:.2f}, reward: {:.2f}'.format(
                    score, timestep.reward))
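# A minimal RandomAgent sketch matching the dm_env-style interface used
# above (a constructor taking env.action_spec() and an act(timestep) method).
# This is an illustrative reconstruction assuming a bounded action spec, not
# the original class definition:
import numpy as np

class RandomAgent:

    def __init__(self, action_spec):
        self._action_spec = action_spec

    def act(self, timestep):
        # Ignore the timestep and sample uniformly within the spec's bounds.
        spec = self._action_spec
        return np.random.uniform(spec.minimum, spec.maximum,
                                 size=spec.shape).astype(spec.dtype)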
def move(self, game: Game, possible_steps=None):
    winning_step, losing_steps, draw_steps, possible_steps = self.check_next_step(
        game, possible_steps)
    if winning_step is not None:
        return winning_step

    # Exclude any step that would let the opponent win on the next turn.
    steps_to_exclude = []
    for step in possible_steps:
        game_clone = game.copy_and_move(step)
        winning_step2, losing_steps2, draw_steps2, possible_steps2 = self.check_next_step(
            game_clone)
        if len(losing_steps2) > 0:
            steps_to_exclude.append(step)
        draw_steps += draw_steps2

    if len(steps_to_exclude) > 0:
        possible_steps = [
            step2 for step2 in possible_steps
            if step2 not in steps_to_exclude
        ]

    if len(possible_steps) == 1:
        return possible_steps[0]
    elif len(possible_steps) > 1:
        return self.agent.move(game, possible_steps)
    else:
        # Every remaining move loses; fall back to a random choice.
        agent = RandomAgent(self.label)
        return agent.move(game)
def main():
    try:
        shutil.rmtree('images')
        print("deleted images directory")
    except OSError as e:
        print("Error: %s : %s" % ('images', e.strerror))

    gym.logger.set_level(gym.logger.INFO)

    start_date = date(2019, 5, 1)
    simulate_company_list = [2, 3, 4, 5, 6, 44, 300, 67, 100, 200]
    # simulate_company_list = [3]

    env = gym.make("AsxGym-v0", start_date=start_date,
                   simulate_company_list=simulate_company_list)

    stock_agent = RandomAgent(env)
    # stock_agent = RandomAgent(env, min_volume=100, max_volume=500)
    # stock_agent = BuyAndKeepAgent(env, 3)

    observation = env.reset()
    for _ in range(200000 * 24):
        env.render()
        company_count = len(env.simulate_company_list)
        observation, reward, done, info = env.step(stock_agent.action())

        if done:
            env.insert_summary_images(30)
            observation = env.reset()
            stock_agent.reset()

        if observation is not None:
            asx_observation = AsxObservation(observation)
            print(asx_observation.to_json_obj())
            print(info)

    env.close()
def main(argv=None):
    '''Evaluate agent performance against RandomAgent and AIAgent.'''
    logger = BriscolaLogger(BriscolaLogger.LoggerLevels.TEST)
    game = brisc.BriscolaGame(2, logger)

    # The agent to be evaluated is a QAgent if a model is provided,
    # otherwise a RandomAgent.
    if FLAGS.model_dir:
        eval_agent = QAgent(network=FLAGS.network)
        eval_agent.load_model(FLAGS.model_dir)
        eval_agent.make_greedy()
    else:
        eval_agent = RandomAgent()

    # Test the agent against a RandomAgent.
    agents = [eval_agent, RandomAgent()]
    total_wins, points_history = evaluate(game, agents, FLAGS.num_evaluations)
    stats_plotter(agents, points_history, total_wins)

    # Test the agent against an AIAgent.
    agents = [eval_agent, AIAgent()]
    total_wins, points_history = evaluate(game, agents, FLAGS.num_evaluations)
    stats_plotter(agents, points_history, total_wins)
def run_baseline_comparison_v5(n_games=2500):
    experiment_name = 'baseline_comparison_v5'

    agent10 = RandomAgent(distribution='uniform')
    agent11 = RandomAgent(distribution='uniform_on_types')
    agent12 = RandomAgent(distribution='first_buy')

    multi_arena = ArenaMultiThread()
    list_of_agents = [agent10, agent11, agent12]
    results = multi_arena.all_vs_all(list_of_agents, n_games)

    # Only the main thread prints and saves the reports.
    if main_thread:
        print(' \n \n {}'.format(results.to_pandas()))
        print('\n \n \n')
        print(results)

        results.to_pandas(param='wins').to_csv('wins.csv')
        results.to_pandas(param='victory_points').to_csv('victory_points.csv')
        results.to_pandas(param='reward').to_csv('reward.csv')

        # leader_board = LeaderBoard(list_of_agents)
        # leader_board.load_from_file()
        # leader_board.register_from_games_statistics(results)
        # print(leader_board)
        # leader_board.save_to_file()

        plt.title('Average win rate over {} games per pair:'.format(2 * n_games))
        wins_pic = results.create_heatmap(param='wins', average=True, p2=2)
        plt.savefig('reports/wins.png')
        plt.clf()

        plt.title('Average reward over {} games per pair:'.format(2 * n_games))
        reward_pic = results.create_heatmap('reward', average=True, p2=2)
        plt.savefig('reports/reward.png')
        plt.clf()

        plt.title('Average victory points over {} games per pair:'.format(
            2 * n_games))
        vic_points_pic = results.create_heatmap('victory_points', average=True, p2=2)
        plt.savefig('reports/victory_points.png')
        plt.clf()

        plt.title('Average games played over {} games per pair:'.format(
            2 * n_games))
        games_pic = results.create_heatmap('games', average=True,
                                           n_games=n_games, p2=2)
        plt.savefig('reports/games.png')
        plt.clf()
def run_experiment():
    trainer = MCTS_value_trainer()
    trainer.include_params_file(PARAMS_FILE)
    trainer.include_params_file('gym_splendor_code/envs/mechanics/game_settings.py')

    if not CLUSTER:
        trainer.run_training_games_multi_process(
            opponent_to_train='self',
            baselines=[RandomAgent(distribution='first_buy'), RandomAgent()],
            epochs=50,
            mcts_passes=15,
            n_test_games=0,
            exploration_ceofficient=0.41,  # keyword spelled as in the trainer's API
            experiment_name='MCTS local',
            weights_path='/home/tomasz/ML_Research/splendor/gym-splendor/archive/weights_tt1/',
            neural_network_train_epochs=1,
            reset_network=True,
            confidence_threshold=1,
            confidence_limit=4,
            count_ratio=0.8,
            replay_buffer_n_games=50,
            use_neptune=True,
            tags=['local-run'],
            source_files=[__file__, PARAMS_FILE])
    else:
        trainer.run_training_games_multi_process(
            opponent_to_train='self',
            baselines=[RandomAgent(distribution='first_buy'), GreedyAgentBoost()],
            epochs=250,
            mcts_passes=50,
            n_test_games=24,
            exploration_ceofficient=0.41,
            experiment_name='MCTS with NN',
            weights_path='/net/archive/groups/plggluna/plgtodrzygozdz/weights_temp/',
            neural_network_train_epochs=1,
            reset_network=True,
            confidence_threshold=1,
            confidence_limit=4,
            count_ratio=0.7,
            replay_buffer_n_games=100,
            use_neptune=True,
            tags=['cluster-run'],
            source_files=[__file__, PARAMS_FILE])
def main(episode_count):
    env = gym.make('CartPole-v0')
    agent = RandomAgent(env.action_space.n)

    for i in range(episode_count):
        observation = env.reset()  # initialize the environment
        done = False
        step = 0
        while not done:
            env.render()
            action = agent.act(observation)
            next_observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(step + 1))
            observation = next_observation
            step += 1
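# A minimal sketch of the RandomAgent interface assumed above: it takes the
# size of a discrete action space and ignores the observation when acting.
# This is an illustrative reconstruction, not the original class definition:
import random

class RandomAgent:

    def __init__(self, n_actions):
        self.n_actions = n_actions

    def act(self, observation):
        # Sample an action uniformly at random, ignoring the observation.
        return random.randrange(self.n_actions)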
def simulate(self, node: MCSTTreeNode):
    # Play a random rollout from the node's position and score the outcome.
    tmp_game = copy.deepcopy(node.game)
    tmp_game.debug = False
    tmp_agents = [RandomAgent(label) for label in tmp_game.labels]
    tmp_game.play(tmp_agents)
    winner = tmp_game.evaluate()
    return self.winner2score(winner, tmp_game.moves_num)
def train_agent(hype_space):
    print("----------------------")
    print("Evaluating model: ", hype_space)

    logger = BriscolaLogger(BriscolaLogger.LoggerLevels.TEST)
    game = brisc.BriscolaGame(2, logger)
    tf.reset_default_graph()

    # Initialize agents.
    agents = []
    agent = QAgent(
        0,
        hype_space['epsilon_increment'],
        hype_space['epsilon_max'],
        hype_space['discount'],
        NETWORK,
        hype_space['layers'],
        hype_space['learning_rate'],
        hype_space['replace_target_iter'])
    agents.append(agent)
    agents.append(RandomAgent())

    best_total_wins = train(game, agents, NUM_EPOCHS, EVALUATE_EVERY,
                            EVALUATE_FOR, MODEL_DIR)
    print("Best total wins ----->", best_total_wins)

    # The optimizer minimizes the objective, so return losses rather than wins.
    best_total_loses = EVALUATE_FOR - best_total_wins
    return best_total_loses
def create_agent(conf, action_space, observation_space):
    if conf['agent'] == "dqn":
        return DQNAgent(
            action_space, observation_space,
            batch_size=conf['batch_size'],
            learning_rate=conf['learning_rate'],
            discount=conf['discount'],
            epsilon=conf['random_explore'])
    elif conf['agent'] == "conv_dqn":
        return ConvDQNAgent(
            action_space, observation_space,
            batch_size=conf['batch_size'],
            learning_rate=conf['learning_rate'],
            discount=conf['discount'],
            epsilon=conf['random_explore'])
    elif conf['agent'] == "tabular_q":
        return TabularQAgent(
            action_space, observation_space,
            q_init=conf['q_value_init'],
            learning_rate=conf['learning_rate'],
            discount=conf['discount'],
            epsilon=conf['random_explore'])
    elif conf['agent'] == "random":
        return RandomAgent(action_space, observation_space)
    else:
        raise ValueError("Agent type [%s] is not supported." % conf['agent'])
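# Hypothetical usage of the factory above; the config keys mirror the ones
# create_agent reads, the values are placeholders, and env is assumed to be
# a gym-style environment:
conf = {
    'agent': 'tabular_q',
    'q_value_init': 0.0,
    'learning_rate': 0.1,
    'discount': 0.99,
    'random_explore': 0.05,
}
agent = create_agent(conf, env.action_space, env.observation_space)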
def run():
    agent1 = RandomAgent(mpi_communicator=comm)
    agent2 = RandomAgent(mpi_communicator=comm)
    # agent3 is used in the duels below, so it must be defined.
    agent3 = MultiMCTSAgent(1, 5, True, False)
    # random.randint.seed(100)

    arek = DeterministicMultiProcessArena()
    result = arek.run_one_duel_multi_process_deterministic(comm, [agent3, agent1])
    result2 = arek.run_one_duel_multi_process_deterministic(comm, [agent3, agent1])

    if main_process:
        print(result)
        print(result2)
def main():
    parser = setup_parser()
    args = parser.parse_args()
    env = gym.make(args.env)

    model_type = args.model
    if not model_type:
        raise ValueError("Please specify the model")

    model_config, train_config, load_config = get_configs(args)

    if model_type == "pg":
        model = PolicyGradient(env, **model_config)
    elif model_type == "ac":
        model = ActorCritic(env, **model_config)
    elif model_type == "gae":
        model = GeneralizedAdvantageEstimation(env, **model_config)
    elif model_type == "rnd":
        model = RandomAgent(env, **model_config)
    else:
        raise ValueError("Unknown model type: %s" % model_type)

    if args.load:
        model.load_model(**load_config)

    if args.train:
        reward_history, loss = model.train(**train_config)
        plot_rewards(reward_history)

    if args.evaluate:
        # Capture the evaluation results so they can be plotted below.
        evaluation_results = model.evaluate(n_episodes=10, n_steps=1000,
                                            render=args.render)
        plot_rewards(evaluation_results)
def produce_data(when_to_start, dump_p, n_games, filename, folder):
    list_of_agents = [RandomAgent(), GreedyAgentBoost(), MinMaxAgent()]
    arek = ArenaMultiThread()
    arek.start_collecting_states()
    arek.collect_only_from_middle_game(when_to_start, dump_p)
    arek.all_vs_all('deterministic', list_of_agents, n_games)
    arek.dump_collected_states(filename, folder)
def __init__(self, mode, iteration_limit, rollout_repetition, choose_best):
    assert mode == 'dqn', "Unsupported training mode: only 'dqn' is supported"
    self.iteration_limit = iteration_limit
    self.rollout_repetition = rollout_repetition
    self.data_collector = TreeDataCollector()
    self.opponent = RandomAgent(distribution='first_buy')
    self.choose_best = choose_best
    self.env = gym_open_ai.make('splendor-v0')
def set_agent(self, agent: Agent = None):
    # TODO: verify that the agent implements the required methods.
    if not agent:
        self.agent = RandomAgent()
    else:
        self.agent = agent
    self.agent.connect_player(self)
    print(f'Player {self.id} using {self.agent.__class__.__name__}')
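# Hypothetical usage; `player` and MyAgent are placeholders for a player
# instance and a custom Agent subclass:
player.set_agent()           # falls back to RandomAgent
player.set_agent(MyAgent())  # uses the supplied agent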
def main():
    env = TicTacToeEnv()
    model = ValueModel(env.feature_vector_size, 100)

    # agent = SimpleAgent('agent_0', model, env)
    # agent = TDAgent('agent_0', model, env)
    # agent = ForwardAgent('agent_0', model, env)
    # agent = BackwardAgent('agent_0', model, env)
    agent = LeafAgent('agent_0', model, env)
    random_agent = RandomAgent(env)

    log_dir = "./log/leaf"
    summary_op = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(log_dir)
    scaffold = tf.train.Scaffold(summary_op=summary_op)

    with tf.train.MonitoredTrainingSession(checkpoint_dir=log_dir,
                                           scaffold=scaffold) as sess:
        agent.sess = sess
        env.sess = sess

        while True:
            episode_count = sess.run(agent.episode_count)

            if episode_count % 1000 == 0:
                # Every 1000 episodes, test against the random agent.
                results = random_agent.test(agent)
                sess.run(agent.update_random_agent_test_results,
                         feed_dict={
                             random_agent_test_: result
                             for random_agent_test_, result in zip(
                                 agent.random_agent_test_s, results)
                         })
                print(episode_count, ':', results)

                # Terminate once both of these result counts reach zero.
                if results[2] + results[5] == 0:
                    final_summary = sess.run(summary_op)
                    summary_writer.add_summary(final_summary,
                                               global_step=episode_count)
                    break
            else:
                agent.train(.2)
                sess.run(agent.increment_episode_count)
def __init__(self):
    # Board dimensions
    self.nb_rows = 6
    self.nb_columns = 7
    self.done = False

    # nb_empty tracks the number of available spaces per column.
    self.nb_empty = [self.nb_rows] * self.nb_columns

    # Board state
    self.state = np.zeros((self.nb_rows, self.nb_columns), dtype=int)

    # Learn about spaces here: http://gym.openai.com/docs/#spaces
    self.action_space = spaces.Discrete(self.nb_columns)
    self.observation_space = spaces.Box(low=-1, high=1,
                                        shape=(self.nb_rows, self.nb_columns),
                                        dtype=int)

    # Tuple corresponding to the min and max possible rewards
    self.reward_range = (-10, 1)
    self.rewards = {
        "invalid": -10,
        "valid": 1 / 42,  # 42 = 6 * 7 cells: a small shaping reward per valid move
        "won": 1,
        "lost": -1,
        "draw": 0,
    }

    # Render properties
    self.render_tokens = {-1: 'x', 1: 'o', 0: ' '}

    # Random agent opponent
    self.opponent = RandomAgent(self.action_space, self.state)

    # StableBaselines throws an error if these are not defined.
    self.spec = None
    self.metadata = None
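# A quick smoke test of the spaces defined above. The enclosing class name,
# ConnectFourEnv, is an assumption; only __init__ is shown in the snippet:
env = ConnectFourEnv()
assert env.action_space.n == 7                # one action per column
assert env.observation_space.shape == (6, 7)  # rows x columns
assert env.reward_range == (-10, 1)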
def objective(trial):
    env = CurlingEnv(hard_mode=True)

    """
    agent1 = TDZero(str(random.randint(0, 100)),
                    training_mode=True,
                    alpha=trial.suggest_float('alpha', 0.1, 1.0),
                    gamma=trial.suggest_float('gamma', 0.1, 1.0),
                    epsilon=trial.suggest_float('epsilon', 0.9, 1.0),
                    decay_rate=trial.suggest_float('decay_rate', 0.9, 0.99999))
    """
    """
    agent1 = MonteCarlo("Monte Carlo",
                        training_mode=True,
                        action_space=env.action_space.n,
                        gamma=trial.suggest_float('gamma', 0.1, 1.0),
                        epsilon=trial.suggest_float('epsilon', 0.1, 1.0),
                        decay_rate=trial.suggest_float('decay_rate', 0.5, 0.9999))
    """
    agent1 = ActorCritic("Actor Critic",
                         training_mode=True,
                         action_space=env.action_space.n,
                         actor_lr=trial.suggest_float("actor_lr", 0.0001, 0.3),
                         critic_lr=trial.suggest_float("critic_lr", 0.0001, 0.3),
                         gamma=trial.suggest_float("gamma", 0.1, 0.9))
    agent2 = RandomAgent("Random", False, env.action_space.n)

    wins = []
    rolling_average = []
    for _ in tqdm(range(1000)):
        state = env.reset()
        coordinator = PlayerCoordinator(agent1, agent2, state)
        coordinator.start_episode()
        done = False
        while not done:
            action = coordinator.next_move(state)
            state, reward, done, _ = env.step(action)
            coordinator.inform_players(state, action, reward, done)
            coordinator.next_turn()
            if done:
                if reward[0] > reward[1]:
                    wins.append(1)
                else:
                    wins.append(0)
        coordinator.end_episode()

        if len(wins) > 100:
            rolling_average.append(np.mean(wins[-100:]))

    # Score the trial by the mean of the rolling win-rate averages.
    score = np.mean(rolling_average[-5000:])
    return score
def run(env_name, agent_name, nb_episodes, render_freq, render_mode):
    logger.set_level(logger.INFO)
    env = gym.make(env_name)

    # You provide the directory to write to (can be an existing directory,
    # including one with existing data -- all monitor files will be
    # namespaced). You can also dump to a tempdir if you'd like:
    # tempfile.mkdtemp().
    # outdir = '/tmp/random-agent-results'
    # video_callable = None if render_mode == 'human' else False
    # env = wrappers.Monitor(env, directory=outdir, force=True, video_callable=video_callable)
    # env = DynamicMonitor(env, directory=outdir, force=True, video_callable=video_callable)

    env.render(mode=render_mode)
    env.seed(0)

    if agent_name == 'RandomAgent':
        agent = RandomAgent(env.env.action_space)
    elif agent_name == 'EpsilonGreedyAgent':
        agent = EpsilonGreedy(env.env.action_space)
    elif agent_name == 'GradientBanditAgent':
        agent = GradientBandit(env.env.action_space)
    elif agent_name == 'ucb':
        agent = ucb(env.env.action_space)
    elif agent_name == 'ThompsonSampling':
        agent = ThompsonSampling(env.env.action_space)

    step = 0
    reward = 0
    done = False

    for episode in range(nb_episodes):
        print(f'--------- Episode {episode} ---------')
        ob = env.reset()
        agent = agent.reset()
        while True:
            step += 1
            # The action space may have changed:
            # agent = EpsilonGreedy(env.env.action_space)
            action = agent.act(ob, reward, done)
            ob, reward, done, _ = env.step(action)
            if done:
                break
            if step % render_freq == 0:
                env.render()
        # Apart from the periodic render above, the environment can still
        # render if asked by env.monitor: it calls env.render('rgb_array') to
        # record video. Video is not recorded every episode; see
        # capped_cubic_video_schedule for details.

    # Close the env and write monitor result info to disk.
    env.env.close()
def add_player(self, name, player_type, params={}):
    if name in self._players:
        print(f"Player '{name}' already exists.")
        return

    player = [player_type]
    if player_type == "minimax":
        player.append(MiniMaxAgent(self, params))
    elif player_type == "deep-q":
        player.append(DeepQAgent(self, params))
    elif player_type == "random":
        player.append(RandomAgent(self, params))

    self._players[name] = player
    self._board.add_player(name)
class RandomizedAgent(Agent):

    def __init__(self, epsilon):
        super().__init__()
        self.smart_agent = GreedySearchAgent()
        self.epsilon = epsilon
        self.random_agent = RandomAgent(distribution='uniform')

    def choose_action(self, observation: SplendorObservation,
                      previous_actions: List[Action]):
        # With probability epsilon play a uniform-random action,
        # otherwise defer to the greedy agent.
        p = np.random.uniform(0, 1)
        if p < self.epsilon:
            return self.random_agent.choose_action(observation, previous_actions)
        else:
            return self.smart_agent.choose_action(observation, previous_actions)
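# Hypothetical usage: with epsilon=0.1 the agent plays GreedySearchAgent's
# move about 90% of the time and a uniform-random action about 10% of the
# time.
agent = RandomizedAgent(epsilon=0.1)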
def _make_frames(set_: Set, n: int):
    # Build the wrapped environment: right-only controls, frame skipping,
    # frame copying and stacking, and random no-ops on reset.
    e = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
    e = JoypadSpace(e, RIGHT_ONLY)
    e = SkipWrapper(e, 5)
    e = CopyFrame(e)
    e = FrameStack(e, 4)
    e = NoopResetEnv(e, 4)

    a = RandomAgent(env=e)
    play(
        a,
        e,
        frames_directory=set_.path / "img",
        display=False,
        n=n * 10,
        save_each=15,
        state2img=lambda frames: cv2.cvtColor(np.vstack(frames._frames),
                                              cv2.COLOR_RGB2BGR),
    )
def main(argv=None):
    # Initialize the environment.
    logger = BriscolaLogger(BriscolaLogger.LoggerLevels.TRAIN)
    game = brisc.BriscolaGame(2, logger)

    # Initialize the agents: a QAgent learner and a RandomAgent opponent.
    agents = []
    agent = QAgent(FLAGS.epsilon, FLAGS.epsilon_increment, FLAGS.epsilon_max,
                   FLAGS.discount, FLAGS.network, FLAGS.layers,
                   FLAGS.learning_rate, FLAGS.replace_target_iter,
                   FLAGS.batch_size)
    agents.append(agent)
    agents.append(RandomAgent())

    train(game, agents, FLAGS.num_epochs, FLAGS.evaluate_every,
          FLAGS.num_evaluations, FLAGS.model_dir)
def full_training(self, n_repetitions, alpha, epochs):
    self.prepare_training()
    for i in range(n_repetitions):
        if main_process:
            print('Game number = {}'.format(i))
        self.run_self_play('deterministic', alpha=alpha, epochs=epochs)

        # Evaluate the current MCTS agent against a random baseline.
        agent_to_test = self.mcts_agent
        arena = MultiArena()
        results = arena.run_many_duels(
            'deterministic',
            [agent_to_test, RandomAgent(distribution='first_buy')],
            1, 24)

        if main_process:
            self.eval_policy.model.save_weights('Weights_i = {}.h5'.format(i))
            with open("Results_{}.txt".format(i), "w") as text_file:
                text_file.write(results.__repr__())
def run_comparison(n_games=1000):
    gohan = GreedyAgentBoost()

    goku = RandomAgent(distribution='uniform')
    print(fight_pit.run_many_duels([goku, gohan], number_of_games=n_games,
                                   shuffle_agents=True))

    goku = RandomAgent(distribution='uniform_on_types')
    print(fight_pit.run_many_duels([goku, gohan], number_of_games=n_games,
                                   shuffle_agents=True))

    goku = RandomAgent(distribution='first_buy')
    print(fight_pit.run_many_duels([goku, gohan], number_of_games=n_games,
                                   shuffle_agents=True))

    gohan = GreedyAgentBoost(weight=[100, 2.5, 1.5, 1, 0.1])

    goku = RandomAgent(distribution='uniform')
    print(fight_pit.run_many_duels([goku, gohan], number_of_games=n_games,
                                   shuffle_agents=True))

    goku = RandomAgent(distribution='uniform_on_types')
    print(fight_pit.run_many_duels([goku, gohan], number_of_games=n_games,
                                   shuffle_agents=True))

    goku = RandomAgent(distribution='first_buy')
    print(fight_pit.run_many_duels([goku, gohan], number_of_games=n_games,
                                   shuffle_agents=True))
from agents.random_agent import RandomAgent
from agents.greedy_agent import GreedyAgent, GreedyAgentBoost
from arena import Arena
import time
import random
import numpy as np

fight_pit = Arena()

goku = RandomAgent(distribution='first_buy')
# gohan = RandomAgent(distribution='uniform_on_types')
# gohan = RandomAgent(distribution='uniform')
gohan = GreedyAgent(weight=0.08)

g1 = GreedyAgentBoost("Greedy1", [100, 2, 2, 1, 0.1])
g2 = GreedyAgentBoost("Greedy2", [0, 0, 0, 0, 0])
g3 = GreedyAgentBoost("Greedy3", [10, 2, 2, 1, 0.2])
g4 = GreedyAgentBoost("Greedy4", [100, 0, 0, 1, 0.1])
g5 = GreedyAgentBoost(
    "Greedy5", [0.99954913, 0.01997425, 0.02001405, 0.01004779, 0.00101971])
g6 = GreedyAgentBoost(
    "Greedy6", [0.99953495, 0.02010871, 0.02010487, 0.01095619, 0.00113329])

gv1 = RandomAgent(distribution='first_buy')
gv2 = GreedyAgent(weight=0.1)
gv3 = GreedyAgentBoost("RandomAgent", [0, 0, 0, 0, 0])

g_list = {g1, g2, g3, g4, g5, g6}
gv_list = [gv1, gv2, gv3]
g_list_remove = set()
lr = 0.000005
def __init__(self,
             gems_encoder_dim: int = None,
             price_encoder_dim: int = None,
             profit_encoder_dim: int = None,
             cards_points_dim: int = None,
             cards_dense1_dim: int = None,
             cards_dense2_dim: int = None,
             board_nobles_dense1_dim: int = None,
             board_nobles_dense2_dim: int = None,
             full_board_dense1_dim: int = None,
             full_board_dense2_dim: int = None,
             player_points_dim: int = None,
             player_nobles_dim: int = None,
             full_player_dense1_dim: int = None,
             full_player_dense2_dim: int = None,
             final_layer=None,
             data_transformer=None,
             network_name: str = None):
    super().__init__()
    self.vectorizer = Vectorizer()
    self.final_layer = final_layer
    self.data_transformer = data_transformer

    # Record the hyperparameters for experiment tracking.
    self.params['data transormation'] = self.data_transformer.name
    self.params['final layer name'] = self.final_layer.name
    self.params['gems_encoder_dim'] = gems_encoder_dim
    self.params['price_encoder_dim'] = price_encoder_dim
    self.params['profit_encoder_dim'] = profit_encoder_dim
    self.params['cards_points_dim'] = cards_points_dim
    self.params['cards_dense1_dim'] = cards_dense1_dim
    self.params['cards_dense2_dim'] = cards_dense2_dim
    self.params['board_nobles_dense1_dim'] = board_nobles_dense1_dim
    self.params['board_nobles_dense2_dim'] = board_nobles_dense2_dim
    self.params['full_board_dense1_dim'] = full_board_dense1_dim
    self.params['full_board_dense2_dim'] = full_board_dense2_dim
    self.params['player_points_dim'] = player_points_dim
    self.params['player_nobles_dim'] = player_nobles_dim
    self.params['full_player_dense1_dim'] = full_player_dense1_dim
    self.params['full_player_dense2_dim'] = full_player_dense2_dim

    self.arena = Arena()
    self.network_agent = ValueNNAgent(self)
    self.easy_opp = RandomAgent(distribution='first_buy')
    self.medium_opp = GreedyAgentBoost()
    self.hard_opp = MinMaxAgent()
    self.neptune_monitor = NeptuneMonitor()
    self.network_name = network_name

    self.gems_encoder = GemsEncoder(gems_encoder_dim)
    self.price_encoder = PriceEncoder(price_encoder_dim)
    self.board_encoder = BoardEncoder(
        self.gems_encoder,
        ManyNoblesEncoder(price_encoder_dim,
                          board_nobles_dense1_dim,
                          board_nobles_dense2_dim),
        ManyCardsEncoder(MAX_CARDS_ON_BORD,
                         profit_encoder_dim,
                         price_encoder_dim,
                         cards_points_dim,
                         cards_dense1_dim,
                         cards_dense2_dim),
        full_board_dense1_dim,
        full_board_dense2_dim)
    self.player_encoder = PlayerEncoder(
        self.gems_encoder,
        self.price_encoder,
        ManyCardsEncoder(MAX_RESERVED_CARDS,
                         profit_encoder_dim,
                         price_encoder_dim,
                         cards_points_dim,
                         cards_dense1_dim,
                         cards_dense2_dim),
        player_points_dim,
        player_nobles_dim,
        full_player_dense1_dim,
        full_player_dense2_dim)

    active_player_input = PlayersInputGenerator('active_').inputs
    other_player_input = PlayersInputGenerator('other_').inputs
    board_input = self.board_encoder.inputs
    self.inputs = board_input + active_player_input + other_player_input

    board_encoded = self.board_encoder(board_input)
    active_player_encoded = self.player_encoder(active_player_input)
    other_player_encoded = self.player_encoder(other_player_input)
    full_state = Concatenate(axis=-1)([board_encoded,
                                       active_player_encoded,
                                       other_player_encoded])
    full_state = Dense(full_player_dense1_dim, activation='relu')(full_state)
    final_state = Dense(full_player_dense2_dim, activation='relu')(full_state)
    result = self.final_layer(final_state)

    self.layer = Model(inputs=self.inputs, outputs=final_state,
                       name='full_state_splendor_estimator')
    self.network = Model(inputs=self.inputs, outputs=result,
                         name='full_state_splendor_estimator')
    self.network.compile(Adam(), loss='mean_squared_error')

    self.params['Model name'] = 'Average pooling model'
    self.params['optimizer_name'] = 'Adam'
        # ... (the opening of play_game and the head of its game loop are
        # not included in this excerpt) ...
            break
        reward, terminate = s.step(agents[agent].get_move(s), agent)
        if visualise:
            print(s)
        if terminate:
            if stop_point == -1:
                stop_point = agent
        agent = (agent + 1) % len(agents)

    if visualise:
        print("Game Over!\n")

    best_score = -math.inf
    best_agent = None
    for agent in agents:
        if visualise:
            print(str(agent) + " ended with " + str(s.scores[agent.index]))
        if s.scores[agent.index] > best_score:
            best_score = s.scores[agent.index]
            best_agent = agent
    if visualise:
        print(str(best_agent) + " wins!")

    return {agent: score for agent, score in zip(agents, s.scores)}


if __name__ == "__main__":
    play_game(agents=[OneLookAheadAgent(0, 2), RandomAgent(1, 2)],
              visualise=True)
import gym
import gym_connect4

from agents.random_agent import RandomAgent

if __name__ == "__main__":
    # Build environment
    print("[.] Build Environment")
    env = gym.make('gym_connect4:connect4-v0')

    # Create random agent
    print("[.] Create Random Agent")
    agent = RandomAgent(env.action_space, env.observation_space)
    print(env.action_space.n)

    # Init environment
    done = False
    obs = env.reset()

    # Run game
    print("[.] Running game")
    while not done:
        obs, reward, done, info = env.step(agent.get_action(obs))

    # Final render
    print("[+] Done.")
    print("Infos: ", info)
    print("Final board: ")
    env.render()

    # Close environment
    env.close()
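# A minimal sketch of a RandomAgent compatible with the interface used above
# (a constructor taking the action and observation spaces, and a get_action
# method). This is an assumption for illustration, not the package's actual
# class:
class RandomAgent:

    def __init__(self, action_space, observation_space):
        self.action_space = action_space
        self.observation_space = observation_space

    def get_action(self, obs):
        # Ignore the observation and sample a random column index.
        return self.action_space.sample()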
import gin

from gym_splendor_code.envs.mechanics.abstract_observation import DeterministicObservation
from gym_splendor_code.envs.mechanics.state import State
from nn_models.architectures.average_pool_v0 import StateEncoder, ValueRegressor, IdentityTransformer

gin.parse_config_file(
    '/home/tomasz/ML_Research/splendor/gym-splendor/experiments/MCTS_series_1/params.gin'
)

from agents.random_agent import RandomAgent
from agents.single_mcts_agent import SingleMCTSAgent
from arena.arena import Arena
from monte_carlo_tree_search.evaluation_policies.value_evaluator_nn import ValueEvaluator
from monte_carlo_tree_search.mcts_algorithms.single_process.single_mcts import SingleMCTS

arek = Arena()
a1 = RandomAgent()
a2 = SingleMCTSAgent(5, ValueEvaluator(), 0.6, True, True)

# results = arek.run_one_duel('deterministic', [a1, a2])
# state1 = State()
# fufu = SingleMCTS(5, 0.6, ValueEvaluator())
# fufu.create_root(DeterministicObservation(state1))
# fufu.run_mcts_pass()
def main():
    agent1 = RandomAgent(Reversi())
    agent1.name = "1"
    agent1.color = BLACK

    agent2 = RandomMonteCarloAgent(Reversi(), 1)
    agent2.name = "2"
    agent2.color = WHITE

    engine_ref = Reversi()
    agent_ref = EdaxAgent(engine_ref)

    iteration = 1000
    while True:
        predict_correct = 0
        predict_wrong = 0
        total = 1000

        # file_name = 'training/saved_conv_networks/reversi-nn-' + str(iteration)
        # while not os.path.isfile(file_name):
        #     time.sleep(60)
        # time.sleep(1)
        # agent2.load_nn(file_name)

        for _ in range(total):
            agent1.reset_engine()
            agent2.reset_engine()
            agent_ref.reset_engine()

            current_agent = agent1
            opponent_agent = agent2
            ignore_moves = 6

            while not current_agent.get_engine().is_full():
                move = current_agent.get_best_move()
                assert current_agent.color != current_agent.get_engine().get_current_player()

                if move is None:
                    # print('No move for ', current_agent.name)
                    tmp_engine = copy.deepcopy(current_agent.get_engine())
                    if not tmp_engine.get_legal_moves():
                        tmp_engine.apply_move(None)
                        if not tmp_engine.get_legal_moves():
                            # Game finished: neither player can move.
                            break

                # Compare agent2's move against the Edax reference move,
                # skipping the opening moves.
                if ignore_moves <= 0 and move is not None and current_agent == agent2:
                    move_ref = agent_ref.get_best_move_without_apply()
                    print(str(move_ref) + ', ' + str(move))
                    if move_ref == move:
                        predict_correct += 1
                    else:
                        predict_wrong += 1

                if ignore_moves > 0:
                    ignore_moves -= 1

                agent_ref.apply_opponent_move(move)
                opponent_agent.apply_opponent_move(move)
                assert opponent_agent.color == opponent_agent.get_engine().get_current_player()

                if current_agent is agent1:
                    current_agent = agent2
                    opponent_agent = agent1
                else:
                    current_agent = agent1
                    opponent_agent = agent2

        print('result: ' + str(predict_correct) + ', ' + str(predict_wrong))
        print('correct: ' + str(float(predict_correct) / (predict_correct + predict_wrong)))
        iteration += 1000