def args_evaluate(args):
    # Unpack the evaluation arguments: two saved models, the network type,
    # the hidden-layer sizes and the number of games to play.
    model_agent0 = args.model_agent0
    model_agent1 = args.model_agent1
    model_type = args.type
    hidden_units_agent0 = args.hidden_units_agent0
    hidden_units_agent1 = args.hidden_units_agent1
    n_episodes = args.episodes

    if path_exists(model_agent0) and path_exists(model_agent1):
        if model_type == 'nn':
            net0 = TDGammon(hidden_units=hidden_units_agent0, lr=0.1, lamda=None, init_weights=False)
            net1 = TDGammon(hidden_units=hidden_units_agent1, lr=0.1, lamda=None, init_weights=False)
            env = gym.make('gym_backgammon:backgammon-v0')
        else:
            net0 = TDGammonCNN(lr=0.0001)
            net1 = TDGammonCNN(lr=0.0001)
            env = gym.make('gym_backgammon:backgammon-pixel-v0')

        net0.load(checkpoint_path=model_agent0, optimizer=None, eligibility_traces=False)
        net1.load(checkpoint_path=model_agent1, optimizer=None, eligibility_traces=False)

        # Agent 1 plays WHITE, agent 0 plays BLACK.
        agents = {WHITE: TDAgent(WHITE, net=net1), BLACK: TDAgent(BLACK, net=net0)}

        evaluate_agents(agents, env, n_episodes)
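# The path_exists helper used above (and in args_gui / args_plot below) is not
# shown in this section; a minimal sketch of what it is assumed to do, i.e.
# validate a path and report a missing one with the message used elsewhere in
# this project. The real implementation may differ.
import os

def path_exists(path):
    # Return True if the path exists on disk, otherwise print a warning and return False.
    if os.path.exists(path):
        return True
    print("The path {} doesn't exist".format(path))
    return False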
def main(args):
    # Environment.
    env = GridWorld()
    # TD agent with a fixed learning rate and trace decay.
    agent = TDAgent(env, epsilon=args.epsilon, gamma=args.discount, alpha=0.05, lamda=0.7)
    agent.control(method=args.algorithm)
def evaluate(existing_model_path, num_episodes=100, num_hidden_units=(40,), starting_alpha=0.1,
             starting_lamda=0.9, min_alpha=0.1, min_lamda=0.7, alpha_decay=1, lamda_decay=0.96,
             alpha_decay_interval=1, lamda_decay_interval=3e4, hidden_activation=nn.Sigmoid(),
             num_inputs=198, opponent="pubeval"):
    """
    Evaluate a saved model against an opponent and print the model's win rate.
    :param existing_model_path: String. Path of the saved model.
    :param num_episodes: Integer. Number of games to play per model.
    :param num_hidden_units: See EvaluationModel class.
    :param starting_alpha: See EvaluationModel class.
    :param starting_lamda: See EvaluationModel class.
    :param min_alpha: See EvaluationModel class.
    :param min_lamda: See EvaluationModel class.
    :param alpha_decay: See EvaluationModel class.
    :param lamda_decay: See EvaluationModel class.
    :param alpha_decay_interval: See EvaluationModel class.
    :param lamda_decay_interval: See EvaluationModel class.
    :param hidden_activation: See EvaluationModel class.
    :param num_inputs: See EvaluationModel class.
    :param opponent: "pubeval" or "random".
    """
    model = EvaluationModel(num_inputs=num_inputs, num_hidden_units=num_hidden_units,
                            starting_alpha=starting_alpha, starting_lamda=starting_lamda,
                            min_alpha=min_alpha, min_lamda=min_lamda,
                            alpha_decay=alpha_decay, lamda_decay=lamda_decay,
                            alpha_decay_interval=alpha_decay_interval,
                            lamda_decay_interval=lamda_decay_interval,
                            hidden_activation=hidden_activation)
    model.load(checkpoint_path=existing_model_path)

    # Pick the opponent; the TD agent always plays as player 1.
    if opponent == "pubeval":
        opponent_agent = PubevalAgent(0)
    else:
        opponent_agent = RandomAgent(0)

    agents = [opponent_agent, TDAgent(1, model)]

    # Play the requested number of games and count wins per player index.
    wins = [0, 0]
    for _ in range(num_episodes):
        game = Game(agents)
        wins[game.play()] += 1

    print("\n{}: \t{}".format(existing_model_path, float(wins[1]) / float(sum(wins))))
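# Example usage of evaluate(), a minimal sketch: the checkpoint path below is a
# hypothetical placeholder and must point at a model actually saved by this
# project; the remaining defaults match the signature above.
if __name__ == "__main__":
    evaluate(existing_model_path="checkpoints/td_gammon_eval.tar",  # hypothetical path
             num_episodes=100,
             opponent="random")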
def args_gui(args):
    if path_exists(args.model):
        if args.type == 'nn':
            net = TDGammon(hidden_units=args.hidden_units, lr=0.1, lamda=None, init_weights=False)
            env = gym.make('gym_backgammon:backgammon-v0')
        else:
            net = TDGammonCNN(lr=0.0001)
            env = gym.make('gym_backgammon:backgammon-pixel-v0')

        net.load(checkpoint_path=args.model, optimizer=None, eligibility_traces=False)

        # The trained network plays BLACK; the human plays WHITE through the GUI.
        agents = {BLACK: TDAgent(BLACK, net=net), WHITE: HumanAgent(WHITE)}
        gui = GUI(env=env, host=args.host, port=args.port, agents=agents)
        gui.run()
def main(args):
    env = GridWorld()
    agent = TDAgent(env, epsilon=args.epsilon, gamma=args.discount, alpha=args.lr)
    agent.control(method=args.algorithm)
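# A possible command-line entry point for main(), a sketch only: the flag names
# mirror the attributes read above, but the default values and the choice of
# algorithm are assumptions, not settings taken from this repository.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Tabular TD control on GridWorld")
    parser.add_argument("--epsilon", type=float, default=0.1)      # assumed default
    parser.add_argument("--discount", type=float, default=0.99)    # assumed default
    parser.add_argument("--lr", type=float, default=0.05)          # assumed default
    parser.add_argument("--algorithm", type=str, default="sarsa")  # assumed choice
    main(parser.parse_args())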
def args_plot(args, parser):
    '''
    This method is used to plot the number of times an agent wins when it plays against an opponent.
    Instead of evaluating the agent during training (it can require some time and slow down the training),
    I decided to plot the wins separately, loading the different models saved during training.
    For example, suppose I run the training for 100 games and save my model every 10 games.
    Later I will load these 10 models, and for each of them, I will compute how many times the agent would win against an opponent.
    :return: None
    '''
    src = args.save_path
    hidden_units = args.hidden_units
    n_episodes = args.episodes
    opponents = args.opponent.split(',')
    host = args.host
    port = args.port
    difficulties = args.difficulty.split(',')
    model_type = args.type

    if path_exists(src):
        # Validate the requested gnubg difficulty levels before doing any work.
        for d in difficulties:
            if d not in ['beginner', 'intermediate', 'advanced', 'world_class']:
                parser.error("--difficulty should be (one or more of) 'beginner', 'intermediate', 'advanced', 'world_class'")

        dst = args.dst

        if 'gnubg' in opponents and (not host or not port):
            parser.error("--host and --port are required when 'gnubg' is specified in --opponent")

        for root, dirs, files in os.walk(src):
            global_step = 0
            files = sorted(files)
            writer = SummaryWriter(dst)

            for file in files:
                if ".tar" in file:
                    print("\nLoad {}".format(os.path.join(root, file)))

                    if model_type == 'nn':
                        net = TDGammon(hidden_units=hidden_units, lr=0.1, lamda=None, init_weights=False)
                        env = gym.make('gym_backgammon:backgammon-v0')
                    else:
                        net = TDGammonCNN(lr=0.0001)
                        env = gym.make('gym_backgammon:backgammon-pixel-v0')

                    net.load(checkpoint_path=os.path.join(root, file), optimizer=None, eligibility_traces=False)

                    if 'gnubg' in opponents:
                        # Evaluate this checkpoint against gnubg at each requested difficulty.
                        tag_scalar_dict = {}
                        gnubg_interface = GnubgInterface(host=host, port=port)

                        for difficulty in difficulties:
                            gnubg_env = GnubgEnv(gnubg_interface, difficulty=difficulty, model_type=model_type)
                            wins = evaluate_vs_gnubg(agent=TDAgentGNU(WHITE, net=net, gnubg_interface=gnubg_interface),
                                                     env=gnubg_env, n_episodes=n_episodes)
                            tag_scalar_dict[difficulty] = wins[WHITE]

                        writer.add_scalars('wins_vs_gnubg/', tag_scalar_dict, global_step)

                        with open(root + '/results.txt', 'a') as f:
                            print("{};".format(file) + str(tag_scalar_dict), file=f)

                    if 'random' in opponents:
                        # Evaluate this checkpoint against a random agent.
                        tag_scalar_dict = {}
                        agents = {WHITE: TDAgent(WHITE, net=net), BLACK: RandomAgent(BLACK)}
                        wins = evaluate_agents(agents, env, n_episodes)
                        tag_scalar_dict['random'] = wins[WHITE]
                        writer.add_scalars('wins_vs_random/', tag_scalar_dict, global_step)

                    global_step += 1

            writer.close()
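# A sketch of how args_plot might be driven from the command line; the flag
# names follow the attributes read above, but the defaults and the exact parser
# wiring used in this project may differ.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Plot win rates of the models saved during training")
    parser.add_argument("--save_path", required=True)             # directory containing .tar checkpoints
    parser.add_argument("--dst", required=True)                   # TensorBoard log directory
    parser.add_argument("--hidden_units", type=int, default=40)   # assumed default
    parser.add_argument("--episodes", type=int, default=20)       # assumed default
    parser.add_argument("--opponent", default="random")           # e.g. "random" or "gnubg,random"
    parser.add_argument("--difficulty", default="beginner")       # comma-separated gnubg levels
    parser.add_argument("--host", default=None)                   # gnubg host, required with gnubg
    parser.add_argument("--port", default=None)                   # gnubg port, required with gnubg
    parser.add_argument("--type", default="nn")                   # 'nn' or the CNN model
    args = parser.parse_args()
    args_plot(args, parser)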
def train_agent(self, env, n_episodes, save_path=None, eligibility=False, save_step=0, name_experiment=''):
    start_episode = self.start_episode
    n_episodes += start_episode

    wins = {WHITE: 0, BLACK: 0}
    network = self

    # Self-play: the same network controls both colors.
    agents = {WHITE: TDAgent(WHITE, net=network), BLACK: TDAgent(BLACK, net=network)}

    durations = []
    steps = 0
    start_training = time.time()

    for episode in range(start_episode, n_episodes):
        if eligibility:
            self.init_eligibility_traces()

        agent_color, first_roll, observation = env.reset()
        agent = agents[agent_color]

        t = time.time()

        for i in count():
            if first_roll:
                roll = first_roll
                first_roll = None
            else:
                roll = agent.roll_dice()

            p = self(observation)

            actions = env.get_valid_actions(roll)
            action = agent.choose_best_action(actions, env)
            observation_next, reward, done, winner = env.step(action)

            p_next = self(observation_next)

            if done:
                if winner is not None:
                    # Terminal TD update towards the actual game outcome.
                    loss = self.update_weights(p, reward)
                    wins[agent.color] += 1

                tot = sum(wins.values())
                tot = tot if tot > 0 else 1

                print("Game={:<6d} | Winner={} | after {:<4} plays || Wins: {}={:<6}({:<5.1f}%) | {}={:<6}({:<5.1f}%) | Duration={:<.3f} sec".format(
                    episode + 1, winner, i,
                    agents[WHITE].name, wins[WHITE], (wins[WHITE] / tot) * 100,
                    agents[BLACK].name, wins[BLACK], (wins[BLACK] / tot) * 100,
                    time.time() - t))

                durations.append(time.time() - t)
                steps += i
                break
            else:
                # Non-terminal TD update towards the next state's estimate.
                loss = self.update_weights(p, p_next)

            agent_color = env.get_opponent_agent()
            agent = agents[agent_color]
            observation = observation_next

        if save_path and save_step > 0 and episode > 0 and (episode + 1) % save_step == 0:
            self.checkpoint(checkpoint_path=save_path, step=episode, name_experiment=name_experiment)
            agents_to_evaluate = {WHITE: TDAgent(WHITE, net=network), BLACK: RandomAgent(BLACK)}
            evaluate_agents(agents_to_evaluate, env, n_episodes=20)
            print()

    print("\nAverage duration per game: {} seconds".format(round(sum(durations) / n_episodes, 3)))
    print("Average game length: {} plays | Total Duration: {}".format(
        round(steps / n_episodes, 2), datetime.timedelta(seconds=int(time.time() - start_training))))

    if save_path:
        self.checkpoint(checkpoint_path=save_path, step=n_episodes - 1, name_experiment=name_experiment)

        with open('{}/comments.txt'.format(save_path), 'a') as file:
            file.write("Average duration per game: {} seconds".format(round(sum(durations) / n_episodes, 3)))
            file.write("\nAverage game length: {} plays | Total Duration: {}".format(
                round(steps / n_episodes, 2), datetime.timedelta(seconds=int(time.time() - start_training))))

    env.close()
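# A minimal training sketch, assuming train_agent is a method of the TDGammon
# network and the backgammon environment shown above; the hidden-unit count,
# hyper-parameters and episode count are illustrative values, not settings
# taken from this repository.
if __name__ == "__main__":
    env = gym.make('gym_backgammon:backgammon-v0')
    net = TDGammon(hidden_units=40, lr=0.1, lamda=0.7, init_weights=True)  # assumed hyper-parameters
    net.train_agent(env=env, n_episodes=1000, save_path=None, eligibility=True, save_step=0)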
easy_inputs = {"e", "easy"} medium_inputs = {"m", "med", "medium"} hard_inputs = {"h", "hard"} while difficulty_input not in easy_inputs.union(medium_inputs).union( hard_inputs): difficulty_input = input( "Select difficulty level: EASY (E), MEDIUM (M), HARD (H)").lower() if difficulty_input in easy_inputs: difficulty = Difficulty.EASY elif difficulty_input in medium_inputs: difficulty = Difficulty.MEDIUM elif difficulty_input in hard_inputs: difficulty = Difficulty.HARD v = vision.Vision() td_agent = TDAgent(WHITE, model, v, difficulty) human_agent = HumanAgent(BLACK, v) agents_list = [td_agent, human_agent] game = Game(agents_list) set_start_state = False if set_start_state: start_points = [[ 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], [ 0, 0, 3, 3, 2, 2, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]] start_bar = [0, 2]