def __init__(self):
    """Configure the agent from parameters fetched with ``rospy.get_param``.

    All keys live under the ``/cartpole_v0/`` prefix. ``state_size`` and
    ``learning_rate`` are stored on the instance; ``state_size`` and the
    remaining values are passed to ``QLearningAgent.__init__`` as keyword
    arguments. ``learning_rate`` is not part of that call.
    """
    prefix = '/cartpole_v0/'

    # Values kept as instance attributes.
    self.state_size = rospy.get_param(prefix + 'state_size')
    self.learning_rate = rospy.get_param(prefix + 'learning_rate')

    # Values only forwarded to the base initialiser. The dict literal is
    # evaluated top to bottom, so parameters are fetched in the same order
    # as the keyword names are listed here.
    forwarded = {
        'action_size': rospy.get_param(prefix + 'n_actions'),
        'gamma': rospy.get_param(prefix + 'gamma'),
        'epsilon': rospy.get_param(prefix + 'epsilon'),
        'epsilon_decay': rospy.get_param(prefix + 'epsilon_decay'),
        'epsilon_min': rospy.get_param(prefix + 'epsilon_min'),
        'batch_size': rospy.get_param(prefix + 'batch_size'),
    }

    QLearningAgent.__init__(self, state_size=self.state_size, **forwarded)
def _main_learning_curve(plan_ids, n_iter):
    """Plot learning curves of several agents for each requested map.

    For every id in ``plan_ids`` an environment is created with a default
    reward of -0.001, a ``PolicyIterationAgent`` is solved on its MDP and used
    both as the "Optimal" agent and as the evaluator handed to
    ``get_learning_curve``. Two figures are then drawn per map, one titled
    "Policy loss" (first element of each curve result) and one titled
    "Cumulated reward" (second element), and shown with ``plt.show()``.

    Args:
        plan_ids: iterable of map identifiers passed to ``get_env``.
        n_iter: forwarded to ``get_learning_curve`` as ``n_iter``.
    """
    for plan_id in plan_ids:
        env = get_env(plan_id, default_reward=-0.001)
        mdp = env.getMDP()[1]
        non_terminal_states = mdp.keys()

        # The action list is taken from the first state listed in the MDP.
        first_state, _ = next(iter(mdp.items()))
        action_space = list(mdp[first_state].keys())

        evaluator = PolicyIterationAgent(mdp.keys(), env.action_space, mdp)
        evaluator.compute_best_policy()

        # (legend label, line colour, agent)
        agents = [
            ("Optimal", "black", evaluator),
            ("Random", "gray", RandomAgent(env.action_space)),
            ("QLearning", "red", QLearningAgent(action_space)),
            ("Sarsa", "blue", SarsaAgent(action_space)),
            ("Dyna-Q", "green", DynaQAgent(action_space, non_terminal_states,
                                           lr_R=0.5, lr_P=0.5, k=5)),
        ]

        # Run every agent before drawing anything.
        curves = []
        for name, color, agent in agents:
            outcome = get_learning_curve(env, agent, name=name,
                                         evaluator=evaluator, n_iter=n_iter)
            curves.append((name, color, outcome))

        # (figure title, index of the series inside a curve result)
        panels = (("Policy loss", 0), ("Cumulated reward", 1))
        for panel_no, (title, series_idx) in enumerate(panels):
            if panel_no > 0:
                # The first panel uses the current figure; later ones get
                # a figure of their own.
                plt.figure()
            plt.suptitle("map " + str(plan_id), fontsize=16)
            plt.title(title)
            for name, color, outcome in curves:
                policy_values, cum_reward, action_count = outcome
                series = (policy_values, cum_reward)[series_idx]
                # The optimal agent is drawn dashed.
                fmt = ("--",) if name == "Optimal" else ()
                plt.plot(series, *fmt, label=name, color=color)
            plt.legend()

        plt.show()
def _main_demo(plan_id=0):
    """Run a Q-learning agent for 1000 episodes on one map and print results.

    The environment comes from ``get_env(plan_id)``. One line is printed at
    the end of every episode (episode index, summed reward, number of
    actions), followed by the mean and standard deviation of the summed
    rewards over all episodes. The environment is closed before returning.

    Args:
        plan_id: map identifier passed to ``get_env``.
    """
    env = get_env(plan_id)
    mdp = env.getMDP()[1]

    # The action list is taken from the first state listed in the MDP.
    first_state, _ = next(iter(mdp.items()))
    actions = list(mdp[first_state].keys())
    agent = QLearningAgent(actions)

    # env.render()  # would display the game grid
    env.render(mode="human")  # console rendering

    # Make a log file over several scenarios
    n_episodes = 1000
    pause = 1e-6  # ~pause between two renders
    returns = []

    for episode in range(n_episodes):
        agent.reset(env.state2str(env.reset()))

        # Display one episode out of 10 (never episode 0).
        env.verbose = (episode > 0 and episode % 10 == 0)
        if env.verbose:
            env.render(pause)

        steps = 0
        rsum = 0
        done = False
        while not done:
            obs, reward, done, _ = env.step(agent.act())
            agent.get_result(env.state2str(obs), reward, done)
            rsum += reward
            steps += 1
            if env.verbose:
                env.render(pause)

        print("Episode : " + str(episode) + " rsum=" + str(rsum) + ", " + str(steps) + " actions")
        returns.append(rsum)

    print("done")
    print("Average rsum : {} +/- {}".format(np.mean(returns), np.std(returns)))
    env.close()
def simulate(side, instance, slip, obfuscate, randomseed, maxLength, gamma, num_episodes):
    """Train a Q-learning agent and return its recent average episode reward.

    An ``Environment`` is built from the first six arguments and a
    ``QLearningAgent`` (learning rate 0.8) is run on it for ``num_episodes``
    episodes. An episode lasts for as long as ``env.step`` keeps reporting
    the event ``'continue'``. Afterwards the slip value and the average are
    printed, and the agent's policy is printed through ``env.printPolicy``.

    Returns:
        The mean reward over the last (up to) 100 episodes, rounded to four
        decimal places.
    """
    env = Environment(side, instance, slip, obfuscate, randomseed, maxLength)
    agent = QLearningAgent(env, gamma, lr=0.8)

    episode_rewards = np.zeros(num_episodes)
    for episode in range(num_episodes):
        collected = 0
        while True:
            # Take action
            state, reward, event = env.step(agent.getAction())
            agent.observe(state, reward, event)
            collected += reward
            if event != 'continue':
                break
        episode_rewards[episode] = collected

    avg = np.mean(episode_rewards[-100:])
    pi = agent.getPi()
    print("Slip: " + str(slip) + " Avg: " + str(avg))
    env.printPolicy(pi)
    return round(avg, 4)
break return total_reward if __name__ == '__main__': env = gym.make("CartPole-v0").env env.reset() n_actions = env.action_space.n print(env.observation_space.high) print(env.observation_space.low) print('CartPole state: %s' % (env.reset())) agent = QLearningAgent(alpha=0.3, epsilon=0.5, discount=1.0, get_legal_actions=lambda s: range(n_actions)) rewards = [] for i in range(2000): rewards.append(play_and_train(env, agent)) agent.epsilon *= 0.999 if i % 10 == 0: print('Iteration {}, Average reward {:.2f}, Epsilon {:.3f}'.format( i, np.mean(rewards), agent.epsilon)) print('Reward of Test agent = %.3f' % play_and_train(env, agent, visualize=True))
from game import Game from agent import Agent from maxAgent import MaxAgent from randomAgent import RandomAgent from qlearning import QLearningAgent qAgent = QLearningAgent(10000) qAgent.train() print('trained on 10000 games') agent_two = MaxAgent() wins = 0 for i in range(5000): game = Game() while not game.over(): if game.turn_player() == 1: game.move(qAgent.move(game)) else: game.move(agent_two.move(game)) if game.score()[0] > game.score()[1]: wins = wins + 1 for i in range(5000): game = Game() while not game.over(): if game.turn_player() == 2: game.move(qAgent.move(game)) else: game.move(agent_two.move(game)) if game.score()[1] > game.score()[0]:
def render(self, mode='human', close=False): print(self.s) def getLegalActions(s): legalActions = [] for action in actions: if (action.pre(s)): legalActions.append(action) return legalActions from qlearning import QLearningAgent agent = QLearningAgent(alpha=0.5, epsilon=0.5, discount=0.99, get_legal_actions=getLegalActions) def play_and_train(env, agent, t_max=10**4): """ This function should - run a full game, actions given by agent's e-greedy policy - train agent using agent.update(...) whenever it is possible - return total reward """ total_reward = 0.0 s = env.reset() for t in range(t_max): # get agent to pick action given state s.
MESSAGE_BASE = 85 # Distance from bottom to message area SNOWMAN_BASE = 140 # Distance from bottom to Snowman base MAX_INCORRECT_GUESSES = 8 # Number of incorrect guesses allowed INCORRECT_COLOR = "#FF9999" # Color used for incorrect guesses CORRECT_COLOR = "#009900" # Color used to mark correct guesses # Fonts WORD_FONT = "bold 36px 'Monaco','Monospaced'" LETTER_FONT = "bold 24px 'Monaco','Monospaced'" MESSAGE_FONT = "30px 'Helvetica Neue','Arial','Sans-Serif'" gw = GWindow(GWINDOW_WIDTH, GWINDOW_HEIGHT) env = BlackjackEnvironment() agent = QLearningAgent(env) def createWindow(): def createCardTotalLabels(): alphabet = ['02','03','04','05','06','07','08','09','10', '11','12','13','14','15','16','17','18','19', '20','21'] alphabetLabels = [GLabel(letter) for letter in alphabet] for label in alphabetLabels: label.setFont(LETTER_FONT) return alphabetLabels def createUpCardLabels():
import matplotlib.pyplot as plt
import seaborn as sns
from environment import Environment
from qlearning import QLearningAgent
from double_qlearning import DoubleQLearningAgent

# Script: run a Q-learning and a double Q-learning agent on the same
# environment and plot the series each one returns from update_policy().

sns.set()

env = Environment()
agent = QLearningAgent(env)
agent2 = DoubleQLearningAgent(env)

# One value per episode, as suggested by the x-axis label below.
left_actions_ratio_a1 = agent.update_policy()
left_actions_ratio_a2 = agent2.update_policy()

n_points_a1 = len(left_actions_ratio_a1)
n_points_a2 = len(left_actions_ratio_a2)

fig, ax = plt.subplots()
ax.plot(range(n_points_a1), left_actions_ratio_a1,
        color="red", label="Q-Learning")
ax.plot(range(n_points_a2), left_actions_ratio_a2,
        color="green", label="Double Q-Learning")
# Dashed horizontal reference at y = 5, as long as the Q-learning curve.
ax.plot(range(n_points_a1), [5] * n_points_a1,
        '--', color='black', label='optimal')
ax.set_xlabel("Number of episodes")
ax.set_ylabel("% of left actions from A")
default=False, action='store_true') parser.add_argument('-v', '--verbose', dest="verbose", action='store_true', default=False) parser.add_argument('-s', '--silent', dest="silent", action='store_true', default=False) args = parser.parse_args() agent = QLearningAgent(TaxiEnv(env), epsilon=args.epsilon, alpha=args.alpha, gamma=args.gamma) total_episodes = args.train_episodes + args.val_episodes + 1 episode_start_val = args.train_episodes + 1 train_timesteps_list = [] val_timesteps_list = [] cmd = None for i_episode in range(1, total_episodes): state = env.reset() done = 0 t = 0 reward = 0 if i_episode >= episode_start_val: