class Agent:

    def __init__(self, observation_space=500, action_space=6, alpha=0.1, gamma=0.9,
                 epsilon=1.0, epsilon_decay=0.9999, epsilon_min=0.01):
        """
        Initialize agent.

        Params
        ======
        - observation_space: number of states in the environment
        - action_space: number of actions available to the agent
        - alpha: learning rate
        - gamma: reward discount factor
        - epsilon, epsilon_decay, epsilon_min: exploration schedule
        """
        self.nA = action_space
        self.possible_actions = np.arange(self.nA)
        self.epsilon_decay = epsilon_decay
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.q_table = QTable(observation_space=observation_space,
                              action_space=action_space, alpha=alpha, gamma=gamma)

    def select_action(self, state):
        """
        Given the state, select an action.

        Params
        ======
        - state: the current state of the environment

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        action_probabilities = self.epsilon_greedy(state)
        return np.random.choice(self.possible_actions, p=action_probabilities)

    def update_epsilon(self):
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

    def epsilon_greedy(self, state):
        # Spread epsilon uniformly over all actions, then give the greedy
        # action the remaining (1 - epsilon) probability mass.
        policy = np.ones(self.nA) * (self.epsilon / self.nA)
        best_action_idx = np.argmax(self.q_table.q(state))
        policy[best_action_idx] = (1 - self.epsilon) + (self.epsilon / self.nA)
        return policy

    def step(self, state, action, reward, next_state, done):
        """
        Update the agent's knowledge, using the most recently sampled tuple.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        """
        self.q_table.sarsa_max_update(state, action, reward, next_state)
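# The Agent above relies on a QTable with a q(state) accessor and a
# sarsa_max_update(...) method, neither of which is shown in this file. Below
# is a minimal sketch of what such a table could look like; the numpy-array
# layout is an assumption, not the original implementation.
import numpy as np

class QTable:
    def __init__(self, observation_space=500, action_space=6, alpha=0.1, gamma=0.9):
        self.alpha = alpha
        self.gamma = gamma
        self.table = np.zeros((observation_space, action_space))

    def q(self, state):
        # Action-value estimates for all actions in the given state
        return self.table[state]

    def sarsa_max_update(self, state, action, reward, next_state):
        # Q-learning ("sarsa-max") update: bootstrap from the greedy next action
        target = reward + self.gamma * np.max(self.table[next_state])
        self.table[state, action] += self.alpha * (target - self.table[state, action])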
class SACQ(SACU):

    def __init__(self, env, qmodel: QNetwork, amodel: PolicyNetwork, tasks,
                 gamma: float = None, num_learn: int = 10,
                 steps_per_episode: int = 1000, scheduler_period: int = 150,
                 num_avg_gradient: int = 10, listeners=None, temperature=1):
        if gamma is None:
            gamma = qmodel.gamma
        super().__init__(env, qmodel, amodel, tasks, gamma, num_learn,
                         steps_per_episode, scheduler_period, num_avg_gradient,
                         listeners)
        self.Q = QTable()
        self.M = defaultdict(lambda: 0)
        self.scheduler = self.Q.derive_policy(BoltzmannPolicy,
                                              lambda x: self.tasks,
                                              temperature=temperature)

    def train_scheduler(self, tau, Tau):
        main_task = self.tasks[0]
        xi = self.scheduler_period
        main_rewards = [r[main_task] for _, _, r, _ in tau]
        for h in range(len(Tau)):
            # Discounted return of the main task from the h-th scheduling point onward
            R = sum(r * self.gamma ** k for k, r in enumerate(main_rewards[h * xi:]))
            self.M[Tau[h]] += 1
            # Monte Carlo average update, disabled in favour of a constant learning rate:
            # self.Q[tuple(Tau[:h]), Tau[h]] += (R - self.Q[tuple(Tau[:h]), Tau[h]]) / self.M[Tau[h]]
            # We used a Q-table with a 0.1 learning rate to update the values in the table.
            # Change 0.1 to the desired learning rate.
            self.Q[tuple(Tau[:h]), Tau[h]] += 0.1 * (R - self.Q[tuple(Tau[:h]), Tau[h]])

    def schedule_task(self, Tau):
        return self.scheduler.sample(tuple(Tau))
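# derive_policy and BoltzmannPolicy come from the surrounding framework and are
# not shown here. As a rough sketch of the assumed contract: the policy keeps a
# reference to the scheduler's Q-table and samples tasks with probabilities
# proportional to exp(Q / temperature). The class below is an illustrative
# approximation, not the framework's implementation.
import numpy as np

class BoltzmannPolicySketch:
    def __init__(self, q_table, actions_fn, temperature=1.0):
        self.q_table = q_table        # scheduler values, indexed by (history, task)
        self.actions_fn = actions_fn  # history -> available tasks
        self.temperature = temperature

    def sample(self, history):
        tasks = self.actions_fn(history)
        q = np.array([self.q_table[history, t] for t in tasks], dtype=float)
        logits = q / self.temperature
        probs = np.exp(logits - logits.max())  # subtract the max for numerical stability
        probs /= probs.sum()
        return tasks[np.random.choice(len(tasks), p=probs)]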
def run_session(problem, param={}):
    '''run a session of qtable'''
    sys_vars = init_sys_vars(problem, param)  # rl system, see util.py
    env = gym.make(sys_vars['GYM_ENV_NAME'])
    env_spec = get_env_spec(env)
    replay_memory = ReplayMemory(env_spec)
    qtable = QTable(env_spec, **param)

    for epi in range(sys_vars['MAX_EPISODES']):
        sys_vars['epi'] = epi
        run_episode(sys_vars, env, qtable, replay_memory)
        # Best so far: increment num epochs every 2, up to a max of 5
        if sys_vars['solved']:
            break

    return sys_vars
class MonteCarlo(Agent):
    """ Monte Carlo Agent implementation """

    def __init__(self, env: FiniteActionEnvironment, gamma: float = 1.0):
        """
        Create a new MonteCarlo Agent
        :param env: The environment the agent will learn from
        :param gamma: Reward discount factor
        """
        super().__init__(env)
        self.q_table = QTable()
        self.visit_count = defaultdict(int)
        self.policy = self.q_table.derive_policy(EpsilonGreedyPolicy,
                                                 env.valid_actions_from,
                                                 epsilon=self.epsilon)
        self.gamma = gamma

    def learn(self, num_iter=100000) -> EpsilonGreedyPolicy:
        """
        Learn a policy from the environment
        :param num_iter: The number of iterations the algorithm should run
        :return: the derived policy
        """
        Q, N, pi = self.q_table, self.visit_count, self.policy
        for _ in range(num_iter):
            s = self.env.reset()
            e, r = [], 0
            while not s.is_terminal():  # Execute an episode
                a = pi.sample(s)
                e += [[s, a]]
                s, r = self.env.step(a)
                e[-1] += [r]
            for i, (s, a, r) in enumerate(reversed(e)):
                # Traverse the episode in reverse so the return G can be computed incrementally
                g = r if i == 0 else g * self.gamma + r
                N[s, a] += 1
                N[s] += 1
                Q[s, a] += (1 / N[s, a]) * (g - Q[s, a])
        return pi

    def epsilon(self, s):
        N_0, N = 100, self.visit_count
        return N_0 / (N_0 + N[s])
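# Note that epsilon is passed to the derived policy as a *callable* (the
# agent's epsilon(s) method), giving the GLIE schedule N_0 / (N_0 + N(s)):
# exploration decays per state as that state is visited more often.
# EpsilonGreedyPolicy itself is not shown in this file; a minimal sketch of the
# assumed sampling behaviour follows (names and structure are assumptions):
import random

class EpsilonGreedyPolicySketch:
    def __init__(self, q_table, valid_actions_from, epsilon):
        self.q_table = q_table
        self.valid_actions_from = valid_actions_from
        self.epsilon = epsilon  # callable: state -> exploration probability

    def sample(self, s):
        actions = self.valid_actions_from(s)
        if random.random() < self.epsilon(s):
            return random.choice(actions)                       # explore
        return max(actions, key=lambda a: self.q_table[s, a])   # exploit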
class ValueIterator:

    def __init__(self, target_position):
        self.target_position = target_position
        self._tran = Transitions()
        self._rewards = Rewarder(target_position)
        self._q_tab = QTable()
        self._v_tab = VTable()

    def update(self, debug=False):
        for s1 in self.all_states():
            for a in range(len(Config.actions)):
                s2 = self._tran.run(s1, a)
                rew = self._rewards[s1, s2]
                if s2:
                    q = rew + Config.gamma * self._v_tab[s2]
                else:
                    q = rew
                self._q_tab[s1, a] = q
                if debug:
                    pprint_transition(s1, a, s2, rew)
        self._v_tab.update_from_q_table(self._q_tab)

    # noinspection PyMethodMayBeStatic
    def all_states(self):
        for i in range(len(Config.letters)):
            for j in range(len(Config.numbers)):
                if (i, j) == self.target_position:
                    continue
                for o in range(len(Config.orientations)):
                    yield i, j, o

    def path(self, s0):
        a, _ = self._q_tab.get_best_action(s0)
        s1 = self._tran.run(s0, a)
        if not s1:
            raise ValueError("Transition into a forbidden state: "
                             + state_to_str(s0) + " -" + action_to_str(a) + "-> None")
        elif (s1[0], s1[1]) == self.target_position:
            return [s0, a, s1]
        return [s0, a] + self.path(s1)
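# VTable.update_from_q_table is not shown above. For value iteration it
# presumably sets V(s) = max_a Q(s, a) over every state in the Q-table; a
# minimal sketch under that assumption (including the assumption that the
# Q-table iterates as (state, action) -> value pairs):
class VTableSketch(dict):
    def __missing__(self, key):
        return 0.0  # unvisited states default to zero value

    def update_from_q_table(self, q_tab):
        # V(s) = max over actions a of Q(s, a)
        best = {}
        for (s, a), q in q_tab.items():
            best[s] = max(best.get(s, float('-inf')), q)
        self.update(best)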
        return 0

    def is_game_over(self, board):
        winner = self.get_who_wins(board)
        if winner == 0:
            # A draw: all nine cells have been played and nobody won
            return np.sum(np.abs(board)) == 9
        return True


from q_table import QTable

table = QTable()
game = SelfPlay()
game.verbose = False
game.epsilon = 0.8
for k in range(50000):
    game.play_game(table)
    print(k)
table.save_q_table_to_file('q_table.npy')

game.verbose = True
game.epsilon = 1
# table.load_q_table_from_file('q_table.npy')
game.play_game(table)

board = np.array(
    [[-1, 1, 0],
from appJar import gui
from random import randint
from tictactoe import TTT
from q_table import QTable
import numpy as np

app = gui("TTT", "400x400")
app.setSticky("news")
app.setStretch("both")
app.setFont(40)

game_ttt = TTT()
q_table = QTable()
q_table.load_q_table_from_file("q_table.npy")


def refresh_buttons():
    for x in range(0, 3):
        for y in range(0, 3):
            title = 3 * x + y
            if game_ttt.board[x][y] == 0:
                app.setButton(title, "")
            elif game_ttt.board[x][y] == 1:
                app.setButton(title, "X")
            elif game_ttt.board[x][y] == -1:
                app.setButton(title, "O")


def restart():
    game_ttt.restart()
    refresh_buttons()
            break
        step += 1
    print('over')
    env.destroy()


def q_learning_run():
    env.after(100, q_learning_update)  # after() expects a callable, not the result of a call
    env.mainloop()


def nn_run():
    env1.after(100, nn_update)
    env1.mainloop()


if __name__ == '__main__':
    env = Maze()
    env1 = Maze_dqn()
    table = QTable(actions=list(range(env.n_actions)))
    nn = DQN(env1.n_actions, env1.n_features, alpha=0.01, gamma=0.9, epsilon=0.9,
             replace_target_iter=200, memory_size=2000, epsilon_increment=0.1)
    nn_run()
    # q_learning_run()
EPSILON_DECAY = 25 * EPSILON_MIN / max_num_steps
train_params = TrainingParameters(MAX_NUM_EPISODES, STEPS_PER_EPISODE)
learn_params = LearningParameters(ALPHA, GAMMA)
agent_params = AgentParameters(EPSILON_MIN, EPSILON_DECAY, 1)

# %% [markdown]
# ### Agent

# %%
from agent import Agent
from q_table import QTable
import numpy as np

agent = Agent(agent_params, env.action_space.n)
q_table = QTable(env.observation_space.n, env.action_space.n, learn_params)
agent.set_q_table(q_table)

# %% [markdown]
# ### Training and test functions

# %%
def train(agent: Agent, env, params: TrainingParameters):
    best_reward = -float('inf')
    for episode in range(MAX_NUM_EPISODES):
        obs = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            action = agent.get_action(obs)
            next_obs, reward, done, info = env.step(action)
# Testing settings
flags.DEFINE_boolean('run_test', True, 'If the final model should be tested.')
flags.DEFINE_integer('test_runs', 100, 'Number of times to run the test.')
flags.DEFINE_float('test_epsilon', 0.1, 'Epsilon to use on test run.')
flags.DEFINE_integer(
    'test_step_limit', 1000,
    'Limits the number of steps in test to avoid badly performing agents running forever.')

settings = flags.FLAGS

# Set up GridWorld
env = GridWorld(settings.field_size, settings.random_seed)

# Set up Q-table
q_table = QTable(settings.field_size, settings.random_seed)

sess = tf.InteractiveSession()
np.random.seed(settings.random_seed)

summary_dir = '../../logs/q-gridworld-fieldsize{}-episodes{}-lr{}/'.format(
    settings.field_size, settings.episodes, settings.learning_rate)
summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)
stats = Stats(sess, summary_writer, 3)

episode = 0
epsilon = settings.initial_epsilon

while settings.episodes > episode:
    # Prepare environment for playing
    env.reset()
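# This QTable is built from the grid size and a random seed, but its
# implementation is not shown. A plausible minimal layout is one action-value
# vector per grid cell, randomly initialised for tie-breaking; the shape, the
# four-action assumption, and the method name below are all guesses:
import numpy as np

class GridQTableSketch:
    def __init__(self, field_size, random_seed, n_actions=4):
        rng = np.random.RandomState(random_seed)
        # One action-value vector per (row, col) cell of the grid
        self.table = rng.uniform(0, 1e-3, size=(field_size, field_size, n_actions))

    def best_action(self, row, col):
        return int(np.argmax(self.table[row, col]))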
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.q_table = QTable(alpha=0.1, gamma=0.1)
        self.q_table_updater = QTableUpdater(self.q_table)
        self.total_actions = 0.0
        self.total_rewards = 0.0
        # self.last_occurence_of_punishment = 0.0

    def set_q_table(self, alpha=0.0, gamma=0.0):
        self.q_table = QTable(alpha=alpha, gamma=gamma)
        self.q_table_updater = QTableUpdater(self.q_table)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = 'light: {}, left: {}, oncoming: {}, next_waypoint: {}'.format(
            inputs['light'], inputs['left'], inputs['oncoming'], self.next_waypoint)

        # Select action according to your policy
        action = self.q_table.best_action(light=inputs['light'],
                                          next_waypoint=self.next_waypoint,
                                          left=inputs['left'],
                                          oncoming=inputs['oncoming'])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.q_table_updater.update(light=inputs['light'],
                                    next_waypoint=self.next_waypoint,
                                    left=inputs['left'],
                                    oncoming=inputs['oncoming'],
                                    action=action, reward=reward)

        self.total_rewards += reward
        self.total_actions += 1.0

        print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}, next_waypoint = {}".format(
            deadline, inputs, action, reward, self.next_waypoint))  # [debug]

    def __init_q_table(self):
        self.q_table = {}

    def __positions(self):
        positions_list = []
        for i in range(6):
            for j in range(8):
                positions_list.append((i + 1, j + 1))
        return positions_list
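# QTable.best_action and QTableUpdater.update are called with named state
# features (light, next_waypoint, left, oncoming) but are defined elsewhere.
# Below is a minimal sketch of the assumed pair, keyed on the sorted feature
# tuple; the key scheme and the reward-only update rule are assumptions, while
# the action set matches the smartcab world:
from collections import defaultdict

ACTIONS = [None, 'forward', 'left', 'right']

class QTableSketch:
    def __init__(self, alpha=0.1, gamma=0.1):
        self.alpha = alpha
        self.gamma = gamma
        self.values = defaultdict(float)  # (state_key, action) -> Q-value

    @staticmethod
    def key(**state):
        return tuple(sorted(state.items()))

    def best_action(self, **state):
        k = self.key(**state)
        return max(ACTIONS, key=lambda a: self.values[(k, a)])

class QTableUpdaterSketch:
    def __init__(self, q_table):
        self.q_table = q_table

    def update(self, action=None, reward=0.0, **state):
        # One-step update toward the immediate reward; no next-state bootstrap,
        # since the caller does not pass the successor state
        k = self.q_table.key(**state)
        q = self.q_table.values[(k, action)]
        self.q_table.values[(k, action)] += self.q_table.alpha * (reward - q)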
from q_state import next_state, random_state, actions
from q_learning import QLearning
from q_table import QTable

if __name__ == "__main__":
    episode = 100
    model_save_interval = 10
    table = QTable(actions)
    learning = QLearning(table)
    for step in range(episode):
        init_state = random_state()
        i = 0
        reward = 0
        while reward != 1:
            state = init_state
            while True:
                i += 1
                action = learning.choose_action(state)
                state2, reward, done = next_state(state, action, table)
                learning.learn(state, action, reward, state2, done)
                if done:
                    break
                state = state2
        print(init_state, i, len(table.q_table))
        if (step + 1) % model_save_interval == 0:
            table.save()
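# QLearning.choose_action and QLearning.learn are imported from q_learning and
# not shown above. A minimal sketch of what they might do, assuming the table
# exposes an actions list and a q_table dict mapping each state to per-action
# values (consistent with len(table.q_table) in the loop above):
import random

class QLearningSketch:
    def __init__(self, table, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.table = table
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon

    def choose_action(self, state):
        # Epsilon-greedy over the table's action values for this state
        if random.random() < self.epsilon:
            return random.choice(self.table.actions)
        values = self.table.q_table.setdefault(state, {a: 0.0 for a in self.table.actions})
        return max(values, key=values.get)

    def learn(self, state, action, reward, state2, done):
        values = self.table.q_table.setdefault(state, {a: 0.0 for a in self.table.actions})
        next_values = self.table.q_table.setdefault(state2, {a: 0.0 for a in self.table.actions})
        # Terminal transitions bootstrap from zero
        target = reward if done else reward + self.gamma * max(next_values.values())
        values[action] += self.alpha * (target - values[action])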