import random

import numpy as np
import tensorflow as tf

from replay_buffer import ReplayBuffer  # project-local module; exact import path assumed


class DQNAgent:

    def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64, epsilon=0.05,
                 epsilon_decay=1, epsilon_min=0.05, tau=1, game='cartpole',
                 exploration="epsilon_greedy", history_length=0):  # , load_data=False
        """
        Q-Learning agent for off-policy TD control using Function Approximation.
        Finds the optimal greedy policy while following an epsilon-greedy policy.

        Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target
        self.game = game
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.tau = tau
        self.epsilon_min = epsilon_min
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor
        self.exploration = exploration

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """
        # TODO:
        # 1. add current transition to replay buffer
        # 2. sample next batch and perform batch update:
        #    2.1 compute td targets:
        #        td_target = reward + discount * max_a Q_target(next_state_batch, a)
        #    2.2 update the Q network
        #        self.Q.update(...)
        #    2.3 call soft update for target network
        #        self.Q_target.update(...)
        '''
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)
        states, actions, next_states, rewards, dones = self.replay_buffer.next_batch(self.batch_size)

        target_f = np.zeros((self.batch_size))
        for i in range(self.batch_size):
            if dones[i]:
                target_f[i] = rewards[i]
            else:
                target_f[i] = rewards[i] + self.discount_factor * np.max(
                    self.Q_target.predict(self.sess, [next_states[i]]), 1)

        loss = self.Q.update(self.sess, states, actions, target_f)
        self.Q_target.update(self.sess)
        '''
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)
        batch_state, batch_action, batch_next_state, batch_rewards, batch_done = \
            self.replay_buffer.next_batch(self.batch_size)

        # double-Q style target: select the action with the online network,
        # evaluate it with the target network (only for non-terminal transitions)
        td_target = batch_rewards
        best_action = np.argmax(
            self.Q.predict(self.sess, batch_next_state)[np.logical_not(batch_done)], 1)
        td_target[np.logical_not(batch_done)] += self.discount_factor * self.Q_target.predict(
            self.sess, batch_next_state)[np.logical_not(batch_done), best_action]

        loss = self.Q.update(self.sess, batch_state, batch_action, td_target)
        self.Q_target.update(self.sess)

        return loss

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and
        epsilon (probability to select a random action).

        Args:
            state: current state input
            deterministic: if True, the agent should execute the argmax action
                (False in training, True in evaluation)
        Returns:
            action id
        """
        '''
        r = np.random.uniform()
        if deterministic or r > self.epsilon:
            # TODO: take greedy action (argmax)
            # action_id = ...
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
        else:
            if self.game == 'cartpole':
                action_id = random.randrange(self.num_actions)
            elif self.game == 'carracing':
                # TODO: sample random action
                # Hint for the exploration in CarRacing: sampling the action from a uniform
                # distribution will probably not work. You can sample the agent's actions with
                # different probabilities (need to sum up to 1) so that the agent will prefer
                # to accelerate or go straight. To see how the agent explores, turn the
                # rendering on in the training and look at what the agent is doing.
                # action_id = ...
                probabilities = [0.1, 0.2, 0.2, 0.45, 0.05]
                action_id = np.random.choice(self.num_actions, p=probabilities)
        '''
        if deterministic:
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
        else:
            if self.exploration == "epsilon_greedy":
                if self.epsilon > self.epsilon_min:
                    self.epsilon *= self.epsilon_decay
                r = np.random.uniform()
                if r > self.epsilon:
                    # take greedy action (argmax)
                    action_id = np.argmax(self.Q.predict(self.sess, [state]))
                else:
                    # sample a random action
                    # Hint for the exploration in CarRacing: sampling the action from a uniform
                    # distribution will probably not work. You can sample the agent's actions with
                    # different probabilities (need to sum up to 1) so that the agent will prefer
                    # to accelerate or go straight. To see how the agent explores, turn the
                    # rendering on in the training and look at what the agent is doing.
                    if self.game == "cartpole":
                        action_id = np.random.randint(self.num_actions)
                    elif self.game == "carracing":
                        probabilities = [0.15, 0.15, 0.15, 0.3, 0.05, 0.1, 0.1]
                        action_id = np.random.choice(self.num_actions, p=probabilities)
                    else:
                        raise ValueError("Invalid game")
            elif self.exploration == "boltzmann":
                action_value = self.Q.predict(self.sess, [state])[0]
                prob = self.softmax(action_value / self.tau)
                action_id = np.random.choice(self.num_actions, p=prob)
            else:
                raise ValueError("Invalid exploration type")

        return action_id

    def softmax(self, input):
        """
        Safe softmax function to avoid overflow.

        Args:
            input: input vector
        Returns:
            prob: softmax of input
        """
        input_max = np.max(input)
        e = np.exp(input - input_max)
        prob = e / np.sum(e)
        return prob

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)

    def check_early_stop(self, reward, totalreward):
        return self.Q_target.check_early_stop(reward, totalreward)

class DQNAgent:

    def __init__(self, Q, Q_target, num_actions, game, exploration, discount_factor=0.99,
                 batch_size=64, epsilon=0.2, epsilon_decay=0.99, epsilon_min=0.03):
        """
        Q-Learning agent for off-policy TD control using Function Approximation.
        Finds the optimal greedy policy while following an epsilon-greedy policy.

        Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor
        self.exploration = exploration
        self.game = game

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, done):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """
        # TODO:
        # 1. add current transition to replay buffer
        # 2. sample next batch and perform batch update:
        #    2.1 compute td targets:
        #        td_target = reward + discount * max_a Q_target(next_state_batch, a)
        #    2.2 update the Q network
        #        self.Q.update(...)
        #    2.3 call soft update for target network
        #        self.Q_target.update(...)
        self.replay_buffer.add_transition(state, action, next_state, reward, done)
        batch_state, batch_action, batch_next_state, batch_rewards, batch_done = \
            self.replay_buffer.next_batch(self.batch_size)

        td_target = batch_rewards
        # td_target += self.discount_factor * np.amax(self.Q_target.predict(self.sess, batch_next_state))
        # use this or think of something better
        best_action = np.argmax(
            self.Q.predict(self.sess, batch_next_state)[np.logical_not(batch_done)], 1)
        td_target[np.logical_not(batch_done)] += self.discount_factor * self.Q_target.predict(
            self.sess, batch_next_state)[np.logical_not(batch_done), best_action]

        self.Q.update(self.sess, batch_state, batch_action, td_target)
        self.Q_target.update(self.sess)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and
        epsilon (probability to select a random action).

        Args:
            state: current state input
            deterministic: if True, the agent should execute the argmax action
                (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()
        if deterministic:
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
        else:
            if self.exploration == "greedy":
                if self.epsilon > self.epsilon_min:
                    self.epsilon *= self.epsilon_decay
                r = np.random.uniform()
                if r > self.epsilon:
                    # take greedy action (argmax)
                    action_id = np.argmax(self.Q.predict(self.sess, [state]))
                else:
                    # sample a random action
                    # Hint for the exploration in CarRacing: sampling the action from a uniform
                    # distribution will probably not work. You can sample the agent's actions with
                    # different probabilities (need to sum up to 1) so that the agent will prefer
                    # to accelerate or go straight. To see how the agent explores, turn the
                    # rendering on in the training and look at what the agent is doing.
                    # action_id = ...
                    if self.game == "cartpole":
                        action_id = np.random.randint(self.num_actions)
                    # elif self.game == "CarRacing":
                    #     action_id = ...
                    else:
                        raise ValueError('Please enter a valid game.')
            # elif self.exploration == "boltzmann":
            #     ...
            # else:
            #     ...
        return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)

class Agent:

    def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64,
                 epsilon=0.95, epsilon_min=0.05, epsilon_decay=0.995,
                 exploration_type='e-annealing', learning_type='dq', replay_buffer_size=1e5):
        self.Q = Q
        self.Q_target = Q_target
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.exploration_type = exploration_type
        self.learning_type = learning_type
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(replay_buffer_size)

        # start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    # add transition to the replay buffer
    def add(self, state, action, next_state, reward, terminal):
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)

    # train network
    def train(self):
        # sample batch from the replay buffer
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \
            self.replay_buffer.next_batch(self.batch_size)

        # compute td targets using q- or double q-learning
        if self.learning_type == 'q':
            # q-learning: max over the target network
            batch_rewards[np.logical_not(batch_dones)] += self.discount_factor * np.max(
                self.Q_target.predict(self.sess, batch_next_states),
                axis=1)[np.logical_not(batch_dones)]
        else:
            # double q-learning: select actions with Q, evaluate them with Q_target
            q_actions = np.argmax(self.Q.predict(self.sess, batch_next_states), axis=1)
            batch_rewards[np.logical_not(batch_dones)] += self.discount_factor * self.Q_target.predict(
                self.sess, batch_next_states)[np.arange(self.batch_size),
                                              q_actions][np.logical_not(batch_dones)]

        # update network and target network
        loss = self.Q.update(self.sess, batch_states, batch_actions, batch_rewards)
        self.Q_target.update(self.sess)

        return loss

    # get action for state
    def act(self, state, deterministic):
        r = np.random.uniform()
        if deterministic or (self.exploration_type != 'boltzmann' and r > self.epsilon):
            # take greedy action (argmax)
            a_pred = self.Q.predict(self.sess, [state])
            action_id = np.argmax(a_pred)
        else:
            if self.exploration_type == 'boltzmann':
                actions = self.Q.predict(self.sess, [state])[0]
                # softmax calculation, subtracting max for stability; epsilon acts as temperature
                actions = np.exp((actions - max(actions)) / self.epsilon)
                actions /= np.sum(actions)
                # select an action index following these probabilities
                action_id = np.random.choice(self.num_actions, p=actions)
            else:
                # sample random action
                action_id = np.random.randint(0, self.num_actions)
        return action_id

    # anneal epsilon
    def anneal(self, e=0):
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)  # linear
        #self.epsilon = max(self.epsilon_min, self.epsilon * np.exp(-(1 - self.epsilon_decay) * e))

    # load trained network
    def load(self, folder):
        self.saver.restore(self.sess, tf.train.latest_checkpoint(folder))

class DQNAgent:

    def __init__(self, Q, Q_target, num_actions, discount_factor=0.995, batch_size=64, epsilon=0.05):
        """
        Q-Learning agent for off-policy TD control using Function Approximation.
        Finds the optimal greedy policy while following an epsilon-greedy policy.

        Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target
        self.epsilon = epsilon
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.99
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor
        self.neg_reward_counter = 0
        self.max_neg_rewards = 100

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """
        # 1. add current transition to replay buffer
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)

        # 2. sample next batch and perform batch update:
        # self.gas_actions = np.array([a == 3 for a in self.replay_buffer._data.actions])
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \
            self.replay_buffer.next_batch(self.batch_size)

        # TD target: reward plus discounted max Q_target value for non-terminal transitions
        td_target = batch_rewards
        td_target[np.logical_not(batch_dones)] += self.discount_factor * np.amax(
            self.Q_target.predict(self.sess, batch_next_states), 1)[np.logical_not(batch_dones)]
        # print(batch_actions)

        loss = self.Q.update(self.sess, batch_states, batch_actions, td_target)
        self.Q_target.update(self.sess)

        # if self.epsilon > self.epsilon_min:
        #     self.epsilon *= self.epsilon_decay
        # print(self.epsilon)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and
        epsilon (probability to select a random action).

        Args:
            state: current state input
            deterministic: if True, the agent should execute the argmax action
                (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()
        if deterministic or r > self.epsilon:
            # take greedy action (argmax)
            act_values = self.Q.predict(self.sess, [state])
            action_id = np.argmax(act_values)
            # print("action_id_predicted: ", action_id)
            return action_id
        else:
            # sample a random action, biased towards going straight and accelerating
            # (straight, left, right, accelerate, brake)
            action_id = np.random.choice([0, 1, 2, 3, 4], p=[0.3, 0.1, 0.1, 0.49, 0.01])
            # Hint for the exploration in CarRacing: sampling the action from a uniform
            # distribution will probably not work. You can sample the agent's actions with
            # different probabilities (need to sum up to 1) so that the agent will prefer
            # to accelerate or go straight. To see how the agent explores, turn the
            # rendering on in the training and look at what the agent is doing.
            # print("action_id_random: ", action_id)
            return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)

class DQNAgent:

    def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64, epsilon=0.05,
                 act_probabilities=None, double_q=False, buffer_capacity=100000,
                 prefill_bs_percentage=5):
        """
        Q-Learning agent for off-policy TD control using Function Approximation.
        Finds the optimal greedy policy while following an epsilon-greedy policy.

        Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target
        self.epsilon = epsilon
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # define replay buffer
        self.replay_buffer = ReplayBuffer(capacity=buffer_capacity,
                                          min_fill=prefill_bs_percentage * batch_size)

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

        # <JAB>
        if act_probabilities is None:
            self.act_probabilities = np.ones(num_actions) / num_actions
        else:
            self.act_probabilities = act_probabilities

        self.double_dqn = double_q

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """
        # TODO:
        # 1. add current transition to replay buffer
        # 2. sample next batch and perform batch update:
        #    2.1 compute td targets:
        #        td_target = reward + discount * max_a Q_target(next_state_batch, a)
        #    2.2 update the Q network
        #        self.Q.update(...)
        #    2.3 call soft update for target network
        #        self.Q_target.update(...)

        # <JAB>
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)

        # Let the buffer fill up, otherwise we will burn up a lot of $#!+¥ states early on
        if self.replay_buffer.has_min_items():
            buffer = self.replay_buffer.next_batch(self.batch_size)
            batch_states = buffer[0]
            batch_actions = buffer[1]
            batch_next_states = buffer[2]
            batch_rewards = buffer[3]
            batch_dones = buffer[4]

            non_terminal_states = np.logical_not(batch_dones)

            if self.double_dqn:
                # select actions with the online network, evaluate them with the target network
                a_predictions = self.Q.predict(self.sess, batch_next_states)
                a_predictions = np.argmax(a_predictions, axis=1)
                action_indexes = (np.arange(len(a_predictions)), a_predictions)
                q_predictions = self.Q_target.predict(self.sess, batch_next_states)
                q_predictions = q_predictions[action_indexes]
            else:
                q_predictions = self.Q_target.predict(self.sess, batch_next_states)
                q_predictions = np.max(q_predictions, axis=1)

            td_target = batch_rewards
            # If the episode is not finished, add predicted Q values to the current rewards
            td_target[non_terminal_states] += self.discount_factor * q_predictions[non_terminal_states]

            # Update step
            self.Q.update(self.sess, batch_states, batch_actions, td_target)
            self.Q_target.update(self.sess)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and
        epsilon (probability to select a random action).

        Args:
            state: current state input
            deterministic: if True, the agent should execute the argmax action
                (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()
        if deterministic or r > self.epsilon:
            # <JAB>
            action_id = np.argmax(self.Q.predict(self.sess, state))
            # </JAB>
        else:
            # Hint for the exploration in CarRacing: sampling the action from a uniform
            # distribution will probably not work. You can sample the agent's actions with
            # different probabilities (need to sum up to 1) so that the agent will prefer
            # to accelerate or go straight. To see how the agent explores, turn the
            # rendering on in the training and look at what the agent is doing.
            # action_id = ...
            # <JAB>
            action_id = np.random.choice(np.arange(self.num_actions), p=self.act_probabilities)
            # </JAB>
        return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)

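# NOTE: a small, self-contained numpy illustration of what the double_q flag in the class
# above changes in the target computation. q_online and q_target are made-up stand-ins for
# Q.predict and Q_target.predict on a batch of three next-states with two actions each;
# none of these numbers come from the original code.
import numpy as np

def _td_target_demo(gamma=0.99):
    q_online = np.array([[1.0, 2.0], [0.5, 0.1], [3.0, 2.5]])
    q_target = np.array([[1.5, 1.0], [0.2, 0.4], [2.0, 4.0]])
    rewards = np.array([0.0, 1.0, 1.0])
    dones = np.array([False, False, True])

    # vanilla DQN target: max over the target network
    vanilla = rewards.copy()
    vanilla[~dones] += gamma * q_target.max(axis=1)[~dones]

    # double DQN target: argmax from the online network, value from the target network
    best = q_online.argmax(axis=1)
    double = rewards.copy()
    double[~dones] += gamma * q_target[np.arange(len(best)), best][~dones]

    return vanilla, double  # [1.485, 1.396, 1.0] and [0.99, 1.198, 1.0]
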
class DQNAgent:

    def __init__(self, name, Q_current, Q_target, num_actions, discount_factor, batch_size,
                 epsilon, epsilon_decay, boltzmann, double_q, buffer_capacity, random_probs=None):
        """
        Q-Learning agent for off-policy TD control using Function Approximation.
        Finds the optimal greedy policy while following an epsilon-greedy policy.

        Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        # save hyperparameters in folder
        self.name = name  # probably useless
        self.Q_current = Q_current
        self.Q_target = Q_target
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.boltzmann = boltzmann
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor
        self.buffer_capacity = buffer_capacity
        self.double_q = double_q
        self.random_probs = random_probs

        # define replay buffer
        self.replay_buffer = ReplayBuffer(capacity=buffer_capacity)

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """
        # 1. add current transition to replay buffer
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)

        # 2. sample next batch
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \
            self.replay_buffer.next_batch(self.batch_size)

        # find optimal actions for the sampled s' states
        if self.double_q:
            # double Q learning (select actions using current network, rather than target network)
            # ...in order to decorrelate noise between selection and evaluation
            # (Q(state, action) is still evaluated using the target network in any case)
            action_selector = self.Q_current
        else:
            action_selector = self.Q_target

        # as usual, the Q network returns a vector of predicted values for every possible action
        a_prime = np.argmax(action_selector.predict(self.sess, batch_next_states), axis=1)

        # pick the a'-th value from each row of the target-network prediction
        # note: this will include action predictions for "done" states, but we'll kill them later
        q_values_next = self.Q_target.predict(
            self.sess, batch_next_states)[np.arange(self.batch_size), a_prime]

        # 2.1 compute td targets:
        # if done, there will be no next state
        td_targets = batch_rewards + np.where(batch_dones, 0,
                                              self.discount_factor * q_values_next)

        # 2.2 update the Q (current) network
        self.Q_current.update(self.sess, batch_states, batch_actions, td_targets)

        # 2.3 call soft update for target network
        # this is done by the dodgy associate_method therein
        self.Q_target.update(self.sess)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and
        epsilon (probability to select a random action).

        Args:
            state: current state input
            deterministic: if True, the agent should execute the argmax action
                (False in training, True in evaluation)
        Returns:
            action id
        """
        # get action values from current network
        Q_values = np.squeeze(self.Q_current.predict(self.sess, np.expand_dims(state, axis=0)))
        argmax_a = np.argmax(Q_values)

        if deterministic:
            # take greedy action
            return argmax_a

        if self.boltzmann:
            # implementing an interaction here between boltzmann exploration and epsilon:
            # viz. that epsilon controls the temperature of the softmax function
            # so that as before, higher eps -> higher exploration
            # (softmax is a helper with a temperature argument, defined elsewhere in this project)
            action_probs = softmax(Q_values, temperature=1 / (1 - self.epsilon)**2)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        else:
            if np.random.uniform() > self.epsilon:
                # choose the best action
                action = argmax_a
            else:
                # explore
                if self.random_probs is None:
                    action = np.random.randint(self.num_actions, size=1)[0]
                else:
                    action = np.random.choice(np.arange(self.num_actions), p=self.random_probs)

        # we decay epsilon AFTER we've checked it
        # (nb: if deterministic, epsilon will never decay, but of course this doesn't matter)
        if self.epsilon_decay > 0:
            self.epsilon *= (1 - self.epsilon_decay)

        return action

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)

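# NOTE: the Boltzmann branch above calls a softmax(Q_values, temperature=...) helper that is
# defined elsewhere in that project and is not shown here. The function below is only a minimal
# stand-in with the usual max-subtraction for numerical stability; its signature is inferred
# from the call site, so treat it as an assumption rather than the project's actual helper.
import numpy as np

def softmax(x, temperature=1.0):
    """Temperature-scaled softmax; higher temperature -> closer to a uniform distribution."""
    z = np.asarray(x, dtype=np.float64) / temperature
    z -= np.max(z)  # shift by the max to avoid overflow in exp
    e = np.exp(z)
    return e / np.sum(e)
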
class DQNAgent:

    def __init__(self, Q, Q_target, num_actions, discount_factor=0.95, batch_size=64, epsilon=1):
        """
        Q-Learning agent for off-policy TD control using Function Approximation.
        Finds the optimal greedy policy while following an epsilon-greedy policy.

        Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target
        self.epsilon = epsilon
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """
        # 1. add current transition to replay buffer
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)

        # 2. sample next batch and perform batch update:
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \
            self.replay_buffer.next_batch(self.batch_size)

        # per-sample update: compute the TD target for each transition individually
        for i in range(self.batch_size):
            # print("next state: ", batch_next_states[i])
            td_target = batch_rewards[i]
            if not batch_dones[i]:
                td_target = batch_rewards[i] + self.discount_factor * np.amax(
                    self.Q_target.predict(self.sess, [batch_next_states[i]]))

            target_f = self.Q_target.predict(self.sess, [batch_states[i]])
            target_f[0][batch_actions[i]] = td_target

            loss = self.Q.update(self.sess, [batch_states[i]], [batch_actions[i]], target_f[0])

        self.Q_target.update(self.sess)
        # print("loss:", loss)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        # print("epsilon: ", self.epsilon)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and
        epsilon (probability to select a random action).

        Args:
            state: current state input
            deterministic: if True, the agent should execute the argmax action
                (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()
        if deterministic or r > self.epsilon:
            # take greedy action (argmax)
            # state = np.reshape(state, (1, 4))
            act_values = self.Q.predict(self.sess, [state])  # it was Q_target before
            # act_values looks like [[0.05641035 0.06138265]], so use act_values[0]
            action_id = np.argmax(act_values[0])
            # print("predicted action. deterministic: {}. epsilon cond: {}. action_id: {}."
            #       .format(deterministic, (r > self.epsilon), action_id))
        else:
            action_id = random.randrange(self.num_actions)
            # print("random action. deterministic: {}. epsilon cond.: {}. action_id: {}."
            #       .format(deterministic, (r > self.epsilon), action_id))
            # Hint for the exploration in CarRacing: sampling the action from a uniform
            # distribution will probably not work. You can sample the agent's actions with
            # different probabilities (need to sum up to 1) so that the agent will prefer
            # to accelerate or go straight. To see how the agent explores, turn the
            # rendering on in the training and look at what the agent is doing.
        # print("action_id: ", action_id)
        return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)

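# NOTE: the train() method above computes the TD target one transition at a time, while the
# batched implementations elsewhere in this section do the same thing with array operations.
# The function below is a small, self-contained numpy sketch of that equivalence for the
# TD-target part of the loop; q_next holds made-up target-network outputs for a batch of
# four next-states with two actions each.
import numpy as np

def _loop_vs_vectorised_targets(gamma=0.95):
    q_next = np.array([[0.1, 0.3], [0.7, 0.2], [0.0, 0.0], [0.4, 0.9]])
    rewards = np.array([1.0, 1.0, 1.0, 0.0])
    dones = np.array([False, False, True, False])

    # per-sample loop, as in train() above
    loop_targets = np.empty_like(rewards)
    for i in range(len(rewards)):
        loop_targets[i] = rewards[i]
        if not dones[i]:
            loop_targets[i] += gamma * np.amax(q_next[i])

    # vectorised equivalent: terminal transitions keep the plain reward
    vec_targets = rewards + gamma * q_next.max(axis=1) * ~dones

    return np.allclose(loop_targets, vec_targets)  # True
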
class DQNAgent:

    def __init__(self, Q, Q_target, num_actions, game="cartpole", explore_type="epsilon_greedy",
                 epsilon_decay=1, epsilon_min=0.05, tau=1, method="CQL",
                 discount_factor=0.99, batch_size=64, epsilon=0.05):
        """
        Q-Learning agent for off-policy TD control using Function Approximation.
        Finds the optimal greedy policy while following an epsilon-greedy policy.

        Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target
        # supports two games: cartpole or carracing
        self.game = game
        # self.state_dim = Q.
        self.epsilon = epsilon
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor
        # supports CQL (classical Q-learning) or DQL (double Q-learning)
        self.method = method
        self.explore_type = explore_type
        # for epsilon annealing
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        # for boltzmann exploration
        self.tau = tau

        # define replay buffer
        self.replay_buffer = ReplayBuffer()

        # start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """
        # TODO:
        # 1. add current transition to replay buffer
        # 2. sample next batch and perform batch update:
        #    2.1 compute td targets:
        #        td_target = reward + discount * max_a Q_target(next_state_batch, a)
        #    2.2 update the Q network
        #        self.Q.update(...)
        #    2.3 call soft update for target network
        #        self.Q_target.update(...)
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)
        batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \
            self.replay_buffer.next_batch(self.batch_size)

        td_target = batch_rewards
        if self.method == "CQL":
            # classical Q-learning: max over the target network
            td_target[np.logical_not(batch_dones)] += self.discount_factor * np.max(
                self.Q_target.predict(self.sess, batch_next_states),
                1)[np.logical_not(batch_dones)]
            self.Q.update(self.sess, batch_states, batch_actions, td_target)
            self.Q_target.update(self.sess)
        elif self.method == "DQL":
            # double Q-learning: select actions with Q, evaluate them with Q_target
            best_action = np.argmax(
                self.Q.predict(self.sess, batch_next_states)[np.logical_not(batch_dones)], 1)
            td_target[np.logical_not(batch_dones)] += self.discount_factor * self.Q_target.predict(
                self.sess, batch_next_states)[np.logical_not(batch_dones), best_action]
            self.Q.update(self.sess, batch_states, batch_actions, td_target)
            self.Q_target.update(self.sess)

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and
        epsilon (probability to select a random action).

        Args:
            state: current state input
            deterministic: if True, the agent should execute the argmax action
                (False in training, True in evaluation)
        Returns:
            action id
        """
        if deterministic:
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
        else:
            if self.explore_type == "epsilon_greedy":
                if self.epsilon > self.epsilon_min:
                    self.epsilon *= self.epsilon_decay
                r = np.random.uniform()
                if r > self.epsilon:
                    # take greedy action (argmax)
                    action_id = np.argmax(self.Q.predict(self.sess, [state]))
                else:
                    # sample a random action
                    # Hint for the exploration in CarRacing: sampling the action from a uniform
                    # distribution will probably not work. You can sample the agent's actions with
                    # different probabilities (need to sum up to 1) so that the agent will prefer
                    # to accelerate or go straight. To see how the agent explores, turn the
                    # rendering on in the training and look at what the agent is doing.
                    if self.game == "cartpole" or self.game == "mountaincar":
                        action_id = np.random.randint(self.num_actions)
                    elif self.game == "carracing":
                        # action_probability = np.array([1, 2, 2, 10, 1, 1, 1])
                        action_probability = np.array([2, 5, 5, 10, 1])
                        action_probability = action_probability / np.sum(action_probability)
                        action_id = np.random.choice(self.num_actions, p=action_probability)
                    else:
                        raise ValueError("Invalid game")
            elif self.explore_type == "boltzmann":
                action_value = self.Q.predict(self.sess, [state])[0]
                prob = self.softmax(action_value / self.tau)
                action_id = np.random.choice(self.num_actions, p=prob)
            else:
                raise ValueError("Invalid exploration type")
        return action_id

    def softmax(self, input):
        """
        Safe softmax function to avoid overflow.

        Args:
            input: input vector
        Returns:
            prob: softmax of input
        """
        input_max = np.max(input)
        e = np.exp(input - input_max)
        prob = e / np.sum(e)
        return prob

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)

class DQNAgent:

    def __init__(self, Q, Q_target, num_actions, discount_factor=0.99, batch_size=64, epsilon=0.05):
        """
        Q-Learning agent for off-policy TD control using Function Approximation.

        ########################################################################
        TD: use R + discount_factor * Q(S', A') as the new target.
        Off-policy -> old data collected under another policy can be used, too.
        ########################################################################

        Finds the optimal greedy policy while following an epsilon-greedy policy.

        Args:
            Q: Action-Value function estimator (Neural Network)
            Q_target: Slowly updated target network to calculate the targets.
            num_actions: Number of actions of the environment.
            discount_factor: gamma, discount factor of future rewards.
            batch_size: Number of samples per batch.
            epsilon: Chance to sample a random action. Float between 0 and 1.
        """
        self.Q = Q
        self.Q_target = Q_target
        self.epsilon = epsilon
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.discount_factor = discount_factor

        # define replay buffer
        self.replay_buffer = ReplayBuffer(use_manual_data=False)

        # Start tensorflow session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def train(self, state, action, next_state, reward, terminal, collect_data_first=False):
        """
        This method stores a transition to the replay buffer and updates the Q networks.
        """
        # add current transition to replay buffer
        self.replay_buffer.add_transition(state, action, next_state, reward, terminal)

        # if the replay buffer should be filled up first, skip the training step here
        if collect_data_first and len(self.replay_buffer._data.states) < self.batch_size:
            print("No training yet. Filling up replay buffer..")
            # return 0 for loss and q_values
            return 0, [0, 0]

        # if the replay buffer should not be filled up or is already full enough, do the following
        else:
            # get a random batch from the replay buffer
            batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \
                self.replay_buffer.next_batch(self.batch_size)

            batch_targets = np.zeros((self.batch_size))
            for i in range(self.batch_size):
                # if a state is a final state, only use the direct reward
                if batch_dones[i]:
                    batch_targets[i] = batch_rewards[i]
                # otherwise compute the td_target
                else:
                    td_target = batch_rewards[i] + self.discount_factor * \
                        np.max(self.Q_target.predict(self.sess, [batch_next_states[i]]))
                    batch_targets[i] = td_target

            # update Q network
            loss = self.Q.update(self.sess, batch_states, batch_actions, batch_targets)

            # get predictions to check q-values -> e.g. are they diverging?
            q_preds = self.Q.predict(self.sess, batch_states)

            # update target network
            self.Q_target.update(self.sess)

            return loss, q_preds

    def act(self, state, deterministic):
        """
        This method creates an epsilon-greedy policy based on the Q-function approximator and
        epsilon (probability to select a random action).

        Args:
            state: current state input
            deterministic: if True, the agent should execute the argmax action
                (False in training, True in evaluation)
        Returns:
            action id
        """
        r = np.random.uniform()
        if deterministic or r > self.epsilon:
            # take greedy action (argmax)
            action_id = np.argmax(self.Q.predict(self.sess, [state]))
            # print("Deterministic action:", action_id)
        else:
            # sample a random action
            # Hint for the exploration in CarRacing: sampling the action from a uniform
            # distribution will probably not work. You can sample the agent's actions with
            # different probabilities (need to sum up to 1) so that the agent will prefer
            # to accelerate or go straight. To see how the agent explores, turn the
            # rendering on in the training and look at what the agent is doing.
            if self.num_actions == 5:
                # for carracing: prefer going straight and accelerating
                action_id = np.random.choice(range(5), p=[0.32, 0.09, 0.09, 0.4, 0.1])
            else:
                # for cartpole
                action_id = np.random.randint(self.num_actions)
            # print("Explorative action:", action_id)
        return action_id

    def load(self, file_name):
        self.saver.restore(self.sess, file_name)