class MAML(Method):
    def __init__(self, sess, settings, netConfigOverride, stateShape, actionSize, nTrajs=1, **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        sess : Tensorflow Session
            Initialized Tensorflow session.
        settings : dict
            Settings containing the network configuration and the hyperparameters
            ("NetworkHPs") used in the method's training.
        netConfigOverride : dict
            Overrides applied to the network configuration.
        stateShape : list
            List of integers of the input shape. Ex: [39,39,6]
        actionSize : int
            Output size of the network.
        nTrajs : int (Optional)
            Number of trajectories to be created for collecting training data.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"], netConfigOverride=netConfigOverride, actionSize=actionSize, scope="local")
        self.Model2 = NetworkBuilder(networkConfig=settings["NetworkConfig"], netConfigOverride=netConfigOverride, actionSize=actionSize, scope="global")
        self.scope = scope = "MAML"

        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope("MAML"):
                #Placeholders
                if len(stateShape) == 4:
                    self.s = tf.placeholder(tf.float32, [None]+stateShape[1:4], 'S')
                else:
                    self.s = tf.placeholder(tf.float32, [None]+stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget')
                self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                _ = self.Model2(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]

                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))
                entropy = self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                critic_loss = self.critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(ratio, 1-self.HPs["eps"], 1+self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss')
                actor_loss = self.actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss')
                actor_loss = actor_loss - entropy * self.HPs["EntropyBeta"]
                loss = actor_loss + critic_loss * self.HPs["CriticBeta"]

                # Build Trainer. "Amsgrad" maps to Adam with amsgrad=True (the
                # original used Nadam, which does not accept that flag).
                optimizers = {"Adam": tf.keras.optimizers.Adam,
                              "RMS": tf.keras.optimizers.RMSprop,
                              "Adagrad": tf.keras.optimizers.Adagrad,
                              "Adadelta": tf.keras.optimizers.Adadelta,
                              "Adamax": tf.keras.optimizers.Adamax,
                              "Nadam": tf.keras.optimizers.Nadam,
                              "SGD": tf.keras.optimizers.SGD}
                if self.HPs["Optimizer"] == "Amsgrad":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"], amsgrad=True)
                    self.metaOptimizer = tf.keras.optimizers.Adam(self.HPs["Meta LR"], amsgrad=True)
                elif self.HPs["Optimizer"] in optimizers:
                    self.optimizer = optimizers[self.HPs["Optimizer"]](self.HPs["LR"])
                    self.metaOptimizer = optimizers[self.HPs["Optimizer"]](self.HPs["Meta LR"])
                else:
                    print("Invalid optimizer selected")
                    exit()

                vars1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope+'/local')
                self.gradients = self.optimizer.get_gradients(loss, vars1)
                self.update_ops = self.optimizer.apply_gradients(zip(self.gradients, vars1))

                with tf.name_scope("MetaUpdater"):
                    vars2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope+'/global')
                    self.meta_update_ops = self.metaOptimizer.apply_gradients(zip(self.gradients, vars2))

                with tf.name_scope('sync'):
                    self.pull_params_op = [l_p.assign(g_p) for l_p, g_p in zip(vars1, vars2)]

        #Creating variables for logging.
        self.EntropyMA = MovingAverage(400)
        self.CriticLossMA = MovingAverage(400)
        self.ActorLossMA = MovingAverage(400)
        self.GradMA = MovingAverage(400)
        self.counter = 0

    def next_task(self):
        if self.counter > 3:
            self.counter = 0
            self.sess.run(self.pull_params_op)
            return True
        else:
            return False

    def GetAction(self, state, episode=1, step=0):
        """
        Method to run data through the neural network.

        Parameters
        ----------
        state : np.array
            Data with the shape of [N, self.stateShape] where N is the number of samples.

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        try:
            probs, log_logits, v = self.sess.run([self.a_prob, self.log_logits, self.v], {self.s: state})
        except ValueError:
            probs, log_logits, v = self.sess.run([self.a_prob, self.log_logits, self.v], {self.s: np.expand_dims(state, axis=0)})
        actions = np.array([np.random.choice(probs.shape[1], p=prob / sum(prob)) for prob in probs])
        return actions, [v, log_logits]

    def Update(self, episode=0):
        """
        Processes the buffer and backpropagates the losses through the network.

        Parameters
        ----------
        episode : int (Optional)
            Current episode number.

        Returns
        -------
        N/A
        """
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):
            #Finding if there is more than 1 done in the sequence. Clipping values if required.
            td_target, advantage = self.ProcessBuffer(traj)

            batches = len(self.buffer[traj][0])//self.HPs["MinibatchSize"]+1
            s = np.array_split(self.buffer[traj][0], batches)
            a_his = np.array_split(np.asarray(self.buffer[traj][1]).reshape(-1), batches)
            td_target_ = np.array_split(td_target, batches)
            advantage_ = np.array_split(np.reshape(advantage, [-1]), batches)
            old_log_logits_ = np.array_split(np.reshape(self.buffer[traj][6], [-1, self.actionSize]), batches)

            #Create a dictionary with all of the samples?
            #Use a sampler to feed the update operation?

            #Staging buffer inputs into the entries to run through the network.
            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):
                    feed_dict = {self.s: np.squeeze(np.asarray(s[i])),
                                 self.a_his: np.asarray(a_his[i]),
                                 self.td_target_: np.asarray(td_target_[i]),
                                 self.advantage_: np.asarray(advantage_[i]),
                                 self.old_log_logits_: np.asarray(old_log_logits_[i])}
                    #Every fourth Update call routes the gradients through the meta-optimizer.
                    if self.counter == 3:
                        aLoss, cLoss, entropy, grads, _ = self.sess.run([self.actor_loss, self.critic_loss, self.entropy, self.gradients, self.meta_update_ops], feed_dict)
                    else:
                        aLoss, cLoss, entropy, grads, _ = self.sess.run([self.actor_loss, self.critic_loss, self.entropy, self.gradients, self.update_ops], feed_dict)

                    self.EntropyMA.append(entropy)
                    self.CriticLossMA.append(cLoss)
                    self.ActorLossMA.append(aLoss)
                    total_counter = 0
                    vanish_counter = 0
                    for grad in grads:
                        total_counter += np.prod(grad.shape)
                        vanish_counter += (np.absolute(grad) < 1e-8).sum()
                    self.GradMA.append(vanish_counter/total_counter)
        self.counter += 1
        self.ClearTrajectory()

    def GetStatistics(self):
        stats = {"Training Results/Entropy": self.EntropyMA(),
                 "Training Results/Loss Critic": self.CriticLossMA(),
                 "Training Results/Loss Actor": self.ActorLossMA(),
                 "Training Results/Vanishing Gradient": self.GradMA()}
        return stats

    def ProcessBuffer(self, traj):
        """
        Processes a trajectory into temporal-difference targets and advantages.

        Parameters
        ----------
        traj : int
            Index of the trajectory in the buffer.

        Returns
        -------
        td_target : list
            Temporal-difference targets for particular states.
        advantage : list
            Advantages for particular actions.
        """
        split_loc = [i+1 for i, x in enumerate(self.buffer[traj][4]) if x]
        reward_lists = np.split(self.buffer[traj][2], split_loc)
        value_lists = np.split(self.buffer[traj][5], split_loc)
        td_target = []; advantage = []
        for rew, value in zip(reward_lists, value_lists):
            td_target_i, advantage_i = gae(rew.reshape(-1), value.reshape(-1).tolist(), 0, self.HPs["Gamma"], self.HPs["lambda"])
            td_target.extend(td_target_i); advantage.extend(advantage_i)
        return td_target, advantage

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
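# --- Illustrative sketch (not part of the original source) -----------------
# ProcessBuffer above (and in the PPO/A2C classes below) relies on a
# module-level `gae` helper for Generalized Advantage Estimation. Its real
# implementation is not shown here; the function below is a minimal sketch
# consistent with the call signature used above:
# gae(rewards, values, bootstrap_value, gamma, lambda).
def gae_sketch(rewards, values, bootstrap_value, gamma, lambd):
    values = list(values) + [bootstrap_value]
    advantages = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]  # one-step TD error
        running = delta + gamma * lambd * running               # discounted sum of TD errors
        advantages[t] = running
    td_targets = [adv + v for adv, v in zip(advantages, values[:-1])]
    return td_targets, advantages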
class PPO(Method):
    def __init__(self, sess, settings, netConfigOverride, stateShape, actionSize, nTrajs=1, **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        sess : Tensorflow Session
            Initialized Tensorflow session.
        settings : dict
            Settings containing the network configuration and the hyperparameters
            ("NetworkHPs") used in the method's training.
        netConfigOverride : dict
            Overrides applied to the network configuration.
        stateShape : list
            List of integers of the input shape. Ex: [39,39,6]
        actionSize : int
            Output size of the network.
        nTrajs : int (Optional)
            Number of trajectories to be created for collecting training data.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.HPs = settings["NetworkHPs"]

        #Building the network.
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"], netConfigOverride=netConfigOverride, actionSize=actionSize)

        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope("PPO"):
                #Placeholders
                if len(stateShape) == 4:
                    # NOTE: MAML above slices stateShape[1:4] in the equivalent
                    # branch; one of the two slices is likely unintended.
                    self.s = tf.placeholder(tf.float32, [None] + stateShape[0:4], 'S')
                else:
                    self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget')
                self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]

                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))
                self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                self.critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(ratio, 1 - self.HPs["eps"], 1 + self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss')
                self.actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss')
                loss = self.actor_loss + self.critic_loss * self.HPs["CriticBeta"]

                # Build Trainer. "Amsgrad" maps to Adam with amsgrad=True (the
                # original used Nadam, which does not accept that flag).
                optimizers = {"Adam": tf.keras.optimizers.Adam,
                              "RMS": tf.keras.optimizers.RMSprop,
                              "Adagrad": tf.keras.optimizers.Adagrad,
                              "Adadelta": tf.keras.optimizers.Adadelta,
                              "Adamax": tf.keras.optimizers.Adamax,
                              "Nadam": tf.keras.optimizers.Nadam,
                              "SGD": tf.keras.optimizers.SGD}
                if self.HPs["Optimizer"] == "Amsgrad":
                    self.optimizerA = tf.keras.optimizers.Adam(self.HPs["LR Actor"], amsgrad=True)
                    self.optimizerE = tf.keras.optimizers.Adam(self.HPs["LR Entropy"], amsgrad=True)
                elif self.HPs["Optimizer"] in optimizers:
                    self.optimizerA = optimizers[self.HPs["Optimizer"]](self.HPs["LR Actor"])
                    self.optimizerE = optimizers[self.HPs["Optimizer"]](self.HPs["LR Entropy"])
                else:
                    print("Invalid optimizer selected")
                    exit()

                a_params = self.Model.GetVariables("Actor")
                c_params = self.Model.GetVariables("Critic")
                self.gradients_a = self.optimizerA.get_gradients(loss, self.Model.trainable_variables)
                self.update_op_a = self.optimizerA.apply_gradients(zip(self.gradients_a, self.Model.trainable_variables))

                entropy_loss = -self.entropy * self.HPs["EntropyBeta"]
                self.gradients_e = self.optimizerE.get_gradients(entropy_loss, a_params)
                self.update_op_e = self.optimizerE.apply_gradients(zip(self.gradients_e, a_params))

                # Fraction of near-zero gradient entries, tracked in-graph.
                total_counter = 1  # start at 1 to avoid division by zero
                vanish_counter = 0
                for gradient in self.gradients_a:
                    total_counter += np.prod(gradient.shape)
                    vanish_counter += tf.reduce_sum(tf.cast(tf.math.less_equal(tf.math.abs(gradient), tf.constant(1e-8)), tf.int32))
                self.vanishing_gradient = vanish_counter / total_counter

                self.update_ops = [self.update_op_a, self.update_op_e]
                self.logging_ops = [self.actor_loss, self.critic_loss, self.entropy,
                                    tf.reduce_mean(self.advantage_), tf.reduce_mean(ratio),
                                    loss, self.vanishing_gradient]
                self.labels = ["Loss Actor", "Loss Critic", "Entropy", "Advantage",
                               "PPO Ratio", "Loss Total", "Vanishing Gradient"]
                self.logging_MA = [MovingAverage(400) for i in range(len(self.logging_ops))]

    def GetAction(self, state, episode=1, step=0):
        """
        Method to run data through the neural network.

        Parameters
        ----------
        state : np.array
            Data with the shape of [N, self.stateShape] where N is the number of samples.

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        try:
            probs, log_logits, v = self.sess.run([self.a_prob, self.log_logits, self.v], {self.s: state})
        except ValueError:
            probs, log_logits, v = self.sess.run([self.a_prob, self.log_logits, self.v], {self.s: np.expand_dims(state, axis=0)})
        actions = np.array([np.random.choice(probs.shape[1], p=prob / sum(prob)) for prob in probs])
        return actions, [v, log_logits]

    def Update(self, episode=0):
        """
        Processes the buffer and backpropagates the losses through the network.

        Parameters
        ----------
        episode : int (Optional)
            Current episode number.

        Returns
        -------
        N/A
        """
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):
            #Finding if there is more than 1 done in the sequence. Clipping values if required.
            td_target, advantage = self.ProcessBuffer(traj)

            batches = len(self.buffer[traj][0]) // self.HPs["MinibatchSize"] + 1
            s = np.array_split(self.buffer[traj][0], batches)
            a_his = np.array_split(np.asarray(self.buffer[traj][1]).reshape(-1), batches)
            td_target_ = np.array_split(td_target, batches)
            advantage_ = np.array_split(np.reshape(advantage, [-1]), batches)
            old_log_logits_ = np.array_split(np.reshape(self.buffer[traj][6], [-1, self.actionSize]), batches)

            #Create a dictionary with all of the samples?
            #Use a sampler to feed the update operation?

            #Staging buffer inputs into the entries to run through the network.
            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):
                    feedDict = {self.s: np.squeeze(np.asarray(s[i])),
                                self.a_his: np.asarray(a_his[i]),
                                self.td_target_: np.asarray(td_target_[i]),
                                self.advantage_: np.asarray(advantage_[i]),
                                self.old_log_logits_: np.asarray(old_log_logits_[i])}
                    out = self.sess.run(self.update_ops + self.logging_ops, feedDict)
                    logging = out[len(self.update_ops):]
                    for j, log in enumerate(logging):
                        self.logging_MA[j].append(log)
        self.ClearTrajectory()

    def GetStatistics(self):
        stats = {}
        for i, label in enumerate(self.labels):
            stats["Training Results/" + label] = self.logging_MA[i]()
        return stats

    def ProcessBuffer(self, traj):
        """
        Processes a trajectory into temporal-difference targets and advantages.

        Parameters
        ----------
        traj : int
            Index of the trajectory in the buffer.

        Returns
        -------
        td_target : list
            Temporal-difference targets for particular states.
        advantage : list
            Advantages for particular actions.
        """
        split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][4]) if x]
        reward_lists = np.split(self.buffer[traj][2], split_loc)
        value_lists = np.split(self.buffer[traj][5], split_loc)
        td_target = []
        advantage = []
        for rew, value in zip(reward_lists, value_lists):
            td_target_i, advantage_i = gae(rew.reshape(-1), value.reshape(-1).tolist(), 0, self.HPs["Gamma"], self.HPs["lambda"])
            td_target.extend(td_target_i)
            advantage.extend(advantage_i)
        return td_target, advantage

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
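# --- Illustrative sketch (not part of the original source) -----------------
# Worked NumPy example of the clipped surrogate objective constructed in the
# graphs above: the ratio exp(log_prob - old_log_prob) is clipped to
# [1 - eps, 1 + eps], the element-wise minimum with the unclipped surrogate
# is taken, and the negated mean is minimized. Relies on the module's
# existing numpy import.
def clipped_surrogate_sketch(log_prob, old_log_prob, advantage, eps=0.2):
    ratio = np.exp(np.asarray(log_prob) - np.asarray(old_log_prob))
    surrogate = ratio * advantage
    clipped = np.clip(ratio, 1 - eps, 1 + eps) * advantage
    return -np.mean(np.minimum(surrogate, clipped))  # corresponds to actor_loss above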
class DQN_ms_v2(Method):
    def __init__(self, sess, settings, netConfigOverride, stateShape, actionSize, nTrajs=1, **kwargs):
        """
        Initializes I/O placeholders and the training process of a multi-step DQN.
        The main principle is that, instead of a one-step TD difference, the loss
        is evaluated against a temporally extended target:

            G = R_t + γR_{t+1} + ... + γ^{n-1}R_{t+n-1} + γ^n q(S_{t+n}, a*, θ⁻)
            loss = MSE(G, q(S_t, A_t, θ))
        """
        #Placeholders
        self.actionSize = actionSize
        self.sess = sess
        self.scope = "worker"
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"], netConfigOverride=netConfigOverride, actionSize=actionSize, scope="worker")
        self.Model_ = NetworkBuilder(networkConfig=settings["NetworkConfig"], netConfigOverride=netConfigOverride, actionSize=actionSize, scope="target")
        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            self.states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='states')
            self.next_states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='next_states')
            self.actions_ = tf.placeholder(shape=[None], dtype=tf.int32, name='actions_hold')
            self.rewards_ = tf.placeholder(shape=[None], dtype=tf.float32, name='rewards_hold')
            self.done_ = tf.placeholder(shape=[None], dtype=tf.float32, name='done_hold')

            with tf.name_scope("target"):
                out2 = self.Model_({"state": self.next_states_})
                q_next = out2["Q"]
                self.targetParams = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "target")

            with tf.name_scope(self.scope):
                input = {"state": self.states_}
                out = self.Model(input)
                self.q = out["Q"]

                with tf.name_scope('current_Q'):
                    oh_action = tf.one_hot(self.actions_, actionSize, dtype=tf.float32)  # [?, num_agent, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action), axis=-1)  # [?, num_agent]

                with tf.name_scope('target_Q'):
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    td_target = self.rewards_ + self.HPs["Gamma"] * max_next_q
                    # td_target = self.rewards_ + self.HPs["Gamma"] * max_next_q * (1. - self.done_)

                with tf.name_scope('td_error'):
                    loss = tf.keras.losses.MSE(td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(softmax_q * tf.log(softmax_q + 1e-5))
                    self.loss = total_loss = loss + self.HPs["EntropyBeta"] * self.entropy

                # Build Trainer. "Amsgrad" maps to Adam with amsgrad=True (the
                # original used Nadam, which does not accept that flag).
                optimizers = {"Adam": tf.keras.optimizers.Adam,
                              "RMS": tf.keras.optimizers.RMSprop,
                              "Adagrad": tf.keras.optimizers.Adagrad,
                              "Adadelta": tf.keras.optimizers.Adadelta,
                              "Adamax": tf.keras.optimizers.Adamax,
                              "Nadam": tf.keras.optimizers.Nadam,
                              "SGD": tf.keras.optimizers.SGD}
                if self.HPs["Optimizer"] == "Amsgrad":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"], amsgrad=True)
                elif self.HPs["Optimizer"] in optimizers:
                    self.optimizer = optimizers[self.HPs["Optimizer"]](self.HPs["LR"])
                else:
                    print("Invalid optimizer selected")
                    exit()

                self.workerParams = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
                self.gradients = self.optimizer.get_gradients(total_loss, self.workerParams)
                self.update_op = self.optimizer.apply_gradients(zip(self.gradients, self.workerParams))

                with tf.name_scope('push'):
                    self.push_ops = [t_p.assign(w_p) for t_p, w_p in zip(self.targetParams, self.workerParams)]

        self.grads = [self.gradients]
        self.losses = [self.loss]
        self.update_ops = [self.update_op]
        self.grad_MA = [MovingAverage(400) for i in range(len(self.grads))]
        self.loss_MA = [MovingAverage(400) for i in range(len(self.losses))]
        self.labels = ["Critic"]

    def GetAction(self, state, episode, step):
        """ Contains the code to run the network based on an input. """
        if len(state.shape) == 3:
            state = state[np.newaxis, :]
        if len(state.shape) == 1:
            state = state[np.newaxis, :]
        q = self.sess.run(self.q, {self.states_: state})
        if "Exploration" in self.HPs:
            if self.HPs["Exploration"] == "EGreedy":
                prob = self.HPs["ExploreSS"] + (1 - self.HPs["ExploreSS"]) * np.exp(-episode / self.HPs["ExplorationDecay"])
                if random.uniform(0, 1) < prob:
                    actions = random.randint(0, self.actionSize - 1)
                else:
                    actions = np.argmax(q, axis=-1)
            else:
                actions = np.argmax(q, axis=-1)
        else:
            actions = np.argmax(q, axis=-1)
        return actions, []  # returns an int and extra data that needs to be fed to the buffer.

    def Update(self, episode=0):
        """
        The main update function. Trains the worker network on multi-step
        targets, then syncs the target network with the worker network.
        """
        #Checking that there is enough data for a batch
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        #Combining all trajectories into one:
        s_list = []
        a_list = []
        done_list = []
        g_list = []
        s_n_list = []
        for traj in range(len(self.buffer)):
            g, s_n = MultiStepDiscountProcessing(self.buffer[traj][2], self.buffer[traj][3], self.HPs["Gamma"], self.HPs["MultiStep"])
            s_list.extend(self.buffer[traj][0])
            a_list.extend(self.buffer[traj][1])
            g_list.extend(g)
            s_n_list.extend(s_n)
            done_list.extend(self.buffer[traj][4])

        #Separating into different batches
        batches = len(s_list)//self.HPs["MinibatchSize"]+1
        s = np.array_split(s_list, batches)
        a_his = np.array_split(np.asarray(a_list).reshape(-1), batches)
        r = np.array_split(np.asarray(g_list).reshape(-1), batches)
        s_next = np.array_split(s_n_list, batches)
        done = np.array_split(done_list, batches)

        #Running all batches through multiple epochs
        for epoch in range(self.HPs["Epochs"]):
            for i in range(batches):
                #Create a feedDict from the buffer
                feedDict = {
                    self.states_: np.squeeze(np.asarray(s[i])),
                    self.next_states_: np.squeeze(np.asarray(s_next[i])),
                    self.actions_: np.squeeze(np.asarray(a_his[i])),
                    self.rewards_: np.squeeze(np.asarray(r[i])),
                    self.done_: np.squeeze(np.asarray(done[i], dtype=float))
                }
                out = self.sess.run(self.update_ops+self.losses+self.grads, feedDict)
                out = np.array_split(out, 3)
                losses = out[1]
                grads = out[2]

                for j, loss in enumerate(losses):
                    self.loss_MA[j].append(loss)

                for j, grads_i in enumerate(grads):
                    total_counter = 1
                    vanish_counter = 0
                    for grad in grads_i:
                        total_counter += np.prod(grad.shape)
                        vanish_counter += (np.absolute(grad) < 1e-8).sum()
                    self.grad_MA[j].append(vanish_counter/total_counter)

        self.ClearTrajectory()
        #Sync the target network with the worker network.
        self.sess.run(self.push_ops, feedDict)

    def GetStatistics(self):
        stats = {}
        for i, label in enumerate(self.labels):
            stats["Training Results/Vanishing Gradient " + label] = self.grad_MA[i]()
            stats["Training Results/Loss " + label] = self.loss_MA[i]()
        return stats

    @property
    def getVars(self):
        return self.Model.getVars(self.scope)
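# --- Illustrative sketch (not part of the original source) -----------------
# DQN_ms_v2.Update above calls a MultiStepDiscountProcessing helper defined
# elsewhere in the repo. The function below is a minimal sketch of what it
# plausibly computes, based on the call site (rewards and next states in,
# n-step discounted returns G and the bootstrap states out). It ignores
# episode boundaries, which the real helper may handle.
def multi_step_discount_sketch(rewards, next_states, gamma, n):
    g_list, s_n_list = [], []
    for t in range(len(rewards)):
        g = 0.0
        for k in range(min(n, len(rewards) - t)):
            g += (gamma ** k) * rewards[t + k]
        g_list.append(g)
        # next_states[t] holds S_{t+1}, so S_{t+n} is next_states[t+n-1].
        s_n_list.append(next_states[min(t + n - 1, len(next_states) - 1)])
    return g_list, s_n_list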
class PPO(Method):
    def __init__(self, sess, settings, netConfigOverride, stateShape, actionSize, nTrajs=1, **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        sess : Tensorflow Session
            Initialized Tensorflow session.
        settings : dict
            Settings containing the network configuration and the hyperparameters
            ("NetworkHPs") used in the method's training.
        netConfigOverride : dict
            Overrides applied to the network configuration.
        stateShape : list
            List of integers of the input shape. Ex: [39,39,6]
        actionSize : int
            Output size of the network.
        nTrajs : int (Optional)
            Number of trajectories to be created for collecting training data.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"], netConfigOverride=netConfigOverride, actionSize=actionSize)
        scope = "PPO"

        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=8) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                #Placeholders
                self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'TD_target')
                self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]

                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))
                entropy = self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                critic_loss = self.critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1)

                # Clipped surrogate function
                ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(ratio, 1 - self.HPs["eps"], 1 + self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss')
                actor_loss = self.actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss')
                loss = self.actor_loss + self.critic_loss * self.HPs["CriticBeta"]

                # Build Trainer. "Amsgrad" maps to Adam with amsgrad=True (the
                # original used Nadam, which does not accept that flag).
                optimizers = {"Adam": tf.keras.optimizers.Adam,
                              "RMS": tf.keras.optimizers.RMSprop,
                              "Adagrad": tf.keras.optimizers.Adagrad,
                              "Adadelta": tf.keras.optimizers.Adadelta,
                              "Adamax": tf.keras.optimizers.Adamax,
                              "Nadam": tf.keras.optimizers.Nadam,
                              "SGD": tf.keras.optimizers.SGD}
                if self.HPs["Optimizer"] == "Amsgrad":
                    self.optimizerA = tf.keras.optimizers.Adam(self.HPs["LR Actor"], amsgrad=True)
                    self.optimizerE = tf.keras.optimizers.Adam(self.HPs["LR Entropy"], amsgrad=True)
                elif self.HPs["Optimizer"] in optimizers:
                    self.optimizerA = optimizers[self.HPs["Optimizer"]](self.HPs["LR Actor"])
                    self.optimizerE = optimizers[self.HPs["Optimizer"]](self.HPs["LR Entropy"])
                else:
                    print("Invalid optimizer selected")
                    exit()

                a_params = self.Model.GetVariables("Actor")
                c_params = self.Model.GetVariables("Critic")
                self.gradients_a = self.optimizerA.get_gradients(loss, self.Model.trainable_variables)
                self.update_op_a = self.optimizerA.apply_gradients(zip(self.gradients_a, self.Model.trainable_variables))

                entropy_loss = -self.entropy * self.HPs["EntropyBeta"]
                self.gradients_e = self.optimizerE.get_gradients(entropy_loss, a_params)
                self.update_op_e = self.optimizerE.apply_gradients(zip(self.gradients_e, a_params))

                # Fraction of near-zero gradient entries, tracked in-graph.
                total_counter = 1  # start at 1 to avoid division by zero
                vanish_counter = 0
                for gradient in self.gradients_a:
                    total_counter += np.prod(gradient.shape)
                    vanish_counter += tf.reduce_sum(tf.cast(tf.math.less_equal(tf.math.abs(gradient), tf.constant(1e-8)), tf.int32))
                self.vanishing_gradient = vanish_counter / total_counter

                self.update_ops = [self.update_op_a, self.update_op_e]
                self.logging_ops = [self.actor_loss, self.critic_loss, self.entropy,
                                    tf.reduce_mean(self.advantage_), tf.reduce_mean(ratio),
                                    loss, self.vanishing_gradient]
                self.labels = ["Loss Actor", "Loss Critic", "Entropy", "Advantage",
                               "PPO Ratio", "Loss Total", "Vanishing Gradient"]
                self.logging_MA = [MovingAverage(400) for i in range(len(self.logging_ops))]
                self.count_MA = MovingAverage(400)

    def GetAction(self, state, episode=1, step=0):
        """
        Method to run data through the neural network.

        Parameters
        ----------
        state : np.array
            Data with the shape of [N, self.stateShape] where N is the number of samples.

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        try:
            probs, log_logits, v = self.sess.run([self.a_prob, self.log_logits, self.v], {self.s: state})
        except ValueError:
            probs, log_logits, v = self.sess.run([self.a_prob, self.log_logits, self.v], {self.s: np.expand_dims(state, axis=0)})
        actions = np.array([np.random.choice(probs.shape[1], p=prob / sum(prob)) for prob in probs])
        confid = -np.mean(probs * np.log(probs), axis=1)

        if step == 0:
            self.store_actions = actions
            self.old_confid = confid
            self.count = 0
            return actions, [v, log_logits, True]
        else:
            # Compare entropies: a lower value means a more confident policy.
            # (The comparison assumes nTrajs == 1, so `confid` has length 1.)
            if confid < self.old_confid:
                self.old_confid = confid
                self.store_actions = actions
                self.count_MA.append(self.count)
                self.count = 0
                return actions, [v, log_logits, True]
            else:
                #After repeated holds, anneal the confidence threshold.
                if self.count >= 4:
                    self.old_confid = np.maximum(self.old_confid + self.HPs["ConfidenceAnnealing"], self.HPs["MinConfidence"])
                self.count += 1
                return self.store_actions, [v, log_logits, False]

    def Update(self, episode=0):
        """
        Processes the buffer and backpropagates the losses through the network.

        Parameters
        ----------
        episode : int (Optional)
            Current episode number.

        Returns
        -------
        N/A
        """
        #Counting number of samples.
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):
            td_target_hier, advantage_hier, actions_hier, ll_hier, s_hier = self.ProcessBuffer(traj)

            for epoch in range(self.HPs["Epochs"]):
                for batch in MultiBatchDivider([s_hier, actions_hier, td_target_hier, advantage_hier, ll_hier], self.HPs["MinibatchSize"]):
                    #Staging buffer inputs into the entries to run through the network.
                    feedDict = {
                        self.s: np.asarray(batch[0]).squeeze(),
                        self.a_his: np.asarray(batch[1]).squeeze(),
                        self.td_target_: np.asarray(batch[2]).squeeze(),
                        self.advantage_: np.reshape(batch[3], [-1]),
                        self.old_log_logits_: np.asarray(batch[4]).squeeze()
                    }
                    out = self.sess.run(self.update_ops + self.logging_ops, feedDict)
                    logging = out[len(self.update_ops):]
                    for i, log in enumerate(logging):
                        self.logging_MA[i].append(log)
        self.ClearTrajectory()

    def GetStatistics(self):
        stats = {}
        for i, label in enumerate(self.labels):
            stats["Training Results/" + label] = self.logging_MA[i]()
        stats["Training Results/Average Traj Length"] = self.count_MA()
        return stats

    def ProcessBuffer(self, traj):
        """
        Processes a trajectory into hierarchical training data.

        Parameters
        ----------
        traj : int
            Index of the trajectory in the buffer.

        Returns
        -------
        td_target_hier : list
            Temporal-difference targets for the high-level controller.
        advantage_hier : list
            Advantages for the high-level controller.
        actions : list
            High-level actions at each decision point.
        ll : list
            Log logits at each decision point.
        s : list
            States at each decision point.
        """
        # Split into different episodes based on the "done" signal. Assumes that the episode terminates at done.
        # Cannot account for instances where there are multiple done signals in a row.
        split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][4]) if x]

        reward_lists = np.split(self.buffer[traj][2], split_loc[:-1])

        #Data needed for the high-level controller.
        HL_S_lists = np.split(self.buffer[traj][0], split_loc[:-1])
        HL_Critic_lists = np.split(self.buffer[traj][5], split_loc[:-1])
        HL_Logits_lists = np.split(self.buffer[traj][6], split_loc[:-1])
        HL_action_lists = np.split(self.buffer[traj][1], split_loc[:-1])
        HL_flag_lists = np.split(self.buffer[traj][7], split_loc[:-1])

        td_target_hier = []
        advantage_hier = []
        ll = []
        actions = []
        s = []
        for rew, HL_critic, HL_ll, HL_a, HL_flag, HL_s in zip(reward_lists, HL_Critic_lists, HL_Logits_lists, HL_action_lists, HL_flag_lists, HL_S_lists):
            #Collapsing different trajectory lengths for the hierarchical controller.
            split_loc_ = [i for i, x in enumerate(HL_flag[:-1]) if x][1:]
            rew_hier = [np.sum(l) for l in np.split(rew, split_loc_)]
            value_hier = [l[0] for l in np.split(HL_critic, split_loc_)]
            actions.extend([l[0] for l in np.split(HL_a, split_loc_)])
            ll.extend([l[0] for l in np.split(HL_ll, split_loc_)])
            s.extend([l[0] for l in np.split(HL_s, split_loc_)])

            #Calculating the td_target and advantage for the hierarchical controller.
            td_target_i_, advantage_i_ = gae(np.asarray(rew_hier).reshape(-1).tolist(),
                                             np.asarray(value_hier).reshape(-1).tolist(),
                                             0, self.HPs["Gamma"], self.HPs["lambda"])
            td_target_hier.extend(td_target_i_)
            advantage_hier.extend(advantage_i_)

        return td_target_hier, advantage_hier, actions, ll, s

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
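# --- Illustrative sketch (not part of the original source) -----------------
# ProcessBuffer above collapses low-level timesteps into one entry per
# high-level decision: rewards between decision flags are summed, while the
# value/logits/action recorded at each flag are kept. Minimal sketch of the
# reward collapsing, mirroring the split logic used above:
def collapse_rewards_by_flags_sketch(rewards, flags):
    split_loc = [i for i, flag in enumerate(flags[:-1]) if flag][1:]
    return [np.sum(chunk) for chunk in np.split(np.asarray(rewards), split_loc)]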
class PPO(Method):
    def __init__(self, sess, settings, netConfigOverride, stateShape, actionSize, nTrajs=1, **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        sess : Tensorflow Session
            Initialized Tensorflow session.
        settings : dict
            Settings containing the network configuration and the hyperparameters
            ("NetworkHPs") used in the method's training.
        netConfigOverride : dict
            Overrides applied to the network configuration.
        stateShape : list
            List of integers of the input shape. Ex: [39,39,6]
        actionSize : int
            Output size of the network.
        nTrajs : int (Optional)
            Number of trajectories to be created for collecting training data.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.HPs = settings["NetworkHPs"]
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"], netConfigOverride=netConfigOverride, actionSize=actionSize)
        scope = "PPO"

        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                #Placeholders
                self.s = tf.placeholder(tf.float32, [None]+stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [None, ], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'TD_target')
                self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.old_log_logits_ = tf.placeholder(shape=[None, actionSize], dtype=tf.float32, name='old_logit_hold')

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.a_prob = out["actor"]
                self.v = out["critic"]
                self.log_logits = out["log_logits"]

                # Entropy
                def _log(val):
                    return tf.log(tf.clip_by_value(val, 1e-10, 10.0))
                entropy = self.entropy = -tf.reduce_mean(self.a_prob * _log(self.a_prob), name='entropy')

                # Critic Loss
                td_error = self.td_target_ - self.v
                critic_loss = self.critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss')

                # Actor Loss
                action_OH = tf.one_hot(self.a_his, actionSize, dtype=tf.float32)
                log_prob = tf.reduce_sum(self.log_logits * action_OH, 1)
                old_log_prob = tf.reduce_sum(self.old_log_logits_ * action_OH, 1)

                # Clipped surrogate function
                ratio = self.ratio = tf.exp(log_prob - old_log_prob)
                surrogate = ratio * self.advantage_
                clipped_surrogate = tf.clip_by_value(ratio, 1-self.HPs["eps"], 1+self.HPs["eps"]) * self.advantage_
                surrogate_loss = tf.minimum(surrogate, clipped_surrogate, name='surrogate_loss')
                actor_loss = self.actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss')
                actor_loss = actor_loss - entropy * self.HPs["EntropyBeta"]
                loss = actor_loss + critic_loss * self.HPs["CriticBeta"]

                # Build Trainer
                self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                self.gradients = self.optimizer.get_gradients(loss, self.Model.trainable_variables)
                self.update_ops = self.optimizer.apply_gradients(zip(self.gradients, self.Model.trainable_variables))

        #Creating variables for logging.
        self.EntropyMA = MovingAverage(400)
        self.CriticLossMA = MovingAverage(400)
        self.ActorLossMA = MovingAverage(400)
        self.GradMA = MovingAverage(400)

    def GetAction(self, state, episode=1, step=0):
        """
        Method to run data through the neural network.

        Parameters
        ----------
        state : np.array
            Data with the shape of [N, self.stateShape] where N is the number of samples.

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        try:
            probs, log_logits, v = self.sess.run([self.a_prob, self.log_logits, self.v], {self.s: state})
        except ValueError:
            probs, log_logits, v = self.sess.run([self.a_prob, self.log_logits, self.v], {self.s: np.expand_dims(state, axis=0)})
        actions = np.array([np.random.choice(probs.shape[1], p=prob / sum(prob)) for prob in probs])
        #Sample a fresh action every FS steps; otherwise repeat the stored one.
        if step % self.HPs["FS"] == 0:
            self.store_actions = actions
            return actions, [v, log_logits]
        else:
            return self.store_actions, [v, log_logits]

    def Update(self, episode=0):
        """
        Processes the buffer and backpropagates the losses through the network.

        Parameters
        ----------
        episode : int (Optional)
            Current episode number.

        Returns
        -------
        N/A
        """
        #Counting number of samples.
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):
            td_target, advantage = self.ProcessBuffer(traj)

            batches = len(self.buffer[traj][0])//self.HPs["MinibatchSize"]+1
            s = np.array_split(self.buffer[traj][0], batches)
            a_his = np.array_split(np.asarray(self.buffer[traj][1]).reshape(-1), batches)
            td_target_ = np.array_split(td_target, batches)
            advantage_ = np.array_split(np.reshape(advantage, [-1]), batches)
            old_log_logits_ = np.array_split(np.reshape(self.buffer[traj][6], [-1, self.actionSize]), batches)

            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):
                    #Staging buffer inputs into the entries to run through the network.
                    feed_dict = {self.s: np.squeeze(s[i]),
                                 self.a_his: a_his[i],
                                 self.td_target_: td_target_[i],
                                 self.advantage_: advantage_[i],
                                 self.old_log_logits_: old_log_logits_[i]}
                    aLoss, cLoss, entropy, grads, _ = self.sess.run([self.actor_loss, self.critic_loss, self.entropy, self.gradients, self.update_ops], feed_dict)

                    self.EntropyMA.append(entropy)
                    self.CriticLossMA.append(cLoss)
                    self.ActorLossMA.append(aLoss)
                    total_counter = 0
                    vanish_counter = 0
                    for grad in grads:
                        total_counter += np.prod(grad.shape)
                        vanish_counter += (np.absolute(grad) < 1e-8).sum()
                    self.GradMA.append(vanish_counter/total_counter)
        self.ClearTrajectory()

    def GetStatistics(self):
        stats = {"Training Results/Entropy": self.EntropyMA(),
                 "Training Results/Loss Critic": self.CriticLossMA(),
                 "Training Results/Loss Actor": self.ActorLossMA(),
                 "Training Results/Vanishing Gradient": self.GradMA()}
        return stats

    def ProcessBuffer(self, traj):
        """
        Processes a trajectory into temporal-difference targets and advantages.

        Parameters
        ----------
        traj : int
            Index of the trajectory in the buffer.

        Returns
        -------
        td_target : list
            Temporal-difference targets for particular states.
        advantage : list
            Advantages for particular actions.
        """
        # Split into different episodes based on the "done" signal. Assumes that the episode terminates at done.
        # Cannot account for instances where there are multiple done signals in a row.
        split_loc = [i+1 for i, x in enumerate(self.buffer[traj][4]) if x]
        reward_lists = np.split(self.buffer[traj][2], split_loc)
        value_lists = np.split(self.buffer[traj][5], split_loc)
        td_target = []; advantage = []
        for rew, value in zip(reward_lists, value_lists):
            td_target_i, advantage_i = gae(rew.reshape(-1).tolist(), value.reshape(-1).tolist(), 0, self.HPs["Gamma"], self.HPs["lambda"])
            td_target.extend(td_target_i); advantage.extend(advantage_i)
        return td_target, advantage

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
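# --- Illustrative sketch (not part of the original source) -----------------
# GetAction above implements fixed frame-skipping: a fresh action is sampled
# only when step % HPs["FS"] == 0 and repeated otherwise. The standalone
# function below shows the same pattern outside the class:
def frame_skip_actions_sketch(sample_action, n_steps, frame_skip):
    actions, held = [], None
    for step in range(n_steps):
        if step % frame_skip == 0:
            held = sample_action()  # query the policy on skip boundaries
        actions.append(held)        # repeat the held action in between
    return actions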
class OptionCritic(Method):
    def __init__(self, sess, settings, netConfigOverride, stateShape, actionSize, nTrajs=1, **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        sess : Tensorflow Session
            Initialized Tensorflow session.
        settings : dict
            Settings containing the network configuration and the hyperparameters
            ("NetworkHPs") used in the method's training.
        netConfigOverride : dict
            Overrides applied to the network configuration.
        stateShape : list
            List of integers of the input shape. Ex: [39,39,6]
        actionSize : int
            Output size of the network.
        nTrajs : int (Optional)
            Number of trajectories to be created for collecting training data.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"], netConfigOverride=netConfigOverride, actionSize=actionSize)
        self.method = "Confidence"  #Create an input for this.
        self.HPs = settings["NetworkHPs"]
        self.subReward = False
        self.UpdateSubpolicies = True
        self.nTrajs = nTrajs

        #Creating buffer
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]  #[s0,a,r,s1,done]+[HL_action]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope("OptionCritic"):
                #Generic placeholders
                self.batch_size = tf.placeholder(tf.int32, 1, 'BS')
                self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
                self.actions = tf.placeholder(tf.int32, [None, ], 'A')
                self.rewards = tf.placeholder(tf.float32, [None], 'R')
                self.options = tf.placeholder(shape=[None], dtype=tf.int32, name="options")
                batch_indexer = tf.range(tf.reshape(self.batch_size, []))

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.term = out["metaTermination"]
                self.q = out["metaCritic"]
                self.sub_a_prob = out["subActor"]
                self.sub_log_logits = out["subLogLogits"]
                self.nPolicies = len(self.sub_a_prob)

                # Creating the loss and update calls for the hierarchical policy.
                # Indexers
                self.responsible_options = tf.stack([batch_indexer, self.options], axis=1)
                self.responsible_actions = tf.stack([batch_indexer, self.actions], axis=1)
                self.network_indexer = tf.stack([self.options, batch_indexer], axis=1)

                # Q values OVER options
                self.disconnected_q_vals = tf.stop_gradient(self.q)

                # Q values of each option that was taken
                self.responsible_opt_q_vals = tf.gather_nd(params=self.q, indices=self.responsible_options)  # Extract q values for each option
                self.disconnected_q_vals_option = tf.gather_nd(params=self.disconnected_q_vals, indices=self.responsible_options)

                # Termination probability of each option that was taken
                self.terminations = tf.gather_nd(params=self.term, indices=self.responsible_options)

                # Q values for each action that was taken
                relevant_networks = tf.gather_nd(params=self.sub_a_prob, indices=self.network_indexer)
                relevant_networks = tf.nn.softmax(relevant_networks, axis=1)
                self.action_values = tf.gather_nd(params=relevant_networks, indices=self.responsible_actions)

                # Weighted average value
                option_eps = 0.001
                self.value = tf.reduce_max(self.q) * (1 - option_eps) + (option_eps * tf.reduce_mean(self.q))
                disconnected_value = tf.stop_gradient(self.value)

                # Losses; TODO: Why reduce sum vs reduce mean?
                # NOTE: `_log` here is assumed to be a module-level clipped-log
                # helper like the one defined locally in the classes above.
                vf_coef = 0.5
                # (vf_coef was previously applied twice in this expression.)
                self.value_loss = vf_coef * tf.reduce_mean(0.5 * tf.square(self.rewards - self.responsible_opt_q_vals))
                self.policy_loss = tf.reduce_mean(_log(self.action_values) * (self.rewards - self.disconnected_q_vals_option))
                self.deliberation_costs = 0.020
                self.termination_loss = tf.reduce_mean(self.terminations * ((self.disconnected_q_vals_option - disconnected_value) + self.deliberation_costs))

                ent_coef = 0.01
                action_probabilities = self.sub_a_prob
                self.entropy = ent_coef * tf.reduce_mean(action_probabilities * _log(action_probabilities))

                self.loss = -self.policy_loss - self.entropy - self.value_loss - self.termination_loss

                variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "OptionCritic")
                optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                gradients = optimizer.get_gradients(self.loss, variables)
                self.update_op = optimizer.apply_gradients(zip(gradients, variables))

        #Creating variables for the purpose of logging.
        self.SubpolicyDistribution = MovingAverage(1000)

    def GetAction(self, state, step, episode=0):
        """
        Method to run data through the hierarchical network.
        First, run the state through the meta network to select the subpolicy to use.
        Second, run the state through the chosen subpolicy.

        ToDo: Check whether it is faster to run the entire network and select the
        appropriate subpolicy afterwards, or to run only the required part.

        Parameters
        ----------
        state : np.array
            Data with the shape of [N, self.stateShape] where N is the number of samples.

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        #Determine the number of steps and whether to initiate confidence based on the length of the buffer.
        if step == 0:
            self.pastActions = [None] * self.nTrajs

        # Run the Meta and Sub-policy Networks
        targets = [self.q, self.term] + self.sub_a_prob + self.sub_log_logits
        res = self.sess.run(targets, {self.s: state})
        q = res[0]
        terminations = res[1]
        sub_probs = res[2:2 + self.nPolicies]  # was res[2:3+self.nPolicies]: off-by-one
        sub_log_logits = res[2 + self.nPolicies:2 + 2 * self.nPolicies]

        HL_actions = []
        for i, term in enumerate(terminations):
            if step == 0:
                action = np.argmax(q[i])
                HL_actions.append(action)
                self.pastActions[i] = action
            elif random.uniform(0, 1) < term[self.pastActions[i]]:
                # action = np.argmax(q[i])
                action = random.randint(0, 2)
                HL_actions.append(action)
                self.pastActions[i] = action
            else:
                # NOTE: both non-initial branches currently sample a random
                # option; persisting with the past option is commented out
                # below, which looks like leftover debug code.
                action = random.randint(0, 2)
                HL_actions.append(action)
                # HL_actions.append(self.pastActions[i])
        self.traj_action = HL_actions
        print(q, HL_actions)  # debug output

        # Run the Subpolicy Network
        actions = np.array([np.random.choice(self.actionSize, p=sub_probs[mod][idx] / sum(sub_probs[mod][idx]))
                            for idx, mod in enumerate(HL_actions)])
        logits = [sub_log_logits[mod][idx] for idx, mod in enumerate(HL_actions)]

        return actions, [HL_actions, q]

    def Update(self, HPs):
        """
        Processes the buffer and backpropagates the losses through the network.

        Parameters
        ----------
        HPs : dict
            Hyperparameters for training.

        Returns
        -------
        N/A
        """
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):
            advantage = self.ProcessBuffer(traj)

            # Updating the Hierarchical Controller
            for epoch in range(self.HPs["Epochs"]):
                for batch in MultiBatchDivider([self.buffer[traj][0], self.buffer[traj][1], advantage, self.buffer[traj][5]], self.HPs["MinibatchSize"]):
                    feed_dict = {
                        self.batch_size: [np.asarray(batch[0]).squeeze().shape[0]],
                        self.s: np.asarray(batch[0]).squeeze(),
                        self.actions: np.asarray(batch[1]).squeeze(),
                        self.rewards: np.asarray(batch[2]).squeeze(),
                        self.options: np.reshape(batch[3], [-1])
                    }
                    self.sess.run(self.update_op, feed_dict)

            self.SubpolicyDistribution.extend(np.asarray(self.buffer[traj][5]))
        self.ClearTrajectory()

    def GetStatistics(self):
        stats = {}
        for i in range(self.nPolicies):
            length = len(self.SubpolicyDistribution.tolist())
            if length == 0:
                length = 1
            stats["Subpolicy Use/" + str(i)] = self.SubpolicyDistribution.tolist().count(i) / length
        return stats

    def ProcessBuffer(self, traj):
        """
        Processes a trajectory into advantages for the hierarchical controller.

        Parameters
        ----------
        traj : int
            Index of the trajectory in the buffer.

        Returns
        -------
        advantage : list
            Advantages for particular actions.
        """
        #Splitting the buffer into different episodes based on the done tag.
        split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][4]) if x]

        #Data to be processed for the low-level controllers.
        reward_lists = np.split(self.buffer[traj][2], split_loc[:-1])
        value_lists = np.split(self.buffer[traj][6], split_loc[:-1])
        HL_action_lists = np.split(self.buffer[traj][5], split_loc[:-1])

        td_target = []
        advantage = []
        for rew, value, options in zip(reward_lists, value_lists, HL_action_lists):
            # Calculating the per-step advantage of each of the different sections
            val = []
            for i, option in enumerate(options):
                val.append(value[i, 0, option])
            td_target_i, advantage_i = gae(rew.reshape(-1).tolist(), np.asarray(val).reshape(-1).tolist(), 0, self.HPs["Gamma"], self.HPs["lambda"])
            td_target.extend(td_target_i)
            advantage.extend(advantage_i)
        return advantage

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
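# --- Illustrative sketch (not part of the original source) -----------------
# The canonical option-critic switching rule that GetAction above appears to
# target: keep the current option until its termination head fires, then
# re-select greedily from the Q-values over options. (As written, both
# non-initial branches above sample a random option instead.) Relies on the
# module's existing numpy/random imports.
def option_switch_sketch(q_values, term_probs, current_option):
    if random.uniform(0, 1) < term_probs[current_option]:
        return int(np.argmax(q_values))  # terminate: re-select greedily
    return current_option                # persist with the current option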
class AC(Method):
    def __init__(self, sess, settings, netConfigOverride, stateShape, actionSize, nTrajs=1, **kwargs):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes an Actor and a Critic network to be used for RL.
        """
        #Creating appropriate buffer for the method.
        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]

        #Placeholders
        self.sess = sess
        self.HPs = settings["NetworkHPs"]
        self.s = tf.placeholder(dtype=tf.float32, shape=[None]+stateShape, name="state")
        self.a = tf.placeholder(tf.int32, [None, 1], "act")
        self.v_ = tf.placeholder(tf.float32, [None, 1], "v_next")
        self.r = tf.placeholder(tf.float32, [None, 1], 'r')

        #These need to be returned in the call function of a tf.keras.Model class.
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"], netConfigOverride=netConfigOverride, actionSize=actionSize)
        inputs = {"state": self.s}
        out = self.Model(inputs)
        self.acts_prob = out["actor"]
        self.critic = out["critic"]

        #Defining training operations which will be called in the Update function.
        with tf.variable_scope('Update_Operation'):
            with tf.name_scope('squared_TD_error'):
                # TD_error = (r + gamma * V_next) - V_eval
                self.td_error = self.r + self.HPs["Gamma"] * self.v_ - self.critic
                self.c_loss = tf.reduce_mean(tf.square(self.td_error))

            with tf.name_scope('train_critic'):
                self.c_params = self.Model.GetVariables("Critic")
                self.c_grads = tf.gradients(self.c_loss, self.c_params)
                self.update_c_op = tf.train.AdamOptimizer(self.HPs["Critic LR"]).apply_gradients(zip(self.c_grads, self.c_params))

            with tf.name_scope('exp_v'):
                log_prob = tf.log(self.acts_prob + 1e-5) * tf.one_hot(self.a, actionSize, dtype=tf.float32)
                self.a_loss = -tf.reduce_mean(log_prob * self.td_error)  # advantage (TD error) guided loss

            with tf.name_scope('train_actor'):
                self.a_params = self.Model.GetVariables("Actor")
                self.a_grads = tf.gradients(self.a_loss, self.a_params)
                self.update_a_op = tf.train.AdamOptimizer(self.HPs["Actor LR"]).apply_gradients(zip(self.a_grads, self.a_params))

        self.update_ops = [self.update_c_op, self.update_a_op]
        # NOTE: `_log` is assumed to be a module-level clipped-log helper like
        # the one defined locally inside the PPO/MAML constructors above.
        self.entropy = -tf.reduce_mean(self.acts_prob * _log(self.acts_prob), name='entropy')
        self.logging_ops = [self.a_loss, self.c_loss, self.entropy]
        self.labels = ["Loss Actor", "Loss Critic", "Entropy"]
        self.logging_MA = [MovingAverage(400) for i in range(len(self.logging_ops))]

    def GetAction(self, state, episode=0, step=0):
        """ Contains the code to run the network based on an input. """
        try:
            s = state[np.newaxis, :]
            probs = self.sess.run(self.acts_prob, {self.s: s})  # get probabilities for all actions
        except ValueError:
            probs = self.sess.run(self.acts_prob, {self.s: state})  # get probabilities for all actions
        return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel()), []  # returns an int

    def Update(self, episode=0):
        """ Takes an input buffer and applies the updates to the networks through gradient backpropagation. """
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < 1:
            return

        for traj in range(len(self.buffer)):
            v_ = self.sess.run(self.critic, {self.s: np.vstack(self.buffer[traj][3])})
            feedDict = {self.s: np.vstack(self.buffer[traj][0]),
                        self.v_: v_,
                        self.r: np.vstack(self.buffer[traj][2]),
                        self.a: np.vstack(self.buffer[traj][1])}
            out = self.sess.run(self.update_ops+self.logging_ops, feedDict)
            logging = out[len(self.update_ops):]
            for i, log in enumerate(logging):
                self.logging_MA[i].append(log)

        #Clear or reset the buffer.
        self.ClearTrajectory()

    def GetStatistics(self):
        stats = {}
        for i, label in enumerate(self.labels):
            stats["Training Results/" + label] = self.logging_MA[i]()
        return stats

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
class A2C(Method): def __init__(self,sess,settings,netConfigOverride,stateShape,actionSize,nTrajs=1,**kwargs): """ Initializes I/O placeholders used for Tensorflow session runs. Initializes and Actor and Critic Network to be used for the purpose of RL. """ #Creating appropriate buffer for the method. self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)] #Placeholders self.sess=sess self.HPs = settings["NetworkHPs"] self.s = tf.placeholder(dtype=tf.float32, shape=[None]+stateShape, name="state") self.a = tf.placeholder(tf.int32, [None,1], "act") # self.td_error = tf.placeholder(tf.float32, None, "td_error") # TD_error self.v_ = tf.placeholder(tf.float32, [None, 1], "v_next") self.r = tf.placeholder(tf.float32, [None,1], 'r') #These need to be returned in the call function of a tf.keras.Model class. self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],netConfigOverride=netConfigOverride,actionSize=actionSize) inputs = {"state":self.s} out = self.Model(inputs) self.acts_prob = out["actor"] self.critic = out["critic"] #Defining Training Operations which will be called in the Update Function. with tf.variable_scope('Update_Operation'): with tf.name_scope('squared_TD_error'): self.td_error = self.r + self.HPs["Gamma"] * self.v_ - self.critic self.loss = tf.reduce_mean(tf.square(self.td_error)) # TD_error = (r+gamma*V_next) - V_eval with tf.name_scope('train_critic'): self.train_op_c = tf.train.AdamOptimizer(self.HPs["Critic LR"]).minimize(self.loss) with tf.name_scope('exp_v'): log_prob = tf.log(self.acts_prob + 1e-5) * tf.one_hot(self.a, actionSize, dtype=tf.float32) self.exp_v = tf.reduce_mean(log_prob * self.td_error) # advantage (TD_error) guided loss with tf.name_scope('train_actor'): self.train_op_a = tf.train.AdamOptimizer(self.HPs["Actor LR"]).minimize(-self.exp_v) # minimize(-exp_v) = maximize(exp_v) self.update_ops=[self.train_op_c,self.train_op_a] self.entropy = -tf.reduce_mean(self.acts_prob * _log(self.acts_prob), name='entropy') self.logging_ops = [self.exp_v,self.loss,self.entropy] self.labels = ["Loss Actor","Loss Critic","Entropy"] self.logging_MA = [MovingAverage(400) for i in range(len(self.logging_ops))] def GetAction(self, state, episode=0, step=0): """ Contains the code to run the network based on an input. """ try: s = state[np.newaxis, :] probs,critic = self.sess.run(self.acts_prob, {self.s: s}) # get probabilities for all actions except ValueError: probs,critic = self.sess.run(self.acts_prob, {self.s: state}) # get probabilities for all actions return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel()), [critic] # return a int def Update(self, episode=0): """ Takes an input buffer and applies the updates to the networks through gradient backpropagation """ samples=0 for i in range(len(self.buffer)): samples +=len(self.buffer[i]) if samples < 1: return for traj in range(len(self.buffer)): td_target,advantage=self.ProcessBuffer(traj) v_ = self.sess.run(self.critic, {self.s: np.vstack(self.buffer[traj][3])}) feedDict = {self.s: np.vstack(self.buffer[traj][0]), self.v_: v_, self.r: np.vstack(self.buffer[traj][2]), self.a:np.vstack(self.buffer[traj][1]) } out = self.sess.run(self.update_ops+self.logging_ops, feedDict) # local grads applied to global net. logging = out[len(self.update_ops):] for i,log in enumerate(logging): self.logging_MA[i].append(log) def ProcessBuffer(self,traj): """ Process the buffer to calculate td_target. Parameters ---------- Model : HPs Hyperparameters for training. traj : Trajectory Data stored by the neural network. 
        Returns
        -------
        td_target : list
            Temporal Difference targets for the states in the trajectory.
        advantage : list
            Advantages for the actions taken in the trajectory.
        """
        # Split the trajectory at episode boundaries (done flags) so GAE does
        # not bootstrap across episodes.
        split_loc = [i+1 for i, x in enumerate(self.buffer[traj][4]) if x]
        reward_lists = np.split(self.buffer[traj][2], split_loc)
        value_lists = np.split(self.buffer[traj][5], split_loc)

        td_target = []; advantage = []
        for rew, value in zip(reward_lists, value_lists):
            td_target_i, advantage_i = gae(rew.reshape(-1), value.reshape(-1).tolist(), 0, self.HPs["Gamma"], self.HPs["lambda"])
            td_target.extend(td_target_i); advantage.extend(advantage_i)
        return td_target, advantage

    def GetStatistics(self):
        stats = {}
        for i, label in enumerate(self.labels):
            stats["Training Results/" + label] = self.logging_MA[i]()
        return stats

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
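# ---------------------------------------------------------------------------
# Reference sketch of the `gae` helper called in ProcessBuffer above. The real
# implementation lives elsewhere in this repo (it is imported, not defined
# here); this assumed-equivalent version of Generalized Advantage Estimation
# is included only to document the expected signature and behavior.
# ---------------------------------------------------------------------------
def _gae_sketch(rewards, values, bootstrap, gamma, lam):
    """Return (td_targets, advantages) for one episode segment.

    rewards   : per-step rewards
    values    : critic value estimates V(s_t) for each step
    bootstrap : value estimate for the state after the final step (0 if terminal)
    """
    values = list(values) + [bootstrap]
    advantages = []
    gae_acc = 0.0
    # Work backwards, accumulating delta_t = r_t + gamma*V(s_{t+1}) - V(s_t).
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae_acc = delta + gamma * lam * gae_acc
        advantages.insert(0, gae_acc)
    # TD targets are the advantages added back onto the value baseline.
    td_targets = [adv + v for adv, v in zip(advantages, values[:-1])]
    return td_targets, advantages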
class DQN(Method):
    def __init__(self, sess, settings, netConfigOverride, stateShape, actionSize, nTrajs=1, **kwargs):
        """
        Initializes I/O placeholders used for Tensorflow session runs.
        Initializes a Q network to be used for the purpose of RL.
        """
        #Placeholders
        self.sess = sess
        self.scope = "DQN"
        self.HPs = settings["NetworkHPs"]
        self.actionSize = actionSize
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"], netConfigOverride=netConfigOverride, actionSize=actionSize)
        self.buffer = [Trajectory(depth=5) for _ in range(nTrajs)]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(self.scope):
                if len(stateShape) == 4:
                    self.states_ = tf.placeholder(shape=[None]+stateShape[1:4], dtype=tf.float32, name='states')
                    self.next_states_ = tf.placeholder(shape=[None]+stateShape[1:4], dtype=tf.float32, name='next_states')
                else:
                    self.states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='states')
                    self.next_states_ = tf.placeholder(shape=[None]+stateShape, dtype=tf.float32, name='next_states')
                self.actions_ = tf.placeholder(shape=[None], dtype=tf.int32, name='actions_hold')
                self.rewards_ = tf.placeholder(shape=[None], dtype=tf.float32, name='rewards_hold')
                self.done_ = tf.placeholder(shape=[None], dtype=tf.float32, name='done_hold')

                inputs = {"state": self.states_}
                out = self.Model(inputs)
                self.q = out["Q"]
                out2 = self.Model({"state": self.next_states_})
                q_next = out2["Q"]

                with tf.name_scope('current_Q'):
                    oh_action = tf.one_hot(self.actions_, actionSize, dtype=tf.float32)  # [?, action_size]
                    curr_q = tf.reduce_sum(tf.multiply(self.q, oh_action), axis=-1)  # [?]

                with tf.name_scope('target_Q'):
                    max_next_q = tf.reduce_max(q_next, axis=-1)
                    td_target = self.rewards_ + self.HPs["Gamma"] * max_next_q
                    # Masked alternative that stops bootstrapping at terminal states:
                    # td_target = self.rewards_ + self.HPs["Gamma"] * max_next_q * (1. - self.done_)
                with tf.name_scope('td_error'):
                    loss = tf.keras.losses.MSE(td_target, curr_q)
                    softmax_q = tf.nn.softmax(curr_q)
                    self.entropy = -tf.reduce_mean(softmax_q * tf.log(softmax_q + 1e-5))
                    self.loss = total_loss = loss + self.HPs["EntropyBeta"] * self.entropy

                if self.HPs["Optimizer"] == "Adam":
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "RMS":
                    self.optimizer = tf.keras.optimizers.RMSprop(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adagrad":
                    self.optimizer = tf.keras.optimizers.Adagrad(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adadelta":
                    self.optimizer = tf.keras.optimizers.Adadelta(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Adamax":
                    self.optimizer = tf.keras.optimizers.Adamax(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Nadam":
                    self.optimizer = tf.keras.optimizers.Nadam(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "SGD":
                    self.optimizer = tf.keras.optimizers.SGD(self.HPs["LR"])
                elif self.HPs["Optimizer"] == "Amsgrad":
                    # AMSGrad is the Adam variant with the amsgrad flag enabled.
                    self.optimizer = tf.keras.optimizers.Adam(self.HPs["LR"], amsgrad=True)
                else:
                    print("Optimizer not recognized: " + str(self.HPs["Optimizer"]))
                    exit()

                self.params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
                self.gradients = self.optimizer.get_gradients(total_loss, self.params)
                self.update_op = self.optimizer.apply_gradients(zip(self.gradients, self.params))

                self.grads = [self.gradients]
                self.losses = [self.loss]
                self.update_ops = [self.update_op]

        self.grad_MA = [MovingAverage(400) for _ in range(len(self.grads))]
        self.loss_MA = [MovingAverage(400) for _ in range(len(self.losses))]
        self.entropy_MA = MovingAverage(400)
        self.labels = ["Critic"]

    def GetAction(self, state, episode, step):
        """
        Contains the code to run the network based on an input.
        """
        #Add a batch dimension to single samples.
        if len(state.shape) == 3:
            state = state[np.newaxis, :]
        if len(state.shape) == 1:
            state = state[np.newaxis, :]
        q = self.sess.run(self.q, {self.states_: state})
        if "Exploration" in self.HPs and self.HPs["Exploration"] == "EGreedy":
            # Epsilon decays exponentially from 1.0 toward a 0.1 floor.
            prob = 0.1 + 0.9 * (np.exp(-episode / self.HPs["ExplorationDecay"]))
            if random.uniform(0, 1) < prob:
                actions = random.randint(0, self.actionSize - 1)
            else:
                actions = np.argmax(q, axis=-1)
        else:
            actions = np.argmax(q, axis=-1)
        # Return an int action and extra data that needs to be fed to the buffer.
        return actions, []

    def Update(self, episode=0):
        """
        The main update function for DQN. Splits the buffer into minibatches
        and applies gradient updates to the Q network.
        """
        #Process the data from the buffer
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):
            batches = len(self.buffer[traj][0])//self.HPs["MinibatchSize"]+1
            s = np.array_split(self.buffer[traj][0], batches)
            a_his = np.array_split(np.asarray(self.buffer[traj][1]).reshape(-1), batches)
            r = np.array_split(np.asarray(self.buffer[traj][2]).reshape(-1), batches)
            s_next = np.array_split(self.buffer[traj][3], batches)
            done = np.array_split(self.buffer[traj][4], batches)

            for epoch in range(self.HPs["Epochs"]):
                for i in range(batches):
                    #Create a feedDict from the buffer
                    feedDict = {
                        self.states_: np.squeeze(np.asarray(s[i])),
                        self.next_states_: np.squeeze(np.asarray(s_next[i])),
                        self.actions_: np.squeeze(np.asarray(a_his[i])),
                        self.rewards_: np.squeeze(np.asarray(r[i])),
                        self.done_: np.squeeze(np.asarray(done[i], dtype=float))
                    }
                    out = self.sess.run(self.update_ops + self.losses + self.grads, feedDict)
                    #Slice the outputs by op group rather than splitting evenly.
                    nU, nL = len(self.update_ops), len(self.losses)
                    losses = out[nU:nU + nL]
                    grads = out[nU + nL:]

                    for j, loss in enumerate(losses):
                        self.loss_MA[j].append(loss)

                    for j, grads_j in enumerate(grads):
                        #Track the fraction of near-zero gradient entries as a
                        #vanishing-gradient diagnostic.
                        total_counter = 1
                        vanish_counter = 0
                        for grad in grads_j:
                            total_counter += np.prod(grad.shape)
                            vanish_counter += (np.absolute(grad) < 1e-8).sum()
                        self.grad_MA[j].append(vanish_counter / total_counter)

                    ent = self.sess.run(self.entropy, feedDict)
                    entropy = np.average(np.asarray(ent))
                    self.entropy_MA.append(entropy)

        self.ClearTrajectory()

    def GetStatistics(self):
        stats = {}
        for i, label in enumerate(self.labels):
            stats["Training Results/Vanishing Gradient " + label] = self.grad_MA[i]()
            stats["Training Results/Loss " + label] = self.loss_MA[i]()
        stats["Training Results/Entropy"] = self.entropy_MA()
        return stats

    @property
    def getVars(self):
        return self.Model.getVars(self.scope)
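# ---------------------------------------------------------------------------
# Illustrative sketches (not part of the original class): NumPy versions of
# two pieces of DQN above, for clarity. `_td_target_sketch` mirrors the
# target-Q computation, including the (1 - done) terminal mask shown as a
# commented alternative in __init__; `_egreedy_epsilon` mirrors the
# exploration schedule in GetAction. Both names are hypothetical.
# ---------------------------------------------------------------------------
def _td_target_sketch(rewards, q_next, done, gamma):
    """Bellman targets: r + gamma * max_a' Q(s', a'), masked at episode ends."""
    max_next_q = np.max(q_next, axis=-1)  # best next-state value per sample
    return rewards + gamma * max_next_q * (1.0 - done)

def _egreedy_epsilon(episode, decay):
    """Epsilon decays exponentially from 1.0 toward a 0.1 floor."""
    return 0.1 + 0.9 * np.exp(-episode / decay)

# e.g. with decay=100: episode 0 -> 1.0, episode 100 -> ~0.43, episode 500 -> ~0.11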