class DDPG:
    def __init__(self, sess, params):
        self.sess = sess
        self.__dict__.update(params)
        # create placeholders
        self.create_input_placeholders()
        # create actor/critic models
        self.actor = Actor(self.sess, self.inputs, **self.actor_params)
        self.critic = Critic(self.sess, self.inputs, **self.critic_params)
        self.noise_params = {k: np.array(list(map(float, v.split(","))))
                             for k, v in self.noise_params.items()}
        self.noise = Noise(**self.noise_params)
        self.ou_level = np.zeros(self.dimensions["u"])
        self.memory = Memory(self.n_mem_objects, self.memory_size)

    def create_input_placeholders(self):
        self.inputs = {}
        with tf.name_scope("inputs"):
            for ip_name, dim in self.dimensions.items():
                self.inputs[ip_name] = tf.placeholder(tf.float32,
                                                      shape=(None, dim),
                                                      name=ip_name)
            self.inputs["g"] = tf.placeholder(tf.float32,
                                              shape=self.inputs["u"].shape,
                                              name="a_grad")
            self.inputs["p"] = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name="pred_q")

    def step(self, x, is_u_discrete, explore=True):
        x = x.reshape(-1, self.dimensions["x"])
        u = self.actor.predict(x)
        if explore:
            self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
            u = u + self.ou_level
        q = self.critic.predict(x, u)
        if is_u_discrete:
            return [np.argmax(u), u[0], q[0]]
        return [u[0], u, q[0]]

    def remember(self, experience):
        self.memory.add(experience)

    def train(self):
        # check if the memory contains enough experiences
        if self.memory.size < 3*self.b_size:
            return
        x, g, ag, u, r, nx, ng, t = self.get_batch()
        # for HER transitions
        her_idxs = np.where(np.random.random(self.b_size) < 0.80)[0]
        # print("{} of {} selected for HER transitions".
        #       format(len(her_idxs), self.b_size))
        g[her_idxs] = ag[her_idxs]
        r[her_idxs] = 1
        t[her_idxs] = 1
        x = np.hstack([x, g])
        nx = np.hstack([nx, ng])
        nu = self.actor.predict_target(nx)
        tq = r + self.gamma*self.critic.predict_target(nx, nu)*(1-t)
        self.critic.train(x, u, tq)
        grad = self.critic.get_action_grads(x, u)
        # print("Grads:\n", grad)
        self.actor.train(x, grad)
        self.update_targets()

    def get_batch(self):
        return self.memory.sample(self.b_size)

    def update_targets(self):
        self.critic.update_target()
        self.actor.update_target()
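# The Noise and Memory helpers used above are external to this file. For reference, below is a
# minimal, self-contained numpy sketch of an Ornstein-Uhlenbeck exploration process of the kind
# `Noise.ornstein_uhlenbeck_level` presumably implements. The class name and the theta/sigma/dt
# defaults are illustrative assumptions for this sketch, not values taken from the code above.
import numpy as np

class OUNoiseSketch:
    def __init__(self, dim, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.dim = dim
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt

    def ornstein_uhlenbeck_level(self, level):
        # one Euler step of dX = theta*(mu - X)*dt + sigma*sqrt(dt)*N(0, 1)
        return (level + self.theta * (self.mu - level) * self.dt
                + self.sigma * np.sqrt(self.dt) * np.random.randn(self.dim))

# Usage mirrors DDPG.step() above: keep a running ou_level and add it to the actor's action.
# noise = OUNoiseSketch(dim=action_dim)
# ou_level = np.zeros(action_dim)
# ou_level = noise.ornstein_uhlenbeck_level(ou_level)
# u = u + ou_level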
class Agent(object):
    def __init__(self, alpha, beta, input_dims, tau, env, gamma=0.99,
                 max_size=10000, layer1_size=400, layer2_size=300,
                 batch_size=64):
        n_actions = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.batch_size = batch_size
        self.sess = tf.Session()
        self.actor = Actor(alpha, n_actions, 'Actor', input_dims, self.sess,
                           layer1_size, layer2_size, env.action_space.high,
                           self.batch_size, ckpt_dir='tmp/ddpg/actor')
        self.critic = Critic(beta, n_actions, 'Critic', input_dims, self.sess,
                             layer1_size, layer2_size, self.batch_size,
                             ckpt_dir='tmp/ddpg/critic')
        self.target_actor = Actor(alpha, n_actions, 'TargetActor', input_dims,
                                  self.sess, layer1_size, layer2_size,
                                  env.action_space.high, self.batch_size,
                                  ckpt_dir='tmp/ddpg/target_actor')
        self.target_critic = Critic(beta, n_actions, 'TargetCritic', input_dims,
                                    self.sess, layer1_size, layer2_size,
                                    self.batch_size,
                                    ckpt_dir='tmp/ddpg/target_critic')
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        # soft-update ops: target <- tau*online + (1 - tau)*target
        self.update_actor = [
            self.target_actor.params[i].assign(
                tf.multiply(self.actor.params[i], self.tau)
                + tf.multiply(self.target_actor.params[i], 1. - self.tau))
            for i in range(len(self.target_actor.params))
        ]
        self.update_critic = [
            self.target_critic.params[i].assign(
                tf.multiply(self.critic.params[i], self.tau)
                + tf.multiply(self.target_critic.params[i], 1. - self.tau))
            for i in range(len(self.target_critic.params))
        ]

        self.sess.run(tf.global_variables_initializer())
        self.update_target_network_parameters(first=True)

    def update_target_network_parameters(self, first=False):
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                if first:
                    # hard copy on the first call so target networks start equal to online ones
                    old_tau = self.tau
                    self.tau = 1.0
                    self.target_actor.sess.run(self.update_actor)
                    self.target_critic.sess.run(self.update_critic)
                    self.tau = old_tau
                else:
                    self.target_critic.sess.run(self.update_critic)
                    self.target_actor.sess.run(self.update_actor)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        # print("State[0]: ", state[0].shape)
        # print("State[1]: ", state[1].shape)
        state1 = state[0][np.newaxis, :]
        state2 = state[1][np.newaxis, :]
        state = [state1, state2]
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                mu = self.actor.predict(state)
                noise = self.noise()
                mu_prime = mu + noise
        return mu_prime[0]

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        for _, d in enumerate(["/device:GPU:0", "/device:GPU:1"]):
            with tf.device(d):
                state, action, reward, new_state, done = \
                    self.memory.sample_buffer(self.batch_size)

                # target q-value(new_state) with actor's bounded action forward pass
                critic_value_ = self.target_critic.predict(
                    new_state, self.target_actor.predict(new_state))

                # done is expected to hold 1 - terminal_flag (as stored by the replay
                # buffer), so bootstrapping is masked at episode ends
                target = []
                for j in range(self.batch_size):
                    target.append(reward[j] + self.gamma*critic_value_[j]*done[j])
                target = np.reshape(target, (self.batch_size, 1))

                _ = self.critic.train(state, action, target)  # s_i, a_i and y_i

                # a = mu(s_i)
                a_outs = self.actor.predict(state)
                # gradients of Q w.r.t. actions
                grads = self.critic.get_action_gradients(state, a_outs)
                self.actor.train(state, grads[0])

        # soft-update the target networks after each learning step
        self.update_target_network_parameters()

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()
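# The assign ops built in __init__ above implement Polyak averaging: after each learning step
# the target parameters move a small fraction tau toward the online parameters. Below is a
# minimal numpy illustration of the same rule; the function name and toy weights are mine,
# not part of the class's API.
import numpy as np

def soft_update(online_params, target_params, tau=0.001):
    """target <- tau * online + (1 - tau) * target, applied parameter-wise."""
    return [tau * w + (1.0 - tau) * wt for w, wt in zip(online_params, target_params)]

# example: a single fake "layer" of weights
online = [np.ones((2, 2))]
target = [np.zeros((2, 2))]
target = soft_update(online, target, tau=0.1)  # every entry becomes 0.1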
class Agent:
    # Warning! policy.py and critic.py are still work in progress and contain many global
    # variables that should be converted to class member variables. Before that is done, all
    # instances of Agent must use the same values for the following:
    # PPOepsilon, nHidden, nUnitsPerLayer, activation, H, entropyLossWeight, sdLowLimit
    def __init__(self, stateDim: int, actionDim: int, actionMin: np.array, actionMax: np.array,
                 learningRate=0.0005, gamma=0.99, GAElambda=0.95, PPOepsilon=0.2,
                 PPOentropyLossWeight=0, nHidden: int = 2, nUnitsPerLayer: int = 128,
                 mode="PPO-CMA-m", activation="lrelu", H: int = 9,
                 entropyLossWeight: float = 0, sdLowLimit=0.01, useScaler: bool = True,
                 criticTimestepScale=0.001):
        # Create policy network
        print("Creating policy")
        self.actionMin = actionMin.copy()
        self.actionMax = actionMax.copy()
        self.actionDim = actionDim
        self.stateDim = stateDim
        self.useScaler = useScaler
        if useScaler:
            self.scaler = Scaler(stateDim)
        self.scalerInitialized = False
        self.normalizeAdvantages = True
        self.gamma = gamma
        self.GAElambda = GAElambda
        # with gamma == 0, no need for the time-step feature
        self.criticTimestepScale = 0 if gamma == 0 else criticTimestepScale
        piEpsilon = None
        nHistory = 1
        negativeAdvantageAvoidanceSigma = 0
        if mode == "PPO-CMA" or mode == "PPO-CMA-m":
            # if True, we use PPO's clipped surrogate loss function instead of the
            # standard -A_i * log(pi(a_i | s_i))
            usePPOLoss = False
            separateVarAdapt = True
            self.reluAdvantages = True if mode == "PPO-CMA" else False
            # policy mean adapts immediately, policy covariance as an aggregate of this
            # many past iterations
            nHistory = H
            useSigmaSoftClip = True
            negativeAdvantageAvoidanceSigma = 1 if mode == "PPO-CMA-m" else 0
        elif mode == "PPO":
            # if True, we use PPO's clipped surrogate loss function instead of the
            # standard -A_i * log(pi(a_i | s_i))
            usePPOLoss = True
            separateVarAdapt = False
            # separateSigmaAdapt = False
            self.reluAdvantages = False
            useSigmaSoftClip = True
            piEpsilon = 0
        else:
            raise ValueError("Unknown mode {}".format(mode))
        self.policy = Policy(stateDim, actionDim, actionMin, actionMax,
                             entropyLossWeight=PPOentropyLossWeight,
                             networkActivation=activation,
                             networkDepth=nHidden,
                             networkUnits=nUnitsPerLayer,
                             networkSkips=False,
                             learningRate=learningRate,
                             minSigma=sdLowLimit,
                             PPOepsilon=PPOepsilon,
                             usePPOLoss=usePPOLoss,
                             separateVarAdapt=separateVarAdapt,
                             nHistory=nHistory,
                             useSigmaSoftClip=useSigmaSoftClip,
                             piEpsilon=piEpsilon,
                             negativeAdvantageAvoidanceSigma=negativeAdvantageAvoidanceSigma)
        # Create critic network, +1 stateDim because, at least in OpenAI Gym, episodes are
        # time-limited and the value estimates thus depend on simulation time.
        # Thus, we use the time step as an additional feature for the critic.
        # Note that this does not mess up generalization, as the feature is not used for the
        # policy during training or at runtime.
        print("Creating critic network")
        self.critic = Critic(stateDim=stateDim + 1,
                             learningRate=learningRate,
                             nHidden=nHidden,
                             networkUnits=nUnitsPerLayer,
                             networkActivation=activation,
                             useSkips=False,
                             lossType="L1")
        # Experience trajectory buffers for the memorize() and updateWithMemorized() methods
        self.experienceTrajectories = []
        self.currentTrajectory = []

    # call this after TensorFlow's global variables initializer
    def init(self, sess: tf.Session, verbose=False):
        # Pretrain the policy to output the initial Gaussian for all states
        self.policy.init(
            sess, 0, 1,
            0.5 * (self.actionMin + self.actionMax) * np.ones(self.actionDim),
            0.5 * (self.actionMax - self.actionMin) * np.ones(self.actionDim),
            256, 2000, verbose)

    # stateObs is an n-by-m tensor, where n = number of observations,
    # m = number of observation variables
    def act(self, sess: tf.Session, stateObs: np.array, deterministic=False,
            clipActionToLimits=True):
        # Expand a single 1d observation into a batch of 1 vectors
        if len(stateObs.shape) == 1:
            stateObs = np.reshape(stateObs, [1, stateObs.shape[0]])
        # Query the policy for the action, except for the first iteration, where we sample
        # directly from the initial exploration Gaussian that covers the whole action space.
        # This is done because we don't know the scale of state observations a priori; thus,
        # we can only init the state scaler in update(), after we have collected some experience.
        if self.useScaler and (not self.scalerInitialized):
            actions = np.random.normal(
                0.5 * (self.actionMin + self.actionMax) * np.ones(self.actionDim),
                0.5 * (self.actionMax - self.actionMin) * np.ones(self.actionDim),
                size=[stateObs.shape[0], self.actionDim])
            if clipActionToLimits:
                actions = np.clip(actions,
                                  np.reshape(self.actionMin, [1, self.actionDim]),
                                  np.reshape(self.actionMax, [1, self.actionDim]))
            return actions
        else:
            if self.useScaler:
                scaledObs = self.scaler.process(stateObs)
            else:
                scaledObs = stateObs
            if deterministic:
                actions = self.policy.getExpectation(sess, scaledObs)
            else:
                actions = self.policy.sample(sess, scaledObs)
            if clipActionToLimits:
                actions = np.clip(actions, self.actionMin, self.actionMax)
            return actions

    def memorize(self, observation: np.array, action: np.array, reward: float,
                 nextObservation: np.array, done: bool):
        e = Experience(observation, action, reward, nextObservation, done)
        self.currentTrajectory.append(e)
        if done:
            self.experienceTrajectories.append(self.currentTrajectory)
            self.currentTrajectory = []

    def getAverageActionStdev(self):
        if self.useScaler and (not self.scalerInitialized):
            return np.mean(0.5 * (self.actionMax - self.actionMin))
        else:
            return self.policy.usedSigmaSum / (1e-20 + self.policy.usedSigmaSumCounter)

    # If you call memorize() after each action, you can update the agent with this method.
    # If you handle the experience buffers yourself, e.g., due to a multithreaded
    # implementation, use the update() method instead.
    def updateWithMemorized(self, sess: tf.Session, batchSize: int = 512, nBatches: int = 100,
                            verbose=True, valuesValid=False, timestepsValid=False):
        self.update(sess, experienceTrajectories=self.experienceTrajectories,
                    batchSize=batchSize, nBatches=nBatches, verbose=verbose,
                    valuesValid=valuesValid, timestepsValid=timestepsValid)
        averageEpisodeReturn = 0
        for t in self.experienceTrajectories:
            episodeReturn = 0
            for e in t:
                episodeReturn += e.r
            averageEpisodeReturn += episodeReturn
        averageEpisodeReturn /= len(self.experienceTrajectories)
        self.experienceTrajectories = []
        self.currentTrajectory = []
        return averageEpisodeReturn

    # experienceTrajectories is a list of lists of Experience instances such that each of the
    # contained lists corresponds to an episode simulation trajectory
    def update(self, sess: tf.Session, experienceTrajectories, batchSize: int = 512,
               nBatches: int = 100, verbose=True, valuesValid=False, timestepsValid=False):
        trajectories = experienceTrajectories  # shorthand
        # Collect all data into linear arrays for training.
        nTrajectories = len(trajectories)
        nData = 0
        for trajectory in trajectories:
            nData += len(trajectory)
            # propagate values backwards along the trajectory if not already done
            if not valuesValid:
                for i in reversed(range(len(trajectory) - 1)):
                    # value estimates, used for training the critic and estimating advantages
                    trajectory[i].V = trajectory[i].r + self.gamma * trajectory[i + 1].V
            # update time steps if not updated
            if not timestepsValid:
                for i in range(len(trajectory)):
                    trajectory[i].timeStep = i
        allStates = np.zeros([nData, self.stateDim])
        allActions = np.zeros([nData, self.actionDim])
        allValues = np.zeros([nData])
        allTimes = np.zeros([nData, 1])
        k = 0
        for trajectory in trajectories:
            for e in trajectory:
                allStates[k, :] = e.s
                allValues[k] = e.V
                allActions[k, :] = e.a
                allTimes[k, 0] = e.timeStep * self.criticTimestepScale
                k += 1
        # Update scalers
        if self.useScaler:
            self.scaler.update(allStates)
            scale, offset = self.scaler.get()
            self.scalerInitialized = True
        else:
            offset = 0
            scale = 1
        # Scale the observations for training the critic
        scaledStates = self.scaler.process(allStates) if self.useScaler else allStates

        # Train critic
        def augmentCriticObs(obs: np.array, timeSteps: np.array):
            return np.concatenate([obs, timeSteps], axis=1)

        self.critic.train(sess, augmentCriticObs(scaledStates, allTimes), allValues,
                          batchSize, nEpochs=0, nBatches=nBatches, verbose=verbose)

        # Policy training needs advantages, which depend on the critic we just trained.
        # We use Generalized Advantage Estimation by Schulman et al.
        if verbose:
            print("Estimating advantages...")
        for t in trajectories:
            # query the critic values of all states of this trajectory in one big batch
            nSteps = len(t)
            states = np.zeros([nSteps + 1, self.stateDim])
            timeSteps = np.zeros([nSteps + 1, 1])
            for i in range(nSteps):
                states[i, :] = t[i].s
                timeSteps[i, 0] = t[i].timeStep * self.criticTimestepScale
            states[nSteps, :] = t[nSteps - 1].s_next
            states = (states - offset) * scale
            values = self.critic.predict(sess, augmentCriticObs(states, timeSteps))
            # GAE loop, i.e., take the instantaneous advantage (how much value a single action
            # brings, assuming that the values given by the critic are unbiased), and smooth
            # it along the trajectory using a 1st-order IIR filter.
            for step in reversed(range(nSteps - 1)):
                delta_t = t[step].r + self.gamma * values[step + 1] - values[step]
                t[step].advantage = delta_t + self.GAElambda * self.gamma * t[step + 1].advantage
        # Gather the advantages into a linear array and apply ReLU and normalization if needed
        allAdvantages = np.zeros([nData])
        k = 0
        for trajectory in trajectories:
            for e in trajectory:
                allAdvantages[k] = e.advantage
                k += 1
        if self.reluAdvantages:
            allAdvantages = np.clip(allAdvantages, 0, np.inf)
        if self.normalizeAdvantages:
            aMean = np.mean(allAdvantages)
            aSd = np.std(allAdvantages)
            if verbose:
                print("Advantage mean {}, sd {}".format(aMean, aSd))
            allAdvantages /= 1e-10 + aSd
        # Train policy. Note that this uses the original unscaled states, because the PPO-CMA
        # variance training needs a history of states in the same scale.
        self.policy.train(sess, allStates, allActions, allAdvantages, batchSize,
                          nEpochs=0, nBatches=nBatches,
                          stateOffset=offset, stateScale=scale, verbose=verbose)
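# For reference, below is a stand-alone numpy sketch of the same first-order GAE recursion used
# in the loop above, written over a single trajectory of rewards r_0..r_{T-1} and critic values
# V(s_0)..V(s_T). The function name is mine; the gamma/lam defaults simply mirror the class
# defaults above and are not part of its API.
import numpy as np

def gae_advantages(rewards, values, gamma=0.99, lam=0.95):
    """Generalized Advantage Estimation; `values` has length len(rewards) + 1
    (the last entry is the bootstrap value of the final state)."""
    nSteps = len(rewards)
    advantages = np.zeros(nSteps)
    running = 0.0
    for step in reversed(range(nSteps)):
        # instantaneous advantage (TD residual), then exponential smoothing along the trajectory
        delta = rewards[step] + gamma * values[step + 1] - values[step]
        running = delta + gamma * lam * running
        advantages[step] = running
    return advantages

# example: gae_advantages(np.array([1.0, 0.0, 1.0]), np.array([0.5, 0.4, 0.6, 0.0]))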