import torch
import torch.nn as nn

# OUNoise (an Ornstein-Uhlenbeck noise process) is assumed to be provided by the
# project's own modules; a minimal sketch of its interface is given after this class.


class Policy(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, policy_params):
        """
        policy_params['arch_params'] is a dictionary like:
        {'state_and_action_dims': (num1, num2),
         'layers': {'Linear_1': layer_size_1, ..., 'Linear_n': layer_size_n}}
        """
        super(Policy, self).__init__()
        self.policy_params = policy_params
        self.seed_as_int = policy_params['seed']
        torch.manual_seed(self.seed_as_int)
        self.arch_params = policy_params['arch_params']
        self.__state_dim = self.arch_params['state_and_action_dims'][0]
        self.__action_dim = self.arch_params['state_and_action_dims'][1]
        self.eps = policy_params['eps']
        self.min_eps = policy_params['min_eps']
        self.eps_decay = policy_params['eps_decay']
        self.__noise_type = policy_params['noise_type']

        # Build the network from the ordered layer specification.
        keys = list(self.arch_params['layers'].keys())
        list_of_layers = []
        prev_layer_size = self.__state_dim
        for i in range(len(self.arch_params['layers'])):
            key = keys[i]
            layer_type = key.split('_')[0]
            if layer_type == 'Linear':
                layer_size = self.arch_params['layers'][key]
                list_of_layers.append(nn.Linear(prev_layer_size, layer_size))
                prev_layer_size = layer_size
            elif layer_type == 'LayerNorm':
                list_of_layers.append(nn.LayerNorm(prev_layer_size))
            elif layer_type == 'ReLU':
                list_of_layers.append(nn.ReLU())
            elif layer_type == 'Tanh':
                list_of_layers.append(nn.Tanh())
            else:
                raise ValueError("Got unspecified layer type: '{}'. Check your layers!".format(layer_type))
        self.layers = nn.ModuleList(list_of_layers)

        # Noise: added either to the action or to the weights of the linear layers.
        if self.__noise_type == 'action':
            self.__rand_process = OUNoise((self.__action_dim,))
        elif self.__noise_type == 'parameter':
            self.network_params_perturbations = dict()
            for i in range(len(self.layers)):
                if isinstance(self.layers[i], nn.Linear):
                    self.network_params_perturbations[i] = OUNoise(tuple(self.layers[i].weight.shape))
        else:
            raise ValueError("Got an unspecified type of noise. The only available options are 'parameter' and 'action'")

    def forward(self, state):
        """Build a network that maps state -> (action, perturbed action)."""
        if self.__noise_type == 'action':
            y = state.float()
            for i in range(len(self.layers)):
                y = self.layers[i](y).float()
            y_perturbed = y + self.eps * torch.from_numpy(self.__rand_process.noise()).float()
            return y, torch.clamp(y_perturbed, min=-1.0, max=1.0)

        elif self.__noise_type == 'parameter':
            y = state.float()
            y_perturbed = state.float()
            for i in range(len(self.layers)):
                if not isinstance(self.layers[i], nn.Linear):
                    # LayerNorm and activation layers are shared by both passes.
                    y = self.layers[i](y).float()
                    y_perturbed = self.layers[i](y_perturbed).float()
                else:
                    # Weights: the noisy pass uses a perturbed copy of the weight matrix.
                    if self.layers[i].weight.shape[1] == y.shape[0]:
                        # A single (unbatched) state.
                        y = self.layers[i].weight.matmul(y)
                        n = torch.from_numpy(self.network_params_perturbations[i].noise()).float()
                        y_perturbed = (self.layers[i].weight + self.eps * n).matmul(y_perturbed)
                    else:
                        # A batch of states.
                        y = y.matmul(self.layers[i].weight.t())
                        n = torch.from_numpy(self.network_params_perturbations[i].noise()).float()
                        y_perturbed = y_perturbed.matmul((self.layers[i].weight + self.eps * n).t())
                    # Biases are shared by both passes.
                    y = y + self.layers[i].bias
                    y_perturbed = y_perturbed + self.layers[i].bias
            return y, y_perturbed
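# OUNoise is used above but not defined in this excerpt. A minimal Ornstein-Uhlenbeck
# noise sketch with the interface the code assumes (a noise() method returning a NumPy
# array of the requested shape) might look like the following; the mu/theta/sigma
# defaults are assumptions, not values taken from the original project.
import numpy as np


class OUNoise:
    def __init__(self, shape, mu=0.0, theta=0.15, sigma=0.2):
        # Accept either an int (action dimension) or an explicit shape tuple.
        self.shape = shape if isinstance(shape, tuple) else (shape,)
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.shape) * self.mu

    def reset(self):
        self.state = np.ones(self.shape) * self.mu

    def noise(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1): temporally correlated noise.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(*self.shape)
        self.state = self.state + dx
        return self.state


# A minimal construction sketch for Policy. The dictionary keys follow the constructor
# above; the concrete sizes and layer names are illustrative assumptions.
policy_params = {
    'seed': 0,
    'eps': 1.0,
    'min_eps': 0.01,
    'eps_decay': 0.999,
    'noise_type': 'action',  # or 'parameter'
    'arch_params': {
        'state_and_action_dims': (33, 4),
        'layers': {
            'Linear_1': 128, 'ReLU_1': None,
            'Linear_2': 64, 'ReLU_2': None,
            'Linear_3': 4, 'Tanh_1': None,
        },
    },
}
actor = Policy(policy_params)
action, noisy_action = actor(torch.randn(33))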
# (Excerpted agent method; self.env, self.model, self.target_model, self.buffer, etc.
# are set up elsewhere in the agent class.)
def run_n_episodes(self, num_episodes, max_ep_length, minibatch_size,
                   explore=True, num_updates=5, summary_checkpoint=1,
                   eta=0.01, num_updates_ac=1, T=1):
    # num_updates follows the article.
    for i in range(num_episodes):
        noise = OUNoise(self.a_dim)
        x = self.env.reset()
        x = x.reshape(1, -1)
        u = np.zeros(self.s_dim)
        t = False
        episodes_reward = 0

        # Per-episode storage for the REINFORCE-style updates.
        self.r_rs.append([])
        self.r_xs.append([])
        self.r_us.append([])
        self.episodes_ls.append(0)

        while True:
            self.episodes_ls[-1] = self.episodes_ls[-1] + 1
            if self.det:
                u, V = self.sess.run((self.model.mu_det, self.model.V),
                                     feed_dict={self.model.inputs_x: x})
                self.episodes_Vs.append(V)
            else:
                u, P, sigma, V = self.sess.run(
                    (self.model.mu_norm, self.model.P, self.model.sigma, self.model.V),
                    feed_dict={self.model.inputs_x: x})
                self.episodes_Ps.append(P)
                self.episodes_ss.append(sigma)
                self.episodes_Vs.append(V)
            if self.separate_V:
                self.episodes_V_s.append(self.critic.predict_V_sep(x))

            if explore:
                u += noise.noise()
            u = np.clip(u, -1.0, 1.0)
            u = u.reshape(1, -1)

            x1, r, t, info = self.env.step(u.reshape(-1))

            self.r_xs[-1].append(x)
            self.r_us[-1].append(u)
            self.r_rs[-1].append(r)
            episodes_reward += r

            self.buffer.add(x.reshape(1, -1), u, r, t, x1.reshape(1, -1))
            self.episodes_xs.append(x)
            self.episodes_us.append(u)
            self.episodes_rs.append(r)

            # Actor-Critic
            x = x1.reshape(1, -1)

            if self.qNAF:
                for k in range(num_updates):
                    x_batch, u_batch, r_batch, t_batch, x1_batch = \
                        self.buffer.sample_batch(minibatch_size)
                    x_batch, u_batch, r_batch, t_batch, x1_batch = \
                        x_batch.reshape(-1, self.s_dim), u_batch.reshape(-1, self.a_dim), \
                        r_batch.reshape(-1, 1), t_batch.reshape(-1), x1_batch.reshape(-1, self.s_dim)
                    y_batch = self.gamma * self.target_model.predict_V(x1_batch) + r_batch
                    self.model.update_Q(x_batch, u_batch, y_batch)
                    self.target_model.soft_update_from(self.model)
            if t:
                break

        if self.ac:
            r_xs_l = np.array(self.r_xs[-1]).reshape(-1, self.s_dim)
            r_rs_l = np.array(self.r_rs[-1]).reshape(-1, 1)
            # Backward pass turns per-step rewards into discounted returns-to-go
            # (a standalone sketch follows this method).
            for idx in range(2, len(r_rs_l) + 1):
                r_rs_l[-idx] += self.gamma * r_rs_l[-idx + 1]
            self.r_rs[-1] = r_rs_l

            r_rs_ = np.array(self.r_rs).reshape(-1, 1)
            r_xs_ = np.array(self.r_xs).reshape(-1, self.s_dim)
            r_us_ = np.array(self.r_us).reshape(-1, self.a_dim)

            for _ in range(num_updates_ac):
                # Update V every episode.
                if self.separate_V:
                    self.critic.update_V_sep(r_xs_l, r_rs_l)
                if i % T == 0:
                    # Q_target = r_rs_[:-1] + self.gamma * self.critic.predict_V_sep(r_xs_[1:])
                    # Q_target = np.vstack((Q_target, (r_rs_[-1])))
                    deltas = r_rs_
                    if self.separate_V:
                        deltas = deltas - self.critic.predict_V_sep(r_xs_)
                    else:
                        deltas = deltas - self.target_model.predict_V(r_xs_)
                    # (Optional debug: evaluate self.model.loss_spg before and after the
                    # update by feeding r_xs_, r_us_, and deltas.)
                    self.model.update_mu(r_xs_, r_us_, deltas)
                    self.target_model.soft_update_from(self.model)

                    self.r_rs = []
                    self.r_xs = []
                    self.r_us = []

        if summary_checkpoint > 0 and i % summary_checkpoint == 0:
            print('| Reward: %.2i' % int(episodes_reward), " | Episode", i)
            self.plot_rewards(self.summary_dir)
        self.rewards.append(episodes_reward)
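# The backward loop above turns per-step rewards into discounted returns-to-go.
# A standalone sketch of that computation (the reward values and gamma here are
# illustrative assumptions):
import numpy as np

rewards = np.array([1.0, 0.0, 2.0, 1.0]).reshape(-1, 1)
gamma = 0.99
returns = rewards.copy()
for idx in range(2, len(returns) + 1):
    # Each step accumulates the discounted return of the step that follows it.
    returns[-idx] += gamma * returns[-idx + 1]
# returns[t] now equals the sum over k >= t of gamma**(k - t) * rewards[k].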
import logging

import numpy as np
import tensorflow as tf

# Actor, Critic, ReplayBuffer and OUNoise are assumed to be provided by the
# project's own modules and are not defined in this excerpt.


class RDPGAgent:

    def __init__(self, env, batchSize=10, bufferSize=100,
                 gamma=0.98, actorLR=1e-4, criticLR=1e-3,
                 maxSteps=200, targetUpdate=1e-3, epsilon=1,
                 decay=0.99, rewardScale=1e-3, logFile='run.log'):
        self.env = env
        self.gamma = gamma
        self.batchSize = batchSize
        self.bufferSize = bufferSize
        self.maxSteps = maxSteps + 1
        self.rewardScale = rewardScale
        self.epsilon = epsilon
        self.decay = decay

        # Useful helpers.
        self.actionDim = self.env.action_space.shape[0]
        self.stateDim = self.env.observation_space.shape[0]
        self.featureDim = self.actionDim + self.stateDim
        self.minAction = self.env.action_space.low
        self.maxAction = self.env.action_space.high

        # For scaling output action values.
        self.actionBiasZeroOne = self.minAction
        self.actionScaleZeroOne = self.maxAction - self.minAction
        self.actionBiasTanH = (self.maxAction + self.minAction) / 2.0
        self.actionScaleTanH = self.maxAction - self.actionBiasTanH

        # Initialize noise process.
        self.noise = OUNoise(self.actionDim)

        # Initialize replay buffer.
        self.buffer = ReplayBuffer(self.bufferSize)

        # Initialize logging.
        logging.basicConfig(filename=logFile, level=logging.INFO,
                            format='[%(asctime)s] %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p')
        logging.info('Initializing RDPG agent with passed settings.')

        # Tensorflow GPU optimization.
        config = tf.ConfigProto()  # GPU fix?
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        from keras import backend as K
        K.set_session(self.sess)

        # Make actor network (creates target model internally).
        self.actor = Actor(self.sess, self.maxSteps, self.featureDim,
                           self.actionDim, self.batchSize, targetUpdate,
                           actorLR, self.actionScaleTanH, self.actionBiasTanH)

        # Make critic network (creates target model internally).
        self.critic = Critic(self.sess, self.maxSteps, self.featureDim,
                             self.actionDim, self.batchSize, targetUpdate,
                             criticLR)

    # Train or run for some number of episodes.
    def run(self, numEpisodes, training=False, warmUp=30):
        for i in range(numEpisodes):
            sequence = []
            totalReward = 0
            totalSteps = 0
            o = self.env.reset()

            # Stores (O1, A1, O2, A2, etc.) for prediction.
            history = np.zeros((self.maxSteps * self.featureDim))
            history[:self.stateDim] = o

            for j in range(self.maxSteps - 1):
                # Reshape history into (BatchSize, TimeSteps, Dims) for the RNN.
                batchedHistory = np.reshape(history, (self.maxSteps, self.featureDim))
                batchedHistory = np.expand_dims(batchedHistory, axis=0)

                # Predict action or use random with e-greedy.
                # if (np.random.random_sample() < self.epsilon and training):
                #     a = np.random.random((self.actionDim))
                #     a = a * self.actionScaleZeroOne
                #     a = a + self.actionBiasZeroOne
                # else:
                #     a = self.actor.model.predict(batchedHistory)[0]

                # Predict an action and add noise to it for exploration purposes.
                a = self.actor.model.predict(batchedHistory)[0] + self.epsilon * self.noise.noise()
                a = np.clip(a, self.minAction, self.maxAction)

                # Take a single step.
                oPrime, r, d, _ = self.env.step(a)
                r *= self.rewardScale
                newTimeStart = (j + 1) * self.featureDim

                # Update agent state and ongoing agent history data. History is
                # passed to our actor for prediction, and sequence is for later.
                history[j * self.featureDim + self.stateDim:newTimeStart] = a
                history[newTimeStart:(j + 1) * self.featureDim + self.stateDim] = oPrime
                sequence.append({'o': o, 'a': a, 'r': r, 'd': d})
                totalReward += r
                totalSteps += 1
                o = oPrime

                # Quit early.
                if d:
                    break

            # Anneal epsilon.
            if i > warmUp:
                self.epsilon *= self.decay

            # Print some episode debugging and reward information.
            print('Episode: %03d / Steps: %d / Reward: %f' %
                  (i + 1, totalSteps, totalReward / self.rewardScale))
            logging.info('Episode: %03d / Steps: %d / Reward: %f' %
                         (i + 1, totalSteps, totalReward / self.rewardScale))

            # Simulation only.
            if not training:
                continue

            # Add sequence to buffer.
            self.buffer.add(sequence)

            # Resample sequences from the buffer.
            samples = self.buffer.getBatch(self.batchSize)
            numSamples = len(samples)

            # Do not train until we have seen warmUp episodes.
            if self.buffer.getCount() < warmUp:
                continue

            # Some more debug info.
            print('Training on sampled sequences from all episodes.')
            logging.info('Training on sampled sequences from all episodes.')

            # All of these do not include time step t = T.
            # Used to store H[i, t] for each episode i and step t.
            sampleHistories = np.zeros((numSamples, self.maxSteps - 1,
                                        self.maxSteps * self.featureDim))
            # Used to store H[i, t + 1] for each episode i and step t.
            sampleHistoriesWithNext = np.zeros((numSamples, self.maxSteps - 1,
                                                self.maxSteps * self.featureDim))
            # Used to store R[i, t] for each episode i and step t.
            sampleRewards = np.zeros((numSamples, self.maxSteps - 1))
            # Used to store NotDone[i, t] for each episode i and step t.
            sampleNotDoneMasks = np.zeros((numSamples, self.maxSteps - 1))
            # Used to store action[i, t] taken for each episode i and step t.
            sampleActions = np.zeros((numSamples, self.maxSteps - 1, self.actionDim))

            # Compute info for each sampled episode i.
            for i in range(numSamples):
                sample = samples[i]
                historySoFar = np.zeros((self.maxSteps * self.featureDim))

                # Iteratively build up historySoFar for each timestep t.
                for t in range(len(sample) - 1):
                    step, nextStep = sample[t], sample[t + 1]

                    # This is (ot, at), which we are adding to the running history.
                    history = np.concatenate([step['o'], step['a']], axis=0)
                    historySoFar[t * self.featureDim:(t + 1) * self.featureDim] = history

                    # This is (o1, a1, o2, a2, ..., ot).
                    sampleHistoryEnd = (t + 1) * self.featureDim - self.actionDim
                    sampleHistories[i, t, :sampleHistoryEnd] = historySoFar[:sampleHistoryEnd]

                    # This is (o1, a1, o2, a2, ..., ot, at, ot+1).
                    sampleNextEnd = (t + 1) * self.featureDim
                    sampleHistoriesWithNext[i, t, :sampleNextEnd] = historySoFar[:sampleNextEnd]
                    sampleHistoriesWithNext[i, t, sampleNextEnd:sampleNextEnd + self.stateDim] = nextStep['o']

                    # Set rewards, actions, and not-done masks.
                    sampleRewards[i, t] = step['r']
                    sampleActions[i, t] = step['a']
                    sampleNotDoneMasks[i, t] = 0 if step['d'] else 1

            # Separate out self.maxSteps since it is the timestep dimension for the RNN.
            sampleHistories = np.reshape(sampleHistories,
                                         (numSamples, self.maxSteps - 1, self.maxSteps, self.featureDim))
            sampleHistoriesWithNext = np.reshape(sampleHistoriesWithNext,
                                                 (numSamples, self.maxSteps - 1, self.maxSteps, self.featureDim))

            # Update models using samples, rewards, and masks.
            self.update(numSamples, sampleHistories, sampleHistoriesWithNext,
                        sampleRewards, sampleActions, sampleNotDoneMasks)

    # Given a bunch of experienced histories, update our models.
    def update(self, numSamples, histories, historiesNext, rewards,
               chosenActions, notDoneMasks):
        # Reshape [i, t] pairs to a single dimension, which becomes the RNN batch dimension.
        historiesBatch = np.reshape(histories, (-1, self.maxSteps, self.featureDim))
        historiesNextBatch = np.reshape(historiesNext, (-1, self.maxSteps, self.featureDim))

        # Compute QSample targets [y] for updating the critic Q[S][A] outputs.
        targetActions = self.actor.target.predict(historiesNextBatch)                  # (B * (T - 1), F).
        targetQ = self.critic.target.predict([historiesNextBatch, targetActions])      # (B * (T - 1), 1).
        targetQ = np.reshape(targetQ, (numSamples, self.maxSteps - 1))                 # (B, T - 1).
        y = rewards + notDoneMasks * (self.gamma * targetQ)                            # (B, T - 1).
        y = np.reshape(y, (numSamples * (self.maxSteps - 1), 1))                       # (B * (T - 1), 1).

        # Train the critic model, passing in both the history and chosen actions.
        chosenActionsFlat = np.reshape(chosenActions,
                                       (numSamples * (self.maxSteps - 1), self.actionDim))
        # print(chosenActionsFlat.shape, historiesBatch.shape, historiesNextBatch.shape)
        self.critic.model.train_on_batch([historiesBatch, chosenActionsFlat], y)

        # Compute the gradient of the critic output WRT to its action input.
        # We cannot use chosenActions here since those were noisy predictions.
        currentActionsForGrad = self.actor.model.predict(historiesBatch)
        currentActionsGrad = self.critic.modelActionGradients(historiesBatch, currentActionsForGrad)

        # Train the actor model using the critic gradient WRT action input.
        self.actor.trainModel(historiesBatch, currentActionsGrad)

        # Update target models.
        self.actor.trainTarget()
        self.critic.trainTarget()
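# A minimal usage sketch for RDPGAgent, assuming a Gym-style continuous-control
# environment; the environment name, episode counts, and the gym import are
# illustrative assumptions, and Actor, Critic, ReplayBuffer and OUNoise must be
# available from the project's own modules.
import gym

env = gym.make('Pendulum-v0')
agent = RDPGAgent(env, batchSize=10, bufferSize=100, maxSteps=200)
agent.run(numEpisodes=1000, training=True)   # collect experience and train
agent.run(numEpisodes=10, training=False)    # evaluate without updating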