import copy

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

# Policy_network, Gaussian_Actor, V_network, On_Policy_Buffer, and Buffer are
# defined elsewhere in this repository.


class PPO:
    # Works with both a discrete (categorical) actor and a continuous (Gaussian) actor.
    def __init__(self, state_dim, action_dim, args):
        self.discrete = args.discrete
        self.buffer = On_Policy_Buffer(args.buffer_size)
        self.ppo_mode = args.ppo_mode  # mode: 'clip'
        assert self.ppo_mode == 'clip'

        self.gamma = args.gamma
        self.lambda_gae = args.lambda_gae
        self.batch_size = args.batch_size
        self.clip = args.clip

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.training_start = 0
        self.training_step = args.training_step

        if self.discrete:
            self.actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim,
                                        kernel_initializer='RandomUniform')
        else:
            self.actor = Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim,
                                        kernel_initializer='RandomUniform')
        self.critic = V_network(self.state_dim)

        self.network_list = {'Actor': self.actor, 'Critic': self.critic}
        self.name = 'PPO'

    def get_action(self, state):
        state = np.expand_dims(np.array(state, dtype=np.float32), axis=0)
        if self.discrete:
            policy = self.actor(state, activation='softmax')
            dist = tfp.distributions.Categorical(probs=policy)
            action = dist.sample().numpy()
            log_prob = dist.log_prob(action).numpy()
            action = action[0]
        else:
            action, log_prob = self.actor(state)
            action = action.numpy()[0]
            log_prob = log_prob.numpy()[0]
        return action, log_prob

    def eval_action(self, state):
        state = np.expand_dims(np.array(state, dtype=np.float32), axis=0)
        if self.discrete:
            policy = self.actor(state, activation='softmax')
            dist = tfp.distributions.Categorical(probs=policy)
            action = dist.sample().numpy()[0]
        else:
            action, _ = self.actor(state, deterministic=True)
            action = action.numpy()[0]
        return action

    def train(self, training_num):
        total_a_loss = 0
        total_c_loss = 0

        s, a, r, ns, d, log_prob = self.buffer.all_sample()
        old_values = self.critic(s)

        returns = np.zeros_like(r.numpy())
        advantages = np.zeros_like(returns)

        running_return = np.zeros(1)
        previous_value = np.zeros(1)
        running_advantage = np.zeros(1)

        # Generalized Advantage Estimation (GAE), computed backwards through the rollout.
        for t in reversed(range(len(r))):
            running_return = (r[t] + self.gamma * running_return * (1 - d[t])).numpy()
            running_tderror = (r[t] + self.gamma * previous_value * (1 - d[t]) - old_values[t]).numpy()
            running_advantage = (running_tderror
                                 + (self.gamma * self.lambda_gae) * running_advantage * (1 - d[t])).numpy()

            returns[t] = running_return
            previous_value = old_values[t]
            advantages[t] = running_advantage

        advantages = (advantages - advantages.mean()) / (advantages.std())

        n = len(s)
        arr = np.arange(n)
        training_num2 = max(int(n / self.batch_size), 1)  # e.g. 200 / 32 -> 6 minibatches

        for i in range(training_num):
            for epoch in range(training_num2):
                # The last minibatch takes whatever samples remain.
                if epoch < training_num2 - 1:
                    batch_index = arr[self.batch_size * epoch:self.batch_size * (epoch + 1)]
                else:
                    batch_index = arr[self.batch_size * epoch:]

                batch_s = s.numpy()[batch_index]
                batch_a = a.numpy()[batch_index]
                batch_returns = returns[batch_index]
                batch_advantages = advantages[batch_index]
                batch_old_log_policy = log_prob.numpy()[batch_index]

                with tf.GradientTape(persistent=True) as tape:
                    if self.discrete:
                        policy = self.actor(batch_s, activation='softmax')
                        dist = tfp.distributions.Categorical(probs=policy)
                        log_policy = tf.reshape(dist.log_prob(tf.squeeze(batch_a)), (-1, 1))
                        ratio = tf.exp(log_policy - batch_old_log_policy)
                        surrogate = ratio * batch_advantages
                        if self.ppo_mode == 'clip':
                            clipped_surrogate = tf.clip_by_value(ratio, 1 - self.clip,
                                                                 1 + self.clip) * batch_advantages
                            actor_loss = tf.reduce_mean(-tf.minimum(surrogate, clipped_surrogate))
                        else:
                            raise NotImplementedError
                    else:
                        dist = self.actor.dist(batch_s)
                        log_policy = dist.log_prob(batch_a)
                        ratio = tf.exp(log_policy - batch_old_log_policy)
                        surrogate = ratio * batch_advantages
                        if self.ppo_mode == 'clip':
                            clipped_surrogate = tf.clip_by_value(ratio, 1 - self.clip,
                                                                 1 + self.clip) * batch_advantages
                            actor_loss = -tf.reduce_mean(tf.minimum(surrogate, clipped_surrogate))
                        else:
                            raise NotImplementedError

                    critic_loss = 0.5 * tf.reduce_mean(tf.square(batch_returns - self.critic(batch_s)))

                actor_variables = self.actor.trainable_variables
                critic_variables = self.critic.trainable_variables

                actor_gradients = tape.gradient(actor_loss, actor_variables)
                critic_gradients = tape.gradient(critic_loss, critic_variables)

                self.actor_optimizer.apply_gradients(zip(actor_gradients, actor_variables))
                self.critic_optimizer.apply_gradients(zip(critic_gradients, critic_variables))

                del tape

                total_a_loss += actor_loss.numpy()
                total_c_loss += critic_loss.numpy()

        self.buffer.delete()

        return [['Loss/Actor', total_a_loss], ['Loss/Critic', total_c_loss],
                ['Entropy/Actor', tf.reduce_mean(dist.entropy()).numpy()]]
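
# The helper below is a minimal, self-contained sketch (illustration only, not used by the
# agent above) of the clipped surrogate objective computed in PPO.train(). It shows, on
# made-up probability ratios and advantages, how the clipped term caps how far a single
# update can move when the ratio drifts far from 1.
def _ppo_clip_demo(clip=0.2):
    ratio = np.array([0.5, 0.9, 1.0, 1.3, 2.0])         # toy pi_new / pi_old ratios
    advantages = np.array([1.0, -1.0, 0.5, 2.0, 2.0])   # toy advantage estimates

    surrogate = ratio * advantages
    clipped_surrogate = np.clip(ratio, 1 - clip, 1 + clip) * advantages

    # Same form as the actor loss above: negated mean of the element-wise minimum.
    actor_loss = -np.mean(np.minimum(surrogate, clipped_surrogate))
    return surrogate, clipped_surrogate, actor_loss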
class REINFORCE:
    def __init__(self, state_dim, action_dim, args):
        self.buffer = On_Policy_Buffer(args.buffer_size)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.discrete = args.discrete
        self.gamma = args.gamma
        self.training_start = 0
        self.training_step = 1

        self.optimizer = tf.keras.optimizers.Adam(args.learning_rate)

        if args.discrete:
            self.network = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)
        else:
            self.network = Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim)

        self.network_list = {'Network': self.network}
        self.name = 'REINFORCE'

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        if self.discrete:
            policy = self.network(state, activation='softmax')
            dist = tfp.distributions.Categorical(probs=policy)
            action = dist.sample().numpy()
            log_prob = dist.log_prob(action).numpy()
            action = action[0]
        else:
            action, log_prob = self.network(state)
            action = action.numpy()[0]
            log_prob = log_prob.numpy()[0]
        return action, log_prob

    def eval_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        if self.discrete:
            policy = self.network(state, activation='softmax')
            dist = tfp.distributions.Categorical(probs=policy)
            action = dist.sample().numpy()[0]
        else:
            action, _ = self.network(state, deterministic=True)
            action = action.numpy()[0]
        return action

    def train(self, training_num):
        total_loss = 0
        s, a, r, ns, d, _ = self.buffer.all_sample()

        # Monte Carlo returns, computed backwards through the episode.
        returns = np.zeros_like(r.numpy())
        running_return = 0
        for t in reversed(range(len(r))):
            running_return = r[t] + self.gamma * running_return * (1 - d[t])
            returns[t] = running_return

        with tf.GradientTape() as tape:
            if self.discrete:
                policy = self.network(s, activation='softmax')
                dist = tfp.distributions.Categorical(probs=policy)
                log_policy = tf.reshape(dist.log_prob(tf.squeeze(a)), (-1, 1))
            else:
                dist = self.network.dist(s)
                log_policy = dist.log_prob(a)

            loss = tf.reduce_sum(-log_policy * returns)

        variables = self.network.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

        total_loss += loss.numpy()
        self.buffer.delete()

        return [['Loss/Loss', total_loss]]
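
# Minimal, self-contained sketch (illustration only, not used by REINFORCE above) of the
# backward discounted-return recursion in REINFORCE.train(): G_t = r_t + gamma * G_{t+1},
# reset whenever the done flag d_t marks the end of an episode. The rewards and done flags
# below are made-up numbers.
def _discounted_return_demo(gamma=0.99):
    r = np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32)  # toy rewards
    d = np.array([0.0, 0.0, 0.0, 1.0], dtype=np.float32)  # episode ends at the last step

    returns = np.zeros_like(r)
    running_return = 0.0
    for t in reversed(range(len(r))):
        running_return = r[t] + gamma * running_return * (1 - d[t])
        returns[t] = running_return
    return returns  # approximately [3.94, 2.97, 1.99, 1.0]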
class TRPO:
    def __init__(self, state_dim, action_dim, args):
        self.discrete = args.discrete
        self.buffer = Buffer(args.buffer_size)

        self.gamma = args.gamma
        self.lambda_gae = args.lambda_gae
        self.batch_size = args.batch_size
        self.backtrack_iter = args.backtrack_iter
        self.backtrack_coeff = args.backtrack_coeff
        self.delta = args.delta

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.training_start = 0
        self.training_step = args.training_step

        if self.discrete:
            self.actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)
            self.backup_actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)
        else:
            self.actor = Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim)
            self.backup_actor = Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim)

        self.critic = V_network(self.state_dim)

        self.network_list = {'Actor': self.actor, 'Critic': self.critic}
        self.name = 'TRPO'

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        if self.discrete:
            policy = self.actor(state, activation='softmax').numpy()[0]
            action = np.random.choice(self.action_dim, 1, p=policy)[0]
        else:
            action = self.actor(state).numpy()[0]
        return action

    def eval_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        if self.discrete:
            policy = self.actor(state, activation='softmax').numpy()[0]
            action = np.argmax(policy)
        else:
            action = self.actor(state, deterministic=True).numpy()[0]
        return action

    def fisher_vector_product(self, states, p):
        # Computes (H + damping * I) p, where H is the Hessian of the mean KL divergence
        # between the current actor and the backup actor.
        with tf.GradientTape() as tape2:
            with tf.GradientTape() as tape1:
                if self.discrete:
                    kl_divergence = tfp.distributions.kl_divergence(
                        tfp.distributions.Categorical(probs=self.actor(states, activation='softmax')),
                        tfp.distributions.Categorical(probs=self.backup_actor(states, activation='softmax')))
                else:
                    dist = self.actor.dist(states)
                    backup_dist = self.backup_actor.dist(states)
                    kl_divergence = tfp.distributions.kl_divergence(dist, backup_dist)
                kl_divergence = tf.reduce_mean(kl_divergence)

            kl_grad = tape1.gradient(kl_divergence, self.actor.trainable_variables)
            flatten_kl_grad = tf.concat([tf.reshape(grad, [-1]) for grad in kl_grad], axis=0)
            kl_grad_p = tf.reduce_sum(flatten_kl_grad * p)

        kl_hessian_p = tape2.gradient(kl_grad_p, self.actor.trainable_variables)
        flatten_kl_hessian_p = tf.concat([tf.reshape(hessian, [-1]) for hessian in kl_hessian_p],
                                         axis=0).numpy()

        return flatten_kl_hessian_p + 0.1 * p

    def conjugate_gradient(self, states, b, nsteps):
        # Iteratively solves Hx = b using only Fisher-vector products, without forming H.
        x = np.zeros_like(b)
        r = copy.deepcopy(b)
        p = copy.deepcopy(r)
        rdotr = np.dot(r, r)

        for i in range(nsteps):
            _Avp = self.fisher_vector_product(states, p)
            alpha = rdotr / (np.dot(p, _Avp) + 1e-8)
            x += alpha * p
            r -= alpha * _Avp
            new_rdotr = np.dot(r, r)
            beta = new_rdotr / (rdotr + 1e-8)
            p = r + beta * p
            rdotr = new_rdotr

        return x

    def update_model(self, model, new_variables):
        # Copies a flat parameter vector back into the model's trainable variables.
        index = 0
        for variable in model.trainable_variables:
            variable_length = len(tf.reshape(variable, [-1]))
            new_variable = new_variables[index:index + variable_length]
            new_variable = tf.reshape(new_variable, tf.shape(variable))
            variable.assign(new_variable)
            index += variable_length

    def train(self, training_num):
        total_c_loss = 0

        s, a, r, ns, d = self.buffer.all_sample()
        old_values = self.critic(s)

        returns = np.zeros_like(r.numpy())
        advantages = np.zeros_like(returns)

        running_return = np.zeros(1)
        previous_value = np.zeros(1)
        running_advantage = np.zeros(1)

        # Generalized Advantage Estimation (GAE)
        for t in reversed(range(len(r))):
            running_return = (r[t] + self.gamma * running_return * (1 - d[t])).numpy()
            running_tderror = (r[t] + self.gamma * previous_value * (1 - d[t]) - old_values[t]).numpy()
            running_advantage = (running_tderror
                                 + (self.gamma * self.lambda_gae) * running_advantage * (1 - d[t])).numpy()

            returns[t] = running_return
            previous_value = old_values[t]
            advantages[t] = running_advantage

        if self.discrete:
            old_policy = self.actor(s, activation='softmax')
            old_a_one_hot = tf.squeeze(tf.one_hot(tf.cast(a, tf.int32), depth=self.action_dim), axis=1)
            old_log_policy = tf.reduce_sum(tf.math.log(old_policy) * tf.stop_gradient(old_a_one_hot),
                                           axis=1, keepdims=True)
        else:
            old_dist = self.actor.dist(s)
            old_log_policy = old_dist.log_prob(a)
            old_log_policy = tf.expand_dims(old_log_policy, axis=1)

        flattened_actor = tf.concat([tf.reshape(variable, [-1])
                                     for variable in self.actor.trainable_variables], axis=0)
        self.update_model(self.backup_actor, flattened_actor)

        with tf.GradientTape() as tape:
            if self.discrete:
                policy = self.actor(s, activation='softmax')
                a_one_hot = tf.squeeze(tf.one_hot(tf.cast(a, tf.int32), depth=self.action_dim), axis=1)
                log_policy = tf.reduce_sum(tf.math.log(policy) * tf.stop_gradient(a_one_hot),
                                           axis=1, keepdims=True)
                surrogate = tf.reduce_mean(tf.exp(log_policy - tf.stop_gradient(old_log_policy)) * advantages)
            else:
                dist = self.actor.dist(s)
                log_policy = dist.log_prob(a)
                log_policy = tf.expand_dims(log_policy, axis=1)
                surrogate = tf.reduce_mean(tf.exp(log_policy - tf.stop_gradient(old_log_policy)) * advantages)

        policy_grad = tape.gradient(surrogate, self.actor.trainable_variables)
        flatten_policy_grad = tf.concat([tf.reshape(grad, [-1]) for grad in policy_grad], axis=0)

        step_dir = self.conjugate_gradient(s, flatten_policy_grad.numpy(), 10)

        shs = 0.5 * tf.reduce_sum(step_dir * self.fisher_vector_product(s, step_dir), axis=0)
        step_size = 1 / tf.sqrt(shs / self.delta)
        full_step = step_size * step_dir

        expected_improve = tf.reduce_sum(flatten_policy_grad * full_step, axis=0)

        # Backtracking line search: shrink the step until the KL constraint and the
        # improvement condition are both satisfied.
        flag = False
        fraction = 1.0

        for i in range(self.backtrack_iter):
            new_flattened_actor = flattened_actor + fraction * full_step
            self.update_model(self.actor, new_flattened_actor)

            if self.discrete:
                new_policy = self.actor(s, activation='softmax')
                new_a_one_hot = tf.squeeze(tf.one_hot(tf.cast(a, tf.int32), depth=self.action_dim), axis=1)
                new_log_policy = tf.reduce_sum(tf.math.log(new_policy) * tf.stop_gradient(new_a_one_hot),
                                               axis=1, keepdims=True)
            else:
                new_dist = self.actor.dist(s)
                new_log_policy = new_dist.log_prob(a)
                new_log_policy = tf.expand_dims(new_log_policy, axis=1)

            new_surrogate = tf.reduce_mean(tf.exp(new_log_policy - old_log_policy) * advantages)

            loss_improve = new_surrogate - surrogate
            expected_improve *= fraction

            if self.discrete:
                new_kl_divergence = tfp.distributions.kl_divergence(
                    tfp.distributions.Categorical(probs=self.actor(s, activation='softmax')),
                    tfp.distributions.Categorical(probs=self.backup_actor(s, activation='softmax')))
            else:
                new_dist = self.actor.dist(s)
                backup_dist = self.backup_actor.dist(s)
                new_kl_divergence = tfp.distributions.kl_divergence(new_dist, backup_dist)

            new_kl_divergence = tf.reduce_mean(new_kl_divergence)

            # print('kl: {:.4f}  loss improve: {:.4f}  expected improve: {:.4f}  '
            #       'number of line search: {}'.format(new_kl_divergence.numpy(), loss_improve,
            #                                          expected_improve, i))

            if new_kl_divergence.numpy() <= self.delta and loss_improve >= expected_improve:
                flag = True
                break

            fraction *= self.backtrack_coeff

        if not flag:
            self.update_model(self.actor, flattened_actor)
            print("Policy update failed")

        # Critic training on minibatches of the collected rollout.
        n = len(s)
        arr = np.arange(n)

        for epoch in range(self.training_step):
            if n // self.batch_size > 0:
                np.random.shuffle(arr)
                batch_index = arr[:self.batch_size]
                batch_index.sort()
            else:
                batch_index = arr

            batch_s = s.numpy()[batch_index]
            batch_returns = returns[batch_index]

            with tf.GradientTape() as tape:
                critic_loss = 0.5 * tf.reduce_mean(
                    tf.square(tf.stop_gradient(batch_returns) - self.critic(batch_s)))

            critic_variables = self.critic.trainable_variables
            critic_gradients = tape.gradient(critic_loss, critic_variables)
            self.critic_optimizer.apply_gradients(zip(critic_gradients, critic_variables))

            total_c_loss += critic_loss.numpy()

        self.buffer.delete()

        return [['Loss/Critic', total_c_loss]]
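
# Minimal, self-contained sketch (illustration only, not used by TRPO above) of the
# conjugate-gradient solve that TRPO.conjugate_gradient() performs: solving Hx = g with
# only matrix-vector products. Here the Fisher-vector product is replaced by multiplication
# with a small, hand-picked symmetric positive-definite matrix so the result can be checked
# against np.linalg.solve.
def _conjugate_gradient_demo(nsteps=10):
    H = np.array([[4.0, 1.0], [1.0, 3.0]])  # toy SPD matrix standing in for the KL Hessian
    g = np.array([1.0, 2.0])                # toy flattened policy gradient

    x = np.zeros_like(g)
    r = g.copy()
    p = r.copy()
    rdotr = r @ r
    for _ in range(nsteps):
        Hp = H @ p                           # stand-in for fisher_vector_product(states, p)
        alpha = rdotr / (p @ Hp + 1e-8)
        x += alpha * p
        r -= alpha * Hp
        new_rdotr = r @ r
        p = r + (new_rdotr / (rdotr + 1e-8)) * p
        rdotr = new_rdotr
    return x, np.linalg.solve(H, g)          # the two results should agree closely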
class VPG:
    # Works with both a discrete (categorical) actor and a continuous (Gaussian) actor.
    def __init__(self, state_dim, action_dim, args):
        self.discrete = args.discrete
        self.buffer = On_Policy_Buffer(args.buffer_size)

        self.gamma = args.gamma
        self.lambda_gae = args.lambda_gae

        self.actor_optimizer = tf.keras.optimizers.Adam(args.actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(args.critic_lr)

        self.state_dim = state_dim
        self.action_dim = action_dim

        self.training_start = 0
        self.training_step = 1

        if self.discrete:
            self.actor = Policy_network(self.state_dim, self.action_dim, args.hidden_dim)
        else:
            self.actor = Gaussian_Actor(self.state_dim, self.action_dim, args.hidden_dim)
        self.critic = V_network(self.state_dim)

        self.network_list = {'Actor': self.actor, 'Critic': self.critic}
        self.name = 'VPG'

    def get_action(self, state):
        state = np.expand_dims(np.array(state), axis=0)
        if self.discrete:
            policy = self.actor(state, activation='softmax').numpy()
            dist = tfp.distributions.Categorical(probs=policy)
            action = dist.sample().numpy()
            log_prob = dist.log_prob(action).numpy()
            action = action[0]
        else:
            action, log_prob = self.actor(state)
            action = action.numpy()[0]
            log_prob = log_prob.numpy()[0]
        return action, log_prob

    def eval_action(self, state):
        state = np.expand_dims(np.array(state, dtype=np.float32), axis=0)
        if self.discrete:
            policy = self.actor(state, activation='softmax')
            dist = tfp.distributions.Categorical(probs=policy)
            action = dist.sample().numpy()[0]
        else:
            action, _ = self.actor(state, deterministic=True)
            action = action.numpy()[0]
        return action

    def train(self, training_num):
        total_a_loss = 0
        total_c_loss = 0

        s, a, r, ns, d, _ = self.buffer.all_sample()
        values = self.critic(s)

        returns = np.zeros_like(r.numpy())
        advantages = np.zeros_like(returns)

        running_return = np.zeros(1)
        previous_value = np.zeros(1)
        running_advantage = np.zeros(1)

        # Generalized Advantage Estimation (GAE)
        for t in reversed(range(len(r))):
            running_return = (r[t] + self.gamma * running_return * (1 - d[t])).numpy()
            running_tderror = (r[t] + self.gamma * previous_value * (1 - d[t]) - values[t]).numpy()
            running_advantage = (running_tderror
                                 + (self.gamma * self.lambda_gae) * running_advantage * (1 - d[t])).numpy()

            returns[t] = running_return
            previous_value = values[t]
            advantages[t] = running_advantage

        with tf.GradientTape(persistent=True) as tape:
            if self.discrete:
                policy = self.actor(s, activation='softmax')
                dist = tfp.distributions.Categorical(probs=policy)
                log_policy = tf.reshape(dist.log_prob(tf.squeeze(a)), (-1, 1))
            else:
                dist = self.actor.dist(s)
                log_policy = dist.log_prob(a)

            actor_loss = -tf.reduce_sum(log_policy * tf.stop_gradient(advantages))
            critic_loss = 0.5 * tf.reduce_mean(tf.square(tf.stop_gradient(returns) - self.critic(s)))

        actor_variables = self.actor.trainable_variables
        critic_variables = self.critic.trainable_variables

        actor_gradients = tape.gradient(actor_loss, actor_variables)
        critic_gradients = tape.gradient(critic_loss, critic_variables)

        self.actor_optimizer.apply_gradients(zip(actor_gradients, actor_variables))
        self.critic_optimizer.apply_gradients(zip(critic_gradients, critic_variables))

        total_a_loss += actor_loss.numpy()
        total_c_loss += critic_loss.numpy()

        self.buffer.delete()

        return [['Loss/Actor', total_a_loss], ['Loss/Critic', total_c_loss]]
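
# Minimal, self-contained sketch (illustration only, not used by VPG above) of the GAE
# recursion shared by VPG.train(), PPO.train(), and TRPO.train():
#   delta_t = r_t + gamma * V(s_{t+1}) * (1 - d_t) - V(s_t)
#   A_t     = delta_t + gamma * lambda * (1 - d_t) * A_{t+1}
# The rewards, done flags, and critic values below are made-up numbers.
def _gae_demo(gamma=0.99, lambda_gae=0.95):
    r = np.array([1.0, 1.0, 1.0], dtype=np.float32)       # toy rewards
    d = np.array([0.0, 0.0, 1.0], dtype=np.float32)       # toy done flags
    values = np.array([0.5, 0.4, 0.3], dtype=np.float32)  # toy critic predictions V(s_t)

    returns = np.zeros_like(r)
    advantages = np.zeros_like(r)
    running_return = 0.0
    previous_value = 0.0
    running_advantage = 0.0
    for t in reversed(range(len(r))):
        running_return = r[t] + gamma * running_return * (1 - d[t])
        td_error = r[t] + gamma * previous_value * (1 - d[t]) - values[t]
        running_advantage = td_error + gamma * lambda_gae * running_advantage * (1 - d[t])
        returns[t] = running_return
        previous_value = values[t]
        advantages[t] = running_advantage
    return returns, advantages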