# Assumed imports for the snippets below (not shown in the source): these are
# TF1-style models in the mold of OpenAI baselines' a2c/ppo2, so we assume
# TensorFlow 1.x and the baselines helpers find_trainable_variables and mse.
import tensorflow as tf
from baselines.a2c.utils import find_trainable_variables, mse


def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coef,
             vf_coef, max_grad_norm):
    sess = tf.get_default_session()

    # CREATE THE PLACEHOLDERS
    actions_ = tf.placeholder(tf.int32, [None], name="actions_")
    advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
    rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
    lr_ = tf.placeholder(tf.float32, name="learning_rate_")
    # Keep track of the old actor
    oldneglopac_ = tf.placeholder(tf.float32, [None], name="oldneglopac_")
    # Keep track of the old critic
    oldvpred_ = tf.placeholder(tf.float32, [None], name="oldvpred_")
    # Clip range
    cliprange_ = tf.placeholder(tf.float32, [])

    # CREATE OUR TWO MODELS
    # step_model is used for sampling
    step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False)
    # Test model for testing our agent
    # test_model = policy(sess, ob_space, action_space, 1, 1, reuse=False)
    # train_model is used for training
    train_model = policy(sess, ob_space, action_space, nenvs * nsteps, nsteps,
                         reuse=True)

    # CALCULATE THE LOSS
    # Total loss = Policy gradient loss - entropy * entropy coefficient
    #              + value coefficient * value loss

    # Clip the value
    # Get the predicted value
    value_prediction = train_model.vf
    # Clipped value = old value + clip(value - old value, -cliprange, cliprange)
    value_prediction_clipped = oldvpred_ + tf.clip_by_value(
        train_model.vf - oldvpred_, -cliprange_, cliprange_)
    # Unclipped value loss
    value_loss_unclipped = tf.square(value_prediction - rewards_)
    # Clipped value loss
    value_loss_clipped = tf.square(value_prediction_clipped - rewards_)
    # Value loss: 0.5 * mean[max(unclipped, clipped)]
    vf_loss = 0.5 * tf.reduce_mean(
        tf.maximum(value_loss_unclipped, value_loss_clipped))

    # Clip the policy
    # Output -log(pi) for the new policy
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=train_model.pi, labels=actions_)
    # We want the ratio (pi current policy / pi old policy), but neglogpac
    # gives us -log(policy), so we transform it:
    # e^(-log old - (-log new)) == e^(log new - log old) == e^(log(new / old))
    # == new / old, since the exponential cancels the log
    # (wish we could use LaTeX in comments)
    ratio = tf.exp(oldneglopac_ - neglogpac)  # ratio = pi new / pi old
    # Remember we are doing gradient ascent: we want to MAXIMIZE the
    # objective J, which is equivalent to minimizing Loss = -J. To negate
    # the objective we negate the product: -advantages * (pi new / pi old).
    pg_loss_unclipped = -advantages_ * ratio
    # Clip the ratio to [1 - e, 1 + e]
    pg_loss_clipped = -advantages_ * tf.clip_by_value(
        ratio, 1.0 - cliprange_, 1.0 + cliprange_)
    # Final PG loss. We take the maximum because pg_loss_unclipped and
    # pg_loss_clipped are negated: the min of the positive objectives is
    # the max of their negatives.
    pg_loss = tf.reduce_mean(tf.maximum(pg_loss_unclipped, pg_loss_clipped))

    # Entropy is used to improve exploration by limiting premature
    # convergence to a suboptimal policy.
    entropy = tf.reduce_mean(train_model.pd.entropy())

    # Total loss (remember that L = -J: minimizing L maximizes J)
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    # UPDATE THE PARAMETERS USING THE LOSS
    # 1. Get the model parameters
    params = find_trainable_variables("model")
    # 2. Calculate the gradients
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    # zip pairs each gradient with its associated parameter,
    # e.g. zip(ABCD, xyza) => Ax, By, Cz, Da
    grads = list(zip(grads, params))
    # 3. Build our trainer
    trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, epsilon=1e-5)
    # 4. Backpropagation
    _train = trainer.apply_gradients(grads)

    # Train function
    def train(states_in, actions, returns, values, neglogpacs, lr, cliprange):
        # Here we calculate the advantage A(s,a) = R + yV(s') - V(s),
        # where Returns = R + yV(s')
        advantages = returns - values
        # Normalize the advantages (taken from the aborghi implementation)
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # We create the feed dictionary
        td_map = {
            train_model.inputs_: states_in,
            actions_: actions,
            advantages_: advantages,  # Used to calculate our policy loss
            rewards_: returns,  # Used as a bootstrap for the real value
            lr_: lr,
            cliprange_: cliprange,
            oldneglopac_: neglogpacs,
            oldvpred_: values
        }
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train], td_map)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        """Save the model."""
        saver = tf.train.Saver()
        saver.save(sess, save_path)

    def load(load_path):
        """Load the model."""
        saver = tf.train.Saver()
        print('Loading ' + load_path)
        saver.restore(sess, load_path)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
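# A minimal, self-contained NumPy sketch (an addition, not from the source)
# of the clipped surrogate objective built above, to make the
# max(-A * ratio, -A * clipped_ratio) trick concrete. All names here are
# illustrative.
import numpy as np

def ppo_pg_loss(old_neglogp, new_neglogp, advantages, cliprange=0.1):
    ratio = np.exp(old_neglogp - new_neglogp)  # pi_new / pi_old
    unclipped = -advantages * ratio
    clipped = -advantages * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    # max of the negated terms == min of the positive objectives
    return np.mean(np.maximum(unclipped, clipped))

# Example: a large positive-advantage update is capped at 1 + cliprange.
# ratio = e^(1.0 - 0.5) ~= 1.65, clipped to 1.1, so the loss is -2.0 * 1.1 = -2.2
print(ppo_pg_loss(np.array([1.0]), np.array([0.5]), np.array([2.0])))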
def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coef,
             vf_coef, max_grad_norm):
    sess = tf.get_default_session()

    # Here we create the placeholders
    actions_ = tf.placeholder(tf.int32, [None], name="actions_")
    advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
    rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
    lr_ = tf.placeholder(tf.float32, name="learning_rate_")

    # Here we create our two models:
    # step_model is used for sampling
    step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False)
    # train_model is used for training
    train_model = policy(sess, ob_space, action_space, nenvs * nsteps, nsteps,
                         reuse=True)

    """
    Calculate the loss:
    Total loss = Policy gradient loss - entropy * entropy coefficient
                 + value coefficient * value loss
    """
    # Policy loss
    # Output -log(pi)
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=train_model.pi, labels=actions_)
    # 1/n * sum A(si, ai) * -log pi(ai|si)
    pg_loss = tf.reduce_mean(advantages_ * neglogpac)

    # Value loss: 1/2 * sum [R - V(s)]^2
    vf_loss = tf.reduce_mean(
        tf.losses.mean_squared_error(tf.squeeze(train_model.vf), rewards_))

    # Entropy is used to improve exploration by limiting premature
    # convergence to a suboptimal policy.
    entropy = tf.reduce_mean(train_model.pd.entropy())

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    # Update parameters using the loss
    # 1. Get the model parameters
    params = find_trainable_variables("model")
    # 2. Calculate the gradients
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    # zip pairs each gradient with its associated parameter,
    # e.g. zip(ABCD, xyza) => Ax, By, Cz, Da
    grads = list(zip(grads, params))
    # 3. Build our trainer
    trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, decay=0.99,
                                        epsilon=1e-5)
    # 4. Backpropagation
    _train = trainer.apply_gradients(grads)

    def train(states_in, actions, returns, values, lr):
        # Here we calculate the advantage A(s,a) = R + yV(s') - V(s),
        # where Returns = R + yV(s')
        advantages = returns - values

        # We create the feed dictionary
        td_map = {
            train_model.inputs_: states_in,
            actions_: actions,
            advantages_: advantages,  # Used to calculate our policy loss
            rewards_: returns,  # Used as a bootstrap for the real value
            lr_: lr
        }
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train], td_map)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        """Save the model."""
        saver = tf.train.Saver()
        saver.save(sess, save_path)

    def load(load_path):
        """Load the model."""
        saver = tf.train.Saver()
        print('Loading ' + load_path)
        saver.restore(sess, load_path)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
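# A small self-contained sketch (an addition, not from the source) of how the
# `returns` fed into train() above are typically produced: n-step discounted
# rewards bootstrapped with V(s') from the last state. Names and values are
# illustrative.
import numpy as np

def discounted_returns(rewards, dones, last_value, gamma=0.99):
    returns = np.zeros_like(rewards, dtype=np.float32)
    running = last_value
    for t in reversed(range(len(rewards))):
        # Returns = R + yV(s'); a done flag cuts the bootstrap
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    return returns

rewards = np.array([1.0, 0.0, 1.0], dtype=np.float32)
dones = np.array([0.0, 0.0, 0.0], dtype=np.float32)
print(discounted_returns(rewards, dones, last_value=0.5))
# ~= [2.465, 1.480, 1.495]; advantages are then returns - values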
# This variant additionally assumes: `time`, the baselines `logger`, and a
# project-local `flag` module with LAST_LAYER_IMPL / DEBUG switches.
import time
from baselines import logger
import flag


def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coef,
             vf_coef, max_grad_norm):
    sess = tf.get_default_session()
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    # Here we create the placeholders, plus a timestamped log directory
    timestr = time.strftime("%Y%m%d-%H%M%S")
    dirname = "./" + timestr + "log"
    logger.configure(dir=dirname)

    actions_ = tf.placeholder(tf.int32, [None], name="actions_")
    advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
    rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
    lr_ = tf.placeholder(tf.float32, name="learning_rate_")

    # Here we create our two models:
    # step_model is used for sampling
    step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False)
    # train_model is used for training; reuse=True shares its weights with
    # step_model
    train_model = policy(sess, ob_space, action_space, nenvs * nsteps, nsteps,
                         reuse=True)

    """
    Calculate the loss:
    Total loss = Policy gradient loss - entropy * entropy coefficient
                 + value coefficient * value loss
    """
    # Policy loss
    # Output -log(pi); which implementation is used depends on the last-layer
    # flag
    if flag.LAST_LAYER_IMPL:
        neglogpac = (-1) * tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.p_layer, labels=actions_)
    else:
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=actions_)
    # neglogpac = train_model.pd.neglogp(actions_)

    # 1/n * sum A(si, ai) * -log pi(ai|si)
    pg_loss = tf.reduce_mean(advantages_ * neglogpac)

    # Value loss: 1/2 * sum [R - V(s)]^2
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), rewards_))

    # Entropy is used to improve exploration by limiting premature
    # convergence to a suboptimal policy.
    if flag.LAST_LAYER_IMPL:
        entropy = tf.reduce_mean(train_model.dist.entropy(name="ent"))
    else:
        entropy = tf.reduce_mean(train_model.pd.entropy())

    # vf_loss = tf.zeros(vf_loss.shape, dtype=tf.float32)
    loss = pg_loss - (entropy * ent_coef) + (vf_loss * vf_coef)

    # Update parameters using the loss
    # 1. Get the model parameters
    params = find_trainable_variables("model")
    # 2. Calculate the gradients
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    # zip pairs each gradient with its associated parameter,
    # e.g. zip(ABCD, xyza) => Ax, By, Cz, Da
    grads = list(zip(grads, params))
    # 3. Build our trainer
    trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, decay=0.99,
                                        epsilon=1e-5)
    # 4. Backpropagation
    _train = trainer.apply_gradients(grads)

    def train(states_in, actions, returns, values, lr):
        # Here we calculate the advantage A(s,a) = R + yV(s') - V(s),
        # where Returns = R + yV(s')
        advantages = returns - values

        # We create the feed dictionary
        td_map = {
            train_model.inputs_: states_in,
            actions_: actions,
            advantages_: advantages,  # Used to calculate our policy loss
            rewards_: returns,  # Used as a bootstrap for the real value
            lr_: lr
        }
        if flag.LAST_LAYER_IMPL:
            pi1, policy_loss, neglogpac1, value_loss, policy_entropy, _ = sess.run(
                [train_model.softmax_layer, pg_loss, neglogpac, vf_loss,
                 entropy, _train], td_map)
        else:
            pi1, policy_loss, neglogpac1, value_loss, policy_entropy, _ = sess.run(
                [train_model.pi, pg_loss, neglogpac, vf_loss, entropy, _train],
                td_map)
        if flag.DEBUG:
            print("pd", pi1)
        # logger.record_tabular("neglog", neglogpac1)
        # logger.record_tabular("adv", advantages)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        """Save the model."""
        saver = tf.train.Saver()
        saver.save(sess, save_path)

    def load(load_path):
        """Load the model."""
        saver = tf.train.Saver()
        print('Loading ' + load_path)
        saver.restore(sess, load_path)

    self.train = train
    self.train_model = train_model
    # self.step_model = step_model
    self.step_model = train_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
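# flag.py -- a minimal sketch (an assumption, not from the source) of the
# project-local config module the variant above imports: a plain Python
# module holding boolean switches read at graph-construction time.
LAST_LAYER_IMPL = False  # True: use train_model.p_layer / train_model.dist
DEBUG = False            # True: print the policy output after each update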
# This variant additionally assumes NumPy and Keras (backend K and the text
# preprocessing module `sequence`), e.g.:
import numpy as np
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing import sequence


def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coef,
             vf_coef, max_grad_norm):
    sess = tf.get_default_session()
    K.set_session(sess)
    K.set_learning_phase(1)

    # Create the placeholders
    actions_ = tf.placeholder(tf.int32, [None], name='actions_')
    advantages_ = tf.placeholder(tf.float32, [None], name='advantages_')
    rewards_ = tf.placeholder(tf.float32, [None], name='rewards_')
    lr_ = tf.placeholder(tf.float32, name='learning_rate_')
    # Keep track of the old actor
    oldneglopac_ = tf.placeholder(tf.float32, [None], name='oldneglopac_')
    # Keep track of the old critic
    oldvpred_ = tf.placeholder(tf.float32, [None], name='oldvpred_')
    # Clip range
    cliprange_ = tf.placeholder(tf.float32, [])

    # Create our two models
    # step_model is used for sampling
    step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False)
    # Test model for testing our agent
    # test_model = policy(sess, ob_space, action_space, 1, 1, reuse=False)
    # train_model is used for training
    train_model = policy(sess, ob_space, action_space, nenvs * nsteps, nsteps,
                         reuse=True)

    print('availPi', train_model.availPi)
    tf.print(train_model.availPi, [train_model.availPi], 'train_model.availPi')

    # Softmax over the available (masked) logits, computed by hand so the
    # same tensors can be reused for the entropy and -log(pi) below
    l0 = train_model.availPi - tf.reduce_max(
        train_model.availPi, axis=-1, keep_dims=True)
    el0 = tf.exp(l0)
    z0 = tf.reduce_sum(el0, axis=-1, keep_dims=True)
    p0 = el0 / z0
    entropy = -tf.reduce_sum((p0 + 1e-8) * tf.log(p0 + 1e-8), axis=-1)
    oneHotActions = tf.one_hot(actions_,
                               train_model.pi.get_shape().as_list()[-1])
    neglogpac = -tf.log(
        tf.reduce_sum(tf.multiply(p0, oneHotActions), axis=-1))

    def neglogp(state, valid_ins, actions):
        # `network` is assumed to be defined at module level in the source
        return sess.run(
            neglogpac, {
                network.X: state,
                network.available_moves: valid_ins,
                actions_: actions
            })

    self.neglogp = neglogp

    # CALCULATE THE LOSS
    # Total loss = Policy gradient loss - entropy * entropy coefficient
    #              + value coefficient * value loss

    # Clip the value
    # Get the predicted value
    value_prediction = train_model.vf
    # Clipped value = old value + clip(value - old value, -cliprange, cliprange)
    value_prediction_clipped = oldvpred_ + tf.clip_by_value(
        train_model.vf - oldvpred_, -cliprange_, cliprange_)
    # Unclipped value loss
    value_loss_unclipped = tf.square(value_prediction - rewards_)
    # Clipped value loss
    value_loss_clipped = tf.square(value_prediction_clipped - rewards_)
    # Value loss: 0.5 * mean[max(unclipped, clipped)]
    vf_loss = 0.5 * tf.reduce_mean(
        tf.maximum(value_loss_unclipped, value_loss_clipped))

    # Clip the policy
    # neglogpac (defined above from the masked softmax) gives -log(pi) for
    # the new policy. We want the ratio (pi current policy / pi old policy):
    # e^(-log old - (-log new)) == e^(log new - log old) == e^(log(new / old))
    # == new / old, since the exponential cancels the log
    ratio = tf.exp(oldneglopac_ - neglogpac)  # ratio = pi new / pi old
    # Remember we are doing gradient ascent: maximizing the objective J is
    # the same as minimizing Loss = -J, so we negate the product
    # (pi new / pi old) * advantages.
    pg_loss_unclipped = -advantages_ * ratio
    # Clip the ratio to [1 - e, 1 + e]
    pg_loss_clipped = -advantages_ * tf.clip_by_value(
        ratio, 1.0 - cliprange_, 1.0 + cliprange_)
    # Final PG loss. We take the maximum because pg_loss_unclipped and
    # pg_loss_clipped are negated: the min of the positive objectives is
    # the max of their negatives.
    pg_loss = tf.reduce_mean(tf.maximum(pg_loss_unclipped, pg_loss_clipped))

    # Entropy is used to improve exploration by limiting premature
    # convergence to a suboptimal policy.
    entropy_loss = tf.reduce_mean(entropy)

    # Total loss (remember that L = -J: minimizing L maximizes J)
    loss = pg_loss - entropy_loss * ent_coef + vf_loss * vf_coef

    # UPDATE THE PARAMETERS USING THE LOSS
    # 1. Get the model parameters
    params = find_trainable_variables('model')
    # 2. Calculate the gradients
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    # zip pairs each gradient with its associated parameter,
    # e.g. zip(ABCD, xyza) => Ax, By, Cz, Da
    grads = list(zip(grads, params))
    # 3. Build our trainer
    trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, epsilon=1e-5)
    # 4. Backpropagation
    _train = trainer.apply_gradients(grads)

    # Train function
    def train(states_in, valid_ins, text_ins, actions, returns, values,
              neglogpacs, lr, cliprange):
        # Fit the tokenizer on the incoming text observations
        for ob_text in text_ins:
            train_model.tokenizer.fit_on_texts([ob_text.decode("utf-8")])

        # Preprocess the text (maybe do this inside the env later?)
        ob_text_input = []
        for ob_text in text_ins:
            token = train_model.tokenizer.texts_to_sequences(
                [ob_text.decode("utf-8")])
            token = sequence.pad_sequences(token, maxlen=200)  # pre-padding with 0
            ob_text_input.append(token)
        ob_text_input = np.array(ob_text_input)
        shape = ob_text_input.shape
        ob_text_input = ob_text_input.reshape(shape[0], shape[2])

        # Here we calculate the advantage A(s,a) = R + yV(s') - V(s),
        # where Returns = R + yV(s')
        advantages = returns - values
        # Normalize the advantages (taken from the aborghi implementation)
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        td_map = {
            train_model.text_inputs_: ob_text_input,
            train_model.available_moves: valid_ins,
            actions_: actions,
            advantages_: advantages,  # Used to calculate our policy loss
            rewards_: returns,  # Used as a bootstrap for the real value
            lr_: lr,
            cliprange_: cliprange,
            oldneglopac_: neglogpacs,
            oldvpred_: values
        }
        td_map.update(train_model.split_categories_from_state(states_in))
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy_loss, _train], td_map)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        """Save the model."""
        saver = tf.train.Saver()
        saver.save(sess, save_path)

    def load(load_path):
        """Load the model."""
        saver = tf.train.Saver()
        print('Loading ' + load_path)
        saver.restore(sess, load_path)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
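# A self-contained sketch (an addition, not from the source) of the Keras
# text pipeline used in train() above: fit a tokenizer on observed text,
# convert new text to integer sequences, and zero-pre-pad to length 200.
# The sample sentences are illustrative.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

tokenizer = Tokenizer()
tokenizer.fit_on_texts(["pick up the red key", "open the door"])
tokens = tokenizer.texts_to_sequences(["open the red door"])
padded = sequence.pad_sequences(tokens, maxlen=200)  # pre-padding with 0
print(padded.shape)  # (1, 200); only the last few entries are non-zero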
def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coeff,
             vf_coeff, max_grad_norm):
    sess = tf.get_default_session()

    # Define placeholders
    actions_ = tf.placeholder(tf.int32, [None], name="actions_")
    advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
    rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
    lr_ = tf.placeholder(tf.float32, name="learning_rate_")

    # Create our two models here
    # Take one step for each environment
    step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False)
    # Take number of steps * number of environments for the total steps
    train_model = policy(sess, ob_space, action_space, nenvs * nsteps, nsteps,
                         reuse=True)

    # Calculate the loss
    # Note: in the future we can add a clipped loss to control the step size
    # of our parameter updates, which can lead to better convergence (PPO).
    # Recall that Total loss = PolicyGradientLoss - Entropy * EntropyCoeff
    #                          + Value * ValueCoeff

    # Output -log(policy)
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=train_model.pi, labels=actions_)
    # 1/n * sum(A(s,a) * -log pi(a|s))
    pg_loss = tf.reduce_mean(advantages_ * neglogpac)
    # Value loss
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), rewards_))
    # Entropy
    entropy = tf.reduce_mean(train_model.pd.entropy())
    # Total loss
    loss = pg_loss - (entropy * ent_coeff) + (vf_loss * vf_coeff)

    # Update the parameters using the loss we've just calculated
    # Grab the model params
    params = find_trainable_variables("model")
    # Calculate gradients (we'll want to zip our gradients with our parameters)
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    # Build our trainer
    trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, decay=0.99,
                                        epsilon=1e-5)
    # Backprop
    _train = trainer.apply_gradients(grads)

    def train(states_in, actions, returns, values, lr):
        # Here we calculate the advantage A(s,a) = R + yV(s') - V(s),
        # where Returns = R + yV(s')
        advantages = returns - values
        td_map = {
            train_model.inputs_: states_in,
            actions_: actions,
            advantages_: advantages,
            # Recall we bootstrap the "real" value since we learn one step
            # at a time, not per episode
            rewards_: returns,
            lr_: lr
        }
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train], td_map)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        saver = tf.train.Saver()
        saver.save(sess, save_path)

    def load(load_path):
        saver = tf.train.Saver()
        saver.restore(sess, load_path)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)