# TensorFlow 1.x listing. Standard imports needed by the Actor variants below;
# Attentive_encoder, Pointer_decoder, Critic and variable_summaries are provided
# by the project's own modules (import paths omitted here).
import itertools

import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell


class Actor(object):

    def __init__(self, config):
        self.config = config

        # Data config
        self.batch_size = config.batch_size  # batch size
        self.max_length = config.max_length  # input sequence length (number of cities)
        self.input_dimension = config.input_dimension  # dimension of a city (coordinates)

        # Reward config
        self.avg_baseline = tf.Variable(config.init_baseline, trainable=False,
                                        name="moving_avg_baseline")  # moving baseline for REINFORCE
        self.alpha = config.alpha  # moving average update

        # Training config (actor)
        self.global_step = tf.Variable(0, trainable=False, name="global_step")  # global step
        self.lr1_start = config.lr1_start  # initial learning rate
        self.lr1_decay_rate = config.lr1_decay_rate  # learning rate decay rate
        self.lr1_decay_step = config.lr1_decay_step  # learning rate decay step

        # Training config (critic)
        self.global_step2 = tf.Variable(0, trainable=False, name="global_step2")  # global step
        self.lr2_start = config.lr1_start  # initial learning rate
        self.lr2_decay_rate = config.lr1_decay_rate  # learning rate decay rate
        self.lr2_decay_step = config.lr1_decay_step  # learning rate decay step

        # Tensor block holding the input sequences [Batch Size, Sequence Length, Features]
        self.input_ = tf.placeholder(tf.float32,
                                     [self.batch_size, self.max_length, self.input_dimension],
                                     name="input_coordinates")

        self.build_permutation()
        self.build_critic()
        self.build_reward()
        self.build_optim()
        self.merged = tf.summary.merge_all()

    def build_permutation(self):
        with tf.variable_scope("encoder"):
            Encoder = Attentive_encoder(self.config)
            encoder_output = Encoder.encode(self.input_)

        with tf.variable_scope('decoder'):
            # Ptr-net returns permutations (self.positions) with their log-probability for backprop
            self.ptr = Pointer_decoder(encoder_output, self.config)
            self.positions, self.log_softmax = self.ptr.loop_decode()
            variable_summaries('log_softmax', self.log_softmax, with_max_min=True)

    def build_critic(self):
        with tf.variable_scope("critic"):
            # Critic predicts reward (parametric baseline for REINFORCE)
            self.critic = Critic(self.config)
            self.critic.predict_rewards(self.input_)
            variable_summaries('predictions', self.critic.predictions, with_max_min=True)

    def build_reward(self):
        with tf.name_scope('permutations'):
            # Reorder the input according to the tour
            self.ordered_input_ = []
            for input_, path in zip(tf.unstack(self.input_, axis=0),
                                    tf.unstack(self.positions, axis=0)):  # Unstack along the batch axis
                self.ordered_input_.append(tf.gather_nd(input_, tf.expand_dims(path, 1)))
            self.ordered_input_ = tf.transpose(tf.stack(self.ordered_input_, 0), [2, 1, 0])
            # [batch size, seq length + 1, features] to [features, seq length + 1, batch_size]
            # Note: +1 because end = start = first city

            # Ordered coordinates
            ordered_x_ = self.ordered_input_[0]  # [seq length + 1, batch_size]
            delta_x2 = tf.transpose(tf.square(ordered_x_[1:] - ordered_x_[:-1]), [1, 0])  # [batch_size, seq length] delta_x**2
            ordered_y_ = self.ordered_input_[1]  # [seq length + 1, batch_size]
            delta_y2 = tf.transpose(tf.square(ordered_y_[1:] - ordered_y_[:-1]), [1, 0])  # [batch_size, seq length] delta_y**2

        with tf.name_scope('environment'):
            # Get tour length (Euclidean distance)
            inter_city_distances = tf.sqrt(delta_x2 + delta_y2)
            # sqrt(delta_x**2 + delta_y**2): Euclidean distance between consecutive cities,
            # depot --> ... --> depot [batch_size, seq length]
            self.distances = tf.reduce_sum(inter_city_distances, axis=1)  # [batch_size]
            # variable_summaries('tour_length', self.distances, with_max_min=True)

            # Define reward from tour length
            self.reward = tf.cast(self.distances, tf.float32)
            variable_summaries('reward', self.reward, with_max_min=True)

    def build_optim(self):
        # Update moving_mean and moving_variance for batch normalization layers
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):

            with tf.name_scope('baseline'):
                # Update the moving-average baseline
                reward_mean, reward_var = tf.nn.moments(self.reward, axes=[0])
                self.base_op = tf.assign(self.avg_baseline,
                                         self.alpha * self.avg_baseline + (1.0 - self.alpha) * reward_mean)
                tf.summary.scalar('average baseline', self.avg_baseline)

            with tf.name_scope('reinforce'):
                # Actor learning rate
                self.lr1 = tf.train.exponential_decay(self.lr1_start, self.global_step,
                                                      self.lr1_decay_step, self.lr1_decay_rate,
                                                      staircase=False, name="learning_rate1")
                # Optimizer
                self.opt1 = tf.train.AdamOptimizer(learning_rate=self.lr1, beta1=0.9, beta2=0.99,
                                                   epsilon=0.0000001)
                # Advantage (reward minus both baselines), no gradient flows through it
                self.reward_baseline = tf.stop_gradient(self.reward - self.avg_baseline
                                                        - self.critic.predictions)  # [Batch size, 1]
                variable_summaries('reward_baseline', self.reward_baseline, with_max_min=True)
                # Loss
                self.loss1 = tf.reduce_mean(self.reward_baseline * self.log_softmax, 0)
                tf.summary.scalar('loss1', self.loss1)
                # Minimize step
                gvs = self.opt1.compute_gradients(self.loss1)
                capped_gvs = [(tf.clip_by_norm(grad, 1.), var) for grad, var in gvs
                              if grad is not None]  # L2 clip
                self.train_step1 = self.opt1.apply_gradients(capped_gvs, global_step=self.global_step)

            with tf.name_scope('state_value'):
                # Critic learning rate
                self.lr2 = tf.train.exponential_decay(self.lr2_start, self.global_step2,
                                                      self.lr2_decay_step, self.lr2_decay_rate,
                                                      staircase=False, name="learning_rate2")
                # Optimizer
                self.opt2 = tf.train.AdamOptimizer(learning_rate=self.lr2, beta1=0.9, beta2=0.99,
                                                   epsilon=0.0000001)
                # Loss
                weights_ = 1.0
                # weights_ = tf.exp(self.log_softmax - tf.reduce_max(self.log_softmax))  # probs / max_prob
                self.loss2 = tf.losses.mean_squared_error(self.reward - self.avg_baseline,
                                                          self.critic.predictions, weights=weights_)
                tf.summary.scalar('loss2', self.loss2)
                # Minimize step
                gvs2 = self.opt2.compute_gradients(self.loss2)
                capped_gvs2 = [(tf.clip_by_norm(grad, 1.), var) for grad, var in gvs2
                               if grad is not None]  # L2 clip
                self.train_step2 = self.opt2.apply_gradients(capped_gvs2, global_step=self.global_step2)
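# --- Reference for the reward above (illustrative sketch, not from the original file) ---
# build_reward() gathers the cities in the decoded visit order and sums the Euclidean
# edge lengths, with the first city repeated at the end so the tour is closed. A
# per-instance NumPy equivalent, assuming `order` already ends with a repeat of order[0]:
import numpy as np


def tour_length(coords, order):
    """coords: [n, 2] city coordinates; order: visit order whose last index repeats the first."""
    ordered = coords[order]                        # cities in tour order (closed tour)
    deltas = np.diff(ordered, axis=0)              # consecutive displacement vectors
    return np.sqrt((deltas ** 2).sum(axis=1)).sum()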
class Actor(object):

    def __init__(self, config):
        self.config = config

        # Data config
        self.batch_size = config.batch_size  # batch size
        self.max_length = config.nCells * config.nMuts  # input sequence length
        self.input_dimension = config.input_dimension  # dimension of an input element

        # Reward config
        # self.avg_baseline = tf.Variable(config.init_baseline, trainable=False,
        #                                 name="moving_avg_baseline")  # moving baseline for REINFORCE
        # self.ma = config.ma  # moving average update
        self.beta = config.beta  # hyperparameter weighting the NLL term

        # Training config (actor)
        self.global_step = tf.Variable(0, trainable=False, name="global_step")  # global step
        self.lr1_start = config.lr1_start  # initial learning rate
        self.lr1_decay_rate = config.lr1_decay_rate  # learning rate decay rate
        self.lr1_decay_step = config.lr1_decay_step  # learning rate decay step

        # Training config (critic)
        self.global_step2 = tf.Variable(0, trainable=False, name="global_step2")  # global step
        self.lr2_start = config.lr1_start  # initial learning rate
        self.lr2_decay_rate = config.lr1_decay_rate  # learning rate decay rate
        self.lr2_decay_step = config.lr1_decay_step  # learning rate decay step

        # Tensor block holding the input sequences [Batch Size, Sequence Length, Features]
        self.input_ = tf.placeholder(
            tf.float32, [self.batch_size, self.max_length, self.input_dimension],
            name="input_coordinates")

        self.build_permutation()
        self.build_critic()
        self.build_reward()
        self.build_optim()
        self.merged = tf.summary.merge_all()

    def count3gametes(self, input_):
        # For every ordered pair of mutation columns, count the cells with patterns
        # (1,0), (0,1) and (1,1); their product is nonzero only for conflicting pairs.
        columnPairs = list(itertools.permutations(range(self.config.nMuts), 2))
        nColumnPairs = len(columnPairs)
        columnReplicationList = np.array(columnPairs).reshape(-1)
        l = []
        for i in range(input_.get_shape()[0]):
            for j in range(self.config.nCells):
                for k in columnReplicationList:
                    l.append([i, j, k])
        replicatedColumns = tf.reshape(
            tf.gather_nd(input_, l),
            [input_.get_shape()[0], self.config.nCells, len(columnReplicationList)])
        replicatedColumns = tf.transpose(replicatedColumns, perm=[0, 2, 1])
        x = tf.reshape(replicatedColumns,
                       [input_.get_shape()[0], nColumnPairs, 2, self.config.nCells])
        col10 = tf.count_nonzero(tf.greater(x[:, :, 0, :], x[:, :, 1, :]), axis=2)  # batch_size * nColumnPairs
        col01 = tf.count_nonzero(tf.greater(x[:, :, 1, :], x[:, :, 0, :]), axis=2)  # batch_size * nColumnPairs
        col11 = tf.count_nonzero(tf.equal(x[:, :, 0, :] + x[:, :, 1, :], 2), axis=2)  # batch_size * nColumnPairs
        eachColPair = col10 * col01 * col11  # batch_size * nColumnPairs
        return tf.reduce_sum(eachColPair, axis=1)  # batch_size

    def build_permutation(self):
        with tf.variable_scope("encoder"):
            Encoder = Attentive_encoder(self.config)
            encoder_output = Encoder.encode(self.input_)

        with tf.variable_scope('decoder'):
            # Ptr-net returns permutations (self.positions) with their log-probability for backprop
            self.ptr = Pointer_decoder(encoder_output, self.config)
            self.positions, self.log_softmax = self.ptr.loop_decode()
            variable_summaries('log_softmax', self.log_softmax, with_max_min=True)

    def build_critic(self):
        with tf.variable_scope("critic"):
            # Critic predicts reward (parametric baseline for REINFORCE)
            self.critic = Critic(self.config)
            self.critic.predict_rewards(self.input_)
            variable_summaries('predictions', self.critic.predictions, with_max_min=True)

    def build_reward(self):
        with tf.name_scope('permutations'):
            # Reorder the input according to the predicted permutation
            inp_ = tf.identity(self.input_)
            pos = tf.identity(self.positions)
            x = tf.zeros([int(self.max_length / 2), self.batch_size], tf.float32)  # candidate costs, one row per prefix length
            for i in range(int(self.max_length / 2)):
                r = tf.range(start=0, limit=self.batch_size, delta=1)
                r = tf.expand_dims(r, 1)
                r = tf.expand_dims(r, 2)
                r3 = tf.cast(tf.ones([self.max_length, 1]) * tf.cast(r, tf.float32), tf.int32)
                r4 = tf.squeeze(r, axis=2)
                r5 = tf.expand_dims(tf.fill([self.batch_size], i), axis=1)
                u = tf.ones_like(r5)
                r4_r5 = tf.concat([r4, r5], axis=1)
                pos_mask = tf.squeeze(
                    tf.scatter_nd(indices=r4_r5, updates=u,
                                  shape=[self.batch_size, self.max_length, 1]),
                    axis=2)
                pos_mask_cum1 = tf.cumsum(pos_mask, reverse=True, exclusive=True, axis=1)
                pos_mask_cum2 = tf.cumsum(pos_mask, reverse=False, exclusive=False, axis=1)

                # For calculating the NLL
                per_pos = tf.concat([r3, tf.expand_dims(pos, axis=2)], axis=2)
                per_ = tf.gather_nd(inp_, indices=per_pos)
                per_fp_fn = per_[:, :, 2:3]
                per_fp_fn_log = tf.log(1 / per_fp_fn)  # for N01 and N10
                per_fp_fn_com = tf.subtract(tf.ones_like(per_fp_fn), per_fp_fn)  # for N00 and N11
                per_fp_fn_com_log = tf.log(1 / per_fp_fn_com)

                NLL_N10_N01 = tf.reduce_sum(
                    tf.multiply(tf.squeeze(per_fp_fn_log, axis=2),
                                tf.cast(pos_mask_cum1, tf.float32)),
                    axis=1, keepdims=True)

                per_matrix_mul_cum2 = tf.multiply(tf.squeeze(per_[:, :, 3:4], axis=2),
                                                  tf.cast(pos_mask_cum2, tf.float32))
                N11 = tf.reduce_sum(per_matrix_mul_cum2, axis=1, keepdims=True)
                sum_mask_cum2 = tf.reduce_sum(tf.cast(pos_mask_cum2, tf.float32),
                                              axis=1, keepdims=True)
                N00 = tf.subtract(sum_mask_cum2, N11)

                per_matrix = per_[:, :, 3:4]
                sum_per_matrix = tf.reduce_sum(tf.squeeze(per_matrix, axis=2), axis=1)
                sum_per_fp = tf.reduce_sum(tf.squeeze(tf.multiply(per_fp_fn, per_matrix), axis=2), axis=1)
                fp = tf.divide(sum_per_fp, sum_per_matrix)
                sum_per_fn = tf.subtract(tf.reduce_sum(tf.squeeze(per_fp_fn, axis=2), axis=1), sum_per_fp)
                q = tf.cast(tf.tile(tf.constant([self.max_length]),
                                    tf.constant([self.batch_size])), tf.float32)
                fn = tf.divide(sum_per_fn, tf.subtract(q, sum_per_matrix))
                fp_com = tf.log(1 / tf.subtract(
                    tf.cast(tf.tile(tf.constant([1]), tf.constant([self.batch_size])), tf.float32), fp))
                fn_com = tf.log(1 / tf.subtract(
                    tf.cast(tf.tile(tf.constant([1]), tf.constant([self.batch_size])), tf.float32), fn))

                N00_NLL = tf.multiply(tf.expand_dims(fp_com, axis=1), N00)
                N11_NLL = tf.multiply(tf.expand_dims(fn_com, axis=1), N11)
                NLL = tf.scalar_mul(self.beta, tf.add_n([NLL_N10_N01, N00_NLL, N11_NLL]))

                # Flip the selected entries and rebuild the matrix
                m1 = tf.multiply(tf.squeeze(per_matrix, axis=2), tf.cast(pos_mask_cum1, tf.float32))
                m1 = tf.subtract(tf.cast(pos_mask_cum1, tf.float32), m1)
                m2 = tf.multiply(tf.squeeze(per_matrix, axis=2), tf.cast(pos_mask_cum2, tf.float32))
                T_f = tf.add(m1, m2)
                per_flipped = tf.concat([per_[:, :, 0:3], tf.expand_dims(T_f, axis=2)], axis=2)
                idx = tf.concat([r3, tf.cast(per_flipped[:, :, 0:2], tf.int32)], axis=2)
                m_f = tf.scatter_nd(
                    indices=tf.expand_dims(idx, 2),
                    updates=per_flipped[:, :, 3:4],
                    shape=tf.constant([self.batch_size, self.config.nCells, self.config.nMuts]))

                c_v = self.count3gametes(m_f)  # remaining three-gametes violations
                c_t = tf.expand_dims(tf.add(tf.squeeze(NLL, axis=1), tf.cast(c_v, tf.float32)), axis=0)

                ind = []
                for i1 in range(x.get_shape()[1]):
                    ind.append([i, i1])
                ind = tf.convert_to_tensor(ind)
                ind = tf.expand_dims(ind, axis=0)
                x_n = tf.scatter_nd(indices=ind, updates=c_t, shape=x.get_shape())
                x = x + x_n  # write this candidate's cost into row i of x (assumed accumulation so the minimum below is over all candidates)

            x_m = tf.reduce_min(x, axis=0)  # minimum cost over the candidate prefixes
            self.cost = tf.identity(x_m)

        with tf.name_scope('environment'):
            cost = tf.identity(self.cost)
            # Define reward from the cost
            self.reward = tf.cast(cost, tf.float32)
            variable_summaries('reward', self.reward, with_max_min=True)

    def build_optim(self):
        # Update moving_mean and moving_variance for batch normalization layers
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):

            with tf.name_scope('baseline'):
                # Update baseline
                reward_mean, reward_var = tf.nn.moments(self.reward, axes=[0])

            with tf.name_scope('reinforce'):
                # Actor learning rate
                self.lr1 = tf.train.exponential_decay(self.lr1_start, self.global_step,
                                                      self.lr1_decay_step, self.lr1_decay_rate,
                                                      staircase=False, name="learning_rate1")
                # Optimizer
                self.opt1 = tf.train.AdamOptimizer(learning_rate=self.lr1, beta1=0.9, beta2=0.99,
                                                   epsilon=0.0000001)
                # Advantage (reward minus critic baseline), no gradient flows through it
                self.reward_baseline = tf.stop_gradient(self.reward - self.critic.predictions)  # [Batch size, 1]
                variable_summaries('reward_baseline', self.reward_baseline, with_max_min=True)
                # Loss
                self.loss1 = tf.reduce_mean(self.reward_baseline * self.log_softmax, 0)
                tf.summary.scalar('loss1', self.loss1)
                # Minimize step
                gvs = self.opt1.compute_gradients(self.loss1)
                capped_gvs = [(tf.clip_by_norm(grad, 1.), var) for grad, var in gvs
                              if grad is not None]  # L2 clip
                self.train_step1 = self.opt1.apply_gradients(capped_gvs, global_step=self.global_step)

            with tf.name_scope('state_value'):
                # Critic learning rate
                self.lr2 = tf.train.exponential_decay(self.lr2_start, self.global_step2,
                                                      self.lr2_decay_step, self.lr2_decay_rate,
                                                      staircase=False, name="learning_rate2")
                # Optimizer
                self.opt2 = tf.train.AdamOptimizer(learning_rate=self.lr2, beta1=0.9, beta2=0.99,
                                                   epsilon=0.0000001)
                # Loss
                weights_ = 1.0
                # weights_ = tf.exp(self.log_softmax - tf.reduce_max(self.log_softmax))  # probs / max_prob
                self.loss2 = tf.losses.mean_squared_error(self.reward, self.critic.predictions,
                                                          weights=weights_)
                tf.summary.scalar('loss2', self.loss2)
                # Minimize step
                gvs2 = self.opt2.compute_gradients(self.loss2)
                capped_gvs2 = [(tf.clip_by_norm(grad, 1.), var) for grad, var in gvs2
                               if grad is not None]  # L2 clip
                self.train_step2 = self.opt2.apply_gradients(capped_gvs2, global_step=self.global_step2)
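# --- Reference check for count3gametes (illustrative sketch, NumPy only) ---
# count3gametes() above scores each ordered pair of mutation columns by the product
# of the counts of (1,0), (0,1) and (1,1) rows, which is nonzero exactly when the
# pair violates the three-gametes rule, then sums over pairs. A plain-NumPy
# equivalent for a single 0/1 matrix (cells x mutations), for sanity checks:
import itertools

import numpy as np


def count3gametes_reference(matrix):
    """Sum over ordered column pairs of (#rows (1,0)) * (#rows (0,1)) * (#rows (1,1))."""
    score = 0
    n_muts = matrix.shape[1]
    for p, q in itertools.permutations(range(n_muts), 2):
        col_p, col_q = matrix[:, p], matrix[:, q]
        n10 = np.count_nonzero(col_p > col_q)        # rows with pattern (1, 0)
        n01 = np.count_nonzero(col_q > col_p)        # rows with pattern (0, 1)
        n11 = np.count_nonzero(col_p + col_q == 2)   # rows with pattern (1, 1)
        score += n10 * n01 * n11                     # nonzero only for a conflicting pair
    return score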
class Actor(object):

    def __init__(self, config):
        self.config = config

        # Data config
        self.batch_size = config.batch_size  # batch size
        self.max_length = config.max_length  # input sequence length (number of cities)
        self.input_dimension = config.input_dimension  # dimension of a city (coordinates)
        self.speed = config.speed  # agent's speed

        # Network config
        self.input_embed = config.input_embed  # dimension of embedding space
        self.num_neurons = config.hidden_dim  # dimension of hidden states (LSTM cell)
        self.initializer = tf.contrib.layers.xavier_initializer()  # variables initializer

        # Reward config
        self.beta = config.beta  # penalty coefficient for time-window violations

        # Training config (actor)
        self.global_step = tf.Variable(0, trainable=False, name="global_step")  # global step
        self.lr1_start = config.lr1_start  # initial learning rate
        self.lr1_decay_rate = config.lr1_decay_rate  # learning rate decay rate
        self.lr1_decay_step = config.lr1_decay_step  # learning rate decay step
        self.is_training = not config.inference_mode

        # Training config (critic)
        self.global_step2 = tf.Variable(0, trainable=False, name="global_step2")  # global step
        self.lr2_start = config.lr1_start  # initial learning rate
        self.lr2_decay_rate = config.lr1_decay_rate  # learning rate decay rate
        self.lr2_decay_step = config.lr1_decay_step  # learning rate decay step

        # Tensor block holding the input sequences [Batch Size, Sequence Length, Features]
        self.input_ = tf.placeholder(
            tf.float32,
            [self.batch_size, self.max_length + 1, self.input_dimension + 2],
            name="input_raw")  # +1 for depot / +2 for TW mean and TW width

        self.build_permutation()
        self.build_critic()
        self.build_reward()
        self.build_optim()
        self.merged = tf.summary.merge_all()

    def build_permutation(self):
        with tf.variable_scope("encoder"):

            with tf.variable_scope("embedding"):
                # Embed input sequence
                W_embed = tf.get_variable("weights",
                                          [1, self.input_dimension + 2, self.input_embed],
                                          initializer=self.initializer)  # +2 for TW features here too
                embedded_input = tf.nn.conv1d(self.input_, W_embed, 1, "VALID",
                                              name="embedded_input")
                # Batch Normalization
                embedded_input = tf.layers.batch_normalization(embedded_input, axis=2,
                                                               training=self.is_training,
                                                               name='layer_norm', reuse=None)

            with tf.variable_scope("dynamic_rnn"):
                # Encode input sequence
                cell1 = LSTMCell(self.num_neurons, initializer=self.initializer)
                # Alternatives: BNLSTMCell(self.num_neurons, self.training) or cell1 = DropoutWrapper(cell1, output_keep_prob=0.9)
                # Return the output activations [Batch size, Sequence Length, Num_neurons]
                # and the last hidden state as tensors.
                encoder_output, encoder_state = tf.nn.dynamic_rnn(cell1, embedded_input,
                                                                  dtype=tf.float32)

        with tf.variable_scope('decoder'):
            # Ptr-net returns permutations (self.positions) with their log-probability for backprop
            self.ptr = Pointer_decoder(encoder_output, self.config)
            self.positions, self.log_softmax, self.attending, self.pointing = self.ptr.loop_decode(encoder_state)
            variable_summaries('log_softmax', self.log_softmax, with_max_min=True)

    def build_critic(self):
        with tf.variable_scope("critic"):
            # Critic predicts reward (parametric baseline for REINFORCE)
            self.critic = Critic(self.config)
            self.critic.predict_rewards(self.input_)
            variable_summaries('predictions', self.critic.predictions, with_max_min=True)

    def build_reward(self):
        with tf.name_scope('permutations'):
            # Reorder the input according to the tour
            self.permutations = tf.stack(
                [tf.tile(tf.expand_dims(tf.range(self.batch_size, dtype=tf.int32), 1),
                         [1, self.max_length + 2]),
                 self.positions], 2)
            self.ordered_input_ = tf.gather_nd(self.input_, self.permutations)
            self.ordered_input_ = tf.transpose(self.ordered_input_, [2, 1, 0])
            # [batch size, seq length + 1, features] to [features, seq length + 1, batch_size]
            # Note: +1 because end = start = depot

            # Ordered coordinates
            ordered_x_ = self.ordered_input_[0]  # [seq length + 1, batch_size]
            delta_x2 = tf.transpose(tf.square(ordered_x_[1:] - ordered_x_[:-1]), [1, 0])  # [batch_size, seq length] delta_x**2
            ordered_y_ = self.ordered_input_[1]  # [seq length + 1, batch_size]
            delta_y2 = tf.transpose(tf.square(ordered_y_[1:] - ordered_y_[:-1]), [1, 0])  # [batch_size, seq length] delta_y**2

            # Ordered TW constraints
            self.ordered_tw_mean_ = tf.transpose(self.ordered_input_[2][:-1], [1, 0])  # [seq length, batch_size] to [batch_size, seq length]
            self.ordered_tw_width_ = tf.transpose(self.ordered_input_[3][:-1], [1, 0])  # [seq length, batch_size] to [batch_size, seq length]

            self.ordered_tw_open_ = self.ordered_tw_mean_ - self.ordered_tw_width_ / 2
            self.ordered_tw_close_ = self.ordered_tw_mean_ + self.ordered_tw_width_ / 2

        with tf.name_scope('environment'):
            # Get tour length (Euclidean distance)
            inter_city_distances = tf.sqrt(delta_x2 + delta_y2)
            # sqrt(delta_x**2 + delta_y**2): Euclidean distance between consecutive cities,
            # depot --> ... --> depot [batch_size, seq length]
            self.distances = tf.reduce_sum(inter_city_distances, axis=1)  # [batch_size]
            variable_summaries('tour_length', self.distances, with_max_min=True)

            # Get time at each city if no constraint
            self.time_at_cities = (1 / self.speed) * tf.cumsum(inter_city_distances, axis=1,
                                                               exclusive=True) - 10
            # [batch size, seq length]  Note: -10 to be on time at the depot (t_mean centered)

            # Apply constraints to each city
            self.constrained_delivery_time = []
            cumul_lateness = 0
            for time_open, delivery_time in zip(tf.unstack(self.ordered_tw_open_, axis=1),
                                                tf.unstack(self.time_at_cities, axis=1)):  # Unstack along the sequence axis
                delayed_delivery = delivery_time + cumul_lateness
                cumul_lateness += tf.maximum(time_open - delayed_delivery,
                                             tf.zeros([self.batch_size]))
                # if you have to wait... wait (impacts further states)
                self.constrained_delivery_time.append(delivery_time + cumul_lateness)
            self.constrained_delivery_time = tf.stack(self.constrained_delivery_time, 1)

            # Define delay from lateness
            self.delay = tf.maximum(
                self.constrained_delivery_time - self.ordered_tw_close_ - 0.0001,
                tf.zeros([self.batch_size, self.max_length + 1]))
            # Delay perceived by the client (who doesn't care if the deliverer waits)
            self.delay = tf.count_nonzero(self.delay, 1)
            variable_summaries('delay', tf.cast(self.delay, tf.float32), with_max_min=True)

            # Define reward from tour length & delay
            self.reward = tf.cast(self.distances, tf.float32) + self.beta * tf.sqrt(
                tf.cast(self.delay, tf.float32))
            variable_summaries('reward', self.reward, with_max_min=True)

    def build_optim(self):
        # Update moving_mean and moving_variance for batch normalization layers
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):

            with tf.name_scope('reinforce'):
                # Actor learning rate
                self.lr1 = tf.train.exponential_decay(self.lr1_start, self.global_step,
                                                      self.lr1_decay_step, self.lr1_decay_rate,
                                                      staircase=False, name="learning_rate1")
                # Optimizer
                self.opt1 = tf.train.AdamOptimizer(learning_rate=self.lr1, beta1=0.9, beta2=0.99,
                                                   epsilon=0.0000001)
                # Advantage (reward minus critic baseline), no gradient flows through it
                self.reward_baseline = tf.stop_gradient(self.reward - self.critic.predictions)  # [Batch size, 1]
                variable_summaries('reward_baseline', self.reward_baseline, with_max_min=True)
                # Loss
                self.loss1 = tf.reduce_mean(self.reward_baseline * self.log_softmax, 0)
                tf.summary.scalar('loss1', self.loss1)
                # Minimize step
                gvs = self.opt1.compute_gradients(self.loss1)
                capped_gvs = [(tf.clip_by_norm(grad, 1.), var) for grad, var in gvs
                              if grad is not None]  # L2 clip
                self.train_step1 = self.opt1.apply_gradients(capped_gvs, global_step=self.global_step)

            with tf.name_scope('state_value'):
                # Critic learning rate
                self.lr2 = tf.train.exponential_decay(self.lr2_start, self.global_step2,
                                                      self.lr2_decay_step, self.lr2_decay_rate,
                                                      staircase=False, name="learning_rate2")
                # Optimizer
                self.opt2 = tf.train.AdamOptimizer(learning_rate=self.lr2, beta1=0.9, beta2=0.99,
                                                   epsilon=0.0000001)
                # Loss
                self.loss2 = tf.losses.mean_squared_error(self.reward, self.critic.predictions,
                                                          weights=1.0)
                tf.summary.scalar('loss2', self.loss2)
                # Minimize step
                gvs2 = self.opt2.compute_gradients(self.loss2)
                capped_gvs2 = [(tf.clip_by_norm(grad, 1.), var) for grad, var in gvs2
                               if grad is not None]  # L2 clip
                self.train_step2 = self.opt2.apply_gradients(capped_gvs2, global_step=self.global_step2)
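# --- Reference for the time-window roll-out above (illustrative sketch, NumPy only) ---
# The loop over `ordered_tw_open_` / `time_at_cities` propagates waiting time forward:
# if the vehicle arrives before a window opens it waits, and that wait shifts every
# later arrival. A per-instance NumPy version of the same recursion, assuming 1-D
# arrays for a single tour:
import numpy as np


def constrained_delivery_times(time_at_cities, tw_open):
    """Shift unconstrained arrival times by the waiting accumulated at closed windows."""
    cumul_lateness = 0.0
    constrained = []
    for arrival, open_t in zip(time_at_cities, tw_open):
        delayed = arrival + cumul_lateness
        cumul_lateness += max(open_t - delayed, 0.0)   # wait if the window is not open yet
        constrained.append(arrival + cumul_lateness)
    return np.asarray(constrained)


# The delay counted by `self.delay` is then the number of strictly late visits:
# np.count_nonzero(np.maximum(constrained_delivery_times(t, tw_open) - tw_close - 1e-4, 0.0))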