import tensorflow as tf


def build_graph():
    # Create shared global policy and value networks
    s, p_network, v_network, p_params, v_params = build_policy_and_value_networks(
        num_actions=ACTIONS,
        agent_history_length=AGENT_HISTORY_LENGTH,
        resized_width=RESIZED_WIDTH,
        resized_height=RESIZED_HEIGHT)

    # Shared global optimizer
    optimizer = tf.train.AdamOptimizer(LEARNING_RATE)

    # Placeholders for discounted returns and one-hot encoded actions
    R_t = tf.placeholder("float", [None])
    a_t = tf.placeholder("float", [None, ACTIONS])

    # Log-probability of the action actually taken (tf.mul and
    # reduction_indices are deprecated in favor of tf.multiply and axis)
    log_prob = tf.log(tf.reduce_sum(tf.multiply(p_network, a_t), axis=1))

    # Policy loss weighted by the advantage R_t - V(s); stop_gradient keeps
    # the policy update from also training the value network, and reduce_mean
    # makes the loss a scalar rather than a per-example vector
    p_loss = -tf.reduce_mean(log_prob * (R_t - tf.stop_gradient(v_network)))

    # Value loss: mean squared error between the return and the value estimate
    v_loss = tf.reduce_mean(tf.square(R_t - v_network))

    # Combined loss with the value term weighted by 0.5
    total_loss = p_loss + (0.5 * v_loss)

    minimize = optimizer.minimize(total_loss)
    return s, a_t, R_t, minimize
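# --- Illustrative helper (an assumption, not part of the original source) ---
# The R_t placeholder above expects discounted n-step returns. This is one
# minimal way a worker thread might compute them from a rollout, bootstrapping
# from the value estimate of the state after the last step; the function name
# and the gamma default are hypothetical.
def discount_returns(rewards, bootstrap_value, gamma=0.99):
    returns = []
    R = bootstrap_value  # V(s) estimate for the state following the rollout
    for r in reversed(rewards):
        R = r + gamma * R  # R_t = r_t + gamma * R_{t+1}
        returns.append(R)
    returns.reverse()
    return returns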
def build_graph():
    # Create shared global policy and value networks
    s, p_network, v_network, p_params, v_params = build_policy_and_value_networks(
        num_actions=NUM_ACTIONS, input_shape=STATE_SHAPE)

    # Shared global optimizer
    optimizer = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE, decay=DECAY)

    # Placeholders for discounted returns and one-hot encoded actions
    R_t = tf.placeholder("float", [None])
    a_t = tf.placeholder("float", [None, NUM_ACTIONS])

    # Log-probability of the action actually taken
    log_prob = tf.log(tf.reduce_sum(p_network * a_t, axis=1))

    # Policy loss weighted by the advantage R_t - V(s); stop_gradient keeps
    # the policy update from also training the value network, and reduce_mean
    # makes the loss a scalar rather than a per-example vector
    p_loss = -tf.reduce_mean(log_prob * (R_t - tf.stop_gradient(v_network)))

    # Value loss: mean squared error between the return and the value estimate
    v_loss = tf.reduce_mean(tf.square(R_t - v_network))

    total_loss = p_loss + (0.5 * v_loss)

    minimize = optimizer.minimize(total_loss)
    return s, a_t, R_t, minimize, p_network, v_network
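# --- Usage sketch (an assumption, not part of the original source) ---
# How an actor-learner thread might apply one update with the ops returned by
# build_graph(): feed a batch of states, one-hot actions, and discounted
# returns into the placeholders and run the shared minimize op. The function
# name and argument names are hypothetical.
def train_step(session, graph_ops, states, actions_one_hot, returns):
    s, a_t, R_t, minimize, p_network, v_network = graph_ops
    session.run(minimize, feed_dict={s: states,
                                     a_t: actions_one_hot,
                                     R_t: returns})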