def nested_fill(m, n):
  mat = []
  ag.set_element_type(mat, tf.int32)
  for _ in range(m):
    l = []
    ag.set_element_type(l, tf.int32)
    for j in range(n):
      l.append(j)
    mat.append(ag.stack(l, strict=False))
  return ag.stack(mat, strict=False)
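# A minimal usage sketch (not part of the original snippets): each function
# in this file is meant to be converted with AutoGraph, which stages the
# annotated lists as TensorArrays and the Python loops as graph loops.
# Assumes TF 1.x with the contrib-era API.
import tensorflow as tf
from tensorflow.contrib import autograph as ag

tf_nested_fill = ag.to_graph(nested_fill)

with tf.Graph().as_default():
  # With Tensor arguments, range(m) is staged as a dynamic graph loop.
  mat = tf_nested_fill(tf.constant(3), tf.constant(4))
  with tf.Session() as sess:
    print(sess.run(mat))  # 3x4 matrix; every row is [0, 1, 2, 3]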
def element_update():
  l = []
  l.append(1)
  l.append(2)
  l.append(3)
  ag.set_element_type(l, tf.int32)
  l[1] = 5
  return ag.stack(l, strict=False)
def read_write_loop(n):
  l = []
  l.append(1)
  l.append(1)
  ag.set_element_type(l, tf.int32)
  for i in range(2, n):
    l.append(l[i - 1] + l[i - 2])
    l[i - 2] = -l[i - 2]
  return ag.stack(l, strict=False)
def simple_empty(n):
  l = []
  l.append(1)
  l.append(2)
  l.append(3)
  l.append(4)
  ag.set_element_type(l, tf.int32, ())
  s = 0
  for _ in range(n):
    s += l.pop()
  return ag.stack(l, strict=False), s
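# Worked example (assuming the same conversion as above): simple_empty(2)
# pops 4 and then 3, so it returns the stacked tensor [1, 2] together with
# s == 7. The extra `()` argument to set_element_type declares the element
# shape; a staged pop() relies on it to give the popped value a static
# scalar shape.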
def graph_train_model(policy_network, cart_pole_env, optimizer, iterations):
  """Trains the policy network for a given number of iterations."""
  i = tf.constant(0)
  mean_steps_per_iteration = []
  ag.set_element_type(mean_steps_per_iteration, tf.int32)

  while i < iterations:
    steps_per_game = policy_network.train(
        cart_pole_env,
        optimizer,
        discount_rate=0.95,
        num_games=20,
        max_steps_per_game=200)
    mean_steps_per_iteration.append(tf.reduce_mean(steps_per_game))
    i += 1

  return ag.stack(mean_steps_per_iteration)
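# Staging note (an assumption based on AutoGraph's conversion rules, not
# stated in the code): because `i` is initialized to tf.constant(0), the
# `while i < iterations` loop above is staged as a single dynamic
# tf.while_loop rather than unrolled in Python, which is why `iterations`
# may itself be a Tensor.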
def train(self, cart_pole_env, optimizer, discount_rate, num_games,
          max_steps_per_game):
  var_list = tf.trainable_variables()
  grad_list = [
      tf.TensorArray(tf.float32, 0, dynamic_size=True) for _ in var_list
  ]
  step_counts = []
  discounted_rewards = []
  ag.set_element_type(discounted_rewards, tf.float32)
  ag.set_element_type(step_counts, tf.int32)

  # Note: cart_pole_env is a shared Python object. Because calls to its
  # methods are made through py_func, TensorFlow cannot detect the data
  # dependencies between them. Hence we must manually synchronize access to
  # the object and set control dependencies so that calls to reset(), step(),
  # etc. are made in the correct order.
  sync_counter = tf.constant(0)

  for _ in tf.range(num_games):
    with tf.control_dependencies([sync_counter]):
      obs = cart_pole_env.reset()
    with tf.control_dependencies([obs]):
      sync_counter += 1

    game_rewards = []
    ag.set_element_type(game_rewards, tf.float32)

    for step in tf.range(max_steps_per_game):
      logits, actions = self(obs)  # pylint:disable=not-callable
      logits = tf.reshape(logits, ())
      actions = tf.reshape(actions, ())

      labels = 1.0 - tf.cast(actions, tf.float32)
      loss = tf.nn.sigmoid_cross_entropy_with_logits(
          labels=labels, logits=logits)
      grads = tf.gradients(loss, var_list)

      for i in range(len(grads)):
        grad_list[i].append(grads[i])

      with tf.control_dependencies([sync_counter]):
        obs, reward, done = cart_pole_env.step(actions)
      with tf.control_dependencies([obs]):
        sync_counter += 1
      obs = tf.reshape(obs, (1, 4))

      game_rewards.append(reward)
      if reward < 0.1 or done:
        step_counts.append(step + 1)
        break

    discounted_rewards = graph_append_discounted_rewards(
        discounted_rewards, game_rewards, discount_rate)

  discounted_rewards = ag.stack(discounted_rewards)
  discounted_rewards.set_shape((None,))
  mean, variance = tf.nn.moments(discounted_rewards, [0])
  normalized_rewards = (discounted_rewards - mean) / tf.sqrt(variance)

  for i in range(len(grad_list)):
    g = ag.stack(grad_list[i])

    # This block just adjusts the shapes to match for multiplication.
    r = normalized_rewards
    if r.shape.ndims < g.shape.ndims:
      r = tf.expand_dims(r, -1)
    if r.shape.ndims < g.shape.ndims:
      r = tf.expand_dims(r, -1)

    grad_list[i] = tf.reduce_mean(g * r, axis=0)

  optimizer.apply_gradients(
      zip(grad_list, var_list), global_step=tf.train.get_global_step())

  return ag.stack(step_counts)
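# A stand-alone sketch of the synchronization trick used in train() above
# (hypothetical example; `env.reset` and `env.step_once` are placeholder
# callables returning float32 numpy values). A counter tensor threads a
# serial ordering through py_func calls whose side effects TensorFlow
# cannot see.
import tensorflow as tf

def serialized_env_calls(env):
  sync_counter = tf.constant(0)
  with tf.control_dependencies([sync_counter]):
    obs = tf.py_func(env.reset, [], tf.float32)
  with tf.control_dependencies([obs]):
    sync_counter += 1  # the bump cannot run before reset() has produced obs
  with tf.control_dependencies([sync_counter]):
    nxt = tf.py_func(env.step_once, [], tf.float32)  # so step runs after reset
  return obs, nxt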
def simple_fill(n):
  l = []
  ag.set_element_type(l, tf.int32)
  for i in range(n):
    l.append(i)
  return ag.stack(l, strict=False)
def type_not_annotated(n):
  l = []
  # TODO(mdan): Here, we ought to infer the dtype and shape when i is staged.
  for i in range(n):
    l.append(i)
  return ag.stack(l, strict=False)