def step(self, step_type, reward, obs, prev_state):
  """Step through and return an action.

  This function will only be called once for graph creation, and the
  resulting graph will be run repeatedly for agent evaluation.

  All the fields below are expected to be batched in the first
  dimension (no time dimension).

  Args:
    step_type: [B,] Current step type.
    reward: [B,] Previous step reward.
    obs: Current observations.
    prev_state: Previous agent state.

  Returns:
    StepOutput
  """
  self._validate_observations(obs)
  with tf.variable_scope(self._name):
    # Flatten graph features for the policy network and convert the
    # observation dict to a GraphsTuple.
    obs['graph_features'] = self._process_graph_features(obs['graph_features'])
    logits, _ = self._model.get_logits(
        self._model.compute_graph_embeddings(obs), obs['node_mask'])
    action = sample_from_logits(logits, self.seed)
    return StepOutput(
        action,
        logits,
        self._model.dummy_state(infer_shape(step_type)[0]),
        self._model.dummy_state(infer_shape(step_type)[0]),
    )
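
# A minimal sketch of the `sample_from_logits` helper used by the step
# functions in this file. This is an assumption about its behaviour, not the
# actual implementation: it is assumed to draw one action per batch element
# from a categorical distribution over the given logits, using the seed for
# reproducibility.
def sample_from_logits(logits, seed):
  # logits: [B, num_actions] float tensor; returns [B] int32 actions.
  samples = tf.random.categorical(logits, num_samples=1, seed=seed)
  return tf.squeeze(tf.cast(samples, tf.int32), axis=-1)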
def step(self, step_type, reward, obs, prev_state):
  """Step through and return an action.

  This function will only be called once for graph creation, and the
  resulting graph will be run repeatedly for agent evaluation.

  All the fields below are expected to be batched in the first
  dimension (no time dimension).

  Args:
    step_type: [B,] Current step type.
    reward: [B,] Previous step reward.
    obs: Current observations.
    prev_state: Previous agent state.

  Returns:
    StepOutput
  """
  self._validate_observations(obs)
  with tf.variable_scope(self._name):
    # Flatten graph features for the policy network and convert the
    # observation dict to a GraphsTuple.
    pack_as_structure = dict(**obs['graph_features'])
    obs['graph_features'] = self._process_graph_features(obs['graph_features'])
    ge = self._model.compute_graph_embeddings(obs)
    logitss, actions = self._model.get_actions(ge, obs)
    # Pack by padding to the maximum number of nodes.
    packed_ge = self._model.pack_graph_embeddings(pack_as_structure, ge)
    return StepOutput(actions, logitss,
                      self._model.dummy_state(infer_shape(step_type)[0]),
                      dict(**packed_ge._asdict()))
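
# A hedged sketch of the `infer_shape` helper referenced above. It is assumed
# to return a list of per-dimension sizes, preferring static shape information
# and falling back to dynamic `tf.shape` entries for unknown dimensions; the
# real helper may differ.
def infer_shape(tensor):
  static_shape = tensor.shape.as_list()
  dynamic_shape = tf.shape(tensor)
  return [
      dynamic_shape[i] if dim is None else dim
      for i, dim in enumerate(static_shape)
  ]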
def step(self, step_type, reward, obs, prev_state):
  """Pick a random discrete action from action_spec."""
  with tf.variable_scope(self._name):
    with tf.name_scope('ur_step'):
      batch_size = tf.shape(step_type)[0]
      action = tf.fill((batch_size,), 0)
      logits = tf.fill(tf.expand_dims(batch_size, 0), 0)
      return StepOutput(action, logits, self._dummy_state(batch_size))
def testUpdate(self):
  agent = self._get_agent_instance()
  bs_ph = tf.placeholder_with_default(B, ())
  sess = self.session()
  init_state = agent.initial_state(bs=bs_ph)
  init_state_val = sess.run(init_state)

  step_type = np.zeros((T + 1, B), dtype=np.int32)
  reward = np.zeros((T + 1, B), dtype=np.float32)
  discount = np.zeros((T + 1, B), dtype=np.float32)

  var_type_mask = np.zeros((T + 1, B, N_NODES), dtype=np.int32)
  constraint_type_mask = np.zeros((T + 1, B, N_NODES), dtype=np.int32)
  obj_type_mask = np.zeros((T + 1, B, N_NODES), dtype=np.int32)
  var_type_mask[:, :, 0] = 1
  constraint_type_mask[:, :, 1] = 1
  obj_type_mask[:, :, 2] = 1

  obs = dict(features=np.zeros((T + 1, B, N_NODES), dtype=np.float32),
             graph_features=self._get_graph_features_update(),
             node_mask=np.ones((T + 1, B, N_NODES), dtype=np.int32),
             var_type_mask=var_type_mask,
             constraint_type_mask=constraint_type_mask,
             obj_type_mask=obj_type_mask)

  step_output = StepOutput(action=np.zeros((T, B), dtype=np.int32),
                           logits=np.zeros((T, B, N_NODES), dtype=np.float32),
                           next_state=np.zeros_like(
                               np.vstack([init_state_val] * T)))
  step_output, _, step_type, reward, obs, discount = agent.update_preprocess(
      step_output, None, step_type, reward, obs, discount)

  def f(np_arr):
    return tf.constant(np_arr)

  with tf.variable_scope('update', reuse=tf.AUTO_REUSE):
    agent.build_update_ops(
        nest.map_structure(f, step_output),
        tf.zeros_like(np.vstack([init_state_val] * (T + 1))),
        nest.map_structure(f, step_type), nest.map_structure(f, reward),
        nest.map_structure(f, obs), nest.map_structure(f, discount))

  sess.run(tf.global_variables_initializer())
  sess.run(tf.local_variables_initializer())

  for _ in range(3):
    agent.update(sess, {}, {})
    print('.', end='')
  print('')
  print('Done!')
def step(self, step_type, reward, obs, prev_state):
  """Step through and return an action.

  This function will only be called once for graph creation, and the
  resulting graph will be run repeatedly for agent evaluation.

  All the fields below are expected to be batched in the first
  dimension (no time dimension).

  Args:
    step_type: [B,] Current step type.
    reward: [B,] Previous step reward.
    obs: Current observations.
    prev_state: Previous agent state.

  Returns:
    StepOutput
  """
  with tf.variable_scope(self._name):
    if self.config.evaluation_mode:
      # Evaluate the GCN agent at the evaluators.
      # Flatten graph features for the policy network and convert the
      # observation dict to a GraphsTuple.
      graph_features = gn.graphs.GraphsTuple(**obs['graph_features'])
      obs['graph_features'] = flatten_graphs(graph_features)
      logits, _ = self._model.get_logits(
          self._model.compute_graph_embeddings(obs), obs['node_mask'])
      action = sample_from_logits(logits, self.seed)
      return StepOutput(action, logits,
                        self._model.dummy_state(infer_shape(step_type)[0]))
    else:
      # Use the MLP to imitate during training mode in the actors.
      logits, next_state, _ = self._mlp_model.get_logits_and_next_state(
          step_type, reward, obs, prev_state)
      action = sample_from_logits(logits, self.seed)
      return StepOutput(action, logits, next_state)
def step(self, step_type, reward, obs, prev_state):
  """Step through and return an action.

  This function will only be called once for graph creation, and the
  resulting graph will be run repeatedly for agent evaluation.

  All the fields below are expected to be batched in the first
  dimension (no time dimension).

  Args:
    step_type: [B,] Current step type.
    reward: [B,] Previous step reward.
    obs: Current observations.
    prev_state: Previous agent state.

  Returns:
    StepOutput
  """
  with tf.variable_scope(self._name):
    logits, next_state, _ = self._model.get_logits_and_next_state(
        step_type, reward, obs, prev_state)
    action = sample_from_logits(logits, self.seed)
    return StepOutput(action, logits, next_state)
def step(self, step_type, reward, obs, prev_state):
  """Pick a random discrete action from action_spec."""
  with tf.variable_scope(self._name):
    with tf.name_scope('ur_step'):
      batch_size = tf.shape(step_type)[0]
      if 'mask' in obs:
        # Scale the 0/1 mask by a large constant so that masked-out entries
        # get negligible probability and sampling is roughly uniform over
        # the allowed actions.
        logits = tf.cast(tf.identity(obs['mask']), tf.float32)
        logits *= 1e9
        action = sample_from_logits(logits, self.seed)
      else:
        # Draw a uniform sample and rescale it to the action spec's
        # [minimum, maximum] range.
        base = tf.random.uniform(self._action_spec.shape,
                                 dtype=tf.float32,
                                 minval=0,
                                 maxval=1)
        L = self._action_spec.minimum
        R = self._action_spec.maximum
        action = tf.cast(L + (base * (R - L)), self._action_spec.dtype)
        logits = tf.fill(tf.expand_dims(batch_size, 0), 0)
      return StepOutput(action, logits, self._dummy_state(batch_size))
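
# Illustration (not part of the agent) of how the mask branch above behaves:
# after scaling a 0/1 mask by 1e9, the softmax over the resulting logits
# assigns essentially zero probability to masked-out entries, so sampling is
# roughly uniform over the allowed actions. The mask values are made up.
def _masked_sampling_example():
  mask = tf.constant([[1, 0, 1]], dtype=tf.float32)
  logits = mask * 1e9
  probs = tf.nn.softmax(logits)  # ~= [[0.5, 0.0, 0.5]]
  return probs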
def start(self, step_type, reward, discount, observation, next_state):
  self.add(step_type, reward, discount, observation,
           StepOutput(next_state=next_state, action=None, logits=None))
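
# A hedged sketch of the `StepOutput` structure used throughout this code. It
# is assumed to be a namedtuple with `action`, `logits` and `next_state`
# fields; the trailing `extra` field below is a hypothetical placeholder with
# a default, added only to account for call sites that pass a fourth value.
import collections

StepOutput = collections.namedtuple(
    'StepOutput', ['action', 'logits', 'next_state', 'extra'])
StepOutput.__new__.__defaults__ = (None,)  # only `extra` is optional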
def testUpdate(self):
  self._setup()
  agent = self._get_agent_instance()
  bs_ph = tf.placeholder_with_default(B, ())
  sess = self.session()
  init_state = agent.initial_state(bs=bs_ph)
  init_state_val = sess.run(init_state)

  step_type = np.zeros((T + 1, B), dtype=np.int32)
  reward = np.zeros((T + 1, B), dtype=np.float32)
  discount = np.zeros((T + 1, B), dtype=np.float32)
  obs = dict(features=np.zeros((T + 1, B, N_NODES), dtype=np.float32),
             graph_features=self._get_graph_features_update(),
             node_mask=np.ones((T + 1, B, N_NODES), dtype=np.int32))

  step_output = StepOutput(action=np.zeros((T, B), dtype=np.int32),
                           logits=np.zeros((T, B, N_NODES), dtype=np.float32),
                           next_state=np.zeros_like(
                               np.vstack([init_state_val] * T)))
  step_output, _, step_type, reward, obs, discount = agent.update_preprocess(
      step_output, None, step_type, reward, obs, discount)

  feed_dict = {}

  def f(np_arr):
    ph = tf.placeholder(shape=np_arr.shape, dtype=np_arr.dtype)
    feed_dict[ph] = np_arr
    return ph

  with tf.variable_scope('update', reuse=tf.AUTO_REUSE):
    agent.build_update_ops(
        nest.map_structure(f, step_output),
        tf.zeros_like(np.vstack([init_state_val] * (T + 1))),
        nest.map_structure(f, step_type), nest.map_structure(f, reward),
        nest.map_structure(f, obs), nest.map_structure(f, discount))

  sess.run(tf.global_variables_initializer())
  sess.run(tf.local_variables_initializer())

  N_ITERS = 50
  for i in range(N_ITERS):
    profile_kwargs = {}
    if i == N_ITERS - 1:
      run_metadata = tf.RunMetadata()
      profile_kwargs = dict(
          options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
          run_metadata=run_metadata)
    agent.update(sess, feed_dict, profile_kwargs)
    print('.', end='')
  print('')

  # Save the final timeline.
  tl = timeline.Timeline(run_metadata.step_stats)
  ctf = tl.generate_chrome_trace_format()
  export_path = '/tmp/'
  with open(os.path.join(export_path, 'timeline.json'), 'w') as f:
    f.write(ctf)
  print('Done!')
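
# Note: the exported Chrome trace at /tmp/timeline.json can be inspected by
# opening chrome://tracing in a Chromium-based browser and loading the file.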