Example #1
  def step(self, step_type, reward, obs, prev_state):
    """Step through and return an action.
    This function will only be called once for graph creation and
    the resulting graph will be run repeatedly for agent evaluation.

    All fields below are expected to be batched in the first
    dimension (no time dimension).

    Args:
      step_type: [B,] Current step type.
      reward: [B,] Previous step reward.
      obs: Current observations.
      prev_state: Previous agent state.

    Returns:
      StepOutput
    """

    self._validate_observations(obs)
    with tf.variable_scope(self._name):
      # Flatten graph features for the policy network:
      # convert the observation dict into a GraphsTuple.
      obs['graph_features'] = self._process_graph_features(obs['graph_features'])

      logits, _ = self._model.get_logits(self._model.compute_graph_embeddings(obs),
                                         obs['node_mask'])

      action = sample_from_logits(logits, self.seed)
      return StepOutput(
          action,
          logits,
          self._model.dummy_state(infer_shape(step_type)[0]),
          self._model.dummy_state(infer_shape(step_type)[0]),
      )
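The helper `sample_from_logits` used above is not shown on this page. A minimal sketch, assuming it is a thin wrapper around `tf.random.categorical` that draws one action per batch element from `[B, num_actions]` logits (the exact behavior in the source repository may differ):

import tensorflow as tf

def sample_from_logits(logits, seed=None):
  """Draw one action per batch element from [B, num_actions] logits."""
  # tf.random.categorical returns an int64 tensor of shape [B, 1].
  samples = tf.random.categorical(tf.cast(logits, tf.float32), 1, seed=seed)
  return tf.cast(tf.squeeze(samples, axis=-1), tf.int32)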
Example #2
  def step(self, step_type, reward, obs, prev_state):
    """Step through and return an action.
    This function will only be called once for graph creation and
    the resulting graph will be run repeatedly for agent evaluation.

    All fields below are expected to be batched in the first
    dimension (no time dimension).

    Args:
      step_type: [B,] Current step type.
      reward: [B,] Previous step reward.
      obs: Current observations.
      prev_state: Previous agent state.

    Returns:
      StepOutput
    """

    self._validate_observations(obs)
    with tf.variable_scope(self._name):
      # Flatten graph features for the policy network:
      # convert the observation dict into a GraphsTuple.
      pack_as_structure = dict(**obs['graph_features'])
      obs['graph_features'] = self._process_graph_features(obs['graph_features'])
      ge = self._model.compute_graph_embeddings(obs)
      logitss, actions = self._model.get_actions(ge, obs)
      # Pack the graph embeddings by padding to the maximum node count.
      packed_ge = self._model.pack_graph_embeddings(pack_as_structure, ge)
      return StepOutput(actions, logitss, self._model.dummy_state(infer_shape(step_type)[0]),
                        dict(**packed_ge._asdict()))
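`infer_shape(step_type)[0]` above is used to recover the batch size. A minimal sketch of such a utility, assuming it returns static dimensions where known and dynamic `tf.shape` values otherwise; the repository's own implementation is not shown here:

import tensorflow as tf

def infer_shape(x):
  """Per-dimension shape: Python ints where static, tensors where dynamic."""
  x = tf.convert_to_tensor(x)
  static = x.shape.as_list()
  dynamic = tf.shape(x)
  return [dim if dim is not None else dynamic[i]
          for i, dim in enumerate(static)]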
Example #3
    def step(self, step_type, reward, obs, prev_state):
        """Pick a random discrete action from action_spec."""
        with tf.variable_scope(self._name):
            with tf.name_scope('ur_step'):
                batch_size = tf.shape(step_type)[0]
                # Constant action 0 and all-zero logits for every batch element.
                action = tf.fill((batch_size, ), 0)
                logits = tf.fill(tf.expand_dims(batch_size, 0), 0)
                return StepOutput(action, logits,
                                  self._dummy_state(batch_size))
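`StepOutput` is built positionally here and by keyword in Example #8 (`action`, `logits`, `next_state`). A minimal sketch of a matching definition, assuming it is a plain namedtuple; note that Examples #1 and #2 pass a fourth value, so that version evidently carries an extra field:

import collections

# Field order matches the positional calls in Examples #3 and #5-#7 and the
# keyword call in Example #8.
StepOutput = collections.namedtuple('StepOutput',
                                    ['action', 'logits', 'next_state'])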
Example #4
    def testUpdate(self):
        agent = self._get_agent_instance()
        bs_ph = tf.placeholder_with_default(B, ())
        sess = self.session()

        init_state = agent.initial_state(bs=bs_ph)
        init_state_val = sess.run(init_state)

        step_type = np.zeros((T + 1, B), dtype=np.int32)
        reward = np.zeros((T + 1, B), dtype=np.float32)
        discount = np.zeros((T + 1, B), dtype=np.float32)

        var_type_mask = np.zeros((T + 1, B, N_NODES), dtype=np.int32)
        constraint_type_mask = np.zeros((T + 1, B, N_NODES), dtype=np.int32)
        obj_type_mask = np.zeros((T + 1, B, N_NODES), dtype=np.int32)
        var_type_mask[:, :, 0] = 1
        constraint_type_mask[:, :, 1] = 1
        obj_type_mask[:, :, 2] = 1

        obs = dict(features=np.zeros((T + 1, B, N_NODES), dtype=np.float32),
                   graph_features=self._get_graph_features_update(),
                   node_mask=np.ones(((T + 1), B, N_NODES), dtype=np.int32),
                   var_type_mask=var_type_mask,
                   constraint_type_mask=constraint_type_mask,
                   obj_type_mask=obj_type_mask)

        step_output = StepOutput(action=np.zeros((T, B), dtype=np.int32),
                                 logits=np.zeros((T, B, N_NODES),
                                                 dtype=np.float32),
                                 next_state=np.zeros_like(
                                     np.vstack([init_state_val] * T)))

        step_output, _, step_type, reward, obs, discount = agent.update_preprocess(
            step_output, None, step_type, reward, obs, discount)

        def f(np_arr):
            # Wrap each numpy leaf as a graph constant for build_update_ops.
            return tf.constant(np_arr)

        with tf.variable_scope('update', reuse=tf.AUTO_REUSE):
            agent.build_update_ops(
                nest.map_structure(f, step_output),
                tf.zeros_like(np.vstack([init_state_val] * (T + 1))),
                nest.map_structure(f, step_type),
                nest.map_structure(f, reward), nest.map_structure(f, obs),
                nest.map_structure(f, discount))

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        for _ in range(3):
            agent.update(sess, {}, {})
            print('.', end='')
        print('')
        print('Done!')
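The test wraps every numpy leaf with `nest.map_structure(f, ...)`. A small self-contained illustration of that pattern, assuming `nest` refers to TensorFlow's structure utilities (imported here from `tensorflow.python.util`, one common choice in TF1 code):

import numpy as np
import tensorflow as tf
from tensorflow.python.util import nest

obs = dict(features=np.zeros((4, 2, 8), dtype=np.float32),
           node_mask=np.ones((4, 2, 8), dtype=np.int32))
# map_structure applies tf.constant to every leaf array while preserving the
# dict layout, so the result can be passed wherever the numpy nest was used.
obs_tensors = nest.map_structure(tf.constant, obs)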
Example #5
  def step(self, step_type, reward, obs, prev_state):
    """Step through and return an action.
    This function will only be called once for graph creation and
    the resulting graph will be run repeatedly for agent evaluation.

    All fields below are expected to be batched in the first
    dimension (no time dimension).

    Args:
      step_type: [B,] Current step type.
      reward: [B,] Previous step reward.
      obs: Current observations.
      prev_state: Previous agent state.

    Returns:
      StepOutput
    """
    with tf.variable_scope(self._name):
      if self.config.evaluation_mode:
        # Evaluation mode: run the GCN agent in the evaluators.
        # Flatten graph features for the policy network:
        # convert the observation dict into a GraphsTuple.
        graph_features = gn.graphs.GraphsTuple(**obs['graph_features'])
        obs['graph_features'] = flatten_graphs(graph_features)

        logits, _ = self._model.get_logits(
            self._model.compute_graph_embeddings(obs), obs['node_mask'])

        action = sample_from_logits(logits, self.seed)
        return StepOutput(action, logits,
                          self._model.dummy_state(infer_shape(step_type)[0]))
      else:
        # Training mode: use the MLP model to imitate in the actors.
        logits, next_state, _ = self._mlp_model.get_logits_and_next_state(
            step_type, reward, obs, prev_state)
        action = sample_from_logits(logits, self.seed)
      return StepOutput(action, logits, next_state)
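`gn.graphs.GraphsTuple(**obs['graph_features'])` expects the dict to carry the seven GraphsTuple fields from the graph_nets library. A self-contained sketch with hypothetical shapes (three nodes, two edges, one graph):

import numpy as np
import graph_nets as gn

graph_features = dict(
    nodes=np.zeros((3, 4), dtype=np.float32),    # [n_node, node_feature_dim]
    edges=np.zeros((2, 4), dtype=np.float32),    # [n_edge, edge_feature_dim]
    globals=np.zeros((1, 4), dtype=np.float32),  # [n_graph, global_feature_dim]
    senders=np.array([0, 1], dtype=np.int32),    # edge source node indices
    receivers=np.array([1, 2], dtype=np.int32),  # edge target node indices
    n_node=np.array([3], dtype=np.int32),        # nodes per graph
    n_edge=np.array([2], dtype=np.int32),        # edges per graph
)
graphs_tuple = gn.graphs.GraphsTuple(**graph_features)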
Example #6
    def step(self, step_type, reward, obs, prev_state):
        """Step through and return an action.
        This function will only be called once for graph creation and
        the resulting graph will be run repeatedly for agent evaluation.

        All fields below are expected to be batched in the first
        dimension (no time dimension).

        Args:
          step_type: [B,] Current step type.
          reward: [B,] Previous step reward.
          obs: Current observations.
          prev_state: Previous agent state.

        Returns:
          StepOutput
        """
        with tf.variable_scope(self._name):
            logits, next_state, _ = self._model.get_logits_and_next_state(
                step_type, reward, obs, prev_state)
            action = sample_from_logits(logits, self.seed)
            return StepOutput(action, logits, next_state)
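The docstring's contract (build the step graph once, then run the same ops repeatedly) can be seen with a toy stand-in; `toy_step` below is purely illustrative and is not the real agent:

import numpy as np
import tensorflow as tf

def toy_step(step_type):
  # A trivial stand-in for agent.step: zero logits, greedy action.
  batch_size = tf.shape(step_type)[0]
  logits = tf.zeros((batch_size, 4))
  action = tf.argmax(logits, axis=-1, output_type=tf.int32)
  return action, logits

step_type_ph = tf.placeholder(tf.int32, [None])
action_op, logits_op = toy_step(step_type_ph)  # graph is built exactly once

with tf.Session() as sess:
  for _ in range(3):  # the same ops are then evaluated repeatedly
    sess.run(action_op, feed_dict={step_type_ph: np.zeros(2, np.int32)})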
Example #7
    def step(self, step_type, reward, obs, prev_state):
        """Pick a random discrete action from action_spec."""
        with tf.variable_scope(self._name):
            with tf.name_scope('ur_step'):
                batch_size = tf.shape(step_type)[0]
                if 'mask' in obs:
                    logits = tf.cast(tf.identity(obs['mask']), tf.float32)
                    # Scale the 0/1 mask by a large constant so sampling from
                    # these logits is (near-)uniform over unmasked actions.
                    logits *= 1e9
                    action = sample_from_logits(logits, self.seed)
                else:
                    base = tf.random.uniform(self._action_spec.shape,
                                             dtype=tf.float32,
                                             minval=0,
                                             maxval=1)

                    L = self._action_spec.minimum
                    R = self._action_spec.maximum

                    action = tf.cast(L + (base * (R - L)),
                                     self._action_spec.dtype)
                    logits = tf.fill(tf.expand_dims(batch_size, 0), 0)
                return StepOutput(action, logits,
                                  self._dummy_state(batch_size))
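`self._dummy_state(batch_size)` is not shown on this page. A minimal sketch, assuming the agent is effectively stateless and only needs a zero-filled per-example placeholder state:

import tensorflow as tf

def _dummy_state(batch_size):
  # One zero per batch element; shape [B,], mirroring the tf.fill pattern above.
  return tf.fill(tf.expand_dims(batch_size, 0), 0)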
Example #8
    def start(self, step_type, reward, discount, observation, next_state):
        # The first transition carries no action or logits, only the state.
        self.add(step_type, reward, discount, observation,
                 StepOutput(next_state=next_state, action=None, logits=None))
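A minimal sketch of the container this `start` method appears to belong to. The class name and the `add` signature are assumptions for illustration only, and the three-field `StepOutput` namedtuple is repeated here to keep the sketch self-contained:

import collections

StepOutput = collections.namedtuple('StepOutput',
                                    ['action', 'logits', 'next_state'])

class Trajectory:
  """Hypothetical transition buffer that `start` would initialize."""

  def __init__(self):
    self._transitions = []

  def add(self, step_type, reward, discount, observation, step_output):
    self._transitions.append(
        (step_type, reward, discount, observation, step_output))

  def start(self, step_type, reward, discount, observation, next_state):
    # The very first transition has no action or logits yet, only the
    # initial agent state.
    self.add(step_type, reward, discount, observation,
             StepOutput(next_state=next_state, action=None, logits=None))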
Example #9
    def testUpdate(self):
        self._setup()
        agent = self._get_agent_instance()
        bs_ph = tf.placeholder_with_default(B, ())
        sess = self.session()

        init_state = agent.initial_state(bs=bs_ph)
        init_state_val = sess.run(init_state)

        step_type = np.zeros((T + 1, B), dtype=np.int32)
        reward = np.zeros((T + 1, B), dtype=np.float32)
        discount = np.zeros((T + 1, B), dtype=np.float32)
        obs = dict(features=np.zeros((T + 1, B, N_NODES), dtype=np.float32),
                   graph_features=self._get_graph_features_update(),
                   node_mask=np.ones(((T + 1), B, N_NODES), dtype=np.int32))

        step_output = StepOutput(action=np.zeros((T, B), dtype=np.int32),
                                 logits=np.zeros((T, B, N_NODES),
                                                 dtype=np.float32),
                                 next_state=np.zeros_like(
                                     np.vstack([init_state_val] * T)))

        step_output, _, step_type, reward, obs, discount = agent.update_preprocess(
            step_output, None, step_type, reward, obs, discount)

        feed_dict = {}

        def f(np_arr):
            # Create a placeholder per numpy leaf and record its feed value.
            ph = tf.placeholder(shape=np_arr.shape, dtype=np_arr.dtype)
            feed_dict[ph] = np_arr
            return ph

        with tf.variable_scope('update', reuse=tf.AUTO_REUSE):
            agent.build_update_ops(
                nest.map_structure(f, step_output),
                tf.zeros_like(np.vstack([init_state_val] * (T + 1))),
                nest.map_structure(f, step_type),
                nest.map_structure(f, reward), nest.map_structure(f, obs),
                nest.map_structure(f, discount))

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        N_ITERS = 50
        for i in range(N_ITERS):
            profile_kwargs = {}
            if i == N_ITERS - 1:
                run_metadata = tf.RunMetadata()
                profile_kwargs = dict(options=tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE),
                                      run_metadata=run_metadata)

            agent.update(sess, feed_dict, profile_kwargs)
            print('.', end='')

        print('')

        # Save the timeline collected on the final (profiled) iteration.
        tl = timeline.Timeline(run_metadata.step_stats)
        ctf = tl.generate_chrome_trace_format()
        export_path = '/tmp/'
        with open(os.path.join(export_path, 'timeline.json'), 'w') as f:
            f.write(ctf)
        print('Done!')
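The FULL_TRACE / chrome-trace flow at the end of this test also works standalone. A self-contained sketch on a toy op (the output path is arbitrary); the resulting JSON can be loaded in chrome://tracing:

import os
import tensorflow as tf
from tensorflow.python.client import timeline

x = tf.matmul(tf.random.uniform((64, 64)), tf.random.uniform((64, 64)))

with tf.Session() as sess:
  run_metadata = tf.RunMetadata()
  sess.run(x,
           options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
           run_metadata=run_metadata)

  # Convert the collected step stats into a chrome trace file.
  tl = timeline.Timeline(run_metadata.step_stats)
  with open(os.path.join('/tmp', 'toy_timeline.json'), 'w') as f:
    f.write(tl.generate_chrome_trace_format())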