Example No. 1
    def _prepare_networks(self, hparams, sess):
        self.action = tf.placeholder(shape=(1, ), dtype=tf.int32)
        batch_env = batch_env_factory(hparams)
        self.reward, self.done = batch_env.simulate(self.action)
        self.observation = batch_env.observ
        self.reset_op = batch_env.reset(tf.constant([0], dtype=tf.int32))

        environment_wrappers = hparams.environment_spec.wrappers
        wrappers = copy.copy(
            environment_wrappers) if environment_wrappers else []

        to_initialize = [batch_env]
        for w in wrappers:
            batch_env = w[0](batch_env, **w[1])
            to_initialize.append(batch_env)

        def initialization_lambda():
            for batch_env in to_initialize:
                batch_env.initialize(sess)

        self.initialize = initialization_lambda

        obs_copy = batch_env.observ + 0

        actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
        self.policy_probs = actor_critic.policy.probs[0, 0, :]
        self.value = actor_critic.value[0, :]
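
The tensors and callables wired up above are meant to be driven from a TensorFlow session. Below is a minimal, hypothetical usage sketch; agent stands for an instance of the (not shown) class that defines _prepare_networks, and sess for an already-created tf.Session, neither of which is part of the example:

# Hypothetical driver; agent, hparams and sess are assumed to exist already.
agent._prepare_networks(hparams, sess)
sess.run(tf.global_variables_initializer())
agent.initialize()                # runs initialize(sess) on every wrapped env
sess.run(agent.reset_op)          # reset environment 0
probs, value = sess.run([agent.policy_probs, agent.value])
action = int(probs.argmax())      # greedy action, for illustration only
reward, done = sess.run(
    [agent.reward, agent.done], feed_dict={agent.action: [action]})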
Example No. 2
def define_collect(hparams,
                   scope,
                   eval_phase,
                   collect_level=-1,
                   policy_to_actions_lambda=None,
                   on_simulated=False):
    """Collect trajectories."""

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        batch_env = batch_env_factory(hparams)
        environment_wrappers = hparams.environment_spec.wrappers
        wrappers = copy.copy(
            environment_wrappers) if environment_wrappers else []
        # Put the memory wrapper at the level at which you want to gather
        # observations.
        # Negative indices need to be shifted for insert to work correctly.
        collect_level = collect_level if \
          collect_level >= 0 else len(wrappers) + collect_level + 1
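        # For example, with two wrappers already in the list, collect_level=-1
        # maps to index 2 (the memory wrapper ends up outermost), while
        # collect_level=-2 maps to index 1 (between the two existing wrappers).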
        wrappers.insert(collect_level, [_MemoryWrapper, {}])
        rollout_metadata = None
        speculum = None
        for w in wrappers:
            batch_env = w[0](batch_env, **w[1])
            if w[0] == _MemoryWrapper:
                rollout_metadata = _rollout_metadata(batch_env)
                speculum = batch_env.speculum

        eval_phase = tf.convert_to_tensor(eval_phase)
        on_simulated = tf.convert_to_tensor(on_simulated)

        memory = [
            tf.get_variable("collect_memory_{}".format(name),
                            shape=[hparams.epoch_length] + shape,
                            dtype=dtype,
                            initializer=tf.zeros_initializer(),
                            trainable=False)
            for (shape, dtype, name) in rollout_metadata
        ]

        cumulative_rewards = tf.get_variable("cumulative_rewards",
                                             len(batch_env),
                                             trainable=False)

        should_reset_var = tf.Variable(True, trainable=False)

        zeros_tensor = tf.zeros(len(batch_env))

    def group():
        return tf.group(batch_env.reset(tf.range(len(batch_env))),
                        tf.assign(cumulative_rewards, zeros_tensor))

    reset_op = tf.cond(
        tf.logical_or(should_reset_var, tf.logical_or(eval_phase,
                                                      on_simulated)), group,
        tf.no_op)

    with tf.control_dependencies([reset_op]):
        reset_once_op = tf.assign(should_reset_var, False)

    with tf.control_dependencies([reset_once_op]):

        def step(index, scores_sum, scores_num):
            """Single step."""
            index %= hparams.epoch_length  # Only needed in eval runs.
            # Note: the only way to ensure that a copy of a tensor is made is
            # to run a simple operation on it. We are waiting for tf.copy:
            # https://github.com/tensorflow/tensorflow/issues/11186
            obs_copy = batch_env.observ + 0

            def env_step(arg1, arg2):  # pylint: disable=unused-argument
                """Step of the environment."""
                actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
                policy = actor_critic.policy
                if policy_to_actions_lambda:
                    action = policy_to_actions_lambda(policy)
                else:
                    action = tf.cond(eval_phase, policy.mode, policy.sample)

                postprocessed_action = actor_critic.action_postprocessing(
                    action)
                simulate_output = batch_env.simulate(postprocessed_action[0,
                                                                          ...])

                pdf = policy.prob(action)[0]
                value_function = actor_critic.value[0]
                pdf = tf.reshape(pdf, shape=(hparams.num_agents, ))
                value_function = tf.reshape(value_function,
                                            shape=(hparams.num_agents, ))

                with tf.control_dependencies(simulate_output):
                    return tf.identity(pdf), tf.identity(value_function)

            pdf, value_function = tf.while_loop(
                lambda _1, _2: tf.equal(speculum.size(), 0),
                env_step,
                [
                    tf.constant(0.0, shape=(hparams.num_agents, )),
                    tf.constant(0.0, shape=(hparams.num_agents, ))
                ],
                parallel_iterations=1,
                back_prop=False,
            )

            with tf.control_dependencies([pdf, value_function]):
                obs, reward, done, action = speculum.dequeue()

                done = tf.reshape(done, (len(batch_env), ))
                to_save = [obs, reward, done, action, pdf, value_function]
                save_ops = [
                    tf.scatter_update(memory_slot, index, value)
                    for memory_slot, value in zip(memory, to_save)
                ]
                cumulate_rewards_op = cumulative_rewards.assign_add(reward)
                agent_indices_to_reset = tf.where(done)[:, 0]
            with tf.control_dependencies([cumulate_rewards_op]):
                scores_sum_delta = tf.reduce_sum(
                    tf.gather(cumulative_rewards, agent_indices_to_reset))
                scores_num_delta = tf.count_nonzero(done, dtype=tf.int32)
            with tf.control_dependencies(save_ops +
                                         [scores_sum_delta, scores_num_delta]):
                reset_env_op = batch_env.reset(agent_indices_to_reset)
                reset_cumulative_rewards_op = tf.scatter_update(
                    cumulative_rewards, agent_indices_to_reset,
                    tf.gather(zeros_tensor, agent_indices_to_reset))
            with tf.control_dependencies(
                [reset_env_op, reset_cumulative_rewards_op]):
                return [
                    index + 1, scores_sum + scores_sum_delta,
                    scores_num + scores_num_delta
                ]

        def stop_condition(i, _, resets):
            return tf.cond(eval_phase,
                           lambda: resets < hparams.num_eval_agents,
                           lambda: i < hparams.epoch_length)

        init = [tf.constant(0), tf.constant(0.0), tf.constant(0)]
        index, scores_sum, scores_num = tf.while_loop(stop_condition,
                                                      step,
                                                      init,
                                                      parallel_iterations=1,
                                                      back_prop=False)
    mean_score = tf.cond(tf.greater(scores_num, 0),
                         lambda: scores_sum / tf.cast(scores_num, tf.float32),
                         lambda: 0.)
    printing = tf.Print(0, [mean_score, scores_sum, scores_num],
                        "mean_score: ")
    with tf.control_dependencies([index, printing]):
        memory = [tf.identity(mem) for mem in memory]
        mean_score_summary = tf.cond(
            tf.greater(scores_num, 0),
            lambda: tf.summary.scalar("mean_score_this_iter", mean_score), str)
        summaries = tf.summary.merge([
            mean_score_summary,
            tf.summary.scalar("episodes_finished_this_iter", scores_num)
        ])
        return memory, summaries
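
A minimal, hypothetical sketch of how this variant might be driven. The hparams object is assumed to carry the fields read above (epoch_length, num_agents, num_eval_agents, environment_spec); since no initialization hook is returned here, any environment-specific initialization is assumed to happen elsewhere:

memory, summaries = define_collect(
    hparams, scope="collect", eval_phase=False, on_simulated=False)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # memory is a list of epoch_length-long buffers in the order
    # (observations, rewards, dones, actions, pdfs, value functions).
    rollout, summary_str = sess.run([memory, summaries])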
Example No. 3
def define_collect(hparams, scope, eval_phase,
                   collect_level=-1,
                   policy_to_actions_lambda=None):
  """Collect trajectories.

  Args:
    hparams: HParams.
    scope: variable scope name.
    eval_phase: bool, whether this is the evaluation phase.
    collect_level: int, which wrapper level to collect observations at;
      -1 means the outermost level.
    policy_to_actions_lambda: optional lambda mapping the policy distribution
      to actions; if None, actions are sampled (or the mode is taken in eval).

  Returns:
    Returns memory (observations, rewards, dones, actions,
    pdfs, value_functions) containing a rollout of the environment from the
    collect_level of the nested wrapper structure. Note that pdfs and
    value_functions are meaningful only if collect_level == -1.
  """

  to_initialize = []
  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    batch_env = batch_env_factory(hparams)
    to_initialize.append(batch_env)
    environment_wrappers = hparams.environment_spec.wrappers
    wrappers = copy.copy(environment_wrappers) if environment_wrappers else []
    # Put the memory wrapper at the level at which you want to gather
    # observations.
    # Negative indices need to be shifted for insert to work correctly.
    collect_level = collect_level if \
      collect_level >= 0 else len(wrappers) + collect_level + 1
    wrappers.insert(collect_level, [_MemoryWrapper, {}])
    rollout_metadata = None
    speculum = None
    for w in wrappers:
      tf.logging.info("Applying wrapper %s(%s) to env %s."
                      % (str(w[0]), str(w[1]), str(batch_env)))
      batch_env = w[0](batch_env, **w[1])
      to_initialize.append(batch_env)
      if w[0] == _MemoryWrapper:
        rollout_metadata = _rollout_metadata(batch_env)
        speculum = batch_env.speculum

    def initialization_lambda(sess):
      for batch_env in to_initialize:
        batch_env.initialize(sess)

    memory = [
        tf.get_variable("collect_memory_%d_%s" % (hparams.epoch_length, name),
                        shape=[hparams.epoch_length] + shape,
                        dtype=dtype,
                        initializer=tf.zeros_initializer(),
                        trainable=False)
        for (shape, dtype, name) in rollout_metadata]

    cumulative_rewards = tf.get_variable("cumulative_rewards", len(batch_env),
                                         trainable=False)

    eval_phase = tf.convert_to_tensor(eval_phase)
    should_reset_var = tf.Variable(True, trainable=False)
    zeros_tensor = tf.zeros(len(batch_env))

  if "force_beginning_resets" in hparams:
    force_beginning_resets = hparams.force_beginning_resets
  else:
    force_beginning_resets = False
  force_beginning_resets = tf.convert_to_tensor(force_beginning_resets)

  def reset_ops_group():
    return tf.group(batch_env.reset(tf.range(len(batch_env))),
                    tf.assign(cumulative_rewards, zeros_tensor))

  reset_op = tf.cond(
      tf.logical_or(should_reset_var.read_value(), force_beginning_resets),
      reset_ops_group, tf.no_op)

  with tf.control_dependencies([reset_op]):
    reset_once_op = tf.assign(should_reset_var, False)

  with tf.control_dependencies([reset_once_op]):

    def step(index, scores_sum, scores_num):
      """Single step."""
      index %= hparams.epoch_length  # Only needed in eval runs.
      # Note: the only way to ensure that a copy of a tensor is made is to run
      # a simple operation on it. We are waiting for tf.copy:
      # https://github.com/tensorflow/tensorflow/issues/11186
      obs_copy = batch_env.observ + 0

      def env_step(arg1, arg2, arg3):  # pylint: disable=unused-argument
        """Step of the environment."""
        actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
        policy = actor_critic.policy
        if policy_to_actions_lambda:
          action = policy_to_actions_lambda(policy)
        else:
          action = tf.cond(eval_phase,
                           policy.mode,
                           policy.sample)

        postprocessed_action = actor_critic.action_postprocessing(action)
        reward, done = batch_env.simulate(postprocessed_action[0, ...])

        pdf = policy.prob(action)[0]
        value_function = actor_critic.value[0]
        pdf = tf.reshape(pdf, shape=(hparams.num_agents,))
        value_function = tf.reshape(value_function, shape=(hparams.num_agents,))
        done = tf.reshape(done, shape=(hparams.num_agents,))

        with tf.control_dependencies([reward, done]):
          return tf.identity(pdf), tf.identity(value_function), \
                 tf.identity(done)

      # TODO(piotrmilos): while_body is executed at most once,
      # thus should be replaced with tf.cond
      pdf, value_function, top_level_done = tf.while_loop(
          lambda _1, _2, _3: tf.equal(speculum.size(), 0),
          env_step,
          [
              tf.constant(0.0, shape=(hparams.num_agents,)),
              tf.constant(0.0, shape=(hparams.num_agents,)),
              tf.constant(False, shape=(hparams.num_agents,))
          ],
          parallel_iterations=1,
          back_prop=False,
      )

      with tf.control_dependencies([pdf, value_function]):
        obs, reward, done, action = speculum.dequeue()

        to_save = [obs, reward, done, action,
                   pdf, value_function]
        save_ops = [tf.scatter_update(memory_slot, index, value)
                    for memory_slot, value in zip(memory, to_save)]
        cumulate_rewards_op = cumulative_rewards.assign_add(reward)

        agent_indices_to_reset = tf.where(top_level_done)[:, 0]
      with tf.control_dependencies([cumulate_rewards_op]):
        # TODO(piotrmilos): possibly we need cumulative_rewards.read_value()
        scores_sum_delta = tf.reduce_sum(
            tf.gather(cumulative_rewards.read_value(), agent_indices_to_reset))
        scores_num_delta = tf.count_nonzero(done, dtype=tf.int32)
      with tf.control_dependencies(save_ops + [scores_sum_delta,
                                               scores_num_delta]):
        reset_env_op = batch_env.reset(agent_indices_to_reset)
        reset_cumulative_rewards_op = tf.scatter_update(
            cumulative_rewards, agent_indices_to_reset,
            tf.gather(zeros_tensor, agent_indices_to_reset))
      with tf.control_dependencies([reset_env_op,
                                    reset_cumulative_rewards_op]):
        return [index + 1, scores_sum + scores_sum_delta,
                scores_num + scores_num_delta]

    def stop_condition(i, _, resets):
      return tf.cond(eval_phase,
                     lambda: resets < hparams.num_eval_agents,
                     lambda: i < hparams.epoch_length)

    init = [tf.constant(0), tf.constant(0.0), tf.constant(0)]
    index, scores_sum, scores_num = tf.while_loop(
        stop_condition,
        step,
        init,
        parallel_iterations=1,
        back_prop=False)

  # We handle force_beginning_resets differently. We assume that all envs are
  # reset at the end of an episode (though the reset actually happens at the
  # beginning of the next one).
  scores_num = tf.cond(force_beginning_resets,
                       lambda: scores_num + len(batch_env), lambda: scores_num)

  with tf.control_dependencies([scores_sum]):
    scores_sum = tf.cond(
        force_beginning_resets,
        lambda: scores_sum + tf.reduce_sum(cumulative_rewards.read_value()),
        lambda: scores_sum)

  mean_score = tf.cond(tf.greater(scores_num, 0),
                       lambda: scores_sum / tf.cast(scores_num, tf.float32),
                       lambda: 0.)
  printing = tf.Print(0, [mean_score, scores_sum, scores_num], "mean_score: ")
  with tf.control_dependencies([index, printing]):
    memory = [mem.read_value() for mem in memory]
    # When generating real data together with PPO training we must use a
    # single agent. For PPO to work we reshape the history as if it had been
    # generated by effective_num_agents agents.
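    # Shape-wise (assuming num_agents == 1): each memory buffer goes from
    # [epoch_length, 1, ...] via the first transpose to [1, epoch_length, ...],
    # is reshaped to [effective_num_agents, new_epoch_length, ...], and is
    # transposed back to [new_epoch_length, effective_num_agents, ...].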
    if getattr(hparams, "effective_num_agents", None):
      new_memory = []
      effective_num_agents = hparams.effective_num_agents
      assert hparams.epoch_length % effective_num_agents == 0, (
          "The rollout of hparams.epoch_length will be distributed amongst "
          "effective_num_agents agents.")
      new_epoch_length = int(hparams.epoch_length / effective_num_agents)
      for mem, info in zip(memory, rollout_metadata):
        shape, _, name = info
        new_shape = [effective_num_agents, new_epoch_length] + shape[1:]
        perm = list(range(len(shape)+1))
        perm[0] = 1
        perm[1] = 0
        mem = tf.transpose(mem, perm=perm)
        mem = tf.reshape(mem, shape=new_shape)
        mem = tf.transpose(mem, perm=perm,
                           name="collect_memory_%d_%s"
                           % (new_epoch_length, name))
        new_memory.append(mem)
      memory = new_memory

    mean_score_summary = tf.cond(
        tf.greater(scores_num, 0),
        lambda: tf.summary.scalar("mean_score_this_iter", mean_score),
        str)
    summaries = tf.summary.merge(
        [mean_score_summary,
         tf.summary.scalar("episodes_finished_this_iter", scores_num)])
    return memory, summaries, initialization_lambda
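
This variant also returns an initialization hook for the batch environment and its wrappers. A minimal, hypothetical driver sketch, assuming hparams supplies the fields consumed above (epoch_length, num_agents, num_eval_agents, environment_spec and, optionally, force_beginning_resets and effective_num_agents):

memory, summaries, initialize_envs = define_collect(
    hparams, scope="ppo_collect", eval_phase=False)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  initialize_envs(sess)  # calls initialize(sess) on batch_env and each wrapper
  # memory is a list in the documented order: observations, rewards, dones,
  # actions, pdfs, value_functions.
  rollout, summary_str = sess.run([memory, summaries])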
Example No. 4
def define_collect(hparams, scope, eval_phase,
                   collect_level=-1,
                   policy_to_actions_lambda=None):
  """Collect trajectories."""
  to_initialize = []
  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    batch_env = batch_env_factory(hparams)
    to_initialize.append(batch_env)
    environment_wrappers = hparams.environment_spec.wrappers
    wrappers = copy.copy(environment_wrappers) if environment_wrappers else []
    # Put the memory wrapper at the level at which you want to gather
    # observations.
    # Negative indices need to be shifted for insert to work correctly.
    collect_level = collect_level if \
      collect_level >= 0 else len(wrappers) + collect_level + 1
    wrappers.insert(collect_level, [_MemoryWrapper, {}])
    rollout_metadata = None
    speculum = None
    for w in wrappers:
      batch_env = w[0](batch_env, **w[1])
      to_initialize.append(batch_env)
      if w[0] == _MemoryWrapper:
        rollout_metadata = _rollout_metadata(batch_env)
        speculum = batch_env.speculum

    def initialization_lambda(sess):
      for batch_env in to_initialize:
        batch_env.initialize(sess)

    memory = [tf.get_variable("collect_memory_{}".format(name),
                              shape=[hparams.epoch_length]+shape,
                              dtype=dtype,
                              initializer=tf.zeros_initializer(),
                              trainable=False)
              for (shape, dtype, name) in rollout_metadata]

    cumulative_rewards = tf.get_variable("cumulative_rewards", len(batch_env),
                                         trainable=False)

    eval_phase = tf.convert_to_tensor(eval_phase)
    should_reset_var = tf.Variable(True, trainable=False)
    zeros_tensor = tf.zeros(len(batch_env))

  if "force_beginning_resets" in hparams:
    force_beginning_resets = hparams.force_beginning_resets
  else:
    force_beginning_resets = False

  def group():
    return tf.group(batch_env.reset(tf.range(len(batch_env))),
                    tf.assign(cumulative_rewards, zeros_tensor))
  reset_op = tf.cond(
      tf.logical_or(should_reset_var, tf.convert_to_tensor(
          force_beginning_resets)),
      group, tf.no_op)

  with tf.control_dependencies([reset_op]):
    reset_once_op = tf.assign(should_reset_var, False)

  with tf.control_dependencies([reset_once_op]):

    def step(index, scores_sum, scores_num):
      """Single step."""
      index %= hparams.epoch_length  # Only needed in eval runs.
      # Note: the only way to ensure that a copy of a tensor is made is to run
      # a simple operation on it. We are waiting for tf.copy:
      # https://github.com/tensorflow/tensorflow/issues/11186
      obs_copy = batch_env.observ + 0

      def env_step(arg1, arg2):  # pylint: disable=unused-argument
        """Step of the environment."""
        actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
        policy = actor_critic.policy
        if policy_to_actions_lambda:
          action = policy_to_actions_lambda(policy)
        else:
          action = tf.cond(eval_phase,
                           policy.mode,
                           policy.sample)

        postprocessed_action = actor_critic.action_postprocessing(action)
        simulate_output = batch_env.simulate(postprocessed_action[0, ...])

        pdf = policy.prob(action)[0]
        value_function = actor_critic.value[0]
        pdf = tf.reshape(pdf, shape=(hparams.num_agents,))
        value_function = tf.reshape(value_function, shape=(hparams.num_agents,))

        with tf.control_dependencies(simulate_output):
          return tf.identity(pdf), tf.identity(value_function)

      pdf, value_function = tf.while_loop(
          lambda _1, _2: tf.equal(speculum.size(), 0),
          env_step,
          [tf.constant(0.0, shape=(hparams.num_agents,)),
           tf.constant(0.0, shape=(hparams.num_agents,))],
          parallel_iterations=1,
          back_prop=False,)

      with tf.control_dependencies([pdf, value_function]):
        obs, reward, done, action = speculum.dequeue()

        done = tf.reshape(done, (len(batch_env),))
        to_save = [obs, reward, done, action,
                   pdf, value_function]
        save_ops = [tf.scatter_update(memory_slot, index, value)
                    for memory_slot, value in zip(memory, to_save)]
        cumulate_rewards_op = cumulative_rewards.assign_add(reward)
        agent_indices_to_reset = tf.where(done)[:, 0]
      with tf.control_dependencies([cumulate_rewards_op]):
        scores_sum_delta = tf.reduce_sum(
            tf.gather(cumulative_rewards, agent_indices_to_reset))
        scores_num_delta = tf.count_nonzero(done, dtype=tf.int32)
      with tf.control_dependencies(save_ops + [scores_sum_delta,
                                               scores_num_delta]):
        reset_env_op = batch_env.reset(agent_indices_to_reset)
        reset_cumulative_rewards_op = tf.scatter_update(
            cumulative_rewards, agent_indices_to_reset,
            tf.gather(zeros_tensor, agent_indices_to_reset))
      with tf.control_dependencies([reset_env_op,
                                    reset_cumulative_rewards_op]):
        return [index + 1, scores_sum + scores_sum_delta,
                scores_num + scores_num_delta]

    def stop_condition(i, _, resets):
      return tf.cond(eval_phase,
                     lambda: resets < hparams.num_eval_agents,
                     lambda: i < hparams.epoch_length)

    init = [tf.constant(0), tf.constant(0.0), tf.constant(0)]
    index, scores_sum, scores_num = tf.while_loop(
        stop_condition,
        step,
        init,
        parallel_iterations=1,
        back_prop=False)
  mean_score = tf.cond(tf.greater(scores_num, 0),
                       lambda: scores_sum / tf.cast(scores_num, tf.float32),
                       lambda: 0.)
  printing = tf.Print(0, [mean_score, scores_sum, scores_num], "mean_score: ")
  with tf.control_dependencies([index, printing]):
    memory = [tf.identity(mem) for mem in memory]
    mean_score_summary = tf.cond(
        tf.greater(scores_num, 0),
        lambda: tf.summary.scalar("mean_score_this_iter", mean_score),
        str)
    summaries = tf.summary.merge(
        [mean_score_summary,
         tf.summary.scalar("episodes_finished_this_iter", scores_num)])
    return memory, summaries, initialization_lambda