Example #1
    def __init__(self, *args, **kwargs):
        super(GymDiscreteProblemWithAgent, self).__init__(*args, **kwargs)
        self._env = None
        self.debug_dump_frames_path = "debug_frames_env"
        self.make_extra_debug_info = True
        self.autoencoder_model = None

        # Defaults.
        self.environment_spec = lambda: gym.make(self.env_name)
        self._real_env = None
        self.real_env_problem = None
        self.in_graph_wrappers = []
        self.collect_hparams = rl.ppo_pong_base()
        if FLAGS.autoencoder_path:
            self.collect_hparams = rl.ppo_pong_ae_base()
        self.settable_num_steps = 50000
        self.simulated_environment = None
        self.eval_phase = False
        self.warm_up = 10  # TODO(piotrm): This should probably be removed.

        # Debug info.
        self.dones = 0
        self.real_reward = 0
        self.total_sim_reward, self.total_real_reward = 0.0, 0.0
        self.sum_of_rewards = 0.0
        self.successful_episode_reward_predictions = 0
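
A minimal usage sketch, not part of the original source, of why environment_spec in Example #1 is stored as a thunk (lambda: gym.make(self.env_name)) rather than an already-built environment: calling it lazily constructs a fresh Gym environment each time it is needed. The env_name value below is a placeholder.

import gym

env_name = "PongNoFrameskip-v4"  # placeholder; the class uses self.env_name
environment_spec = lambda: gym.make(env_name)

env = environment_spec()  # the environment is constructed only when the thunk is called
observation = env.reset()
observation, reward, done, info = env.step(env.action_space.sample())
env.close()
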
Example #2
  def _setup(self):
    # TODO(piotrmilos): this should be consistent with
    # ppo_params in model_rl_experiment
    collect_hparams = rl.ppo_pong_base()
    collect_hparams.add_hparam("environment_spec", self.environment_spec)
    collect_hparams.add_hparam("force_beginning_resets",
                               self._internal_memory_force_beginning_resets)
    collect_hparams.epoch_length = self._internal_memory_size
    collect_hparams.num_agents = 1

    if not FLAGS.agent_policy_path:
      collect_hparams.policy_network = rl.random_policy_fun

    policy_to_actions_lambda = None
    if self.settable_eval_phase:
      policy_to_actions_lambda = lambda policy: policy.mode()

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
      self.collect_memory, self.collect_trigger_op, collect_init = (
          collect.define_collect(
              collect_hparams,
              scope="gym_problems",
              eval_phase=False,
              collect_level=0,
              policy_to_actions_lambda=policy_to_actions_lambda))

    self._session = tf.Session()
    collect_init(self._session)
    self._session.run(tf.global_variables_initializer())
    self.restore_networks(self._session)
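
A minimal, self-contained sketch, not from the original source, of the graph-and-session pattern these _setup methods repeat: build ops under an AUTO_REUSE variable scope, create a tf.Session, and run tf.global_variables_initializer() before using any variables. TF 1.x, which these snippets target, is assumed.

import tensorflow as tf

with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    # AUTO_REUSE lets this block run more than once without raising
    # "variable already exists" errors.
    counter = tf.get_variable("counter", shape=[],
                              initializer=tf.zeros_initializer())
    increment = tf.assign_add(counter, 1.0)

session = tf.Session()
session.run(tf.global_variables_initializer())  # initialize before first use
print(session.run(increment))  # -> 1.0
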
Example #3
  def _setup(self, data_dir, extra_collect_hparams=None,
             override_collect_hparams=None):
    dumper_path = os.path.join(data_dir, "dumper")
    dumper_exists = tf.gfile.Exists(dumper_path)
    tf.logging.info("Dumper path %s." % dumper_path)
    if dumper_exists and not self.settable_eval_phase:
      tf.logging.info("Using dumper data.")
      self._use_dumper_data = True
      self._dumper_data_index = 0
      self._dumper_path = dumper_path
    else:
      # TODO(piotrmilos): this should be consistent with
      # ppo_params in model_rl_experiment
      collect_hparams = rl.ppo_pong_base()
      collect_hparams.add_hparam("environment_spec", self.environment_spec)
      collect_hparams.add_hparam("force_beginning_resets",
                                 self._internal_memory_force_beginning_resets)
      collect_hparams.epoch_length = self._internal_memory_size
      collect_hparams.num_agents = 1

      if not FLAGS.agent_policy_path:
        collect_hparams.policy_network = rl.random_policy_fun

      if extra_collect_hparams is not None:
        for (key, value) in six.iteritems(extra_collect_hparams):
          collect_hparams.add_hparam(key, value)

      if override_collect_hparams is not None:
        # Override hparams manually - HParams.override_from_dict does not work
        # with functions.
        for (key, value) in six.iteritems(override_collect_hparams):
          setattr(collect_hparams, key, value)

      policy_to_actions_lambda = None
      if self.settable_eval_phase:
        policy_to_actions_lambda = lambda policy: policy.mode()

      collect_level = 2  # After Resize and RewardClipping.
      if collect_hparams.environment_spec.simulated_env:
        collect_level = 1  # We still have reward clipping.
      if self._forced_collect_level is not None:  # For autoencoders.
        collect_level = self._forced_collect_level

      with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        self.collect_memory, self.collect_trigger_op, collect_init = (
            collect.define_collect(
                collect_hparams,
                scope="gym_problems",
                eval_phase=False,
                collect_level=collect_level,
                policy_to_actions_lambda=policy_to_actions_lambda))

      self._session = tf.Session()
      collect_init(self._session)
      self._session.run(tf.global_variables_initializer())
      self.restore_networks(self._session)
      self.memory_index = 0
      self.memory = None
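
A short sketch, not part of the original code, of the override pattern in Example #3: HParams.override_from_dict rejects function-valued hparams (as the comment above notes), so function values such as policy networks are assigned with setattr instead. tf.contrib.training.HParams from TF 1.x is assumed as the HParams implementation in use.

import tensorflow as tf

collect_hparams = tf.contrib.training.HParams(num_agents=1, epoch_length=50)
collect_hparams.add_hparam("policy_network", lambda *args, **kwargs: None)

overrides = {"num_agents": 2,
             "policy_network": lambda *args, **kwargs: "random"}
for key, value in overrides.items():
    # Plain attribute assignment works even when the value is a function.
    setattr(collect_hparams, key, value)
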
Example #4
    def _setup(self):
        collect_hparams = rl.ppo_pong_base()
        collect_hparams.add_hparam("environment_spec", self.environment_spec)

        if not FLAGS.agent_policy_path:
            collect_hparams.policy_network = rl.random_policy_fun

        self._internal_memory_size = 10
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            collect_hparams.epoch_length = self._internal_memory_size
            # TODO(piotrmilos): it is possible to set more than one agent.
            collect_hparams.num_agents = 1
            self.collect_memory, self.collect_trigger_op = collect.define_collect(
                collect_hparams, scope="gym_problems",
                collect_level=0, eval_phase=self.eval_phase)
Example #5
  def _setup(self):
    collect_hparams = rl.ppo_pong_base()
    collect_hparams.add_hparam("environment_spec", self.environment_spec)
    collect_hparams.add_hparam("force_beginning_resets",
                               self._internal_memory_force_beginning_resets)
    collect_hparams.epoch_length = self._internal_memory_size
    collect_hparams.num_agents = 1

    if not FLAGS.agent_policy_path:
      collect_hparams.policy_network = rl.random_policy_fun

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
      self.collect_memory, self.collect_trigger_op, collect_init = (
          collect.define_collect(collect_hparams, scope="gym_problems",
                                 collect_level=0, eval_phase=self.eval_phase))

    self._session = tf.Session()
    collect_init(self._session)
    self._session.run(tf.global_variables_initializer())
Example #6
    def _setup(self, data_dir):
        # TODO(piotrmilos): this should be consistent with
        # ppo_params in model_rl_experiment
        dumper_path = os.path.join(data_dir, "dumper")
        if os.path.isdir(dumper_path):
            tf.logging.info("Using dumper data.")
            self._use_dumper_data = True
            self._dumper_data_index = 0
            self._dumper_path = dumper_path
        else:
            collect_hparams = rl.ppo_pong_base()
            collect_hparams.add_hparam("environment_spec",
                                       self.environment_spec)
            collect_hparams.add_hparam(
                "force_beginning_resets",
                self._internal_memory_force_beginning_resets)
            collect_hparams.epoch_length = self._internal_memory_size
            collect_hparams.num_agents = 1

            if not FLAGS.agent_policy_path:
                collect_hparams.policy_network = rl.random_policy_fun

            policy_to_actions_lambda = None
            if self.settable_eval_phase:
                policy_to_actions_lambda = lambda policy: policy.mode()

            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                self.collect_memory, self.collect_trigger_op, collect_init = (
                    collect.define_collect(
                        collect_hparams,
                        scope="gym_problems",
                        eval_phase=False,
                        collect_level=1,  # After ResizeWrapper but before others.
                        policy_to_actions_lambda=policy_to_actions_lambda))

            self._session = tf.Session()
            collect_init(self._session)
            self._session.run(tf.global_variables_initializer())
            self.restore_networks(self._session)
            self.memory_index = 0
            self.memory = None
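
An illustrative sketch, not in the original source, of what policy_to_actions_lambda = lambda policy: policy.mode() changes during the eval phase: actions are taken greedily from the policy distribution instead of being sampled. A Categorical distribution stands in for the real policy here.

import tensorflow as tf

logits = tf.constant([[0.1, 2.0, 0.3]])  # toy policy logits for three actions
policy = tf.distributions.Categorical(logits=logits)

sampled_action = policy.sample()  # stochastic, used while collecting experience
policy_to_actions_lambda = lambda policy: policy.mode()  # deterministic, eval phase

with tf.Session() as session:
    print(session.run([sampled_action, policy_to_actions_lambda(policy)]))
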
Example #7
  def _setup(self):
    collect_hparams = rl.ppo_pong_base()
    collect_hparams.add_hparam("environment_spec", self.environment_spec)
    collect_hparams.add_hparam("force_beginning_resets",
                               self._internal_memory_force_beginning_resets)
    collect_hparams.epoch_length = self._internal_memory_size
    collect_hparams.num_agents = 1

    if not FLAGS.agent_policy_path:
      collect_hparams.policy_network = rl.random_policy_fun

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
      self.collect_memory, self.collect_trigger_op, collect_init = (
          collect.define_collect(collect_hparams, scope="gym_problems",
                                 collect_level=0, eval_phase=self.eval_phase))

    self._session = tf.Session()
    collect_init(self._session)
    self._session.run(tf.global_variables_initializer())