def __init__(self, *args, **kwargs):
  super(GymDiscreteProblemWithAgent, self).__init__(*args, **kwargs)
  self._env = None
  self.debug_dump_frames_path = "debug_frames_env"
  self.make_extra_debug_info = True
  self.autoencoder_model = None

  # Defaults.
  self.environment_spec = lambda: gym.make(self.env_name)
  self._real_env = None
  self.real_env_problem = None
  self.in_graph_wrappers = []
  self.collect_hparams = rl.ppo_pong_base()
  if FLAGS.autoencoder_path:
    self.collect_hparams = rl.ppo_pong_ae_base()
  self.settable_num_steps = 50000
  self.simulated_environment = None
  self.eval_phase = False
  self.warm_up = 10  # TODO(piotrm): This should probably be removed.

  # Debug info.
  self.dones = 0
  self.real_reward = 0
  self.total_sim_reward, self.total_real_reward = 0.0, 0.0
  self.sum_of_rewards = 0.0
  self.successful_episode_reward_predictions = 0
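# A minimal usage sketch of the lazy environment_spec above: keeping it a
# zero-argument lambda defers env construction, so each collect worker can
# build its own instance. The env name below is an assumption for
# illustration, standing in for self.env_name.
import gym

env_name = "PongNoFrameskip-v4"  # hypothetical
environment_spec = lambda: gym.make(env_name)

env = environment_spec()  # the env is created only when the spec is called
observation = env.reset()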
def _setup(self):
  # TODO(piotrmilos): This should be consistent with
  # ppo_params in model_rl_experiment.
  collect_hparams = rl.ppo_pong_base()
  collect_hparams.add_hparam("environment_spec", self.environment_spec)
  collect_hparams.add_hparam("force_beginning_resets",
                             self._internal_memory_force_beginning_resets)
  collect_hparams.epoch_length = self._internal_memory_size
  collect_hparams.num_agents = 1

  if not FLAGS.agent_policy_path:
    collect_hparams.policy_network = rl.random_policy_fun

  policy_to_actions_lambda = None
  if self.settable_eval_phase:
    policy_to_actions_lambda = lambda policy: policy.mode()

  with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    self.collect_memory, self.collect_trigger_op, collect_init = (
        collect.define_collect(
            collect_hparams,
            scope="gym_problems",
            eval_phase=False,
            collect_level=0,
            policy_to_actions_lambda=policy_to_actions_lambda))

  self._session = tf.Session()
  collect_init(self._session)
  self._session.run(tf.global_variables_initializer())
  self.restore_networks(self._session)
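# A minimal sketch of what policy_to_actions_lambda changes; the Categorical
# below stands in for the policy distribution, which is an assumption about
# its interface. In eval phase the agent acts greedily via mode(); otherwise
# collection samples stochastically from the policy.
import tensorflow as tf

policy = tf.distributions.Categorical(logits=[[1.0, 3.0, 0.5]])
greedy_action = policy.mode()     # deterministic argmax, used in eval phase
sampled_action = policy.sample()  # stochastic draw, used during collection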
def _setup(self, data_dir, extra_collect_hparams=None,
           override_collect_hparams=None):
  dumper_path = os.path.join(data_dir, "dumper")
  dumper_exists = tf.gfile.Exists(dumper_path)
  tf.logging.info("Dumper path %s." % dumper_path)

  if dumper_exists and not self.settable_eval_phase:
    tf.logging.info("Using dumper data.")
    self._use_dumper_data = True
    self._dumper_data_index = 0
    self._dumper_path = dumper_path
  else:
    # TODO(piotrmilos): This should be consistent with
    # ppo_params in model_rl_experiment.
    collect_hparams = rl.ppo_pong_base()
    collect_hparams.add_hparam("environment_spec", self.environment_spec)
    collect_hparams.add_hparam("force_beginning_resets",
                               self._internal_memory_force_beginning_resets)
    collect_hparams.epoch_length = self._internal_memory_size
    collect_hparams.num_agents = 1

    if not FLAGS.agent_policy_path:
      collect_hparams.policy_network = rl.random_policy_fun

    if extra_collect_hparams is not None:
      for (key, value) in six.iteritems(extra_collect_hparams):
        collect_hparams.add_hparam(key, value)

    if override_collect_hparams is not None:
      # Override hparams manually; HParams.override_from_dict does not work
      # with function-valued hparams.
      for (key, value) in six.iteritems(override_collect_hparams):
        setattr(collect_hparams, key, value)

    policy_to_actions_lambda = None
    if self.settable_eval_phase:
      policy_to_actions_lambda = lambda policy: policy.mode()

    collect_level = 2  # After Resize and RewardClipping.
    if collect_hparams.environment_spec.simulated_env:
      collect_level = 1  # We still have reward clipping.
    if self._forced_collect_level is not None:
      collect_level = self._forced_collect_level  # For autoencoders.

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
      self.collect_memory, self.collect_trigger_op, collect_init = (
          collect.define_collect(
              collect_hparams,
              scope="gym_problems",
              eval_phase=False,
              collect_level=collect_level,
              policy_to_actions_lambda=policy_to_actions_lambda))

    self._session = tf.Session()
    collect_init(self._session)
    self._session.run(tf.global_variables_initializer())
    self.restore_networks(self._session)

  self.memory_index = 0
  self.memory = None
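# A minimal sketch of the manual-override pattern above: HParams parses
# dict overrides by registered type, so function-valued entries (such as a
# policy network fn) must be assigned with setattr instead. Names here are
# illustrative, not from this file.
from tensorflow.contrib.training import HParams

hparams = HParams(epoch_length=50, num_agents=1)
overrides = {"epoch_length": 100,
             "policy_network": lambda *args, **kwargs: None}  # a function
for key, value in overrides.items():
  setattr(hparams, key, value)  # bypasses HParams type parsing entirely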
def _setup(self):
  collect_hparams = rl.ppo_pong_base()
  collect_hparams.add_hparam("environment_spec", self.environment_spec)
  if not FLAGS.agent_policy_path:
    collect_hparams.policy_network = rl.random_policy_fun
  self._internal_memory_size = 10

  with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    collect_hparams.epoch_length = self._internal_memory_size
    # TODO(piotrmilos): It is possible to set more than one agent.
    collect_hparams.num_agents = 1
    self.collect_memory, self.collect_trigger_op = collect.define_collect(
        collect_hparams,
        scope="gym_problems",
        collect_level=0,
        eval_phase=self.eval_phase)
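# A minimal sketch of the AUTO_REUSE idiom used above (the layer and
# variable names are assumptions): entering the current variable scope with
# reuse=tf.AUTO_REUSE lets a graph builder be called repeatedly while
# sharing a single set of weights.
import tensorflow as tf

def dense_layer(x):
  with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    w = tf.get_variable("w", shape=[4, 2])  # created once, then reused
    return tf.matmul(x, w)

x = tf.placeholder(tf.float32, [None, 4])
y1 = dense_layer(x)
y2 = dense_layer(x)  # second call reuses the same "w" variable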
def _setup(self):
  collect_hparams = rl.ppo_pong_base()
  collect_hparams.add_hparam("environment_spec", self.environment_spec)
  collect_hparams.add_hparam("force_beginning_resets",
                             self._internal_memory_force_beginning_resets)
  collect_hparams.epoch_length = self._internal_memory_size
  collect_hparams.num_agents = 1

  if not FLAGS.agent_policy_path:
    collect_hparams.policy_network = rl.random_policy_fun

  with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    self.collect_memory, self.collect_trigger_op, collect_init = (
        collect.define_collect(
            collect_hparams,
            scope="gym_problems",
            collect_level=0,
            eval_phase=self.eval_phase))

  self._session = tf.Session()
  collect_init(self._session)
  self._session.run(tf.global_variables_initializer())
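# A hypothetical driver for the ops defined above; this call sequence is an
# assumption, not code from this file. Each run of collect_trigger_op is
# expected to gather one epoch of transitions into collect_memory.
problem = GymDiscreteProblemWithAgent()
problem._setup()
for _ in range(5):
  problem._session.run(problem.collect_trigger_op)  # collect one epoch
  rollout = problem._session.run(problem.collect_memory)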
def _setup(self, data_dir):
  # TODO(piotrmilos): This should be consistent with
  # ppo_params in model_rl_experiment.
  dumper_path = os.path.join(data_dir, "dumper")
  if os.path.isdir(dumper_path):
    tf.logging.info("Using dumper data.")
    self._use_dumper_data = True
    self._dumper_data_index = 0
    self._dumper_path = dumper_path
  else:
    collect_hparams = rl.ppo_pong_base()
    collect_hparams.add_hparam("environment_spec", self.environment_spec)
    collect_hparams.add_hparam("force_beginning_resets",
                               self._internal_memory_force_beginning_resets)
    collect_hparams.epoch_length = self._internal_memory_size
    collect_hparams.num_agents = 1

    if not FLAGS.agent_policy_path:
      collect_hparams.policy_network = rl.random_policy_fun

    policy_to_actions_lambda = None
    if self.settable_eval_phase:
      policy_to_actions_lambda = lambda policy: policy.mode()

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
      self.collect_memory, self.collect_trigger_op, collect_init = (
          collect.define_collect(
              collect_hparams,
              scope="gym_problems",
              eval_phase=False,
              collect_level=1,  # After ResizeWrapper but before others.
              policy_to_actions_lambda=policy_to_actions_lambda))

    self._session = tf.Session()
    collect_init(self._session)
    self._session.run(tf.global_variables_initializer())
    self.restore_networks(self._session)

  self.memory_index = 0
  self.memory = None
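# A hypothetical illustration of collect_level; these wrappers are
# stand-ins, not the actual in-graph wrappers. The level indexes how far up
# the wrapper stack frames are collected: 0 is the raw env, 1 is after
# resizing, 2 is after reward clipping as well.
import gym

class ResizeWrapper(gym.ObservationWrapper):
  def observation(self, observation):
    return observation[::2, ::2]  # naive 2x downscale, for illustration

class RewardClippingWrapper(gym.RewardWrapper):
  def reward(self, reward):
    return max(-1.0, min(1.0, reward))

env = gym.make("PongNoFrameskip-v4")  # collect_level 0
env = ResizeWrapper(env)              # collect_level 1
env = RewardClippingWrapper(env)      # collect_level 2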