Example #1
    def test_normalize(self):
        env = make_doom_env(doom_env_by_name(TEST_ENV_NAME))
        obs_space = main_observation_space(env)

        env.reset()
        obs = [env.step(0)[0] for _ in range(10)]

        self.assertTrue(np.all(obs_space.low == 0))
        self.assertTrue(np.all(obs_space.high == 255))
        self.assertEqual(obs_space.dtype, np.uint8)

        self.assertFalse(is_normalized(obs_space))

        tf.reset_default_graph()

        ph_obs = placeholder_from_space(obs_space)
        obs_tensor = tf_normalize(ph_obs, obs_space)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            normalized_obs = sess.run(obs_tensor, feed_dict={ph_obs: obs})

            self.assertEqual(normalized_obs.dtype, np.float32)
            self.assertLessEqual(normalized_obs.max(), 1.0)
            self.assertGreaterEqual(normalized_obs.min(), -1.0)

        tf.reset_default_graph()
        gc.collect()
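
The test above only checks the contract of tf_normalize: uint8 observations in [0, 255] must come out as float32 values in [-1, 1]. A minimal sketch of a helper with that contract, assuming a gym-style Box space; this is an illustration, not necessarily the actual tf_normalize implementation:

import numpy as np
import tensorflow as tf  # TensorFlow 1.x, as in the examples on this page

def normalize_obs_sketch(ph_obs, obs_space):
    """Map observations from [low, high] (e.g. uint8 in [0, 255]) to float32 in [-1, 1]."""
    low = float(np.min(obs_space.low))
    high = float(np.max(obs_space.high))
    obs = tf.to_float(ph_obs)
    return 2.0 * (obs - low) / (high - low) - 1.0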
Example #2
    def __init__(self, make_env_func, params):
        super(AgentCuriousPPO, self).__init__(make_env_func, params)

        env = self.make_env_func()  # we need it to query observation shape, number of actions, etc.
        self.ph_next_observations = placeholder_from_space(
            main_observation_space(env))
        self.num_actions = env.action_space.n
        env.close()

        if self.params.curiosity_type == 'icm':
            # create graph for curiosity module (ICM)
            self.curiosity = IntrinsicCuriosityModule(
                env,
                self.ph_observations,
                self.ph_next_observations,
                self.ph_actions,
                params.forward_fc,
                params,
            )
        elif self.params.curiosity_type == 'ecr':
            self.curiosity = ECRModule(env, params)
        elif self.params.curiosity_type == 'ecr_map':
            self.curiosity = ECRMapModule(env, params)
        elif self.params.curiosity_type == 'rnd':
            self.curiosity = RandomNetworkDistillation(env,
                                                       self.ph_observations,
                                                       params)
        else:
            raise Exception(
                f'Curiosity type {self.params.curiosity_type} not supported')

        self.previous_actions = np.random.randint(0, self.num_actions,
                                                  self.params.num_envs)
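
The if/elif chain that selects the curiosity module could also be written as a dispatch table. A hedged sketch reusing the same constructors and arguments as above; only the dict itself is new:

        # Hypothetical dict-based dispatch, equivalent to the if/elif chain above.
        curiosity_factories = {
            'icm': lambda: IntrinsicCuriosityModule(
                env, self.ph_observations, self.ph_next_observations, self.ph_actions, params.forward_fc, params,
            ),
            'ecr': lambda: ECRModule(env, params),
            'ecr_map': lambda: ECRMapModule(env, params),
            'rnd': lambda: RandomNetworkDistillation(env, self.ph_observations, params),
        }
        try:
            self.curiosity = curiosity_factories[self.params.curiosity_type]()
        except KeyError:
            raise Exception(f'Curiosity type {self.params.curiosity_type} not supported')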
Example #3
    def __init__(self, env, ph_observations, params):
        self.ph_observations = ph_observations

        num_actions = env.action_space.n
        obs_space = main_observation_space(env)

        # Goal observation
        self.ph_goal_obs = None
        self.is_goal_env = is_goal_based_env(env)
        if self.is_goal_env:
            # goal obs has the same shape as main obs
            self.ph_goal_obs = placeholder_from_space(main_observation_space(env))

        make_encoder_func = make_encoder_with_goal if self.is_goal_env else make_encoder

        regularizer = None  # don't use L2 regularization

        actor_enc_params = get_enc_params(params, 'actor')

        # actor computation graph
        # use actor encoder as main observation encoder (including landmarks, etc.)
        if self.is_goal_env:
            actor_encoder = make_encoder_func(
                self.ph_observations, self.ph_goal_obs, obs_space, regularizer, actor_enc_params, name='act_enc',
            )
        else:
            actor_encoder = make_encoder_func(
                self.ph_observations, obs_space, regularizer, actor_enc_params, name='act_enc',
            )

        actor_model = make_model(actor_encoder.encoded_input, regularizer, params, 'act_mdl')

        actions_fc = dense(actor_model.latent, params.model_fc_size // 2, regularizer)
        action_logits = tf.contrib.layers.fully_connected(actions_fc, num_actions, activation_fn=None)
        self.best_action_deterministic = tf.argmax(action_logits, axis=1)
        self.actions_distribution = CategoricalProbabilityDistribution(action_logits)
        self.act = self.actions_distribution.sample()
        self.action_prob = self.actions_distribution.probability(self.act)

        critic_enc_params = get_enc_params(params, 'critic')

        # critic computation graph
        if self.is_goal_env:
            value_encoder = make_encoder_func(
                self.ph_observations, self.ph_goal_obs, obs_space, regularizer, critic_enc_params, 'val_enc',
            )
        else:
            value_encoder = make_encoder_func(
                self.ph_observations, obs_space, regularizer, critic_enc_params, 'val_enc',
            )

        value_model = make_model(value_encoder.encoded_input, regularizer, params, 'val_mdl')

        value_fc = dense(value_model.latent, params.model_fc_size // 2, regularizer)
        self.value = tf.squeeze(tf.contrib.layers.fully_connected(value_fc, 1, activation_fn=None), axis=[1])

        log.info('Total parameters in the model: %d', count_total_parameters())
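
A hedged usage sketch for the actor-critic graph above: sampling actions, their probabilities, and state values for a batch of observations. Here `actor_critic` is assumed to be an instance built as in Example #3 and `observations` a numpy batch matching obs_space; the session boilerplate is plain TensorFlow 1.x.

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # sample stochastic actions, their probabilities, and value estimates in one pass
    actions, action_probs, values = sess.run(
        [actor_critic.act, actor_critic.action_prob, actor_critic.value],
        feed_dict={actor_critic.ph_observations: observations},
    )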
Example #4
    def __init__(self, make_env_func, params):
        """Initialize PPO computation graph and some auxiliary tensors."""
        super(AgentPPO, self).__init__(params)

        self.actor_step = tf.Variable(0, trainable=False, dtype=tf.int64, name='actor_step')
        self.critic_step = tf.Variable(0, trainable=False, dtype=tf.int64, name='critic_step')

        self.make_env_func = make_env_func
        env = make_env_func()  # we need the env to query observation shape, number of actions, etc.

        self.obs_shape = [-1] + list(main_observation_space(env).shape)
        self.ph_observations = placeholder_from_space(main_observation_space(env))
        self.ph_actions = placeholder_from_space(env.action_space)  # actions sampled from the policy
        self.ph_advantages, self.ph_returns, self.ph_old_action_probs = placeholders(None, None, None)

        self.actor_critic = ActorCritic(env, self.ph_observations, self.params)

        env.close()

        self.objectives = self.add_ppo_objectives(
            self.actor_critic,
            self.ph_actions, self.ph_old_action_probs, self.ph_advantages, self.ph_returns,
            self.params,
            self.actor_step,
        )

        # optimizers
        actor_opt = tf.train.AdamOptimizer(learning_rate=self.params.learning_rate, name='actor_opt')
        self.train_actor = actor_opt.minimize(self.objectives.actor_loss, global_step=self.actor_step)

        critic_opt = tf.train.AdamOptimizer(learning_rate=self.params.learning_rate, name='critic_opt')
        self.train_critic = critic_opt.minimize(self.objectives.critic_loss, global_step=self.critic_step)

        self.add_ppo_summaries()

        summary_dir = summaries_dir(self.params.experiment_dir())
        self.summary_writer = tf.summary.FileWriter(summary_dir)
        self.actor_summaries = merge_summaries(collections=['actor'])
        self.critic_summaries = merge_summaries(collections=['critic'])

        if self.params.use_env_map:
            self.map_img, self.coord_limits = generate_env_map(make_env_func)
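
A hedged sketch of one PPO update using the graph from Example #4. `agent` is an AgentPPO instance, `sess` an active tf.Session, and the *_batch arrays are hypothetical rollout data shaped to match the placeholders defined above.

feed = {
    agent.ph_observations: obs_batch,
    agent.ph_actions: actions_batch,
    agent.ph_old_action_probs: old_action_probs_batch,
    agent.ph_advantages: advantages_batch,
    agent.ph_returns: returns_batch,
}
# one gradient step for the actor, one for the critic
sess.run(agent.train_actor, feed_dict=feed)
sess.run(agent.train_critic, feed_dict=feed)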
Example #5
    def setup_graph(env, params, use_dataset):
        tf.reset_default_graph()

        step = tf.Variable(0, trainable=False, dtype=tf.int64, name='step')

        ph_observations = placeholder_from_space(env.observation_space)
        ph_actions = placeholder_from_space(env.action_space)
        ph_old_actions_probs, ph_advantages, ph_returns = placeholders(
            None, None, None)

        if use_dataset:
            dataset = tf.data.Dataset.from_tensor_slices((
                ph_observations,
                ph_actions,
                ph_old_actions_probs,
                ph_advantages,
                ph_returns,
            ))
            dataset = dataset.batch(params.batch_size)
            dataset = dataset.prefetch(10)
            iterator = dataset.make_initializable_iterator()
            observations, act, old_action_probs, adv, ret = iterator.get_next()
        else:
            observations = ph_observations
            act, old_action_probs, adv, ret = ph_actions, ph_old_actions_probs, ph_advantages, ph_returns

        actor_critic = ActorCritic(env, observations, params)
        env.close()

        objectives = AgentPPO.add_ppo_objectives(actor_critic, act,
                                                 old_action_probs, adv, ret,
                                                 params, step)
        train_op = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(
            objectives.actor_loss, global_step=step)

        return AttrDict(locals())
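
A hedged sketch of driving the graph returned by setup_graph with use_dataset=True. The rollout arrays (obs, actions, old_action_probs, advantages, returns) are assumed to exist already; the attribute names come from the AttrDict(locals()) returned above.

g = setup_graph(env, params, use_dataset=True)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # feed the whole rollout once; the dataset then serves it in minibatches
    sess.run(g.iterator.initializer, feed_dict={
        g.ph_observations: obs,
        g.ph_actions: actions,
        g.ph_old_actions_probs: old_action_probs,
        g.ph_advantages: advantages,
        g.ph_returns: returns,
    })
    while True:  # one pass over the rollout in minibatches of params.batch_size
        try:
            sess.run(g.train_op)
        except tf.errors.OutOfRangeError:
            break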
Example #6
    def __init__(self, env, params):
        obs_space = main_observation_space(env)
        self.ph_obs_prev, self.ph_obs_curr, self.ph_obs_goal = placeholders_from_spaces(
            obs_space, obs_space, obs_space)
        self.ph_actions = placeholder_from_space(env.action_space)
        self.ph_is_training = tf.placeholder(dtype=tf.bool, shape=[])

        with tf.variable_scope('loco') as scope:
            log.info('Locomotion network graph...')

            self.step = tf.Variable(0,
                                    trainable=False,
                                    dtype=tf.int64,
                                    name='loco_step')

            reg = tf.contrib.layers.l2_regularizer(scale=1e-5)

            enc_params = EncoderParams()
            enc_params.enc_name = params.locomotion_encoder
            enc_params.batch_norm = params.locomotion_use_batch_norm
            enc_params.ph_is_training = self.ph_is_training
            enc_params.summary_collections = ['loco']

            encoder = tf.make_template(
                'enc_loco',
                make_encoder,
                create_scope_now_=True,
                obs_space=obs_space,
                regularizer=reg,
                enc_params=enc_params,
            )

            if params.locomotion_siamese:
                obs_curr_encoder = encoder(self.ph_obs_curr)
                obs_curr_encoded = obs_curr_encoder.encoded_input

                obs_goal_encoder = encoder(self.ph_obs_goal)
                obs_goal_encoded = obs_goal_encoder.encoded_input

                obs_encoder = obs_curr_encoder  # any of the two
                obs_encoded = tf.concat([obs_curr_encoded, obs_goal_encoded],
                                        axis=1)
            else:
                if params.locomotion_use_prev:
                    obs_concat = tf.concat(
                        [self.ph_obs_prev, self.ph_obs_curr, self.ph_obs_goal],
                        axis=3)
                else:
                    obs_concat = tf.concat(
                        [self.ph_obs_curr, self.ph_obs_goal], axis=3)

                obs_encoder = encoder(obs_concat)
                obs_encoded = obs_encoder.encoded_input

            encoder_reg_loss = 0.0
            if hasattr(obs_encoder, 'reg_loss'):
                encoder_reg_loss = obs_encoder.reg_loss

            fc_layers = [params.locomotion_fc_size] * params.locomotion_fc_num
            x = obs_encoded
            for fc_layer_size in fc_layers:
                x = dense(
                    x,
                    fc_layer_size,
                    regularizer=reg,
                    batch_norm=params.locomotion_use_batch_norm,
                    is_training=self.ph_is_training,
                )

            action_logits = tf.layers.dense(x,
                                            env.action_space.n,
                                            activation=None)
            self.actions_distribution = CategoricalProbabilityDistribution(
                action_logits)
            self.best_action_deterministic = tf.argmax(action_logits, axis=1)
            self.act = self.actions_distribution.sample()

            self.correct = tf.reduce_mean(tf.to_float(tf.equal(
                self.ph_actions,
                tf.cast(tf.argmax(action_logits, axis=1), tf.int32),
            )))

            actions_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.ph_actions, logits=action_logits)
            self.actions_loss = tf.reduce_mean(actions_loss)

            reg_losses = tf.losses.get_regularization_losses(scope=scope.name)
            self.reg_loss = tf.reduce_sum(reg_losses) + encoder_reg_loss

            self.loss = self.actions_loss + self.reg_loss

            loco_opt = tf.train.AdamOptimizer(
                learning_rate=params.learning_rate, name='loco_opt')

            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope='loco')):
                self.train_loco = loco_opt.minimize(self.loss,
                                                    global_step=self.step)
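
A hedged sketch of one supervised update for the locomotion network above. `loco` is the constructed object, `sess` an active tf.Session, and the numpy batches (prev/curr/goal observations plus integer actions) are hypothetical training data.

_, loss, accuracy = sess.run(
    [loco.train_loco, loco.loss, loco.correct],
    feed_dict={
        loco.ph_obs_prev: obs_prev,
        loco.ph_obs_curr: obs_curr,
        loco.ph_obs_goal: obs_goal,
        loco.ph_actions: actions,
        loco.ph_is_training: True,  # enables batch-norm updates gated by UPDATE_OPS above
    },
)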