Example #1
 def predict_sym(self, obs_var, act_var):
     assert self.normalize_input
     with tf.variable_scope(self.name, reuse=True):
         in_obs_var = normalize(obs_var,
                                mean=self._mean_obs_var,
                                std=self._std_obs_var)
         in_act_var = normalize(act_var,
                                mean=self._mean_act_var,
                                std=self._std_act_var)
         input_var = tf.concat([in_obs_var, in_act_var], axis=1)
         mlp = MLP(
             self.name,
             output_dim=self.obs_space_dims,
             hidden_sizes=self.hidden_sizes,
             hidden_nonlinearity=self.hidden_nonlinearity,
             output_nonlinearity=self.output_nonlinearity,
             input_var=input_var,
             input_dim=self.obs_space_dims + self.action_space_dims,
         )
         delta = denormalize(mlp.output_var,
                             mean=self._mean_delta_var,
                             std=self._std_delta_var)
         pred_obs = delta + obs_var
         pred_obs = tf.clip_by_value(pred_obs, -1e2, 1e2)
     return pred_obs
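
The example above assumes normalize and denormalize helpers. A minimal sketch of what they are assumed to do, plain z-scoring with a small epsilon (the library's actual helpers may differ):

    import numpy as np

    def normalize(x, mean, std, eps=1e-8):
        # scale to zero mean / unit variance; works on numpy arrays and tf tensors alike
        return (x - mean) / (std + eps)

    def denormalize(x, mean, std, eps=1e-8):
        # map normalized values back to the original scale
        return x * (std + eps) + mean

    x = np.array([1.0, 2.0, 3.0])
    assert np.allclose(denormalize(normalize(x, x.mean(), x.std()), x.mean(), x.std()), x)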
Example #2
 def distribution_info_sym(self, obs_var):
     with tf.variable_scope(self.name + '/value_function', reuse=True):
         input_var = (obs_var -
                      self._mean_input_var) / (self._std_input_var + 1e-8)
         mlp = MLP(self.name,
                   output_dim=1,
                   hidden_sizes=self.hidden_sizes,
                   hidden_nonlinearity=self.hidden_nonlinearity,
                   output_nonlinearity=self.output_nonlinearity,
                   input_var=input_var,
                   input_dim=self.obs_space_dims)
         output_var = tf.reshape(mlp.output_var, shape=(-1, ))
         output_var = output_var * self._std_output_var + self._mean_output_var
     return dict(mean=output_var)
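
The pattern in Example #2 is: normalize the input, run it through the network, flatten to (batch_size,), then denormalize the output. A self-contained sketch of that pattern with a single dense layer standing in for the MLP class (illustrative only, not the library's code):

    import numpy as np
    import tensorflow as tf

    obs_dim = 4
    obs_ph = tf.placeholder(tf.float32, shape=(None, obs_dim))
    mean_in, std_in = np.zeros(obs_dim, np.float32), np.ones(obs_dim, np.float32)
    mean_out, std_out = 0.0, 1.0   # placeholder output statistics

    x = (obs_ph - mean_in) / (std_in + 1e-8)   # normalize inputs
    out = tf.layers.dense(x, 1)                # stand-in for MLP(..., output_dim=1)
    out = tf.reshape(out, shape=(-1,))         # (batch_size,)
    value = out * std_out + mean_out           # denormalize outputs

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(value, {obs_ph: np.random.randn(3, obs_dim).astype(np.float32)}))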
Example #3
    def predict_batches_sym(self, obs_ph, act_ph):
        """
        Same batch fed into all models. Randomly output one of the predictions for each observation.
        :param obs_ph: (batch_size, obs_space_dims)
        :param act_ph: (batch_size, act_space_dims)
        :return: (batch_size, obs_space_dims)
        """
        original_obs = obs_ph
        obs_ph, act_ph = tf.split(obs_ph, self.num_models,
                                  axis=0), tf.split(act_ph,
                                                    self.num_models,
                                                    axis=0)

        delta_preds = []
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            for i in range(self.num_models):
                with tf.variable_scope('model_{}'.format(i), reuse=True):
                    assert self.normalize_input
                    in_obs_var = tf_normalize(obs_ph[i],
                                              mean=self._mean_obs_var[i],
                                              std=self._std_obs_var[i])
                    in_act_var = tf_normalize(act_ph[i],
                                              mean=self._mean_act_var[i],
                                              std=self._std_act_var[i])
                    input_var = tf.concat([in_obs_var, in_act_var], axis=1)
                    mlp = MLP(
                        self.name + '/model_{}'.format(i),
                        output_dim=self.obs_space_dims,
                        hidden_sizes=self.hidden_sizes,
                        hidden_nonlinearity=self.hidden_nonlinearity,
                        output_nonlinearity=self.output_nonlinearity,
                        input_var=input_var,
                        input_dim=self.obs_space_dims + self.action_space_dims,
                    )

                    delta_pred = tf_denormalize(mlp.output_var,
                                                mean=self._mean_delta_var[i],
                                                std=self._std_delta_var[i])
                    delta_preds.append(delta_pred)

        delta_preds = tf.concat(delta_preds, axis=0)
        next_obs = original_obs + delta_preds

        return tf.clip_by_value(next_obs, -1e2, 1e2)
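
predict_batches_sym relies on the split/concat contract shown below: the stacked batch size must be divisible by num_models, each model sees one contiguous chunk, and the per-model outputs are concatenated back in the same order. A self-contained sketch of that contract, with the per-model networks replaced by trivial stand-ins:

    import numpy as np
    import tensorflow as tf

    num_models, batch_size, obs_dim = 5, 10, 3      # batch_size % num_models == 0
    obs_ph = tf.placeholder(tf.float32, shape=(None, obs_dim))
    chunks = tf.split(obs_ph, num_models, axis=0)   # list of (batch_size / num_models, obs_dim)
    preds = [c * (i + 1.0) for i, c in enumerate(chunks)]   # stand-ins for the per-model MLPs
    stacked = tf.concat(preds, axis=0)              # (batch_size, obs_dim), same row order

    with tf.Session() as sess:
        out = sess.run(stacked, {obs_ph: np.ones((batch_size, obs_dim), np.float32)})
        print(out.shape)   # (10, 3)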
Example #4
    def __init__(
        self,
        name,
        env,
        hidden_sizes=(500, 500),
        hidden_nonlinearity="tanh",
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        weight_normalization=True,
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,
        rolling_average_persitency=0.99,
        buffer_size=100000,
    ):

        Serializable.quick_init(self, locals())

        self.normalization = None
        self.normalize_input = normalize_input
        self.use_reward_model = False
        self.buffer_size = buffer_size
        self.name = name
        self.hidden_sizes = hidden_sizes

        self._dataset_train = None
        self._dataset_test = None
        self.next_batch = None

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency
        self.hidden_nonlinearity = hidden_nonlinearity = self._activations[
            hidden_nonlinearity]
        self.output_nonlinearity = output_nonlinearity = self._activations[
            output_nonlinearity]

        with tf.variable_scope(name):
            self.batch_size = batch_size
            self.learning_rate = learning_rate

            # determine dimensionality of state and action space
            self.obs_space_dims = env.observation_space.shape[0]
            self.action_space_dims = env.action_space.shape[0]

            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, self.obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, self.action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, self.obs_space_dims))

            self._create_stats_vars()

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            # create MLP
            mlp = MLP(name,
                      output_dim=self.obs_space_dims,
                      hidden_sizes=hidden_sizes,
                      hidden_nonlinearity=hidden_nonlinearity,
                      output_nonlinearity=output_nonlinearity,
                      input_var=self.nn_input,
                      input_dim=self.obs_space_dims + self.action_space_dims,
                      weight_normalization=weight_normalization)

            self.delta_pred = mlp.output_var

            # define loss and train_op
            self.loss = tf.reduce_mean(
                tf.linalg.norm(self.delta_ph - self.delta_pred, axis=-1))
            self.optimizer = optimizer(self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            self.f_delta_pred = compile_function([self.obs_ph, self.act_ph],
                                                 self.delta_pred)

        self._networks = [mlp]
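
The loss in Example #4 is the batch mean of the Euclidean norm of the per-observation delta error. A small self-contained check of that expression (hypothetical values, not from the library):

    import numpy as np
    import tensorflow as tf

    obs_dim = 3
    delta_ph = tf.placeholder(tf.float32, shape=(None, obs_dim))
    delta_pred = tf.placeholder(tf.float32, shape=(None, obs_dim))
    loss = tf.reduce_mean(tf.linalg.norm(delta_ph - delta_pred, axis=-1))

    with tf.Session() as sess:
        val = sess.run(loss, {delta_ph: np.zeros((4, obs_dim), np.float32),
                              delta_pred: np.ones((4, obs_dim), np.float32)})
        print(val)   # every row's error norm is sqrt(3) ≈ 1.732, so the mean is too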
Example #5
    def predict_sym(self, obs_ph, act_ph, pred_type='rand', perm_dict=None):
        """
        Same batch fed into all models. Randomly output one of the predictions for each observation.
        :param obs_ph: (batch_size, obs_space_dims)
        :param act_ph: (batch_size, act_space_dims)
        :return: (batch_size, obs_space_dims)
        """
        if pred_type == 'rand':
            # shuffle
            if perm_dict is not None:
                perm = perm_dict['perm']
            else:
                perm = tf.range(0, limit=tf.shape(obs_ph)[0], dtype=tf.int32)
                perm = tf.random.shuffle(perm)
            obs_ph_perm, act_ph_perm = tf.gather(obs_ph, perm), tf.gather(
                act_ph, perm)

            next_obs_perm = self.predict_batches_sym(obs_ph_perm, act_ph_perm)

            # unshuffle
            if perm_dict is not None:
                perm_inv = perm_dict['perm_inv']
            else:
                perm_inv = tf.invert_permutation(perm)
            next_obs = tf.gather(next_obs_perm, perm_inv)
            return next_obs

        delta_preds = []
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            for i in range(self.num_models):
                with tf.variable_scope('model_{}'.format(i), reuse=True):
                    assert self.normalize_input
                    in_obs_var = tf_normalize(obs_ph,
                                              mean=self._mean_obs_var[i],
                                              std=self._std_obs_var[i])
                    in_act_var = tf_normalize(act_ph,
                                              mean=self._mean_act_var[i],
                                              std=self._std_act_var[i])
                    input_var = tf.concat([in_obs_var, in_act_var], axis=1)
                    mlp = MLP(
                        self.name + '/model_{}'.format(i),
                        output_dim=self.obs_space_dims,
                        hidden_sizes=self.hidden_sizes,
                        hidden_nonlinearity=self.hidden_nonlinearity,
                        output_nonlinearity=self.output_nonlinearity,
                        input_var=input_var,
                        input_dim=self.obs_space_dims + self.action_space_dims,
                    )

                    delta_pred = tf_denormalize(mlp.output_var,
                                                mean=self._mean_delta_var[i],
                                                std=self._std_delta_var[i])
                    delta_preds.append(delta_pred)

        delta_preds = tf.stack(delta_preds,
                               axis=2)  # (batch_size, obs_dims, num_models)
        next_obs = tf.expand_dims(obs_ph, axis=2) + delta_preds

        if pred_type == 'all':
            pass
        elif pred_type == 'mean':
            next_obs = tf.reduce_mean(next_obs, axis=2)
        else:
            raise NotImplementedError("pred_type must be one of ['rand', 'mean', 'all']")

        return tf.clip_by_value(next_obs, -1e2, 1e2)
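
The pred_type='rand' branch assigns each observation to a random model by shuffling the batch, running the permuted batch through predict_batches_sym, and then inverting the permutation. The shuffle/unshuffle mechanics in isolation, using the same ops as above:

    import numpy as np
    import tensorflow as tf

    x = tf.constant(np.arange(6, dtype=np.float32).reshape(6, 1))
    perm = tf.random.shuffle(tf.range(0, limit=tf.shape(x)[0], dtype=tf.int32))
    x_perm = tf.gather(x, perm)                # shuffled batch fed to the models
    perm_inv = tf.invert_permutation(perm)
    x_restored = tf.gather(x_perm, perm_inv)   # original row order recovered

    with tf.Session() as sess:
        a, b = sess.run([x, x_restored])
        print(np.allclose(a, b))   # True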
Example #6
    def __init__(
        self,
        name,
        env,
        num_models=5,
        hidden_sizes=(512, 512),
        hidden_nonlinearity='swish',
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        weight_normalization=False,  # Doesn't work
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,  # 0.1
        rolling_average_persitency=0.99,
        buffer_size=50000,
        loss_str='MSE',
    ):

        Serializable.quick_init(self, locals())

        max_logvar = 1
        min_logvar = 0.1

        self.normalization = None
        self.normalize_input = normalize_input
        self.next_batch = None

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency

        self.buffer_size_train = int(buffer_size * (1 - valid_split_ratio))
        self.buffer_size_test = int(buffer_size * valid_split_ratio)
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_models = num_models
        self.hidden_sizes = hidden_sizes
        self.name = name
        self._dataset_train = None
        self._dataset_test = None

        # determine dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
        self.action_space_dims = action_space_dims = env.action_space.shape[0]
        self.timesteps_counter = 0
        self.used_timesteps_counter = 0

        self.hidden_nonlinearity = hidden_nonlinearity = self._activations[
            hidden_nonlinearity]
        self.output_nonlinearity = output_nonlinearity = self._activations[
            output_nonlinearity]
        """ computation graph for training and simple inference """
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, obs_space_dims))

            self._create_stats_vars()

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)
            obs_ph = tf.split(self.nn_input, self.num_models, axis=0)

            # create MLP
            mlps = []
            delta_preds = []
            self.obs_next_pred = []
            for i in range(num_models):
                with tf.variable_scope('model_{}'.format(i),
                                       reuse=tf.AUTO_REUSE):
                    mlp = MLP(
                        name + '/model_{}'.format(i),
                        output_dim=obs_space_dims,
                        hidden_sizes=hidden_sizes,
                        hidden_nonlinearity=hidden_nonlinearity,
                        output_nonlinearity=output_nonlinearity,
                        input_var=obs_ph[i],
                        input_dim=obs_space_dims + action_space_dims,
                    )
                    mlps.append(mlp)

                delta_preds.append(mlp.output_var)

            self.delta_pred = tf.stack(
                delta_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)

            # define loss and train_op
            if loss_str == 'L2':
                self.loss = tf.reduce_mean(
                    tf.linalg.norm(self.delta_ph[:, :, None] - self.delta_pred,
                                   axis=1))
            elif loss_str == 'MSE':
                self.loss = tf.reduce_mean(
                    (self.delta_ph[:, :, None] - self.delta_pred)**2)
            else:
                raise NotImplementedError

            self.optimizer = optimizer(learning_rate=self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            self.f_delta_pred = compile_function([self.obs_ph, self.act_ph],
                                                 self.delta_pred)
        """ computation graph for inference where each of the models receives a different batch"""
        with tf.variable_scope(name, reuse=True):
            # placeholders
            self.obs_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, obs_space_dims))
            self.act_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, action_space_dims))
            self.delta_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, obs_space_dims))

            # split stack into the batches for each model --> assume each model receives a batch of the same size
            self.obs_model_batches = tf.split(self.obs_model_batches_stack_ph,
                                              self.num_models,
                                              axis=0)
            self.act_model_batches = tf.split(self.act_model_batches_stack_ph,
                                              self.num_models,
                                              axis=0)
            self.delta_model_batches = tf.split(
                self.delta_model_batches_stack_ph, self.num_models, axis=0)

            # reuse previously created MLP but each model receives its own batch
            delta_preds = []
            self.obs_next_pred = []
            self.loss_model_batches = []
            self.train_op_model_batches = []
            for i in range(num_models):
                with tf.variable_scope('model_{}'.format(i), reuse=True):
                    # concatenate action and observation --> NN input
                    nn_input = tf.concat(
                        [self.obs_model_batches[i], self.act_model_batches[i]],
                        axis=1)
                    mlp = MLP(name + '/model_{}'.format(i),
                              output_dim=obs_space_dims,
                              hidden_sizes=hidden_sizes,
                              hidden_nonlinearity=hidden_nonlinearity,
                              output_nonlinearity=output_nonlinearity,
                              input_var=nn_input,
                              input_dim=obs_space_dims + action_space_dims,
                              weight_normalization=weight_normalization)

                delta_preds.append(mlp.output_var)

                # define loss and train_op
                if loss_str == 'L2':
                    loss = tf.reduce_mean(
                        tf.linalg.norm(self.delta_model_batches[i] -
                                       mlp.output_var,
                                       axis=1))
                elif loss_str == 'MSE':
                    loss = tf.reduce_mean(
                        (self.delta_model_batches[i] - mlp.output_var)**2)
                else:
                    raise NotImplementedError
                self.loss_model_batches.append(loss)
                self.train_op_model_batches.append(
                    optimizer(learning_rate=self.learning_rate).minimize(loss))

            self.delta_pred_model_batches_stack = tf.concat(
                delta_preds,
                axis=0)  # shape: (batch_size_per_model*num_models, ndim_obs)

            # tensor_utils
            self.f_delta_pred_model_batches = compile_function([
                self.obs_model_batches_stack_ph,
                self.act_model_batches_stack_ph
            ], self.delta_pred_model_batches_stack)

        self._networks = mlps
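
A sketch (assumed usage, not from the library) of how a feed for the stacked per-model placeholders obs_model_batches_stack_ph / act_model_batches_stack_ph can be built: draw an independent minibatch for each model and concatenate the minibatches along the batch axis in model order, matching the tf.split above:

    import numpy as np

    num_models, batch_size_per_model, obs_dim, act_dim = 5, 64, 11, 3
    dataset_obs = np.random.randn(10000, obs_dim).astype(np.float32)   # stand-in dataset
    dataset_act = np.random.randn(10000, act_dim).astype(np.float32)

    obs_chunks, act_chunks = [], []
    for _ in range(num_models):
        idx = np.random.randint(0, len(dataset_obs), size=batch_size_per_model)
        obs_chunks.append(dataset_obs[idx])
        act_chunks.append(dataset_act[idx])

    obs_stack = np.concatenate(obs_chunks, axis=0)   # (num_models * batch_size_per_model, obs_dim)
    act_stack = np.concatenate(act_chunks, axis=0)
    # feed_dict = {model.obs_model_batches_stack_ph: obs_stack,
    #              model.act_model_batches_stack_ph: act_stack}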