Example #1
    def __init__(
        self,
        env,
        policy,
        num_rollouts,
        max_path_length,
        n_parallel=1,
        vae=None,
    ):
        Serializable.quick_init(self, locals())
        super(Sampler, self).__init__(env, policy, n_parallel, max_path_length)

        self.total_samples = num_rollouts * max_path_length
        self.n_parallel = n_parallel
        self.total_timesteps_sampled = 0
        self.vae = vae

        # setup vectorized environment
        if self.n_parallel > 1:
            self.vec_env = ParallelEnvExecutor(env, n_parallel, num_rollouts,
                                               self.max_path_length)
        else:
            self.vec_env = IterativeEnvExecutor(env, num_rollouts,
                                                self.max_path_length)
Example #2
    def __init__(self,
                 name,
                 input_dim,
                 output_dim,
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.relu,
                 output_nonlinearity=None,
                 input_var=None,
                 **kwargs):
        Serializable.quick_init(self, locals())

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.name = name
        self.input_var = input_var

        self.hidden_sizes = hidden_sizes
        self.hidden_nonlinearity = hidden_nonlinearity
        self.output_nonlinearity = output_nonlinearity

        self.batch_normalization = kwargs.get('batch_normalization', False)

        self._params = None
        self._assign_ops = None
        self._assign_phs = None
Example #3
    def __init__(self,
                 name,
                 env,
                 dynamics_model,
                 reward_model=None,
                 discount=1,
                 use_cem=False,
                 n_candidates=1024,
                 horizon=10,
                 num_cem_iters=8,
                 percent_elites=0.05,
                 use_reward_model=False):
        self.dynamics_model = dynamics_model
        self.reward_model = reward_model
        self.discount = discount
        self.n_candidates = n_candidates
        self.horizon = horizon
        self.use_cem = use_cem
        self.num_cem_iters = num_cem_iters
        self.percent_elites = percent_elites
        self.env = env
        self.use_reward_model = use_reward_model
        self._hidden_state = None

        self.unwrapped_env = env
        while hasattr(self.unwrapped_env, 'wrapped_env'):
            self.unwrapped_env = self.unwrapped_env.wrapped_env

        # make sure that the env has a reward function
        if not self.use_reward_model:
            assert hasattr(self.unwrapped_env,
                           'reward'), "env must have a reward function"

        Serializable.quick_init(self, locals())
Example #4
    def __init__(self, *args, **kwargs):
        # store the init args for serialization and call the super constructors
        Serializable.quick_init(self, locals())
        Layer.__init__(self, *args, **kwargs)
        self._cell_type = kwargs.get('cell_type', 'gru')
        self.state_var = kwargs.get('state_var', None)

        self.build_graph()
Example #5
    def __init__(self, *args, **kwargs):
        # store the init args for serialization and call the super constructors
        Serializable.quick_init(self, locals())
        Layer.__init__(self, *args, **kwargs)
        self.num_filters = kwargs.get('num_filters')
        self.kernel_sizes = kwargs.get('kernel_sizes')
        self.strides = kwargs.get('strides')
        self.hidden_nonlinearity = kwargs.get('hidden_nonlinearity')
        self.output_nonlinearity = kwargs.get('output_nonlinearity')

        self.build_graph()
Example #6
    def __init__(self, obs_dim, object_dim, max_replay_buffer_size):
        super(ObsReplayBuffer, self).__init__()
        Serializable.quick_init(self, locals())

        max_replay_buffer_size = int(max_replay_buffer_size)

        self._observation_dim = obs_dim
        self._object_dim = object_dim
        self._max_buffer_size = max_replay_buffer_size
        self._observations = np.zeros(
            (max_replay_buffer_size, self._observation_dim))
        self._top = 0
        self._size = 0
Example #7
    def __getstate__(self):
        # state = LayersPowered.__getstate__(self)
        state = dict()
        state['init_args'] = Serializable.__getstate__(self)
        state['normalization'] = self.normalization
        state['networks'] = [nn.__getstate__() for nn in self._networks]
        return state
Example #8
    def __init__(self,
                 obs_dim,
                 act_dim,
                 max_replay_buffer_size,
                 episode_size=100):
        super(SequenceReplayBuffer, self).__init__()
        Serializable.quick_init(self, locals())

        max_replay_buffer_size = int(max_replay_buffer_size)

        self._observation_dim = obs_dim
        self._action_dim = act_dim
        self._max_buffer_size = max_replay_buffer_size
        self._episode_size = episode_size
        self._neps = (max_replay_buffer_size // episode_size) + 1
        self._observations = np.zeros(
            (self._neps, episode_size, self._observation_dim))
        self._actions = np.zeros(
            (self._neps, episode_size - 1, self._action_dim))
        self._size = 0
        self._episode = 0
Example #9
    def __setstate__(self, state):
        # LayersPowered.__setstate__(self, state)
        Serializable.__setstate__(self, state['init_args'])
        self.normalization = state['normalization']
        for i in range(len(self._networks)):
            self._networks[i].__setstate__(state['networks'][i])
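Examples #7 and #9 together implement pickle's state protocol for the model; a minimal round-trip sketch (model is a hypothetical instance of the class defining these methods):

    import pickle

    # __getstate__ packs init args, normalization stats, and per-network states;
    # __setstate__ restores them into the rebuilt object
    restored = pickle.loads(pickle.dumps(model))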
Example #10
    def __init__(
        self,
        name,
        env,
        hidden_sizes=(500, 500),
        hidden_nonlinearity="tanh",
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        weight_normalization=True,
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,
        rolling_average_persitency=0.99,
        buffer_size=100000,
    ):

        Serializable.quick_init(self, locals())

        self.normalization = None
        self.normalize_input = normalize_input
        self.use_reward_model = False
        self.buffer_size = buffer_size
        self.name = name
        self.hidden_sizes = hidden_sizes

        self._dataset_train = None
        self._dataset_test = None
        self.next_batch = None

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency
        self.hidden_nonlinearity = hidden_nonlinearity = self._activations[
            hidden_nonlinearity]
        self.output_nonlinearity = output_nonlinearity = self._activations[
            output_nonlinearity]

        with tf.variable_scope(name):
            self.batch_size = batch_size
            self.learning_rate = learning_rate

            # determine dimensionality of state and action space
            self.obs_space_dims = env.observation_space.shape[0]
            self.action_space_dims = env.action_space.shape[0]

            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, self.obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, self.action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, self.obs_space_dims))

            self._create_stats_vars()

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            # create MLP
            mlp = MLP(name,
                      output_dim=self.obs_space_dims,
                      hidden_sizes=hidden_sizes,
                      hidden_nonlinearity=hidden_nonlinearity,
                      output_nonlinearity=output_nonlinearity,
                      input_var=self.nn_input,
                      input_dim=self.obs_space_dims + self.action_space_dims,
                      weight_normalization=weight_normalization)

            self.delta_pred = mlp.output_var

            # define loss and train_op
            self.loss = tf.reduce_mean(
                tf.linalg.norm(self.delta_ph - self.delta_pred, axis=-1))
            self.optimizer = optimizer(self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            self.f_delta_pred = compile_function([self.obs_ph, self.act_ph],
                                                 self.delta_pred)

        self._networks = [mlp]
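Here compile_function wraps the graph into a plain callable, so inference reduces to feeding NumPy batches; a sketch, assuming a default TF session is active and model is a hypothetical instance:

    import numpy as np

    obs = np.zeros((32, model.obs_space_dims), dtype=np.float32)  # hypothetical batch
    act = np.zeros((32, model.action_space_dims), dtype=np.float32)
    deltas = model.f_delta_pred(obs, act)  # predicted state deltas, shape (32, obs_dim)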
Example #11
    def __init__(
        self,
        name,
        env,
        num_models=5,
        hidden_sizes=(512, 512),
        hidden_nonlinearity='swish',
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        weight_normalization=False,  # Doesn't work
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,
        rolling_average_persitency=0.99,
        early_stopping=0,
        buffer_size=50000,
    ):

        Serializable.quick_init(self, locals())

        max_logvar = 0.5
        min_logvar = -10

        self.normalization = None
        self.normalize_input = normalize_input
        self.next_batch = None

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency

        self.buffer_size_train = int(buffer_size * (1 - valid_split_ratio))
        self.buffer_size_test = int(buffer_size * valid_split_ratio)
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_models = num_models
        self.name = name
        self.hidden_sizes = hidden_sizes
        self._dataset_train = None
        self._dataset_test = None

        # determine dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
        self.action_space_dims = action_space_dims = env.action_space.shape[0]
        self.timesteps_counter = 0
        self.used_timesteps_counter = 0

        hidden_nonlinearity = self._activations[hidden_nonlinearity]
        output_nonlinearity = self._activations[output_nonlinearity]
        self.hidden_nonlinearity = hidden_nonlinearity
        self.output_nonlinearity = output_nonlinearity
        self.early_stopping = early_stopping
        """ computation graph for training and simple inference """
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            self._create_stats_vars()

            self.max_logvar = tf.Variable(np.ones([1, obs_space_dims]) *
                                          max_logvar,
                                          dtype=tf.float32,
                                          trainable=True,
                                          name="max_logvar")
            self.min_logvar = tf.Variable(np.ones([1, obs_space_dims]) *
                                          min_logvar,
                                          dtype=tf.float32,
                                          trainable=True,
                                          name="min_logvar")
            self._create_assign_ph()

            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, obs_space_dims))

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            obs_ph = tf.split(self.nn_input, self.num_models, axis=0)

            # create MLP
            mlps = []
            delta_preds = []
            var_preds = []
            logvar_preds = []
            invar_preds = []
            self.obs_next_pred = []
            for i in range(num_models):
                with tf.variable_scope('model_{}'.format(i)):
                    mlp = MLP(
                        name + '/model_{}'.format(i),
                        output_dim=2 * obs_space_dims,
                        hidden_sizes=hidden_sizes,
                        hidden_nonlinearity=hidden_nonlinearity,
                        output_nonlinearity=output_nonlinearity,
                        input_var=obs_ph[i],
                        input_dim=obs_space_dims + action_space_dims,
                        # FIXME: input weight_normalization?
                    )
                    mlps.append(mlp)

                mean, logvar = tf.split(mlp.output_var, 2, axis=-1)
                logvar = self.max_logvar - tf.nn.softplus(self.max_logvar -
                                                          logvar)
                logvar = self.min_logvar + tf.nn.softplus(logvar -
                                                          self.min_logvar)
                var = tf.exp(logvar)
                inv_var = tf.exp(-logvar)

                delta_preds.append(mean)
                logvar_preds.append(logvar)
                var_preds.append(var)
                invar_preds.append(inv_var)

            self.delta_pred = tf.stack(
                delta_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)
            self.logvar_pred = tf.stack(
                logvar_preds,
                axis=2)  # shape: (batch_size, ndim_obs, n_models)
            self.var_pred = tf.stack(
                var_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)
            self.invar_pred = tf.stack(
                invar_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)

            # define loss and train_op
            self.loss = tf.reduce_mean(
                tf.square(self.delta_ph[:, :, None] - self.delta_pred) *
                self.invar_pred + self.logvar_pred)
            self.loss += 0.01 * tf.reduce_mean(
                self.max_logvar) - 0.01 * tf.reduce_mean(self.min_logvar)
            self.optimizer = optimizer(learning_rate=self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            self.f_delta_pred = compile_function([self.obs_ph, self.act_ph],
                                                 self.delta_pred)
            self.f_var_pred = compile_function([self.obs_ph, self.act_ph],
                                               self.var_pred)
        """ computation graph for inference where each of the models receives a different batch"""
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            # placeholders
            self.obs_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, obs_space_dims))
            self.act_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, action_space_dims))
            self.delta_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, obs_space_dims))

            # split stack into the batches for each model --> assume each model receives a batch of the same size
            self.obs_model_batches = tf.split(self.obs_model_batches_stack_ph,
                                              self.num_models,
                                              axis=0)
            self.act_model_batches = tf.split(self.act_model_batches_stack_ph,
                                              self.num_models,
                                              axis=0)
            self.delta_model_batches = tf.split(
                self.delta_model_batches_stack_ph, self.num_models, axis=0)

            # reuse previously created MLP but each model receives its own batch
            delta_preds = []
            var_preds = []
            self.obs_next_pred = []
            self.loss_model_batches = []
            self.train_op_model_batches = []
            for i in range(num_models):
                with tf.variable_scope('model_{}'.format(i), reuse=True):
                    # concatenate action and observation --> NN input
                    nn_input = tf.concat(
                        [self.obs_model_batches[i], self.act_model_batches[i]],
                        axis=1)
                    mlp = MLP(name + '/model_{}'.format(i),
                              output_dim=2 * obs_space_dims,
                              hidden_sizes=hidden_sizes,
                              hidden_nonlinearity=hidden_nonlinearity,
                              output_nonlinearity=output_nonlinearity,
                              input_var=nn_input,
                              input_dim=obs_space_dims + action_space_dims,
                              weight_normalization=weight_normalization)

                mean, logvar = tf.split(mlp.output_var, 2, axis=-1)
                logvar = self.max_logvar - tf.nn.softplus(self.max_logvar -
                                                          logvar)
                logvar = self.min_logvar + tf.nn.softplus(logvar -
                                                          self.min_logvar)
                var = tf.exp(logvar)
                inv_var = tf.exp(-logvar)

                loss = tf.reduce_mean(
                    tf.square(self.delta_model_batches[i] - mean) * inv_var +
                    logvar)
                loss += (0.01 * tf.reduce_mean(self.max_logvar) -
                         0.01 * tf.reduce_mean(self.min_logvar))

                delta_preds.append(mean)
                var_preds.append(var)
                self.loss_model_batches.append(loss)
                self.train_op_model_batches.append(
                    optimizer(learning_rate=self.learning_rate).minimize(loss))

            self.delta_pred_model_batches_stack = tf.concat(
                delta_preds,
                axis=0)  # shape: (batch_size_per_model*num_models, ndim_obs)
            self.var_pred_model_batches_stack = tf.concat(var_preds, axis=0)

            # tensor_utils
            self.f_delta_pred_model_batches = compile_function([
                self.obs_model_batches_stack_ph,
                self.act_model_batches_stack_ph
            ], self.delta_pred_model_batches_stack)

            self.f_var_pred_model_batches = compile_function([
                self.obs_model_batches_stack_ph,
                self.act_model_batches_stack_ph
            ], self.var_pred_model_batches_stack)

        self._networks = mlps
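The paired softplus lines above implement the PETS-style soft clamping of the predicted log-variance: logvar is smoothly squashed into the learned (min_logvar, max_logvar) band while remaining differentiable. A standalone NumPy sketch of the same transform (the bounds are illustrative):

    import numpy as np

    def softplus(x):
        return np.logaddexp(0.0, x)  # numerically stable log(1 + exp(x))

    def clamp_logvar(logvar, min_lv=-10.0, max_lv=0.5):
        logvar = max_lv - softplus(max_lv - logvar)  # soft upper bound
        return min_lv + softplus(logvar - min_lv)    # soft lower bound

    print(clamp_logvar(np.array([-100.0, 0.0, 100.0])))  # ~[-10.0, -0.47, 0.5]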
Example #12
    def __setstate__(self, state):
        Serializable.__setstate__(self, state['init_args'])
        # restore the policy's dumped state (see __getstate__ below)
        self.policy.__setstate__(state['policy'])
Example #13
    def __getstate__(self):
        state = dict()
        state['init_args'] = Serializable.__getstate__(self)
        # dumps policy
        state['policy'] = self.policy.__getstate__()
        return state
Example #14
    def __init__(
        self,
        name,
        Qs,
        env,
        dynamics_model,
        reward_model=None,
        discount=1,
        use_cem=False,
        n_candidates=1024,
        horizon=10,
        num_cem_iters=8,
        percent_elites=0.1,
        use_reward_model=False,
        alpha=0.1,
        num_particles=20,
        use_graph=True,
    ):
        Serializable.quick_init(self, locals())
        self.dynamics_model = dynamics_model
        self.reward_model = reward_model
        self.discount = discount
        self.n_candidates = n_candidates
        self.horizon = horizon
        self.use_cem = use_cem
        self.num_cem_iters = num_cem_iters
        self.percent_elites = percent_elites
        self.num_elites = int(percent_elites * n_candidates)
        self.env = env
        self.use_reward_model = use_reward_model
        self.alpha = alpha
        self.num_particles = num_particles
        self.use_graph = use_graph
        self.Qs = Qs

        self.unwrapped_env = env
        while hasattr(self.unwrapped_env, '_wrapped_env'):
            self.unwrapped_env = self.unwrapped_env._wrapped_env

        assert len(env.observation_space.shape) == 1
        assert len(env.action_space.shape) == 1
        self.obs_space_dims = env.observation_space.shape[0]
        self.action_space_dims = env.action_space.shape[0]

        # make sure that the env has a reward function
        assert hasattr(self.unwrapped_env,
                       'reward'), "env must have a reward function"

        if use_graph:
            self.obs_ph = tf.placeholder(dtype=tf.float32,
                                         shape=(None, self.obs_space_dims),
                                         name='obs')
            self.mean = tf.placeholder(dtype=tf.float32,
                                       shape=(self.horizon + 1,
                                              self.action_space_dims),
                                       name='mean')
            self.std = tf.placeholder(dtype=tf.float32,
                                      shape=(self.horizon + 1,
                                             self.action_space_dims),
                                      name='std')
            self.optimal_action = None
            if not use_cem:
                self.build_rs_graph()
            else:
                self.build_cem_graph()
Example #15
    def __setstate__(self, state):
        Serializable.__setstate__(self, state['init_args'])
Example #16
    def __getstate__(self):
        state = dict()
        state['init_args'] = Serializable.__getstate__(self)
        return state
Example #17
    def __init__(self, *args, **kwargs):
        # store the init args for serialization and call the super constructors
        Serializable.quick_init(self, locals())
        Layer.__init__(self, *args, **kwargs)

        self.build_graph()
Example #18
    def __getstate__(self):
        # state = LayersPowered.__getstate__(self)
        state = dict()
        state['init_args'] = Serializable.__getstate__(self)
        return state
Example #19
    def __init__(
        self,
        name,
        env,
        hidden_sizes=(512, 512),
        hidden_nonlinearity='swish',
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        weight_normalization=False,  # Doesn't work
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,
        rolling_average_persitency=0.99,
        buffer_size=50000,
    ):

        Serializable.quick_init(self, locals())

        max_logvar = 0.0
        min_logvar = -10

        self.normalization = None
        self.normalize_input = normalize_input
        self.next_batch = None

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency

        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.name = name
        self._dataset_train = None
        self._dataset_test = None

        # determine dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
        self.action_space_dims = action_space_dims = env.action_space.shape[0]

        self.hidden_nonlinearity = self._activations[hidden_nonlinearity]
        self.output_nonlinearity = self._activations[output_nonlinearity]
        self.hidden_sizes = hidden_sizes
        """ computation graph for training and simple inference """
        with tf.variable_scope(name):
            self.max_logvar = tf.Variable(np.ones([1, obs_space_dims]) *
                                          max_logvar,
                                          dtype=tf.float32,
                                          name="max_logvar")
            self.min_logvar = tf.Variable(np.ones([1, obs_space_dims]) *
                                          min_logvar,
                                          dtype=tf.float32,
                                          name="min_logvar")

            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, obs_space_dims))

            self._create_stats_vars()

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            # create MLP
            delta_preds = []
            var_preds = []
            self.obs_next_pred = []
            with tf.variable_scope('dynamics_model'):
                mlp = MLP(
                    name,
                    output_dim=2 * obs_space_dims,
                    hidden_sizes=self.hidden_sizes,
                    hidden_nonlinearity=self.hidden_nonlinearity,
                    output_nonlinearity=self.output_nonlinearity,
                    input_var=self.nn_input,
                    input_dim=obs_space_dims + action_space_dims,
                )

            mean, logvar = tf.split(mlp.output_var, 2, axis=-1)
            logvar = self.max_logvar - tf.nn.softplus(self.max_logvar - logvar)
            logvar = self.min_logvar + tf.nn.softplus(logvar - self.min_logvar)
            var = tf.exp(logvar)

            self.delta_pred = mean
            self.var_pred = var

            # define loss and train_op
            self.loss = tf.reduce_mean((self.delta_ph - self.delta_pred)**2 /
                                       self.var_pred + tf.log(self.var_pred))
            self.loss += 0.01 * tf.reduce_mean(
                self.max_logvar) - 0.01 * tf.reduce_mean(self.min_logvar)
            self.optimizer = optimizer(learning_rate=self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            self.f_delta_pred = compile_function([self.obs_ph, self.act_ph],
                                                 self.delta_pred)
            self.f_var_pred = compile_function([self.obs_ph, self.act_ph],
                                               self.var_pred)
        """ computation graph for inference where each of the models receives a different batch"""
        self._networks = [mlp]
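Up to additive constants, the loss in Example #19 is the Gaussian negative log-likelihood of the observed deltas under the predicted mean and variance; a NumPy check of the same expression (arrays are hypothetical):

    import numpy as np

    delta, mean, var = np.zeros((8, 17)), np.zeros((8, 17)), np.ones((8, 17))  # hypothetical
    nll = np.mean((delta - mean) ** 2 / var + np.log(var))  # mirrors self.loss above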
Example #20
    def __init__(
        self,
        name,
        env,
        num_models=5,
        hidden_sizes=(512, 512),
        hidden_nonlinearity='swish',
        output_nonlinearity=None,
        batch_size=500,
        learning_rate=0.001,
        weight_normalization=False,  # Doesn't work
        normalize_input=True,
        optimizer=tf.train.AdamOptimizer,
        valid_split_ratio=0.2,  # 0.1
        rolling_average_persitency=0.99,
        buffer_size=50000,
        loss_str='MSE',
    ):

        Serializable.quick_init(self, locals())

        self.normalization = None
        self.normalize_input = normalize_input
        self.next_batch = None

        self.valid_split_ratio = valid_split_ratio
        self.rolling_average_persitency = rolling_average_persitency

        self.buffer_size_train = int(buffer_size * (1 - valid_split_ratio))
        self.buffer_size_test = int(buffer_size * valid_split_ratio)
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_models = num_models
        self.hidden_sizes = hidden_sizes
        self.name = name
        self._dataset_train = None
        self._dataset_test = None

        # determine dimensionality of state and action space
        self.obs_space_dims = obs_space_dims = env.observation_space.shape[0]
        self.action_space_dims = action_space_dims = env.action_space.shape[0]
        self.timesteps_counter = 0
        self.used_timesteps_counter = 0

        self.hidden_nonlinearity = hidden_nonlinearity = self._activations[
            hidden_nonlinearity]
        self.output_nonlinearity = output_nonlinearity = self._activations[
            output_nonlinearity]
        """ computation graph for training and simple inference """
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            # placeholders
            self.obs_ph = tf.placeholder(tf.float32,
                                         shape=(None, obs_space_dims))
            self.act_ph = tf.placeholder(tf.float32,
                                         shape=(None, action_space_dims))
            self.delta_ph = tf.placeholder(tf.float32,
                                           shape=(None, obs_space_dims))

            self._create_stats_vars()

            # concatenate action and observation --> NN input
            self.nn_input = tf.concat([self.obs_ph, self.act_ph], axis=1)

            obs_ph = tf.split(self.nn_input, self.num_models, axis=0)

            # create MLP
            mlps = []
            delta_preds = []
            self.obs_next_pred = []
            for i in range(num_models):
                with tf.variable_scope('model_{}'.format(i),
                                       reuse=tf.AUTO_REUSE):
                    mlp = MLP(
                        name + '/model_{}'.format(i),
                        output_dim=obs_space_dims,
                        hidden_sizes=hidden_sizes,
                        hidden_nonlinearity=hidden_nonlinearity,
                        output_nonlinearity=output_nonlinearity,
                        input_var=obs_ph[i],
                        input_dim=obs_space_dims + action_space_dims,
                    )
                    mlps.append(mlp)

                delta_preds.append(mlp.output_var)

            self.delta_pred = tf.stack(
                delta_preds, axis=2)  # shape: (batch_size, ndim_obs, n_models)

            # define loss and train_op
            if loss_str == 'L2':
                self.loss = tf.reduce_mean(
                    tf.linalg.norm(self.delta_ph[:, :, None] - self.delta_pred,
                                   axis=1))
            elif loss_str == 'MSE':
                self.loss = tf.reduce_mean(
                    (self.delta_ph[:, :, None] - self.delta_pred)**2)
            else:
                raise NotImplementedError

            self.optimizer = optimizer(learning_rate=self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

            # tensor_utils
            self.f_delta_pred = compile_function([self.obs_ph, self.act_ph],
                                                 self.delta_pred)
        """ computation graph for inference where each of the models receives a different batch"""
        with tf.variable_scope(name, reuse=True):
            # placeholders
            self.obs_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, obs_space_dims))
            self.act_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, action_space_dims))
            self.delta_model_batches_stack_ph = tf.placeholder(
                tf.float32, shape=(None, obs_space_dims))

            # split stack into the batches for each model --> assume each model receives a batch of the same size
            self.obs_model_batches = tf.split(self.obs_model_batches_stack_ph,
                                              self.num_models,
                                              axis=0)
            self.act_model_batches = tf.split(self.act_model_batches_stack_ph,
                                              self.num_models,
                                              axis=0)
            self.delta_model_batches = tf.split(
                self.delta_model_batches_stack_ph, self.num_models, axis=0)

            # reuse previously created MLP but each model receives its own batch
            delta_preds = []
            self.obs_next_pred = []
            self.loss_model_batches = []
            self.train_op_model_batches = []
            for i in range(num_models):
                with tf.variable_scope('model_{}'.format(i), reuse=True):
                    # concatenate action and observation --> NN input
                    nn_input = tf.concat(
                        [self.obs_model_batches[i], self.act_model_batches[i]],
                        axis=1)
                    mlp = MLP(name + '/model_{}'.format(i),
                              output_dim=obs_space_dims,
                              hidden_sizes=hidden_sizes,
                              hidden_nonlinearity=hidden_nonlinearity,
                              output_nonlinearity=output_nonlinearity,
                              input_var=nn_input,
                              input_dim=obs_space_dims + action_space_dims,
                              weight_normalization=weight_normalization)

                delta_preds.append(mlp.output_var)
                if loss_str == 'L2':
                    loss = tf.reduce_mean(
                        tf.linalg.norm(self.delta_model_batches[i] -
                                       mlp.output_var,
                                       axis=1))
                elif loss_str == 'MSE':
                    loss = tf.reduce_mean(
                        (self.delta_model_batches[i] - mlp.output_var)**2)
                else:
                    raise NotImplementedError
                self.loss_model_batches.append(loss)
                self.train_op_model_batches.append(
                    optimizer(learning_rate=self.learning_rate).minimize(loss))
            self.delta_pred_model_batches_stack = tf.concat(
                delta_preds,
                axis=0)  # shape: (batch_size_per_model*num_models, ndim_obs)

            # tensor_utils
            self.f_delta_pred_model_batches = compile_function([
                self.obs_model_batches_stack_ph,
                self.act_model_batches_stack_ph
            ], self.delta_pred_model_batches_stack)

        self._networks = mlps
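The stacked placeholders assume callers concatenate one equal-sized batch per model along axis 0, so the tf.split calls recover each model's slice; a sketch of preparing such a feed (dims and model are hypothetical):

    import numpy as np

    num_models, per_model, obs_dim, act_dim = 5, 64, 17, 6  # hypothetical dims
    obs_stack = np.concatenate(
        [np.zeros((per_model, obs_dim), np.float32) for _ in range(num_models)], axis=0)
    act_stack = np.concatenate(
        [np.zeros((per_model, act_dim), np.float32) for _ in range(num_models)], axis=0)
    deltas = model.f_delta_pred_model_batches(obs_stack, act_stack)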
Example #21
    def __setstate__(self, state):
        # LayersPowered.__setstate__(self, state)
        Serializable.__setstate__(self, state['init_args'])