Example #1
    def __init__(self,
                 policy,
                 supervised_model=None,
                 supervised_ground_truth='teacher',
                 name="ppo",
                 learning_rate=1e-3,
                 clip_eps=0.2,
                 max_epochs=5,
                 max_epochs_r=20,
                 entropy_bonus=0.,
                 reward_predictor=None,
                 reward_predictor_type='gaussian',
                 grad_clip_threshold=None,
                 **kwargs):

        # TODO: Check to avoid duplicates of variables and scopes
        self.reward_predictor = reward_predictor
        Serializable.quick_init(self, locals())
        super(PPO, self).__init__(policy)

        self.recurrent = getattr(self.policy, 'recurrent', False)
        self.supervised_model = supervised_model
        if self.recurrent:
            backprop_steps = kwargs.get('backprop_steps', 32)
            self.optimizer = RL2FirstOrderOptimizer(
                learning_rate=learning_rate,
                max_epochs=max_epochs,
                backprop_steps=backprop_steps,
                grad_clip_threshold=grad_clip_threshold)
            if self.reward_predictor is not None:
                self.optimizer_r = RL2FirstOrderOptimizer(
                    learning_rate=learning_rate,
                    max_epochs=max_epochs_r,
                    backprop_steps=backprop_steps,
                    grad_clip_threshold=grad_clip_threshold)
            if self.supervised_model is not None:
                self.optimizer_s = RL2FirstOrderOptimizer(
                    learning_rate=learning_rate,
                    max_epochs=max_epochs_r,
                    backprop_steps=backprop_steps,
                    grad_clip_threshold=grad_clip_threshold)
        else:
            self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate,
                                                 max_epochs=max_epochs)
        # TODO figure out what this does
        self._optimization_keys = [
            'observations', 'actions', 'advantages', 'rewards', 'agent_infos',
            'env_infos'
        ]
        self._optimization_r_keys = [
            'observations', 'actions', 'advantages', 'rewards', 'agent_infos',
            'env_infos'
        ]
        self.name = name
        self._clip_eps = clip_eps
        self.entropy_bonus = entropy_bonus
        self.supervised_ground_truth = supervised_ground_truth
        self.reward_predictor_type = reward_predictor_type

        self.build_graph()
Example #2
    def __init__(self,
                 obs_dim,
                 action_dim,
                 name='v_fun',
                 hidden_sizes=(256, 256),
                 hidden_nonlinearity=tf.tanh,
                 output_nonlinearity=None,
                 **kwargs):
        # store the init args for serialization and call the super constructors
        Serializable.quick_init(self, locals())
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.name = name
        self.hidden_sizes = hidden_sizes
        self.hidden_nonlinearity = hidden_nonlinearity
        self.output_nonlinearity = output_nonlinearity

        self.vfun_params = None
        self.input_var = None
        self.qval_var = None
        self.log_std_var = None
        self.action_var = None
        self._assign_ops = None

        self.build_graph()
Example #3
    def __init__(self,
                 obs_dim,
                 action_dim,
                 name='policy',
                 hidden_sizes=(32, 32),
                 learn_std=True,
                 hidden_nonlinearity=tf.tanh,
                 output_nonlinearity=None,
                 **kwargs
                 ):
        Serializable.quick_init(self, locals()) 

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.name = name

        self.hidden_sizes = hidden_sizes
        self.learn_std = learn_std
        self.hidden_nonlinearity = hidden_nonlinearity
        self.output_nonlinearity = output_nonlinearity

        self._dist = None
        self.policy_params = None
        self._assign_ops = None
        self._assign_phs = None
        self.policy_params_keys = None
        self.policy_params_ph = None
Example #4
    def __init__(
            self,
            policy,
            name="ppo",
            learning_rate=1e-3,
            clip_eps=0.2,
            max_epochs=5,
            entropy_bonus=0.,
            **kwargs
            ):
        Serializable.quick_init(self, locals())
        super(PPO, self).__init__(policy)

        self.recurrent = getattr(self.policy, 'recurrent', False)
        if self.recurrent:
            backprop_steps = kwargs.get('backprop_steps', 32)
            self.optimizer = RL2FirstOrderOptimizer(learning_rate=learning_rate, max_epochs=max_epochs,
                                                    backprop_steps=backprop_steps)
        else:
            self.optimizer = FirstOrderOptimizer(learning_rate=learning_rate, max_epochs=max_epochs)
        self._optimization_keys = ['observations', 'actions', 'advantages', 'agent_infos']
        self.name = name
        self._clip_eps = clip_eps
        self.entropy_bonus = entropy_bonus

        self.build_graph()
Example #5
    def __init__(self,
                 *args,
                 init_std=1.,
                 min_std=1e-6,
                 cell_type='lstm',
                 **kwargs):
        # store the init args for serialization and call the super constructors
        Serializable.quick_init(self, locals())
        Policy.__init__(self, *args, **kwargs)

        self.min_log_std = np.log(min_std)
        self.init_log_std = np.log(init_std)

        self.init_policy = None
        self.policy_params = None
        self.obs_var = None
        self.mean_var = None
        self.log_std_var = None
        self.action_var = None
        self._dist = None
        self._hidden_state = None
        self.recurrent = True
        self._cell_type = cell_type

        self.build_graph()
        self._zero_hidden = self.cell.zero_state(1, tf.float32)
Example #6
    def __init__(
        self,
        tf_optimizer_cls=tf.train.AdamOptimizer,
        tf_optimizer_args=None,
        learning_rate=1e-3,
        max_epochs=1,
        tolerance=1e-6,
        num_minibatches=1,
        verbose=False,
    ):

        Serializable.quick_init(self, locals())
        self._target = None
        if tf_optimizer_args is None:
            tf_optimizer_args = dict()
        tf_optimizer_args['learning_rate'] = learning_rate

        self._tf_optimizer = tf_optimizer_cls(**tf_optimizer_args)
        self._max_epochs = max_epochs
        self._tolerance = tolerance
        self._verbose = verbose
        self._num_minibatches = num_minibatches
        self._all_inputs = None
        self._train_op = None
        self._loss = None
        self._input_ph_dict = None
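For orientation, a hypothetical instantiation consistent with the signature above. The snippet itself does not name the class; assuming it is the FirstOrderOptimizer referenced in Examples #1 and #4 (which pass learning_rate and max_epochs the same way), a call could look like the sketch below. The import paths and the extra Adam arguments are illustrative assumptions only.

import tensorflow as tf
from meta_mb.optimizers.first_order_optimizer import FirstOrderOptimizer  # import path assumed

# learning_rate is injected into tf_optimizer_args by the constructor,
# so only the remaining Adam hyperparameters need to be passed explicitly
optimizer = FirstOrderOptimizer(
    tf_optimizer_cls=tf.train.AdamOptimizer,
    tf_optimizer_args=dict(beta1=0.9, epsilon=1e-8),
    learning_rate=1e-3,
    max_epochs=5,
    num_minibatches=1,
    verbose=True,
)
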
Example #7
    def __getstate__(self):
        state = {
            'init_args': Serializable.__getstate__(self),
            'network_params': self.get_params(),
            'filter': [obs_filter.get_params() for obs_filter in self.obs_filters],
        }
        return state
Example #8
    def __getstate__(self):
        state = dict()
        state['init_args'] = Serializable.__getstate__(self)
        print('getstate\n')
        print(state['init_args'])
        state['policy'] = self.policy.__getstate__()
        state['optimizer'] = self.optimizer.__getstate__()
        return state
Example #9
    def __init__(self, *args, init_std=1., min_std=1e-6, **kwargs):
        # store the init args for serialization and call the super constructors
        Serializable.quick_init(self, locals())
        Policy.__init__(self, *args, **kwargs)

        self.min_log_std = np.log(min_std)
        self.init_log_std = np.log(init_std)

        self.init_policy = None
        self.policy_params = None
        self.obs_var = None
        self.mean_var = None
        self.log_std_var = None
        self.action_var = None
        self._dist = None

        self.build_graph()
Example #10
    def __init__(self,
                 obs_dim,
                 action_dim,
                 name='np_policy',
                 **kwargs
                 ):
        Serializable.quick_init(self, locals())

        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.name = name

        self._dist = None
        self.policy_params = None
        self.policy_params_batch = None
        self._num_deltas = None
        self.obs_filters = [Filter((self.obs_dim,))]
Example #11
    def __init__(self,
                 obs_dim,
                 action_dim,
                 name='np_policy',
                 hidden_sizes=(64, 64),
                 hidden_nonlinearity='tanh',
                 output_nonlinearity=None,
                 normalization='first',
                 **kwargs):
        Serializable.quick_init(self, locals())
        NpPolicy.__init__(self, obs_dim, action_dim, name, **kwargs)

        assert normalization in ['all', 'first', None, 'none']

        self.obs_filter = MeanStdFilter(shape=(obs_dim, ))
        self.hidden_nonlinearity = self._activations[hidden_nonlinearity]
        self.output_nonlinearity = self._activations[output_nonlinearity]
        self.hidden_sizes = hidden_sizes
        self.policy_params = OrderedDict()

        self.obs_filters = []
        prev_size = obs_dim
        for i, hidden_size in enumerate(hidden_sizes):
            W = np.zeros((hidden_size, prev_size), dtype=np.float64)
            b = np.zeros((hidden_size, ))

            self.policy_params['W_%d' % i] = W
            self.policy_params['b_%d' % i] = b

            if normalization == 'all' or (normalization == 'first' and i == 0):
                self.obs_filters.append(MeanStdFilter(shape=(prev_size, )))
            else:
                self.obs_filters.append(Filter(shape=(prev_size, )))

            prev_size = hidden_size

        if normalization == 'all' or (normalization == 'first'
                                      and len(hidden_sizes) == 0):
            self.obs_filters.append(MeanStdFilter(shape=(prev_size, )))
        else:
            self.obs_filters.append(Filter(shape=(prev_size, )))

        W = np.zeros((action_dim, prev_size), dtype=np.float64)
        b = np.zeros((action_dim, ))
        self.policy_params['W_out'] = W
        self.policy_params['b_out'] = b
Example #12
    def __setstate__(self, state):
        Serializable.__setstate__(self, state['init_args'])
        self.policy.__setstate__(state['policy'])
        self.optimizer.__setstate__(state['optimizer'])
Example #13
    def __setstate__(self, state):
        Serializable.__setstate__(self, state['init_args'])
        # tf.get_default_session().run(tf.global_variables_initializer())
        self.set_params(state['network_params'])
Example #14
    def __getstate__(self):
        state = {
            'init_args': Serializable.__getstate__(self),
            'network_params': self.get_param_values()
        }
        return state
Example #15
from meta_mb.logger import logger
Example #16
    def __setstate__(self, state):
        Serializable.__setstate__(self, state['init_args'])
        self.set_params(state['network_params'])
        for obs_filter, params in zip(self.obs_filters, state['filter']):
            obs_filter.set_params(params)
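Taken together, Examples #13, #14 and #16 show the save/restore pattern these classes share: the constructor records its arguments with Serializable.quick_init, __getstate__ stores those arguments next to the current parameter values, and __setstate__ rebuilds the object and pushes the parameters back in. Below is a minimal self-contained sketch of that pattern; the import path and the get_params/set_params helpers are assumptions for illustration, only the three Serializable calls themselves appear in the examples above.

from meta_mb.utils.serializable import Serializable  # import path assumed

class ToyModel(Serializable):
    def __init__(self, obs_dim, action_dim):
        # record constructor args so the object can be re-created on unpickling
        Serializable.quick_init(self, locals())
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self._params = {'W': [[0.0] * obs_dim for _ in range(action_dim)]}

    def get_params(self):
        return self._params

    def set_params(self, params):
        self._params = params

    def __getstate__(self):
        # constructor args plus the current parameter values, as in Example #14
        return {'init_args': Serializable.__getstate__(self),
                'network_params': self.get_params()}

    def __setstate__(self, state):
        # rebuild from the recorded args, then restore the parameters, as in Example #16
        Serializable.__setstate__(self, state['init_args'])
        self.set_params(state['network_params'])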