Example #1
    def __init__(self,
                 env_spec,
                 obs_history_len,
                 N,
                 size,
                 labeller=None,
                 save_rollouts=False,
                 save_rollouts_observations=True,
                 save_env_infos=False,
                 alpha=None,
                 beta_schedule=None):
        assert (alpha is not None)
        assert (beta_schedule is not None)

        self._alpha = alpha
        self._beta_schedule = schedules.PiecewiseSchedule(**beta_schedule)
        self._max_pri_set = set()
        super(PERPool, self).__init__(
            env_spec=env_spec,
            obs_history_len=obs_history_len,
            N=N,
            size=size,
            labeller=labeller,
            save_rollouts=save_rollouts,
            save_rollouts_observations=save_rollouts_observations,
            save_env_infos=save_env_infos)
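
In this first example, alpha and beta_schedule configure prioritized experience replay: alpha is the prioritization exponent and beta_schedule is forwarded verbatim to schedules.PiecewiseSchedule to anneal the importance-sampling exponent. A minimal, self-contained sketch of what such a schedule could look like, assuming a baselines-style interface (linear interpolation between (step, value) endpoints and a value(t) query); the stand-in class and the numeric endpoints are illustrative, not taken from the listing:

class PiecewiseSchedule(object):
    """Stand-in mirroring the assumed behavior of schedules.PiecewiseSchedule:
    linear interpolation between (t, value) endpoints, outside_value beyond them."""

    def __init__(self, endpoints, outside_value=None):
        self._endpoints = endpoints
        self._outside_value = outside_value

    def value(self, t):
        for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if l_t <= t < r_t:
                frac = float(t - l_t) / (r_t - l_t)
                return l + frac * (r - l)
        assert self._outside_value is not None
        return self._outside_value


# Hypothetical beta_schedule kwargs as PERPool would forward them:
# the importance-sampling exponent ramps from 0.4 to 1.0 over 1e6 steps.
beta_schedule = dict(endpoints=[(0, 0.4), (int(1e6), 1.0)], outside_value=1.0)
schedule = PiecewiseSchedule(**beta_schedule)

print(schedule.value(0))        # 0.4
print(schedule.value(500000))   # 0.7 (halfway along the linear ramp)
print(schedule.value(2000000))  # 1.0 (past the last endpoint -> outside_value)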
Example #2
    def __init__(self, env_spec, endpoints, outside_value):
        super(EpsilonGreedyStrategy, self).__init__(env_spec)

        self.schedule = schedules.PiecewiseSchedule(
            endpoints=endpoints, outside_value=outside_value)
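
Example #2 only builds the schedule; how the strategy consumes it is not shown in the listing. A plausible sketch of the usual pattern, in which the scheduled value is the exploration rate epsilon at step t (the function name and arguments are assumptions for illustration):

import random

def epsilon_greedy_action(schedule, t, greedy_action, sample_random_action):
    """Sketch: with probability epsilon = schedule.value(t), act randomly;
    otherwise return the greedy action."""
    epsilon = schedule.value(t)
    if random.random() < epsilon:
        return sample_random_action()
    return greedy_action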
Example #3
    def __init__(self, **kwargs):
        self._outputs = kwargs['outputs']
        self._rew_fn = kwargs['rew_fn']

        ### environment
        self._env_spec = kwargs['env_spec']
        self._obs_vec_keys = list(self._env_spec.observation_vec_spec.keys())
        self._action_keys = list(self._env_spec.action_spec.keys())
        self._goal_keys = list(self._env_spec.goal_spec.keys())
        self._output_keys = sorted(
            [output['name'] for output in self._outputs])
        self._obs_im_shape = self._env_spec.observation_im_space.shape

        self._obs_im_dim = np.prod(self._obs_im_shape)
        self._obs_vec_dim = len(self._obs_vec_keys)
        self._action_dim = len(self._action_keys)
        self._goal_dim = len(self._goal_keys)
        self._output_dim = len(self._output_keys)

        ### model horizons
        self._N = kwargs['N']  # number of returns to use (N-step)
        self._H = kwargs['H']  # action planning horizon for training
        self._gamma = kwargs['gamma']  # reward decay
        self._obs_history_len = kwargs[
            'obs_history_len']  # how many previous observations to use

        ### model architecture
        self._inference_only = kwargs.get('inference_only', False)
        self._image_graph = kwargs['image_graph']
        self._observation_graph = kwargs['observation_graph']
        self._action_graph = kwargs['action_graph']
        self._rnn_graph = kwargs['rnn_graph']
        self._output_graph = kwargs['output_graph']
        ### scopes
        self._image_scope = 'image_scope'
        self._observation_scope = 'observation_scope'
        self._action_scope = 'action_scope'
        self._rnn_scope = 'rnn_scope'
        self._output_scope = 'output_scope'

        ### target network
        self._use_target = kwargs['use_target']
        self._separate_target_params = kwargs['separate_target_params']
        ### training
        self._only_completed_episodes = kwargs['only_completed_episodes']
        self._weight_decay = kwargs['weight_decay']
        self._lr_schedule = schedules.PiecewiseSchedule(
            **kwargs['lr_schedule'])
        self._grad_clip_norm = kwargs['grad_clip_norm']
        self._gpu_device = kwargs['gpu_device']
        self._gpu_frac = kwargs['gpu_frac']

        ### action selection and exploration
        self._get_action_test = kwargs['get_action_test']
        self._get_action_target = kwargs['get_action_target']
        assert (self._get_action_target['type'] == 'random')
        gaussian_es_params = kwargs['exploration_strategies'].get(
            'GaussianStrategy', None)
        if gaussian_es_params is not None:
            self._gaussian_es = GaussianStrategy(self._env_spec,
                                                 **gaussian_es_params)
        else:
            self._gaussian_es = None
        epsilon_greedy_es_params = kwargs['exploration_strategies'].get(
            'EpsilonGreedyStrategy', None)
        if epsilon_greedy_es_params is not None:
            self._epsilon_greedy_es = EpsilonGreedyStrategy(
                self._env_spec, **epsilon_greedy_es_params)
        else:
            self._epsilon_greedy_es = None

        ### setup the model
        self._tf_debug = dict()
        self._tf_dict = self._graph_setup()

        ### logging
        self._log_stats = defaultdict(list)

        assert (self._N >= self._H)
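
The lr_schedule entry above is likewise just a dictionary of PiecewiseSchedule kwargs; the current learning rate would then be read off with self._lr_schedule.value(step) at each training step. An illustrative configuration (the numbers are assumptions, not from the listing):

# Hypothetical lr_schedule kwargs: the learning rate decays linearly from
# 1e-3 to 1e-4 over the first 500k steps and stays at 1e-4 afterwards.
lr_schedule = dict(endpoints=[(0, 1.0e-3), (500000, 1.0e-4)],
                   outside_value=1.0e-4)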
Example #4
    def __init__(self, **kwargs):
        ### environment
        self._env_spec = kwargs['env_spec']

        ### model horizons
        self._N = kwargs['N']  # number of returns to use (N-step)
        self._H = kwargs['H']  # action planning horizon for training
        self._gamma = kwargs['gamma']  # reward decay
        self._obs_history_len = kwargs[
            'obs_history_len']  # how many previous observations to use

        ### model architecture
        self._inference_only = kwargs.get('inference_only', False)
        self._image_graph = kwargs['image_graph']
        self._observation_graph = kwargs['observation_graph']
        self._action_graph = kwargs['action_graph']
        self._rnn_graph = kwargs['rnn_graph']
        self._output_graph = kwargs['output_graph']

        ### target network
        self._values_softmax = kwargs[
            'values_softmax']  # which value horizons to train over
        self._use_target = kwargs['use_target']
        self._separate_target_params = kwargs['separate_target_params']
        self._clip_cost_target_with_dones = kwargs[
            'clip_cost_target_with_dones']

        ### training
        self._only_completed_episodes = kwargs['only_completed_episodes']
        self._weight_decay = kwargs['weight_decay']
        self._lr_schedule = schedules.PiecewiseSchedule(
            **kwargs['lr_schedule'])
        self._grad_clip_norm = kwargs['grad_clip_norm']
        self._preprocess_params = kwargs['preprocess']
        self._gpu_device = kwargs['gpu_device']
        self._gpu_frac = kwargs['gpu_frac']

        ### action selection and exploration
        self._get_action_test = kwargs['get_action_test']
        self._get_action_target = kwargs['get_action_target']
        assert (self._get_action_target['type'] == 'random')
        gaussian_es_params = kwargs['exploration_strategies'].get(
            'GaussianStrategy', None)
        if gaussian_es_params is not None:
            self._gaussian_es = GaussianStrategy(self._env_spec,
                                                 **gaussian_es_params)
        else:
            self._gaussian_es = None
        epsilon_greedy_es_params = kwargs['exploration_strategies'].get(
            'EpsilonGreedyStrategy', None)
        if epsilon_greedy_es_params is not None:
            self._epsilon_greedy_es = EpsilonGreedyStrategy(
                self._env_spec, **epsilon_greedy_es_params)
        else:
            self._epsilon_greedy_es = None

        ### setup the model
        self._tf_debug = dict()
        self._tf_dict = self._graph_setup()

        ### logging
        self._log_stats = defaultdict(list)

        assert ((self._N == 1 and self._H == 1)
                or (self._N > 1 and self._H == 1)
                or (self._N > 1 and self._H > 1))
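
The closing assertion admits exactly three horizon configurations: one-step (N = H = 1), N-step returns with a single-step training horizon (N > 1, H = 1), or multi-step returns and horizon (N > 1, H > 1). A small check mirroring that condition (the helper name and the sample tuples are illustrative):

def valid_horizons(N, H):
    """Mirror of the assertion above."""
    return (N == 1 and H == 1) or (N > 1 and H == 1) or (N > 1 and H > 1)

assert valid_horizons(1, 1)       # one-step
assert valid_horizons(5, 1)       # N-step returns, single-step horizon
assert valid_horizons(5, 3)       # multi-step returns and horizon
assert not valid_horizons(1, 3)   # ruled out: H > 1 requires N > 1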
Example #5
    def __init__(self, env_spec, endpoints, outside_value):
        assert isinstance(env_spec.action_space, Box)
        self._env_spec = env_spec
        self.schedule = schedules.PiecewiseSchedule(
            endpoints=endpoints, outside_value=outside_value)
Example #6
    def __init__(self, env_spec, endpoints, outside_value):
        self._env_spec = env_spec
        self.schedule = schedules.PiecewiseSchedule(
            endpoints=endpoints, outside_value=outside_value)
Example #7
    def __init__(self, env_spec, endpoints, outside_value):
        super(GaussianStrategy, self).__init__(env_spec)

        assert isinstance(env_spec.action_space, Box)
        self.schedule = schedules.PiecewiseSchedule(
            endpoints=endpoints, outside_value=outside_value)
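
As with the epsilon-greedy case, the listing does not show how GaussianStrategy uses its schedule. A plausible sketch, assuming the scheduled value is the standard deviation of additive Gaussian noise on a continuous (Box) action, clipped back to the action bounds; the function and argument names are illustrative:

import numpy as np

def add_scheduled_noise(action, t, schedule, low, high):
    """Sketch: perturb a continuous action with Gaussian noise whose standard
    deviation is schedule.value(t), then clip to the Box bounds [low, high]."""
    sigma = schedule.value(t)
    noisy = np.asarray(action) + np.random.normal(scale=sigma, size=np.shape(action))
    return np.clip(noisy, low, high)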