Пример #1
0
 def __init__(self, config, return_function):
     """ Set up the buffer's settings, return function and circular storage.

     Args:
         config: a Config instance (enforced with an assertion); the
             parameters in the table below are looked up on it through
             check_attribute_else_default.
         return_function: an nStep_Retrace_ReturnFunction instance (enforced
             with an assertion); its `n` attribute is copied to `self.n`.

     Parameters:
     Name:               Type:           Default:            Description: (Omitted when self-explanatory)
     buff_sz             int             10                  buffer size
     batch_sz            int             1
     env_state_dims      list            [2,2]               dimensions of the observations to be stored in the buffer
     num_actions         int             2                   number of actions available to the agent
     obs_dtype           np.type         np.uint8            the data type of the observations
     initial_rand_steps  int             0                   number of random steps before decaying sigma
     rand_steps_count    int             0                   number of random steps taken so far
     store_return        bool            True                save the computed return so that it can be reused
     """
     assert isinstance(config, Config)
     self.config = config
     self.buff_sz = check_attribute_else_default(self.config, 'buff_sz', 10)
     self.batch_sz = check_attribute_else_default(self.config, 'batch_sz', 1)
     # Copied into a fresh list so the stored dims are independent of the
     # object returned by the config lookup.
     self.env_state_dims = list(
         check_attribute_else_default(self.config, 'env_state_dims', [2, 2]))
     self.num_actions = check_attribute_else_default(
         self.config, 'num_actions', 2)
     self.obs_dtype = check_attribute_else_default(
         self.config, 'obs_dtype', np.uint8)
     self.initial_rand_steps = check_attribute_else_default(
         self.config, 'initial_rand_steps', 0)
     # Return value deliberately unused. NOTE(review): presumably the helper
     # also stores the default on config when the attribute is missing --
     # confirm against check_attribute_else_default.
     check_attribute_else_default(self.config, 'rand_steps_count', 0)
     self.store_return = check_attribute_else_default(
         self.config, 'store_return', True)

     # Parameters for Return Function
     assert isinstance(return_function, nStep_Retrace_ReturnFunction)
     self.return_function = return_function
     self.n = return_function.n

     # Termination or Timeout Count for Applying the Decay on Sigma
     self.episodes_since_last_decay = 0

     # Parameters to keep track of the current state of the buffer
     self.current_index = 0
     self.full_buffer = False

     # Circular Buffers
     # Boolean buffers use np.bool_: the bare np.bool alias was deprecated in
     # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
     self.state = CircularBuffer(self.buff_sz,
                                 shape=tuple(self.env_state_dims),
                                 dtype=self.obs_dtype)
     self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
     self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
     self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
     self.timeout = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
     self.bprobabilities = CircularBuffer(self.buff_sz,
                                          shape=(self.num_actions, ),
                                          dtype=np.float64)
     self.estimated_return = CircularBuffer(self.buff_sz,
                                            shape=(),
                                            dtype=np.float64)
     self.up_to_date = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
Пример #2
0
 def __init__(self, config, return_function):
     """ Set up the buffer's settings, return function and circular storage.

     Args:
         config: object on which the parameters in the table below are looked
             up through check_attribute_else_default.
         return_function: an OnPolicyQSigmaReturnFunction instance (enforced
             with an assertion); its `n` attribute is copied to `self.n`.

     Parameters:
     Name:               Type:           Default:            Description: (Omitted when self-explanatory)
     buff_sz             int             10                  buffer size
     batch_sz            int             1
     frame_stack         int             4                   number of frames to stack, see Mnih et. al. (2015)
     env_state_dims      list            [2,2]               dimensions of the observations to be stored in the buffer
     num_actions         int             2                   number of actions available to the agent
     obs_dtype           np.type         np.uint8            the data type of the observations
     reward_clipping     bool            False               clipping the reward , see Mnih et. al. (2015)
     sigma               float           0.5                 Sigma parameter, see De Asis et. al (2018)
     sigma_decay         float           1.0                 decay rate of sigma
     """
     self.config = config
     self.buff_sz = check_attribute_else_default(config, 'buff_sz', 10)
     self.batch_sz = check_attribute_else_default(config, 'batch_sz', 1)
     self.frame_stack = check_attribute_else_default(
         config, 'frame_stack', 4)
     # Copied into a fresh list so the stored dims are independent of the
     # object returned by the config lookup.
     self.env_state_dims = list(
         check_attribute_else_default(config, 'env_state_dims', [2, 2]))
     self.num_actions = check_attribute_else_default(
         config, 'num_actions', 2)
     self.obs_dtype = check_attribute_else_default(
         config, 'obs_dtype', np.uint8)
     self.reward_clipping = check_attribute_else_default(
         config, 'reward_clipping', False)
     # NOTE(review): this scalar is replaced by the sigma CircularBuffer at
     # the end of this method, so after construction self.sigma is the buffer,
     # not the float. The lookup is kept because the helper may also populate
     # config.sigma -- confirm which one the rest of the class relies on.
     self.sigma = check_attribute_else_default(config, 'sigma', 0.5)
     self.sigma_decay = check_attribute_else_default(
         config, 'sigma_decay', 1.0)

     # Parameters for Return Function
     assert isinstance(return_function, OnPolicyQSigmaReturnFunction)
     self.return_function = return_function
     self.n = return_function.n

     # Parameters to keep track of the current state of the buffer
     self.current_index = 0
     self.full_buffer = False

     # Circular Buffers
     self.state = CircularBuffer(self.buff_sz,
                                 shape=tuple(self.env_state_dims),
                                 dtype=self.obs_dtype)
     self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
     self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
     # np.bool_ instead of the np.bool alias, which was deprecated in
     # NumPy 1.20 and removed in 1.24 (AttributeError there).
     self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
     self.sigma = CircularBuffer(self.buff_sz, shape=(), dtype=np.float64)
Пример #3
0
 def __init__(self, config, return_function):
     """ Set up the buffer's settings, return function and circular storage.

     Args:
         config: a Config instance (enforced with an assertion); the
             parameters in the table below are looked up on it through
             check_attribute_else_default.
         return_function: a QSigmaReturnFunction instance (enforced with an
             assertion); its `n` attribute is copied to `self.n`.

     Parameters:
     Name:               Type:           Default:            Description: (Omitted when self-explanatory)
     buff_sz             int             10                  buffer size
     batch_sz            int             1
     frame_stack         int             4                   number of frames to stack, see Mnih et. al. (2015)
     env_state_dims      list            [2,2]               dimensions of the observations to be stored in the buffer
     num_actions         int             2                   number of actions available to the agent
     obs_dtype           np.type         np.uint8            the data type of the observations
     reward_clipping     bool            False               clipping the reward , see Mnih et. al. (2015)
     sigma               float           0.5                 Sigma parameter, see De Asis et. al (2018)
     sigma_decay         float           1.0                 decay rate of sigma
     store_bprobs        bool            False               whether to store and use the behaviour policy probabilities
                                                             for the return function
     store_sigma         bool            False               whether to store sigma at every time step and use
                                                             the stored sigmas to compute the return. True = use the
                                                             sigma from the buffer, False = use the current sigma
     initial_rand_steps  int             0                   number of random steps before decaying sigma
     rand_steps_count    int             0                   number of random steps taken so far
     store_return        bool            True                save the computed return so that it can be reused

     Note: `self.bprobabilities` and `self.sigma_buffer` are only created
     when store_bprobs / store_sigma are true, respectively.
     """
     assert isinstance(config, Config)
     self.config = config
     self.buff_sz = check_attribute_else_default(self.config, 'buff_sz', 10)
     self.batch_sz = check_attribute_else_default(self.config, 'batch_sz', 1)
     self.frame_stack = check_attribute_else_default(
         self.config, 'frame_stack', 4)
     # Copied into a fresh list so the stored dims are independent of the
     # object returned by the config lookup.
     self.env_state_dims = list(
         check_attribute_else_default(self.config, 'env_state_dims', [2, 2]))
     self.num_actions = check_attribute_else_default(
         self.config, 'num_actions', 2)
     self.obs_dtype = check_attribute_else_default(
         self.config, 'obs_dtype', np.uint8)
     self.reward_clipping = check_attribute_else_default(
         self.config, 'reward_clipping', False)
     self.sigma = check_attribute_else_default(self.config, 'sigma', 0.5)
     self.sigma_decay = check_attribute_else_default(
         self.config, 'sigma_decay', 1.0)
     self.store_bprobs = check_attribute_else_default(
         self.config, 'store_bprobs', False)
     self.store_sigma = check_attribute_else_default(
         self.config, 'store_sigma', False)
     self.initial_rand_steps = check_attribute_else_default(
         self.config, 'initial_rand_steps', 0)
     # Return value deliberately unused. NOTE(review): presumably the helper
     # also stores the default on config when the attribute is missing --
     # confirm against check_attribute_else_default.
     check_attribute_else_default(self.config, 'rand_steps_count', 0)
     self.store_return = check_attribute_else_default(
         self.config, 'store_return', True)

     # Parameters for Return Function
     assert isinstance(return_function, QSigmaReturnFunction)
     self.return_function = return_function
     self.n = return_function.n

     # Parameters to keep track of the current state of the buffer
     self.current_index = 0
     self.full_buffer = False

     # Circular Buffers
     # Boolean buffers use np.bool_: the bare np.bool alias was deprecated in
     # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
     self.state = CircularBuffer(self.buff_sz,
                                 shape=tuple(self.env_state_dims),
                                 dtype=self.obs_dtype)
     self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
     self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
     self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
     if self.store_bprobs:
         self.bprobabilities = CircularBuffer(self.buff_sz,
                                              shape=(self.num_actions, ),
                                              dtype=np.float64)
     if self.store_sigma:
         self.sigma_buffer = CircularBuffer(self.buff_sz,
                                            shape=(),
                                            dtype=np.float64)
     self.estimated_return = CircularBuffer(self.buff_sz,
                                            shape=(),
                                            dtype=np.float64)
     self.up_to_date = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
Пример #4
0
 def __init__(self, config, return_function):
     """ Set up the buffer's settings, return function and circular storage.

     Args:
         config: a Config instance (enforced with an assertion); the
             parameters in the table below are looked up on it through
             check_attribute_else_default.
         return_function: a QSigmaReturnFunction instance (enforced with an
             assertion); its `n` attribute is copied to `self.n`.

     Parameters:
     Name:               Type:           Default:            Description: (Omitted when self-explanatory)
     buff_sz             int             10                  buffer size
     batch_sz            int             1
     env_state_dims      list            [2,2]               dimensions of the observations to be stored in the buffer
     num_actions         int             2                   number of actions available to the agent
     obs_dtype           np.type         np.uint8            the data type of the observations
     sigma               float           0.5                 Sigma parameter, see De Asis et. al (2018)
     sigma_decay         float           1.0                 decay rate of sigma
     decay_type          string          exp                 decay type of sigma. Options: exp and lin
     decay_freq          int             1                   how often to decay sigma, e.g. a decay frequency of
                                                             10 would apply the decay once every 10 episodes
     sigma_min           float           0                   the lowest value sigma can attain when decaying
     store_bprobs        bool            False               whether to store and use the behaviour policy probabilities
                                                             for the return function
     store_sigma         bool            False               whether to store sigma at every time step and use
                                                             the stored sigmas to compute the return. True = use the
                                                             sigma from the buffer, False = use the current sigma
     initial_rand_steps  int             0                   number of random steps before decaying sigma
     rand_steps_count    int             0                   number of random steps taken so far
     store_return        bool            True                save the computed return so that it can be reused

     Note: `self.bprobabilities` and `self.sigma_buffer` are only created
     when store_bprobs / store_sigma are true, respectively.
     """
     assert isinstance(config, Config)
     self.config = config
     self.buff_sz = check_attribute_else_default(self.config, 'buff_sz', 10)
     self.batch_sz = check_attribute_else_default(self.config, 'batch_sz', 1)
     # Copied into a fresh list so the stored dims are independent of the
     # object returned by the config lookup.
     self.env_state_dims = list(
         check_attribute_else_default(self.config, 'env_state_dims', [2, 2]))
     self.num_actions = check_attribute_else_default(
         self.config, 'num_actions', 2)
     self.obs_dtype = check_attribute_else_default(
         self.config, 'obs_dtype', np.uint8)
     self.sigma = check_attribute_else_default(self.config, 'sigma', 0.5)
     self.sigma_decay = check_attribute_else_default(
         self.config, 'sigma_decay', 1.0)
     self.decay_type = check_attribute_else_default(
         self.config, 'decay_type', 'exp')
     self.decay_freq = check_attribute_else_default(
         self.config, 'decay_freq', 1)
     self.sigma_min = check_attribute_else_default(
         self.config, 'sigma_min', 0.0)
     self.store_bprobs = check_attribute_else_default(
         self.config, 'store_bprobs', False)
     self.store_sigma = check_attribute_else_default(
         self.config, 'store_sigma', False)
     self.initial_rand_steps = check_attribute_else_default(
         self.config, 'initial_rand_steps', 0)
     # Return value deliberately unused. NOTE(review): presumably the helper
     # also stores the default on config when the attribute is missing --
     # confirm against check_attribute_else_default.
     check_attribute_else_default(self.config, 'rand_steps_count', 0)
     self.store_return = check_attribute_else_default(
         self.config, 'store_return', True)

     # Parameters for Return Function
     assert isinstance(return_function, QSigmaReturnFunction)
     self.return_function = return_function
     self.n = return_function.n

     # Termination or Timeout Count for Applying the Decay on Sigma
     self.episodes_since_last_decay = 0

     # Parameters to keep track of the current state of the buffer
     self.current_index = 0
     self.full_buffer = False

     # Circular Buffers
     # Boolean buffers use np.bool_: the bare np.bool alias was deprecated in
     # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
     self.state = CircularBuffer(self.buff_sz,
                                 shape=tuple(self.env_state_dims),
                                 dtype=self.obs_dtype)
     self.action = CircularBuffer(self.buff_sz, shape=(), dtype=np.uint8)
     self.reward = CircularBuffer(self.buff_sz, shape=(), dtype=np.int32)
     self.terminate = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
     self.timeout = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
     if self.store_bprobs:
         self.bprobabilities = CircularBuffer(self.buff_sz,
                                              shape=(self.num_actions, ),
                                              dtype=np.float64)
     if self.store_sigma:
         self.sigma_buffer = CircularBuffer(self.buff_sz,
                                            shape=(),
                                            dtype=np.float64)
     self.estimated_return = CircularBuffer(self.buff_sz,
                                            shape=(),
                                            dtype=np.float64)
     self.up_to_date = CircularBuffer(self.buff_sz, shape=(), dtype=np.bool_)
Пример #5
0
    def __init__(self,
                 config,
                 games_directory=None,
                 rom_filename=None,
                 summary=None):
        """ Configure the ALE emulator, load the ROM and reset the environment.

        Args:
            config: a Config instance (enforced with an assertion); the
                parameters in the table below are looked up on it through
                check_attribute_else_default.
            games_directory: str prefix of the ROM path. It is concatenated
                directly with rom_filename (no separator is inserted), so it
                must already end with one. Required despite the None default.
            rom_filename: str file name of the ROM. Required despite the None
                default.
            summary: dict used when save_summary is true (enforced with an
                assertion); a "frames_per_episode" entry is ensured through
                check_dict_else_default. Ignored otherwise.

        Raises:
            TypeError: if games_directory or rom_filename is None.

        Parameters:
        Name:                       Type            Default:        Description(omitted when self-explanatory):
        display_screen              bool            False           Display game screen
        agent_render                bool            False           Display current frame the way the agent sees it
        frame_skip                  int             4               See ALE Documentation
        repeat_action_probability   float           0.25            in [0,1], see ALE Documentation
        max_num_frames              int             18000           Max number of frames per episode
        color_averaging             bool            True            If true, it averages over the skipped frames.
                                                                    Otherwise, it takes the maximum over the skipped
                                                                    frames.
        frame_stack                 int             4               Stack of frames for agent, see Mnih et. al. (2015)
        save_summary                bool            False           Save the summary of the environment
        """
        super().__init__()

        assert isinstance(config, Config)
        self.display_screen = check_attribute_else_default(
            config, 'display_screen', False)
        self.agent_render = check_attribute_else_default(
            config, 'agent_render', False)
        self.frame_skip = check_attribute_else_default(config, 'frame_skip', 4)
        self.repeat_action_probability = check_attribute_else_default(
            config, 'repeat_action_probability', 0.25)
        max_num_frames = check_attribute_else_default(
            config, 'max_num_frames', 18000)
        # NOTE(review): the fallback here is True, whereas the parameter table
        # previously advertised False; the docstring now reflects the code.
        self.color_averaging = check_attribute_else_default(
            config, 'color_averaging', True)
        # Function used to collapse the skipped frames into a single frame.
        if self.color_averaging:
            self.aggregate_func = np.average
        else:
            self.aggregate_func = np.amax
        self.frame_stack = check_attribute_else_default(
            config, 'frame_stack', 4)
        self.save_summary = check_attribute_else_default(
            config, 'save_summary', False)
        if self.save_summary:
            assert isinstance(summary, dict)
            self.summary = summary
            check_dict_else_default(self.summary, "frames_per_episode", [])

        # Fail early and clearly instead of hitting an opaque
        # "NoneType + NoneType" TypeError after the emulator has been created.
        if games_directory is None or rom_filename is None:
            raise TypeError(
                "games_directory and rom_filename must both be provided")

        # Environment variables
        # The emulator itself is set to frame_skip=1, no sticky actions and no
        # colour averaging, regardless of the values read from config above.
        # NOTE(review): presumably frame skipping, action repetition and frame
        # aggregation are applied by this class's own step logic -- confirm.
        self.env = ALEInterface()
        self.env.setInt(b'frame_skip', 1)
        self.env.setInt(b'random_seed', 0)
        self.env.setFloat(b'repeat_action_probability', 0)
        self.env.setInt(b"max_num_frames_per_episode", max_num_frames)
        self.env.setBool(b"color_averaging", False)
        self.env.setBool(b'display_screen', self.display_screen)
        self.rom_file = str.encode(games_directory + rom_filename)
        self.frame_count = 0

        # Loading ROM
        self.env.loadROM(self.rom_file)

        # Fixed Parameters:
        # Frame Format: "NCHW" (batch_size, channels, height, width). Decided
        # to adopt this format because it's the fastest to process in
        # tensorflow with a gpu.
        # Frame Height and Width: 84, the default value in the literature.

        # Inner state of the environment
        self.height = 84
        self.width = 84
        self.current_state = np.zeros(
            [self.frame_stack, self.height, self.width], dtype=np.uint8)
        self.original_height = 210
        self.original_width = 160
        # One full-resolution slot per skipped frame.
        self.history = np.zeros(
            [self.frame_skip, self.original_height, self.original_width],
            np.uint8)
        # reset() runs before the attributes below are assigned; the order is
        # kept exactly as in the original.
        self.reset()

        self.observations_dimensions = self.current_state.shape
        self.frame_dims = self.current_state[0].shape
        self.actions = self.env.getLegalActionSet()
        self.previous_action = 0