# Example no. 1
    def __init__(self,
                 input_shape: Union[Sequence[int], int],
                 output_shape: Union[Sequence[int], int],
                 network_fn: Optional[Callable[[], NetworkType]] = None,
                 network_class: Optional[Type[NetworkTypeClass]] = None,
                 state_transform: Optional[Callable] = None,
                 reward_transform: Optional[Callable] = None,
                 **kwargs):
        """Initiates the DQN agent.

        Builds the online and target networks (in priority order: `network_fn`
        factory, then `network_class`, then a default `DuelingNet`), the
        prioritized replay buffer and the n-step buffer, and registers all
        hyperparameters found in `kwargs`.

        Parameters:
            hidden_layers: (default: (64, 64)) Tuple defining hidden dimensions in fully connected nets.
            lr: (default: 3e-4) Learning rate.
            gamma: (default: 0.99) Discount factor.
            tau: (default: 0.002) Soft-copy factor.
            update_freq: (default: 1) Number of steps between learning updates.
            batch_size: (default: 64) Number of samples per learning step.
            buffer_size: (default: 1e5) Replay buffer capacity.
            warm_up: (default: 0) Number of samples to observe before learning.
            number_updates: (default: 1) Learning steps per learning phase.
            max_grad_norm: (default: 10) Gradient clipping norm.
            using_double_q: (default: True) Whether to use double Q value.
            n_steps: (default: 1) N steps reward lookahead.
            device: (default: DEVICE) Torch device the networks live on.

        """
        super().__init__(**kwargs)

        self.device = self._register_param(kwargs,
                                           "device",
                                           DEVICE,
                                           update=True)
        # TODO: All this should be condensed with some structure, e.g. gym spaces
        # Normalize int shapes into 1-tuples so downstream code can index them.
        self.input_shape: Sequence[int] = input_shape if not isinstance(
            input_shape, int) else (input_shape, )
        self.state_size: int = self.input_shape[0]
        self.output_shape: Sequence[int] = output_shape if not isinstance(
            output_shape, int) else (output_shape, )
        self.action_size: int = self.output_shape[0]
        self._config['state_size'] = self.state_size
        self._config['action_size'] = self.action_size

        self.lr = float(self._register_param(kwargs, 'lr',
                                             3e-4))  # Learning rate
        self.gamma = float(self._register_param(kwargs, 'gamma',
                                                0.99))  # Discount value
        self.tau = float(self._register_param(kwargs, 'tau',
                                              0.002))  # Soft update

        self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
        self.batch_size = int(
            self._register_param(kwargs, 'batch_size', 64, update=True))
        self.buffer_size = int(
            self._register_param(kwargs, 'buffer_size', int(1e5), update=True))
        self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
        self.number_updates = int(
            self._register_param(kwargs, 'number_updates', 1))
        self.max_grad_norm = float(
            self._register_param(kwargs, 'max_grad_norm', 10))

        self.iteration: int = 0
        self.buffer = PERBuffer(**kwargs)
        self.using_double_q = bool(
            self._register_param(kwargs, "using_double_q", True))

        self.n_steps = int(self._register_param(kwargs, 'n_steps', 1))
        self.n_buffer = NStepBuffer(n_steps=self.n_steps, gamma=self.gamma)

        hidden_layers = to_numbers_seq(
            self._register_param(kwargs, 'hidden_layers', (64, 64)))
        # Identity transforms by default; callers may inject preprocessing.
        self.state_transform = state_transform if state_transform is not None else lambda x: x
        self.reward_transform = reward_transform if reward_transform is not None else lambda x: x
        if network_fn is not None:
            self.net = network_fn()
            self.target_net = network_fn()
        elif network_class is not None:
            self.net = network_class(self.input_shape,
                                     self.action_size,
                                     hidden_layers=hidden_layers,
                                     device=self.device)
            self.target_net = network_class(self.input_shape,
                                            self.action_size,
                                            hidden_layers=hidden_layers,
                                            device=self.device)
        else:
            self.net = DuelingNet(self.input_shape,
                                  self.output_shape,
                                  hidden_layers=hidden_layers,
                                  device=self.device)
            self.target_net = DuelingNet(self.input_shape,
                                         self.output_shape,
                                         hidden_layers=hidden_layers,
                                         device=self.device)
        self.optimizer = optim.Adam(self.net.parameters(), lr=self.lr)
        self._loss: float = float('inf')
# Example no. 2
    def __init__(self,
                 input_shape: Union[Sequence[int], int],
                 output_shape: Union[Sequence[int], int],
                 network_fn: Optional[Callable[[], NetworkType]] = None,
                 network_class: Optional[Type[NetworkTypeClass]] = None,
                 state_transform: Optional[Callable] = None,
                 reward_transform: Optional[Callable] = None,
                 **kwargs):
        """Initiates the DQN agent.

        Builds the online and target networks (in priority order: `network_fn`
        factory, then `network_class`, then a default `DuelingNet`), the
        prioritized replay buffer and the n-step buffer, and registers all
        hyperparameters found in `kwargs`.

        Parameters:
            hidden_layers (tuple of ints): Tuple defining hidden dimensions in fully connected nets. Default: (64, 64).
            lr (float): Learning rate value. Default: 3e-4.
            gamma (float): Discount factor. Default: 0.99.
            tau (float): Soft-copy factor. Default: 0.002.
            update_freq (int): Number of steps between each learning step. Default: 1.
            batch_size (int): Number of samples to use at each learning step. Default: 64.
            buffer_size (int): Number of most recent samples to keep in memory for learning. Default: 1e5.
            warm_up (int): Number of samples to observe before starting any learning step. Default: 0.
            number_updates (int): How many times to use learning step in the learning phase. Default: 1.
            max_grad_norm (float): Maximum norm of the gradient used in learning. Default: 10.
            using_double_q (bool): Whether to use Double Q Learning network. Default: True.
            n_steps (int): Number of lookahead steps when estimating reward. See :ref:`NStepBuffer`. Default: 1.
            device: Torch device the networks live on. Default: DEVICE.

        """
        super().__init__(**kwargs)

        self.device = self._register_param(kwargs,
                                           "device",
                                           DEVICE,
                                           update=True)
        # TODO: All this should be condensed with some structure, e.g. gym spaces
        # Normalize int shapes into 1-tuples so downstream code can index them.
        self.input_shape: Sequence[int] = input_shape if not isinstance(
            input_shape, int) else (input_shape, )
        self.state_size: int = self.input_shape[0]
        self.output_shape: Sequence[int] = output_shape if not isinstance(
            output_shape, int) else (output_shape, )
        self.action_size: int = self.output_shape[0]
        self._config['state_size'] = self.state_size
        self._config['action_size'] = self.action_size

        self.lr = float(self._register_param(kwargs, 'lr',
                                             3e-4))  # Learning rate
        self.gamma = float(self._register_param(kwargs, 'gamma',
                                                0.99))  # Discount value
        self.tau = float(self._register_param(kwargs, 'tau',
                                              0.002))  # Soft update

        self.update_freq = int(self._register_param(kwargs, 'update_freq', 1))
        self.batch_size = int(
            self._register_param(kwargs, 'batch_size', 64, update=True))
        self.buffer_size = int(
            self._register_param(kwargs, 'buffer_size', int(1e5), update=True))
        self.warm_up = int(self._register_param(kwargs, 'warm_up', 0))
        self.number_updates = int(
            self._register_param(kwargs, 'number_updates', 1))
        self.max_grad_norm = float(
            self._register_param(kwargs, 'max_grad_norm', 10))

        self.iteration: int = 0
        self.buffer = PERBuffer(**kwargs)
        self.using_double_q = bool(
            self._register_param(kwargs, "using_double_q", True))

        self.n_steps = int(self._register_param(kwargs, 'n_steps', 1))
        self.n_buffer = NStepBuffer(n_steps=self.n_steps, gamma=self.gamma)

        hidden_layers = to_numbers_seq(
            self._register_param(kwargs, 'hidden_layers', (64, 64)))
        # Identity transforms by default; callers may inject preprocessing.
        self.state_transform = state_transform if state_transform is not None else lambda x: x
        self.reward_transform = reward_transform if reward_transform is not None else lambda x: x
        if network_fn is not None:
            self.net = network_fn()
            self.target_net = network_fn()
        elif network_class is not None:
            self.net = network_class(self.input_shape,
                                     self.action_size,
                                     hidden_layers=hidden_layers,
                                     device=self.device)
            self.target_net = network_class(self.input_shape,
                                            self.action_size,
                                            hidden_layers=hidden_layers,
                                            device=self.device)
        else:
            self.net = DuelingNet(self.input_shape,
                                  self.output_shape,
                                  hidden_layers=hidden_layers,
                                  device=self.device)
            self.target_net = DuelingNet(self.input_shape,
                                         self.output_shape,
                                         hidden_layers=hidden_layers,
                                         device=self.device)
        self.optimizer = optim.Adam(self.net.parameters(), lr=self.lr)
        self._loss: float = float('inf')