Example #1
    def create_model(self):
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]
        if self.noise is not None:
            self.noise = self.noise(np.zeros_like(action_dim),
                                    self.noise_std * np.ones_like(action_dim))

        self.ac = get_model("ac", self.network_type)(state_dim, action_dim,
                                                     self.layers, "Qsa",
                                                     False).to(self.device)

        # load parameters if already trained
        if self.pretrained is not None:
            self.load(self)
            self.ac.load_state_dict(self.checkpoint["weights"])
            for key, item in self.checkpoint.items():
                if key not in ["weights", "save_model"]:
                    setattr(self, key, item)
            print("Loaded pretrained model")

        self.ac_target = deepcopy(self.ac).to(self.device)

        # freeze target network params
        for param in self.ac_target.parameters():
            param.requires_grad = False

        self.replay_buffer = ReplayBuffer(self.replay_size)
        self.optimizer_policy = opt.Adam(self.ac.actor.parameters(),
                                         lr=self.lr_p)
        self.optimizer_q = opt.Adam(self.ac.critic.parameters(), lr=self.lr_q)
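A note on the frozen ac_target copy above: during training it is normally refreshed by Polyak averaging rather than by gradient steps. A minimal sketch of that soft update (the same loop appears in the TD3 example further down; polyak is assumed to be close to 1.0):

import torch


def soft_update(source, target, polyak=0.995):
    """Blend target network parameters towards the source network in place."""
    with torch.no_grad():
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.mul_(polyak)
            target_param.data.add_((1 - polyak) * param.data)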
Example #2
    def create_model(self) -> None:
        """
        Initialize the model
        Initializes optimizer and replay buffers as well.
        """
        state_dim, action_dim, discrete, _ = get_env_properties(self.env)
        if discrete:
            raise Exception(
                "Discrete Environments not supported for {}.".format(
                    __class__.__name__))
        if self.noise is not None:
            self.noise = self.noise(np.zeros_like(action_dim),
                                    self.noise_std * np.ones_like(action_dim))

        self.ac = get_model("ac", self.network_type)(state_dim, action_dim,
                                                     self.layers, "Qsa",
                                                     False).to(self.device)

        self.ac_target = deepcopy(self.ac).to(self.device)

        # freeze target network params
        for param in self.ac_target.parameters():
            param.requires_grad = False

        self.replay_buffer = ReplayBuffer(self.replay_size, self.env)
        self.optimizer_policy = opt.Adam(self.ac.actor.parameters(),
                                         lr=self.lr_p)
        self.optimizer_q = opt.Adam(self.ac.critic.parameters(), lr=self.lr_q)
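self.noise above is assumed to be a callable noise class constructed from a mean array and a standard-deviation array, with a reset() method (both are used in the TD3 example below). A minimal Gaussian sketch of that assumed interface; the class name is hypothetical and not part of the library:

import numpy as np


class GaussianActionNoise:
    """Hypothetical stand-in for the mean/std noise interface used above."""

    def __init__(self, mean, std):
        self.mean = np.asarray(mean, dtype=np.float32)
        self.std = np.asarray(std, dtype=np.float32)

    def __call__(self):
        # sample noise to be added to the deterministic policy action
        return np.random.normal(self.mean, self.std)

    def reset(self):
        # plain Gaussian noise is stateless; kept only for interface parity
        pass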
Example #3
    def create_model(self) -> None:
        """
        Initialize the model
        Initializes optimizer and replay buffers as well.
        """
        state_dim, action_dim, discrete, _ = get_env_properties(self.env)

        self.q1 = (get_model("v", self.network_type)(
            state_dim, action_dim, "Qsa", self.layers).to(self.device).float())

        self.q2 = (get_model("v", self.network_type)(
            state_dim, action_dim, "Qsa", self.layers).to(self.device).float())

        self.policy = (get_model(
            "p", self.network_type)(state_dim,
                                    action_dim,
                                    self.layers,
                                    discrete,
                                    False,
                                    sac=True).to(self.device).float())

        self.q1_targ = deepcopy(self.q1).to(self.device).float()
        self.q2_targ = deepcopy(self.q2).to(self.device).float()

        # freeze target parameters
        for param in self.q1_targ.parameters():
            param.requires_grad = False
        for param in self.q2_targ.parameters():
            param.requires_grad = False

        # optimizers
        self.q1_optimizer = opt.Adam(self.q1.parameters(), self.lr)
        self.q2_optimizer = opt.Adam(self.q2.parameters(), self.lr)
        self.policy_optimizer = opt.Adam(self.policy.parameters(), self.lr)

        if self.entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = opt.Adam([self.log_alpha], lr=self.lr)

        self.replay_buffer = ReplayBuffer(self.replay_size, self.env)

        # set action scales
        if self.env.action_space is None:
            self.action_scale = torch.tensor(1.0).to(self.device)
            self.action_bias = torch.tensor(0.0).to(self.device)
        else:
            self.action_scale = torch.FloatTensor(
                (self.env.action_space.high - self.env.action_space.low) /
                2.0).to(self.device)
            self.action_bias = torch.FloatTensor(
                (self.env.action_space.high + self.env.action_space.low) /
                2.0).to(self.device)
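The action_scale and action_bias computed above map a tanh-squashed policy output back into the environment's action range, as the SAC sample_action method later on this page does. A condensed sketch of that mapping:

import torch


def scale_action(raw_action: torch.Tensor,
                 action_scale: torch.Tensor,
                 action_bias: torch.Tensor) -> torch.Tensor:
    """Map an unbounded policy output into the environment's action bounds."""
    return torch.tanh(raw_action) * action_scale + action_bias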
Example #4
    def create_model(self):
        state_dim, action_dim, disc = self.get_env_properties()
        if self.noise is not None:
            self.noise = self.noise(np.zeros_like(action_dim),
                                    self.noise_std * np.ones_like(action_dim))

        self.ac = get_model("ac", self.network_type)(state_dim, action_dim,
                                                     self.layers, "Qsa",
                                                     False).to(self.device)

        self.ac.qf1 = self.ac.critic
        self.ac.qf2 = get_model("v", self.network_type)(state_dim,
                                                        action_dim,
                                                        hidden=self.layers,
                                                        val_type="Qsa")

        self.ac.qf1.to(self.device)
        self.ac.qf2.to(self.device)

        if self.pretrained is not None:
            self.load(self)
            self.ac.actor.load_state_dict(self.checkpoint["policy_weights"])
            self.ac.qf1.load_state_dict(self.checkpoint["q1_weights"])
            self.ac.qf2.load_state_dict(self.checkpoint["q2_weights"])

            for key, item in self.checkpoint.items():
                if key not in ["weights", "save_model"]:
                    setattr(self, key, item)
            print("Loaded pretrained model")

        self.ac_target = deepcopy(self.ac).to(self.device)

        # freeze target network params
        for param in self.ac_target.parameters():
            param.requires_grad = False

        self.replay_buffer = ReplayBuffer(self.replay_size)
        self.q_params = (list(self.ac.qf1.parameters()) +
                         list(self.ac.qf2.parameters()))
        self.optimizer_q = torch.optim.Adam(self.q_params, lr=self.lr_q)

        self.optimizer_policy = torch.optim.Adam(self.ac.actor.parameters(),
                                                 lr=self.lr_p)
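With the twin critics qf1/qf2 sharing a single optimizer over q_params, the Bellman target is built from the minimum of the two target critics (the clipped double-Q trick used in get_q_loss in the TD3 example below). A condensed sketch of just the target computation:

import torch


def clipped_double_q_target(reward, done, next_q1, next_q2, gamma=0.99):
    """Bellman target that uses the minimum of two target-critic estimates."""
    with torch.no_grad():
        target_q = torch.min(next_q1, next_q2)
        return reward + gamma * (1 - done) * target_q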
Example #5
    def create_model(self) -> None:
        """
        Initialize the model and target model for various variants of DQN.
        Initializes optimizer and replay buffers as well.
        """
        state_dim, action_dim, _, _ = get_env_properties(self.env)
        if self.network_type == "mlp":
            if self.dueling_dqn:
                self.model = DuelingDQNValueMlp(state_dim, action_dim)
            elif self.categorical_dqn:
                self.model = CategoricalDQNValue(state_dim, action_dim,
                                                 self.num_atoms)
            elif self.noisy_dqn:
                self.model = NoisyDQNValue(state_dim, action_dim)
            else:
                self.model = get_model("v",
                                       self.network_type)(state_dim,
                                                          action_dim, "Qs")

        elif self.network_type == "cnn":
            self.framestack = self.env.framestack

            if self.dueling_dqn:
                self.model = DuelingDQNValueCNN(action_dim, self.framestack)
            elif self.noisy_dqn:
                self.model = NoisyDQNValueCNN(action_dim, self.framestack)
            elif self.categorical_dqn:
                self.model = CategoricalDQNValueCNN(action_dim, self.num_atoms,
                                                    self.framestack)
            else:
                self.model = get_model("v", self.network_type)(action_dim,
                                                               self.framestack,
                                                               "Qs")

        self.target_model = deepcopy(self.model)

        if self.prioritized_replay:
            self.replay_buffer = PrioritizedBuffer(
                self.replay_size, self.prioritized_replay_alpha)
        else:
            self.replay_buffer = ReplayBuffer(self.replay_size, self.env)

        self.optimizer = opt.Adam(self.model.parameters(), lr=self.lr)
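Exploration for these DQN variants is typically driven by an exponentially decaying epsilon; the schedule implemented in calculate_epsilon_by_frame later on this page reduces to:

import numpy as np


def epsilon_by_frame(frame_idx, max_epsilon=1.0, min_epsilon=0.01, epsilon_decay=1000):
    """Decay epsilon exponentially from max_epsilon towards min_epsilon."""
    return min_epsilon + (max_epsilon - min_epsilon) * np.exp(
        -1.0 * frame_idx / epsilon_decay)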
Example #6
    def create_model(self) -> None:
        state_dim, action_dim, discrete, _ = get_env_properties(self.env)
        if discrete:
            raise Exception(
                "Discrete Environments not supported for {}.".format(__class__.__name__)
            )
        if self.noise is not None:
            self.noise = self.noise(
                np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)
            )

        self.ac = get_model("ac", self.network_type)(
            state_dim, action_dim, self.layers, "Qsa", False
        ).to(self.device)

        self.ac.qf1 = self.ac.critic
        self.ac.qf2 = get_model("v", self.network_type)(
            state_dim, action_dim, hidden=self.layers, val_type="Qsa"
        )

        self.ac.qf1.to(self.device)
        self.ac.qf2.to(self.device)

        self.ac_target = deepcopy(self.ac).to(self.device)

        # freeze target network params
        for param in self.ac_target.parameters():
            param.requires_grad = False

        self.replay_buffer = ReplayBuffer(self.replay_size, self.env)
        self.q_params = list(self.ac.qf1.parameters()) + list(self.ac.qf2.parameters())
        self.optimizer_q = torch.optim.Adam(self.q_params, lr=self.lr_q)

        self.optimizer_policy = torch.optim.Adam(
            self.ac.actor.parameters(), lr=self.lr_p
        )
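The defining TD3 detail built on top of this setup is the delayed update: the critics are updated every step, while the actor and the target networks are updated only every policy_frequency steps (see update_params in the next example). A generic sketch of that schedule, with the three update steps passed in as callables:

def delayed_update(critic_step, actor_step, soft_update_targets,
                   update_interval, policy_frequency=2):
    """TD3-style schedule: critics every step, actor/targets every few steps."""
    for timestep in range(update_interval):
        critic_step()
        if timestep % policy_frequency == 0:
            actor_step()
            soft_update_targets()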
Example #7
class TD3:
    """
    Twin Delayed DDPG

    Paper: https://arxiv.org/abs/1802.09477

    :param network_type: (str) The deep neural network layer types ['mlp']
    :param env: (Gym environment) The environment to learn from
    :param gamma: (float) discount factor
    :param replay_size: (int) Replay memory size
    :param batch_size: (int) Update batch size
    :param lr_p: (float) Policy network learning rate
    :param lr_q: (float) Q network learning rate
    :param polyak: (float) Polyak averaging weight to update target network
    :param policy_frequency: (int) Update actor and target networks every
        policy_frequency steps
    :param epochs: (int) Number of epochs
    :param start_steps: (int) Number of exploratory steps at start
    :param steps_per_epoch: (int) Number of steps per epoch
    :param noise_std: (float) Standard deviation for action noise
    :param max_ep_len: (int) Maximum steps per episode
    :param start_update: (int) Number of steps before first parameter update
    :param update_interval: (int) Number of steps between parameter updates
    :param layers: (tuple or list) Number of neurons in hidden layers
    :param seed: (int) seed for torch and gym
    :param render: (bool) if environment is to be rendered
    :param device: (str) device to use for tensor operations; 'cpu' for cpu
        and 'cuda' for gpu
    :type network_type: str
    :type env: Gym environment
    :type gamma: float
    :type replay_size: int
    :type batch_size: int
    :type lr_p: float
    :type lr_q: float
    :type polyak: float
    :type policy_frequency: int
    :type epochs: int
    :type start_steps: int
    :type steps_per_epoch: int
    :type noise_std: float
    :type max_ep_len: int
    :type start_update: int
    :type update_interval: int
    :type layers: tuple or list
    :type seed: int
    :type render: boolean
    :type device: str
    """

    def __init__(
        self,
        network_type: str,
        env: Union[gym.Env, VecEnv],
        gamma: float = 0.99,
        replay_size: int = 1000,
        batch_size: int = 100,
        lr_p: float = 0.001,
        lr_q: float = 0.001,
        polyak: float = 0.995,
        policy_frequency: int = 2,
        epochs: int = 100,
        start_steps: int = 10000,
        steps_per_epoch: int = 4000,
        noise: Optional[Any] = None,
        noise_std: float = 0.1,
        max_ep_len: int = 1000,
        start_update: int = 1000,
        update_interval: int = 50,
        layers: Tuple = (256, 256),
        seed: Optional[int] = None,
        render: bool = False,
        device: Union[torch.device, str] = "cpu",
    ):

        self.network_type = network_type
        self.env = env
        self.gamma = gamma
        self.replay_size = replay_size
        self.batch_size = batch_size
        self.lr_p = lr_p
        self.lr_q = lr_q
        self.polyak = polyak
        self.policy_frequency = policy_frequency
        self.epochs = epochs
        self.start_steps = start_steps
        self.steps_per_epoch = steps_per_epoch
        self.noise = noise
        self.noise_std = noise_std
        self.max_ep_len = max_ep_len
        self.start_update = start_update
        self.update_interval = update_interval
        self.layers = layers
        self.seed = seed
        self.render = render

        # Assign device
        if "cuda" in device and torch.cuda.is_available():
            self.device = torch.device(device)
        else:
            self.device = torch.device("cpu")

        # Assign seed
        if seed is not None:
            set_seeds(seed, self.env)

        self.empty_logs()
        self.create_model()

    def create_model(self) -> None:
        state_dim, action_dim, discrete, _ = get_env_properties(self.env)
        if discrete:
            raise Exception(
                "Discrete Environments not supported for {}.".format(__class__.__name__)
            )
        if self.noise is not None:
            self.noise = self.noise(
                np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)
            )

        self.ac = get_model("ac", self.network_type)(
            state_dim, action_dim, self.layers, "Qsa", False
        ).to(self.device)

        self.ac.qf1 = self.ac.critic
        self.ac.qf2 = get_model("v", self.network_type)(
            state_dim, action_dim, hidden=self.layers, val_type="Qsa"
        )

        self.ac.qf1.to(self.device)
        self.ac.qf2.to(self.device)

        self.ac_target = deepcopy(self.ac).to(self.device)

        # freeze target network params
        for param in self.ac_target.parameters():
            param.requires_grad = False

        self.replay_buffer = ReplayBuffer(self.replay_size, self.env)
        self.q_params = list(self.ac.qf1.parameters()) + list(self.ac.qf2.parameters())
        self.optimizer_q = torch.optim.Adam(self.q_params, lr=self.lr_q)

        self.optimizer_policy = torch.optim.Adam(
            self.ac.actor.parameters(), lr=self.lr_p
        )

    def update_params_before_select_action(self, timestep: int) -> None:
        """
        Update any parameters before selecting an action, e.g. epsilon for decaying epsilon-greedy

        :param timestep: Timestep in the training process
        :type timestep: int
        """
        pass

    def select_action(
        self, state: np.ndarray, deterministic: bool = False
    ) -> np.ndarray:
        with torch.no_grad():
            action = self.ac_target.get_action(
                torch.as_tensor(state, dtype=torch.float32, device=self.device),
                deterministic=deterministic,
            )[0].numpy()

        # add noise to output from policy network
        if self.noise is not None:
            action += self.noise()

        return np.clip(
            action, -self.env.action_space.high[0], self.env.action_space.high[0]
        )

    def get_q_loss(
        self,
        state: np.ndarray,
        action: np.ndarray,
        reward: np.ndarray,
        next_state: np.ndarray,
        done: np.ndarray,
    ) -> torch.Tensor:
        q1 = self.ac.qf1.get_value(torch.cat([state, action], dim=-1))
        q2 = self.ac.qf2.get_value(torch.cat([state, action], dim=-1))

        with torch.no_grad():
            target_q1 = self.ac_target.qf1.get_value(
                torch.cat(
                    [
                        next_state,
                        self.ac_target.get_action(next_state, deterministic=True)[0],
                    ],
                    dim=-1,
                )
            )
            target_q2 = self.ac_target.qf2.get_value(
                torch.cat(
                    [
                        next_state,
                        self.ac_target.get_action(next_state, deterministic=True)[0],
                    ],
                    dim=-1,
                )
            )
            target_q = torch.min(target_q1, target_q2).unsqueeze(1)

            target = reward.squeeze(1) + self.gamma * (1 - done) * target_q.squeeze(1)

        l1 = nn.MSELoss()(q1, target)
        l2 = nn.MSELoss()(q2, target)

        return l1 + l2

    def get_p_loss(self, state: np.ndarray) -> torch.Tensor:
        q_pi = self.ac.get_value(
            torch.cat([state, self.ac.get_action(state, deterministic=True)[0]], dim=-1)
        )
        return -torch.mean(q_pi)

    def update_params(self, update_interval: int) -> None:
        for timestep in range(update_interval):
            batch = self.replay_buffer.sample(self.batch_size)
            state, action, reward, next_state, done = (x.to(self.device) for x in batch)
            self.optimizer_q.zero_grad()
            # print(state.shape, action.shape, reward.shape, next_state.shape, done.shape)
            loss_q = self.get_q_loss(state, action, reward, next_state, done)
            loss_q.backward()
            self.optimizer_q.step()

            # Delayed Update
            if timestep % self.policy_frequency == 0:
                # freeze critic params for policy update
                for param in self.q_params:
                    param.requires_grad = False

                self.optimizer_policy.zero_grad()
                loss_p = self.get_p_loss(state)
                loss_p.backward()
                self.optimizer_policy.step()

                # unfreeze critic params
                for param in self.ac.critic.parameters():
                    param.requires_grad = True

                # update target network
                with torch.no_grad():
                    for param, param_target in zip(
                        self.ac.parameters(), self.ac_target.parameters()
                    ):
                        param_target.data.mul_(self.polyak)
                        param_target.data.add_((1 - self.polyak) * param.data)

                self.logs["policy_loss"].append(loss_p.item())
                self.logs["value_loss"].append(loss_q.item())

    def learn(self) -> None:  # pragma: no cover
        state, episode_reward, episode_len, episode = (
            self.env.reset(),
            np.zeros(self.env.n_envs),
            np.zeros(self.env.n_envs),
            np.zeros(self.env.n_envs),
        )
        total_steps = self.steps_per_epoch * self.epochs * self.env.n_envs

        if self.noise is not None:
            self.noise.reset()

        for timestep in range(0, total_steps, self.env.n_envs):
            # execute single transition
            if timestep > self.start_steps:
                action = self.select_action(state)
            else:
                action = self.env.sample()

            next_state, reward, done, _ = self.env.step(action)
            if self.render:
                self.env.render()
            episode_reward += reward
            episode_len += 1

            # don't set done to True if only max_ep_len was reached
            done = np.array(
                [
                    False if ep_len == self.max_ep_len else done[i]
                    for i, ep_len in enumerate(episode_len)
                ]
            )

            self.replay_buffer.extend(zip(state, action, reward, next_state, done))

            state = next_state

            if np.any(done) or np.any(episode_len == self.max_ep_len):

                if sum(episode) % 20 == 0:
                    print(
                        "Ep: {}, reward: {}, t: {}".format(
                            sum(episode), np.mean(episode_reward), timestep
                        )
                    )

                for i, di in enumerate(done):
                    # print(d)
                    if di or episode_len[i] == self.max_ep_len:
                        episode_reward[i] = 0
                        episode_len[i] = 0
                        episode += 1

                if self.noise is not None:
                    self.noise.reset()

                state, episode_reward, episode_len = (
                    self.env.reset(),
                    np.zeros(self.env.n_envs),
                    np.zeros(self.env.n_envs),
                )
                episode += 1

            # update params
            if timestep >= self.start_update and timestep % self.update_interval == 0:
                self.update_params(self.update_interval)

        self.env.close()

    def get_hyperparams(self) -> Dict[str, Any]:
        hyperparams = {
            "network_type": self.network_type,
            "gamma": self.gamma,
            "lr_p": self.lr_p,
            "lr_q": self.lr_q,
            "polyak": self.polyak,
            "policy_frequency": self.policy_frequency,
            "noise_std": self.noise_std,
            "q1_weights": self.ac.qf1.state_dict(),
            "q2_weights": self.ac.qf2.state_dict(),
            "actor_weights": self.ac.actor.state_dict(),
        }

        return hyperparams

    def load_weights(self, weights) -> None:
        """
        Load weights for the agent from pretrained model
        """
        self.ac.actor.load_state_dict(weights["actor_weights"])
        self.ac.qf1.load_state_dict(weights["q1_weights"])
        self.ac.qf2.load_state_dict(weights["q2_weights"])

    def get_logging_params(self) -> Dict[str, Any]:
        """
        :returns: Logging parameters for monitoring training
        :rtype: dict
        """
        logs = {
            "policy_loss": safe_mean(self.logs["policy_loss"]),
            "value_loss": safe_mean(self.logs["value_loss"]),
        }

        self.empty_logs()
        return logs

    def empty_logs(self):
        """
        Empties logs
        """
        self.logs = {}
        self.logs["policy_loss"] = []
        self.logs["value_loss"] = []
Example #8
class SAC:
    """
    Soft Actor Critic algorithm (SAC)
    Paper: https://arxiv.org/abs/1812.05905
    :param network_type: (str) The deep neural network layer types ['mlp']
    :param env: (Gym environment) The environment to learn from
    :param gamma: (float) discount factor
    :param replay_size: (int) Replay memory size
    :param batch_size: (int) Update batch size
    :param lr: (float) network learning rate
    :param alpha: (float) entropy weight
    :param polyak: (float) Polyak averaging weight to update target network
    :param epochs: (int) Number of epochs
    :param start_steps: (int) Number of exploratory steps at start
    :param steps_per_epoch: (int) Number of steps per epoch
    :param max_ep_len: (int) Maximum steps per episode
    :param start_update: (int) Number of steps before first parameter update
    :param update_interval: (int) Number of steps between parameter updates
    :param save_interval: (int) Number of steps between saves of models
    :param layers: (tuple or list) Number of neurons in hidden layers
    :param tensorboard_log: (str) the log location for tensorboard (if None,
        no logging)
    :param seed: (int) seed for torch and gym
    :param render: (bool) if environment is to be rendered
    :param device: (str) device to use for tensor operations; 'cpu' for cpu
        and 'cuda' for gpu
    :param run_num: (int) if model has already been trained, the run number
    :param save_model: (str) model save location (if None, the model is not
        saved)
    :param pretrained: (int) if not None, the pretrained model to load
    """
    def __init__(
        self,
        network_type,
        env,
        gamma=0.99,
        replay_size=1000000,
        batch_size=256,
        lr=3e-4,
        alpha=0.01,
        polyak=0.995,
        entropy_tuning=True,
        epochs=1000,
        start_steps=0,
        steps_per_epoch=1000,
        max_ep_len=1000,
        start_update=256,
        update_interval=1,
        layers=(256, 256),
        pretrained=None,
        tensorboard_log=None,
        seed=None,
        render=False,
        device="cpu",
        run_num=None,
        save_model=None,
        save_interval=5000,
    ):

        self.network_type = network_type
        self.env = env
        self.gamma = gamma
        self.replay_size = replay_size
        self.batch_size = batch_size
        self.lr = lr
        self.alpha = alpha
        self.polyak = polyak
        self.entropy_tuning = entropy_tuning
        self.epochs = epochs
        self.start_steps = start_steps
        self.steps_per_epoch = steps_per_epoch
        self.max_ep_len = max_ep_len
        self.start_update = start_update
        self.update_interval = update_interval
        self.save_interval = save_interval
        self.layers = layers
        self.pretrained = pretrained
        self.tensorboard_log = tensorboard_log
        self.seed = seed
        self.render = render
        self.run_num = run_num
        self.save_model = save_model
        self.save = save_params
        self.load = load_params
        self.evaluate = evaluate

        # Assign device
        if "cuda" in device and torch.cuda.is_available():
            self.device = torch.device(device)
        else:
            self.device = torch.device("cpu")

        # Assign seed
        if seed is not None:
            set_seeds(seed, self.env)

        # Setup tensorboard writer
        self.writer = None
        if self.tensorboard_log is not None:  # pragma: no cover
            from torch.utils.tensorboard import SummaryWriter

            self.writer = SummaryWriter(log_dir=self.tensorboard_log)

        self.create_model()

    def create_model(self):
        state_dim = self.env.observation_space.shape[0]

        # initialize models
        if isinstance(self.env.action_space, gym.spaces.Discrete):
            action_dim = self.env.action_space.n
            disc = True
        elif isinstance(self.env.action_space, gym.spaces.Box):
            action_dim = self.env.action_space.shape[0]
            disc = False
        else:
            raise NotImplementedError

        self.q1 = get_model("v",
                            self.network_type)(state_dim, action_dim, "Qsa",
                                               self.layers).to(self.device)
        self.q2 = get_model("v",
                            self.network_type)(state_dim, action_dim, "Qsa",
                                               self.layers).to(self.device)

        self.policy = get_model("p",
                                self.network_type)(state_dim,
                                                   action_dim,
                                                   self.layers,
                                                   disc,
                                                   False,
                                                   sac=True).to(self.device)

        if self.pretrained is not None:
            self.load(self)
            self.q1.load_state_dict(self.checkpoint["q1_weights"])
            self.q2.load_state_dict(self.checkpoint["q2_weights"])
            self.policy.load_state_dict(self.checkpoint["policy_weights"])

            for key, item in self.checkpoint.items():
                if key not in ["weights", "save_model"]:
                    setattr(self, key, item)
            print("Loaded pretrained model")

        self.q1_targ = deepcopy(self.q1).to(self.device)
        self.q2_targ = deepcopy(self.q2).to(self.device)

        # freeze target parameters
        for p in self.q1_targ.parameters():
            p.requires_grad = False
        for p in self.q2_targ.parameters():
            p.requires_grad = False

        # optimizers
        self.q1_optimizer = opt.Adam(self.q1.parameters(), self.lr)
        self.q2_optimizer = opt.Adam(self.q2.parameters(), self.lr)
        self.policy_optimizer = opt.Adam(self.policy.parameters(), self.lr)

        if self.entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = opt.Adam([self.log_alpha], lr=self.lr)

        self.replay_buffer = ReplayBuffer(self.replay_size)

        # set action scales
        if self.env.action_space is None:
            self.action_scale = torch.tensor(1.0).to(self.device)
            self.action_bias = torch.tensor(0.0).to(self.device)
        else:
            self.action_scale = torch.FloatTensor(
                (self.env.action_space.high - self.env.action_space.low) /
                2.0).to(self.device)
            self.action_bias = torch.FloatTensor(
                (self.env.action_space.high + self.env.action_space.low) /
                2.0).to(self.device)

    def sample_action(self, state):
        mean, log_std = self.policy.forward(state)
        std = log_std.exp()

        # reparameterization trick
        distribution = Normal(mean, std)
        xi = distribution.rsample()
        yi = torch.tanh(xi)
        action = yi * self.action_scale + self.action_bias
        log_pi = distribution.log_prob(xi)

        # enforcing action bound (appendix of paper)
        log_pi -= torch.log(self.action_scale * (1 - yi.pow(2)) +
                            np.finfo(np.float32).eps)
        log_pi = log_pi.sum(1, keepdim=True)
        mean = torch.tanh(mean) * self.action_scale + self.action_bias
        return action, log_pi, mean

    def select_action(self, state):
        state = torch.FloatTensor(state).to(self.device).unsqueeze(0)
        action, _, _ = self.sample_action(state)
        return action.detach().cpu().numpy()[0]

    def update_params(self, state, action, reward, next_state, done):
        reward = reward.unsqueeze(1)
        done = done.unsqueeze(1)
        # compute targets
        with torch.no_grad():
            next_action, next_log_pi, _ = self.sample_action(next_state)
            next_q1_targ = self.q1_targ(
                torch.cat([next_state, next_action], dim=-1))
            next_q2_targ = self.q2_targ(
                torch.cat([next_state, next_action], dim=-1))
            next_q_targ = (torch.min(next_q1_targ, next_q2_targ) -
                           self.alpha * next_log_pi)
            next_q = reward + self.gamma * (1 - done) * next_q_targ

        # compute losses
        q1 = self.q1(torch.cat([state, action], dim=-1))
        q2 = self.q2(torch.cat([state, action], dim=-1))

        q1_loss = nn.MSELoss()(q1, next_q)
        q2_loss = nn.MSELoss()(q2, next_q)

        pi, log_pi, _ = self.sample_action(state)
        q1_pi = self.q1(torch.cat([state, pi], dim=-1))
        q2_pi = self.q2(torch.cat([state, pi], dim=-1))
        min_q_pi = torch.min(q1_pi, q2_pi)
        policy_loss = ((self.alpha * log_pi) - min_q_pi).mean()

        # gradient step
        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # alpha loss
        if self.entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_pi + self.target_entropy).detach()).mean()

            self.alpha_optim.zero_grad()
            alpha_loss.backward()
            self.alpha_optim.step()

            self.alpha = self.log_alpha.exp()
        else:
            alpha_loss = torch.tensor(0.0).to(self.device)

        # soft update target params
        for target_param, param in zip(self.q1_targ.parameters(),
                                       self.q1.parameters()):
            target_param.data.copy_(target_param.data * self.polyak +
                                    param.data * (1 - self.polyak))

        for target_param, param in zip(self.q2_targ.parameters(),
                                       self.q2.parameters()):
            target_param.data.copy_(target_param.data * self.polyak +
                                    param.data * (1 - self.polyak))

        return (q1_loss.item(), q2_loss.item(), policy_loss.item(),
                alpha_loss.item())

    def learn(self):  # pragma: no cover
        if self.tensorboard_log:
            writer = SummaryWriter(self.tensorboard_log)

        timestep = 0
        episode = 1
        total_steps = self.steps_per_epoch * self.epochs

        while episode >= 1:
            episode_reward = 0
            state = self.env.reset()
            done = False
            j = 0

            while not done:
                # sample action
                if timestep > self.start_steps:
                    action = self.select_action(state)
                else:
                    action = self.env.action_space.sample()

                if (timestep >= self.start_update
                        and timestep % self.update_interval == 0
                        and self.replay_buffer.get_len() > self.batch_size):
                    # get losses
                    batch = self.replay_buffer.sample(self.batch_size)
                    states, actions, rewards, next_states, dones = (x.to(
                        self.device) for x in batch)

                    (q1_loss, q2_loss, policy_loss,
                     alpha_loss) = self.update_params(states, actions, rewards,
                                                      next_states, dones)

                    # write loss logs to tensorboard
                    if self.tensorboard_log:
                        writer.add_scalar("loss/q1_loss", q1_loss, timestep)
                        writer.add_scalar("loss/q2_loss", q2_loss, timestep)
                        writer.add_scalar("loss/policy_loss", policy_loss,
                                          timestep)
                        writer.add_scalar("loss/alpha_loss", alpha_loss,
                                          timestep)

                if self.save_model is not None:
                    if (timestep >= self.start_update
                            and timestep % self.save_interval == 0):
                        self.checkpoint = self.get_hyperparams()
                        self.save(self, timestep)
                        print("Saved current model")

                # prepare transition for replay memory push
                next_state, reward, done, _ = self.env.step(action)
                if self.render:
                    self.env.render()
                timestep += 1
                j += 1
                episode_reward += reward

                ndone = 1 if j == self.max_ep_len else float(not done)
                self.replay_buffer.push(
                    (state, action, reward, next_state, 1 - ndone))
                state = next_state

            if timestep > total_steps:
                break

            # write episode reward to tensorboard logs
            if self.tensorboard_log:
                writer.add_scalar("reward/episode_reward", episode_reward,
                                  timestep)

            if episode % 5 == 0:
                print("Episode: {}, Total Timesteps: {}, Reward: {}".format(
                    episode, timestep, episode_reward))
            episode += 1

        self.env.close()
        if self.tensorboard_log:
            self.writer.close()

    def get_hyperparams(self):
        hyperparams = {
            "network_type": self.network_type,
            "gamma": self.gamma,
            "lr": self.lr,
            "replay_size": self.replay_size,
            "entropy_tuning": self.entropy_tuning,
            "alpha": self.alpha,
            "polyak": self.polyak,
            "q1_weights": self.q1.state_dict(),
            "q2_weights": self.q2.state_dict(),
            "policy_weights": self.policy.state_dict(),
        }

        return hyperparams
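A hedged usage sketch for the SAC class above, assuming it and its network/buffer dependencies are importable from the surrounding library; "Pendulum-v0" is only an illustrative continuous-control task:

import gym

env = gym.make("Pendulum-v0")
agent = SAC("mlp", env, batch_size=256, entropy_tuning=True, seed=42)
agent.learn()                          # runs the training loop defined above
checkpoint = agent.get_hyperparams()   # contains q1/q2/policy state dicts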
Example #9
    def create_model(self):
        state_dim = self.env.observation_space.shape[0]

        # initialize models
        if isinstance(self.env.action_space, gym.spaces.Discrete):
            action_dim = self.env.action_space.n
            disc = True
        elif isinstance(self.env.action_space, gym.spaces.Box):
            action_dim = self.env.action_space.shape[0]
            disc = False
        else:
            raise NotImplementedError

        self.q1 = get_model("v",
                            self.network_type)(state_dim, action_dim, "Qsa",
                                               self.layers).to(self.device)
        self.q2 = get_model("v",
                            self.network_type)(state_dim, action_dim, "Qsa",
                                               self.layers).to(self.device)

        self.policy = get_model("p",
                                self.network_type)(state_dim,
                                                   action_dim,
                                                   self.layers,
                                                   disc,
                                                   False,
                                                   sac=True).to(self.device)

        if self.pretrained is not None:
            self.load(self)
            self.q1.load_state_dict(self.checkpoint["q1_weights"])
            self.q2.load_state_dict(self.checkpoint["q2_weights"])
            self.policy.load_state_dict(self.checkpoint["policy_weights"])

            for key, item in self.checkpoint.items():
                if key not in ["weights", "save_model"]:
                    setattr(self, key, item)
            print("Loaded pretrained model")

        self.q1_targ = deepcopy(self.q1).to(self.device)
        self.q2_targ = deepcopy(self.q2).to(self.device)

        # freeze target parameters
        for p in self.q1_targ.parameters():
            p.requires_grad = False
        for p in self.q2_targ.parameters():
            p.requires_grad = False

        # optimizers
        self.q1_optimizer = opt.Adam(self.q1.parameters(), self.lr)
        self.q2_optimizer = opt.Adam(self.q2.parameters(), self.lr)
        self.policy_optimizer = opt.Adam(self.policy.parameters(), self.lr)

        if self.entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = opt.Adam([self.log_alpha], lr=self.lr)

        self.replay_buffer = ReplayBuffer(self.replay_size)

        # set action scales
        if self.env.action_space is None:
            self.action_scale = torch.tensor(1.0).to(self.device)
            self.action_bias = torch.tensor(0.0).to(self.device)
        else:
            self.action_scale = torch.FloatTensor(
                (self.env.action_space.high - self.env.action_space.low) /
                2.0).to(self.device)
            self.action_bias = torch.FloatTensor(
                (self.env.action_space.high + self.env.action_space.low) /
                2.0).to(self.device)
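The ReplayBuffer(self.replay_size) used here is assumed to expose push(transition), sample(batch_size) and get_len(), which is how the SAC and DQN examples on this page call it. A minimal deque-based sketch of that interface (not the library's actual implementation):

import random
from collections import deque

import numpy as np
import torch


class SimpleReplayBuffer:
    """Minimal stand-in for the replay buffer interface used in these examples."""

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, transition):
        # transition is (state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        columns = map(np.asarray, zip(*batch))
        return tuple(torch.as_tensor(col, dtype=torch.float32) for col in columns)

    def get_len(self):
        return len(self.memory)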
Example #10
class DQN:
    """
    Deep Q Networks
    Paper (DQN) https://arxiv.org/pdf/1312.5602.pdf
    Paper (Double DQN) https://arxiv.org/abs/1509.06461

    :param network_type: The deep neural network layer types ['mlp', 'cnn']
    :param env: The environment to learn from
    :param double_dqn: For training Double DQN
    :param dueling_dqn: For training Dueling DQN
    :param noisy_dqn: For using Noisy DQN
    :param categorical_dqn: For using Distributional (Categorical) DQN
    :param prioritized_replay: For using a prioritized replay buffer
    :param epochs: Number of epochs
    :param max_iterations_per_epoch: Number of iterations per epoch
    :param max_ep_len: Maximum steps per episode
    :param gamma: discount factor
    :param lr: learning rate for the optimizer
    :param batch_size: Update batch size
    :param replay_size: Replay memory size
    :param tensorboard_log: the log location for tensorboard
    :param seed: seed for torch and gym
    :param render: if environment is to be rendered
    :param device: device to use for tensor operations; 'cpu' for cpu and 'cuda' for gpu
    :type network_type: string
    :type env: Gym environment
    :type double_dqn: bool
    :type dueling_dqn: bool
    :type noisy_dqn: bool
    :type categorical_dqn: bool
    :type prioritized_replay: bool
    :type epochs: int
    :type max_iterations_per_epoch: int
    :type max_ep_len: int
    :type gamma: float
    :type lr: float
    :type batch_size: int
    :type replay_size: int
    :type tensorboard_log: string
    :type seed: int
    :type render: bool
    :type device: string
    """
    def __init__(
        self,
        network_type,
        env,
        double_dqn=False,
        dueling_dqn=False,
        noisy_dqn=False,
        categorical_dqn=False,
        prioritized_replay=False,
        epochs=100,
        max_iterations_per_epoch=100,
        max_ep_len=1000,
        gamma=0.99,
        lr=0.001,
        batch_size=32,
        replay_size=100,
        prioritized_replay_alpha=0.6,
        max_epsilon=1.0,
        min_epsilon=0.01,
        epsilon_decay=1000,
        num_atoms=51,
        Vmin=-10,
        Vmax=10,
        tensorboard_log=None,
        seed=None,
        render=False,
        device="cpu",
        save_interval=5000,
        pretrained=None,
        run_num=None,
        save_model=None,
        transform=None,
    ):
        self.env = env
        self.double_dqn = double_dqn
        self.dueling_dqn = dueling_dqn
        self.noisy_dqn = noisy_dqn
        self.categorical_dqn = categorical_dqn
        self.prioritized_replay = prioritized_replay
        self.max_epochs = epochs
        self.max_iterations_per_epoch = max_iterations_per_epoch
        self.max_ep_len = max_ep_len
        self.replay_size = replay_size
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.lr = lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.num_atoms = num_atoms
        self.Vmin = Vmin
        self.Vmax = Vmax
        self.tensorboard_log = tensorboard_log
        self.render = render
        self.loss_hist = []
        self.reward_hist = []
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        self.evaluate = evaluate
        self.run_num = run_num
        self.save_model = save_model
        self.save_interval = save_interval
        self.save = save_params
        self.load = load_params
        self.pretrained = pretrained
        self.network_type = network_type
        self.history_length = None
        self.transform = transform

        # Assign device
        if "cuda" in device and torch.cuda.is_available():
            self.device = torch.device(device)
        else:
            self.device = torch.device("cpu")

        # Assign seed
        if seed is not None:
            set_seeds(seed, self.env)

        # Setup tensorboard writer
        self.writer = None
        if self.tensorboard_log is not None:  # pragma: no cover
            from torch.utils.tensorboard import SummaryWriter

            self.writer = SummaryWriter(log_dir=self.tensorboard_log)

        self.create_model()

    def create_model(self):
        '''
        Initialize the model and target model for various variants of DQN. 
        Initializes optimizer and replay buffers as well.
        '''
        state_dim, action_dim, disc = self.get_env_properties()
        if self.network_type == "mlp":
            if self.dueling_dqn:
                self.model = DuelingDQNValueMlp(state_dim, action_dim)
            elif self.categorical_dqn:
                self.model = CategoricalDQNValue(
                    state_dim,
                    action_dim,
                    self.num_atoms,
                )
            elif self.noisy_dqn:
                self.model = NoisyDQNValue(state_dim, action_dim)
            else:
                self.model = get_model("v",
                                       self.network_type)(state_dim,
                                                          action_dim, "Qs")

        elif self.network_type == "cnn":
            if self.history_length is None:
                self.history_length = 4

            if self.transform is None:
                self.transform = transforms.Compose([
                    transforms.ToPILImage(),
                    transforms.Grayscale(),
                    transforms.Resize((110, 84)),
                    transforms.CenterCrop(84),
                    transforms.ToTensor()
                ])

            self.state_history = deque([
                self.transform(self.env.observation_space.sample()).reshape(
                    -1, 84, 84) for _ in range(self.history_length)
            ],
                                       maxlen=self.history_length)

            if self.dueling_dqn:
                self.model = DuelingDQNValueCNN(self.env.action_space.n,
                                                self.history_length)
            elif self.noisy_dqn:
                self.model = NoisyDQNValueCNN(self.env.action_space.n,
                                              self.history_length)
            elif self.categorical_dqn:
                self.model = CategoricalDQNValueCNN(self.env.action_space.n,
                                                    self.num_atoms,
                                                    self.history_length)
            else:
                self.model = get_model("v", self.network_type)(
                    self.env.action_space.n, self.history_length, "Qs")

        # load parameters if already trained
        if self.pretrained is not None:
            self.load(self)
            self.model.load_state_dict(self.checkpoint["weights"])
            for key, item in self.checkpoint.items():
                if key not in ["weights", "save_model"]:
                    setattr(self, key, item)
            print("Loaded pretrained model")

        self.target_model = deepcopy(self.model)

        if self.prioritized_replay:
            self.replay_buffer = PrioritizedBuffer(
                self.replay_size, self.prioritized_replay_alpha)
        else:
            self.replay_buffer = ReplayBuffer(self.replay_size)

        self.optimizer = opt.Adam(self.model.parameters(), lr=self.lr)

    def get_env_properties(self):
        '''
        Helper function to extract the observation and action space

        :returns: State space dimension, action space dimension, and whether the action space is discrete
        :rtype: int, int, bool
        '''
        state_dim = self.env.observation_space.shape[0]

        if isinstance(self.env.action_space, gym.spaces.Discrete):
            action_dim = self.env.action_space.n
            disc = True
        elif isinstance(self.env.action_space, gym.spaces.Box):
            action_dim = self.env.action_space.shape[0]
            disc = False
        else:
            raise NotImplementedError

        return state_dim, action_dim, disc

    def update_target_model(self):
        '''
        Copy the current model's weights into the target model
        '''
        self.target_model.load_state_dict(self.model.state_dict())

    def select_action(self, state):
        '''
        Epsilon Greedy selection of action

        :param state: Observation state
        :type state: int, float, ...
        :returns: Action based on the state and epsilon value 
        :rtype: int, float, ... 
        '''
        if np.random.rand() > self.epsilon:
            if self.categorical_dqn:
                with torch.no_grad():
                    state = Variable(torch.FloatTensor(state))
                    dist = self.model(state).data.cpu()
                    dist = (
                        dist *
                        torch.linspace(self.Vmin, self.Vmax, self.num_atoms))
                    action = dist.sum(2).max(1)[1].numpy()[0]
            else:
                state = Variable(torch.FloatTensor(state))
                q_value = self.model(state)
                action = np.argmax(q_value.detach().numpy())
        else:
            action = self.env.action_space.sample()

        return action

    def get_td_loss(self):
        '''
        Computes the TD loss for the configured DQN variant

        :returns: the TD loss depending upon the variant
        :rtype: float
        '''
        if self.prioritized_replay:
            (
                state,
                action,
                reward,
                next_state,
                done,
                indices,
                weights,
            ) = self.replay_buffer.sample(self.batch_size)
            weights = Variable(torch.FloatTensor(weights))
        else:
            (state, action, reward, next_state,
             done) = self.replay_buffer.sample(self.batch_size)

        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action.long()))
        reward = Variable(torch.FloatTensor(reward))
        done = Variable(torch.FloatTensor(done))

        if self.network_type == "cnn":
            state = state.view(-1, 4, 84, 84)
            next_state = next_state.view(-1, 4, 84, 84)

        if self.categorical_dqn:
            projection_dist = self.projection_distribution(
                next_state, reward, done)
            dist = self.model(state)
            action = (action.unsqueeze(1).unsqueeze(1).expand(
                self.batch_size, 1, self.num_atoms))
            dist = dist.gather(1, action).squeeze(1)
            dist.data.clamp_(0.01, 0.99)

        elif self.double_dqn:
            q_values = self.model(state)
            q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

            q_next_state_values = self.model(next_state)
            action_next = q_next_state_values.max(1)[1]

            q_target_next_state_values = self.target_model(next_state)
            q_target_s_a_prime = q_target_next_state_values.gather(
                1, action_next.unsqueeze(1)).squeeze(1)
            expected_q_value = (reward + self.gamma * q_target_s_a_prime *
                                (1 - done))

        else:
            q_values = self.model(state)
            q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)

            q_next_state_values = self.target_model(next_state)
            q_s_a_prime = q_next_state_values.max(1)[0]
            expected_q_value = reward + self.gamma * q_s_a_prime * (1 - done)

        if self.categorical_dqn:
            loss = -(Variable(projection_dist) * dist.log()).sum(1).mean()
        else:
            if self.prioritized_replay:
                loss = (q_value - expected_q_value.detach()).pow(2) * weights
                priorities = loss + 1e-5
                loss = loss.mean()
                self.replay_buffer.update_priorities(
                    indices,
                    priorities.data.cpu().numpy())
            else:
                loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.loss_hist.append(loss)

        return loss

    def update_params(self):
        '''
        Takes an optimizer step. This internally calls get_td_loss(), so there is no need to call that function explicitly.
        '''
        loss = self.get_td_loss()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.noisy_dqn or self.categorical_dqn:
            self.model.reset_noise()
            self.target_model.reset_noise()

    def calculate_epsilon_by_frame(self, frame_idx):
        '''
        A helper function to calculate the value of epsilon after every step. 

        :param frame_idx: Current step 
        :type frame_idx: int
        :returns: epsilon value for the step
        :rtype: float 
        '''
        return (self.min_epsilon + (self.max_epsilon - self.min_epsilon) *
                np.exp(-1.0 * frame_idx / self.epsilon_decay))

    def projection_distribution(self, next_state, rewards, dones):
        '''
        A helper function used for categorical DQN

        :param next_state: next observation state
        :param rewards: rewards collected
        :param dones: dones 
        :type next_state: int, float, ...
        :type rewards: list 
        :type dones: list
        :returns: projection distribution 
        :rtype: float 
        '''
        batch_size = next_state.size(0)

        delta_z = float(self.Vmax - self.Vmin) / (self.num_atoms - 1)
        support = torch.linspace(self.Vmin, self.Vmax, self.num_atoms)

        next_dist = self.target_model(next_state).data.cpu() * support
        next_action = next_dist.sum(2).max(1)[1]
        next_action = (next_action.unsqueeze(1).unsqueeze(1).expand(
            next_dist.size(0), 1, next_dist.size(2)))
        next_dist = next_dist.gather(1, next_action).squeeze(1)

        rewards = rewards.unsqueeze(1).expand_as(next_dist)
        dones = dones.unsqueeze(1).expand_as(next_dist)
        support = support.unsqueeze(0).expand_as(next_dist)

        Tz = rewards + (1 - dones) * 0.99 * support
        Tz = Tz.clamp(min=self.Vmin, max=self.Vmax)
        b = (Tz - self.Vmin) / delta_z
        lower = b.floor().long()
        upper = b.ceil().long()

        offset = torch.linspace(0, (batch_size - 1) * self.num_atoms,
                                batch_size).long().unsqueeze(1).expand(
                                    self.batch_size, self.num_atoms)

        projection_dist = torch.zeros(next_dist.size())
        projection_dist.view(-1).index_add_(0, (lower + offset).view(-1),
                                            (next_dist *
                                             (upper.float() - b)).view(-1))
        projection_dist.view(-1).index_add_(0, (upper + offset).view(-1),
                                            (next_dist *
                                             (b - lower.float())).view(-1))

        return projection_dist

    def learn(self):  # pragma: no cover
        total_steps = self.max_epochs * self.max_iterations_per_epoch
        state, episode_reward, episode, episode_len = self.env.reset(), 0, 0, 0

        if self.network_type == "cnn":
            self.state_history.append(self.transform(state))
            phi_state = torch.stack(list(self.state_history), dim=1)

        if self.double_dqn:
            self.update_target_model()

        for frame_idx in range(1, total_steps + 1):
            self.epsilon = self.calculate_epsilon_by_frame(frame_idx)

            if self.network_type == "mlp":
                action = self.select_action(state)
            elif self.network_type == "cnn":
                action = self.select_action(phi_state)

            next_state, reward, done, _ = self.env.step(action)

            if self.render:
                self.env.render()

            if self.network_type == "cnn":
                self.state_history.append(self.transform(next_state))
                phi_next_state = torch.stack(list(self.state_history), dim=1)
                self.replay_buffer.push(
                    (phi_state, action, reward, phi_next_state, done))
                phi_state = phi_next_state
            else:
                self.replay_buffer.push(
                    (state, action, reward, next_state, done))
                state = next_state

            episode_reward += reward
            episode_len += 1

            done = False if episode_len == self.max_ep_len else done

            if done or (episode_len == self.max_ep_len):
                if episode % 2 == 0:
                    print("Episode: {}, Reward: {}, Frame Index: {}".format(
                        episode, episode_reward, frame_idx))
                if self.tensorboard_log:
                    self.writer.add_scalar("episode_reward", episode_reward,
                                           frame_idx)

                self.reward_hist.append(episode_reward)
                state, episode_reward, episode_len = self.env.reset(), 0, 0
                episode += 1

            if self.replay_buffer.get_len() > self.batch_size:
                self.update_params()

            if self.save_model is not None:
                if frame_idx % self.save_interval == 0:
                    self.checkpoint = self.get_hyperparams()
                    self.save(self, frame_idx)
                    print("Saved current model")

            if frame_idx % 100 == 0:
                self.update_target_model()

        self.env.close()
        if self.tensorboard_log:
            self.writer.close()

    def get_hyperparams(self):
        hyperparams = {
            "gamma": self.gamma,
            "batch_size": self.batch_size,
            "lr": self.lr,
            "replay_size": self.replay_size,
            "double_dqn": self.double_dqn,
            "dueling_dqn": self.dueling_dqn,
            "noisy_dqn": self.noisy_dqn,
            "categorical_dqn": self.categorical_dqn,
            "prioritized_replay": self.prioritized_replay,
            "prioritized_replay_alpha": self.prioritized_replay_alpha,
            "weights": self.model.state_dict(),
        }

        return hyperparams
Example #11
    def create_model(self):
        '''
        Initialize the model and target model for various variants of DQN. 
        Initializes optimizer and replay buffers as well.
        '''
        state_dim, action_dim, disc = self.get_env_properties()
        if self.network_type == "mlp":
            if self.dueling_dqn:
                self.model = DuelingDQNValueMlp(state_dim, action_dim)
            elif self.categorical_dqn:
                self.model = CategoricalDQNValue(
                    state_dim,
                    action_dim,
                    self.num_atoms,
                )
            elif self.noisy_dqn:
                self.model = NoisyDQNValue(state_dim, action_dim)
            else:
                self.model = get_model("v",
                                       self.network_type)(state_dim,
                                                          action_dim, "Qs")

        elif self.network_type == "cnn":
            if self.history_length is None:
                self.history_length = 4

            if self.transform is None:
                self.transform = transforms.Compose([
                    transforms.ToPILImage(),
                    transforms.Grayscale(),
                    transforms.Resize((110, 84)),
                    transforms.CenterCrop(84),
                    transforms.ToTensor()
                ])

            self.state_history = deque([
                self.transform(self.env.observation_space.sample()).reshape(
                    -1, 84, 84) for _ in range(self.history_length)
            ],
                                       maxlen=self.history_length)

            if self.dueling_dqn:
                self.model = DuelingDQNValueCNN(self.env.action_space.n,
                                                self.history_length)
            elif self.noisy_dqn:
                self.model = NoisyDQNValueCNN(self.env.action_space.n,
                                              self.history_length)
            elif self.categorical_dqn:
                self.model = CategoricalDQNValueCNN(self.env.action_space.n,
                                                    self.num_atoms,
                                                    self.history_length)
            else:
                self.model = get_model("v", self.network_type)(
                    self.env.action_space.n, self.history_length, "Qs")

        # load parameters if already trained
        if self.pretrained is not None:
            self.load(self)
            self.model.load_state_dict(self.checkpoint["weights"])
            for key, item in self.checkpoint.items():
                if key not in ["weights", "save_model"]:
                    setattr(self, key, item)
            print("Loaded pretrained model")

        self.target_model = deepcopy(self.model)

        if self.prioritized_replay:
            self.replay_buffer = PrioritizedBuffer(
                self.replay_size, self.prioritized_replay_alpha)
        else:
            self.replay_buffer = ReplayBuffer(self.replay_size)

        self.optimizer = opt.Adam(self.model.parameters(), lr=self.lr)
Example #12
class DDPG:
    """
    Deep Deterministic Policy Gradient algorithm (DDPG)
    Paper: https://arxiv.org/abs/1509.02971
    :param network_type: (str) The deep neural network layer types ['mlp']
    :param env: (Gym environment) The environment to learn from
    :param gamma: (float) discount factor
    :param replay_size: (int) Replay memory size
    :param batch_size: (int) Update batch size
    :param lr_p: (float) Policy network learning rate
    :param lr_q: (float) Q network learning rate
    :param polyak: (float) Polyak averaging weight to update target network
    :param epochs: (int) Number of epochs
    :param start_steps: (int) Number of exploratory steps at start
    :param steps_per_epoch: (int) Number of steps per epoch
    :param noise_std: (float) Standard deviation for action noise
    :param max_ep_len: (int) Maximum steps per episode
    :param start_update: (int) Number of steps before first parameter update
    :param update_interval: (int) Number of steps between parameter updates
    :param save_interval: (int) Number of steps between saves of models
    :param layers: (tuple or list) Number of neurons in hidden layers
    :param tensorboard_log: (str) the log location for tensorboard (if None,
        no logging)
    :param seed: (int) seed for torch and gym
    :param render: (bool) if environment is to be rendered
    :param device: (str) device to use for tensor operations; 'cpu' for cpu
        and 'cuda' for gpu
    :param run_num: (int) model run number if it has already been trained,
        (if None, don't load from past model)
    :param save_model: (string) directory the user wants to save models to
    """
    def __init__(
        self,
        network_type,
        env,
        gamma=0.99,
        replay_size=1000000,
        batch_size=100,
        lr_p=0.0001,
        lr_q=0.001,
        polyak=0.995,
        epochs=100,
        start_steps=10000,
        steps_per_epoch=4000,
        noise=None,
        noise_std=0.1,
        max_ep_len=1000,
        start_update=1000,
        update_interval=50,
        layers=(32, 32),
        pretrained=None,
        tensorboard_log=None,
        seed=None,
        render=False,
        device="cpu",
        run_num=None,
        save_model=None,
        save_interval=5000,
    ):

        self.network_type = network_type
        self.env = env
        self.gamma = gamma
        self.replay_size = replay_size
        self.batch_size = batch_size
        self.lr_p = lr_p
        self.lr_q = lr_q
        self.polyak = polyak
        self.epochs = epochs
        self.start_steps = start_steps
        self.steps_per_epoch = steps_per_epoch
        self.noise = noise
        self.noise_std = noise_std
        self.max_ep_len = max_ep_len
        self.start_update = start_update
        self.update_interval = update_interval
        self.save_interval = save_interval
        self.pretrained = pretrained
        self.layers = layers
        self.tensorboard_log = tensorboard_log
        self.seed = seed
        self.render = render
        self.run_num = run_num
        self.save_model = save_model
        self.save = save_params
        self.load = load_params

        # Assign device
        if "cuda" in device and torch.cuda.is_available():
            self.device = torch.device(device)
        else:
            self.device = torch.device("cpu")

        # Assign seed
        if seed is not None:
            set_seeds(seed, self.env)

        # Setup tensorboard writer
        self.writer = None
        if self.tensorboard_log is not None:  # pragma: no cover
            from torch.utils.tensorboard import SummaryWriter

            self.writer = SummaryWriter(log_dir=self.tensorboard_log)

        self.create_model()

    def create_model(self):
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]
        if self.noise is not None:
            self.noise = self.noise(np.zeros_like(action_dim),
                                    self.noise_std * np.ones_like(action_dim))

        self.ac = get_model("ac", self.network_type)(state_dim, action_dim,
                                                     self.layers, "Qsa",
                                                     False).to(self.device)

        # load parameters if already trained
        if self.pretrained is not None:
            self.load(self)
            self.ac.load_state_dict(self.checkpoint["weights"])
            for key, item in self.checkpoint.items():
                if key not in ["weights", "save_model"]:
                    setattr(self, key, item)
            print("Loaded pretrained model")

        self.ac_target = deepcopy(self.ac).to(self.device)

        # freeze target network params
        for param in self.ac_target.parameters():
            param.requires_grad = False

        self.replay_buffer = ReplayBuffer(self.replay_size)
        self.optimizer_policy = opt.Adam(self.ac.actor.parameters(),
                                         lr=self.lr_p)
        self.optimizer_q = opt.Adam(self.ac.critic.parameters(), lr=self.lr_q)

    def select_action(self, state, deterministic=True):
        with torch.no_grad():
            action, _ = self.ac.get_action(torch.as_tensor(
                state, dtype=torch.float32).to(self.device),
                                           deterministic=deterministic)
            action = action.detach().cpu().numpy()

        # add noise to output from policy network
        if self.noise is not None:
            action += self.noise()

        return np.clip(action, self.env.action_space.low[0],
                       self.env.action_space.high[0])

    def get_q_loss(self, state, action, reward, next_state, done):
        q = self.ac.critic.get_value(torch.cat([state, action], dim=-1))

        with torch.no_grad():
            q_pi_target = self.ac_target.get_value(
                torch.cat([
                    next_state,
                    self.ac_target.get_action(next_state, True)[0]
                ],
                          dim=-1))
            target = reward + self.gamma * (1 - done) * q_pi_target

        return nn.MSELoss()(q, target)

    def get_p_loss(self, state):
        q_pi = self.ac.get_value(
            torch.cat([state, self.ac.get_action(state, True)[0]], dim=-1))
        return -torch.mean(q_pi)

    def update_params(self, state, action, reward, next_state, done):
        self.optimizer_q.zero_grad()
        loss_q = self.get_q_loss(state, action, reward, next_state, done)
        loss_q.backward()
        self.optimizer_q.step()

        # freeze critic params for policy update
        for param in self.ac.critic.parameters():
            param.requires_grad = False

        self.optimizer_policy.zero_grad()
        loss_p = self.get_p_loss(state)
        loss_p.backward()
        self.optimizer_policy.step()

        # unfreeze critic params
        for param in self.ac.critic.parameters():
            param.requires_grad = True

        # update target network
        with torch.no_grad():
            for param, param_target in zip(self.ac.parameters(),
                                           self.ac_target.parameters()):
                param_target.data.mul_(self.polyak)
                param_target.data.add_((1 - self.polyak) * param.data)

    def learn(self):  # pragma: no cover
        state, episode_reward, episode_len, episode = self.env.reset(), 0, 0, 0
        total_steps = self.steps_per_epoch * self.epochs

        if self.noise is not None:
            self.noise.reset()

        for t in range(total_steps):
            # execute single transition
            if t > self.start_steps:
                action = self.select_action(state, deterministic=True)
            else:
                action = self.env.action_space.sample()

            next_state, reward, done, _ = self.env.step(action)
            if self.render:
                self.env.render()
            episode_reward += reward
            episode_len += 1

            # don't set done to True if max_ep_len reached
            done = False if episode_len == self.max_ep_len else done

            self.replay_buffer.push((state, action, reward, next_state, done))

            state = next_state

            if done or (episode_len == self.max_ep_len):

                if self.noise is not None:
                    self.noise.reset()

                if episode % 20 == 0:
                    print("Episode: {}, Reward: {}, Timestep: {}".format(
                        episode, episode_reward, t))
                if self.tensorboard_log:
                    self.writer.add_scalar("episode_reward", episode_reward, t)

                state, episode_reward, episode_len = self.env.reset(), 0, 0
                episode += 1

            # update params
            if t >= self.start_update and t % self.update_interval == 0:
                for _ in range(self.update_interval):
                    batch = self.replay_buffer.sample(self.batch_size)
                    states, actions, rewards, next_states, dones = (x.to(
                        self.device) for x in batch)
                    self.update_params(states, actions, rewards, next_states,
                                       dones)

            if self.save_model is not None:
                if t >= self.start_update and t % self.save_interval == 0:
                    self.checkpoint = self.get_hyperparams()
                    self.save(self, t)
                    print("Saved current model")

        self.env.close()
        if self.tensorboard_log:
            self.writer.close()

    def get_hyperparams(self):
        hyperparams = {
            "network_type": self.network_type,
            "gamma": self.gamma,
            "batch_size": self.batch_size,
            "replay_size": self.replay_size,
            "polyak": self.polyak,
            "noise_std": self.noise_std,
            "lr_policy": self.lr_p,
            "lr_value": self.lr_q,
            "weights": self.ac.state_dict(),
        }

        return hyperparams
Example #13
class SAC:
    """
    Soft Actor Critic algorithm (SAC)

    Paper: https://arxiv.org/abs/1812.05905

    :param network_type: The deep neural network layer types ['mlp', 'cnn']
    :param env: The environment to learn from
    :param gamma: discount factor
    :param replay_size: Replay memory size
    :param batch_size: Update batch size
    :param lr: learning rate for optimizers
    :param alpha: entropy coefficient
    :param polyak: polyak averaging weight for target network update
    :param entropy_tuning: if alpha should be a learned parameter
    :param epochs: Number of epochs to train on
    :param start_steps: Number of initial exploratory steps
    :param steps_per_epoch: Number of steps per epoch
    :param max_ep_len: Maximum number of steps per episode
    :param start_update: Number of steps before first parameter update
    :param update_interval: Number of steps between updates
    :param layers: Neural network layer dimensions
    :param seed: seed for torch and gym
    :param render: if environment is to be rendered
    :param device: device to use for tensor operations; ['cpu','cuda']
    :type network_type: string
    :type env: Gym environment
    :type gamma: float
    :type replay_size: int
    :type batch_size: int
    :type lr: float
    :type alpha: float
    :type polyak: float
    :type entropy_tuning: bool
    :type epochs: int
    :type start_steps: int
    :type steps_per_epoch: int
    :type max_ep_len: int
    :type start_update: int
    :type update_interval: int
    :type layers: tuple
    :type seed: int
    :type render: bool
    :type device: string
    """
    def __init__(
        self,
        network_type: str,
        env: Union[gym.Env, VecEnv],
        gamma: float = 0.99,
        replay_size: int = 1000000,
        batch_size: int = 256,
        lr: float = 3e-4,
        alpha: float = 0.01,
        polyak: float = 0.995,
        entropy_tuning: bool = True,
        epochs: int = 1000,
        start_steps: int = 0,
        steps_per_epoch: int = 1000,
        max_ep_len: int = 1000,
        start_update: int = 256,
        update_interval: int = 1,
        layers: Tuple = (256, 256),
        seed: Optional[int] = None,
        render: bool = False,
        device: Union[torch.device, str] = "cpu",
    ):

        self.network_type = network_type
        self.env = env
        self.gamma = gamma
        self.replay_size = replay_size
        self.batch_size = batch_size
        self.lr = lr
        self.alpha = alpha
        self.polyak = polyak
        self.entropy_tuning = entropy_tuning
        self.epochs = epochs
        self.start_steps = start_steps
        self.steps_per_epoch = steps_per_epoch
        self.max_ep_len = max_ep_len
        self.start_update = start_update
        self.update_interval = update_interval
        self.layers = layers
        self.seed = seed
        self.render = render

        # Assign device
        if "cuda" in device and torch.cuda.is_available():
            self.device = torch.device(device)
        else:
            self.device = torch.device("cpu")

        # Assign seed
        if seed is not None:
            set_seeds(seed, self.env)

        # Setup tensorboard writer
        self.writer = None

        self.empty_logs()
        self.create_model()

    def create_model(self) -> None:
        """
        Initialize the model
        Initializes optimizer and replay buffers as well.
        """
        state_dim, action_dim, discrete, _ = get_env_properties(self.env)

        self.q1 = (get_model("v", self.network_type)(
            state_dim, action_dim, "Qsa", self.layers).to(self.device).float())

        self.q2 = (get_model("v", self.network_type)(
            state_dim, action_dim, "Qsa", self.layers).to(self.device).float())

        self.policy = (get_model(
            "p", self.network_type)(state_dim,
                                    action_dim,
                                    self.layers,
                                    discrete,
                                    False,
                                    sac=True).to(self.device).float())

        self.q1_targ = deepcopy(self.q1).to(self.device).float()
        self.q2_targ = deepcopy(self.q2).to(self.device).float()

        # freeze target parameters
        for param in self.q1_targ.parameters():
            param.requires_grad = False
        for param in self.q2_targ.parameters():
            param.requires_grad = False

        # optimizers
        self.q1_optimizer = opt.Adam(self.q1.parameters(), self.lr)
        self.q2_optimizer = opt.Adam(self.q2.parameters(), self.lr)
        self.policy_optimizer = opt.Adam(self.policy.parameters(), self.lr)

        if self.entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = opt.Adam([self.log_alpha], lr=self.lr)

        self.replay_buffer = ReplayBuffer(self.replay_size, self.env)

        # set action scales
        if self.env.action_space is None:
            self.action_scale = torch.tensor(1.0).to(self.device)
            self.action_bias = torch.tensor(0.0).to(self.device)
        else:
            self.action_scale = torch.FloatTensor(
                (self.env.action_space.high - self.env.action_space.low) /
                2.0).to(self.device)
            self.action_bias = torch.FloatTensor(
                (self.env.action_space.high + self.env.action_space.low) /
                2.0).to(self.device)

    def sample_action(self,
                      state: np.ndarray,
                      deterministic: bool = False) -> np.ndarray:
        """
        Sample an action from the normal distribution parameterized by the policy network

        :param state: Observation state
        :param deterministic: Is the greedy action being chosen?
        :type state: int, float, ...
        :type deterministic: bool
        :returns: action
        :returns: log likelihood of policy
        :returns: scaled mean of normal distribution
        :rtype: int, float, ...
        :rtype: float
        :rtype: float
        """
        mean, log_std = self.policy.forward(state)
        std = log_std.exp()

        # reparameterization trick
        distribution = Normal(mean, std)
        xi = distribution.rsample()
        yi = torch.tanh(xi)
        action = yi * self.action_scale + self.action_bias
        log_pi = distribution.log_prob(xi)

        # enforcing action bound (appendix of paper)
        log_pi -= torch.log(self.action_scale * (1 - yi.pow(2)) +
                            np.finfo(np.float32).eps)
        log_pi = log_pi.sum(1, keepdim=True)
        mean = torch.tanh(mean) * self.action_scale + self.action_bias
        return action.float(), log_pi, mean

    def update_params_before_select_action(self, timestep: int) -> None:
        """
        Update any parameters before selecting action like epsilon for decaying epsilon greedy

        :param timestep: Timestep in the training process
        :type timestep: int
        """
        pass

    def select_action(self, state, deterministic=False):
        """
        select action given a state

        :param state: Observation state
        :param deterministic: Is the greedy action being chosen?
        :type state: int, float, ...
        :type deterministic: bool
        """
        state = torch.FloatTensor(state).to(self.device)
        action, _, _ = self.sample_action(state, deterministic)
        return action.detach().cpu().numpy()

    def update_params(self, update_interval: int) -> None:
        """
        Computes losses and takes optimizer steps; losses are appended to the logs.

        :param update_interval: Number of gradient steps to take
        :type update_interval: int
        """
        for timestep in range(update_interval):
            batch = self.replay_buffer.sample(self.batch_size)
            state, action, reward, next_state, done = (x.to(self.device)
                                                       for x in batch)
            # compute targets
            if self.env.n_envs == 1:
                state, action, next_state = (
                    state.squeeze().float(),
                    action.squeeze(1).float(),
                    next_state.squeeze().float(),
                )
            else:
                state, action, next_state = (
                    state.reshape(-1, *self.env.obs_shape).float(),
                    action.reshape(-1, *self.env.action_shape).float(),
                    next_state.reshape(-1, *self.env.obs_shape).float(),
                )
                reward, done = reward.reshape(-1, 1), done.reshape(-1, 1)

            with torch.no_grad():
                next_action, next_log_pi, _ = self.sample_action(next_state)
                next_q1_targ = self.q1_targ(
                    torch.cat([next_state, next_action], dim=-1))
                next_q2_targ = self.q2_targ(
                    torch.cat([next_state, next_action], dim=-1))
                next_q_targ = (torch.min(next_q1_targ, next_q2_targ) -
                               self.alpha * next_log_pi)
                next_q = reward + self.gamma * (1 - done) * next_q_targ

            # compute losses
            q1 = self.q1(torch.cat([state, action], dim=-1))
            q2 = self.q2(torch.cat([state, action], dim=-1))

            q1_loss = nn.MSELoss()(q1, next_q)
            q2_loss = nn.MSELoss()(q2, next_q)

            pi, log_pi, _ = self.sample_action(state)
            q1_pi = self.q1(torch.cat([state, pi.float()], dim=-1).float())
            q2_pi = self.q2(torch.cat([state, pi.float()], dim=-1).float())
            min_q_pi = torch.min(q1_pi, q2_pi)
            policy_loss = ((self.alpha * log_pi) - min_q_pi).mean()

            # gradient step
            self.q1_optimizer.zero_grad()
            q1_loss.backward()
            self.q1_optimizer.step()

            self.q2_optimizer.zero_grad()
            q2_loss.backward()
            self.q2_optimizer.step()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # alpha loss
            alpha_loss = torch.tensor(0.0).to(self.device)

            if self.entropy_tuning:
                alpha_loss = -(self.log_alpha *
                               (log_pi + self.target_entropy).detach()).mean()

                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()

                self.alpha = self.log_alpha.exp()

            # soft update target params
            for target_param, param in zip(self.q1_targ.parameters(),
                                           self.q1.parameters()):
                target_param.data.copy_(target_param.data * self.polyak +
                                        param.data * (1 - self.polyak))

            for target_param, param in zip(self.q2_targ.parameters(),
                                           self.q2.parameters()):
                target_param.data.copy_(target_param.data * self.polyak +
                                        param.data * (1 - self.polyak))

        self.logs["q1_loss"].append(q1_loss.item())
        self.logs["q2_loss"].append(q2_loss.item())
        self.logs["policy_loss"].append(policy_loss.item())
        self.logs["alpha_loss"].append(alpha_loss.item())

    def learn(self) -> None:  # pragma: no cover

        total_steps = self.steps_per_epoch * self.epochs * self.env.n_envs

        episode_reward, episode_len = (
            np.zeros(self.env.n_envs),
            np.zeros(self.env.n_envs),
        )
        state = self.env.reset()
        for i in range(0, total_steps, self.env.n_envs):
            # sample action
            if i > self.start_steps:
                action = self.select_action(state)
            else:
                action = self.env.sample()

            if (i >= self.start_update and i % self.update_interval == 0
                    and self.replay_buffer.pos > self.batch_size):
                self.update_params(self.update_interval)

            # prepare transition for replay memory push
            next_state, reward, done, _ = self.env.step(action)
            if self.render:
                self.env.render()

            done = [
                False if ep_len == self.max_ep_len else d
                for ep_len, d in zip(episode_len, done)
            ]

            if np.any(done) or np.any(episode_len == self.max_ep_len):
                for j, di in enumerate(done):
                    if di:
                        episode_reward[j] = 0
                        episode_len[j] = 0

            self.replay_buffer.extend(
                zip(state, action, reward, next_state, done))
            state = next_state

            if i > total_steps:
                break

            if sum(episode_len) % (
                    5 * self.env.n_envs) == 0 and sum(episode_len) != 0:
                print("Episode: {}, total numsteps: {}, reward: {}".format(
                    sum(episode_len), i, episode_reward))
            # ep += 1

        self.env.close()

    def get_hyperparams(self) -> Dict[str, Any]:
        hyperparams = {
            "network_type": self.network_type,
            "gamma": self.gamma,
            "lr": self.lr,
            "replay_size": self.replay_size,
            "entropy_tuning": self.entropy_tuning,
            "alpha": self.alpha,
            "polyak": self.polyak,
            "q1_weights": self.q1.state_dict(),
            "q2_weights": self.q2.state_dict(),
            "policy_weights": self.policy.state_dict(),
        }

        return hyperparams

    def load_weights(self, weights) -> None:
        """
        Load weights for the agent from pretrained model
        """
        self.q1.load_state_dict(weights["q1_weights"])
        self.q2.load_state_dict(weights["q2_weights"])
        self.policy.load_state_dict(weights["policy_weights"])

    def get_logging_params(self) -> Dict[str, Any]:
        """
        :returns: Logging parameters for monitoring training
        :rtype: dict
        """
        logs = {
            "policy_loss": safe_mean(self.logs["policy_loss"]),
            "q1_loss": safe_mean(self.logs["q1_loss"]),
            "q2_loss": safe_mean(self.logs["q2_loss"]),
            "alpha_loss": safe_mean(self.logs["alpha_loss"]),
        }

        self.empty_logs()
        return logs

    def empty_logs(self):
        """
        Empties logs
        """
        self.logs = {}
        self.logs["q1_loss"] = []
        self.logs["q2_loss"] = []
        self.logs["policy_loss"] = []
        self.logs["alpha_loss"] = []
Example #14
class DDPG:
    """
    Deep Deterministic Policy Gradient algorithm (DDPG)

    Paper: https://arxiv.org/abs/1509.02971

    :param network_type: The deep neural network layer types ['mlp', 'cnn']
    :param env: The environment to learn from
    :param gamma: discount factor
    :param replay_size: Replay memory size
    :param batch_size: Update batch size
    :param lr_p: learning rate for policy optimizer
    :param lr_q: learning rate for value fn optimizer
    :param polyak: polyak averaging weight for target network update
    :param epochs: Number of epochs
    :param start_steps: Number of exploratory steps at start
    :param steps_per_epoch: Number of steps per epoch
    :param noise_std: Standard deviation for action noise
    :param max_ep_len: Maximum steps per episode
    :param start_update: Number of steps before first parameter update
    :param update_interval: Number of steps between parameter updates
    :param layers: Number of neurons in hidden layers
    :param seed: seed for torch and gym
    :param render: if environment is to be rendered
    :param device: device to use for tensor operations; ['cpu','cuda']
    :type network_type: string
    :type env: Gym environment
    :type gamma: float
    :type replay_size: int
    :type batch_size: int
    :type lr_p: float
    :type lr_q: float
    :type polyak: float
    :type epochs: int
    :type start_steps: int
    :type steps_per_epoch: int
    :type noise_std: float
    :type max_ep_len: int
    :type start_update: int
    :type update_interval: int
    :type layers: tuple
    :type seed: int
    :type render: bool
    :type device: string
    """
    def __init__(
        self,
        network_type: str,
        env: Union[gym.Env, VecEnv],
        gamma: float = 0.99,
        replay_size: int = 1000000,
        batch_size: int = 100,
        lr_p: float = 0.0001,
        lr_q: float = 0.001,
        polyak: float = 0.995,
        epochs: int = 100,
        start_steps: int = 10000,
        steps_per_epoch: int = 4000,
        noise: Optional[Any] = None,
        noise_std: float = 0.1,
        max_ep_len: int = 1000,
        start_update: int = 1000,
        update_interval: int = 50,
        layers: Tuple = (32, 32),
        seed: Optional[int] = None,
        render: bool = False,
        device: Union[torch.device, str] = "cpu",
    ):

        self.network_type = network_type
        self.env = env
        self.gamma = gamma
        self.replay_size = replay_size
        self.batch_size = batch_size
        self.lr_p = lr_p
        self.lr_q = lr_q
        self.polyak = polyak
        self.epochs = epochs
        self.start_steps = start_steps
        self.steps_per_epoch = steps_per_epoch
        self.noise = noise
        self.noise_std = noise_std
        self.max_ep_len = max_ep_len
        self.start_update = start_update
        self.update_interval = update_interval
        self.layers = layers
        self.seed = seed
        self.render = render

        # Assign device
        if "cuda" in device and torch.cuda.is_available():
            self.device = torch.device(device)
        else:
            self.device = torch.device("cpu")

        # Assign seed
        if seed is not None:
            set_seeds(seed, self.env)

        # Setup tensorboard writer
        self.writer = None

        self.empty_logs()
        self.create_model()

    def create_model(self) -> None:
        """
        Initialize the model
        Initializes optimizer and replay buffers as well.
        """
        state_dim, action_dim, discrete, _ = get_env_properties(self.env)
        if discrete:
            raise Exception(
                "Discrete Environments not supported for {}.".format(
                    __class__.__name__))
        if self.noise is not None:
            self.noise = self.noise(np.zeros_like(action_dim),
                                    self.noise_std * np.ones_like(action_dim))

        self.ac = get_model("ac", self.network_type)(state_dim, action_dim,
                                                     self.layers, "Qsa",
                                                     False).to(self.device)

        self.ac_target = deepcopy(self.ac).to(self.device)

        # freeze target network params
        for param in self.ac_target.parameters():
            param.requires_grad = False

        self.replay_buffer = ReplayBuffer(self.replay_size, self.env)
        self.optimizer_policy = opt.Adam(self.ac.actor.parameters(),
                                         lr=self.lr_p)
        self.optimizer_q = opt.Adam(self.ac.critic.parameters(), lr=self.lr_q)

    def update_params_before_select_action(self, timestep: int) -> None:
        """
        Update any parameters before selecting action like epsilon for decaying epsilon greedy

        :param timestep: Timestep in the training process
        :type timestep: int
        """
        pass

    def select_action(self,
                      state: np.ndarray,
                      deterministic: bool = False) -> np.ndarray:
        """
        Selection of action

        :param state: Observation state
        :param deterministic: Action selection type
        :type state: int, float, ...
        :type deterministic: bool
        :returns: Action based on the state (with exploration noise, if configured)
        :rtype: int, float, ...
        """
        with torch.no_grad():
            action, _ = self.ac.get_action(
                torch.as_tensor(state, dtype=torch.float32).to(self.device),
                deterministic=deterministic,
            )
            action = action.detach().cpu().numpy()

        # add noise to output from policy network
        if self.noise is not None:
            action += self.noise()

        return np.clip(action, self.env.action_space.low[0],
                       self.env.action_space.high[0])

    def get_q_loss(
        self,
        state: np.ndarray,
        action: np.ndarray,
        reward: float,
        next_state: np.ndarray,
        done: bool,
    ) -> torch.Tensor:
        """
        Computes loss for Q-Network

        :param state: environment observation
        :param action: agent action
        :param reward: environment reward
        :param next_state: environment next observation
        :param done: if episode is over
        :type state: int, float, ...
        :type action: float
        :type reward: float
        :type next_state: int, float, ...
        :type done: bool
        :returns: the Q loss value
        :rtype: float
        """
        quality = self.ac.critic.get_value(torch.cat([state, action], dim=-1))

        with torch.no_grad():
            q_pi_target = self.ac_target.get_value(
                torch.cat([
                    next_state,
                    self.ac_target.get_action(next_state, True)[0]
                ],
                          dim=-1))
            target = reward + self.gamma * (1 - done) * q_pi_target

        value_loss = F.mse_loss(quality, target)
        self.logs["value_loss"].append(value_loss.item())
        return value_loss

    def get_p_loss(self, state: np.ndarray) -> torch.Tensor:
        """
        Computes policy loss

        :param state: Environment observation
        :type state: int, float, ...
        :returns: Policy loss
        :rtype: float
        """
        q_pi = self.ac.get_value(
            torch.cat([state, self.ac.get_action(state, True)[0]], dim=-1))

        policy_loss = torch.mean(q_pi)
        self.logs["policy_loss"].append(policy_loss.item())

        return -policy_loss

    def update_params(self, update_interval: int) -> None:
        """
        Takes gradient steps for the optimizers.

        :param update_interval: Number of gradient steps to take
        :type update_interval: int
        """
        for timestep in range(update_interval):
            batch = self.replay_buffer.sample(self.batch_size)
            state, action, reward, next_state, done = (x.to(self.device)
                                                       for x in batch)

            self.optimizer_q.zero_grad()
            loss_q = self.get_q_loss(state, action, reward, next_state, done)
            loss_q.backward()
            self.optimizer_q.step()

            # freeze critic params for policy update
            for param in self.ac.critic.parameters():
                param.requires_grad = False

            self.optimizer_policy.zero_grad()
            loss_p = self.get_p_loss(state)
            loss_p.backward()
            self.optimizer_policy.step()

            # unfreeze critic params
            for param in self.ac.critic.parameters():
                param.requires_grad = True

            # update target network
            with torch.no_grad():
                for param, param_target in zip(self.ac.parameters(),
                                               self.ac_target.parameters()):
                    param_target.data.mul_(self.polyak)
                    param_target.data.add_((1 - self.polyak) * param.data)

    def learn(self):  # pragma: no cover
        state, episode_reward, episode_len, episode = (
            self.env.reset(),
            np.zeros(self.env.n_envs),
            np.zeros(self.env.n_envs),
            np.zeros(self.env.n_envs),
        )
        total_steps = self.steps_per_epoch * self.epochs * self.env.n_envs

        if self.noise is not None:
            self.noise.reset()

        for timestep in range(0, total_steps, self.env.n_envs):
            # execute single transition
            if timestep > self.start_steps:
                action = self.select_action(state)
            else:
                action = self.env.sample()

            next_state, reward, done, _ = self.env.step(action)
            if self.render:
                self.env.render()
            episode_reward += reward
            episode_len += 1

            # don't set done to True if max_ep_len is reached
            done = [
                False if ep_len == self.max_ep_len else d
                for ep_len, d in zip(episode_len, done)
            ]

            self.replay_buffer.extend(
                zip(state, action, reward, next_state, done))

            state = next_state

            if np.any(done) or np.any(episode_len == self.max_ep_len):

                if self.noise is not None:
                    self.noise.reset()

                if sum(episode) % 20 == 0:
                    print("Ep: {}, reward: {}, t: {}".format(
                        sum(episode), np.mean(episode_reward), timestep))

                for i, di in enumerate(done):
                    if di:
                        episode_reward[i] = 0
                        episode_len[i] = 0
                        episode += 1

            # update params
            if timestep >= self.start_update and timestep % self.update_interval == 0:
                self.update_params(self.update_interval)

        self.env.close()

    def get_hyperparams(self) -> Dict[str, Any]:
        hyperparams = {
            "network_type": self.network_type,
            "gamma": self.gamma,
            "batch_size": self.batch_size,
            "replay_size": self.replay_size,
            "polyak": self.polyak,
            "noise_std": self.noise_std,
            "lr_policy": self.lr_p,
            "lr_value": self.lr_q,
            "weights": self.ac.state_dict(),
        }

        return hyperparams

    def load_weights(self, weights) -> None:
        """
        Load weights for the agent from pretrained model
        """
        self.ac.load_state_dict(weights["weights"])

    def get_logging_params(self) -> Dict[str, Any]:
        """
        :returns: Logging parameters for monitoring training
        :rtype: dict
        """
        logs = {
            "policy_loss": safe_mean(self.logs["policy_loss"]),
            "value_loss": safe_mean(self.logs["value_loss"]),
        }

        self.empty_logs()

        return logs

    def empty_logs(self):
        """
        Empties logs
        """
        self.logs = {}
        self.logs["policy_loss"] = []
        self.logs["value_loss"] = []
Example #15
class DQN:
    """
    Deep Q Networks

    Paper (DQN) https://arxiv.org/pdf/1312.5602.pdf

    Paper (Double DQN) https://arxiv.org/abs/1509.06461

    :param network_type: The deep neural network layer types ['mlp', 'cnn']
    :param env: The environment to learn from
    :param double_dqn: For training Double DQN
    :param dueling_dqn:  For training Dueling DQN
    :param noisy_dqn: For using Noisy Q
    :param categorical_dqn: For using Distributional DQN
    :param prioritized_replay: For using a prioritized buffer
    :param epochs: Number of epochs
    :param max_iterations_per_epoch: Number of iterations per epoch
    :param max_ep_len: Maximum steps per episode
    :param gamma: discount factor
    :param lr: learning rate for the optimizer
    :param batch_size: Update batch size
    :param replay_size: Replay memory size
    :param seed: seed for torch and gym
    :param render: if environment is to be rendered
    :param device: device to use for tensor operations; 'cpu' for cpu and 'cuda' for gpu
    :type network_type: string
    :type env: Gym environment
    :type double_dqn: bool
    :type dueling_dqn: bool
    :type noisy_dqn: bool
    :type categorical_dqn: bool
    :type prioritized_replay: bool
    :type epochs: int
    :type max_iterations_per_epoch: int
    :type max_ep_len: int
    :type gamma: float
    :type lr: float
    :type batch_size: int
    :type replay_size: int
    :type seed: int
    :type render: bool
    :type device: string
    """
    def __init__(
        self,
        network_type: str,
        env: Union[gym.Env, VecEnv],
        double_dqn: bool = False,
        dueling_dqn: bool = False,
        noisy_dqn: bool = False,
        categorical_dqn: bool = False,
        prioritized_replay: bool = False,
        epochs: int = 100,
        max_iterations_per_epoch: int = 100,
        max_ep_len: int = 1000,
        gamma: float = 0.99,
        lr: float = 0.001,
        batch_size: int = 32,
        replay_size: int = 100,
        prioritized_replay_alpha: float = 0.6,
        max_epsilon: float = 1.0,
        min_epsilon: float = 0.01,
        epsilon_decay: int = 1000,
        num_atoms: int = 51,
        vmin: int = -10,
        vmax: int = 10,
        seed: Optional[int] = None,
        render: bool = False,
        device: Union[torch.device, str] = "cpu",
    ):
        self.env = env
        self.double_dqn = double_dqn
        self.dueling_dqn = dueling_dqn
        self.noisy_dqn = noisy_dqn
        self.categorical_dqn = categorical_dqn
        self.prioritized_replay = prioritized_replay
        self.max_epochs = epochs
        self.max_iterations_per_epoch = max_iterations_per_epoch
        self.max_ep_len = max_ep_len
        self.replay_size = replay_size
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.lr = lr
        self.gamma = gamma
        self.batch_size = batch_size
        self.num_atoms = num_atoms
        self.Vmin = vmin
        self.Vmax = vmax
        self.render = render
        self.reward_hist = []
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.epsilon_decay = epsilon_decay
        self.network_type = network_type

        # Assign device
        if "cuda" in device and torch.cuda.is_available():
            self.device = torch.device(device)
        else:
            self.device = torch.device("cpu")

        # Assign seed
        if seed is not None:
            set_seeds(seed, self.env)

        # Setup tensorboard writer
        self.writer = None

        self.empty_logs()
        self.create_model()

    def create_model(self) -> None:
        """
        Initialize the model and target model for various variants of DQN.
        Initializes optimizer and replay buffers as well.
        """
        state_dim, action_dim, _, _ = get_env_properties(self.env)
        if self.network_type == "mlp":
            if self.dueling_dqn:
                self.model = DuelingDQNValueMlp(state_dim, action_dim)
            elif self.categorical_dqn:
                self.model = CategoricalDQNValue(state_dim, action_dim,
                                                 self.num_atoms)
            elif self.noisy_dqn:
                self.model = NoisyDQNValue(state_dim, action_dim)
            else:
                self.model = get_model("v",
                                       self.network_type)(state_dim,
                                                          action_dim, "Qs")

        elif self.network_type == "cnn":
            self.framestack = self.env.framestack

            if self.dueling_dqn:
                self.model = DuelingDQNValueCNN(action_dim, self.framestack)
            elif self.noisy_dqn:
                self.model = NoisyDQNValueCNN(action_dim, self.framestack)
            elif self.categorical_dqn:
                self.model = CategoricalDQNValueCNN(action_dim, self.num_atoms,
                                                    self.framestack)
            else:
                self.model = get_model("v", self.network_type)(action_dim,
                                                               self.framestack,
                                                               "Qs")

        self.target_model = deepcopy(self.model)

        if self.prioritized_replay:
            self.replay_buffer = PrioritizedBuffer(
                self.replay_size, self.prioritized_replay_alpha)
        else:
            self.replay_buffer = ReplayBuffer(self.replay_size, self.env)

        self.optimizer = opt.Adam(self.model.parameters(), lr=self.lr)

    def update_target_model(self) -> None:
        """
        Copy the target model weights with the model
        """
        self.target_model.load_state_dict(self.model.state_dict())

    def update_params_before_select_action(self, timestep: int) -> None:
        """
        Update any parameters before selecting action like epsilon for decaying epsilon greedy

        :param timestep: Timestep in the training process
        :type timestep: int
        """
        self.timestep = timestep
        self.epsilon = self.calculate_epsilon_by_frame()

    def select_action(self,
                      state: np.ndarray,
                      deterministic: bool = False) -> np.ndarray:
        """
        Epsilon Greedy selection of action

        :param state: Observation state
        :param deterministic: Whether greedy action should be taken always
        :type state: int, float, ...
        :type deterministic: bool
        :returns: Action based on the state and epsilon value
        :rtype: int, float, ...
        """

        if not deterministic:
            if np.random.rand() < self.epsilon:
                return np.asarray(self.env.sample())

        if self.categorical_dqn:
            state = Variable(torch.FloatTensor(state))
            dist = self.model(state).data.cpu()
            dist = dist * torch.linspace(self.Vmin, self.Vmax, self.num_atoms)
            action = dist.sum(2).max(1)[1].numpy()  # [0]
        else:
            state = Variable(torch.FloatTensor(state))
            q_value = self.model(state)
            action = np.argmax(q_value.detach().numpy(), axis=-1)

        return action

    def get_td_loss(self) -> torch.Tensor:
        """
        Computes loss for various variants

        :returns: the TD loss depending upon the variant
        :rtype: float
        """
        if self.prioritized_replay:
            (
                state,
                action,
                reward,
                next_state,
                done,
                indices,
                weights,
            ) = self.replay_buffer.sample(self.batch_size)
            weights = Variable(torch.FloatTensor(weights))
        else:
            (state, action, reward, next_state,
             done) = self.replay_buffer.sample(self.batch_size)

        state = state.reshape(self.batch_size * self.env.n_envs,
                              *self.env.obs_shape)
        action = action.reshape(self.batch_size * self.env.n_envs,
                                *self.env.action_shape)
        reward = reward.reshape(-1, 1)
        done = done.reshape(-1, 1)
        next_state = next_state.reshape(self.batch_size * self.env.n_envs,
                                        *self.env.obs_shape)

        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action.long()))
        reward = Variable(torch.FloatTensor(reward))
        done = Variable(torch.FloatTensor(done))

        if self.network_type == "cnn":
            state = state.view(
                -1,
                self.framestack,
                self.env.screen_size,
                self.env.screen_size,
            )
            next_state = next_state.view(
                -1,
                self.framestack,
                self.env.screen_size,
                self.env.screen_size,
            )

        if self.categorical_dqn:
            projection_dist = self.projection_distribution(
                next_state, reward, done)
            dist = self.model(state)
            action = action.unsqueeze(1).expand(
                self.batch_size * self.env.n_envs, 1, self.num_atoms)
            dist = dist.gather(1, action).squeeze(1)
            dist.data.clamp_(0.01, 0.99)

        elif self.double_dqn:
            q_values = self.model(state)
            q_value = q_values.gather(1, action).squeeze(1)

            q_next_state_values = self.model(next_state)
            action_next = q_next_state_values.max(1)[1]

            q_target_next_state_values = self.target_model(next_state)
            q_target_s_a_prime = q_target_next_state_values.gather(
                1, action_next.unsqueeze(1)).squeeze(1)
            expected_q_value = reward + self.gamma * q_target_s_a_prime.reshape(
                -1, 1) * (1 - done)
        else:
            q_values = self.model(state)
            q_value = q_values.gather(1, action).squeeze(1)

            q_next_state_values = self.target_model(next_state)
            q_s_a_prime = q_next_state_values.max(1)[0]
            expected_q_value = reward + self.gamma * q_s_a_prime.reshape(
                -1, 1) * (1 - done)

        if self.categorical_dqn:
            loss = -(Variable(projection_dist) * dist.log()).sum(1).mean()
        else:
            if self.prioritized_replay:
                loss = (q_value - expected_q_value.detach()).pow(2) * weights
                priorities = loss + 1e-5
                loss = loss.mean()
                self.replay_buffer.update_priorities(
                    indices,
                    priorities.data.cpu().numpy())
            else:
                loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.logs["value_loss"].append(loss.item())

        return loss

    def update_params(self, update_interval: int) -> None:
        """
        Takes the optimizer step. This internally calls get_td_loss(),
        so there is no need to call that function explicitly.
        """
        for timestep in range(update_interval):
            loss = self.get_td_loss()
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            if self.noisy_dqn or self.categorical_dqn:
                self.model.reset_noise()
                self.target_model.reset_noise()

            if timestep % update_interval == 0:
                self.update_target_model()

    def calculate_epsilon_by_frame(self) -> float:
        """
        A helper function to calculate the value of epsilon after every step.

        :returns: epsilon value for the step
        :rtype: float
        """
        return self.min_epsilon + (
            self.max_epsilon - self.min_epsilon) * np.exp(
                -1.0 * self.timestep / self.epsilon_decay)

    def projection_distribution(self, next_state: np.ndarray,
                                rewards: List[float], dones: List[bool]):
        """
        A helper function used for categorical DQN

        :param next_state: next observation state
        :param rewards: rewards collected
        :param dones: dones
        :type next_state: int, float, ...
        :type rewards: list
        :type dones: list
        :returns: projection distribution
        :rtype: float
        """
        batch_size = next_state.size(0)

        delta_z = float(self.Vmax - self.Vmin) / (self.num_atoms - 1)
        support = torch.linspace(self.Vmin, self.Vmax, self.num_atoms)

        next_dist = self.target_model(next_state).data.cpu() * support
        next_action = next_dist.sum(2).max(1)[1]
        next_action = (next_action.unsqueeze(1).unsqueeze(1).expand(
            next_dist.size(0), 1, next_dist.size(2)))
        next_dist = next_dist.gather(1, next_action).squeeze(1)

        rewards = rewards.expand_as(next_dist)
        dones = dones.expand_as(next_dist)
        support = support.unsqueeze(0).expand_as(next_dist)

        tz = rewards + (1 - dones) * self.gamma * support
        tz = tz.clamp(min=self.Vmin, max=self.Vmax)
        bz = (tz - self.Vmin) / delta_z
        lower = bz.floor().long()
        upper = bz.ceil().long()

        offset = (torch.linspace(0, (batch_size - 1) * self.num_atoms,
                                 batch_size).long().unsqueeze(1).expand(
                                     self.batch_size * self.env.n_envs,
                                     self.num_atoms))

        projection_dist = torch.zeros(next_dist.size())
        projection_dist.view(-1).index_add_(0, (lower + offset).view(-1),
                                            (next_dist *
                                             (upper.float() - bz)).view(-1))
        projection_dist.view(-1).index_add_(0, (upper + offset).view(-1),
                                            (next_dist *
                                             (bz - lower.float())).view(-1))

        return projection_dist

    def learn(self) -> None:  # pragma: no cover
        total_steps = self.max_epochs * self.max_iterations_per_epoch
        state, episode_reward, episode, episode_len = self.env.reset(), 0, 0, 0

        if self.double_dqn:
            self.update_target_model()

        for frame_idx in range(1, total_steps + 1):
            self.timestep = frame_idx
            self.epsilon = self.calculate_epsilon_by_frame()

            action = self.select_action(state)

            next_state, reward, done, _ = self.env.step(action)

            if self.render:
                self.env.render()

            self.replay_buffer.push((state, action, reward, next_state, done))
            state = next_state

            episode_reward += reward
            episode_len += 1

            done = False if episode_len == self.max_ep_len else done

            if done or (episode_len == self.max_ep_len):
                if episode % 20 == 0:
                    print("Episode: {}, Reward: {}, Frame Index: {}".format(
                        episode, episode_reward, frame_idx))

                self.reward_hist.append(episode_reward)
                state, episode_reward, episode_len = self.env.reset(), 0, 0
                episode += 1

            # start_update and update_interval are assumed to be set on the agent
            if frame_idx >= self.start_update and frame_idx % self.update_interval == 0:
                self.update_params(self.update_interval)

            if frame_idx % 100 == 0:
                self.update_target_model()

        self.env.close()

    def get_hyperparams(self) -> Dict[str, Any]:
        hyperparams = {
            "gamma": self.gamma,
            "batch_size": self.batch_size,
            "lr": self.lr,
            "replay_size": self.replay_size,
            "double_dqn": self.double_dqn,
            "dueling_dqn": self.dueling_dqn,
            "noisy_dqn": self.noisy_dqn,
            "categorical_dqn": self.categorical_dqn,
            "prioritized_replay": self.prioritized_replay,
            "prioritized_replay_alpha": self.prioritized_replay_alpha,
            "weights": self.model.state_dict(),
            "timestep": self.timestep,
        }

        return hyperparams

    def load_weights(self, weights) -> None:
        """
        Load weights for the agent from pretrained model
        """
        self.model.load_state_dict(weights["weights"])

    def get_logging_params(self) -> Dict[str, Any]:
        """
        :returns: Logging parameters for monitoring training
        :rtype: dict
        """
        logs = {
            "value_loss": safe_mean(self.logs["value_loss"]),
        }

        self.empty_logs()

        return logs

    def empty_logs(self):
        """
        Empties logs
        """
        self.logs = {}
        self.logs["value_loss"] = []