예제 #1
0
    def __init__(self, state_dim, action_dim, max_action, args):
        """
        Build the mean actor, its sigma vectors, the critic and both optimizers.

        Args:
            state_dim: forwarded unchanged to the Actor and Critic constructors.
            action_dim: forwarded unchanged to the Actor and Critic constructors.
            max_action: forwarded unchanged to the Actor and Critic constructors.
            args: configuration object; read for sigma_init, actor_lr,
                critic_lr, tau, n_steps, discount, pop_size, batch_size,
                noise_clip, policy_freq, policy_noise and reward_scale, and
                also handed to every Actor / Critic.
        """
        # Environment description
        self.state_dim, self.action_dim = state_dim, action_dim
        self.max_action = max_action

        # Hyperparameters copied verbatim from args
        for name in ("tau", "n_steps", "discount", "pop_size", "batch_size",
                     "noise_clip", "policy_freq", "policy_noise",
                     "reward_scale"):
            setattr(self, name, getattr(args, name))

        # weights[k] = discount**k for k in 0 .. n_steps - 1
        self.weights = FloatTensor(
            [self.discount**k for k in range(self.n_steps)])

        # Mean actor and its target copy, started from identical weights
        self.mu = Actor(state_dim, action_dim, max_action, args)
        self.mu_t = Actor(state_dim, action_dim, max_action, args)
        self.mu_t.load_state_dict(self.mu.state_dict())

        # One log-sigma entry per actor parameter, all set to log(sigma_init)
        def initial_log_sigma():
            return FloatTensor(
                np.log(args.sigma_init) * np.ones(self.mu.get_size()))

        self.log_sigma = initial_log_sigma()
        self.log_sigma_t = initial_log_sigma()

        # A single Adam instance drives both the actor weights and log_sigma
        self.opt = torch.optim.Adam(
            [{"params": self.mu.parameters()}, {"params": self.log_sigma}],
            lr=args.actor_lr)

        # Critic, its target copy and its own optimizer
        self.critic = Critic(state_dim, action_dim, max_action, args)
        self.critic_t = Critic(state_dim, action_dim, max_action, args)
        self.critic_t.load_state_dict(self.critic.state_dict())
        self.critic_opt = torch.optim.Adam(self.critic.parameters(),
                                           lr=args.critic_lr)

        self.n_actor_params = self.mu.get_size()

        # cuda
        if USE_CUDA:
            # NOTE(review): for the two sigma tensors .cuda() returns a tensor
            # rather than moving in place, and the result is dropped here, so
            # self.log_sigma / self.log_sigma_t stay as FloatTensor created
            # them -- confirm FloatTensor already allocates on the GPU.
            for obj in (self.mu, self.mu_t, self.log_sigma, self.log_sigma_t,
                        self.critic, self.critic_t):
                obj.cuda()
예제 #2
0
    def __init__(self, state_dim, action_dim, max_action, args):
        """
        Build the actor / critic pairs, their optimizers and hyperparameters.

        Args:
            state_dim: forwarded unchanged to the Actor and Critic constructors.
            action_dim: forwarded unchanged to the Actor and Critic constructors.
            max_action: forwarded unchanged to the Actor and Critic constructors.
            args: configuration object; read for actor_lr, critic_lr, tau,
                n_steps, discount, batch_size, noise_clip, policy_freq,
                policy_noise, reward_scale and use_cuda, and also handed to
                every Actor / Critic.
        """
        # Environment description
        self.state_dim, self.action_dim = state_dim, action_dim
        self.max_action = max_action

        # Hyperparameters copied verbatim from args
        for name in ("tau", "n_steps", "discount", "batch_size", "noise_clip",
                     "policy_freq", "policy_noise", "reward_scale"):
            setattr(self, name, getattr(args, name))

        # weights[k] = discount**k for k in 0 .. n_steps - 1
        self.weights = FloatTensor(
            [self.discount**k for k in range(self.n_steps)])

        # Actor and its target copy, started from identical weights
        self.actor = Actor(state_dim, action_dim, max_action, args)
        self.actor_t = Actor(state_dim, action_dim, max_action, args)
        self.actor_t.load_state_dict(self.actor.state_dict())
        self.actor_opt = torch.optim.Adam(self.actor.parameters(),
                                          lr=args.actor_lr)

        # Critic and its target copy, started from identical weights
        self.critic = Critic(state_dim, action_dim, max_action, args)
        self.critic_t = Critic(state_dim, action_dim, max_action, args)
        self.critic_t.load_state_dict(self.critic.state_dict())
        self.critic_opt = torch.optim.Adam(self.critic.parameters(),
                                           lr=args.critic_lr)

        # Move all four networks to the GPU when requested
        if args.use_cuda:
            for net in (self.actor, self.actor_t, self.critic, self.critic_t):
                net.cuda()
예제 #3
0
class D2TD3(object):
    """
    Double-Smoothed Twin Delayed Deep Deterministic Policy Gradient Algorithm

    Holds a mean actor ``mu`` and a ``log_sigma`` vector with one entry per
    actor parameter, both updated by the same Adam optimizer, together with a
    critic that returns two Q estimates. ``mu_t``, ``log_sigma_t`` and
    ``critic_t`` are the soft-updated target counterparts.
    """

    def __init__(self, state_dim, action_dim, max_action, args):
        """
        Build networks, sigma vectors, optimizers and hyperparameters.

        Args:
            state_dim: forwarded unchanged to the Actor and Critic constructors.
            action_dim: forwarded unchanged to the Actor and Critic constructors.
            max_action: forwarded unchanged to the Actor and Critic constructors.
            args: configuration object; read for sigma_init, actor_lr,
                critic_lr, tau, n_steps, discount, pop_size, batch_size,
                noise_clip, policy_freq, policy_noise and reward_scale.
        """

        # Mu stuff
        self.mu = Actor(state_dim, action_dim, max_action, args)
        self.mu_t = Actor(state_dim, action_dim, max_action, args)
        self.mu_t.load_state_dict(self.mu.state_dict())

        # Sigma stuff: one entry per actor parameter, set to log(sigma_init)
        self.log_sigma = FloatTensor(
            np.log(args.sigma_init) * np.ones(self.mu.get_size()))
        self.log_sigma_t = FloatTensor(
            np.log(args.sigma_init) * np.ones(self.mu.get_size()))
        if USE_CUDA:
            # Tensor.cuda() returns the moved tensor instead of moving it in
            # place, so the result must be kept. This is done before the
            # optimizer is created so that it registers the tensor that is
            # actually trained.
            self.log_sigma = self.log_sigma.cuda()
            self.log_sigma_t = self.log_sigma_t.cuda()

        # Optimizer shared by the actor weights and log_sigma
        self.opt = torch.optim.Adam(self.mu.parameters(), lr=args.actor_lr)
        self.opt.add_param_group({"params": self.log_sigma})

        # Critic stuff
        self.critic = Critic(state_dim, action_dim, max_action, args)
        self.critic_t = Critic(state_dim, action_dim, max_action, args)
        self.critic_t.load_state_dict(self.critic.state_dict())
        self.critic_opt = torch.optim.Adam(self.critic.parameters(),
                                           lr=args.critic_lr)

        # Env stuff
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action

        # Hyperparams
        self.tau = args.tau
        self.n_steps = args.n_steps
        self.discount = args.discount
        self.pop_size = args.pop_size
        self.batch_size = args.batch_size
        self.noise_clip = args.noise_clip
        self.policy_freq = args.policy_freq
        self.policy_noise = args.policy_noise
        self.reward_scale = args.reward_scale
        self.n_actor_params = self.mu.get_size()
        # weights[i] = discount**i for i in 0 .. n_steps - 1
        self.weights = FloatTensor(
            [self.discount**i for i in range(self.n_steps)])

        # cuda (modules are moved in place; the sigma tensors were moved above)
        if USE_CUDA:
            self.mu.cuda()
            self.mu_t.cuda()
            self.critic.cuda()
            self.critic_t.cuda()

    @staticmethod
    def _soft_update(net, target_net, tau):
        """Blend net's parameters into target_net in place with rate tau."""
        for param, target_param in zip(net.parameters(),
                                       target_net.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1 - tau) * target_param.data)

    def _update_policy(self, states):
        """Take one optimizer step on mu and log_sigma using a noisy policy."""
        # Draw a random policy pi = mu + noise * sigma in parameter space
        mu = self.mu.get_params()
        log_sigma = self.log_sigma.data.cpu().numpy()
        noise = np.random.randn(self.n_actor_params)
        sigma = np.exp(log_sigma)
        pi = mu + noise * sigma

        # Clear gradients left by the previous policy step; otherwise
        # backward() would add the new gradient on top of the old one.
        self.opt.zero_grad()

        # Gradient of the critic's first Q estimate w.r.t. the noisy policy
        self.mu.set_params(pi)
        pi_loss = -self.critic(states, self.mu(states))[0].mean()
        pi_loss.backward()
        pi_grad = self.mu.get_grads()

        # Restore the mean policy and hand it the noisy-policy gradient
        self.mu.set_params(mu)
        self.opt.zero_grad()
        self.mu.set_grads(pi_grad)
        # Chain rule: d pi / d log_sigma = noise * sigma. The gradient is
        # created with log_sigma's own dtype and device so that assigning it
        # to .grad is always valid.
        self.log_sigma.grad = torch.as_tensor(pi_grad * noise * sigma,
                                              dtype=self.log_sigma.dtype,
                                              device=self.log_sigma.device)
        self.opt.step()

        # Update the frozen mu
        self._soft_update(self.mu, self.mu_t, self.tau)

        # Update the frozen sigma
        self.log_sigma_t = self.tau * self.log_sigma + \
            (1 - self.tau) * self.log_sigma_t

    def train(self, memory, n_iter):
        """
        Trains the model for n_iter steps

        Args:
            memory: replay buffer; ``memory.sample(batch_size)`` must return
                the 7-tuple (states, actions, n_states, rewards, steps,
                dones, stops). rewards is presumably (batch, n_steps) so it
                can be multiplied by self.weights -- verify against the
                buffer implementation.
            n_iter: number of critic updates; the policy is updated on every
                iteration whose index is a multiple of policy_freq.
        """
        mse = nn.MSELoss()

        for it in range(n_iter):

            # Sample replay buffer
            states, actions, n_states, rewards, steps, dones, stops = memory.sample(
                self.batch_size)
            # Discount-weighted sum of the scaled rewards along dim 1
            rewards = self.reward_scale * rewards * self.weights
            rewards = rewards.sum(dim=1, keepdim=True)

            # Q target = reward + discount * min_i(Qi(next_state, pi(next_state)))
            with torch.no_grad():
                n_actions = self.mu_t(n_states)
                target_Q1, target_Q2 = self.critic_t(n_states, n_actions)
                target_Q = torch.min(target_Q1, target_Q2)
                target_Q = target_Q * self.discount**(steps + 1)
                target_Q = rewards + (1 - stops) * target_Q

            # Get current Q estimates
            current_Q1, current_Q2 = self.critic(states, actions)

            # Compute critic loss
            critic_loss = mse(current_Q1, target_Q) + mse(current_Q2, target_Q)

            # Optimize the critic
            self.critic_opt.zero_grad()
            critic_loss.backward()
            self.critic_opt.step()

            # Delayed policy updates
            if it % self.policy_freq == 0:
                self._update_policy(states)

            # Update the frozen critic models
            self._soft_update(self.critic, self.critic_t, self.tau)

    def save(self, directory):
        """
        Save the model in given folder
        """
        self.mu.save_model(directory, "actor")
        self.critic.save_model(directory, "critic")

    def load(self, directory):
        """
        Load model from folder
        """
        self.mu.load_model(directory, "actor")
        self.critic.load_model(directory, "critic")
예제 #4
0
class STD3(object):
    """
    Smoothed Twin Delayed Deep Deterministic Policy Gradient Algorithm

    Trains a critic that returns two Q estimates and an actor whose gradient
    is averaged over several randomly perturbed copies of its parameters.
    ``actor_t`` and ``critic_t`` are the soft-updated target networks.
    """

    # Number of perturbed copies of the actor whose gradients are averaged in
    # each delayed policy update.
    n_smoothing_samples = 5

    def __init__(self, state_dim, action_dim, max_action, args):
        """
        Build networks, optimizers and hyperparameters.

        Args:
            state_dim: forwarded unchanged to the Actor and Critic constructors.
            action_dim: forwarded unchanged to the Actor and Critic constructors.
            max_action: forwarded unchanged to the constructors; also the
                bound used to clamp actions in train().
            args: configuration object; read for actor_lr, critic_lr, tau,
                n_steps, discount, batch_size, noise_clip, policy_freq,
                policy_noise, reward_scale and use_cuda.
        """

        # Actor stuff
        self.actor = Actor(state_dim, action_dim, max_action, args)
        self.actor_t = Actor(state_dim, action_dim, max_action, args)
        self.actor_t.load_state_dict(self.actor.state_dict())
        self.actor_opt = torch.optim.Adam(self.actor.parameters(),
                                          lr=args.actor_lr)

        # Critic stuff
        self.critic = Critic(state_dim, action_dim, max_action, args)
        self.critic_t = Critic(state_dim, action_dim, max_action, args)
        self.critic_t.load_state_dict(self.critic.state_dict())
        self.critic_opt = torch.optim.Adam(self.critic.parameters(),
                                           lr=args.critic_lr)

        # Env stuff
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action

        # Hyperparams
        self.tau = args.tau
        self.n_steps = args.n_steps
        self.discount = args.discount
        self.batch_size = args.batch_size
        self.noise_clip = args.noise_clip
        self.policy_freq = args.policy_freq
        self.policy_noise = args.policy_noise
        self.reward_scale = args.reward_scale
        # weights[i] = discount**i for i in 0 .. n_steps - 1
        self.weights = FloatTensor(
            [self.discount**i for i in range(self.n_steps)])

        # cuda
        if args.use_cuda:
            self.actor.cuda()
            self.actor_t.cuda()
            self.critic.cuda()
            self.critic_t.cuda()

    def action(self, state):
        """
        Returns action given state

        The state is reshaped to a single row before the forward pass and the
        actor output is returned as a flat numpy array.
        """
        state = FloatTensor(state.reshape(1, -1))
        return self.actor(state).cpu().data.numpy().flatten()

    @staticmethod
    def _soft_update(net, target_net, tau):
        """Blend net's parameters into target_net in place with rate tau."""
        for param, target_param in zip(net.parameters(),
                                       target_net.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1 - tau) * target_param.data)

    def _update_actor(self, states):
        """Step the actor with a gradient averaged over perturbed parameters."""
        actor_params = self.actor.get_params()
        n_params = self.actor.get_size()
        grads = np.zeros(n_params)

        for _ in range(self.n_smoothing_samples):

            noise = np.random.normal(0, self.policy_noise, size=n_params)
            # NOTE(review): noise is drawn with std policy_noise and is then
            # multiplied by policy_noise again, so the perturbation has std
            # policy_noise**2 -- confirm this double scaling is intended.
            self.actor.set_params(actor_params + noise * self.policy_noise)

            n_actions = self.actor(states).clamp(-self.max_action,
                                                 self.max_action)

            # Clear gradients so each sample contributes exactly once
            self.actor_opt.zero_grad()
            actor_loss = -self.critic(states, n_actions)[0].mean()
            actor_loss.backward()

            grads += self.actor.get_grads()

        # Restore the unperturbed parameters and apply the averaged gradient
        self.actor_opt.zero_grad()
        self.actor.set_params(actor_params)
        self.actor.set_grads(grads / self.n_smoothing_samples)
        self.actor_opt.step()

        # Update the frozen actor models
        self._soft_update(self.actor, self.actor_t, self.tau)

    def train(self, memory, n_iter):
        """
        Trains the model for n_iter steps

        Args:
            memory: replay buffer; ``memory.sample(batch_size)`` must return
                the 7-tuple (states, actions, n_states, rewards, steps,
                dones, stops). rewards is presumably (batch, n_steps) so it
                can be multiplied by self.weights -- verify against the
                buffer implementation.
            n_iter: number of critic updates; the actor is updated on every
                iteration whose index is a multiple of policy_freq.
        """
        mse = nn.MSELoss()

        for it in range(n_iter):

            # Sample replay buffer
            states, actions, n_states, rewards, steps, dones, stops = memory.sample(
                self.batch_size)
            # Discount-weighted sum of the scaled rewards along dim 1
            rewards = self.reward_scale * rewards * self.weights
            rewards = rewards.sum(dim=1, keepdim=True)

            # Q target = reward + discount * min_i(Qi(next_state, pi(next_state)))
            with torch.no_grad():
                # No action-space noise is added to the target action; the
                # smoothing happens in parameter space in _update_actor.
                n_actions = self.actor_t(n_states)
                n_actions = n_actions.clamp(-self.max_action, self.max_action)
                target_Q1, target_Q2 = self.critic_t(n_states, n_actions)
                target_Q = torch.min(target_Q1, target_Q2)
                target_Q = target_Q * self.discount**(steps + 1)
                # rewards is already the summed tensor; use it directly
                target_Q = rewards + (1 - stops) * target_Q

            # Get current Q estimates
            current_Q1, current_Q2 = self.critic(states, actions)

            # Compute critic loss
            critic_loss = mse(current_Q1, target_Q) + mse(current_Q2, target_Q)

            # Optimize the critic
            self.critic_opt.zero_grad()
            critic_loss.backward()
            self.critic_opt.step()

            # Delayed policy updates
            if it % self.policy_freq == 0:
                self._update_actor(states)

            # Update the frozen critic models
            self._soft_update(self.critic, self.critic_t, self.tau)

    def save(self, directory):
        """
        Save the model in given folder
        """
        self.actor.save_model(directory, "actor")
        self.critic.save_model(directory, "critic")

    def load(self, directory):
        """
        Load model from folder
        """
        self.actor.load_model(directory, "actor")
        self.critic.load_model(directory, "critic")