Example #1
class SACAgent:
    def __init__(self,
                 state_dim=None,
                 action_dim=None,
                 hidden_dim=None,
                 discount=0.99,
                 tau=0.005,
                 lr_actor=None,
                 lr_critic=None,
                 batch_size=256,
                 replay_buffer_capacity=1e5,
                 learning_start=None,
                 reward_scaling=1.,
                 seed=0,
                 rbc_controller=None,
                 safe_exploration=None,
                 automatic_entropy_tuning=False,
                 alpha=1):

        if hidden_dim is None:
            hidden_dim = [256, 256]
        self.learning_start = learning_start
        self.discount = discount
        self.batch_size = batch_size
        self.tau = tau
        self.reward_scaling = reward_scaling
        t.manual_seed(seed)
        np.random.seed(seed)
        self.action_list_ = []
        self.action_list2_ = []
        self.hidden_dim = hidden_dim
        self.rbc_controller = rbc_controller
        self.safe_exploration = safe_exploration
        self.automatic_entropy_tuning = automatic_entropy_tuning

        self.time_step = 0

        # Loss: MSE between predicted and target Q-values (see learn())

        # device
        self.device = t.device("cuda" if t.cuda.is_available() else "cpu")

        self.memory = ReplayBuffer(input_shape=int(state_dim),
                                   n_actions=1,  # one discrete action index per transition
                                   max_mem_size=int(replay_buffer_capacity))

        # init networks
        self.soft_q_net1 = SoftQNetworkDiscrete(state_dim, action_dim,
                                                hidden_dim).to(self.device)
        self.soft_q_net2 = SoftQNetworkDiscrete(state_dim, action_dim,
                                                hidden_dim).to(self.device)

        self.target_soft_q_net1 = SoftQNetworkDiscrete(
            state_dim, action_dim, hidden_dim).to(self.device)
        self.target_soft_q_net2 = SoftQNetworkDiscrete(
            state_dim, action_dim, hidden_dim).to(self.device)

        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(param.data)

        # Policy
        self.policy_net = PolicyNetworkDiscrete(state_dim, action_dim,
                                                hidden_dim).to(self.device)
        self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(),
                                            lr=lr_critic)
        self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(),
                                            lr=lr_critic)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=lr_actor)

        if self.automatic_entropy_tuning:
            # target entropy: 98% of the maximum entropy of a uniform
            # categorical policy, i.e. 0.98 * ln(action_dim)
            self.target_entropy = -np.log(1.0 / action_dim) * 0.98
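            # e.g. action_dim = 3  ->  target_entropy = 0.98 * ln(3) ~ 1.077 nats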
            self.log_alpha = t.zeros(1, requires_grad=True, device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optimizer = optim.Adam([self.log_alpha],
                                              lr=lr_critic,
                                              eps=1e-4)
        else:
            self.alpha = alpha

    def choose_action(self, simulation_step, electricity_price, storage_soc,
                      observation):

        if simulation_step < self.safe_exploration:
            action = self.rbc_controller.choose_action(
                electricity_price=electricity_price, storage_soc=storage_soc)
            actions = t.tensor([action], dtype=t.float).to(self.device)
            # print(action)
        else:
            state = t.as_tensor(observation, dtype=t.float32,
                                device=self.device).unsqueeze(0)
            actions, _, _ = self.policy_net.sample(state)

        return actions.cpu().detach().numpy()[0]

    def get_actions_probabilities(self, observation):

        state = t.as_tensor(observation, dtype=t.float32,
                            device=self.device).unsqueeze(0)
        _, (actions_probabilities, _), _ = self.policy_net.sample(state)

        return actions_probabilities.cpu().detach().numpy()[0]

    def get_q_values(self, observation):

        state = t.as_tensor(observation, dtype=t.float32,
                            device=self.device).unsqueeze(0)
        q_1 = self.soft_q_net1(state)
        q_2 = self.soft_q_net2(state)

        q_1 = q_1.cpu().detach().numpy()[0]
        q_2 = q_2.cpu().detach().numpy()[0]

        return q_1, q_2

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_ctr < self.batch_size:
            return

        state, action, reward, next_state, done = self.memory.sample_buffer(
            self.batch_size)

        state = t.as_tensor(state, dtype=t.float32, device=self.device)
        next_state = t.as_tensor(next_state, dtype=t.float32,
                                 device=self.device)
        # discrete actions must be int64 so they can index gather() below
        action = t.as_tensor(action, dtype=t.int64, device=self.device)
        reward = t.as_tensor(reward, dtype=t.float32,
                             device=self.device).unsqueeze(1)
        done = t.as_tensor(done, dtype=t.float32,
                           device=self.device).unsqueeze(1)

        with t.no_grad():
            # Update the Q-values. First, sample an action from the categorical
            # policy for the next state, along with its (log-)probabilities.

            new_next_actions, (action_probabilities, log_action_probabilities
                               ), _ = self.policy_net.sample(next_state)

            qf1_next_target = self.target_soft_q_net1(next_state)
            qf2_next_target = self.target_soft_q_net2(next_state)

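            # soft state value under the categorical policy:
            #   V(s') = sum_a pi(a|s') * (min_i Q_target_i(s', a) - alpha * log pi(a|s'))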
            min_qf_next_target = action_probabilities * (
                t.min(qf1_next_target, qf2_next_target) -
                self.alpha * log_action_probabilities)
            min_qf_next_target = min_qf_next_target.sum(dim=1).unsqueeze(-1)

            q_target = reward + (1 - done) * self.discount * min_qf_next_target
            # self.q_tracker.append(q_target.mean())

        # Update Soft Q-Networks
        q1_pred = self.soft_q_net1(state)
        q2_pred = self.soft_q_net2(state)

        q1_pred = q1_pred.gather(1, action.reshape([self.batch_size, 1]))
        q2_pred = q2_pred.gather(1, action.reshape([self.batch_size, 1]))

        q1_loss = f.mse_loss(q1_pred, q_target)
        q2_loss = f.mse_loss(q2_pred, q_target)

        self.soft_q_optimizer1.zero_grad()
        q1_loss.backward()
        self.soft_q_optimizer1.step()

        self.soft_q_optimizer2.zero_grad()
        q2_loss.backward()
        self.soft_q_optimizer2.step()

        # Update Policy
        new_actions, (
            action_probabilities,
            log_action_probabilities), _ = self.policy_net.sample(state)

        min_qf_pi = t.min(self.soft_q_net1(state), self.soft_q_net2(state))

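        # discrete-SAC policy objective:
        #   J_pi = E_s[ sum_a pi(a|s) * (alpha * log pi(a|s) - min_i Q_i(s, a)) ]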
        inside_term = self.alpha * log_action_probabilities - min_qf_pi
        policy_loss = (action_probabilities * inside_term).sum(dim=1).mean()
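        # expected log-probability under the current policy; reused below for
        # the temperature (alpha) update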
        log_action_probabilities = t.sum(log_action_probabilities *
                                         action_probabilities,
                                         dim=1)

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_action_probabilities +
                            self.target_entropy).detach()).mean()
        else:
            alpha_loss = None

        if alpha_loss is not None:
            self.alpha_optimizer.zero_grad()
            alpha_loss.backward()
            self.alpha_optimizer.step()
            self.alpha = self.log_alpha.exp()

        # Soft Updates
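        # Polyak averaging: theta_target <- (1 - tau) * theta_target + tau * theta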
        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)

        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)

    def save_models(self, path):
        print('...saving models...')
        t.save(self.soft_q_net1, path + '\\critic_1.pth')
        t.save(self.soft_q_net2, path + '\\critic_2.pth')
        t.save(self.policy_net, path + '\\actor.pth')

    def load_models(self, path):
        print('...loading models...')
        dev = self.device
        self.soft_q_net1 = t.load(path + '\\critic_1.pth', map_location=dev)
        self.soft_q_net2 = t.load(path + '\\critic_2.pth', map_location=dev)
        self.policy_net = t.load(path + '\\actor.pth', map_location=dev)


class SAC2Agent:
    def __init__(self,
                 observation_space=None,
                 action_space=None,
                 hidden_dim=None,
                 discount=0.99,
                 tau=0.005,
                 lr=None,
                 batch_size=256,
                 replay_buffer_capacity=1e5,
                 start_training=None,
                 exploration_period=None,
                 action_scaling_coef=1.,
                 reward_scaling=1.,
                 update_per_step=1,
                 iterations_as=2,
                 seed=0,
                 deterministic=None,
                 rbc_controller=None,
                 safe_exploration=None):

        if hidden_dim is None:
            hidden_dim = [256, 256]
        self.start_training = start_training
        self.discount = discount
        self.batch_size = batch_size
        self.tau = tau
        self.action_scaling_coef = action_scaling_coef
        self.reward_scaling = reward_scaling
        t.manual_seed(seed)
        np.random.seed(seed)
        self.deterministic = deterministic
        self.update_per_step = update_per_step
        self.iterations_as = iterations_as
        self.exploration_period = exploration_period
        self.action_list_ = []
        self.action_list2_ = []
        self.hidden_dim = hidden_dim
        self.rbc_controller = rbc_controller
        self.safe_exploration = safe_exploration
        self.reset_action_tracker()

        self.reset_reward_tracker()

        self.time_step = 0
        self.action_space = action_space
        self.observation_space = observation_space

        # Optimizers/Loss using the Huber loss
        self.soft_q_criterion = nn.SmoothL1Loss()

        # device
        self.device = t.device("cuda" if t.cuda.is_available() else "cpu")

        state_dim = self.observation_space.shape[0]
        action_dim = self.action_space.shape[0]
        self.alpha = 0.05  # fixed entropy temperature (kept constant; see learn())

        self.memory = ReplayBuffer(input_shape=int(state_dim),
                                   n_actions=int(action_dim),
                                   max_mem_size=int(replay_buffer_capacity))

        # init networks
        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim,
                                        hidden_dim).to(self.device)

        self.target_soft_q_net1 = SoftQNetwork(state_dim, action_dim,
                                               hidden_dim).to(self.device)
        self.target_soft_q_net2 = SoftQNetwork(state_dim, action_dim,
                                               hidden_dim).to(self.device)

        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(param.data)

        # Policy
        self.policy_net = PolicyNetwork(state_dim, action_dim,
                                        self.action_space,
                                        self.action_scaling_coef,
                                        hidden_dim).to(self.device)
        self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(),
                                            lr=lr)
        self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(),
                                            lr=lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
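        # standard SAC heuristic: target entropy = -dim(action space)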
        self.target_entropy = -np.prod(self.action_space.shape).item()
        self.log_alpha = t.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=lr)

    def reset_action_tracker(self):
        self.action_tracker = []

    def reset_reward_tracker(self):
        self.reward_tracker = []

    def choose_action(self, simulation_step, electricity_price, storage_soc,
                      observation):

        if simulation_step < self.safe_exploration:
            action = self.rbc_controller.choose_action(
                electricity_price=electricity_price, storage_soc=storage_soc)
            actions = t.tensor([action], dtype=t.float).to(self.device)
            # print(action)
        else:
            state = t.as_tensor(observation, dtype=t.float32,
                                device=self.device).unsqueeze(0)
            actions, _, _ = self.policy_net.sample(state)

        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_ctr < self.batch_size:
            return

        state, action, reward, next_state, done = self.memory.sample_buffer(
            self.batch_size)

        state = t.as_tensor(state, dtype=t.float32, device=self.device)
        next_state = t.as_tensor(next_state, dtype=t.float32,
                                 device=self.device)
        action = t.as_tensor(action, dtype=t.float32, device=self.device)
        reward = t.as_tensor(reward, dtype=t.float32,
                             device=self.device).unsqueeze(1)
        done = t.as_tensor(done, dtype=t.float32,
                           device=self.device).unsqueeze(1)

        with t.no_grad():
            # Update the Q-values. First, sample an action from the Gaussian
            # policy for the next state, along with its log-probability.
            new_next_actions, new_log_pi, _ = self.policy_net.sample(
                next_state)

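            # soft Q target: minimum over both target critics minus the
            # entropy term, evaluated at a fresh action a' ~ pi(.|s')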
            target_q_values = t.min(
                self.target_soft_q_net1(next_state, new_next_actions),
                self.target_soft_q_net2(next_state, new_next_actions),
            ) - self.alpha * new_log_pi

            q_target = reward + (1 - done) * self.discount * target_q_values
            # self.q_tracker.append(q_target.mean())

        # Update Soft Q-Networks
        q1_pred = self.soft_q_net1(state, action)
        q2_pred = self.soft_q_net2(state, action)

        q1_loss = self.soft_q_criterion(q1_pred, q_target)
        q2_loss = self.soft_q_criterion(q2_pred, q_target)

        self.soft_q_optimizer1.zero_grad()
        q1_loss.backward()
        self.soft_q_optimizer1.step()

        self.soft_q_optimizer2.zero_grad()
        q2_loss.backward()
        self.soft_q_optimizer2.step()

        # Update Policy
        new_actions, log_pi, _ = self.policy_net.sample(state)

        q_new_actions = t.min(self.soft_q_net1(state, new_actions),
                              self.soft_q_net2(state, new_actions))

        policy_loss = (self.alpha * log_pi - q_new_actions).mean()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # the temperature is kept fixed here; the log_alpha / alpha_optimizer
        # set up in __init__ are not used for automatic tuning
        self.alpha = 0.05

        # Soft Updates
        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)

        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)
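

# Hypothetical usage sketch (not part of the original agents): it shows how the
# choose_action / remember / learn interface above is meant to be driven. The
# gym-style `env`, the `price` series, the `soc_fn` state-of-charge getter and
# the hyper-parameters are placeholder assumptions, not names from this code.
def train_agent(agent, env, price, num_steps, soc_fn):
    obs = env.reset()
    for step in range(num_steps):
        # rule-based control during the safe-exploration phase, SAC afterwards
        action = agent.choose_action(step, price[step], soc_fn(), obs)
        next_obs, reward, done, _ = env.step(action)
        agent.remember(obs, action, reward, next_obs, done)
        agent.learn()
        obs = env.reset() if done else next_obs

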
Example #3
class SACAgent:
    def __init__(self,
                 lr_actor=0.0003,
                 lr_critic=0.0003,
                 state_dim=8,
                 discount=0.99,
                 action_dim=1,
                 replay_buffer_capacity=1000000,
                 tau=0.005,
                 batch_size=256,
                 reward_scaling=1,
                 rbc_controller=RBCAgent,
                 safe_exploration=None,
                 hidden_dim=None):
        self.gamma = discount
        self.tau = tau
        self.memory = ReplayBuffer(input_shape=state_dim,
                                   n_actions=action_dim,
                                   max_mem_size=replay_buffer_capacity)
        self.batch_size = batch_size
        self.n_actions = action_dim
        self.rbc_controller = rbc_controller
        self.safe_exploration = safe_exploration
        self.hidden_size = hidden_dim

        self.actor = ActorNetwork(learning_rate=lr_actor,
                                  input_size=state_dim,
                                  max_action=1,
                                  n_actions=action_dim,
                                  name='actor',
                                  hidden_size=self.hidden_size)
        self.critic_1 = CriticNetwork(learning_rate=lr_critic,
                                      input_size=state_dim,
                                      n_actions=action_dim,
                                      name='critic_1',
                                      hidden_size=self.hidden_size)
        self.critic_2 = CriticNetwork(learning_rate=lr_critic,
                                      input_size=state_dim,
                                      n_actions=action_dim,
                                      name='critic_2',
                                      hidden_size=self.hidden_size)

        self.value = ValueNetwork(learning_rate=lr_critic,
                                  input_size=state_dim,
                                  name='value',
                                  hidden_size=self.hidden_size)
        self.target_value = ValueNetwork(learning_rate=lr_critic,
                                         input_size=state_dim,
                                         name='target_value',
                                         hidden_size=self.hidden_size)

        self.scale = reward_scaling
        self.update_network_parameters(tau=1)

    def choose_action(self, simulation_step, electricity_price, storage_soc,
                      observation):

        if simulation_step < self.safe_exploration:
            action = self.rbc_controller.choose_action(
                electricity_price=electricity_price, storage_soc=storage_soc)
            actions = t.tensor([action], dtype=t.float).to(self.actor.device)
            # print(action)
        else:
            state = t.tensor([observation],
                             dtype=t.float).to(self.actor.device)
            actions, _ = self.actor.sample_normal(state, rep=False)

        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau * value_state_dict[name].clone() + \
                                     (1 - tau) * target_value_state_dict[name].clone()
        self.target_value.load_state_dict(value_state_dict)

    def save_models(self, path):
        print('...saving models...')
        t.save(self.actor, path + '\\actor.pth')
        t.save(self.value, path + '\\value.pth')
        t.save(self.target_value, path + '\\target_value.pth')
        t.save(self.critic_1, path + '\\critic_1.pth')
        t.save(self.critic_2, path + '\\critic_2.pth')

    def load_models(self, path):
        print('...loading models...')
        dev = self.actor.device
        self.actor = t.load(path + '\\actor.pth', map_location=dev)
        self.value = t.load(path + '\\value.pth', map_location=dev)
        self.target_value = t.load(path + '\\target_value.pth',
                                   map_location=dev)
        self.critic_1 = t.load(path + '\\critic_1.pth', map_location=dev)
        self.critic_2 = t.load(path + '\\critic_2.pth', map_location=dev)

    def learn(self):
        if self.memory.mem_ctr < self.batch_size:
            return

        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)

        reward = t.tensor(reward, dtype=t.float).to(self.actor.device)
        done = t.tensor(done).to(self.actor.device)
        new_state = t.tensor(new_state, dtype=t.float).to(self.actor.device)
        state = t.tensor(state, dtype=t.float).to(self.actor.device)
        action = t.tensor(action, dtype=t.float).to(self.actor.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(new_state).view(-1)
        value_[done] = 0.0

        actions, log_prob = self.actor.sample_normal(state, rep=False)
        log_prob = log_prob.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = t.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

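        # value-network target: V(s) = min_i Q_i(s, a~pi) - log pi(a|s),
        # i.e. the soft value with an implicit temperature of 1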
        self.value.opt.zero_grad()
        value_target = critic_value - log_prob
        value_loss = 0.5 * f.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.opt.step()

        actions, log_prob = self.actor.sample_normal(state, rep=True)
        log_prob = log_prob.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = t.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_prob - critic_value
        actor_loss = t.mean(actor_loss)
        self.actor.opt.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.opt.step()

        self.critic_1.opt.zero_grad()
        self.critic_2.opt.zero_grad()
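        # critic target: q_hat = reward_scale * r + gamma * V_target(s'),
        # with V_target(s') zeroed at terminal transitions above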
        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * f.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * f.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.opt.step()
        self.critic_2.opt.step()

        self.update_network_parameters()

    def learn_actor(self, updates: int, batch_size):

        for i in range(updates):
            print(f'actor update {i + 1}/{updates}')
            state, _, _, _, _ = self.memory.sample_buffer(batch_size)

            state = t.tensor(state, dtype=t.float).to(self.actor.device)

            actions, log_prob = self.actor.sample_normal(state, rep=True)
            log_prob = log_prob.view(-1)
            q1_new_policy = self.critic_1.forward(state, actions)
            q2_new_policy = self.critic_2.forward(state, actions)
            critic_value = t.min(q1_new_policy, q2_new_policy)
            critic_value = critic_value.view(-1)

            actor_loss = log_prob - critic_value
            actor_loss = t.mean(actor_loss)
            self.actor.opt.zero_grad()
            actor_loss.backward(retain_graph=True)
            self.actor.opt.step()
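

# Hypothetical post-training sketch (not part of the original code): once the
# replay buffer has been filled via remember()/learn(), the actor can be
# refined on its own and the networks checkpointed. `ckpt_dir` is a
# placeholder directory; note that save_models()/load_models() above join
# paths with a Windows-style '\\' separator.
def refine_and_checkpoint(agent, ckpt_dir, actor_updates=100):
    agent.learn_actor(updates=actor_updates, batch_size=agent.batch_size)
    agent.save_models(ckpt_dir)   # writes actor / value / target_value / critics
    agent.load_models(ckpt_dir)   # reloads them onto agent.actor.device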