# NOTE: this module relies on imports defined elsewhere in the project:
# numpy as np, torch as t, torch.nn as nn, torch.nn.functional as f,
# torch.optim as optim, plus the project-level ReplayBuffer, the network
# classes (SoftQNetworkDiscrete, PolicyNetworkDiscrete, SoftQNetwork,
# PolicyNetwork, ActorNetwork, CriticNetwork, ValueNetwork) and the RBCAgent
# rule-based controller.


class SACAgent:
    """Discrete-action Soft Actor-Critic agent with optional RBC-guided safe exploration."""

    def __init__(self, state_dim=None, action_dim=None, hidden_dim=None, discount=0.99,
                 tau=0.005, lr_actor=None, lr_critic=None, batch_size=256,
                 replay_buffer_capacity=1e5, learning_start=None, reward_scaling=1.,
                 seed=0, rbc_controller=None, safe_exploration=None,
                 automatic_entropy_tuning=False, alpha=1):
        if hidden_dim is None:
            hidden_dim = [256, 256]
        self.learning_start = learning_start
        self.discount = discount
        self.batch_size = batch_size
        self.tau = tau
        self.reward_scaling = reward_scaling
        t.manual_seed(seed)
        np.random.seed(seed)
        self.action_list_ = []
        self.action_list2_ = []
        self.hidden_dim = hidden_dim
        self.rbc_controller = rbc_controller
        self.safe_exploration = safe_exploration
        self.automatic_entropy_tuning = automatic_entropy_tuning
        self.time_step = 0

        # Optimizers/Loss using the Huber loss
        # self.soft_q_criterion = f.mse_loss

        # device
        self.device = t.device("cuda" if t.cuda.is_available() else "cpu")

        self.memory = ReplayBuffer(input_shape=int(state_dim), n_actions=int(1),
                                   max_mem_size=int(replay_buffer_capacity))

        # init networks
        self.soft_q_net1 = SoftQNetworkDiscrete(state_dim, action_dim, hidden_dim).to(self.device)
        self.soft_q_net2 = SoftQNetworkDiscrete(state_dim, action_dim, hidden_dim).to(self.device)
        self.target_soft_q_net1 = SoftQNetworkDiscrete(state_dim, action_dim, hidden_dim).to(self.device)
        self.target_soft_q_net2 = SoftQNetworkDiscrete(state_dim, action_dim, hidden_dim).to(self.device)

        # Hard-copy the online critics into the target critics.
        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(param.data)

        # Policy
        self.policy_net = PolicyNetworkDiscrete(state_dim, action_dim, hidden_dim).to(self.device)

        self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(), lr=lr_critic)
        self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(), lr=lr_critic)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=lr_actor)

        if self.automatic_entropy_tuning:
            # we set the max possible entropy as the target entropy
            self.target_entropy = -np.log((1.0 / action_dim)) * 0.98
            self.log_alpha = t.zeros(1, requires_grad=True, device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optimizer = optim.Adam([self.log_alpha], lr=lr_critic, eps=1e-4)
        else:
            self.alpha = alpha

    def choose_action(self, simulation_step, electricity_price, storage_soc, observation):
        if simulation_step < self.safe_exploration:
            action = self.rbc_controller.choose_action(
                electricity_price=electricity_price, storage_soc=storage_soc)
            actions = t.tensor([action], dtype=t.float).to(self.device)
            # print(action)
        else:
            if self.device.type == "cuda":
                state = t.cuda.FloatTensor([observation]).to(self.device)
            else:
                state = t.FloatTensor([observation]).to(self.device)
            actions, _, _ = self.policy_net.sample(state)
        return actions.cpu().detach().numpy()[0]

    def get_actions_probabilities(self, observation):
        if self.device.type == "cuda":
            state = t.cuda.FloatTensor([observation]).to(self.device)
        else:
            state = t.FloatTensor([observation]).to(self.device)
        _, (actions_probabilities, _), _ = self.policy_net.sample(state)
        return actions_probabilities.cpu().detach().numpy()[0]

    def get_q_values(self, observation):
        if self.device.type == "cuda":
            state = t.cuda.FloatTensor([observation]).to(self.device)
        else:
            state = t.FloatTensor([observation]).to(self.device)
        q_1 = self.soft_q_net1(state)
        q_2 = self.soft_q_net2(state)
        q_1 = q_1.cpu().detach().numpy()[0]
        q_2 = q_2.cpu().detach().numpy()[0]
        return q_1, q_2

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_ctr < self.batch_size:
            return

        state, action, reward, next_state, done = self.memory.sample_buffer(self.batch_size)

        if self.device.type == "cuda":
            state = t.cuda.FloatTensor(state).to(self.device)
            next_state = t.cuda.FloatTensor(next_state).to(self.device)
            action = t.cuda.LongTensor(action).to(self.device)
            reward = t.cuda.FloatTensor(reward).unsqueeze(1).to(self.device)
            done = t.cuda.FloatTensor(done).unsqueeze(1).to(self.device)
        else:
            state = t.FloatTensor(state).to(self.device)
            next_state = t.FloatTensor(next_state).to(self.device)
            action = t.LongTensor(action).to(self.device)  # gather() below needs integer indices
            reward = t.FloatTensor(reward).unsqueeze(1).to(self.device)
            done = t.FloatTensor(done).unsqueeze(1).to(self.device)

        with t.no_grad():
            # Update Q-values. First, sample an action from the (discrete) policy
            # distribution for the next state together with its probability and
            # log probability of occurrence.
            new_next_actions, (action_probabilities,
                               log_action_probabilities), _ = self.policy_net.sample(next_state)

            qf1_next_target = self.target_soft_q_net1(next_state)
            qf2_next_target = self.target_soft_q_net2(next_state)
            min_qf_next_target = action_probabilities * (
                t.min(qf1_next_target, qf2_next_target)
                - self.alpha * log_action_probabilities)
            min_qf_next_target = min_qf_next_target.sum(dim=1).unsqueeze(-1)
            q_target = reward + (1 - done) * self.discount * min_qf_next_target
            # self.q_tracker.append(q_target.mean())

        # Update Soft Q-Networks
        q1_pred = self.soft_q_net1(state)
        q2_pred = self.soft_q_net2(state)
        q1_pred = q1_pred.gather(1, action.reshape([self.batch_size, 1]))
        q2_pred = q2_pred.gather(1, action.reshape([self.batch_size, 1]))

        q1_loss = f.mse_loss(q1_pred, q_target)
        q2_loss = f.mse_loss(q2_pred, q_target)

        self.soft_q_optimizer1.zero_grad()
        q1_loss.backward()
        self.soft_q_optimizer1.step()

        self.soft_q_optimizer2.zero_grad()
        q2_loss.backward()
        self.soft_q_optimizer2.step()

        # Update Policy
        new_actions, (action_probabilities,
                      log_action_probabilities), _ = self.policy_net.sample(state)
        min_qf_pi = t.min(self.soft_q_net1(state), self.soft_q_net2(state))
        inside_term = self.alpha * log_action_probabilities - min_qf_pi
        policy_loss = (action_probabilities * inside_term).sum(dim=1).mean()
        log_action_probabilities = t.sum(log_action_probabilities * action_probabilities, dim=1)

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Update the temperature (entropy coefficient) if automatic tuning is enabled.
        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha * (log_action_probabilities
                                             + self.target_entropy).detach()).mean()
        else:
            alpha_loss = None

        if alpha_loss is not None:
            self.alpha_optimizer.zero_grad()
            alpha_loss.backward()
            self.alpha_optimizer.step()
            self.alpha = self.log_alpha.exp()

        # Soft Updates
        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau)
                                    + param.data * self.tau)

        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau)
                                    + param.data * self.tau)

    def save_models(self, path):
        print('...saving models...')
        t.save(self.soft_q_net1, path + '\\critic_1.pth')
        t.save(self.soft_q_net2, path + '\\critic_2.pth')
        t.save(self.policy_net, path + '\\actor.pth')

    def load_models(self, path):
        print('...loading models...')
        dev = self.device
        self.soft_q_net1 = t.load(path + '\\critic_1.pth', map_location=dev)
        self.soft_q_net2 = t.load(path + '\\critic_2.pth', map_location=dev)
        self.policy_net = t.load(path + '\\actor.pth', map_location=dev)
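

# --- Usage sketch (illustrative only, not part of the original pipeline) ----
# A minimal interaction loop for the agents in this module, assuming a
# Gym-style environment (reset() -> observation; step(action) ->
# (observation, reward, done, info)) plus per-step electricity-price and
# storage state-of-charge signals. The helper name, the `prices` sequence and
# the 'storage_soc' info key are hypothetical placeholders.
def _example_training_loop(env, agent, prices, initial_soc, n_steps):
    obs = env.reset()
    soc = initial_soc
    for step in range(n_steps):
        # The RBC controller is used while step < agent.safe_exploration,
        # the learned policy afterwards.
        action = agent.choose_action(simulation_step=step,
                                     electricity_price=prices[step],
                                     storage_soc=soc,
                                     observation=obs)
        next_obs, reward, done, info = env.step(action)
        agent.remember(obs, action, reward, next_obs, done)
        agent.learn()  # silently skipped until the buffer holds one full batch
        soc = info.get('storage_soc', soc)
        obs = env.reset() if done else next_obs
    return agent

# For the discrete SACAgent above, a plausible construction would be, e.g.:
#   agent = SACAgent(state_dim=env.observation_space.shape[0],
#                    action_dim=env.action_space.n,
#                    lr_actor=3e-4, lr_critic=3e-4,
#                    rbc_controller=rbc, safe_exploration=0,
#                    automatic_entropy_tuning=True)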
class SAC2Agent:
    def __init__(self, observation_space=None, action_space=None, hidden_dim=None,
                 discount=0.99, tau=0.005, lr=None, batch_size=256,
                 replay_buffer_capacity=1e5, start_training=None, exploration_period=None,
                 action_scaling_coef=1., reward_scaling=1., update_per_step=1,
                 iterations_as=2, seed=0, deterministic=None, rbc_controller=None,
                 safe_exploration=None):
        if hidden_dim is None:
            hidden_dim = [256, 256]
        self.start_training = start_training
        self.discount = discount
        self.batch_size = batch_size
        self.tau = tau
        self.action_scaling_coef = action_scaling_coef
        self.reward_scaling = reward_scaling
        t.manual_seed(seed)
        np.random.seed(seed)
        self.deterministic = deterministic
        self.update_per_step = update_per_step
        self.iterations_as = iterations_as
        self.exploration_period = exploration_period
        self.action_list_ = []
        self.action_list2_ = []
        self.hidden_dim = hidden_dim
        self.rbc_controller = rbc_controller
        self.safe_exploration = safe_exploration
        self.reset_action_tracker()
        self.reset_reward_tracker()
        self.time_step = 0
        self.action_space = action_space
        self.observation_space = observation_space

        # Optimizers/Loss using the Huber loss
        self.soft_q_criterion = nn.SmoothL1Loss()

        # device
        self.device = t.device("cuda" if t.cuda.is_available() else "cpu")

        state_dim = self.observation_space.shape[0]
        action_dim = self.action_space.shape[0]
        self.alpha = 0.05

        self.memory = ReplayBuffer(input_shape=int(state_dim), n_actions=int(action_dim),
                                   max_mem_size=int(replay_buffer_capacity))

        # init networks
        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)
        self.target_soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)
        self.target_soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)

        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(param.data)

        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(param.data)

        # Policy
        self.policy_net = PolicyNetwork(state_dim, action_dim, self.action_space,
                                        self.action_scaling_coef, hidden_dim).to(self.device)
        self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(), lr=lr)
        self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(), lr=lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.target_entropy = -np.prod(self.action_space.shape).item()
        self.log_alpha = t.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=lr)

    def reset_action_tracker(self):
        self.action_tracker = []

    def reset_reward_tracker(self):
        self.reward_tracker = []

    def choose_action(self, simulation_step, electricity_price, storage_soc, observation):
        if simulation_step < self.safe_exploration:
            action = self.rbc_controller.choose_action(
                electricity_price=electricity_price, storage_soc=storage_soc)
            actions = t.tensor([action], dtype=t.float).to(self.device)
            # print(action)
        else:
            if self.device.type == "cuda":
                state = t.cuda.FloatTensor([observation]).to(self.device)
            else:
                state = t.FloatTensor([observation]).to(self.device)
            actions, _, _ = self.policy_net.sample(state)
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.mem_ctr < self.batch_size:
            return

        state, action, reward, next_state, done = self.memory.sample_buffer(self.batch_size)

        if self.device.type == "cuda":
            state = t.cuda.FloatTensor(state).to(self.device)
            next_state = t.cuda.FloatTensor(next_state).to(self.device)
            action = t.cuda.FloatTensor(action).to(self.device)
            reward = t.cuda.FloatTensor(reward).unsqueeze(1).to(self.device)
            done = t.cuda.FloatTensor(done).unsqueeze(1).to(self.device)
        else:
            state = t.FloatTensor(state).to(self.device)
            next_state = t.FloatTensor(next_state).to(self.device)
            action = t.FloatTensor(action).to(self.device)
            reward = t.FloatTensor(reward).unsqueeze(1).to(self.device)
            done = t.FloatTensor(done).unsqueeze(1).to(self.device)

        with t.no_grad():
            # Update Q-values. First, sample an action from the Gaussian policy
            # for the next state and its associated log probability of occurrence.
            new_next_actions, new_log_pi, _ = self.policy_net.sample(next_state)

            target_q_values = t.min(
                self.target_soft_q_net1(next_state, new_next_actions),
                self.target_soft_q_net2(next_state, new_next_actions),
            ) - self.alpha * new_log_pi

            q_target = reward + (1 - done) * self.discount * target_q_values
            # self.q_tracker.append(q_target.mean())

        # Update Soft Q-Networks
        q1_pred = self.soft_q_net1(state, action)
        q2_pred = self.soft_q_net2(state, action)

        q1_loss = self.soft_q_criterion(q1_pred, q_target)
        q2_loss = self.soft_q_criterion(q2_pred, q_target)

        self.soft_q_optimizer1.zero_grad()
        q1_loss.backward()
        self.soft_q_optimizer1.step()

        self.soft_q_optimizer2.zero_grad()
        q2_loss.backward()
        self.soft_q_optimizer2.step()

        # Update Policy
        new_actions, log_pi, _ = self.policy_net.sample(state)
        q_new_actions = t.min(self.soft_q_net1(state, new_actions),
                              self.soft_q_net2(state, new_actions))
        policy_loss = (self.alpha * log_pi - q_new_actions).mean()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        self.alpha = 0.05  # self.log_alpha.exp()

        # Soft Updates
        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau)
                                    + param.data * self.tau)

        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau)
                                    + param.data * self.tau)
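

# --- Usage sketch (illustrative only) ----------------------------------------
# SAC2Agent handles continuous actions and is constructed directly from
# Gym-style observation/action spaces; it can then be driven with the same
# _example_training_loop defined earlier in this module. `env`, `rbc` and the
# learning rate are hypothetical placeholders.
def _example_sac2_setup(env, rbc, lr=3e-4):
    agent = SAC2Agent(observation_space=env.observation_space,
                      action_space=env.action_space,
                      lr=lr,
                      rbc_controller=rbc,
                      safe_exploration=0)  # 0 disables the RBC warm-up phase
    return agent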
# SAC variant following the original formulation, with an explicit state-value
# network (value/target_value) in addition to the twin critics.
class SACAgent:
    def __init__(self, lr_actor=0.0003, lr_critic=0.0003, state_dim=8, discount=0.99,
                 action_dim=1, replay_buffer_capacity=1000000, tau=0.005, batch_size=256,
                 reward_scaling=1, rbc_controller=RBCAgent, safe_exploration=None,
                 hidden_dim=None):
        self.gamma = discount
        self.tau = tau
        self.memory = ReplayBuffer(input_shape=state_dim, n_actions=action_dim,
                                   max_mem_size=replay_buffer_capacity)
        self.batch_size = batch_size
        self.n_actions = action_dim
        self.rbc_controller = rbc_controller
        self.safe_exploration = safe_exploration
        self.hidden_size = hidden_dim

        self.actor = ActorNetwork(learning_rate=lr_actor, input_size=state_dim,
                                  max_action=1, n_actions=action_dim, name='actor',
                                  hidden_size=self.hidden_size)
        self.critic_1 = CriticNetwork(learning_rate=lr_critic, input_size=state_dim,
                                      n_actions=action_dim, name='critic_1',
                                      hidden_size=self.hidden_size)
        self.critic_2 = CriticNetwork(learning_rate=lr_critic, input_size=state_dim,
                                      n_actions=action_dim, name='critic_2',
                                      hidden_size=self.hidden_size)
        self.value = ValueNetwork(learning_rate=lr_critic, input_size=state_dim,
                                  name='value', hidden_size=self.hidden_size)
        self.target_value = ValueNetwork(learning_rate=lr_critic, input_size=state_dim,
                                         name='target_value',
                                         hidden_size=self.hidden_size)

        self.scale = reward_scaling
        # Hard-copy the value network into the target value network.
        self.update_network_parameters(tau=1)

    def choose_action(self, simulation_step, electricity_price, storage_soc, observation):
        if simulation_step < self.safe_exploration:
            action = self.rbc_controller.choose_action(
                electricity_price=electricity_price, storage_soc=storage_soc)
            actions = t.tensor([action], dtype=t.float).to(self.actor.device)
            # print(action)
        else:
            state = t.tensor([observation], dtype=t.float).to(self.actor.device)
            actions, _ = self.actor.sample_normal(state, rep=False)
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()

        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)

        for name in value_state_dict:
            value_state_dict[name] = tau * value_state_dict[name].clone() + \
                (1 - tau) * target_value_state_dict[name].clone()

        self.target_value.load_state_dict(value_state_dict)

    def save_models(self, path):
        print('...saving models...')
        t.save(self.actor, path + '\\actor.pth')
        t.save(self.value, path + '\\value.pth')
        t.save(self.target_value, path + '\\target_value.pth')
        t.save(self.critic_1, path + '\\critic_1.pth')
        t.save(self.critic_2, path + '\\critic_2.pth')

    def load_models(self, path):
        print('...loading models...')
        dev = self.actor.device
        self.actor = t.load(path + '\\actor.pth', map_location=dev)
        self.value = t.load(path + '\\value.pth', map_location=dev)
        self.target_value = t.load(path + '\\target_value.pth', map_location=dev)
        self.critic_1 = t.load(path + '\\critic_1.pth', map_location=dev)
        self.critic_2 = t.load(path + '\\critic_2.pth', map_location=dev)

    def learn(self):
        if self.memory.mem_ctr < self.batch_size:
            return

        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)

        reward = t.tensor(reward, dtype=t.float).to(self.actor.device)
        done = t.tensor(done).to(self.actor.device)
        new_state = t.tensor(new_state, dtype=t.float).to(self.actor.device)
        state = t.tensor(state, dtype=t.float).to(self.actor.device)
        action = t.tensor(action, dtype=t.float).to(self.actor.device)

        # State-value estimates; V(s') is zeroed for terminal transitions.
        value = self.value(state).view(-1)
        value_ = self.target_value(new_state).view(-1)
        value_[done] = 0.0

        # Value network update
        actions, log_prob = self.actor.sample_normal(state, rep=False)
        log_prob = log_prob.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = t.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.opt.zero_grad()
        value_target = critic_value - log_prob
        value_loss = 0.5 * f.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.opt.step()

        # Actor update (reparameterized sample)
        actions, log_prob = self.actor.sample_normal(state, rep=True)
        log_prob = log_prob.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = t.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_prob - critic_value
        actor_loss = t.mean(actor_loss)
        self.actor.opt.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.opt.step()

        # Critic update
        self.critic_1.opt.zero_grad()
        self.critic_2.opt.zero_grad()
        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * f.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * f.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.opt.step()
        self.critic_2.opt.step()

        # Soft update of the target value network
        self.update_network_parameters()

    def learn_actor(self, updates: int, batch_size):
        # Extra actor-only updates from states already stored in the replay buffer.
        for i in range(0, updates):
            print(i)
            state, _, _, _, _ = self.memory.sample_buffer(batch_size)
            state = t.tensor(state, dtype=t.float).to(self.actor.device)

            actions, log_prob = self.actor.sample_normal(state, rep=True)
            log_prob = log_prob.view(-1)
            q1_new_policy = self.critic_1.forward(state, actions)
            q2_new_policy = self.critic_2.forward(state, actions)
            critic_value = t.min(q1_new_policy, q2_new_policy)
            critic_value = critic_value.view(-1)

            actor_loss = log_prob - critic_value
            actor_loss = t.mean(actor_loss)
            self.actor.opt.zero_grad()
            actor_loss.backward(retain_graph=True)
            self.actor.opt.step()
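

# --- Usage sketch (illustrative only) ----------------------------------------
# The value-network SAC variant exposes learn_actor() for extra actor-only
# updates from the replay buffer, and save_models()/load_models() for
# whole-model checkpoints (built here with Windows-style backslash paths).
# `env`, `rbc`, `hidden_dim` and `checkpoint_dir` are hypothetical
# placeholders; the expected type of hidden_dim depends on the project's
# network definitions.
def _example_value_sac(env, rbc, hidden_dim, checkpoint_dir, extra_actor_updates=10):
    agent = SACAgent(state_dim=env.observation_space.shape[0],
                     action_dim=env.action_space.shape[0],
                     hidden_dim=hidden_dim,
                     rbc_controller=rbc,
                     safe_exploration=0)
    # ... fill the replay buffer and train, e.g. via _example_training_loop ...
    agent.learn_actor(updates=extra_actor_updates, batch_size=agent.batch_size)
    agent.save_models(checkpoint_dir)   # actor/critics/value nets as .pth files
    return agent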