import math
import os

import numpy as np
import torch
import torch.optim as optim
from torch.autograd import Variable

# NOTE: FixedMemory, Actor, Value, FLOAT, LongTensor, device, sac_alpha_step,
# ddpg_step and L2PenaltyLoss are assumed to be provided by this repository's
# own model/utility modules; they are not defined in this file.


class SAC_Alpha:
    def __init__(
        self,
        env,
        render=False,
        num_process=1,
        memory_size=1000000,
        lr_p=1e-3,
        lr_a=3e-4,
        lr_q=1e-3,
        gamma=0.99,
        polyak=0.995,
        batch_size=100,
        min_update_step=1000,
        update_step=50,
        target_update_delay=1,
        seed=1,
    ):
        self.env = env
        self.gamma = gamma
        self.polyak = polyak
        self.memory = FixedMemory(memory_size)
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_a = lr_a
        self.lr_q = lr_q
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.target_update_delay = target_update_delay
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """Init model from parameters."""
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]
        self.action_low, self.action_high = (
            self.env.action_space.low[0], self.env.action_space.high[0])
        # entropy target heuristic: -dim(action space)
        self.target_entropy = -np.prod(self.env.action_space.shape)

        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Actor(self.num_states, self.num_actions,
                                action_limit=self.action_high).to(device)
        self.q_net_1 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_target_1 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_2 = Value(self.num_states + self.num_actions).to(device)
        self.q_net_target_2 = Value(self.num_states + self.num_actions).to(device)

        # learnable temperature alpha, initialized to exp(0) = 1
        self.alpha = torch.exp(torch.zeros(1, device=device)).requires_grad_()

        self.q_net_target_1.load_state_dict(self.q_net_1.state_dict())
        self.q_net_target_2.load_state_dict(self.q_net_2.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
        self.optimizer_a = optim.Adam([self.alpha], lr=self.lr_a)
        self.optimizer_q_1 = optim.Adam(self.q_net_1.parameters(), lr=self.lr_q)
        self.optimizer_q_2 = optim.Adam(self.q_net_2.parameters(), lr=self.lr_q)

    def choose_action(self, state):
        """Select an action by sampling from the current policy."""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, _ = self.policy_net.get_action_log_prob(state)
        action = action.cpu().numpy()[0]
        return action, None

    def eval(self, i_iter, render=False):
        """Evaluate the model for one episode."""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            action, _ = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter, step):
        """Interact with the environment for one episode and update the networks."""
        state = self.env.reset()
        episode_reward = 0
        while True:
            if self.render:
                self.env.render()
            action, _ = self.choose_action(state)
            next_state, reward, done, _ = self.env.step(action)
            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask')
            self.memory.push(state, action, reward, next_state, mask)
            episode_reward += reward

            if step >= self.min_update_step and step % self.update_step == 0:
                for k in range(1, self.update_step + 1):
                    batch = self.memory.sample(self.batch_size)  # random sample batch
                    self.update(batch, k)

            if done:
                break
            state = next_state

        self.env.close()
        print(f"Iter: {i_iter}, reward: {episode_reward}")
        # record reward information
        writer.add_scalar("sac_alpha/reward", episode_reward, i_iter)

    def update(self, batch, k_iter):
        """Update the networks from a sampled batch."""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by SAC with learned temperature (alpha)
        sac_alpha_step(self.policy_net, self.q_net_1, self.q_net_2, self.alpha,
                       self.q_net_target_1, self.q_net_target_2,
                       self.optimizer_p, self.optimizer_q_1, self.optimizer_q_2,
                       self.optimizer_a, batch_state, batch_action, batch_reward,
                       batch_next_state, batch_mask, self.gamma, self.polyak,
                       self.target_entropy,
                       k_iter % self.target_update_delay == 0)

    def load(self, model_path):
        """Load a saved model."""
        print(f"Loading Saved Model from {model_path}")
        self.policy_net, self.q_net_1, self.q_net_2, self.alpha = torch.load(
            model_path, map_location=device)

    def save(self, save_path):
        """Save model."""
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        torch.save((self.policy_net, self.q_net_1, self.q_net_2, self.alpha),
                   f"{save_path}/WebEye_sac_alpha.pt")
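# -----------------------------------------------------------------------------
# `sac_alpha_step` is imported from the repo's utility module and is not shown
# here. Below is a minimal, hedged sketch of what such a step typically does,
# written only to document the call above: soft Bellman targets from the twin
# target critics, a reparameterized policy loss, a temperature (alpha) loss
# driven by `target_entropy`, and Polyak-averaged target updates. Assumptions
# not confirmed by this file: the critics take a concatenated [state, action]
# tensor, rewards/masks are 1-D, and `get_action_log_prob` returns
# (action, log_prob) as used in `choose_action`. The real implementation may
# differ.
# -----------------------------------------------------------------------------
def sac_alpha_step_sketch(policy_net, q_net_1, q_net_2, alpha,
                          q_net_target_1, q_net_target_2,
                          optimizer_policy, optimizer_q_1, optimizer_q_2,
                          optimizer_alpha, states, actions, rewards,
                          next_states, masks, gamma, polyak, target_entropy,
                          update_target=True):
    rewards = rewards.unsqueeze(-1)  # (N,) -> (N, 1); adjust if already 2-D
    masks = masks.unsqueeze(-1)

    # Soft Bellman target: r + gamma * mask * (min_i Q_i'(s', a') - alpha * log pi(a'|s'))
    with torch.no_grad():
        next_actions, next_log_probs = policy_net.get_action_log_prob(next_states)
        next_log_probs = next_log_probs.view(-1, 1)
        next_sa = torch.cat([next_states, next_actions], dim=-1)
        target_q = torch.min(q_net_target_1(next_sa), q_net_target_2(next_sa))
        y = rewards + gamma * masks * (target_q - alpha * next_log_probs)

    # Twin critic regression toward the shared target
    sa = torch.cat([states, actions], dim=-1)
    q1_loss = ((q_net_1(sa) - y) ** 2).mean()
    optimizer_q_1.zero_grad()
    q1_loss.backward()
    optimizer_q_1.step()

    q2_loss = ((q_net_2(sa) - y) ** 2).mean()
    optimizer_q_2.zero_grad()
    q2_loss.backward()
    optimizer_q_2.step()

    # Policy update: maximize min Q - alpha * log pi with reparameterized actions
    new_actions, log_probs = policy_net.get_action_log_prob(states)
    log_probs = log_probs.view(-1, 1)
    new_sa = torch.cat([states, new_actions], dim=-1)
    min_q = torch.min(q_net_1(new_sa), q_net_2(new_sa))
    policy_loss = (alpha.detach() * log_probs - min_q).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()

    # Temperature update toward the entropy target
    alpha_loss = -(alpha * (log_probs + target_entropy).detach()).mean()
    optimizer_alpha.zero_grad()
    alpha_loss.backward()
    optimizer_alpha.step()

    # Polyak-averaged target networks (delayed via `update_target`)
    if update_target:
        for net, net_target in ((q_net_1, q_net_target_1), (q_net_2, q_net_target_2)):
            for p, p_target in zip(net.parameters(), net_target.parameters()):
                p_target.data.mul_(polyak).add_((1 - polyak) * p.data)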
class DDPG:
    def __init__(
        self,
        env=None,
        render=False,
        num_process=1,
        memory_size=1000000,
        lr_p=1e-3,
        lr_v=1e-3,
        gamma=0.99,
        polyak=0.995,
        explore_size=10000,
        batch_size=100,
        min_update_step=1000,
        update_step=50,
        action_noise=0.1,
        seed=1,
    ):
        self.env = env
        self.render = render
        self.gamma = gamma
        self.polyak = polyak
        self.memory = FixedMemory(memory_size)
        self.explore_size = explore_size
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.action_noise = action_noise
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """Init model from parameters."""
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]
        self.action_low, self.action_high = (
            self.env.action_space.low[0], self.env.action_space.high[0])

        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Actor(self.num_states, self.num_actions,
                                self.action_high).to(device)
        self.policy_net_target = Actor(self.num_states, self.num_actions,
                                       self.action_high).to(device)
        self.value_net = Value(self.num_states + self.num_actions).to(device)
        self.value_net_target = Value(self.num_states + self.num_actions).to(device)

        self.policy_net_target.load_state_dict(self.policy_net.state_dict())
        self.value_net_target.load_state_dict(self.value_net.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(), lr=self.lr_v)

    def choose_action(self, state, noise_scale):
        """Select a deterministic action and add exploration noise."""
        self.policy_net.eval()
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action = self.policy_net(state)
        self.policy_net.train()
        action = action.cpu().numpy()[0]
        # add noise
        noise = noise_scale * np.random.randn(self.num_actions)
        action += noise
        action = np.clip(action, -self.action_high, self.action_high)
        return action

    def eval(self, i_iter, render=False):
        """Evaluate the model for one episode (no exploration noise)."""
        self.policy_net.eval()
        self.value_net.eval()
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            action = self.choose_action(state, 0)
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter, step):
        """Interact with the environment for one episode and update the networks."""
        self.policy_net.train()
        self.value_net.train()
        state = self.env.reset()
        episode_reward = 0
        while True:
            if self.render:
                self.env.render()
            action = self.choose_action(state, self.action_noise)
            next_state, reward, done, _ = self.env.step(action)
            mask = 0 if done else 1
            # ('state', 'action', 'reward', 'next_state', 'mask')
            self.memory.push(state, action, reward, next_state, mask)
            episode_reward += reward

            if step >= self.min_update_step and step % self.update_step == 0:
                for _ in range(self.update_step):
                    batch = self.memory.sample(self.batch_size)  # random sample batch
                    self.update(batch)

            if done:
                break
            state = next_state

        self.env.close()
        print(f"Iter: {i_iter}, reward: {episode_reward}")
        # record reward information
        writer.add_scalar("ddpg/reward", episode_reward, i_iter)

    def update(self, batch):
        """Update the networks from a sampled batch."""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by DDPG
        ddpg_step(self.policy_net, self.policy_net_target,
                  self.value_net, self.value_net_target,
                  self.optimizer_p, self.optimizer_v,
                  batch_state, batch_action, batch_reward, batch_next_state,
                  batch_mask, self.gamma, self.polyak)

    def load(self, model_path):
        """Load a saved model."""
        print(f"Loading Saved Model from {model_path}")
        self.policy_net, self.value_net = torch.load(model_path, map_location=device)

    def save(self, save_path):
        """Save model."""
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        torch.save((self.policy_net, self.value_net),
                   f"{save_path}/WebEye_ddpg.pt")
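# -----------------------------------------------------------------------------
# `ddpg_step` is likewise imported from the repo's utilities. A minimal, hedged
# sketch of the standard DDPG update it is assumed to perform, matching the
# call above. The concatenated [state, action] critic input and 1-D
# reward/mask shapes are assumptions, not confirmed by this file.
# -----------------------------------------------------------------------------
def ddpg_step_sketch(policy_net, policy_net_target, value_net, value_net_target,
                     optimizer_policy, optimizer_value,
                     states, actions, rewards, next_states, masks,
                     gamma, polyak):
    rewards = rewards.unsqueeze(-1)  # (N,) -> (N, 1); adjust if already 2-D
    masks = masks.unsqueeze(-1)

    # Critic target: r + gamma * mask * Q'(s', pi'(s'))
    with torch.no_grad():
        next_sa = torch.cat([next_states, policy_net_target(next_states)], dim=-1)
        y = rewards + gamma * masks * value_net_target(next_sa)

    # Critic regression toward the target
    value_loss = ((value_net(torch.cat([states, actions], dim=-1)) - y) ** 2).mean()
    optimizer_value.zero_grad()
    value_loss.backward()
    optimizer_value.step()

    # Deterministic policy gradient: maximize Q(s, pi(s))
    policy_loss = -value_net(torch.cat([states, policy_net(states)], dim=-1)).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()

    # Polyak-averaged target networks
    for net, net_target in ((policy_net, policy_net_target),
                            (value_net, value_net_target)):
        for p, p_target in zip(net.parameters(), net_target.parameters()):
            p_target.data.mul_(polyak).add_((1 - polyak) * p.data)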
def train_v_upper_envelope(states, actions, returns, state_dim, device, seed,
                           upper_learning_rate=3e-3, weight_decay=0.02,
                           max_step_num=int(1e6), consecutive_steps=4, k=10000):
    states = torch.from_numpy(np.array(states))
    actions = torch.from_numpy(np.array(actions))
    returns = torch.from_numpy(np.array(returns))  # returns are actually the Gt values

    use_gpu = (device == "cuda:0")

    # Init upper envelope nets (use ReLU as the activation function)
    upper_envelope = Value(state_dim, activation='relu')
    upper_envelope_retrain = Value(state_dim, activation='relu')

    optimizer_upper = torch.optim.Adam(upper_envelope.parameters(),
                                       lr=upper_learning_rate,
                                       weight_decay=weight_decay)
    optimizer_upper_retrain = torch.optim.Adam(upper_envelope_retrain.parameters(),
                                               lr=upper_learning_rate,
                                               weight_decay=weight_decay)

    if use_gpu:
        upper_envelope = upper_envelope.cuda()
        upper_envelope_retrain = upper_envelope_retrain.cuda()

    # =========================== #
    # Split data into training and testing,
    # but make sure the highest Ri is in the training set.
    # =========================== #

    # pick out the highest data point
    highestR, indice = torch.max(returns, 0)
    highestR = highestR.view(-1, 1)
    highestS = states[indice]
    highestA = actions[indice]
    print("HighestR:", highestR)

    statesW = torch.cat((states[:indice], states[indice + 1:]))
    actionsW = torch.cat((actions[:indice], actions[indice + 1:]))
    returnsW = torch.cat((returns[:indice], returns[indice + 1:]))

    # shuffle the data
    perm = np.arange(statesW.shape[0])
    np.random.shuffle(perm)
    perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)
    statesW, actionsW, returnsW = statesW[perm], actionsW[perm], returnsW[perm]

    # divide data into train/test
    divide = int(states.shape[0] * 0.8)
    train_states, train_actions, train_returns = (
        statesW[:divide], actionsW[:divide], returnsW[:divide])
    test_states, test_actions, test_returns = (
        statesW[divide:], actionsW[divide:], returnsW[divide:])

    # add the highest data point into the training set
    print(train_states.size(), highestS.size())
    print(train_actions.size(), highestA.size())
    print(train_returns.size(), highestR.size())
    train_states = torch.cat((train_states.squeeze(), highestS.unsqueeze(0)))
    train_actions = torch.cat((train_actions.squeeze(), highestA.unsqueeze(0)))
    train_returns = torch.cat((train_returns.squeeze(), highestR.squeeze().unsqueeze(0)))

    # train upper envelope
    # env_dummy = env_factory(0)
    # state_dim = env_dummy.observation_space.shape[0]
    # upper_envelope = Value(state_dim)
    # optimizer = torch.optim.Adam(upper_envelope.parameters(), lr=0.003, weight_decay=20)

    epoch_n = 100
    batch_size = 64
    optim_iter_num = int(math.ceil(train_states.shape[0] / batch_size))
    num_increase = 0
    previous_loss = math.inf
    calculate_vali = 2
    best_parameters = upper_envelope.state_dict()
    running_training_steps = 0
    best_training_steps = running_training_steps

    # Upper envelope training starts
    upper_envelope.train()
    while num_increase < consecutive_steps:
        # update theta for n steps, n = calculate_vali
        # train calculate_vali steps
        for _ in range(calculate_vali):
            train_loss = 0
            perm = np.arange(train_states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)
            train_states, train_actions, train_returns = (
                train_states[perm], train_actions[perm], train_returns[perm])

            for i in range(optim_iter_num):
                ind = slice(i * batch_size,
                            min((i + 1) * batch_size, train_states.shape[0]))
                states_b, returns_b = train_states[ind], train_returns[ind]
                states_b = Variable(states_b.float())
                returns_b = Variable(returns_b.float())
                Vsi = upper_envelope(states_b)
                # loss = loss_fn(Vsi, returns_b)
                loss = L2PenaltyLoss(Vsi, returns_b, k_val=k)
                train_loss += loss.detach()
                upper_envelope.zero_grad()
                loss.backward()
                optimizer_upper.step()

        # early stopping
        running_training_steps += calculate_vali

        # calculate validation error
        test_iter = int(math.ceil(test_states.shape[0] / batch_size))
        validation_loss = 0
        for n in range(test_iter):
            ind = slice(n * batch_size,
                        min((n + 1) * batch_size, test_states.shape[0]))
            states_t, returns_t = test_states[ind], test_returns[ind]
            states_t = Variable(states_t.float())
            returns_t = Variable(returns_t.float())
            Vsi = upper_envelope(states_t)
            loss = L2PenaltyLoss(Vsi, returns_t, k_val=k)
            validation_loss += loss

        if validation_loss < previous_loss:
            best_training_steps = running_training_steps
            previous_loss = validation_loss
            best_parameters = upper_envelope.state_dict()
            num_increase = 0
        else:
            num_increase += 1

    print("best_training_steps:", best_training_steps)
    upper_envelope.load_state_dict(best_parameters)

    # retrain on the whole set for the best number of training steps
    upper_envelope_retrain.train()
    optim_iter_num = int(math.ceil(states.shape[0] / batch_size))
    for _ in range(best_training_steps):
        train_loss = 0
        perm = np.arange(states.shape[0])
        np.random.shuffle(perm)
        perm = LongTensor(perm).cuda() if use_gpu else LongTensor(perm)
        states, actions, returns = states[perm], actions[perm], returns[perm]

        for i in range(optim_iter_num):
            ind = slice(i * batch_size, min((i + 1) * batch_size, states.shape[0]))
            states_b, returns_b = states[ind], returns[ind]
            states_b = Variable(states_b.float())
            returns_b = Variable(returns_b.float())
            Vsi = upper_envelope_retrain(states_b)
            # loss = loss_fn(Vsi, returns_b)
            loss = L2PenaltyLoss(Vsi, returns_b, k_val=k)
            train_loss += loss.detach()
            upper_envelope_retrain.zero_grad()
            loss.backward()
            optimizer_upper_retrain.step()

    upper_envelope.load_state_dict(upper_envelope_retrain.state_dict())
    print("Upper envelope training is complete.")
    return upper_envelope
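# -----------------------------------------------------------------------------
# `L2PenaltyLoss` is imported from elsewhere in the repo. A hedged sketch of
# the asymmetric penalty it is assumed to apply when fitting an upper envelope
# of the returns: ordinary squared error when V(s) >= G, and squared error
# scaled by the large constant `k_val` when V(s) < G, which pushes the fitted
# value function to lie above the observed returns. The exact reduction
# (mean vs. sum) used by the real implementation is an assumption.
# -----------------------------------------------------------------------------
def l2_penalty_loss_sketch(predicted_v, returns, k_val=10000):
    predicted_v = predicted_v.view(-1)
    returns = returns.view(-1)
    diff = predicted_v - returns
    # penalize under-estimation (V < G) much more heavily than over-estimation
    loss = torch.where(diff >= 0, diff ** 2, k_val * diff ** 2)
    return loss.mean()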