import os

import numpy as np
import torch
import torch.optim as optim

# Project-internal helpers assumed to come from this repository:
# FixedMemory (replay buffer), Actor / Value (networks), FLOAT (tensor
# constructor alias), ddpg_step (one DDPG update), and the global `device`.


class DDPG:
    def __init__(
        self,
        env=None,
        render=False,
        num_process=1,
        memory_size=1000000,
        lr_p=1e-3,
        lr_v=1e-3,
        gamma=0.99,
        polyak=0.995,
        explore_size=10000,
        batch_size=100,
        min_update_step=1000,
        update_step=50,
        action_noise=0.1,
        seed=1,
    ):
        self.env = env
        self.render = render
        self.gamma = gamma
        self.polyak = polyak
        self.memory = FixedMemory(memory_size)
        self.explore_size = explore_size
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.batch_size = batch_size
        self.min_update_step = min_update_step
        self.update_step = update_step
        self.action_noise = action_noise
        self.seed = seed
        self._init_model()

    def _init_model(self):
        """Initialize networks, optimizers, and seeding from parameters."""
        self.num_states = self.env.observation_space.shape[0]
        self.num_actions = self.env.action_space.shape[0]
        self.action_low, self.action_high = (
            self.env.action_space.low[0],
            self.env.action_space.high[0],
        )

        # seeding
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Actor(self.num_states, self.num_actions,
                                self.action_high).to(device)
        self.policy_net_target = Actor(self.num_states, self.num_actions,
                                       self.action_high).to(device)

        self.value_net = Value(self.num_states + self.num_actions).to(device)
        self.value_net_target = Value(self.num_states +
                                      self.num_actions).to(device)

        # target networks start as exact copies of the online networks
        self.policy_net_target.load_state_dict(self.policy_net.state_dict())
        self.value_net_target.load_state_dict(self.value_net.state_dict())

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)

    def choose_action(self, state, noise_scale):
        """Select a deterministic action, plus Gaussian exploration noise
        scaled by `noise_scale`. Pass 0 for noise-free evaluation."""
        self.policy_net.eval()
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action = self.policy_net(state)
        self.policy_net.train()  # restore train mode after inference
        action = action.cpu().numpy()[0]

        # add exploration noise and clip to the environment's action bounds
        noise = noise_scale * np.random.randn(self.num_actions)
        action += noise
        action = np.clip(action, self.action_low, self.action_high)
        return action

    def eval(self, i_iter, render=False):
        """Evaluate the current policy for one episode without noise."""
        self.policy_net.eval()
        self.value_net.eval()

        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            action = self.choose_action(state, 0)  # noise-free action
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter, step):
        """Interact with the environment for one episode and update."""
        self.policy_net.train()
        self.value_net.train()

        state = self.env.reset()
        episode_reward = 0
        while True:
            if self.render:
                self.env.render()
            action = self.choose_action(state, self.action_noise)
            next_state, reward, done, _ = self.env.step(action)
            mask = 0 if done else 1
            # transition: ('state', 'action', 'reward', 'next_state', 'mask')
            self.memory.push(state, action, reward, next_state, mask)

            episode_reward += reward

            if step >= self.min_update_step and step % self.update_step == 0:
                for _ in range(self.update_step):
                    # random sample batch
                    batch = self.memory.sample(self.batch_size)
                    self.update(batch)

            if done:
                break

            state = next_state

        self.env.close()
        print(f"Iter: {i_iter}, reward: {episode_reward}")
        # record reward information
        writer.add_scalar("ddpg/reward", episode_reward, i_iter)

    def update(self, batch):
        """Run one DDPG update on a sampled batch."""
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        # update by DDPG
        ddpg_step(self.policy_net, self.policy_net_target, self.value_net,
                  self.value_net_target, self.optimizer_p, self.optimizer_v,
                  batch_state, batch_action, batch_reward, batch_next_state,
                  batch_mask, self.gamma, self.polyak)

    def load(self, model_path):
        print(f"Loading Saved Model from {model_path}")
        self.policy_net, self.value_net = torch.load(model_path,
                                                     map_location=device)

    def save(self, save_path):
        """Save the policy and value networks."""
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        torch.save((self.policy_net, self.value_net),
                   f"{save_path}/WebEye_ddpg.pt")
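# `ddpg_step` is imported from the repository's utilities and is not shown
# in this file. Below is a minimal sketch of the standard DDPG update
# (Lillicrap et al., 2015) consistent with the arguments passed in
# `update`. The body, the helper name `ddpg_step_sketch`, and the
# convention that the value network takes the concatenated [state, action]
# tensor (implied by `Value(num_states + num_actions)`) are assumptions,
# not the repository's actual implementation.


def ddpg_step_sketch(policy_net, policy_net_target, value_net,
                     value_net_target, optimizer_p, optimizer_v,
                     states, actions, rewards, next_states, masks,
                     gamma, polyak):
    def q_input(s, a):
        # assumed convention: the Q-network takes concatenated [s, a]
        return torch.cat([s, a], dim=-1)

    rewards = rewards.view(-1, 1)
    masks = masks.view(-1, 1)

    # critic: regress Q(s, a) toward the one-step TD target computed with
    # the target networks; masks zero out the bootstrap at terminal states
    with torch.no_grad():
        next_actions = policy_net_target(next_states)
        target_q = rewards + gamma * masks * value_net_target(
            q_input(next_states, next_actions))
    value_loss = torch.nn.functional.mse_loss(
        value_net(q_input(states, actions)), target_q)
    optimizer_v.zero_grad()
    value_loss.backward()
    optimizer_v.step()

    # actor: maximize Q(s, pi(s)), i.e. minimize its negation
    policy_loss = -value_net(q_input(states, policy_net(states))).mean()
    optimizer_p.zero_grad()
    policy_loss.backward()
    optimizer_p.step()

    # Polyak averaging: target <- polyak * target + (1 - polyak) * online
    with torch.no_grad():
        for target, online in ((policy_net_target, policy_net),
                               (value_net_target, value_net)):
            for t_p, p in zip(target.parameters(), online.parameters()):
                t_p.data.mul_(polyak).add_(p.data, alpha=1 - polyak)

    return value_loss.item(), policy_loss.item()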
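# A minimal driver sketch for the DDPG agent above. The environment name,
# step schedule, iteration counts, and log directory are illustrative
# assumptions; the old gym API (env.seed, 4-tuple env.step) is assumed,
# matching the calls in `learn` and `eval`.


def run_ddpg_example():
    import gym
    from torch.utils.tensorboard import SummaryWriter

    env = gym.make("Pendulum-v0")  # any continuous-action env (assumption)
    agent = DDPG(env=env, render=False, action_noise=0.1, seed=1)
    writer = SummaryWriter(log_dir="runs/ddpg")  # hypothetical log dir

    for i_iter in range(1, 201):
        # `step` gates updates (>= min_update_step) and their frequency;
        # passing a running multiple of 1000 is one simple choice
        agent.learn(writer, i_iter, step=i_iter * 1000)
        if i_iter % 10 == 0:
            agent.eval(i_iter)
    agent.save("checkpoints")  # creates the directory if missing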
import math
import multiprocessing

import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

# Project-internal helpers assumed from this repository: JointPolicy,
# Discriminator, ExpertDataSet, estimate_advantages, ppo_step,
# trans_shape_func, and to_device.


class MAGAIL:
    def __init__(self, config, log_dir, exp_name):
        self.config = config
        self.exp_name = exp_name
        self.writer = SummaryWriter(log_dir=f"{log_dir}/{self.exp_name}")

        # seeding
        seed = self.config["general"]["seed"]
        torch.manual_seed(seed)
        np.random.seed(seed)

        self._load_expert_data()
        self._init_model()

    def _init_model(self):
        self.V = Value(num_states=self.config["value"]["num_states"],
                       num_hiddens=self.config["value"]["num_hiddens"],
                       drop_rate=self.config["value"]["drop_rate"],
                       activation=self.config["value"]["activation"])
        self.P = JointPolicy(
            initial_state=self.expert_dataset.state.to(device),
            config=self.config["jointpolicy"])
        self.D = Discriminator(
            num_states=self.config["discriminator"]["num_states"],
            num_actions=self.config["discriminator"]["num_actions"],
            num_hiddens=self.config["discriminator"]["num_hiddens"],
            drop_rate=self.config["discriminator"]["drop_rate"],
            use_noise=self.config["discriminator"]["use_noise"],
            noise_std=self.config["discriminator"]["noise_std"],
            activation=self.config["discriminator"]["activation"])

        print("Model Structure")
        print(self.P)
        print(self.V)
        print(self.D)
        print()

        self.optimizer_policy = optim.Adam(
            self.P.parameters(),
            lr=self.config["jointpolicy"]["learning_rate"])
        self.optimizer_value = optim.Adam(
            self.V.parameters(), lr=self.config["value"]["learning_rate"])
        self.optimizer_discriminator = optim.Adam(
            self.D.parameters(),
            lr=self.config["discriminator"]["learning_rate"])
        self.scheduler_discriminator = optim.lr_scheduler.StepLR(
            self.optimizer_discriminator, step_size=2000, gamma=0.95)

        self.discriminator_func = nn.BCELoss()

        to_device(self.V, self.P, self.D, self.discriminator_func)

    def _load_expert_data(self):
        num_expert_states = self.config["general"]["num_states"]
        num_expert_actions = self.config["general"]["num_actions"]
        expert_batch_size = self.config["general"]["expert_batch_size"]

        self.expert_dataset = ExpertDataSet(
            data_set_path=self.config["general"]["expert_data_path"],
            num_states=num_expert_states,
            num_actions=num_expert_actions)
        self.expert_data_loader = DataLoader(
            dataset=self.expert_dataset,
            batch_size=expert_batch_size,
            shuffle=True,
            num_workers=multiprocessing.cpu_count() // 2)

    def train(self, epoch):
        self.P.train()
        self.D.train()
        self.V.train()

        # collect generated batch
        gen_batch = self.P.collect_samples(
            self.config["ppo"]["sample_batch_size"])
        # batch: ('state', 'action', 'next_state', 'log_prob', 'mask')
        gen_batch_state = trans_shape_func(torch.stack(
            gen_batch.state))  # [trajectory length * parallel size, state size]
        gen_batch_action = trans_shape_func(torch.stack(
            gen_batch.action))  # [trajectory length * parallel size, action size]
        gen_batch_next_state = trans_shape_func(torch.stack(
            gen_batch.next_state))  # [trajectory length * parallel size, state size]
        gen_batch_old_log_prob = trans_shape_func(torch.stack(
            gen_batch.log_prob))  # [trajectory length * parallel size, 1]
        gen_batch_mask = trans_shape_func(torch.stack(
            gen_batch.mask))  # [trajectory length * parallel size, 1]

        # used only by the commented-out WGAN-GP variant below:
        # grad_collect_func = lambda d: torch.cat(
        #     [grad.view(-1) for grad in torch.autograd.grad(
        #         d, self.D.parameters(), retain_graph=True)]).unsqueeze(0)

        ####################################################
        # update discriminator
        ####################################################
        for expert_batch_state, expert_batch_action in self.expert_data_loader:
            gen_r = self.D(gen_batch_state, gen_batch_action)
            expert_r = self.D(expert_batch_state.to(device),
                              expert_batch_action.to(device))

            # label smoothing for discriminator
            expert_labels = torch.ones_like(expert_r)
            gen_labels = torch.zeros_like(gen_r)

            if self.config["discriminator"]["use_label_smoothing"]:
                smoothing_rate = self.config["discriminator"][
                    "label_smooth_rate"]
                expert_labels *= (1 - smoothing_rate)
                gen_labels += smoothing_rate

            e_loss = self.discriminator_func(expert_r, expert_labels)
            g_loss = self.discriminator_func(gen_r, gen_labels)
            d_loss = e_loss + g_loss

            # """ WGAN with Gradient Penalty """ (alternative objective,
            # kept for reference)
            # d_loss = gen_r.mean() - expert_r.mean()
            # differences_batch_state = gen_batch_state[:expert_batch_state.size(0)] - expert_batch_state
            # differences_batch_action = gen_batch_action[:expert_batch_action.size(0)] - expert_batch_action
            # alpha = torch.rand(expert_batch_state.size(0), 1)
            # interpolates_batch_state = gen_batch_state[:expert_batch_state.size(0)] + (alpha * differences_batch_state)
            # interpolates_batch_action = gen_batch_action[:expert_batch_action.size(0)] + (alpha * differences_batch_action)
            # gradients = torch.cat([x for x in map(grad_collect_func, self.D(interpolates_batch_state, interpolates_batch_action))])
            # slopes = torch.norm(gradients, p=2, dim=-1)
            # gradient_penalty = torch.mean((slopes - 1.) ** 2)
            # d_loss += 10 * gradient_penalty

            self.optimizer_discriminator.zero_grad()
            d_loss.backward()
            self.optimizer_discriminator.step()

        self.scheduler_discriminator.step()

        # log discriminator statistics for this epoch
        self.writer.add_scalar('train/loss/d_loss', d_loss.item(), epoch)
        self.writer.add_scalar("train/loss/e_loss", e_loss.item(), epoch)
        self.writer.add_scalar("train/loss/g_loss", g_loss.item(), epoch)
        self.writer.add_scalar('train/reward/expert_r',
                               expert_r.mean().item(), epoch)
        self.writer.add_scalar('train/reward/gen_r',
                               gen_r.mean().item(), epoch)

        with torch.no_grad():
            gen_batch_value = self.V(gen_batch_state)
            gen_batch_reward = self.D(gen_batch_state, gen_batch_action)
            gen_batch_advantage, gen_batch_return = estimate_advantages(
                gen_batch_reward, gen_batch_mask, gen_batch_value,
                self.config["gae"]["gamma"], self.config["gae"]["tau"],
                self.config["jointpolicy"]["trajectory_length"])

        ####################################################
        # update policy by ppo [mini_batch]
        ####################################################
        ppo_optim_epochs = self.config["ppo"]["ppo_optim_epochs"]
        ppo_mini_batch_size = self.config["ppo"]["ppo_mini_batch_size"]
        gen_batch_size = gen_batch_state.shape[0]
        optim_iter_num = int(math.ceil(gen_batch_size / ppo_mini_batch_size))

        for _ in range(ppo_optim_epochs):
            perm = torch.randperm(gen_batch_size)

            for i in range(optim_iter_num):
                ind = perm[i * ppo_mini_batch_size:
                           min((i + 1) * ppo_mini_batch_size,
                               gen_batch_size)]
                mini_batch_state = gen_batch_state[ind]
                mini_batch_action = gen_batch_action[ind]
                mini_batch_next_state = gen_batch_next_state[ind]
                mini_batch_advantage = gen_batch_advantage[ind]
                mini_batch_return = gen_batch_return[ind]
                mini_batch_old_log_prob = gen_batch_old_log_prob[ind]

                v_loss, p_loss = ppo_step(
                    self.P,
                    self.V,
                    self.optimizer_policy,
                    self.optimizer_value,
                    states=mini_batch_state,
                    actions=mini_batch_action,
                    next_states=mini_batch_next_state,
                    returns=mini_batch_return,
                    old_log_probs=mini_batch_old_log_prob,
                    advantages=mini_batch_advantage,
                    ppo_clip_ratio=self.config["ppo"]["clip_ratio"],
                    value_l2_reg=self.config["value"]["l2_reg"])

                self.writer.add_scalar('train/loss/p_loss', p_loss, epoch)
                self.writer.add_scalar('train/loss/v_loss', v_loss, epoch)

        print(f" Training episode:{epoch} ".center(80, "#"))
        print('gen_r:', gen_r.mean().item())
        print('expert_r:', expert_r.mean().item())
        print('d_loss', d_loss.item())

    def eval(self, epoch):
        self.P.eval()
        self.D.eval()
        self.V.eval()

        gen_batch = self.P.collect_samples(
            self.config["ppo"]["sample_batch_size"])
        gen_batch_state = torch.stack(gen_batch.state)
        gen_batch_action = torch.stack(gen_batch.action)

        gen_r = self.D(gen_batch_state, gen_batch_action)
        for expert_batch_state, expert_batch_action in self.expert_data_loader:
            expert_r = self.D(expert_batch_state.to(device),
                              expert_batch_action.to(device))

            print(f" Evaluating episode:{epoch} ".center(80, "-"))
            print('validate_gen_r:', gen_r.mean().item())
            print('validate_expert_r:', expert_r.mean().item())

        self.writer.add_scalar("validate/reward/gen_r",
                               gen_r.mean().item(), epoch)
        self.writer.add_scalar("validate/reward/expert_r",
                               expert_r.mean().item(), epoch)

    def save_model(self, save_path):
        """Save the discriminator, joint policy, and value networks as
        separate files."""
        if not os.path.exists(save_path):
            os.mkdir(save_path)
        torch.save(self.D, f"{save_path}/{self.exp_name}_Discriminator.pt")
        torch.save(self.P, f"{save_path}/{self.exp_name}_JointPolicy.pt")
        torch.save(self.V, f"{save_path}/{self.exp_name}_Value.pt")

    def load_model(self, model_path):
        """Load the three sub-models saved by `save_model`; `model_path`
        is the common path prefix."""
        self.D = torch.load(f"{model_path}_Discriminator.pt",
                            map_location=device)
        self.P = torch.load(f"{model_path}_JointPolicy.pt",
                            map_location=device)
        self.V = torch.load(f"{model_path}_Value.pt",
                            map_location=device)
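# `estimate_advantages` is imported from the repository's utilities. Below
# is a minimal sketch of generalized advantage estimation (GAE; Schulman
# et al., 2015) consistent with the call in `train`. The reshaping
# convention (trajectories of length `trajectory_length` stacked along the
# batch axis) and the helper name are assumptions, not the repository's
# actual implementation.


def estimate_advantages_sketch(rewards, masks, values, gamma, tau,
                               trajectory_length):
    # reshape [T * P, 1] tensors into [T, P, 1] so the backward recursion
    # runs over time within each of the P parallel trajectories
    T = trajectory_length
    rewards, masks, values = (x.view(T, -1, 1)
                              for x in (rewards, masks, values))

    advantages = torch.zeros_like(rewards)
    gae = torch.zeros_like(rewards[0])
    # value after the final step is taken as zero; with terminal masks of
    # zero this bootstrap term vanishes anyway
    next_value = torch.zeros_like(values[0])
    for t in reversed(range(T)):
        # delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        delta = rewards[t] + gamma * next_value * masks[t] - values[t]
        gae = delta + gamma * tau * masks[t] * gae
        advantages[t] = gae
        next_value = values[t]

    returns = advantages + values
    # normalize advantages, a common practice in PPO implementations
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages.view(-1, 1), returns.view(-1, 1)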
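# A minimal driver sketch for MAGAIL. The config path, epoch counts,
# directory names, and the "num_epochs" key are illustrative assumptions;
# the config dictionary must contain the "general", "value", "jointpolicy",
# "discriminator", "ppo", and "gae" sections referenced above.


def run_magail_example():
    import yaml  # assumption: the config is stored as YAML

    with open("config/magail.yml") as f:  # hypothetical config path
        config = yaml.safe_load(f)

    magail = MAGAIL(config=config, log_dir="runs", exp_name="WebEye_magail")
    for epoch in range(1, config["general"]["num_epochs"] + 1):
        magail.train(epoch)
        if epoch % 10 == 0:
            magail.eval(epoch)
            magail.save_model("checkpoints")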