def _init_model(self):
    """Create the environment, networks, sample collector and optimizer.

    Reads ``self.env_id``, ``self.seed``, ``self.model_path``,
    ``self.render``, ``self.num_process`` and ``self.lr_ac``.
    Sets ``self.env``, ``self.policy_net``, ``self.value_net``,
    ``self.ac_net``, ``self.running_state``, ``self.collector`` and
    ``self.optimizer_ac``.
    """
    self.env, env_continuous, num_states, num_actions = get_env_info(
        self.env_id)

    # seeding
    torch.manual_seed(self.seed)
    self.env.seed(self.seed)

    # the policy class depends on whether get_env_info reports a
    # continuous action space
    if env_continuous:
        self.policy_net = Policy(num_states, num_actions).to(device)
    else:
        self.policy_net = DiscretePolicy(num_states, num_actions).to(device)

    self.value_net = Value(num_states).to(device)
    self.ac_net = Actor_Critic(self.policy_net, self.value_net).to(device)
    self.running_state = ZFilter((num_states, ), clip=5)

    if self.model_path:
        print("Loading Saved Model {}_a2c.p".format(self.env_id))
        checkpoint = '{}/{}_a2c.p'.format(self.model_path, self.env_id)
        # NOTE: unpickling can execute arbitrary code; only load
        # checkpoints from a trusted source.
        # The context manager closes the file even if unpickling fails.
        with open(checkpoint, "rb") as checkpoint_file:
            self.ac_net, self.running_state = pickle.load(checkpoint_file)

    # collector and optimizer are built last so they reference the loaded
    # network when a checkpoint was given
    self.collector = MemoryCollector(self.env,
                                     self.ac_net,
                                     render=self.render,
                                     running_state=self.running_state,
                                     num_process=self.num_process)

    self.optimizer_ac = optim.Adam(self.ac_net.parameters(), lr=self.lr_ac)
def _init_model(self):
    """Create the environment, networks, sample collector and optimizers.

    Reads ``self.config``, ``self.env_id``, ``self.expert_dataset``,
    ``self.render`` and ``self.num_process``.  The ``"value"``,
    ``"policy"`` and ``"discriminator"`` sections of ``self.config`` are
    updated in place with the ``dim_state`` / ``dim_action`` reported by
    ``get_env_info``.

    Raises:
        AssertionError: if the environment dimensions differ from those of
            the expert dataset.
    """
    # seeding
    seed = self.config["train"]["general"]["seed"]
    torch.manual_seed(seed)
    np.random.seed(seed)

    self.env, env_continuous, num_states, num_actions = get_env_info(
        self.env_id)
    # seed the environment too, as the other agents in this file do;
    # otherwise the configured seed does not cover environment randomness
    self.env.seed(seed)

    # check env
    assert num_states == self.expert_dataset.num_states and num_actions == self.expert_dataset.num_actions, \
        "Expected corresponding expert dataset and env"

    dim_dict = {"dim_state": num_states, "dim_action": num_actions}
    for section in ("value", "policy", "discriminator"):
        self.config[section].update(dim_dict)

    value_cfg = self.config["value"]
    self.value = Value(dim_state=value_cfg["dim_state"],
                       dim_hidden=value_cfg["dim_hidden"],
                       activation=resolve_activate_function(
                           value_cfg["activation"]))

    self.policy = Policy(config=self.config["policy"])

    disc_cfg = self.config["discriminator"]
    self.discriminator = Discriminator(
        dim_state=disc_cfg["dim_state"],
        dim_action=disc_cfg["dim_action"],
        dim_hidden=disc_cfg["dim_hidden"],
        activation=resolve_activate_function(disc_cfg["activation"]))

    self.discriminator_func = nn.BCELoss()
    # no state filter is used by this agent
    self.running_state = None

    self.collector = MemoryCollector(self.env,
                                     self.policy,
                                     render=self.render,
                                     running_state=self.running_state,
                                     num_process=self.num_process)

    print("Model Structure")
    print(self.policy)
    print(self.value)
    print(self.discriminator)
    print()

    self.optimizer_policy = optim.Adam(
        self.policy.parameters(), lr=self.config["policy"]["learning_rate"])
    self.optimizer_value = optim.Adam(
        self.value.parameters(), lr=self.config["value"]["learning_rate"])
    self.optimizer_discriminator = optim.Adam(
        self.discriminator.parameters(),
        lr=self.config["discriminator"]["learning_rate"])

    to_device(self.value, self.policy, self.discriminator,
              self.discriminator_func)
class PPO:
    """PPO agent: owns the environment, networks, sampler and optimizers.

    Each ``learn`` call collects a batch with ``policy_net_old``, runs
    ``ppo_epochs`` full-batch ``ppo_step`` updates on ``policy_net`` /
    ``value_net`` and then copies the new weights into ``policy_net_old``.
    """

    def __init__(self,
                 env_id,
                 render=False,
                 num_process=4,
                 min_batch_size=2048,
                 lr_p=3e-4,
                 lr_v=3e-4,
                 gamma=0.99,
                 tau=0.95,
                 clip_epsilon=0.2,
                 ppo_epochs=10,
                 model_path=None,
                 seed=1):
        """Store the hyper-parameters and build the model.

        Args:
            env_id: environment id handed to ``get_env_info``.
            render: forwarded to ``MemoryCollector``.
            num_process: forwarded to ``MemoryCollector``.
            min_batch_size: sample count requested per ``learn`` call.
            lr_p: Adam learning rate of the policy network.
            lr_v: Adam learning rate of the value network.
            gamma: forwarded to ``estimate_advantages``.
            tau: forwarded to ``estimate_advantages``.
            clip_epsilon: forwarded to ``ppo_step``.
            ppo_epochs: number of ``ppo_step`` calls per ``learn`` call.
            model_path: directory holding ``<env_id>_ppo.p`` to resume
                from, or ``None`` to start from fresh networks.
            seed: seed for torch and the environment.
        """
        self.env_id = env_id
        self.gamma = gamma
        self.tau = tau
        self.ppo_epochs = ppo_epochs
        self.clip_epsilon = clip_epsilon
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.min_batch_size = min_batch_size

        self.model_path = model_path
        self.seed = seed
        self._init_model()

    def _init_model(self):
        """Create the environment, networks, sample collector and optimizers."""
        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id)

        # seeding
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        policy_cls = Policy if env_continuous else DiscretePolicy
        # current policy
        self.policy_net = policy_cls(num_states,
                                     num_actions).double().to(device)
        # old policy (the one used for sampling)
        self.policy_net_old = policy_cls(num_states,
                                         num_actions).double().to(device)

        self.value_net = Value(num_states).double().to(device)
        self.running_state = ZFilter((num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_ppo.p".format(self.env_id))
            checkpoint = '{}/{}_ppo.p'.format(self.model_path, self.env_id)
            # NOTE: unpickling can execute arbitrary code; only load
            # checkpoints from a trusted source.
            # The context manager closes the file even if unpickling fails.
            with open(checkpoint, "rb") as checkpoint_file:
                (self.policy_net, self.value_net,
                 self.running_state) = pickle.load(checkpoint_file)

        # Both policies must start from identical weights, whether they were
        # freshly initialised or restored from a checkpoint.
        self.policy_net_old.load_state_dict(self.policy_net.state_dict())

        self.collector = MemoryCollector(self.env,
                                         self.policy_net_old,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)

    def choose_action(self, state):
        """Query the current policy for one state.

        Args:
            state: a single state; it is converted with ``DOUBLE`` and given
                a leading dimension before being passed to the network.

        Returns:
            The ``(action, log_prob)`` pair produced by
            ``policy_net.get_action_log_prob`` with gradients disabled.
        """
        state = DOUBLE(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.policy_net.get_action_log_prob(state)
        return action, log_prob

    def eval(self, i_iter, render=True):
        """Run one episode with the current policy and print its reward.

        The environment is closed when the episode ends.

        Args:
            i_iter: iteration number, used only in the printed message.
            render: call ``env.render()`` before every step.  Defaults to
                ``True``, which is what this method always did.
        """
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)

            action, _ = self.choose_action(state)
            action = action.cpu().numpy()[0]
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """Collect one batch and update the policy and value networks.

        Args:
            writer: summary writer; only ``add_scalars`` is used.
            i_iter: iteration number used for printing and as logging step.

        Returns:
            ``(v_loss, p_loss)`` from the last ``ppo_step`` call, or two
            uninitialised one-element tensors when ``ppo_epochs`` is 0.
        """
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalars(
            "ppo", {
                "total reward": log['total_reward'],
                "average reward": log['avg_reward'],
                "min reward": log['min_episode_reward'],
                "max reward": log['max_episode_reward'],
                "num steps": log['num_steps']
            }, i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = DOUBLE(batch.state).to(device)
        batch_action = DOUBLE(batch.action).to(device)
        batch_reward = DOUBLE(batch.reward).to(device)
        batch_mask = DOUBLE(batch.mask).to(device)
        batch_log_prob = DOUBLE(batch.log_prob).to(device)

        with torch.no_grad():
            batch_value = self.value_net(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)

        v_loss, p_loss = torch.empty(1), torch.empty(1)
        for _ in range(self.ppo_epochs):
            v_loss, p_loss = ppo_step(self.policy_net, self.value_net,
                                      self.optimizer_p, self.optimizer_v, 1,
                                      batch_state, batch_action,
                                      batch_return, batch_advantage,
                                      batch_log_prob, self.clip_epsilon,
                                      1e-3)

        # the sampling policy follows the freshly updated one
        self.policy_net_old.load_state_dict(self.policy_net.state_dict())
        return v_loss, p_loss

    def save(self, save_path):
        """Pickle the networks and state filter to ``<save_path>/<env_id>_ppo.p``."""
        check_path(save_path)
        checkpoint = '{}/{}_ppo.p'.format(save_path, self.env_id)
        # closing the file explicitly guarantees the checkpoint is flushed
        with open(checkpoint, 'wb') as checkpoint_file:
            pickle.dump(
                (self.policy_net, self.value_net, self.running_state),
                checkpoint_file)
class A2C():
    """A2C agent: owns the environment, actor-critic network, sampler and optimizer.

    Each ``learn`` call collects a batch with ``ac_net`` and performs one
    ``a2c_step`` update on it.
    """

    def __init__(self,
                 env_id,
                 render=False,
                 num_process=4,
                 min_batch_size=2048,
                 lr_ac=3e-4,
                 value_net_coeff=0.5,
                 entropy_coeff=1e-2,
                 gamma=0.99,
                 tau=0.95,
                 seed=1,
                 model_path=None):
        """Store the hyper-parameters and build the model.

        Args:
            env_id: environment id handed to ``get_env_info``.
            render: forwarded to ``MemoryCollector``.
            num_process: forwarded to ``MemoryCollector``.
            min_batch_size: sample count requested per ``learn`` call.
            lr_ac: Adam learning rate of the actor-critic network.
            value_net_coeff: forwarded to ``a2c_step``.
            entropy_coeff: forwarded to ``a2c_step``.
            gamma: forwarded to ``estimate_advantages``.
            tau: forwarded to ``estimate_advantages``.
            seed: seed for torch and the environment.
            model_path: directory holding ``<env_id>_a2c.p`` to resume
                from, or ``None`` to start from fresh networks.
        """
        self.env_id = env_id
        self.gamma = gamma
        self.tau = tau
        self.render = render
        self.num_process = num_process
        self.lr_ac = lr_ac
        self.value_net_coeff = value_net_coeff
        self.entropy_coeff = entropy_coeff
        self.min_batch_size = min_batch_size

        self.model_path = model_path
        self.seed = seed
        self._init_model()

    def _init_model(self):
        """Create the environment, networks, sample collector and optimizer."""
        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id)

        # seeding
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        if env_continuous:
            self.policy_net = Policy(num_states, num_actions).to(device)
        else:
            self.policy_net = DiscretePolicy(num_states,
                                             num_actions).to(device)

        self.value_net = Value(num_states).to(device)
        self.ac_net = Actor_Critic(self.policy_net,
                                   self.value_net).to(device)
        self.running_state = ZFilter((num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_a2c.p".format(self.env_id))
            checkpoint = '{}/{}_a2c.p'.format(self.model_path, self.env_id)
            # NOTE: unpickling can execute arbitrary code; only load
            # checkpoints from a trusted source.
            # The context manager closes the file even if unpickling fails.
            with open(checkpoint, "rb") as checkpoint_file:
                self.ac_net, self.running_state = pickle.load(
                    checkpoint_file)

        self.collector = MemoryCollector(self.env,
                                         self.ac_net,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_ac = optim.Adam(self.ac_net.parameters(),
                                       lr=self.lr_ac)

    def choose_action(self, state):
        """Query the actor-critic network for one state.

        Args:
            state: a single state; it is converted with ``FLOAT`` and given
                a leading dimension before being passed to the network.

        Returns:
            The ``(action, log_prob)`` pair produced by
            ``ac_net.get_action_log_prob`` with gradients disabled.
        """
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.ac_net.get_action_log_prob(state)
        return action, log_prob

    def eval(self, i_iter, render=False):
        """Run one episode with the current network and print its reward.

        The environment is closed when the episode ends.

        Args:
            i_iter: iteration number, used only in the printed message.
            render: call ``env.render()`` before every step.
        """
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)

            action, _ = self.choose_action(state)
            action = action.cpu().numpy()[0]
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """Collect one batch and apply a single ``a2c_step`` update.

        Args:
            writer: summary writer; only ``add_scalar`` is used.
            i_iter: iteration number used for printing and as logging step.

        Returns:
            Whatever ``a2c_step`` returns.
        """
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalar("total reward", log['total_reward'], i_iter)
        writer.add_scalar("average reward", log['avg_reward'], i_iter)
        writer.add_scalar("min reward", log['min_episode_reward'], i_iter)
        writer.add_scalar("max reward", log['max_episode_reward'], i_iter)
        writer.add_scalar("num steps", log['num_steps'], i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_mask = FLOAT(batch.mask).to(device)

        with torch.no_grad():
            batch_value = self.ac_net.get_value(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)

        alg_step_stats = a2c_step(self.ac_net, self.optimizer_ac,
                                  batch_state, batch_action, batch_return,
                                  batch_advantage, self.value_net_coeff,
                                  self.entropy_coeff)

        return alg_step_stats

    def save(self, save_path):
        """Pickle the network and state filter to ``<save_path>/<env_id>_a2c.p``."""
        check_path(save_path)
        checkpoint = '{}/{}_a2c.p'.format(save_path, self.env_id)
        # closing the file explicitly guarantees the checkpoint is flushed
        with open(checkpoint, 'wb') as checkpoint_file:
            pickle.dump((self.ac_net, self.running_state), checkpoint_file)
num_iters = 2000 env = gym.make(env_id) # env = env.unwrapped num_states = env.observation_space.shape[0] if type(env.action_space) == Discrete: num_actions = env.action_space.n else: num_actions = env.action_space.shape[0] actor = ActorContinuous(num_states, num_actions).double().to(device) critic = Critic(num_states).double().to(device) running_state = ZFilter((num_states,), clip=5) agent = MemoryCollector(env, actor, running_state=running_state, num_process=4) opt_p = opt.Adam(actor.parameters(), lr=lr) opt_v = opt.Adam(critic.parameters(), lr=lr) def train(memory): batch = memory.sample() batch_states = DOUBLE(batch.state).to(device) batch_actions = DOUBLE(batch.action).to(device) batch_log_probs = DOUBLE(batch.log_prob).to(device) batch_masks = DOUBLE(batch.mask).to(device) batch_rewards = DOUBLE(batch.reward).to(device) batch_size = batch_states.shape[0] with torch.no_grad():
class GAIL:
    """Adversarial imitation agent: a discriminator plus a PPO-updated policy.

    Each ``learn`` call samples a batch with the policy, optionally trains
    the discriminator to separate expert pairs from generated pairs, and
    then updates policy and value networks with ``ppo_step`` using rewards
    derived from the discriminator output.
    """

    def __init__(self,
                 render=False,
                 num_process=4,
                 config=None,
                 expert_data_path=None,
                 env_id=None):
        """Store the settings, load the expert data and build the model.

        Args:
            render: forwarded to ``MemoryCollector``.
            num_process: forwarded to ``MemoryCollector``.
            config: nested dict with the sections ``"expert_data"``,
                ``"train"``, ``"value"``, ``"policy"`` and
                ``"discriminator"`` read by this class.
            expert_data_path: forwarded to ``ExpertDataset``.
            env_id: environment id handed to ``get_env_info``.
        """
        self.render = render
        self.env_id = env_id
        self.num_process = num_process
        self.expert_data_path = expert_data_path
        self.config = config

        self._load_expert_trajectory()
        self._init_model()

    def _load_expert_trajectory(self):
        """Build ``self.expert_dataset`` from the ``"expert_data"`` config section."""
        expert_cfg = self.config["expert_data"]
        self.expert_dataset = ExpertDataset(
            expert_data_path=self.expert_data_path,
            train_fraction=expert_cfg["train_fraction"],
            traj_limitation=expert_cfg["traj_limitation"],
            shuffle=expert_cfg["shuffle"],
            batch_size=expert_cfg["batch_size"])

    def _init_model(self):
        """Create the environment, networks, sample collector and optimizers.

        The ``"value"``, ``"policy"`` and ``"discriminator"`` config
        sections are updated in place with the environment's ``dim_state``
        and ``dim_action``.

        Raises:
            AssertionError: if the environment dimensions differ from those
                of the expert dataset.
        """
        # seeding
        seed = self.config["train"]["general"]["seed"]
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id)
        # seed the environment too, as the other agents in this file do;
        # otherwise the configured seed does not cover environment randomness
        self.env.seed(seed)

        # check env
        assert num_states == self.expert_dataset.num_states and num_actions == self.expert_dataset.num_actions, \
            "Expected corresponding expert dataset and env"

        dim_dict = {"dim_state": num_states, "dim_action": num_actions}
        for section in ("value", "policy", "discriminator"):
            self.config[section].update(dim_dict)

        value_cfg = self.config["value"]
        self.value = Value(dim_state=value_cfg["dim_state"],
                           dim_hidden=value_cfg["dim_hidden"],
                           activation=resolve_activate_function(
                               value_cfg["activation"]))

        self.policy = Policy(config=self.config["policy"])

        disc_cfg = self.config["discriminator"]
        self.discriminator = Discriminator(
            dim_state=disc_cfg["dim_state"],
            dim_action=disc_cfg["dim_action"],
            dim_hidden=disc_cfg["dim_hidden"],
            activation=resolve_activate_function(disc_cfg["activation"]))

        self.discriminator_func = nn.BCELoss()
        # no state filter is used by this agent
        self.running_state = None

        self.collector = MemoryCollector(self.env,
                                         self.policy,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        print("Model Structure")
        print(self.policy)
        print(self.value)
        print(self.discriminator)
        print()

        self.optimizer_policy = optim.Adam(
            self.policy.parameters(),
            lr=self.config["policy"]["learning_rate"])
        self.optimizer_value = optim.Adam(
            self.value.parameters(),
            lr=self.config["value"]["learning_rate"])
        self.optimizer_discriminator = optim.Adam(
            self.discriminator.parameters(),
            lr=self.config["discriminator"]["learning_rate"])

        to_device(self.value, self.policy, self.discriminator,
                  self.discriminator_func)

    def choose_action(self, state):
        """Query the policy for one state.

        Args:
            state: a single state; it is converted with ``FLOAT`` and given
                a leading dimension before being passed to the network.

        Returns:
            The ``(action, log_prob)`` pair produced by
            ``policy.get_action_log_prob`` with gradients disabled.
        """
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.policy.get_action_log_prob(state)
        return action, log_prob

    def _update_discriminator(self, gen_batch_state, gen_batch_action,
                              max_steps):
        """Train the discriminator on at most ``max_steps`` expert batches.

        Returns:
            A dict of Python floats describing the last optimisation step,
            or ``None`` when no step was executed (empty loader or
            ``max_steps <= 0``).
        """
        ent_coeff = self.config["train"]["discriminator"]["ent_coeff"]
        last = None

        for step, (expert_batch_state, expert_batch_action) in enumerate(
                self.expert_dataset.train_loader):
            if step >= max_steps:
                break

            # calculate probs and logits
            gen_prob, gen_logits = self.discriminator(
                gen_batch_state, gen_batch_action)
            expert_prob, expert_logits = self.discriminator(
                expert_batch_state.to(device), expert_batch_action.to(device))

            # calculate accuracy: generated pairs should score below 0.5,
            # expert pairs above 0.5
            gen_acc = torch.mean((gen_prob < 0.5).float())
            expert_acc = torch.mean((expert_prob > 0.5).float())

            # calculate regression loss (expert -> 1, generated -> 0)
            expert_labels = torch.ones_like(expert_prob)
            gen_labels = torch.zeros_like(gen_prob)
            e_loss = self.discriminator_func(expert_prob,
                                             target=expert_labels)
            g_loss = self.discriminator_func(gen_prob, target=gen_labels)
            d_loss = e_loss + g_loss

            # calculate entropy loss
            logits = torch.cat([gen_logits, expert_logits], 0)
            entropy = ((1. - torch.sigmoid(logits)) * logits -
                       torch.nn.functional.logsigmoid(logits)).mean()
            entropy_loss = -ent_coeff * entropy

            total_loss = d_loss + entropy_loss
            self.optimizer_discriminator.zero_grad()
            total_loss.backward()
            self.optimizer_discriminator.step()

            last = (d_loss, e_loss, g_loss, entropy, expert_acc, gen_acc,
                    gen_prob, expert_prob)

        if last is None:
            return None

        (d_loss, e_loss, g_loss, entropy, expert_acc, gen_acc, gen_prob,
         expert_prob) = last
        return {
            "d_loss": d_loss.item(),
            "e_loss": e_loss.item(),
            "g_loss": g_loss.item(),
            "entropy": entropy.item(),
            "expert_acc": expert_acc.item(),
            "gen_acc": gen_acc.item(),
            "gen_prob": gen_prob.mean().item(),
            "expert_prob": expert_prob.mean().item(),
        }

    @staticmethod
    def _log_discriminator_stats(writer, stats, i_iter):
        """Write the discriminator statistics of one iteration to ``writer``."""
        writer.add_scalar('discriminator/d_loss', stats["d_loss"], i_iter)
        writer.add_scalar("discriminator/e_loss", stats["e_loss"], i_iter)
        writer.add_scalar("discriminator/g_loss", stats["g_loss"], i_iter)
        writer.add_scalar("discriminator/ent", stats["entropy"], i_iter)
        # each accuracy goes under its own tag (they used to be swapped)
        writer.add_scalar('discriminator/expert_acc', stats["expert_acc"],
                          i_iter)
        writer.add_scalar('discriminator/gen_acc', stats["gen_acc"], i_iter)

    def _run_ppo_step(self, states, actions, returns, advantages,
                      old_log_probs):
        """Call ``ppo_step`` on the given tensors with the configured settings."""
        return ppo_step(
            policy_net=self.policy,
            value_net=self.value,
            optimizer_policy=self.optimizer_policy,
            optimizer_value=self.optimizer_value,
            optim_value_iternum=self.config["value"]["optim_value_iter"],
            states=states,
            actions=actions,
            returns=returns,
            old_log_probs=old_log_probs,
            advantages=advantages,
            clip_epsilon=self.config["train"]["generator"]["clip_ratio"],
            l2_reg=self.config["value"]["l2_reg"])

    def learn(self, writer, i_iter):
        """Run one training iteration (sampling, discriminator, PPO update).

        The discriminator is only trained when ``i_iter`` is a multiple of
        ``config["train"]["discriminator"]["optim_step"]``; its statistics
        are logged and printed only for iterations in which it was actually
        updated.

        Args:
            writer: summary writer; only ``add_scalar`` is used.
            i_iter: iteration number used for printing and as logging step.
        """
        gen_cfg = self.config["train"]["generator"]
        memory, log = self.collector.collect_samples(
            gen_cfg["sample_batch_size"])

        self.policy.train()
        self.value.train()
        self.discriminator.train()

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalar("gail/average reward", log['avg_reward'], i_iter)
        writer.add_scalar("gail/num steps", log['num_steps'], i_iter)

        # collect generated batch
        gen_batch = memory.sample()
        gen_batch_state = FLOAT(gen_batch.state).to(device)
        gen_batch_action = FLOAT(gen_batch.action).to(device)
        gen_batch_old_log_prob = FLOAT(gen_batch.log_prob).to(device)
        gen_batch_mask = FLOAT(gen_batch.mask).to(device)

        ####################################################
        # update discriminator
        ####################################################
        d_stats = None
        d_optim_i_iters = self.config["train"]["discriminator"]["optim_step"]
        if i_iter % d_optim_i_iters == 0:
            d_stats = self._update_discriminator(gen_batch_state,
                                                 gen_batch_action,
                                                 d_optim_i_iters)
        if d_stats is not None:
            self._log_discriminator_stats(writer, d_stats, i_iter)

        ####################################################
        # update policy by ppo [mini_batch]
        ####################################################
        with torch.no_grad():
            gen_batch_value = self.value(gen_batch_state)
            d_out, _ = self.discriminator(gen_batch_state, gen_batch_action)
            # reward derived from the discriminator output; 1e-6 keeps the
            # logarithm finite when d_out reaches 1
            gen_batch_reward = -torch.log(1 - d_out + 1e-6)

        gen_batch_advantage, gen_batch_return = estimate_advantages(
            gen_batch_reward, gen_batch_mask, gen_batch_value,
            gen_cfg["gamma"], gen_cfg["tau"])

        ppo_optim_i_iters = gen_cfg["optim_step"]
        ppo_mini_batch_size = gen_cfg["mini_batch_size"]
        v_loss = p_loss = ent_loss = None

        for _ in range(ppo_optim_i_iters):
            if ppo_mini_batch_size > 0:
                gen_batch_size = gen_batch_state.shape[0]
                perm = torch.randperm(gen_batch_size)
                # consecutive chunks of the permutation; the last chunk may
                # be shorter than ppo_mini_batch_size
                for start in range(0, gen_batch_size, ppo_mini_batch_size):
                    ind = perm[start:start + ppo_mini_batch_size]
                    v_loss, p_loss, ent_loss = self._run_ppo_step(
                        states=gen_batch_state[ind],
                        actions=gen_batch_action[ind],
                        returns=gen_batch_return[ind],
                        advantages=gen_batch_advantage[ind],
                        old_log_probs=gen_batch_old_log_prob[ind])
            else:
                v_loss, p_loss, ent_loss = self._run_ppo_step(
                    states=gen_batch_state,
                    actions=gen_batch_action,
                    returns=gen_batch_return,
                    advantages=gen_batch_advantage,
                    old_log_probs=gen_batch_old_log_prob)

        # nothing to log when no PPO step ran in this iteration
        if v_loss is not None:
            writer.add_scalar('generator/p_loss', p_loss, i_iter)
            writer.add_scalar('generator/v_loss', v_loss, i_iter)
            writer.add_scalar('generator/ent_loss', ent_loss, i_iter)

        print(f" Training episode:{i_iter} ".center(80, "#"))
        if d_stats is not None:
            print('d_gen_prob:', d_stats["gen_prob"])
            print('d_expert_prob:', d_stats["expert_prob"])
            print('d_loss:', d_stats["d_loss"])
            print('e_loss:', d_stats["e_loss"])
            print("d/bernoulli_entropy:", d_stats["entropy"])

    def eval(self, i_iter, render=False):
        """Run one episode with the networks in eval mode and print its reward.

        The environment is closed when the episode ends.

        Args:
            i_iter: iteration number, used only in the printed message.
            render: call ``env.render()`` before every step.
        """
        self.policy.eval()
        self.value.eval()
        self.discriminator.eval()

        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            if self.running_state:
                state = self.running_state(state)

            action, _ = self.choose_action(state)
            action = action.cpu().numpy()[0]
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def save_model(self, save_path):
        """Save the three networks as ``<save_path>/<env_id>_<Name>.pt``."""
        check_path(save_path)
        torch.save(self.discriminator,
                   f"{save_path}/{self.env_id}_Discriminator.pt")
        torch.save(self.policy, f"{save_path}/{self.env_id}_Policy.pt")
        torch.save(self.value, f"{save_path}/{self.env_id}_Value.pt")

    def load_model(self, model_path):
        """Load the three networks from ``<model_path>_<Name>.pt``.

        ``model_path`` is a file-name prefix rather than a directory: to
        read files written by ``save_model`` pass
        ``f"{save_path}/{env_id}"``.
        """
        # NOTE: torch.load of whole modules unpickles arbitrary objects;
        # only load checkpoints from a trusted source.
        self.discriminator = torch.load(f"{model_path}_Discriminator.pt",
                                        map_location=device)
        self.policy = torch.load(f"{model_path}_Policy.pt",
                                 map_location=device)
        self.value = torch.load(f"{model_path}_Value.pt",
                                map_location=device)
class PPO_Minibatch:
    """PPO agent that updates on shuffled mini-batches of each collected batch.

    Each ``learn`` call collects a batch with ``policy_net`` and then, for
    ``ppo_epochs`` passes, applies ``ppo_step`` to consecutive chunks of a
    random permutation of that batch.
    """

    def __init__(self,
                 env_id,
                 render=False,
                 num_process=4,
                 min_batch_size=2048,
                 lr_p=3e-4,
                 lr_v=3e-4,
                 gamma=0.99,
                 tau=0.95,
                 clip_epsilon=0.2,
                 ppo_mini_batch_size=64,
                 ppo_epochs=10,
                 model_path=None,
                 seed=1):
        """Store the hyper-parameters and build the model.

        Args:
            env_id: environment id handed to ``get_env_info``.
            render: forwarded to ``MemoryCollector``.
            num_process: forwarded to ``MemoryCollector``.
            min_batch_size: sample count requested per ``learn`` call.
            lr_p: Adam learning rate of the policy network.
            lr_v: Adam learning rate of the value network.
            gamma: forwarded to ``estimate_advantages``.
            tau: forwarded to ``estimate_advantages``.
            clip_epsilon: forwarded to ``ppo_step``.
            ppo_mini_batch_size: number of samples per ``ppo_step`` call.
            ppo_epochs: number of passes over the collected batch.
            model_path: directory holding ``<env_id>_ppo_mini.p`` to resume
                from, or ``None`` to start from fresh networks.
            seed: seed for torch and the environment.
        """
        self.env_id = env_id
        self.gamma = gamma
        self.tau = tau
        self.clip_epsilon = clip_epsilon
        self.ppo_mini_batch_size = ppo_mini_batch_size
        self.ppo_epochs = ppo_epochs
        self.render = render
        self.num_process = num_process
        self.min_batch_size = min_batch_size
        self.lr_p = lr_p
        self.lr_v = lr_v

        self.model_path = model_path
        self.seed = seed
        self._init_model()

    def _init_model(self):
        """Create the environment, networks, sample collector and optimizers."""
        self.env, env_continuous, num_states, num_actions = get_env_info(
            self.env_id)

        # seeding
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        if self.model_path:
            print("Loading Saved Model {}_ppo_mini.p".format(self.env_id))
            checkpoint = '{}/{}_ppo_mini.p'.format(self.model_path,
                                                   self.env_id)
            # NOTE: unpickling can execute arbitrary code; only load
            # checkpoints from a trusted source.
            # The context manager closes the file even if unpickling fails.
            with open(checkpoint, "rb") as checkpoint_file:
                (self.policy_net, self.value_net,
                 self.running_state) = pickle.load(checkpoint_file)
        else:
            if env_continuous:
                self.policy_net = Policy(num_states, num_actions).to(device)
            else:
                self.policy_net = DiscretePolicy(num_states,
                                                 num_actions).to(device)

            self.value_net = Value(num_states).to(device)
            self.running_state = ZFilter((num_states, ), clip=5)

        self.collector = MemoryCollector(self.env,
                                         self.policy_net,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)

    def choose_action(self, state):
        """Query the policy for one state.

        Args:
            state: a single state; it is converted with ``FLOAT`` and given
                a leading dimension before being passed to the network.

        Returns:
            The ``(action, log_prob)`` pair produced by
            ``policy_net.get_action_log_prob`` with gradients disabled.
        """
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.policy_net.get_action_log_prob(state)
        return action, log_prob

    def eval(self, i_iter, render=True):
        """Run one episode with the current policy and print its reward.

        The environment is closed when the episode ends.

        Args:
            i_iter: iteration number, used only in the printed message.
            render: call ``env.render()`` before every step.  Defaults to
                ``True``, which is what this method always did.
        """
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)
            action, _ = self.choose_action(state)
            action = action.cpu().numpy()[0]
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """Collect one batch and update the networks on shuffled mini-batches.

        Args:
            writer: summary writer; only ``add_scalars`` is used.
            i_iter: iteration number used for printing and as logging step.

        Returns:
            ``(v_loss, p_loss)`` from the last ``ppo_step`` call, or two
            uninitialised one-element tensors when no step was executed.
        """
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalars(
            "PPO_mini_batch", {
                "total reward": log['total_reward'],
                "average reward": log['avg_reward'],
                "min reward": log['min_episode_reward'],
                "max reward": log['max_episode_reward'],
                "num steps": log['num_steps']
            }, i_iter)

        batch = memory.sample()  # sample all items in memory

        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_log_prob = FLOAT(batch.log_prob).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_mask = FLOAT(batch.mask).to(device)
        batch_size = batch_state.shape[0]

        with torch.no_grad():
            batch_values = self.value_net(batch_state)

        batch_advantages, batch_returns = estimate_advantages(
            batch_reward, batch_mask, batch_values, self.gamma, self.tau)

        v_loss, p_loss = torch.empty(1), torch.empty(1)
        mini_batch_num = int(
            math.ceil(batch_size / self.ppo_mini_batch_size))

        # update with mini-batch
        for _ in range(self.ppo_epochs):
            index = torch.randperm(batch_size)

            for i in range(mini_batch_num):
                # the last chunk may be shorter than ppo_mini_batch_size
                lower = i * self.ppo_mini_batch_size
                upper = min(batch_size, lower + self.ppo_mini_batch_size)
                ind = index[lower:upper]

                v_loss, p_loss = ppo_step(
                    self.policy_net, self.value_net, self.optimizer_p,
                    self.optimizer_v, 1, batch_state[ind],
                    batch_action[ind], batch_returns[ind],
                    batch_advantages[ind], batch_log_prob[ind],
                    self.clip_epsilon, 1e-3)

        return v_loss, p_loss

    def save(self, save_path):
        """Pickle networks and state filter to ``<save_path>/<env_id>_ppo_mini.p``.

        ``save_path`` is created first when it does not exist yet.
        """
        import os

        if save_path:
            os.makedirs(save_path, exist_ok=True)
        checkpoint = '{}/{}_ppo_mini.p'.format(save_path, self.env_id)
        # closing the file explicitly guarantees the checkpoint is flushed
        with open(checkpoint, 'wb') as checkpoint_file:
            pickle.dump(
                (self.policy_net, self.value_net, self.running_state),
                checkpoint_file)