class TRPO(object):
    def __init__(self, obs_dim, act_dim, normalizer):
        self.policy_net = StochasticPolicy(obs_dim, act_dim, hidden_dim=300, normalizer=normalizer)
        self.value_net = Value(obs_dim, hidden_dim=300, normalizer=normalizer)
        self.type = 'TRPO'

    def get_actor(self):
        return self.policy_net

    def to_train(self):
        self.policy_net.train()
        self.value_net.train()

    def to_eval(self):
        self.policy_net.eval()
        self.value_net.eval()

    def cpu(self):
        self.policy_net.cpu()
        self.value_net.cpu()

    def to(self, device):
        self.policy_net.to(device)
        self.value_net.to(device)

    def train(self, batch, entropy_coef=1e-3, gamma=0.995, tau=0.97, l2_reg=1e-3, max_kl=1e-2, damping=1e-1):
        # The TRPO update is run on CPU, so move both networks there first.
        self.cpu()
        update_params(batch, self.policy_net, self.value_net, gamma, tau,
                      l2_reg, max_kl, damping, entropy_coef)
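# --- Hedged usage sketch (not part of the original file) ---
# How a rollout batch might be assembled and fed to the TRPO wrapper above.
# The Transition fields (state, action, mask, reward) and the (mean, log_std, std)
# output of StochasticPolicy are assumptions inferred from how the batch and
# policy are used elsewhere in this code (e.g. gae and PPO.train below).
import torch
from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'mask', 'reward'))

def collect_batch(env, agent, horizon=2048):
    memory = []
    state = env.reset()
    for _ in range(horizon):
        s = torch.from_numpy(state.reshape(1, -1)).float()
        mean, _, std = agent.get_actor()(s)
        action = torch.normal(mean, std).detach().numpy().flatten()   # sample from the Gaussian policy
        next_state, reward, done, _ = env.step(action)
        memory.append(Transition(state, action, 0.0 if done else 1.0, reward))
        state = env.reset() if done else next_state
    return Transition(*zip(*memory))                                  # transpose into field-wise tuples

# agent = TRPO(obs_dim, act_dim, normalizer)
# agent.train(collect_batch(env, agent))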
import torch

from utils.normalizer import Normalizer
from models.agent import StochasticPolicy, Policy

# `flow_env` is assumed to be imported from the repo's environment module;
# `device` is the torch device used throughout.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env, env_name = flow_env(render=False, use_inflows=True)
print("simulated task: {}".format(env_name))
act_dim = env.action_space.shape[0]
obs_dim = env.observation_space.shape[0]
print(obs_dim)

normalizer = Normalizer(obs_dim)

filename = 'ppo_340000'
#filename = 'td3_shortgreenpenalty_1332000'

### load RL policy ###
if 'ppo' in filename:
    actor = StochasticPolicy(obs_dim, act_dim, 300, normalizer=normalizer).to(device)
elif 'td3' in filename:
    actor = Policy(obs_dim, act_dim, hidden_dim=400, normalizer=normalizer).to(device)
else:
    raise NotImplementedError

checkpoint = torch.load('./model_log/' + filename)
actor.load_state_dict(checkpoint['model_state_dict'])

reward_sum = 0.
for i in range(1):
    state = env.reset()
    for j in range(100000):
        s = torch.from_numpy(state.reshape(1, -1)).float().to(device)
        #print(actor(s))
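        # --- Hedged sketch (not part of the original script) ---
        # A typical way the evaluation rollout continues from here, assuming
        # StochasticPolicy returns (mean, log_std, std) as in PPO.train and
        # that env.step follows the standard Gym API.
        with torch.no_grad():
            action = actor(s)[0].cpu().numpy().flatten()   # act on the policy mean at evaluation time
        state, reward, done, _ = env.step(action)
        reward_sum += reward
        if done:
            break
print("evaluation return: {}".format(reward_sum))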
class PPO(object):
    def __init__(self, obs_dim, act_dim, normalizer, gamma, tau):
        self.policy_net = StochasticPolicy(obs_dim, act_dim, 300, normalizer).to(device)
        self.value_net = Value(obs_dim, hidden_dim=300, normalizer=normalizer).to(device)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=3e-4)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=3e-4)
        self.type = 'PPO'
        self.gamma = gamma
        self.tau = tau

    def get_actor(self):
        return self.policy_net

    def to_train(self):
        self.policy_net.train()
        self.value_net.train()

    def to_eval(self):
        self.policy_net.eval()
        self.value_net.eval()

    def cpu(self):
        self.policy_net.cpu()
        self.value_net.cpu()

    def to(self, device):
        self.policy_net.to(device)
        self.value_net.to(device)

    def train(self, batch, entropy_coef=1e-3, n_iter=1, batch_size=16, clip_param=0.2):
        states = torch.Tensor(batch.state).to(device)
        actions = torch.Tensor(batch.action).to(device)

        # GAE returns and advantages for the rollout batch.
        returns, advantages = gae(batch, self.value_net, self.gamma, self.tau)
        #returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # Log-probabilities and values under the pre-update networks,
        # detached so they serve as fixed references for the clipped objectives.
        mean, log_std, std = self.policy_net(states)
        old_policy = log_density(actions, mean, std, log_std).detach()
        old_values = self.value_net(states).detach()

        for _ in range(n_iter):
            # Shuffle the batch and split it into mini-batches.
            index = np.random.permutation(returns.shape[0])
            index = np.array_split(index, returns.shape[0] // batch_size)
            for idx in index:
                batch_states = states[idx, :]
                batch_actions = actions[idx, :]
                batch_returns = returns[idx, :]
                batch_advantages = advantages[idx, :]
                batch_old_values = old_values[idx, :]
                batch_old_policy = old_policy[idx, :]

                loss, ratio = surrogate_loss(self.policy_net, batch_advantages,
                                             batch_states, batch_old_policy, batch_actions)

                # Clipped value loss: penalize the worse of the clipped and unclipped errors.
                values = self.value_net(batch_states)
                clipped_values = batch_old_values + \
                    torch.clamp(values - batch_old_values, -clip_param, clip_param)
                value_loss1 = (clipped_values - batch_returns).pow(2)
                value_loss2 = (values - batch_returns).pow(2)
                value_loss = torch.max(value_loss1, value_loss2).mean()
                self.value_optimizer.zero_grad()
                value_loss.backward()
                self.value_optimizer.step()

                # PPO clipped surrogate policy loss.
                clipped_ratio = torch.clamp(ratio, 1 - clip_param, 1 + clip_param)
                clipped_loss = clipped_ratio * batch_advantages
                loss = -torch.min(loss, clipped_loss).mean()
                self.policy_optimizer.zero_grad()
                loss.backward()
                self.policy_optimizer.step()
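# --- Hedged sketch (not part of the original file) ---
# What a helper called as gae(batch, value_net, gamma, tau) typically computes:
# discounted returns and generalized advantage estimates (GAE). The
# batch.reward / batch.mask field names and the (N, 1) output shapes are
# assumptions inferred from how the results are indexed in PPO.train above.
import torch

def gae_sketch(batch, value_net, gamma, tau):
    states = torch.Tensor(batch.state)
    rewards = torch.Tensor(batch.reward)
    masks = torch.Tensor(batch.mask)                     # 0 at episode ends, 1 otherwise
    values = value_net(states).detach().squeeze(-1)

    returns = torch.zeros_like(rewards)
    advantages = torch.zeros_like(rewards)
    running_return, running_adv, next_value = 0.0, 0.0, 0.0
    for t in reversed(range(rewards.shape[0])):
        # Discounted return and TD residual; the mask resets both at episode boundaries.
        running_return = rewards[t] + gamma * running_return * masks[t]
        delta = rewards[t] + gamma * next_value * masks[t] - values[t]
        running_adv = delta + gamma * tau * running_adv * masks[t]
        returns[t] = running_return
        advantages[t] = running_adv
        next_value = values[t]

    # Standardize advantages for a better-conditioned policy update.
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return returns.unsqueeze(-1), advantages.unsqueeze(-1)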