class MultiAgent(object):

    def __init__(self, config: DefaultMunch):
        self.config = config
        self.memory = self.config.memory
        self.n_agents = self.config.n_agents
        self.action_size = self.config.action_size
        self.state_size = self.config.state_size
        self.critic_local = Critic(self.state_size, self.config.action_size,
                                   self.config.n_agents).to(self.config.device)
        self.critic_target = Critic(self.state_size, self.config.action_size,
                                    self.config.n_agents).to(self.config.device)
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.config.lr_critic)
        self.agents = [Agent(self.config, self) for _ in range(self.n_agents)]

    def step(self, states, actions, rewards, next_states, dones):
        # Store each transition twice, once from each agent's perspective, so
        # both agents see the other agent's observation and action as well.
        self.memory.add((states[0], actions[0], rewards[0], next_states[0], dones[0],
                         states[1], actions[1], next_states[1]))
        self.agents[0].step()
        self.memory.add((states[1], actions[1], rewards[1], next_states[1], dones[1],
                         states[0], actions[0], next_states[0]))
        self.agents[1].step()

    def act(self, states, add_noise=True):
        actions1: torch.Tensor = self.agents[0].act(states[0], add_noise)
        actions2: torch.Tensor = self.agents[1].act(states[1], add_noise)
        actions = torch.stack([actions1, actions2], dim=0)
        return actions

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def save(self, path, episode):
        for i, agent in enumerate(self.agents):
            agent.save(path + str(i), episode)

    def load(self, path):
        for i, agent in enumerate(self.agents):
            agent.load(path + str(i))
class DDPGAgent:

    def __init__(self, state_size, action_size, num_agents, hidden_actor,
                 hidden_critic, lr_actor, lr_critic, buffer_size, agent_id,
                 use_PER=False, seed=0) -> None:
        super(DDPGAgent, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.actor_local = Actor(state_size, hidden_actor, action_size, seed=seed).to(device)
        self.actor_target = Actor(state_size, hidden_actor, action_size, seed=seed).to(device)
        self.critic_local = Critic(state_size, num_agents * action_size,
                                   hidden_critic, 1, seed=seed).to(device)
        self.critic_target = Critic(state_size, num_agents * action_size,
                                    hidden_critic, 1, seed=seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic_local.parameters(), lr=lr_critic)

        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.actor_target, self.actor_local)
        hard_update(self.critic_target, self.critic_local)

    def act(self, obs, noise):
        obs = obs.to(device)
        if len(obs.shape) == 1:
            obs = obs.unsqueeze(0)
        return self.actor_local(obs) + noise * self.noise.noise()

    def target_act(self, obs, noise):
        obs = obs.to(device)
        if len(obs.shape) == 1:
            obs = obs.unsqueeze(0)
        return self.actor_target(obs) + noise * self.noise.noise()
class DDPGAgent():
    """Agent that interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, agent_num, random_seed):
        """Initialize an Agent object.

        :param state_size (int): dimension of each state
        :param action_size (int): dimension of each action
        :param agent_num (int): number of agents sharing the environment
        :param random_seed (int): random seed
        """
        # Actor networks
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic networks
        self.critic_local = Critic(state_size, action_size, agent_num, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, agent_num, random_seed).to(device)
        self.critic_optimizer = Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                     weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, scale=0.1)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor_local(obs) + noise * self.noise.sample()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor_target(obs) + noise * self.noise.sample()
        return action
class DDPGAgent:

    def __init__(self, state_size, action_size, num_agents,
                 lr_actor=1.0e-4, lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()
        self.actor = Actor(state_size, action_size).to(DEVICE)
        self.critic = Critic(state_size, action_size, num_agents).to(DEVICE)
        self.target_actor = Actor(state_size, action_size).to(DEVICE)
        self.target_critic = Critic(state_size, action_size, num_agents).to(DEVICE)
        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic)

    def act(self, states, noise=0.0):
        states = states.to(DEVICE)
        self.actor.eval()
        actions = self.actor(states).cpu().data.numpy() + noise * self.noise.noise()
        return np.clip(actions, -1, 1)

    def target_act(self, states, noise=0.0):
        states = states.to(DEVICE)
        self.target_actor.eval()
        actions = self.target_actor(states).cpu().data.numpy() + noise * self.noise.noise()
        return np.clip(actions, -1, 1)
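# The two DDPG agents above rely on a hard_update helper that is not shown.
# Below is a minimal, self-contained sketch of what such a helper plausibly
# does (a straight parameter copy); the actual project may define it
# differently.
import torch
import torch.nn as nn


def hard_update(target: nn.Module, source: nn.Module) -> None:
    """Copy every parameter of `source` into `target`, in place."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


# usage sketch
src, tgt = nn.Linear(4, 2), nn.Linear(4, 2)
hard_update(tgt, src)
assert torch.equal(tgt.weight, src.weight)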
class Agent():

    def __init__(self, state_shape, action_shape, stats):
        # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = torch.device("cpu")
        self.state_shape = state_shape
        self.action_shape = action_shape
        self.stats = stats
        self.learn_rate = 3e-4
        self.num_epochs = 8
        self.entropy_weight = 0.001
        self.kl_clip = 0.1
        self.deterministic_test_mode = False
        self.hidden_state_size = 16
        self.lstm = LSTM(self.state_shape, self.hidden_state_size)
        self.actor = Actor(self.hidden_state_size, self.action_shape).to(self.device)
        self.critic = Critic(self.hidden_state_size).to(self.device)
        self.optimizer = torch.optim.Adam(
            list(self.actor.parameters()) + list(self.critic.parameters()),
            lr=self.learn_rate)

    def act(self, state):
        with torch.no_grad():
            if self.deterministic_test_mode:
                mu = self.actor.forward_deterministic(state)
                action = mu
            else:
                policy_dist = self.actor(state)
                action = policy_dist.sample()
            action = action.clamp(-1, 1)  # valid range depends on the env
            action = action.cpu().numpy()[0]
        return action

    def learn(self, rollout_collector):
        for _ in range(self.num_epochs):
            for state, action, old_log_probs, advantage, return_ in \
                    rollout_collector.random_batch_iter():
                policy_dist = self.actor(state)
                value = self.critic(state)
                new_log_probs = policy_dist.log_prob(action)

                # PPO clipped surrogate objective
                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - self.kl_clip, 1.0 + self.kl_clip) * advantage
                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()
                entropy = policy_dist.entropy().mean()
                loss = 0.5 * critic_loss + actor_loss - self.entropy_weight * entropy

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

        self.stats.update_training_stats(
            num_samples_processed_inc=rollout_collector.batch_size *
            rollout_collector.rollout_length * self.num_epochs)
class Agent:
    """
    ---
    # test scenario
    1.
    >>> env = environment.gym_env(ID)
    >>> agent = Agent(env)
    >>> state = agent.env.reset()
    # >>> agent.select_action(state)
    >>> agent.train(100000)
    """

    def __init__(self, env: gym.Env, hidden_dims=128):
        self.env = env
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = 0.95  # hyperparameter
        self.entropy_weight = 1e-2  # hyperparameter
        self.actor = Actor(obs_dim, action_dim, hidden_dims).to(self.device)
        self.critic = Critic(obs_dim, hidden_dims).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)
        self.done = False
        self.score = 0
        self.transition_store = list()

    def select_action(self, state: np.ndarray, train='train'):
        """Select an action from the current policy.

        :param state: current observation
        :param train: 'train' samples from the policy, 'test' uses the distribution mean
        :return: clipped action as a numpy array, and its log-probability
        """
        state_tensor = torch.FloatTensor(state).to(self.device)
        action, dist = self.actor(state_tensor)
        action_map = {'train': action, 'test': dist.mean}
        selected_action = action_map[train]
        # Sum over action dimensions: for independent Gaussian components the
        # joint log-probability is the sum of per-dimension log-probabilities
        # (see the sketch below this class).
        log_prob = dist.log_prob(selected_action).sum(dim=-1)
        # detach() is needed so the numpy conversion does not track gradients.
        return selected_action.clamp(-2.0, 2.0).cpu().detach().numpy(), log_prob
        # e.g. array([-2.], dtype=float32)

    def train(self, number_frames):
        # number_frames is the total frame budget (e.g. 500000); plotting interval = 100
        state = self.env.reset()
        for i in range(1, number_frames):
            self.env.render()
            action, log_prob = self.select_action(state, 'train')
            next_state, reward, done, info = self.env.step(action)
            self.transition_store.append((state, next_state, reward, done))
            state = next_state
            if done:
                # one update per finished episode (effectively per-episode SGD)
                self.update(self.transition_store)
                self.transition_store = []
                state = self.env.reset()
        self.env.close()

    def update(self, store):
        for experience in store:
            state, next_state, reward, done = experience
            next_state = torch.FloatTensor(next_state).to(self.device)
            state = torch.FloatTensor(state).to(self.device)

            pred_value = self.critic(state)
            targ_value = reward + self.gamma * self.critic(next_state) * (1 - done)
            value_loss = F.smooth_l1_loss(pred_value, targ_value.detach())

            # update value
            self.critic_optimizer.zero_grad()
            value_loss.backward()
            self.critic_optimizer.step()

            # advantage = Q_t - V(s_t)
            _, log_prob = self.select_action(state, 'train')
            advantage = (targ_value - pred_value).detach()  # not backpropagated
            policy_loss = -advantage * log_prob
            policy_loss += self.entropy_weight * -log_prob  # entropy maximization

            # update policy
            self.actor_optimizer.zero_grad()
            policy_loss.backward()
            self.actor_optimizer.step()
        # return policy_loss.item(), value_loss.item()

    def log(self):
        """Log score, actor loss, and critic loss to TensorBoard."""
        raise NotImplementedError

    def check(self):
        """Save intermediate checkpoints."""
        raise NotImplementedError

    def test(self):
        raise NotImplementedError
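# Why the .sum(dim=-1) in select_action: for a diagonal Gaussian policy the
# action dimensions are independent, so the joint log-probability is the sum
# of the per-dimension log-probabilities. A self-contained check (assuming a
# diagonal Normal policy, which is what the Actor above appears to return):
import torch
from torch.distributions import Normal, Independent

dist = Normal(torch.zeros(3), torch.ones(3))   # 3 independent action dims
action = dist.sample()

summed = dist.log_prob(action).sum(dim=-1)     # manual per-dim sum
joint = Independent(dist, 1).log_prob(action)  # built-in joint log-prob
assert torch.allclose(summed, joint)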
import torch
from collections import deque
from torch import optim
from tqdm import tqdm

from hyperparams import (OFF_POLICY_BATCH_SIZE as BATCH_SIZE, DISCOUNT,
                         ENTROPY_WEIGHT, HIDDEN_SIZE, LEARNING_RATE, MAX_STEPS,
                         POLYAK_FACTOR, REPLAY_SIZE, TEST_INTERVAL,
                         UPDATE_INTERVAL, UPDATE_START)
from env import Env
from models import Critic, SoftActor, create_target_network, update_target_network
from utils import plot

env = Env()
actor = SoftActor(HIDDEN_SIZE)
critic_1 = Critic(HIDDEN_SIZE, state_action=True)
critic_2 = Critic(HIDDEN_SIZE, state_action=True)
value_critic = Critic(HIDDEN_SIZE)
target_value_critic = create_target_network(value_critic)
actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
critics_optimiser = optim.Adam(
    list(critic_1.parameters()) + list(critic_2.parameters()), lr=LEARNING_RATE)
value_critic_optimiser = optim.Adam(value_critic.parameters(), lr=LEARNING_RATE)
D = deque(maxlen=REPLAY_SIZE)


def test(actor):
    with torch.no_grad():
        env = Env()
        state, done, total_reward = env.reset(), False, 0
        while not done:
            # Use the purely exploitative (mean) policy at test time.
            action = actor(state).mean
            state, reward, done = env.step(action)
            total_reward += reward
        return total_reward
class DDPG(object):

    def __init__(self, n_s, n_a, a_bound, gamma=0.99, memory_size=10000,
                 tau=0.01, lr_a=0.001, lr_c=0.002, batch_size=64,
                 var=3, var_decay=0.9995):
        self.n_s = n_s
        self.n_a = n_a
        self.a_bound = a_bound
        self.gamma = gamma
        self.memory_size = memory_size
        self.tau = tau
        self.batch_size = batch_size
        self.var = var              # stddev of Gaussian exploration noise
        self.var_decay = var_decay  # exploration decay per learning step

        # memory
        self.replay_buffer = ReplayBuffer(n_s, n_a, memory_size)

        # actor
        self.eval_actor = Actor(n_s, n_a, a_bound)
        self.target_actor = deepcopy(self.eval_actor)
        self.actor_optim = torch.optim.Adam(self.eval_actor.parameters(), lr=lr_a)

        # critic
        self.eval_critic = Critic(n_s, n_a)
        self.target_critic = deepcopy(self.eval_critic)
        self.critic_optim = torch.optim.Adam(self.eval_critic.parameters(), lr=lr_c)

    def choose_action(self, s):
        s = torch.FloatTensor(s).unsqueeze(0)
        action = self.eval_actor(s).detach().numpy()[0]
        # Gaussian exploration noise, clipped to the valid action range.
        a = np.clip(np.random.normal(action, self.var), -self.a_bound, self.a_bound)
        return a

    def step(self, s, a, r, s_, done):
        self.store(s, a, r, s_, done)
        if self.replay_buffer.memory_count < self.memory_size:
            return
        # start learning once the buffer is full
        self._learn()

    def _learn(self):
        # Sample a minibatch; rows are assumed to be laid out as (s, a, r, done, s_).
        mini_batch = self.replay_buffer.sample(self.batch_size)
        b_s = torch.FloatTensor(mini_batch[:, :self.n_s])
        b_a = torch.FloatTensor(mini_batch[:, self.n_s:self.n_s + self.n_a])
        b_r = torch.FloatTensor(mini_batch[:, self.n_s + self.n_a:self.n_s + self.n_a + 1])
        b_done = torch.FloatTensor(mini_batch[:, self.n_s + self.n_a + 1:self.n_s + self.n_a + 2])
        b_s_ = torch.FloatTensor(mini_batch[:, -self.n_s:])

        # learn
        self.update_critic(b_s, b_a, b_r, b_s_, b_done)
        self.update_actor(b_s)
        self.var *= self.var_decay

    def update_critic(self, s, a, r, s_, done):
        with torch.no_grad():
            target_next_a = self.target_actor(s_)
            next_q = self.target_critic(s_, target_next_a)
            target_q = r + self.gamma * next_q * (1.0 - done)
        eval_q = self.eval_critic(s, a)
        critic_loss = F.mse_loss(eval_q, target_q)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()
        self._soft_update(self.eval_critic, self.target_critic)

    def update_actor(self, s):
        action = self.eval_actor(s)
        actor_loss = -self.eval_critic(s, action).mean()
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()
        self._soft_update(self.eval_actor, self.target_actor)

    def store(self, s, a, r, s_, done):
        self.replay_buffer.store(s, a, r, s_, done)

    def _soft_update(self, eval_net, target_net):
        for eval_param, target_param in zip(eval_net.parameters(),
                                            target_net.parameters()):
            target_param.data.copy_(self.tau * eval_param.data +
                                    (1.0 - self.tau) * target_param.data)

    # save both networks
    def save(self, name):
        torch.save(self.eval_actor, '{}_actor.pt'.format(name))
        torch.save(self.eval_critic, '{}_critic.pt'.format(name))

    # load both networks
    def load(self, name):
        actor = torch.load('{}_actor.pt'.format(name))
        critic = torch.load('{}_critic.pt'.format(name))
        return actor, critic
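# _soft_update above is the standard Polyak average
# theta_target <- tau * theta + (1 - tau) * theta_target, which several of
# the agents in this file implement. A self-contained check on two tiny
# linear layers (illustrative names, not from the original):
import torch
import torch.nn as nn

tau = 0.01
net, target = nn.Linear(4, 2), nn.Linear(4, 2)

with torch.no_grad():
    expected = tau * net.weight + (1 - tau) * target.weight
    for param, target_param in zip(net.parameters(), target.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
    assert torch.allclose(target.weight, expected)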
def main():
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number,
                               feature_number=20, start="2019-05-01",
                               end="2019-12-12", frequency="D")
    env = PortfolioTradingGym(data_df=toy_data, sequence_window=5, add_cash=True)
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # create networks
    action_size = env.action_space.shape[0]
    number_asset, seq_window, features_number = env.observation_space.shape
    input_size = features_number

    actor = Actor(input_size=input_size, hidden_size=50, action_size=action_size)
    critic = Critic(input_size=input_size, hidden_size=50, action_size=action_size)
    target_actor = create_target_network(actor)
    target_critic = create_target_network(critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE_ACTOR)
    critic_optimiser = optim.Adam(critic.parameters(), lr=LEARNING_RATE_CRITIC)
    replay = ch.ExperienceReplay()
    ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(action_size))

    def get_action(state):
        action = actor(state)
        action = action + ou_noise()[0]
        return action

    def get_random_action(state):
        action = torch.softmax(torch.randn(action_size), dim=0)
        return action

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            # Warm up the replay buffer with random actions before UPDATE_START.
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)

        replay = replay[-REPLAY_SIZE:]
        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            next_values = target_critic(batch.next_state(),
                                        target_actor(batch.next_state())).view(-1, 1)
            values = critic(batch.state(), batch.action()).view(-1, 1)
            rewards = ch.normalize(batch.reward())
            # rewards = batch.reward() / 100.0  # changes convergence a lot
            value_loss = ch.algorithms.ddpg.state_value_loss(
                values, next_values.detach(), rewards, batch.done(), DISCOUNT)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()

            # Update the policy by one step of gradient ascent.
            policy_loss = -critic(batch.state(), actor(batch.state())).mean()
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()

            # Update target networks.
            ch.models.polyak_average(target_critic, critic, POLYAK_FACTOR)
            ch.models.polyak_average(target_actor, actor, POLYAK_FACTOR)
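# Several snippets in this file draw exploration noise from an
# Ornstein-Uhlenbeck process. Below is a minimal, self-contained sketch
# consistent with the OrnsteinUhlenbeckNoise(mu=...) call above; the theta,
# sigma, and dt defaults are assumptions, not values from the original utils.
import numpy as np


class OrnsteinUhlenbeckNoise:
    """dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)."""

    def __init__(self, mu, theta=0.15, sigma=0.2, dt=1e-2):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.x = np.copy(mu)

    def __call__(self):
        self.x = (self.x + self.theta * (self.mu - self.x) * self.dt +
                  self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        return self.x


noise = OrnsteinUhlenbeckNoise(mu=np.zeros(3))
sample = noise()  # temporally correlated exploration noise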
class DDPG:
    """Implementation of DDPG.

    This implementation is adapted to this particular environment, which runs
    several agents. At each time step, the same actor controls each agent
    sequentially.
    """

    def __init__(self, state_size, action_size, config):
        """Initialize algorithm."""
        if config.PER:
            self.memory = PrioritizeReplayBuffer(
                config.BUFFER_SIZE, config.BATCH_SIZE, config.SEED)
        else:
            self.memory = ReplayBuffer(
                config.BUFFER_SIZE, config.BATCH_SIZE, config.SEED)

        # Randomly initialize critic network and actor
        self.actor = Actor(state_size, action_size, config.SEED).to(device)
        self.critic = Critic(state_size, action_size, config.SEED).to(device)

        # Initialize target networks with weights from actor and critic
        # Actor
        self.actor_target = Actor(state_size, action_size, config.SEED).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        # Critic
        self.critic_target = Critic(state_size, action_size, config.SEED).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # Actor optimizer
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=config.LR_ACTOR)
        # Critic optimizer
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=config.LR_CRITIC)

        self.config = config
        self.t_step = 0
        self.expl_noise = config.EXPL_NOISE

    def step(self, target_sample=None, **kwargs):
        """Run a step of algorithm update."""
        # Sample a random minibatch of transitions
        states, actions, rewards, next_states, dones = self._draw_minibatch()

        # Compute the target Q value
        target_Q = self.critic_target(next_states,
                                      self.actor_target(next_states)).detach()
        y = rewards + (1 - dones) * self.config.GAMMA * target_Q

        # Update critic by minimizing the loss
        current_Q = self.critic(states, actions)

        # Compute TD error
        td_error = y - current_Q

        if self.config.PER:
            # Get importance-sampling weights
            weights = torch.Tensor(self.memory.importance_sampling()).unsqueeze(1)
            # Update priorities
            self.memory.update_priorities(td_error.detach().cpu().numpy())
            # Compute critic loss
            critic_loss = torch.mean(weights * td_error ** 2)
        else:
            # Compute critic loss
            critic_loss = torch.mean(td_error ** 2)

        # Optimize critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip gradient
        nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        # Update the actor policy using the sampled policy gradient
        actor_loss = -self.critic(states, self.actor(states)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        # Clip gradient
        nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update()

    def train(self, env, num_episode):
        """Train a DDPG agent."""
        scores = []
        scores_window = deque(maxlen=100)

        for episode in range(num_episode):
            # Init state and episode score
            states = env.reset(train_mode=True)
            score = np.zeros(states.shape[0])
            done = False

            # Run episode
            while not done:
                # Select and run action
                actions = self.predict_actions(states)
                # TODO: dynamic low and high selection
                actions = self.add_gaussian_noise(actions, -1, 1)
                next_states, rewards, dones = env.step(actions)

                # Store all n_agent transitions in the replay buffer
                for state, action, reward, next_state, done in zip(
                        states, actions, rewards, next_states, dones):
                    self.memory.add(state, action, reward, next_state, done)

                # Update time step
                self.t_step = (self.t_step + 1) % self.config.UPDATE_EVERY

                # Optimisation step if UPDATE_EVERY and enough examples in memory
                if self.t_step == 0 and len(self.memory) > self.config.BATCH_SIZE:
                    for _ in range(self.config.UPDATE_STEPS):
                        self.step()

                # Update state and scores
                states = next_states
                score += rewards

                # End the episode when any agent is done, to avoid storing too
                # many done transitions in the replay buffer
                done = any(dones)

            # Keep track of running mean
            scores_window.append(max(score))
            # Append current mean to scores list
            scores.append(np.mean(scores_window))

            # Logging
            print("\rEpisode {}\tAverage Score: {:.2f}, Last Score: {:.2f}".format(
                episode, np.mean(scores_window), max(score)), end="")
            if (episode + 1) % 100 == 0:
                print("\rEpisode {}\tAverage Score: {:.2f}".format(
                    episode, np.mean(scores_window)))

        return scores

    def soft_update(self):
        """Update the frozen target models."""
        tau = self.config.TAU
        # Critic
        for param, target_param in zip(self.critic.parameters(),
                                       self.critic_target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
        # Actor
        for param, target_param in zip(self.actor.parameters(),
                                       self.actor_target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def predict_actions(self, states, **kwargs):
        """Predict next actions based on current policy."""
        states = torch.from_numpy(states).float().unsqueeze(0).to(device)
        # Set actor to eval mode
        self.actor.eval()
        actions = []
        with torch.no_grad():
            for state in states:
                action = self.actor(state)
                actions.append(action.detach().cpu().numpy())
        # Set actor back to train mode
        self.actor.train()
        return np.array(actions).squeeze()

    def add_gaussian_noise(self, action, low, high):
        """Add Gaussian noise to action, and clip between low and high."""
        return (action + np.random.normal(0, self.expl_noise,
                                          size=action.shape)).clip(low, high)

    def _draw_minibatch(self):
        """Draw a minibatch from the replay buffer."""
        states, actions, rewards, next_states, done = zip(*self.memory.sample())
        states = torch.Tensor(states).to(device)
        actions = torch.Tensor(actions).to(device)
        rewards = torch.Tensor(rewards).unsqueeze(1).to(device)
        next_states = torch.Tensor(next_states).to(device)
        done = torch.Tensor(done).unsqueeze(1).to(device)
        return states, actions, rewards, next_states, done

    def save_model(self, path, **kwargs):
        """Save actor model weights."""
        torch.save(self.actor.state_dict(), path)
config.device = device.type
print(device)

actor = Actor(obs_space=config.obs_space, action_space=config.action_space,
              hidden_size=config.hidden_size).to(device)
critic = Critic(obs_space=config.obs_space, hidden_size=config.hidden_size).to(device)
# actor.load_state_dict(torch.load('actor_model.h5'))
# critic.load_state_dict(torch.load('critic_model.h5'))
wandb.watch(actor)
wandb.watch(critic)

optimizer_actor = Adam(actor.parameters(), lr=config.actor_lr)
optimizer_critic = Adam(critic.parameters(), lr=config.critic_lr)
memory = Memory(env.agent_ids)


def compute_GAE(rewards, state_values, done, gamma, lamb):
    """Computes Generalized Advantage Estimation returns.

    Walks the rollout backwards, accumulating the GAE advantage and adding the
    value baseline back in, so each entry is a lambda-return target.
    """
    returns = [rewards[-1] + state_values[-1]]
    running_sum = rewards[-1] - state_values[-1]
    for i in reversed(range(len(rewards) - 1)):
        mask = 0 if done[i + 1] else 1
        delta = rewards[i] + gamma * state_values[i + 1] * mask - state_values[i]
        running_sum = delta + gamma * lamb * running_sum * mask
        returns.insert(0, running_sum + state_values[i])
    return returns
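# A quick usage sketch for compute_GAE on a hand-made rollout. The numbers
# are purely illustrative toy values; advantages are recovered by subtracting
# the value baseline from the lambda-returns.
rewards = [1.0, 1.0, 1.0]
state_values = [0.5, 0.5, 0.5]
done = [False, False, True]

returns = compute_GAE(rewards, state_values, done, gamma=0.99, lamb=0.95)
advantages = [ret - v for ret, v in zip(returns, state_values)]
print(returns, advantages)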
class TD3Agent():

    def __init__(self, env: object, gamma: float, delay_step: int, tau: float,
                 buffer_maxlen: int, noise_std: float, noise_bound: float,
                 critic_lr: float, actor_lr: float):
        # Select the device to use: CUDA (GPU) if available, otherwise CPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # The Gym environment used for training and evaluation
        self.env = env

        # Max and min values of the actions in this environment
        self.action_range = [self.env.action_space.low, self.env.action_space.high]

        # Dimensions of the state and the action
        self.obs_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # Total step counter
        self.steps = 0

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.buffer_maxlen = buffer_maxlen
        self.noise_std = noise_std
        self.noise_bound = noise_bound
        self.delay_step = delay_step

        # Scale and bias for the actions: needed because each environment has
        # different min and max action values
        self.scale = (self.action_range[1] - self.action_range[0]) / 2.0
        self.bias = (self.action_range[1] + self.action_range[0]) / 2.0

        # initialize networks
        self.critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.target_critic1 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.target_critic2 = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.target_actor = Actor(self.obs_dim, self.action_dim).to(self.device)

        # copy weight parameters to the target Q networks and target actor network
        for target_param, param in zip(self.target_critic1.parameters(),
                                       self.critic1.parameters()):
            target_param.data.copy_(param)
        for target_param, param in zip(self.target_critic2.parameters(),
                                       self.critic2.parameters()):
            target_param.data.copy_(param)
        for target_param, param in zip(self.target_actor.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=self.critic_lr)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=self.critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr)

        # Create a replay buffer
        self.replay_buffer = BasicBuffer(self.buffer_maxlen)

    def update(self, batch_size: int, steps: int):
        self.steps = steps

        # Sample experiences from the replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)

        # Convert numpy arrays of experience tuples into pytorch tensors
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Critic update (computing the loss)
        # Sample actions for the next states (s_t+1) using the target actor
        next_actions = self.target_actor.forward(next_states)
        next_actions = self.rescale_action(next_actions)

        # Add clipped Gaussian noise to the actions (target policy smoothing)
        noise = self.get_noise(next_actions, self.noise_std + 0.1,
                               -self.noise_bound, self.noise_bound)
        noisy_next_actions = next_actions + noise

        # Compute Q(s_t+1, a_t+1)
        next_q1 = self.target_critic1(next_states, noisy_next_actions)
        next_q2 = self.target_critic2(next_states, noisy_next_actions)

        # Choose the minimum Q (clipped double Q-learning)
        min_q = torch.min(next_q1, next_q2)

        # Expected Q, i.e., r_t + gamma * min_q
        expected_q = rewards + (1 - dones) * self.gamma * min_q

        # Current Q values for the states and actions from the replay buffer
        curr_q1 = self.critic1.forward(states, actions)
        curr_q2 = self.critic2.forward(states, actions)

        # Loss between current Q and expected Q
        critic1_loss = F.mse_loss(curr_q1, expected_q.detach())
        critic2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # Backpropagate the losses and update the Q network parameters
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        self.critic1_optimizer.step()

        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        self.critic2_optimizer.step()

        # Delayed actor update (computing the loss)
        if self.steps % self.delay_step == 0:
            # Sample new actions for the current states (s_t) using the current actor
            new_actions = self.actor.forward(states)

            # Compute Q(s_t, a_t) and the actor loss, i.e., -Q1
            new_q1 = self.critic1.forward(states, new_actions)
            actor_loss = -new_q1.mean()

            # Backpropagate the loss and update actor network parameters
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the target networks
            for target_param, param in zip(self.target_critic1.parameters(),
                                           self.critic1.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
            for target_param, param in zip(self.target_critic2.parameters(),
                                           self.critic2.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
            for target_param, param in zip(self.target_actor.parameters(),
                                           self.actor.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

    def get_noise(self, action: torch.Tensor, sigma: float,
                  bottom: float, top: float) -> torch.Tensor:
        # sigma: standard deviation of the noise
        # bottom, top: minimum and maximum values for the given noise
        return torch.normal(torch.zeros(action.size()),
                            sigma).clamp(bottom, top).to(self.device)

    def get_action(self, state: np.ndarray, stochastic: bool) -> np.ndarray:
        # state: the state input to the policy network
        # stochastic: True -> use noisy action, False -> use deterministic action

        # Convert state numpy array to tensor
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)

        if stochastic:
            # Add Gaussian noise to the rescaled action
            action = self.rescale_action(action) + self.get_noise(
                action, self.noise_std, -self.noise_bound, self.noise_bound)
        else:
            action = self.rescale_action(action)

        # Convert action tensor to numpy array
        action = action.squeeze(0).cpu().detach().numpy()
        return action

    def rescale_action(self, action: torch.Tensor) -> torch.Tensor:
        # The actor network outputs actions in [-1, 1], while MuJoCo environments
        # can use ranges [-n, n] for an arbitrary real n, so rescale:
        # scale -> scalar multiplication, bias -> scalar offset
        return action * self.scale[0] + self.bias[0]

    def Actor_save(self, WORKSPACE: str):
        # Save the actor model for each node
        print("Save the torch model")
        savePath = WORKSPACE + "./actor_model5_Hop_.pth"
        torch.save(self.actor.state_dict(), savePath)

    def Actor_load(self, WORKSPACE: str):
        # Load the actor model for each node
        print("load the torch model")
        savePath = WORKSPACE + "./actor_model5_Hop_.pth"  # Best
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor.load_state_dict(torch.load(savePath))
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, config: ac_parm, device, random_seed):
        """Initialize an Agent object.

        Params
        ======
            config (ac_parm): hyperparameter container
            device: torch device to run on
            random_seed (int): random seed
        """
        self.config = config
        self.seed = random.seed(random_seed)
        self.name = config.name
        self.device = device

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(config, random_seed).to(device)
        self.actor_target = Actor(config, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(config, random_seed).to(device)
        self.critic_target = Critic(config, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=config.lr_critic,
                                           weight_decay=config.weight_decay)

        # Noise process
        self.noise = OUNoise(config.action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(config, device, random_seed)
        self.step_number = 0

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use a random sample from the buffer to learn."""
        # Save experience / reward
        self.step_number += 1
        for (s, a, r, ns, d) in zip(state, action, reward, next_state, done):
            self.memory.add(s, a, r, ns, d)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.config.batch_size and \
                self.step_number % self.config.learn_every == 0:
            experiences = self.memory.sample()
            self.learn(experiences, self.config.gamma)

    def act(self, state):
        """Returns actions for the given state as per the current policy."""
        states = torch.from_numpy(state).float().to(self.device)
        actions = np.zeros((state.shape[0], self.config.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if self.config.noise_enabled:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using a batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.config.gradient_clipping:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
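# The bootstrapped target in learn() above,
# Q_targets = r + gamma * Q'(s', mu'(s')) * (1 - done), can be sanity-checked
# with plain numbers (toy tensors, illustrative only):
import torch

gamma = 0.99
rewards = torch.tensor([[1.0], [0.0]])
q_next = torch.tensor([[2.0], [3.0]])  # critic_target(next_state, actor_target(next_state))
dones = torch.tensor([[0.0], [1.0]])   # second transition is terminal

q_targets = rewards + gamma * q_next * (1 - dones)
# non-terminal: 1 + 0.99 * 2; terminal: reward only
assert torch.allclose(q_targets, torch.tensor([[1.0 + 0.99 * 2.0], [0.0]]))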
class Agent:

    def __init__(self, state_size, action_size, seed, actor_file=None, critic_file=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            actor_file: path of file containing trained weights of actor network
            critic_file: path of file containing trained weights of critic network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = seed

        # actor network
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optim = optim.Adam(self.actor_local.parameters(), lr=LR)

        # critic network
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optim = optim.Adam(self.critic_local.parameters(), lr=LR)

        # load trained weights if needed
        if actor_file:
            weights = torch.load(actor_file)
            self.actor_local.load_state_dict(weights)
            self.actor_target.load_state_dict(weights)
        if critic_file:
            weights = torch.load(critic_file)
            self.critic_local.load_state_dict(weights)
            self.critic_target.load_state_dict(weights)

        # init replay buffer
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        self.t_step = 0

    def act(self, state):
        """Returns actions for the given state as per the current Actor network.

        Params
        ======
            state (array_like): current state
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                self.learn(GAMMA)

    def learn(self, GAMMA):
        """Update value parameters using a batch of experience tuples.

        Params
        ======
            GAMMA (float): discount factor
        """
        states, actions, rewards, next_states, dones = self.memory.sample()

        # update critic
        target_next_actions = self.actor_target(next_states)
        target_next_q = self.critic_target(next_states, target_next_actions)
        target_q = rewards + (GAMMA * target_next_q * (1 - dones))
        local_q = self.critic_local(states, actions)
        critic_loss = F.mse_loss(local_q, target_q)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        # update actor
        local_actions = self.actor_local(states)
        actor_loss = -self.critic_local(states, local_actions).mean()
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()

        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model, tau=TAU):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
class AgentDDPG:
    """Deep Deterministic Policy Gradient implementation for continuous
    action space reinforcement learning tasks."""

    def __init__(self, state_size, hidden_size, action_size,
                 actor_learning_rate=1e-4, critic_learning_rate=1e-3,
                 gamma=0.99, tau=1e-2, use_cuda=False,
                 actor_path=None, critic_path=None):
        # Params
        self.state_size, self.hidden_size, self.action_size = state_size, hidden_size, action_size
        self.gamma, self.tau = gamma, tau
        self.use_cuda = use_cuda

        # Networks
        self.actor = Actor(state_size, hidden_size, action_size)
        self.actor_target = Actor(state_size, hidden_size, action_size)
        self.critic = Critic(state_size + action_size, hidden_size, action_size)
        self.critic_target = Critic(state_size + action_size, hidden_size, action_size)

        # Load model state_dicts from saved files
        if actor_path and path.exists(actor_path):
            self.actor.load_state_dict(torch.load(actor_path))
        if critic_path and path.exists(critic_path):
            self.critic.load_state_dict(torch.load(critic_path))

        # Hard copy params from the original networks to the target networks
        copy_params(self.actor, self.actor_target)
        copy_params(self.critic, self.critic_target)

        if self.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        # Create replay buffer for storing experience
        self.replay_buffer = ReplayBuffer(cache_size=int(1e6))

        # Training
        self.critic_criterion = nn.MSELoss()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)

    def save_to_file(self, actor_file, critic_file):
        # Save the state_dicts of the Actor and Critic networks
        torch.save(self.actor.state_dict(), actor_file)
        torch.save(self.critic.state_dict(), critic_file)

    def get_action(self, state):
        """Select an action with respect to the state according to the
        current policy and exploration noise."""
        state = torch.from_numpy(state).float()
        if self.use_cuda:
            state = state.cuda()
        a = self.actor.forward(state)
        if self.use_cuda:
            return a.detach().cpu().numpy()
        return a.detach().numpy()

    def save_experience(self, state_t, action_t, reward_t, state_t1):
        self.replay_buffer.add_sample(state_t, action_t, reward_t, state_t1)

    def update(self, batch_size):
        states, actions, rewards, next_states = self.replay_buffer.get_samples(batch_size)
        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        if self.use_cuda:
            states = states.cuda()
            next_states = next_states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()

        # Critic loss
        Qvals = self.critic.forward(states, actions)
        next_actions = self.actor_target.forward(next_states)
        next_Q = self.critic_target.forward(next_states, next_actions.detach())
        Qprime = rewards + self.gamma * next_Q
        critic_loss = self.critic_criterion(Qvals, Qprime)

        # Update critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor loss
        policy_loss = -self.critic.forward(states, self.actor.forward(states)).mean()

        # Update actor
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        soft_copy_params(self.actor, self.actor_target, self.tau)
        soft_copy_params(self.critic, self.critic_target, self.tau)

    def add_noise_to_weights(self, amount=0.1):
        self.actor.apply(lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.critic.apply(lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.actor_target.apply(lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.critic_target.apply(lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
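# add_noise_to_weights above delegates to an _add_noise_to_weights helper that
# is not shown. Below is a plausible minimal sketch (an assumption, not the
# original helper): perturb linear-layer weights in place with zero-mean
# Gaussian noise, e.g. for parameter-space exploration.
import torch
import torch.nn as nn


def _add_noise_to_weights(module: nn.Module, amount: float = 0.1,
                          use_cuda: bool = False) -> None:
    """Add zero-mean Gaussian noise to the weights of linear layers, in place."""
    if isinstance(module, nn.Linear):
        with torch.no_grad():
            noise = torch.randn_like(module.weight) * amount
            module.weight.add_(noise.cuda() if use_cuda else noise)


# usage sketch
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
net.apply(lambda m: _add_noise_to_weights(m, 0.1, False))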
class PPO(BaseAgent):

    def __init__(self, config):
        super(PPO, self).__init__()
        self.config = config
        torch.manual_seed(self.config['seed'])
        np.random.seed(self.config['seed'])

        if self.config['experiment']['orthogonal_initialization_and_layer_scaling']:
            weight_init_scheme = 'orthogonal'
        else:
            weight_init_scheme = 'normal'

        self.actor = Actor(
            device=self.config['device'],
            input_dim=self.config['env']['nS'],
            output_dim=self.config['env']['nA'],
            hidden_dims=self.config['model']['actor']['hidden_dims'],
            hidden_activation_fn=self.config['model']['actor']['hidden_acivation_fn'],
            weight_init_scheme=weight_init_scheme)
        self.actor_optimizer = optim.Adam(
            self.actor.parameters(),
            lr=self.config['model']['actor']['lr'],
            betas=self.config['model']['actor']['betas'])

        self.critic = Critic(
            device=self.config['device'],
            input_dim=self.config['env']['nS'],
            hidden_dims=self.config['model']['critic']['hidden_dims'],
            hidden_activation_fn=self.config['model']['critic']['hidden_acivation_fn'],
            weight_init_scheme=weight_init_scheme)
        self.critic_optimizer = optim.Adam(
            self.critic.parameters(),
            lr=self.config['model']['critic']['lr'],
            betas=self.config['model']['critic']['betas'])

        if self.config['train']['gail']:
            self.discriminator = Discriminator(
                device=self.config['device'],
                state_dim=self.config['env']['nS'],
                action_dim=self.config['env']['nA'],
                hidden_dims=self.config['model']['discriminator']['hidden_dims'],
                hidden_activation_fn=self.config['model']['discriminator']['hidden_acivation_fn'],
                weight_init_scheme=weight_init_scheme)
            self.discriminator_optimizer = optim.Adam(
                self.discriminator.parameters(),
                lr=self.config['model']['discriminator']['lr'],
                betas=self.config['model']['discriminator']['betas'])

        # [EXPERIMENT] - reward scaler: r / rs.std()
        if self.config['experiment']['reward_standardization']:
            self.reward_scaler = RewardScaler(gamma=self.config['train']['gamma'])

        # [EXPERIMENT] - observation scaler: (ob - ob.mean()) / (ob.std())
        if self.config['experiment']['observation_normalization']:
            self.observation_scaler = ObservationScaler()

    # train
    def train(self):
        """
        # initialize env, memory
        # for each episode
        #     for each timestep
        #         select action
        #         step action
        #         add exp to the memory
        #         if done or timeout or memory_full: update gae & tdlamret
        #         if memory is full
        #             bootstrap value
        #             optimize
        #             clear memory
        #         if done:
        #             wrap up episode
        #             break
        """
        writer_path = os.path.join('experiments', self.config['exp_name'], 'runs')
        self.writer = SummaryWriter(writer_path)

        # Pretrain with BC (behavioral cloning)
        if self.config['train']['bc']:
            bc_train_set, bc_valid_set = get_bc_dataset(
                self.config['train']['bc']['samples_exp_name'],
                self.config['train']['bc']['minimum_score'],
                self.config['train']['bc']['batch_size'],
                self.config['train']['bc']['demo_count'],
                self.config['train']['bc']['val_size'])
            if self.config['experiment']['observation_normalization']:
                use_obs_scaler = True
            else:
                use_obs_scaler = False
            self.actor = pretrain(self.actor,
                                  self.config['train']['bc']['lr'],
                                  self.config['train']['bc']['epochs'],
                                  bc_train_set, bc_valid_set,
                                  use_obs_scaler,
                                  writer=self.writer)

        # GAIL
        if self.config['train']['gail']:
            self.expert_dataset = get_gail_dataset(
                self.config['train']['gail']['samples_exp_name'],
                self.config['train']['gail']['minimum_score'],
                self.config['train']['gail']['n_samples'],
                self.config['train']['ppo']['memory_size'],
                self.config['train']['gail']['dstep'])

        self.best_score = 0

        # prepare env, memory, stuff
        env = self.init_env(self.config['env']['name'])
        env.seed(self.config['seed'])
        self.memory = PPOMemory(gamma=self.config['train']['gamma'],
                                tau=self.config['train']['gae']['tau'])
        score_queue = deque(maxlen=self.config['train']['average_interval'])
        length_queue = deque(maxlen=self.config['train']['average_interval'])
        if self.config['train']['gail']:
            irl_score_queue = deque(maxlen=self.config['train']['average_interval'])

        for episode in trange(1, self.config['train']['max_episodes'] + 1):
            self.episode = episode
            episode_score = 0
            if self.config['train']['gail']:
                irl_episode_score = 0

            # reset env
            state = env.reset()
            for t in range(1, self.config['train']['max_steps_per_episode'] + 1):
                if self.episode % 100 == 0:
                    env.render()

                # [EXPERIMENT] - observation scaler: (ob - ob.mean()) / (ob.std())
                if self.config['experiment']['observation_normalization']:
                    state = self.observation_scaler(state, update=True)

                # select action & estimate value from the state
                with torch.no_grad():
                    state_tensor = torch.tensor(state).unsqueeze(0).float()  # bsz = 1
                    action_tensor, logpa_tensor = self.actor.select_action(state_tensor)
                    value_tensor = self.critic(state_tensor).squeeze(1)  # drop bsz dim

                # step action
                action = action_tensor.numpy()[0]  # single worker
                next_state, reward, done, _ = env.step(action)

                # update episode_score
                episode_score += reward

                # GAIL: get irl_reward
                if self.config['train']['gail']:
                    with torch.no_grad():
                        reward = self.discriminator.get_irl_reward(
                            state_tensor, action_tensor).detach()
                    irl_episode_score += reward

                # [EXPERIMENT] - reward scaler: r / rs.std()
                if self.config['experiment']['reward_standardization']:
                    reward = self.reward_scaler(reward, update=True)

                # [EXPERIMENT] - reward clipping [-5, 5]
                if self.config['experiment']['reward_clipping']:
                    reward = np.clip(reward, -5, 5)

                # add experience to the memory
                self.memory.store(s=state, a=action, r=reward,
                                  v=value_tensor.item(), lp=logpa_tensor.item())

                # done or timeout or memory full:
                #   done => v = 0
                #   timeout or memory full => v = critic(next_state)
                # then update gae & return in the memory
                timeout = t == self.config['train']['max_steps_per_episode']
                time_to_optimize = len(self.memory) == self.config['train']['ppo']['memory_size']
                if done or timeout or time_to_optimize:
                    if done:
                        # the game is over, so the value of the next state is 0
                        v = 0
                    else:
                        # if not, estimate it with the critic
                        next_state_tensor = torch.tensor(next_state).unsqueeze(0).float()  # bsz = 1
                        with torch.no_grad():
                            next_value_tensor = self.critic(next_state_tensor).squeeze(1)
                        v = next_value_tensor.item()

                    # update gae & tdlamret
                    self.memory.finish_path(v)

                # if memory is full, optimize PPO
                if time_to_optimize:
                    self.optimize()

                if done:
                    score_queue.append(episode_score)
                    length_queue.append(t)
                    if self.config['train']['gail']:
                        irl_score_queue.append(irl_episode_score)
                    break

                # update state
                state = next_state

            avg_score = np.mean(score_queue)
            std_score = np.std(score_queue)
            avg_duration = np.mean(length_queue)
            self.writer.add_scalar("info/score", avg_score, self.episode)
            self.writer.add_scalar("info/duration", avg_duration, self.episode)

            if self.config['train']['gail']:
                avg_score = np.mean(irl_score_queue)
                self.writer.add_scalar("info/irl_score", avg_score, self.episode)

            if self.episode % 100 == 0:
                print("{} - score: {:.1f} +-{:.1f} \t duration: {}".format(
                    self.episode, avg_score, std_score, avg_duration))

            # game-solved condition
            # if avg_score >= self.config['train']['terminal_score']:
            #     print("game solved at ep {}".format(self.episode))
            #     self.save_weight(self.actor, self.config['exp_name'], "best")
            #     break

            if avg_score >= self.best_score and self.episode >= 200:
                print("found best model at episode: {}".format(self.episode))
                self.save_weight(self.actor, self.config['exp_name'], "best")
                self.best_score = avg_score

        # [EXPERIMENT] - observation scaler: (ob - ob.mean()) / (ob.std())
        if self.config['experiment']['observation_normalization']:
            self.observation_scaler.save(self.config['exp_name'])

        self.save_weight(self.actor, self.config['exp_name'], "last")
        return self.best_score

    # optimize
    def optimize(self):
        data = self.prepare_data(self.memory.get())

        # gail
        if self.config['train']['gail']:
            self.optimize_gail(data)

        self.optimize_ppo(data)

    def prepare_data(self, data):
        states_tensor = torch.from_numpy(np.stack(data['states'])).float()  # bsz, nS
        actions_tensor = torch.tensor(data['actions']).long()  # bsz
        logpas_tensor = torch.tensor(data['logpas']).float()  # bsz
        tdlamret_tensor = torch.tensor(data['tdlamret']).float()  # bsz
        advants_tensor = torch.tensor(data['advants']).float()  # bsz
        values_tensor = torch.tensor(data['values']).float()  # bsz

        # normalize advantage, a.k.a. atarg
        advants_tensor = (advants_tensor - advants_tensor.mean()) / \
            (advants_tensor.std() + 1e-5)

        data_tensor = dict(states=states_tensor,
                           actions=actions_tensor,
                           logpas=logpas_tensor,
                           tdlamret=tdlamret_tensor,
                           advants=advants_tensor,
                           values=values_tensor)
        return data_tensor

    def ppo_iter(self, batch_size, ob, ac, oldpas, atarg, tdlamret, vpredbefore):
        total_size = ob.size(0)
        indices = np.arange(total_size)
        np.random.shuffle(indices)
        n_batches = total_size // batch_size
        for nb in range(n_batches):
            ind = indices[batch_size * nb:batch_size * (nb + 1)]
            yield ob[ind], ac[ind], oldpas[ind], atarg[ind], tdlamret[ind], vpredbefore[ind]

    def optimize_gail(self, data):
        """
        https://github.com/openai/baselines/blob/master/baselines/gail/trpo_mpi.py

        bsz = learner_batch_size // d_step
        for each ob_batch, ac_batch in learner_dataset:
            get ob_expert, ac_expert from expert_dataset
            get learner_logit from D
            get expert_logit from D
            get learner loss vs. torch.ones()
            get expert loss vs. torch.zeros()
            update D
        """
        loss_fn = nn.BCELoss()
        D_losses = []
        learner_accuracies = []
        expert_accuracies = []

        learner_ob = data['states']
        learner_ac = data['actions']
        # placeholder tensors so ppo_iter() can be reused for the discriminator
        rub = torch.zeros_like(learner_ob)
        learner_iter = self.ppo_iter(self.expert_dataset.batch_size,
                                     learner_ob, learner_ac, rub, rub, rub, rub)

        for learner_ob_b, learner_ac_b, _, _, _, _ in learner_iter:
            expert_ob_b, expert_ac_b = self.expert_dataset.get_next_batch()
            if self.config['experiment']['observation_normalization']:
                expert_ob_b = self.observation_scaler(expert_ob_b, update=False).float()

            learner_logit = self.discriminator.forward(learner_ob_b, learner_ac_b)
            learner_prob = torch.sigmoid(learner_logit)
            expert_logit = self.discriminator.forward(expert_ob_b, expert_ac_b)
            expert_prob = torch.sigmoid(expert_logit)

            learner_loss = loss_fn(learner_prob, torch.ones_like(learner_prob))
            expert_loss = loss_fn(expert_prob, torch.zeros_like(expert_prob))
            loss = learner_loss + expert_loss
            D_losses.append(loss.item())

            self.discriminator_optimizer.zero_grad()
            loss.backward()
            self.discriminator_optimizer.step()

            learner_acc = (learner_prob >= 0.5).float().mean().item()
            expert_acc = (expert_prob < 0.5).float().mean().item()
            learner_accuracies.append(learner_acc)
            expert_accuracies.append(expert_acc)

        avg_d_loss = np.mean(D_losses)
        avg_learner_accuracy = np.mean(learner_accuracies)
        avg_expert_accuracy = np.mean(expert_accuracies)
        self.writer.add_scalar("info/discrim_loss", avg_d_loss, self.episode)
        self.writer.add_scalars("info/gail_accuracy",
                                {'learner': avg_learner_accuracy,
                                 'expert': avg_expert_accuracy},
                                self.episode)

    def optimize_ppo(self, data):
        """
        https://github.com/openai/baselines/blob/master/baselines/ppo1/pposgd_simple.py (line 164)

        # get data from the memory
        # prepare dataloader
        # for each optim_epoch
        #     for each batch
        #         calculate loss and gradient
        #         update nn
        """
        ob = data['states']
        ac = data['actions']
        oldpas = data['logpas']
        atarg = data['advants']
        tdlamret = data['tdlamret']
        vpredbefore = data['values']

        eps = self.config['train']['ppo']['clip_range']

        policy_losses = []
        entropy_losses = []
        value_losses = []

        # for each optimization epoch
        for i in range(self.config['train']['ppo']['optim_epochs']):
            # for each batch
            data_loader = self.ppo_iter(self.config['train']['ppo']['batch_size'],
                                        ob, ac, oldpas, atarg, tdlamret, vpredbefore)
            for batch in data_loader:
                ob_b, ac_b, old_logpas_b, atarg_b, vtarg_b, old_vpred_b = batch

                # policy loss
                cur_logpas, cur_entropies = self.actor.get_predictions(ob_b, ac_b)
                ratio = torch.exp(cur_logpas - old_logpas_b)

                # clip ratio
                clipped_ratio = torch.clamp(ratio, 1. - eps, 1. + eps)

                # policy_loss
                surr1 = ratio * atarg_b
                if self.config['experiment']['policy_noclip']:
                    pol_surr = -surr1.mean()
                else:
                    surr2 = clipped_ratio * atarg_b
                    pol_surr = -torch.min(surr1, surr2).mean()

                # value_loss
                cur_vpred = self.critic(ob_b).squeeze(1)

                # [EXPERIMENT] - value clipping:
                # clipped_value = old_values + (curr_values - old_values).clip(-eps, +eps)
                if self.config['experiment']['value_clipping']:
                    cur_vpred_clipped = old_vpred_b + \
                        (cur_vpred - old_vpred_b).clamp(-eps, eps)
                    vloss1 = (cur_vpred - vtarg_b).pow(2)
                    vloss2 = (cur_vpred_clipped - vtarg_b).pow(2)
                    vf_loss = torch.max(vloss1, vloss2).mean()
                else:
                    # original value_loss
                    vf_loss = (cur_vpred - vtarg_b).pow(2).mean()

                # entropy_loss
                pol_entpen = -cur_entropies.mean()

                # loss coefficients
                c1 = self.config['train']['ppo']['coef_vf']
                c2 = self.config['train']['ppo']['coef_entpen']

                # actor - backward
                self.actor_optimizer.zero_grad()
                policy_loss = pol_surr + c2 * pol_entpen
                policy_loss.backward()
                # [EXPERIMENT] - clipping gradient with max_norm=0.5
                if self.config['experiment']['clipping_gradient']:
                    nn.utils.clip_grad_norm_(self.actor.parameters(), max_norm=0.5)
                self.actor_optimizer.step()

                # critic - backward
                self.critic_optimizer.zero_grad()
                value_loss = c1 * vf_loss
                value_loss.backward()
                # [EXPERIMENT] - clipping gradient with max_norm=0.5
                if self.config['experiment']['clipping_gradient']:
                    nn.utils.clip_grad_norm_(self.critic.parameters(), max_norm=0.5)
                self.critic_optimizer.step()

                policy_losses.append(pol_surr.item())
                entropy_losses.append(pol_entpen.item())
                value_losses.append(vf_loss.item())

        avg_policy_loss = np.mean(policy_losses)
        avg_value_losses = np.mean(value_losses)
        avg_entropy_losses = np.mean(entropy_losses)
        self.writer.add_scalar("info/policy_loss", avg_policy_loss, self.episode)
        self.writer.add_scalar("info/value_loss", avg_value_losses, self.episode)
        self.writer.add_scalar("info/entropy_loss", avg_entropy_losses, self.episode)

    # play
    def play(self, num_episodes=1, save_traj=False, seed=9999, record=False,
             save_result=False):
        # [EXPERIMENT] - observation scaler: (ob - ob.mean()) / (ob.std())
        if self.config['experiment']['observation_normalization']:
            self.observation_scaler.load(self.config['exp_name'])

        # load policy
        self.load_weight(self.actor, self.config['exp_name'])

        env = self.init_env(self.config['env']['name'])
        env.seed(seed)

        if record:
            from gym import wrappers
            rec_dir = os.path.join("experiments", self.config['exp_name'],
                                   "seed_{}".format(seed))
            env = wrappers.Monitor(env, rec_dir, force=True)

        scores, trajectories = [], []
        for episode in range(num_episodes):
            current_trajectory = []
            episode_score = 0

            # initialize env
            state = env.reset()
            while True:
                # env.render()

                # [EXPERIMENT] - observation scaler: (ob - ob.mean()) / (ob.std())
                if self.config['experiment']['observation_normalization']:
                    state = self.observation_scaler(state, update=False)

                # select greedy action
                with torch.no_grad():
                    action_tensor = self.actor.select_greedy_action(state)
                    action = action_tensor.numpy()[0]  # single env

                current_trajectory.append((state, action))

                # run action
                next_state, reward, done, _ = env.step(action)

                # add reward
                episode_score += reward

                # update state
                state = next_state

                # game over condition
                if done:
                    scores.append(episode_score)
                    trajectories.append((current_trajectory, episode_score))
                    break

        avg_score = np.mean(scores)
        print("Average score {} on {} games".format(avg_score, num_episodes))

        if save_result:
            played_result_path = os.path.join("experiments", self.config['exp_name'],
                                              "runs", "play_score.pth")
            torch.save(scores, played_result_path)

        if save_traj:
            demo_dir = os.path.join("experiments", self.config['exp_name'],
                                    "demonstration")
            os.makedirs(demo_dir)
            torch.save(trajectories, os.path.join(demo_dir, "demo.pth"))
            print("saved {} trajectories.".format(num_episodes))

        env.close()
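# The heart of optimize_ppo above is the clipped surrogate objective
# L = -E[min(ratio * A, clip(ratio, 1-eps, 1+eps) * A)]. Stripped of the class
# plumbing it is just a few tensor ops (toy tensors, illustrative only):
import torch

eps = 0.2
atarg = torch.tensor([1.0, -1.0, 0.5])           # advantages
old_logpas = torch.tensor([-1.0, -1.2, -0.8])    # log-probs at collection time
cur_logpas = torch.tensor([-0.7, -1.5, -0.8])    # log-probs under current policy

ratio = torch.exp(cur_logpas - old_logpas)
clipped = torch.clamp(ratio, 1 - eps, 1 + eps)
policy_loss = -torch.min(ratio * atarg, clipped * atarg).mean()
print(policy_loss)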
class TD3: def __init__(self, env, state_dim, action_dim, max_action, gamma=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2): self.actor = Actor(state_dim, action_dim) self.actor_target = Actor(state_dim, action_dim) self.actor_target.load_state_dict(self.actor.state_dict()) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-3) self.critic = Critic(state_dim, action_dim) self.critic_target = Critic(state_dim, action_dim) self.critic_target.load_state_dict(self.critic.state_dict()) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3) self.max_action = max_action self.gamma = gamma self.tau = tau self.policy_noise = policy_noise self.noise_clip = noise_clip self.policy_freq = policy_freq self.device = 'cuda' if torch.cuda.is_available() else 'cpu' self.actor.to(self.device) self.actor_target.to(self.device) self.critic.to(self.device) self.critic_target.to(self.device) self.env = env self.total_it = 0 def select_action(self, state, noise=0.1): action = self.actor(state.to(self.device)).data.cpu().numpy().flatten() if noise != 0: action = (action + np.random.normal( 0, noise, size=self.env.action_space.shape[0])) return action.clip(self.env.action_space.low, self.env.action_space.high) def train(self, replay_buffer, batch_size=128): self.total_it += 1 states, states_, actions, rewards, terminal = replay_buffer.sample_buffer( batch_size) with torch.no_grad(): noise = (torch.randn_like(actions.to(self.device)) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip) next_action = (self.actor_target(states_.to(self.device)) + noise).clamp(-self.max_action, self.max_action) # compute the target Q value target_q1, target_q2 = self.critic_target( states_.to(self.device), next_action.to(self.device)) target_q = torch.min(target_q1, target_q2) # target_q = rewards + terminal * self.gamma + target_q.cpu() # target_q = rewards + (terminal.reshape(256, 1) * self.gamma * target_q).detach() target_q = rewards + terminal * self.gamma * target_q[:, 0].cpu() # Get current Q value current_q1, current_q2 = self.critic(states.to(self.device), actions.to(self.device)) # Compute critic loss critic_loss = F.mse_loss(current_q1[:, 0], target_q.to( self.device)) + F.mse_loss(current_q2[:, 0], target_q.to(self.device)) # optimize the critic self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Delayed policy updates if self.total_it % self.policy_freq == 0: # Compote actor loss actor_loss = -self.critic.q1(states.to( self.device), self.actor(states.to(self.device))).mean() # Optimize the actor self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Update the frozen target models for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) def save(self, filename): torch.save(self.critic.state_dict(), filename + "_critic") torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer") torch.save(self.actor.state_dict(), filename + "_actor") torch.save(self.actor_optimizer.state_dict(), filename + "_actor_optimizer") def load(self, filename): self.critic.load_state_dict(torch.load(filename + "_critic")) self.critic_optimizer.load_state_dict( torch.load(filename + "_critic_optimizer")) 
self.actor.load_state_dict(torch.load(filename + "_actor")) self.actor_optimizer.load_state_dict( torch.load(filename + "_actor_optimizer"))
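The heart of the train() method above is the TD3 target computation. Below is a self-contained sketch of just that step on random tensors; shapes and hyperparameters are illustrative, and the target-critic outputs are replaced by stand-in values.

import torch

batch, action_dim = 4, 2
policy_noise, noise_clip, max_action, gamma = 0.2, 0.5, 1.0, 0.99
rewards = torch.rand(batch, 1)
not_done = torch.ones(batch, 1)  # continuation mask: 1.0 while the episode is still running
with torch.no_grad():
    # target policy smoothing: clipped Gaussian noise on the target action
    next_action = torch.rand(batch, action_dim) * 2 - 1  # stand-in for actor_target(next_state)
    noise = (torch.randn_like(next_action) * policy_noise).clamp(-noise_clip, noise_clip)
    smoothed_action = (next_action + noise).clamp(-max_action, max_action)
    # clipped double-Q: min of the two target critics (evaluated at smoothed_action
    # in the real agent; random stand-ins here) bounds overestimation
    q1, q2 = torch.rand(batch, 1), torch.rand(batch, 1)
    target_q = rewards + not_done * gamma * torch.min(q1, q2)
print(target_q.shape)  # torch.Size([4, 1])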
class Agent(): def __init__(self, learn_rate, input_shape, num_actions): self.num_actions = num_actions self.gamma = 0.99 self.critic_update_max = 20 self.actor_update_max = 10 self.memories = [] # self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self.device = torch.device("cpu") self.actor = Actor().to(self.device) self.critic = Critic().to(self.device) self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=learn_rate) self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=learn_rate) def choose_action(self, state, hidden_state): state = torch.tensor(state, dtype=torch.float32).to(self.device) policy, hidden_state_ = self.actor(state, hidden_state) policy = F.softmax(policy, dim=-1) actions_probs = torch.distributions.Categorical(policy) action = actions_probs.sample() action_log_prob = actions_probs.log_prob(action).unsqueeze(0) # action = torch.argmax(policy) # prep for storage action = action.item() return action, policy, hidden_state_, action_log_prob def store_memory(self, memory): self.memories.append(memory) def get_discounted_cum_rewards(self, memory): cum_rewards = [] total = 0 for reward in reversed(memory.rewards): total = reward + total * self.gamma cum_rewards.append(total) cum_rewards = list(reversed(cum_rewards)) cum_disc_rewards = torch.tensor(cum_rewards).float().to(self.device) return cum_disc_rewards def learn(self): critic_losses = [] for memory_idx, memory in enumerate(self.memories): print(memory_idx) states, actions, policies, rewards, dones, actor_hidden_states, action_log_probs = \ memory.fetch_on_device(self.device) cum_disc_rewards = self.get_discounted_cum_rewards(memory) ''' train critic ''' self.critic.train() self.actor.eval() critic_hidden_state = self.critic.get_new_hidden_state() for i in range(len(memory.states)): state = states[i].detach() policy = policies[i].detach() action_log_prob = action_log_probs[i].detach() done = dones[i].detach() true_value = cum_disc_rewards[i] value, critic_hidden_state_ = self.critic( state, action_log_prob, critic_hidden_state) if done: true_value *= 0.0 error = value - true_value # print("true: {}, value: {}".format(true_value, value)) critic_loss = error**2 if critic_loss >= self.critic_update_max: print("critic_loss BIG: {}".format(critic_loss)) critic_loss = torch.clamp(critic_loss, -self.critic_update_max, self.critic_update_max) critic_losses.append(critic_loss) critic_hidden_state = critic_hidden_state_ # print("end") all_critic_loss = sum(critic_losses) # all_critic_loss = torch.stack(critic_losses).mean() self.critic_optimizer.zero_grad() all_critic_loss.backward() self.critic_optimizer.step() actor_losses = [] for memory_idx, memory in enumerate(self.memories): print(memory_idx) states, actions, policies, rewards, dones, actor_hidden_states, action_log_probs = \ memory.fetch_on_device(self.device) ''' train actor ''' self.critic.eval() self.actor.train() critic_hidden_state = self.critic.get_new_hidden_state() for i in range(len(memory.states)): state = states[i].detach() # policy = policies[i] action_log_prob = action_log_probs[i] critic_hidden_state = critic_hidden_state.detach() done = dones[i].detach() value, critic_hidden_state_ = self.critic( state, action_log_prob, critic_hidden_state) if done: value *= 0.0 # print("true: {}, value: {}".format(true_value, value)) actor_loss = value if actor_loss >= self.actor_update_max: print("actor_loss BIG: {}".format(actor_loss)) actor_loss = torch.clamp(actor_loss, -self.actor_update_max, self.actor_update_max)
actor_losses.append(actor_loss) critic_hidden_state = critic_hidden_state_ all_actor_loss = sum(actor_losses) # all_actor_loss = torch.stack(actor_losses).mean() self.actor_optimizer.zero_grad() all_actor_loss.backward() self.actor_optimizer.step()
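get_discounted_cum_rewards above is a reverse scan over the episode's rewards. A standalone version with a small worked check (the function name is mine):

def discounted_returns(rewards, gamma=0.99):
    returns, total = [], 0.0
    for r in reversed(rewards):  # accumulate from the last step backwards
        total = r + gamma * total
        returns.append(total)
    return list(reversed(returns))

# with gamma=0.5 and rewards [1, 0, 1]: G2 = 1, G1 = 0 + 0.5*1 = 0.5, G0 = 1 + 0.5*0.5 = 1.25
print(discounted_returns([1.0, 0.0, 1.0], gamma=0.5))  # [1.25, 0.5, 1.0]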
def train(BATCH_SIZE, DISCOUNT, ENTROPY_WEIGHT, HIDDEN_SIZE, LEARNING_RATE, MAX_STEPS, POLYAK_FACTOR, REPLAY_SIZE, TEST_INTERVAL, UPDATE_INTERVAL, UPDATE_START, ENV, OBSERVATION_LOW, VALUE_FNC, FLOW_TYPE, FLOWS, DEMONSTRATIONS, PRIORITIZE_REPLAY, BEHAVIOR_CLONING, ARM, BASE, RPA, REWARD_DENSE, logdir): ALPHA = 0.3 BETA = 1 epsilon = 0.0001 #0.1 epsilon_d = 0.1 #0.3 weights = 1 #1 lambda_ac = 0.85 #0.7 lambda_bc = 0.3 #0.4 setup_logger(logdir, locals()) ENV = __import__(ENV) if ARM and BASE: env = ENV.youBotAll('youbot_navig2.ttt', obs_lowdim=OBSERVATION_LOW, rpa=RPA, reward_dense=REWARD_DENSE, boundary=1) elif ARM: env = ENV.youBotArm('youbot_navig.ttt', obs_lowdim=OBSERVATION_LOW, rpa=RPA, reward_dense=REWARD_DENSE) elif BASE: env = ENV.youBotBase('youbot_navig.ttt', obs_lowdim=OBSERVATION_LOW, rpa=RPA, reward_dense=REWARD_DENSE, boundary=1) action_space = env.action_space obs_space = env.observation_space() step_limit = env.step_limit() if OBSERVATION_LOW: actor = SoftActorGated(HIDDEN_SIZE, action_space, obs_space, flow_type=FLOW_TYPE, flows=FLOWS).float().to(device) critic_1 = Critic(HIDDEN_SIZE, 1, obs_space, action_space, state_action=True).float().to(device) critic_2 = Critic(HIDDEN_SIZE, 1, obs_space, action_space, state_action=True).float().to(device) else: actor = ActorImageNet(HIDDEN_SIZE, action_space, obs_space, flow_type=FLOW_TYPE, flows=FLOWS).float().to(device) critic_1 = Critic(HIDDEN_SIZE, 1, obs_space, action_space, state_action=True).float().to(device) critic_2 = Critic(HIDDEN_SIZE, 1, obs_space, action_space, state_action=True).float().to(device) critic_1.load_state_dict( torch.load( 'data/youbot_all_final_21-08-2019_22-32-00/models/critic1_model_473000.pkl' )) critic_2.load_state_dict( torch.load( 'data/youbot_all_final_21-08-2019_22-32-00/models/critic2_model_473000.pkl' )) actor.apply(weights_init) # critic_1.apply(weights_init) # critic_2.apply(weights_init) if VALUE_FNC: value_critic = Critic(HIDDEN_SIZE, 1, obs_space, action_space).float().to(device) target_value_critic = create_target_network(value_critic).float().to( device) value_critic_optimiser = optim.Adam(value_critic.parameters(), lr=LEARNING_RATE) else: target_critic_1 = create_target_network(critic_1) target_critic_2 = create_target_network(critic_2) actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE) critics_optimiser = optim.Adam(list(critic_1.parameters()) + list(critic_2.parameters()), lr=LEARNING_RATE) # Replay buffer if PRIORITIZE_REPLAY: # D = PrioritizedReplayBuffer(REPLAY_SIZE, ALPHA) D = ReplayMemory(device, 3, DISCOUNT, 1, BETA, ALPHA, REPLAY_SIZE) else: D = deque(maxlen=REPLAY_SIZE) eval_ = evaluation_sac(env, logdir, device) #Automatic entropy tuning init target_entropy = -np.prod(action_space).item() log_alpha = torch.zeros(1, requires_grad=True, device=device) alpha_optimizer = optim.Adam([log_alpha], lr=LEARNING_RATE) home = os.path.expanduser('~') if DEMONSTRATIONS: dir_dem = os.path.join(home, 'robotics_drl/data/demonstrations/', DEMONSTRATIONS) D, n_demonstrations = load_buffer_demonstrations( D, dir_dem, PRIORITIZE_REPLAY, OBSERVATION_LOW) else: n_demonstrations = 0 if not BEHAVIOR_CLONING: behavior_loss = 0 os.mkdir(os.path.join(home, 'robotics_drl', logdir, 'models')) dir_models = os.path.join(home, 'robotics_drl', logdir, 'models') state, done = env.reset(), False if OBSERVATION_LOW: state = state.float().to(device) else: state['low'] = state['low'].float() state['high'] = state['high'].float() pbar = tqdm(range(1, MAX_STEPS + 1), unit_scale=1, smoothing=0) steps = 
0 success = 0 for step in pbar: with torch.no_grad(): if step < UPDATE_START and not DEMONSTRATIONS: # To improve exploration take actions sampled from a uniform random distribution over actions at the start of training action = torch.tensor(env.sample_action(), dtype=torch.float32, device=device).unsqueeze(dim=0) else: # Observe state s and select action a ~ μ(a|s) if not OBSERVATION_LOW: state['low'] = state['low'].float().to(device) state['high'] = state['high'].float().to(device) action, _ = actor(state, log_prob=False, deterministic=False) if not OBSERVATION_LOW: state['low'] = state['low'].float().cpu() state['high'] = state['high'].float().cpu() #if (policy.mean).mean() > 0.4: # print("GOOD VELOCITY") # Execute a in the environment and observe next state s', reward r, and done signal d to indicate whether s' is terminal next_state, reward, done = env.step( action.squeeze(dim=0).cpu().tolist()) if OBSERVATION_LOW: next_state = next_state.float().to(device) else: next_state['low'] = next_state['low'].float() next_state['high'] = next_state['high'].float() # Store (s, a, r, s', d) in replay buffer D if PRIORITIZE_REPLAY: if OBSERVATION_LOW: D.add(state.cpu().tolist(), action.cpu().squeeze().tolist(), reward, next_state.cpu().tolist(), done) else: D.append(state['high'], state['low'], action.cpu().squeeze().tolist(), reward, done) else: D.append({ 'state': state.unsqueeze(dim=0) if OBSERVATION_LOW else state, 'action': action, 'reward': torch.tensor([reward], dtype=torch.float32, device=device), 'next_state': next_state.unsqueeze( dim=0) if OBSERVATION_LOW else next_state, 'done': torch.tensor([True if reward == 1 else False], dtype=torch.float32, device=device) }) state = next_state # If s' is terminal, reset environment state steps += 1 if done or steps > step_limit: #TODO: incorporate step limit in the environment eval_c2 = True #TODO: multiprocess pyrep with a session for each testing and training steps = 0 if OBSERVATION_LOW: state = env.reset().float().to(device) else: state = env.reset() state['low'] = state['low'].float() state['high'] = state['high'].float() if reward == 1: success += 1 if step > UPDATE_START and step % UPDATE_INTERVAL == 0: for _ in range(1): # Randomly sample a batch of transitions B = {(s, a, r, s', d)} from D if PRIORITIZE_REPLAY: if OBSERVATION_LOW: state_batch, action_batch, reward_batch, state_next_batch, done_batch, weights_pr, idxes = D.sample( BATCH_SIZE, BETA) state_batch = torch.from_numpy(state_batch).float().to( device) next_state_batch = torch.from_numpy( state_next_batch).float().to(device) action_batch = torch.from_numpy( action_batch).float().to(device) reward_batch = torch.from_numpy( reward_batch).float().to(device) done_batch = torch.from_numpy(done_batch).float().to( device) weights_pr = torch.from_numpy(weights_pr).float().to( device) else: idxes, high_state_batch, low_state_batch, action_batch, reward_batch, high_state_next_batch, low_state_next_batch, done_batch, weights_pr = D.sample( BATCH_SIZE) state_batch = { 'low': low_state_batch.float().to(device).view(-1, 32), 'high': high_state_batch.float().to(device).view( -1, 12, 128, 128) } next_state_batch = { 'low': low_state_next_batch.float().to(device).view( -1, 32), 'high': high_state_next_batch.float().to(device).view( -1, 12, 128, 128) } action_batch = action_batch.float().to(device) reward_batch = reward_batch.float().to(device) done_batch = done_batch.float().to(device) weights_pr = weights_pr.float().to(device) # for j in range(BATCH_SIZE): # new_state_batch['high'] = 
torch.cat((new_state_batch['high'], state_batch[j].tolist()['high'].view(-1,(3+1)*env.frames,128,128)), dim=0) # new_state_batch['low'] = torch.cat((new_state_batch['low'], state_batch[j].tolist()['low'].view(-1,32)), dim=0) # new_next_state_batch['high'] = torch.cat((new_next_state_batch['high'], state_next_batch[j].tolist()['high'].view(-1,(3+1)*env.frames,128,128)), dim=0) # new_next_state_batch['low'] = torch.cat((new_next_state_batch['low'], state_next_batch[j].tolist()['low'].view(-1,32)), dim=0) # new_state_batch['high'] = new_state_batch['high'].to(device) # new_state_batch['low'] = new_state_batch['low'].to(device) # new_next_state_batch['high'] = new_next_state_batch['high'].to(device) # new_next_state_batch['low'] = new_next_state_batch['low'].to(device) batch = { 'state': state_batch, 'action': action_batch, 'reward': reward_batch, 'next_state': next_state_batch, 'done': done_batch } state_batch = [] state_next_batch = [] else: batch = random.sample(D, BATCH_SIZE) state_batch = [] action_batch = [] reward_batch = [] state_next_batch = [] done_batch = [] for d in batch: state_batch.append(d['state']) action_batch.append(d['action']) reward_batch.append(d['reward']) state_next_batch.append(d['next_state']) done_batch.append(d['done']) batch = { 'state': torch.cat(state_batch, dim=0), 'action': torch.cat(action_batch, dim=0), 'reward': torch.cat(reward_batch, dim=0), 'next_state': torch.cat(state_next_batch, dim=0), 'done': torch.cat(done_batch, dim=0) } action, log_prob = actor(batch['state'], log_prob=True, deterministic=False) #Automatic entropy tuning alpha_loss = -( log_alpha.float() * (log_prob + target_entropy).float().detach()).mean() alpha_optimizer.zero_grad() alpha_loss.backward() alpha_optimizer.step() alpha = log_alpha.exp() weighted_sample_entropy = (alpha.float() * log_prob).view( -1, 1) # Compute targets for Q and V functions if VALUE_FNC: y_q = batch['reward'] + DISCOUNT * ( 1 - batch['done']) * target_value_critic( batch['next_state']) y_v = torch.min( critic_1(batch['state']['low'], action.detach()), critic_2(batch['state']['low'], action.detach()) ) - weighted_sample_entropy.detach() else: # No value function network with torch.no_grad(): next_actions, next_log_prob = actor( batch['next_state'], log_prob=True, deterministic=False) target_qs = torch.min( target_critic_1( batch['next_state']['low'] if not OBSERVATION_LOW else batch['next_state'], next_actions), target_critic_2( batch['next_state']['low'] if not OBSERVATION_LOW else batch['next_state'], next_actions)) - alpha * next_log_prob y_q = batch['reward'] + DISCOUNT * ( 1 - batch['done']) * target_qs.detach() td_error_critic1 = critic_1( batch['state']['low'] if not OBSERVATION_LOW else batch['state'], batch['action']) - y_q td_error_critic2 = critic_2( batch['state']['low'] if not OBSERVATION_LOW else batch['state'], batch['action']) - y_q q_loss = (td_error_critic1).pow(2).mean() + ( td_error_critic2).pow(2).mean() # q_loss = (F.mse_loss(critic_1(batch['state'], batch['action']), y_q) + F.mse_loss(critic_2(batch['state'], batch['action']), y_q)).mean() critics_optimiser.zero_grad() q_loss.backward() critics_optimiser.step() # Compute priorities, taking demonstrations into account if PRIORITIZE_REPLAY: td_error = weights_pr * (td_error_critic1.detach() + td_error_critic2.detach()).mean() action_dem = torch.tensor([]).to(device) if OBSERVATION_LOW: state_dem = torch.tensor([]).to(device) else: state_dem = { 'low': torch.tensor([]).float().to(device), 'high': torch.tensor([]).float().to(device) } priorities 
= torch.abs(td_error).tolist() i = 0 count_dem = 0 for idx in idxes: priorities[i] += epsilon if idx < n_demonstrations: priorities[i] += epsilon_d count_dem += 1 if BEHAVIOR_CLONING: action_dem = torch.cat( (action_dem, batch['action'][i].view( 1, -1)), dim=0) if OBSERVATION_LOW: state_dem = torch.cat( (state_dem, batch['state'][i].view( 1, -1)), dim=0) else: state_dem['high'] = torch.cat( (state_dem['high'], batch['state']['high'][i, ].view( -1, (3 + 1) * env.frames, 128, 128)), dim=0) state_dem['low'] = torch.cat( (state_dem['low'], batch['state']['low'][i, ].view( -1, 32)), dim=0) i += 1 if not action_dem.nelement() == 0: actual_action_dem, _ = actor(state_dem, log_prob=False, deterministic=True) # q_value_actor = (critic_1(batch['state'][i], batch['action'][i]) + critic_2(batch['state'][i], batch['action'][i]))/2 # q_value_actual = (critic_1(batch['state'][i], actual_action_dem) + critic_2(batch['state'][i], actual_action_dem))/2 # if q_value_actor > q_value_actual: # Q Filter behavior_loss = F.mse_loss( action_dem, actual_action_dem).unsqueeze(dim=0) else: behavior_loss = 0 D.update_priorities(idxes, priorities) lambda_bc = (count_dem / BATCH_SIZE) / 5 # Update V-function by one step of gradient descent if VALUE_FNC: v_loss = (value_critic(batch['state']) - y_v).pow(2).mean().to(device) value_critic_optimiser.zero_grad() v_loss.backward() value_critic_optimiser.step() # Update policy by one step of gradient ascent with torch.no_grad(): new_qs = torch.min( critic_1( batch["state"]['low'] if not OBSERVATION_LOW else batch['state'], action), critic_2( batch["state"]['low'] if not OBSERVATION_LOW else batch['state'], action)) policy_loss = lambda_ac * (weighted_sample_entropy.view( -1) - new_qs).mean().to(device) + lambda_bc * behavior_loss actor_optimiser.zero_grad() policy_loss.backward() actor_optimiser.step() # Update target value network if VALUE_FNC: update_target_network(value_critic, target_value_critic, POLYAK_FACTOR) else: update_target_network(critic_1, target_critic_1, POLYAK_FACTOR) update_target_network(critic_2, target_critic_2, POLYAK_FACTOR) state_dem = [] # Continues to sample transitions till episode is done and evaluation is on if step > UPDATE_START and step % TEST_INTERVAL == 0: eval_c = True else: eval_c = False if eval_c == True and eval_c2 == True: eval_c = False eval_c2 = False actor.eval() critic_1.eval() critic_2.eval() q_value_eval = eval_.get_qvalue(critic_1, critic_2) return_ep, steps_ep = eval_.sample_episode(actor) logz.log_tabular('Training steps', step) logz.log_tabular('Cumulative Success', success) logz.log_tabular('Validation return', return_ep.mean()) logz.log_tabular('Validation steps', steps_ep.mean()) logz.log_tabular('Validation return std', return_ep.std()) logz.log_tabular('Validation steps std', steps_ep.std()) logz.log_tabular('Q-value evaluation', q_value_eval) logz.log_tabular('Q-network loss', q_loss.detach().cpu().numpy()) if VALUE_FNC: logz.log_tabular('Value-network loss', v_loss.detach().cpu().numpy()) logz.log_tabular('Policy-network loss', policy_loss.detach().cpu().squeeze().numpy()) logz.log_tabular('Alpha loss', alpha_loss.detach().cpu().numpy()) logz.log_tabular('Alpha', alpha.detach().cpu().squeeze().numpy()) logz.log_tabular('Demonstrations current batch', count_dem) logz.dump_tabular() logz.save_pytorch_model(actor.state_dict()) torch.save(actor.state_dict(), os.path.join(dir_models, 'actor_model_%s.pkl' % (step))) torch.save( critic_1.state_dict(), os.path.join(dir_models, 'critic1_model_%s.pkl' % (step))) torch.save( 
critic_2.state_dict(), os.path.join(dir_models, 'critic2_model_%s.pkl' % (step))) #pbar.set_description('Step: %i | Reward: %f' % (step, return_ep.mean())) actor.train() critic_1.train() critic_2.train() env.terminate()
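The automatic entropy tuning inside the training loop above can be exercised in isolation. A minimal sketch on dummy log-probabilities; the action_dim, batch size, and learning rate are illustrative, and the -|A| target entropy mirrors the -np.prod(action_space) heuristic used above.

import torch
from torch import optim

action_dim = 3
log_prob = -torch.rand(8, 1)             # stand-in for log pi(a|s) from the actor
target_entropy = -float(action_dim)      # common heuristic: -|A|

log_alpha = torch.zeros(1, requires_grad=True)
alpha_optimizer = optim.Adam([log_alpha], lr=3e-4)

# push alpha up when entropy is below target, down when above
alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
alpha_optimizer.zero_grad()
alpha_loss.backward()
alpha_optimizer.step()
alpha = log_alpha.exp()                  # temperature weighting the entropy term
print(alpha.item())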
class Agent(object): ''' Implementation of a DDPG agent that interacts with and learns from the environment ''' def __init__(self, state_size, action_size, rand_seed, meta_agent): '''Initialize an Agent object. :param state_size: int. dimension of each state :param action_size: int. dimension of each action :param rand_seed: int. random seed :param meta_agent: MetaAgent object. supplies the shared replay memory and number of agents ''' self.action_size = action_size self.__name__ = 'DDPG' # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, rand_seed).to(DEVC) self.actor_target = Actor(state_size, action_size, rand_seed).to(DEVC) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, meta_agent.nb_agents, rand_seed).to(DEVC) self.critic_target = Critic(state_size, action_size, meta_agent.nb_agents, rand_seed).to(DEVC) # NOTE: the decay corresponds to L2 regularization self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=LR_CRITIC) # , weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, rand_seed) # Replay memory self.memory = meta_agent.memory # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done, others_states, others_actions, others_next_states): self.memory.add(state, action, reward, next_state, done, others_states, others_actions, others_next_states) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: # source: Sample a random minibatch of N transitions from R experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, states, add_noise=True): '''Returns actions for given states as per current policy. :param states: array_like. current states :param add_noise: Boolean. If should add noise to the action ''' states = torch.from_numpy(states).float().to(DEVC) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): ''' Update policy and value params using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value :param experiences: Tuple[torch.Tensor]. tuple of (s, a, r, s', done) :param gamma: float. discount factor ''' (states, actions, rewards, next_states, dones, others_states, others_actions, others_next_states) = experiences # rewards_ = torch.clamp(rewards, min=-1., max=1.) 
rewards_ = rewards all_states = torch.cat((states, others_states), dim=1).to(DEVC) all_actions = torch.cat((actions, others_actions), dim=1).to(DEVC) all_next_states = torch.cat((next_states, others_next_states), dim=1).to(DEVC) # --------------------------- update critic --------------------------- # Get predicted next-state actions and Q values from target models l_all_next_actions = [] l_all_next_actions.append(self.actor_target(states)) l_all_next_actions.append(self.actor_target(others_states)) all_next_actions = torch.cat(l_all_next_actions, dim=1).to(DEVC) Q_targets_next = self.critic_target(all_next_states, all_next_actions) # Compute Q targets for current states (y_i) Q_targets = rewards_ + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss: L = 1/N SUM{(y_i - Q(s_i, a_i|θ_Q))^2} Q_expected = self.critic_local(all_states, all_actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # --------------------------- update actor --------------------------- # Compute actor loss this_actions_pred = self.actor_local(states) others_actions_pred = self.actor_local(others_states) others_actions_pred = others_actions_pred.detach() actions_pred = torch.cat((this_actions_pred, others_actions_pred), dim=1).to(DEVC) actor_loss = -self.critic_local(all_states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ---------------------- update target networks ---------------------- # Update the critic target networks # Update the actor target networks self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): '''Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target :param local_model: PyTorch model. weights will be copied from :param target_model: PyTorch model. weights will be copied to :param tau: float. interpolation parameter ''' iter_params = zip(target_model.parameters(), local_model.parameters()) for target_param, local_param in iter_params: tensor_aux = tau * local_param.data + (1.0 - tau) * target_param.data target_param.data.copy_(tensor_aux)
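The soft_update method above is Polyak averaging: each call moves the target network a fraction tau toward the local network. A runnable miniature on two stand-in linear layers:

import torch
import torch.nn as nn

local, target = nn.Linear(4, 2), nn.Linear(4, 2)
tau = 0.01  # target moves 1% of the way toward local per call

with torch.no_grad():
    for t_param, l_param in zip(target.parameters(), local.parameters()):
        t_param.copy_(tau * l_param + (1.0 - tau) * t_param)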
import random import torch from collections import deque from torch import optim from tqdm import tqdm from env import Env from models import Actor, Critic, create_target_network, update_target_network from utils import plot max_steps, update_start, update_interval, batch_size, discount, policy_delay, polyak_rate = 100000, 10000, 4, 128, 0.99, 2, 0.995 env = Env() actor = Actor() critic_1 = Critic(state_action=True) critic_2 = Critic(state_action=True) target_actor = create_target_network(actor) target_critic_1 = create_target_network(critic_1) target_critic_2 = create_target_network(critic_2) actor_optimiser = optim.Adam(actor.parameters(), lr=1e-3) critics_optimiser = optim.Adam(list(critic_1.parameters()) + list(critic_2.parameters()), lr=1e-3) D = deque(maxlen=10000) state, done, total_reward = env.reset(), False, 0 pbar = tqdm(range(1, max_steps + 1), unit_scale=1, smoothing=0) for step in pbar: with torch.no_grad(): if step < update_start: # To improve exploration take actions sampled from a uniform random distribution over actions at the start of training action = torch.tensor([[2 * random.random() - 1]]) else: # Observe state s and select action a = clip(μ(s) + ε, a_low, a_high) action = torch.clamp(actor(state) + 0.1 * torch.randn(1, 1), min=-1, max=1)
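create_target_network is imported from models above but not shown. A plausible minimal implementation, assuming it simply deep-copies the network and freezes the copy's parameters so the optimiser never touches them:

import copy
import torch.nn as nn

def create_target_network(network):
    # independent copy whose parameters are excluded from gradient updates
    target = copy.deepcopy(network)
    for param in target.parameters():
        param.requires_grad = False
    return target

critic = nn.Linear(3, 1)           # stand-in for a real critic network
target_critic = create_target_network(critic)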
class agent(object): def __init__(self, state_size, action_size, num_agents, lr, seed): """ Twin Delayed DDPG agent. Arguments: state_size : Size of the state from environment. action_size : Size of the action taken by the agent num_agents : Total number of agents lr : Common learning rate for both agents seed : Seed value for reproducibility """ #Initialization of local and target actor networks self.actor = TD3Policy(state_size, action_size, seed) self.actor_opt = opt.Adam(self.actor.parameters(), lr=lr) self.actor_target = TD3Policy(state_size, action_size, seed) #Initialization of local and target critic networks self.critic = Critic(state_size, action_size, seed) self.critic_opt = opt.Adam(self.critic.parameters(), lr=lr) self.critic_target = Critic(state_size, action_size, seed) #Agent hyper parameters self.num_agents = num_agents self.policy_update = 2 self.step = 0 self.noise_clip = 0.5 self.policy_noise = 0.2 self.gamma = 0.998 self.TAU = 0.005 self.batch_size = 64 #Replay buffer for 'COMPETE' mode self.memory = ReplayBuffer(int(1e6), self.batch_size) #Hard update of local networks into target networks self.hard_update(self.critic, self.critic_target) self.hard_update(self.actor, self.actor_target) def act(self, state, add_noise=True): """ Returns action for the given state Argument: state : State vector containing state variables """ state = torch.from_numpy(state).float() self.actor.eval() with torch.no_grad(): actions = self.actor(state).cpu().data.numpy() self.actor.train() return actions def update(self, experience): """ Updates the agent's replay buffer and trains the agent in 'COMPETE' mode. Arguments: experience : Tuple containing current experience """ self.memory.add(experience) if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.train(experiences) def train(self, observations): """ Trains the agent using the sampled experiences Arguments: observations : Batch of experience tuples sampled from the replay buffer """ states, actions, rewards, next_states, dones = observations #Step counter is updated at each timestep for the delayed policy update self.step = (self.step + 1) % self.policy_update """ Random noise is added to the Q_target. This encourages the agent to explore more. Minimum of the clipped Q values is used to reduce the overestimation behaviour of DDPG """ with torch.no_grad(): noise = (torch.randn_like(actions) * self.policy_noise).clamp( -self.noise_clip, self.noise_clip) next_actions = (self.actor_target(next_states) + noise).clamp( -1, 1) Q1_target, Q2_target = self.critic_target(next_states, next_actions) Q_target = torch.min(Q1_target, Q2_target) Q_target = rewards + (self.gamma * Q_target * (1 - dones)) Q1_expected, Q2_expected = self.critic(states, actions) critic_loss = F.mse_loss(Q1_expected, Q_target) + F.mse_loss( Q2_expected, Q_target) #Updating local critic network self.critic_opt.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1) self.critic_opt.step() """ The policy is updated every self.policy_update timesteps, i.e., if policy_update is set to 2, the policy network is updated once for every two critic updates. This stabilizes learning and reduces overestimation. 
""" if self.step == 0: expected_actions = self.actor(states) actor_loss = -self.critic.Q1(states, expected_actions).mean() #Updating local self.actor_opt.zero_grad() actor_loss.backward() self.actor_opt.step() #Soft update of actor and critic target network using TAU self.soft_update(self.actor, self.actor_target) self.soft_update(self.critic, self.critic_target) def soft_update(self, local, target): """ Soft update of target network parameters using TAU """ for l, t in zip(local.parameters(), target.parameters()): t.data.copy_(self.TAU * l.data + (1 - self.TAU) * t.data) def hard_update(self, local, target): """ Hard update which copies local network parameters to target network """ for l, t in zip(local.parameters(), target.parameters()): t.data.copy_(l.data)
torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = True train_loader, vocab = load(args.batch_size, args.seq_len) autoencoder = Autoencoder(args.enc_hidden_dim, args.dec_hidden_dim, args.embedding_dim, args.latent_dim, vocab.size(), args.dropout, args.seq_len) autoencoder.load_state_dict( torch.load('autoencoder.th', map_location=lambda x, y: x)) generator = Generator(args.n_layers, args.block_dim) critic = Critic(args.n_layers, args.block_dim) g_optimizer = optim.Adam(generator.parameters(), lr=args.lr) c_optimizer = optim.Adam(critic.parameters(), lr=args.lr) if args.cuda: autoencoder = autoencoder.cuda() generator = generator.cuda() critic = critic.cuda() print('G Parameters:', sum([p.numel() for p in generator.parameters() if \ p.requires_grad])) print('C Parameters:', sum([p.numel() for p in critic.parameters() if \ p.requires_grad])) best_loss = np.inf for epoch in range(1, args.epochs + 1): g_loss, c_loss = train(epoch)
class DDPGAgent: def __init__(self, env, gamma, tau, buffer_maxlen, batch_size, critic_learning_rate, actor_learning_rate, update_per_step, seed): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") # hyperparameters self.num_replay_updates_per_step = update_per_step self.batch_size = batch_size self.gamma = gamma self.tau = tau # initialize actor and critic networks self.critic = Critic(env.observation_space.shape[0], env.action_space.shape[0], seed).to(self.device) self.critic_target = Critic(env.observation_space.shape[0], env.action_space.shape[0], seed).to(self.device) self.actor = Actor(env.observation_space.shape[0], env.action_space.shape[0], seed).to(self.device) self.actor_target = Actor(env.observation_space.shape[0], env.action_space.shape[0], seed).to(self.device) # optimizers self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate) self.buffer = ReplayBuffer(buffer_maxlen, batch_size, seed) self.noise = OUNoise(env.action_space.shape[0]) def get_action(self, state): state = torch.FloatTensor(state).to(self.device) self.actor.eval() with torch.no_grad(): action = self.actor(state) self.actor.train() action = action.cpu().numpy() return action def step(self, state, action, reward, next_state, done): # Save experience in replay buffer self.buffer.add(state, action, reward, next_state, done) q_loss, policy_loss = None, None # If enough samples are available in buffer, get random subset and learn if len(self.buffer) >= self.batch_size: # update the network "num_replay_updates_per_step" times in each step for _ in range(self.num_replay_updates_per_step): experiences = self.buffer.sample() q_loss, policy_loss = self.learn(experiences) q_loss = q_loss.detach().item() policy_loss = policy_loss.detach().item() return q_loss, policy_loss def learn(self, experiences): """Updating actor and critic parameters based on sampled experiences from replay buffer.""" states, actions, rewards, next_states, dones = experiences curr_Q = self.critic(states, actions) next_actions = self.actor_target(next_states).detach() next_Q = self.critic_target(next_states, next_actions).detach() target_Q = rewards + self.gamma * next_Q * (1 - dones) # losses q_loss = F.mse_loss(curr_Q, target_Q) policy_loss = -self.critic(states, self.actor(states)).mean() # update actor self.actor_optimizer.zero_grad() policy_loss.backward() self.actor_optimizer.step() # update critic self.critic_optimizer.zero_grad() q_loss.backward() self.critic_optimizer.step() # update target networks for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()): target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau)) for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()): target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau)) return q_loss, policy_loss
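OUNoise is used throughout this collection but never defined in it. A minimal Ornstein-Uhlenbeck process matching the OUNoise(size) / sample() / reset() interface assumed above; the theta and sigma values are common defaults, not taken from the original code.

import numpy as np

class OUNoise:
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta, self.sigma = theta, sigma
        self.reset()

    def reset(self):
        # restart the process at its mean
        self.state = self.mu.copy()

    def sample(self):
        # mean-reverting drift plus Gaussian diffusion
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state

noise = OUNoise(2)
print(noise.sample())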
class NPG: def __init__(self, obs_space, action_space, hidden_dim=64): self.npg = NPGNetwork(obs_space, action_space, hidden_dim) # self.actor = Actor(obs_space, action_space, hidden_size=hidden_dim) # self.q = Critic(obs_space, action_space=action_space, hidden_size=hidden_dim) self.v = Critic(obs_space, action_space=1, hidden_size=hidden_dim) self.adv = Critic(obs_space, action_space=action_space, hidden_size=hidden_dim) #self.opt_adv = PKTDOptimizer( # list(self.adv.parameters()) #) self.opt_critic = optim.Adam(self.v.parameters()) self.opt_adv = optim.Adam(self.adv.parameters()) self.tau = 1. self.beta = 10. self.gamma = .99 self.name = f"npg" self.batch_stats = RunningMeanStd() def act(self, s): with torch.no_grad(): pi = self.npg(s) v = self.v(s).squeeze() a = pi.sample().squeeze() return a, v, 0 def update(self, batch): s, a, r, s1, done, w, R = batch v_next = self.v(s1).squeeze() v = self.v(s).squeeze() adv = self.adv(s) a = F.one_hot(a, adv.shape[1]) adv = (a * adv).sum(dim=1) #(adv + v - self.gamma * (1 - done) * v_next).mean().backward() # gradient #innovation = (r - adv - self.gamma * (1 - done) * v_next).mean().detach() self.opt_adv.zero_grad() td = r + (1 - done) * self.gamma * v_next - v adv_loss = (.5 * (td.detach() - adv).pow(2)).mean() adv_loss.backward() self.opt_adv.step() #adv.mean().backward() #adv_norm = torch.stack([p.grad.norm() for p in self.adv.parameters()]).norm() #td = r + v_next - v #innovation = (td - adv).mean().detach() #opt_stats = self.opt_adv.step(innovation=innovation) td_loss = .5 * (td.pow(2)).mean() #td = r + v_next - v #adv_loss = (.5 * (td.detach() - adv).pow(2)).mean() #td_loss = .5 * (td.pow(2).mean()) self.opt_critic.zero_grad() td_loss.backward() self.opt_critic.step() with torch.no_grad(): pi_old = self.npg(s) _update_target_soft_(self.npg.parameters(), src=self.adv.parameters(), tau=1e-4) with torch.no_grad(): pi = self.npg(s) kl = torch.distributions.kl_divergence(pi, pi_old) stats = { "pi/q_loss": adv_loss, #"pi/innovation":innovation, "pi/v_loss": td_loss, "pi/entropy": pi.entropy().mean(), "pi/kl": kl.mean(), #"pi/adv_grad_norm":adv_norm } #stats.update(opt_stats) return stats
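The update above extracts the advantage of the taken action, A(s, a), by one-hot masking the advantage head's output. The same selection isolated on random tensors (shapes illustrative):

import torch
import torch.nn.functional as F

adv = torch.randn(5, 3)             # advantage head output: 5 states, 3 discrete actions
a = torch.tensor([0, 2, 1, 2, 0])   # actions actually taken
# one-hot mask zeroes out every column except the chosen action's
selected = (F.one_hot(a, adv.shape[1]) * adv).sum(dim=1)
print(selected.shape)                # torch.Size([5])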
class DDPG(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, hyper, num_agents, memory): self.action_size = action_size self.num_agents = num_agents self.hyper = hyper # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=hyper['LR_ACTOR']) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, num_agents, random_seed).to(device) self.critic_target = Critic(state_size, action_size, num_agents, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=hyper['LR_CRITIC']) #, weight_decay=hyper['WEIGHT_DECAY']) # Noise process self.noise = OUNoise(action_size, random_seed) self.t = 0 self.memory = memory def step(self, state, action, reward, next_state, done, others_states,others_actions, others_next_states): self.memory.add(state, action, reward, next_state, done, others_states, others_actions, others_next_states) self.t = (self.t + 1) % self.hyper['UPDATE_EVERY'] if self.t == 0: if len(self.memory) > self.hyper['BATCH_SIZE']: experiences = self.memory.sample() self.learn(experiences, self.hyper['GAMMA']) def act(self, states, add_noise=True): states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): (states, actions, rewards, next_states, dones, others_states, others_actions, others_next_states) = experiences rewards_ = rewards all_states = torch.cat((states, others_states), dim=1).to(device) all_actions = torch.cat((actions, others_actions), dim=1).to(device) all_next_states = torch.cat((next_states, others_next_states), dim=1).to(device) # --------------------------- update critic --------------------------- l_all_next_actions = [] l_all_next_actions.append(self.actor_target(states)) l_all_next_actions.append(self.actor_target(others_states)) all_next_actions = torch.cat(l_all_next_actions, dim=1).to(device) Q_targets_next = self.critic_target(all_next_states, all_next_actions) Q_targets = rewards_ + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.critic_local(all_states, all_actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # --------------------------- update actor --------------------------- this_actions_pred = self.actor_local(states) others_actions_pred = self.actor_local(others_states) others_actions_pred = others_actions_pred.detach() actions_pred = torch.cat((this_actions_pred, others_actions_pred), dim=1).to(device) actor_loss = -self.critic_local(all_states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ---------------------- update target networks ---------------------- self.soft_update(self.critic_local, self.critic_target, self.hyper['TAU']) self.soft_update(self.actor_local, self.actor_target, self.hyper['TAU']) def soft_update(self, local_model, target_model, tau): iter_params = zip(target_model.parameters(), local_model.parameters()) for target_param, 
local_param in iter_params: tensor_aux = tau*local_param.data + (1.0-tau)*target_param.data target_param.data.copy_(tensor_aux)
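The learn() method above feeds a centralized critic by concatenating both agents' observations and actions along the feature dimension. A shape-only sketch with illustrative dimensions (24-d states, 2-d actions, batch of 32):

import torch

states = torch.randn(32, 24)
others_states = torch.randn(32, 24)
actions = torch.randn(32, 2)
others_actions = torch.randn(32, 2)

# the critic conditions on everything both agents saw and did
all_states = torch.cat((states, others_states), dim=1)     # (32, 48)
all_actions = torch.cat((actions, others_actions), dim=1)  # (32, 4)
print(all_states.shape, all_actions.shape)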
class Agent: def __init__(self, n_states, n_actions, n_goals, action_bounds, capacity, env, k_future, batch_size, action_size=1, tau=0.05, actor_lr=1e-3, critic_lr=1e-3, gamma=0.98): self.device = device("cpu") self.n_states = n_states self.n_actions = n_actions self.n_goals = n_goals self.k_future = k_future self.action_bounds = action_bounds self.action_size = action_size self.env = env self.actor = Actor(self.n_states, n_actions=self.n_actions, n_goals=self.n_goals).to(self.device) self.critic = Critic(self.n_states, action_size=self.action_size, n_goals=self.n_goals).to(self.device) self.sync_networks(self.actor) self.sync_networks(self.critic) self.actor_target = Actor(self.n_states, n_actions=self.n_actions, n_goals=self.n_goals).to(self.device) self.critic_target = Critic(self.n_states, action_size=self.action_size, n_goals=self.n_goals).to(self.device) self.init_target_networks() self.tau = tau self.gamma = gamma self.capacity = capacity self.memory = Memory(self.capacity, self.k_future, self.env) self.batch_size = batch_size self.actor_lr = actor_lr self.critic_lr = critic_lr self.actor_optim = Adam(self.actor.parameters(), self.actor_lr) self.critic_optim = Adam(self.critic.parameters(), self.critic_lr) self.state_normalizer = Normalizer(self.n_states[0], default_clip_range=5) self.goal_normalizer = Normalizer(self.n_goals, default_clip_range=5) def choose_action(self, state, goal, train_mode=True): #takes state and goal, concatenates them and passes the result to the actor network #the actor returns an action, to which exploration noise is then added state = self.state_normalizer.normalize(state) goal = self.goal_normalizer.normalize(goal) state = np.expand_dims(state, axis=0) goal = np.expand_dims(goal, axis=0) with torch.no_grad(): x = np.concatenate([state, goal], axis=1) x = from_numpy(x).float().to(self.device) action = self.actor(x)[0].cpu().data.numpy() if train_mode: action += 0.2 * np.random.randn(self.n_actions) action = np.clip(action, self.action_bounds[0], self.action_bounds[1]) random_actions = np.random.uniform(low=self.action_bounds[0], high=self.action_bounds[1], size=self.n_actions) action += np.random.binomial(1, 0.3, 1)[0] * (random_actions - action) return action def store(self, mini_batch): for batch in mini_batch: self.memory.add(batch) self._update_normalizer(mini_batch) def init_target_networks(self): self.hard_update_networks(self.actor, self.actor_target) self.hard_update_networks(self.critic, self.critic_target) @staticmethod def hard_update_networks(local_model, target_model): target_model.load_state_dict(local_model.state_dict()) @staticmethod def soft_update_networks(local_model, target_model, tau=0.05): for t_params, e_params in zip(target_model.parameters(), local_model.parameters()): t_params.data.copy_(tau * e_params.data + (1 - tau) * t_params.data) def train(self): states, actions, rewards, next_states, goals = self.memory.sample( self.batch_size) states = self.state_normalizer.normalize(states) next_states = self.state_normalizer.normalize(next_states) goals = self.goal_normalizer.normalize(goals) inputs = np.concatenate([states, goals], axis=1) next_inputs = np.concatenate([next_states, goals], axis=1) inputs = torch.Tensor(inputs).to(self.device) rewards = torch.Tensor(rewards).to(self.device) next_inputs = torch.Tensor(next_inputs).to(self.device) actions = torch.Tensor(actions).to(self.device) with torch.no_grad(): #get Qmax target_q = self.critic_target(next_inputs, self.actor_target(next_inputs)) #apply bellman equation on Qmax to get computed Q 
for the (initial state, action) pairs sampled above target_returns = rewards + self.gamma * target_q.detach() target_returns = torch.clamp(target_returns, -1 / (1 - self.gamma), 0) #use critic to generate actual Q for (initial states and actions) q_eval = self.critic(inputs, actions) critic_loss = (target_returns - q_eval).pow(2).mean() a = self.actor(inputs) actor_loss = -self.critic(inputs, a).mean() actor_loss += a.pow(2).mean() self.actor_optim.zero_grad() actor_loss.backward() self.sync_grads(self.actor) self.actor_optim.step() self.critic_optim.zero_grad() critic_loss.backward() self.sync_grads(self.critic) self.critic_optim.step() return actor_loss.item(), critic_loss.item() def save_weights(self): torch.save( { "actor_state_dict": self.actor.state_dict(), "state_normalizer_mean": self.state_normalizer.mean, "state_normalizer_std": self.state_normalizer.std, "goal_normalizer_mean": self.goal_normalizer.mean, "goal_normalizer_std": self.goal_normalizer.std }, "NBM_FetchPickAndPlace_v2.pth") def load_weights(self): checkpoint = torch.load("NBM_FetchPickAndPlace_v2.pth") actor_state_dict = checkpoint["actor_state_dict"] self.actor.load_state_dict(actor_state_dict) state_normalizer_mean = checkpoint["state_normalizer_mean"] self.state_normalizer.mean = state_normalizer_mean state_normalizer_std = checkpoint["state_normalizer_std"] self.state_normalizer.std = state_normalizer_std goal_normalizer_mean = checkpoint["goal_normalizer_mean"] self.goal_normalizer.mean = goal_normalizer_mean goal_normalizer_std = checkpoint["goal_normalizer_std"] self.goal_normalizer.std = goal_normalizer_std def set_to_eval_mode(self): self.actor.eval() # self.critic.eval() def update_networks(self): self.soft_update_networks(self.actor, self.actor_target, self.tau) self.soft_update_networks(self.critic, self.critic_target, self.tau) def _update_normalizer(self, mini_batch): states, goals = self.memory.sample_for_normalization(mini_batch) self.state_normalizer.update(states) self.goal_normalizer.update(goals) self.state_normalizer.recompute_stats() self.goal_normalizer.recompute_stats() @staticmethod def sync_networks(network): comm = MPI.COMM_WORLD flat_params = _get_flat_params_or_grads(network, mode='params') comm.Bcast(flat_params, root=0) _set_flat_params_or_grads(network, flat_params, mode='params') @staticmethod def sync_grads(network): flat_grads = _get_flat_params_or_grads(network, mode='grads') comm = MPI.COMM_WORLD global_grads = np.zeros_like(flat_grads) comm.Allreduce(flat_grads, global_grads, op=MPI.SUM) _set_flat_params_or_grads(network, global_grads, mode='grads')
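The target clamp in train() above relies on per-step rewards lying in [-1, 0], which bounds the discounted return below by -1/(1 - gamma). A quick numeric check for the gamma=0.98 default:

import torch

gamma = 0.98
lower = -1.0 / (1.0 - gamma)                    # = -50 for gamma = 0.98
target_returns = torch.tensor([-120.0, -30.0, 5.0])
# anything below the achievable minimum or above zero is clipped away
print(torch.clamp(target_returns, lower, 0))    # tensor([-50., -30., 0.])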
class D4PGAgent(Agent): """An advanced D4PG agent with an option to run in a simpler DDPG mode. The agent uses a distributional value estimation when running in D4PG mode vs the traditional single value estimation when running in DDPG mode.""" def __init__(self, params): """Initialize an Agent object.""" self.params = params self.update_target_every = params['update_target_every'] self.update_every = params['update_every'] self.actor_update_every_multiplier = params[ 'actor_update_every_multiplier'] self.update_intensity = params['update_intensity'] self.gamma = params['gamma'] self.action_size = params['actor_params']['action_size'] self.num_agents = params['num_agents'] self.num_atoms = params['critic_params']['num_atoms'] self.v_min = params['critic_params']['v_min'] self.v_max = params['critic_params']['v_max'] self.update_target_type = params['update_target_type'] self.device = params['device'] self.name = params['name'] self.lr_reduction_factor = params['lr_reduction_factor'] self.tau = params['tau'] self.d4pg = params['d4pg'] # Distributes the number of atoms across the range of v min and max self.atoms = torch.linspace(self.v_min, self.v_max, self.num_atoms).to(self.device) # Initialize time step count self.t_step = 0 # Active and Target Actor networks self.actor_active = Actor(params['actor_params']).to(device) self.actor_target = Actor(params['actor_params']).to(device) if self.d4pg: # Active and Target D4PG Critic networks self.critic_active = D4PGCritic(params['critic_params']).to(device) self.critic_target = D4PGCritic(params['critic_params']).to(device) else: # Active and Target Critic networks self.critic_active = Critic(params['critic_params']).to(device) self.critic_target = Critic(params['critic_params']).to(device) self.actor_optimizer = optim.Adam(self.actor_active.parameters(), lr=params['actor_params']['lr']) self.critic_optimizer = optim.Adam(self.critic_active.parameters(), lr=params['critic_params']['lr']) self.schedule_lr = params['schedule_lr'] self.lr_steps = 0 # Create learning rate schedulers if required to reduce the learning rate # depending on plateauing of scores if self.schedule_lr: self.actor_scheduler = ReduceLROnPlateau( self.actor_optimizer, mode='max', factor=params['lr_reduction_factor'], patience=params['lr_patience_factor'], verbose=False, ) self.critic_scheduler = ReduceLROnPlateau( self.critic_optimizer, mode='max', factor=params['lr_reduction_factor'], patience=params['lr_patience_factor'], verbose=False, ) print("\n################ ACTOR ################\n") print(self.actor_active) print("\n################ CRITIC ################\n") print(self.critic_active) # Initiate exploration parameters by adding noise to the actions self.noise = params['noise'] # Replay memory self.memory = params['experience_replay'] def act(self, states, add_noise=True, pretrain=False): """Returns actions for given state as per current policy.""" # If pretraining is active, the agent gives a random action thereby encouraging # initial exploration of the state space quickly if pretrain: actions = np.random.uniform(-1., 1., (self.num_agents, self.action_size)) else: with torch.no_grad(): actions = self.actor_active( states.to(device).float()).detach().to('cpu').numpy() if add_noise: noise = self.noise.create_noise(actions.shape) actions += noise actions = np.clip(actions, -1., 1.) 
return actions, self.noise.epsilon def step(self, states, actions, rewards, next_states, dones, pretrain=False): """Save experience in replay memory, and use random sample from buffer to learn.""" self.memory.add((states, actions, rewards, next_states, dones)) self.t_step += 1 if not pretrain: return self.learn_() return None, None def learn_(self): "Learns from experience using a distributional value estimation when in D4PG mode" actor_loss = None critic_loss = None # If enough samples are available in memory and it's time to learn, then learn! if self.memory.ready() and self.t_step % self.update_every == 0: # Learns multiple times with the same set of experience for _ in range(self.update_intensity): # Samples from the replay buffer which has calculated the n step returns in advance # Next state represents the state at the n'th step states, next_states, actions, rewards, dones = self.memory.sample( ) if self.d4pg: atoms = self.atoms.unsqueeze(0) # Calculate log probability distribution using Zw with regards to stored actions log_probs = self.critic_active(states, actions, log=True) # Calculate the projected log probabilities from the target actor and critic networks # Since back propagation is not required, tensors are detached to increase speed target_dist = self._get_targets(rewards, next_states).detach() # The critic loss is calculated using a weighted distribution instead of the mean to # arrive at a more accurate result. Cross Entropy loss is used as it is considered to # be the most ideal for categorical value distributions as utilized in the D4PG critic_loss = -(target_dist * log_probs).sum(-1).mean() else: # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target( next_states, actions_next).detach() # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_active(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Execute gradient descent for the critic self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_active.parameters(), 1) self.critic_optimizer.step() critic_loss = critic_loss.item() # Update actor every x multiples of critic if self.t_step % (self.actor_update_every_multiplier * self.update_every) == 0: if self.d4pg: # Predicts the action for the actor networks loss calculation predicted_action = self.actor_active(states) # Predict the value distribution using the critic with regards to action predicted by actor probs = self.critic_active(states, predicted_action) # Multiply probabilities by atom values and sum across columns to get Q values expected_reward = (probs * atoms).sum(-1) # Calculate the actor network loss (Policy Gradient) # Get the negative of the mean across the expected rewards to do gradient ascent actor_loss = -expected_reward.mean() else: actions_pred = self.actor_active(states) actor_loss = -self.critic_active(states, actions_pred).mean() # Execute gradient ascent for the actor self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() actor_loss = actor_loss.item() # Updates the target networks every n steps if self.t_step % self.update_target_every == 0: self._update_target_networks() # Returns the actor and critic losses to store on tensorboard return actor_loss, critic_loss def _get_targets(self, rewards, next_states): """ Calculate Yᵢ from target networks using the target 
actor and distributed critic networks """ target_actions = self.actor_target(next_states) target_probs = self.critic_target(next_states, target_actions) # Project the categorical distribution projected_probs = self._get_value_distribution(rewards, target_probs) return projected_probs def _get_value_distribution(self, rewards, probs): """ Returns the projected value distribution for the input state/action pair """ delta_z = (self.v_max - self.v_min) / (self.num_atoms - 1) # Rewards were stored with the first reward followed by each of the discounted rewards, sum up the # reward with its discounted reward projected_atoms = rewards.unsqueeze( -1 ) + self.gamma**self.memory.rollout_length * self.atoms.unsqueeze(0) projected_atoms.clamp_(self.v_min, self.v_max) b = (projected_atoms - self.v_min) / delta_z # Floating point imprecision can leave b fractionally above an integer (e.g. # 99.000000001), in which case ceil() would round to 100 instead of 99. # Rounding b to one decimal place first keeps the lower and upper bound # indices correct. precision = 1 b = torch.round(b * 10**precision) / 10**precision lower_bound = b.floor() upper_bound = b.ceil() m_lower = (upper_bound + (lower_bound == upper_bound).float() - b) * probs m_upper = (b - lower_bound) * probs projected_probs = torch.tensor(np.zeros(probs.size())).to(self.device) for idx in range(probs.size(0)): projected_probs[idx].index_add_(0, lower_bound[idx].long(), m_lower[idx].double()) projected_probs[idx].index_add_(0, upper_bound[idx].long(), m_upper[idx].double()) return projected_probs.float()
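The categorical projection in _get_value_distribution can be checked in isolation: each projected distribution should still sum to one. A standalone sketch on random inputs, with illustrative v_min/v_max/num_atoms and n-step values:

import torch

v_min, v_max, num_atoms, gamma, n_steps = -10.0, 10.0, 51, 0.99, 5
atoms = torch.linspace(v_min, v_max, num_atoms)
delta_z = (v_max - v_min) / (num_atoms - 1)

batch = 4
rewards = torch.rand(batch)
probs = torch.softmax(torch.randn(batch, num_atoms), dim=-1)  # stand-in target distribution

# shift atoms by the n-step return, then split each atom's mass between neighbours
projected_atoms = (rewards.unsqueeze(-1) + gamma**n_steps * atoms.unsqueeze(0)).clamp_(v_min, v_max)
b = (projected_atoms - v_min) / delta_z
lower, upper = b.floor(), b.ceil()
m_lower = (upper + (lower == upper).float() - b) * probs
m_upper = (b - lower) * probs

projected_probs = torch.zeros_like(probs)
for i in range(batch):
    projected_probs[i].index_add_(0, lower[i].long(), m_lower[i])
    projected_probs[i].index_add_(0, upper[i].long(), m_upper[i])
print(projected_probs.sum(dim=-1))  # each entry ~ 1.0: the projection conserves mass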
class DDPGAgents(): """ Agent used to interact with and learns from the environment """ def __init__(self, state_size, action_size, config): """ Initialize an agent object """ self.state_size = state_size self.action_size = action_size self.config = config # retrieve number of agents self.num_agents = config["DDPG"]["num_agents"] # logging for this class self.logger = logging.getLogger(self.__class__.__name__) # gpu support self.device = pick_device(config, self.logger) ## Actor local and target networks self.actor_local = Actor(state_size, action_size, config).to(self.device) self.actor_target = Actor(state_size, action_size, config).to(self.device) self.actor_optimizer = getattr( optim, config["optimizer_actor"]["optimizer_type"])( self.actor_local.parameters(), betas=tuple(config["optimizer_actor"]["betas"]), **config["optimizer_actor"]["optimizer_params"]) ## Critic local and target networks self.critic_local = Critic(state_size, action_size, config).to(self.device) self.critic_target = Critic(state_size, action_size, config).to(self.device) self.critic_optimizer = getattr( optim, config["optimizer_critic"]["optimizer_type"])( self.critic_local.parameters(), betas=tuple(config["optimizer_critic"]["betas"]), **config["optimizer_critic"]["optimizer_params"]) ## Noise process self.noise = OUNoise((self.num_agents, action_size)) ## Replay memory self.memory = ReplayBuffer(config=config, action_size=action_size, buffer_size=int( config["DDPG"]["buffer_size"]), batch_size=config["trainer"]["batch_size"]) def step(self, state, action, reward, next_state, done): """ Save experience in replay memory, and use random sample from buffer to learn """ # Save experience in replay memory shared by all agents for agent in range(self.num_agents): self.memory.add(state[agent, :], action[agent, :], reward[agent], next_state[agent, :], done[agent]) # learn every timestep as long as enough samples are available in memory if len(self.memory) > self.config["trainer"]["batch_size"]: experiences = self.memory.sample() self.learn(experiences, self.config["DDPG"]["gamma"]) def act(self, states, add_noise=False): """ Returns actions for given state as per current policy """ # Convert state to tensor states = torch.from_numpy(states).float().to(self.device) # prepare actions numpy array for all agents actions = np.zeros((self.num_agents, self.action_size)) ## Evaluation mode self.actor_local.eval() with torch.no_grad(): # Forward pass of local actor network for agent, state in enumerate(states): action_values = self.actor_local.forward( state).cpu().data.numpy() actions[agent, :] = action_values # pdb.set_trace() ## Training mode self.actor_local.train() if add_noise: # Add noise to improve exploration to our actor policy # action_values += torch.from_numpy(self.noise.sample()).type(torch.FloatTensor).to(self.device) actions += self.noise.sample() # Clip action to stay in the range [-1, 1] for our task actions = np.clip(actions, -1, 1) return actions def learn(self, experiences, gamma): """ Update value parameters using given batch of experience tuples """ states, actions, rewards, next_states, dones = experiences ## Update actor (policy) network using the sampled policy gradient # Compute actor loss actions_pred = self.actor_local.forward(states) actor_loss = -self.critic_local.forward(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() ## Update critic (value) network # Get predicted next-state actions and Q-values from target models 
actions_next = self.actor_target.forward(next_states) Q_targets_next = self.critic_target.forward(next_states, actions_next) # Compute Q-targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q-values from local critic model Q_expected = self.critic_local.forward(states, actions) # Compute loss critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() ## Update target networks with a soft update self.soft_update(self.actor_local, self.actor_target, self.config["DDPG"]["tau"]) self.soft_update(self.critic_local, self.critic_target, self.config["DDPG"]["tau"]) def soft_update(self, local_model, target_model, tau): """ Soft update model parameters, improves the stability of learning """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def reset(self): """ Reset noise """ self.noise.reset()
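# A minimal usage sketch, not part of the original source: how DDPGAgents might be
# driven by an episode loop. The `env` object here is an assumption for
# illustration: it is taken to expose a Gym-style vectorized API returning
# per-agent numpy arrays of shape (num_agents, state_size); a real environment
# wrapper (e.g. a Unity ML-Agents env) differs in detail.
import numpy as np  # assumed already imported at module level

def run_episode(env, agents, max_t=1000):
    """Sketch of one episode using the DDPGAgents API above."""
    states = env.reset()                      # (num_agents, state_size) array
    agents.reset()                            # re-initialize the OU noise process
    scores = np.zeros(agents.num_agents)
    for _ in range(max_t):
        actions = agents.act(states, add_noise=True)                # clipped to [-1, 1]
        next_states, rewards, dones = env.step(actions)             # per-agent arrays
        agents.step(states, actions, rewards, next_states, dones)   # store + learn
        scores += rewards
        states = next_states
        if np.any(dones):
            break
    return scores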
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON

        ### DEFINE THE ACTOR NETWORK ###
        ### INFINITE-STEP BOOTSTRAPPING, THEREFORE HIGH VARIANCE ###
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        ### DEFINE THE CRITIC NETWORK ###
        ### ONE-STEP BOOTSTRAPPING, THEREFORE HIGH BIAS ###
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        ### PROCESS TO CREATE NOISE ###
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use a random sample from the buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn at the defined interval, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep % LEARN_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for the given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.epsilon * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using a given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))

        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q-values from the target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss; clip gradients to stabilize training
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ---------------------------- #
        self.epsilon -= EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
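# The agents above all depend on an OUNoise class that is not shown in this file.
# Below is a minimal sketch of an Ornstein-Uhlenbeck process compatible with the
# calls used here (`sample()` and `reset()`, constructed with either an int
# action_size or a (num_agents, action_size) shape). The mu/theta/sigma defaults
# are common choices, not necessarily the values the original implementations used.
import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, seed=0, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)   # `size` may be an int or a shape tuple
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process by dx = theta*(mu - x) + sigma*N(0, 1) and return the new state."""
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state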