def __init__(self, memory, nb_status, nb_actions, action_noise=None,
             gamma=0.99, tau=0.001, normalize_observations=True,
             batch_size=128, observation_range=(-5., 5.),
             action_range=(-1., 1.), actor_lr=1e-4, critic_lr=1e-3):
    self.nb_status = nb_status
    self.nb_actions = nb_actions
    self.action_range = action_range
    self.observation_range = observation_range
    self.normalize_observations = normalize_observations

    self.actor = Actor(self.nb_status, self.nb_actions)
    self.actor_target = Actor(self.nb_status, self.nb_actions)
    self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr)

    self.critic = Critic(self.nb_status, self.nb_actions)
    self.critic_target = Critic(self.nb_status, self.nb_actions)
    self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr)

    # Create replay buffer
    self.memory = memory  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
    self.action_noise = action_noise

    # Hyper-parameters
    self.batch_size = batch_size
    self.tau = tau
    self.discount = gamma

    if self.normalize_observations:
        self.obs_rms = RunningMeanStd()
    else:
        self.obs_rms = None
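# The constructor above relies on a RunningMeanStd helper for observation
# normalization that is not shown in these snippets. Below is a minimal sketch
# of such a tracker (batched parallel-variance update, as in common DDPG
# baselines); the exact class used here may differ.
import numpy as np

class RunningMeanStd:
    """Tracks a running mean and variance over batches of observations."""

    def __init__(self, shape=(), epsilon=1e-4):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon  # avoids division by zero on the first update

    def update(self, x):
        """Fold a batch of observations (first axis = batch) into the statistics."""
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]

        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + np.square(delta) * self.count * batch_count / total

        self.mean, self.var, self.count = new_mean, m2 / total, total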
def __init__(self, nb_status, nb_actions, args, writer): self.clip_actor_grad = args.clip_actor_grad self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.discrete = args.discrete self.pic = args.pic self.writer = writer self.select_time = 0 if self.pic: self.nb_status = args.pic_status # Create Actor and Critic Network net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'use_bn':args.bn, 'init_method':args.init_method } if args.pic: self.cnn = CNN(1, args.pic_status) self.cnn_target = CNN(1, args.pic_status) self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate) self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) if args.pic: hard_update(self.cnn_target, self.cnn) #Create replay buffer self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda()
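# hard_update (called above to make the target networks start with the same
# weights) and the soft_update used by other examples are small helpers that
# are assumed rather than shown. A plausible minimal version matching how they
# are called, hard_update(target, source) and soft_update(target, source, tau):
def hard_update(target, source):
    """Copy every parameter of `source` into `target`."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    """Polyak update: target <- tau * source + (1 - tau) * target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)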
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
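# Several of these agents reference module-level hyperparameters (BUFFER_SIZE,
# BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY) and a global
# `device` defined elsewhere in their projects. Representative values (the
# originals may differ) and the usual device selection look like this:
import torch

BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0         # critic L2 weight decay

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")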
def buscarActores(pkActores):
    """
    Looks up several actors and their information from a list of the
    primary keys (PKs) of the actors to search for.
    """
    actores = Actor.actores(pkActores)
    return actores
def crearActor(id_actor, nombre, nacimiento, genero):
    """
    Creates an actor and validates the information received.
    @param id_actor: the actor's id
    @param nombre: the actor's name
    @param nacimiento: the actor's date of birth
    @param genero: male or female
    """
    nuevo = Actor()
    nuevo.id_actor = id_actor
    if len(nombre.strip()) == 0:
        mensaje = u"Ingrese nombre del actor"
        return mensaje
    if not nombre.strip().replace(" ", "").isalpha():
        mensaje = u"Nombre del actor no valido"
        return mensaje
    nombre = nombre.strip()
    nuevo.nombre = nombre
    if "Mes" in nacimiento:
        mensaje = u"Ingrese mes de cumpleaños."
        return mensaje
    nuevo.nacimiento = nacimiento
    if "No definido o.O" in genero:
        mensaje = u"Especifique el genero del actor"
        return mensaje
    nuevo.genero = genero
    nuevo.save()
    return True
def crearActor(nombre, codigo, semestre, area):
    """
    Creates a course record. Ideally all of the information would be
    validated, e.g.:
    - Semestre can only be 1 or 2
    - Codes could be required to follow a predefined format
    - Etc.
    """
    nuevo = Actor()
    nuevo.nombre = nombre
    # Validations for the code could go here
    nuevo.codigo = codigo
    nuevo.semestre = semestre
    nuevo.area = area
    nuevo.save()
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters

    # Score tracker and learning parameters
    self.best_w = None
    self.best_score = -np.inf
    self.score = -np.inf
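# OUNoise appears throughout these agents but is never defined. The sketch
# below is a minimal Ornstein-Uhlenbeck process matching the
# (size, mu, theta, sigma) form used directly above; other examples construct
# it with different signatures (e.g. an extra seed argument), so treat this as
# an assumption rather than the original class.
import copy
import numpy as np

class OUNoise:
    """Temporally correlated exploration noise: dx = theta*(mu - x) + sigma*N(0, 1)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state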
def crearActor(nombre, nacimiento, genero, imagen): """ Método que crea un actor. Valida la información recibida. @param nombre del actor @param fecha de nacimiento del actor @genero masculino o femenino @imagen dirección de la imagen que contiene al actor """ nuevo = Actor() if len(nombre.strip()) is 0: mensaje = u"Ingrese nombre del actor" return mensaje if nombre.strip().replace(" ", "").isalpha() is False: mensaje = u"Nombre del actor no valido" return mensaje nombre = nombre.strip() nuevo.nombre = nombre if "Mes" in nacimiento: mensaje = u"Ingrese mes de cumpleaños." return mensaje nuevo.nacimiento = nacimiento if "No definido o.O" in genero: mensaje = u"Especifique el genero del actor" return mensaje nuevo.genero = genero nuevo.imagen = imagen nuevo.save() # Procedemos a guardar la imagen en su directorio correspondiente id_actor = nuevo.id_actor[0] nuevaImagen = "imgActor/{}".format(id_actor) almacenarImagen(imagen, nuevaImagen) return True
def ppo(): # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") env = UnityEnvironment(file_name="../Reacher_Linux/Reacher.x86_64", no_graphics=True) # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # reset the environment env_info = env.reset(train_mode=True)[brain_name] # number of agents in the environment print('Number of agents:', len(env_info.agents)) # number of actions action_size = brain.vector_action_space_size print('Number of actions:', action_size) # examine the state space state = env_info.vector_observations[0] print('States look like:', state) state_size = len(state) print('States have length:', state_size) config = Config() config.env = env config.actor_critic_fn = lambda: ActorCritic( actor=Actor(state_size, action_size), critic=Critic(state_size)) config.discount = 0.99 config.use_gae = True config.gae_tau = 0.95 config.gradient_clip = 5 config.rollout_length = 2048 config.optimization_epochs = 5 config.num_mini_batches = 512 config.ppo_ratio_clip = 0.2 config.log_interval = 10 * 2048 config.max_steps = 2e7 config.eval_episodes = 10 # config.logger = get_logger() print("GPU available: {}".format(torch.cuda.is_available())) print("GPU tensor test: {}".format(torch.rand(3, 3).cuda())) agent = PPOAgent(config) random_seed() config = agent.config t0 = time.time() scores = [] scores_window = deque(maxlen=100) # last 100 scores while True: if config.log_interval and not agent.total_steps % config.log_interval and len( agent.episode_rewards): rewards = agent.episode_rewards for reward in rewards: scores.append(reward) scores_window.append(reward) agent.episode_rewards = [] print('\r===> Average Score: {:d} episodes {:.2f}'.format( len(scores), np.mean(scores_window))) if np.mean(scores_window) >= 1.0: print( '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}' .format(len(scores_window), np.mean(scores_window))) torch.save(agent.actor_critic.state_dict(), '../checkpoints/ppo_checkpoint.pth') break print( 'Total steps %d, returns %d/%.2f/%.2f/%.2f/%.2f (count/mean/median/min/max), %.2f steps/s' % (agent.total_steps, len(rewards), np.mean(rewards), np.median(rewards), np.min(rewards), np.max(rewards), config.log_interval / (time.time() - t0))) t0 = time.time() agent.step() return scores
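# The PPO configuration above enables generalized advantage estimation
# (use_gae=True, gae_tau=0.95), but the computation itself happens inside
# PPOAgent, which is not shown. For reference, a stand-alone sketch of GAE over
# one rollout; compute_gae and its argument names are illustrative, not the
# agent's actual internals.
import numpy as np

def compute_gae(rewards, values, dones, last_value, discount=0.99, gae_tau=0.95):
    """Return GAE advantages and bootstrapped returns for a single rollout."""
    T = len(rewards)
    advantages = np.zeros(T, dtype=np.float32)
    gae = 0.0
    for t in reversed(range(T)):
        next_value = last_value if t == T - 1 else values[t + 1]
        not_done = 1.0 - float(dones[t])
        delta = rewards[t] + discount * next_value * not_done - values[t]
        gae = delta + discount * gae_tau * not_done * gae
        advantages[t] = gae
    returns = advantages + np.asarray(values, dtype=np.float32)
    return advantages, returns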
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, ts): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward # for states, actions, rewards, state_next, complete in zip(state, action, reward, next_state, done): # self.memory.add(states, actions, rewards, state_next, complete) self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and ts % 20 == 0: for _ in range(10): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" # state = torch.from_numpy(state).float().unsqueeze(0).to(device) state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
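# The Agent above uses ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
# with add(), sample() and len(), but the class itself is not included in these
# snippets. A minimal compatible sketch; field names and dtype choices are
# assumptions, not the original implementation.
import random
from collections import deque, namedtuple
import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append a single transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw a random minibatch and return it as batched torch tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)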
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.epsilon = 1.0 # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=1e-3) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=1e-3, weight_decay=0) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, int(1e6), 256, random_seed) # Make sure target is with the same weight as the source self.hard_copy(self.actor_target, self.actor_local) self.hard_copy(self.critic_target, self.critic_local) def step(self, states, actions, rewards, next_states, dones, timestep): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > 256 and timestep % 20 == 0: for _ in range(10): experiences = self.memory.sample() self.learn(experiences, 0.99) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.epsilon * self.noise.sample() return action def reset(self): self.noise.reset() def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.soft_update(self.critic_local, self.critic_target, 1e-3) self.soft_update(self.actor_local, self.actor_target, 1e-3) self.epsilon -= 1e-6 self.noise.reset() def soft_update(self, local_model, target_model, tau): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def hard_copy(self, target, source): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data)
class Agent(): def __init__(self, state_size, action_size, random_seed): self.state_size = state_size self.action_size = action_size # Construct Actor networks self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Construct Critic networks self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) def step(self, memory): if len(memory) > BATCH_SIZE: experiences = memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): def __init__(self, device, state_size, n_agents, action_size, random_seed, buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay, checkpoint_folder='./'): self.DEVICE = device self.state_size = state_size self.n_agents = n_agents self.action_size = action_size self.seed = random.seed(random_seed) # Hyperparameters self.BUFFER_SIZE = buffer_size self.BATCH_SIZE = batch_size self.GAMMA = gamma self.TAU = TAU self.LR_ACTOR = lr_actor self.LR_CRITIC = lr_critic self.WEIGHT_DECAY = weight_decay self.CHECKPOINT_FOLDER = checkpoint_folder # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(self.DEVICE) self.actor_target = Actor(state_size, action_size, random_seed).to(self.DEVICE) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(self.DEVICE) self.critic_target = Critic(state_size, action_size, random_seed).to(self.DEVICE) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.LR_CRITIC, weight_decay=self.WEIGHT_DECAY) ''' if os.path.isfile(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth') and os.path.isfile(self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth'): self.actor_local.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth')) self.actor_target.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth')) self.critic_local.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth')) self.critic_target.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth')) ''' # Noise process self.noise = OUNoise((n_agents, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(device, action_size, self.BUFFER_SIZE, self.BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for i in range(self.n_agents): self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i]) # Learn, if enough samples are available in memory if len(self.memory) > self.BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(self.DEVICE) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.GAMMA * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target) self.soft_update(self.actor_local, self.actor_target) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ tau = self.TAU for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def checkpoint(self): torch.save(self.actor_local.state_dict(), self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth') torch.save(self.critic_local.state_dict(), self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth')
class CAC(object): def __init__(self, a_dim, s_dim, variant, action_prior='uniform', max_global_steps=100000): """ a_dim : dimension of action space s_dim: state space dimension variant: dictionary containing parameters for the algorithms """ ############################### Model parameters #################################### set_seed(variant['seed']) self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.actor = Actor(input_dim=s_dim, output_dim=a_dim, n_layers=3, layer_sizes=[256, 256, 256], hidden_activation="leakyrelu").to(self.device) self.actor_target = Actor(input_dim=s_dim, output_dim=a_dim, n_layers=3, layer_sizes=[256, 256, 256], hidden_activation="leakyrelu").to( self.device).eval() self.critic = LyapunovCritic(state_dim=s_dim, action_dim=a_dim, output_dim=None, n_layers=2, layer_sizes=[256, 256], hidden_activation="leakyrelu").to( self.device) self.critic_target = LyapunovCritic(state_dim=s_dim, action_dim=a_dim, output_dim=None, n_layers=2, layer_sizes=[256, 256], hidden_activation="leakyrelu").to( self.device).eval() # copy parameters of the learning network to the target network hard_update(self.critic_target, self.critic) hard_update(self.actor_target, self.actor) # disable gradient calculations of the target network stop_grad(self.critic_target) stop_grad(self.actor_target) # self.memory_capacity = variant['memory_capacity'] ################################ parameters for training ############################### self.batch_size = variant[ 'batch_size'] # batch size for learning the actor self.gamma = variant['gamma'] # discount factor self.tau = variant['tau'] # smoothing parameter for the weight updates self.approx_value = True if 'approx_value' not in variant.keys( ) else variant['approx_value'] self._action_prior = action_prior # prior over action space s_dim = s_dim * (variant['history_horizon'] + 1) self.a_dim, self.s_dim, = a_dim, s_dim self.history_horizon = variant[ 'history_horizon'] # horizon to consider for the history self.working_memory = deque(maxlen=variant['history_horizon'] + 1) # memory to store history target_entropy = variant['target_entropy'] if target_entropy is None: self.target_entropy = -self.a_dim #lower bound of the policy entropy else: self.target_entropy = target_entropy self.target_variance = 0.0 self.finite_horizon = variant['finite_horizon'] self.soft_predict_horizon = variant['soft_predict_horizon'] self.use_lyapunov = variant['use_lyapunov'] self.adaptive_alpha = variant['adaptive_alpha'] self.adaptive_beta = variant[ 'adaptive_beta'] if 'adaptive_beta' in variant.keys() else False self.time_near = variant['Time_near'] self.max_global_steps = max_global_steps self.LR_A = variant['lr_a'] self.LR_L = variant['lr_l'] self.LR_lag = self.LR_A / 10 self.alpha3 = variant['alpha3'] labda = variant['labda'] # formula (12) in the paper alpha = variant['alpha'] # entropy temperature (beta in the paper) beta = variant['beta'] # constraint error weight self.log_labda = torch.log(torch.tensor([labda], device=self.device)) self.log_alpha = torch.log(torch.tensor( [alpha], device=self.device)) # Entropy Temperature self.log_beta = torch.log(torch.tensor([beta], device=self.device)) self.log_alpha.requires_grad = True self.log_beta.requires_grad = True self.log_labda.requires_grad = True # The update is in log space self.labda = torch.clamp(torch.exp(self.log_labda), min=SCALE_lambda_MIN_MAX[0], max=SCALE_lambda_MIN_MAX[1]) self.alpha = torch.exp(self.log_alpha) self.beta = torch.clamp(torch.exp(self.log_beta), 
min=SCALE_beta_MIN_MAX[0], max=SCALE_beta_MIN_MAX[1]) self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=self.LR_A) self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=self.LR_L) self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=self.LR_A) self.labda_optim = torch.optim.Adam([self.log_labda], lr=self.LR_lag) self.beta_optim = torch.optim.Adam([self.log_beta], lr=0.01) # step_fn = lambda i : 1.0 - (i - 1.)/self.max_global_steps # self.actor_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.actor_optim, lr_lambda = step_fn) # self.critic_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.critic_optim, lr_lambda = step_fn) # self.alpha_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.alpha_optim, lr_lambda = step_fn) # self.labda_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.labda_optim, lr_lambda = step_fn) # self.beta_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.beta_optim, lr_lambda = step_fn) self.actor.float() self.critic.float() def act(self, s, evaluation=False): a, deterministic_a, _, _ = self.actor(s) if evaluation is True: return deterministic_a else: return a def learn(self, batch): bs = torch.tensor(batch['s'], dtype=torch.float).to(self.device) # state ba = torch.tensor(batch['a'], dtype=torch.float).to(self.device) # action br = torch.tensor(batch['r'], dtype=torch.float).to(self.device) # reward bterminal = torch.tensor(batch['terminal'], dtype=torch.float).to(self.device) bs_ = torch.tensor(batch['s_'], dtype=torch.float).to(self.device) # next state b_s = torch.tensor(batch['_s'], dtype=torch.float).to(self.device) # prev state bv = None b_r_ = None # print(bs) alpha_loss = None beta_loss = None # # beta learning # self.beta_optim.zero_grad() # beta_loss = self.get_beta_loss(b_s) # if self.adaptive_beta: # beta_loss.backward(retain_graph = False) # self.beta_optim.step() # else: # self.beta_optim.zero_grad() # lyapunov learning start_grad(self.critic) if self.finite_horizon: bv = torch.tensor(batch['value']) b_r_ = torch.tensor(batch['r_N_']) self.critic_optim.zero_grad() critic_loss = self.get_lyapunov_loss(bs, bs_, ba, br, b_r_, bv, bterminal) critic_loss.backward() self.critic_optim.step() # actor lerning stop_grad(self.critic) self.actor_optim.zero_grad() actor_loss = self.get_actor_loss(bs, bs_, ba, br) actor_loss.backward(retain_graph=False) self.actor_optim.step() # alpha learning if self.adaptive_alpha: self.alpha_optim.zero_grad() alpha_loss = self.get_alpha_loss(bs, self.target_entropy) alpha_loss.backward(retain_graph=False) self.alpha_optim.step() self.alpha = torch.exp(self.log_alpha) # labda learning self.labda_optim.zero_grad() labda_loss = self.get_labda_loss(br, bs, bs_, ba) # print("labda loss = ", labda_loss) labda_loss.backward(retain_graph=False) self.labda_optim.step() self.labda = torch.clamp(torch.exp(self.log_labda), min=SCALE_lambda_MIN_MAX[0], max=SCALE_lambda_MIN_MAX[1]) # update target networks soft_update(self.critic_target, self.critic, self.tau) soft_update(self.actor_target, self.actor, self.tau) return alpha_loss, beta_loss, labda_loss, actor_loss, critic_loss def get_alpha_loss(self, s, target_entropy): # with torch.no_grad(): # _, self.deterministic_a,self.log_pis, _ = self.actor_target(s) intermediate = (self.log_pis + target_entropy).detach() # self.a, self.deterministic_a, self.log_pis, _ = self.actor(s) # print(self.a) return -torch.mean(self.log_alpha * intermediate) def get_labda_loss(self, r, s, s_, a): # with torch.no_grad(): # l = self.critic(s, 
a) # lya_a_, _, _, _ = self.actor_target(s_) # self.l_ = self.critic_target(s_, lya_a_) l = self.l.detach() lyapunov_loss = torch.mean(self.l_ - l + self.alpha3 * r) return -torch.mean(self.log_labda * lyapunov_loss) def get_beta_loss(self, _s): with torch.no_grad(): _, _deterministic_a, _, _ = self.actor_target(_s) self.l_action = torch.mean( torch.norm(_deterministic_a.detach() - self.deterministic_a, dim=1)) with torch.no_grad(): intermediate = (self.l_action - 0.02).detach() return -torch.mean(self.log_beta * intermediate) def get_actor_loss(self, s, s_, a, r): if self._action_prior == 'normal': policy_prior = torch.distributions.MultivariateNormal( loc=torch.zeros(self.a_dim), covariance_matrix=torch.diag(torch.ones(self.a_dim))) policy_prior_log_probs = policy_prior.log_prob(self.a) elif self._action_prior == 'uniform': policy_prior_log_probs = 0.0 # only actor weights are updated! _, self.deterministic_a, self.log_pis, _ = self.actor(s) # self.l = self.critic(s, a) with torch.no_grad(): # self.l = self.critic(s, a) lya_a_, _, _, _ = self.actor(s_) self.l_ = self.critic(s_, lya_a_) l = self.l.detach() self.lyapunov_loss = torch.mean(self.l_ - l + self.alpha3 * r) labda = self.labda.detach() alpha = self.alpha.detach() a_loss = labda * self.lyapunov_loss + alpha * torch.mean( self.log_pis) - policy_prior_log_probs return a_loss def get_lyapunov_loss(self, s, s_, a, r, r_n_=None, v=None, terminal=0.): with torch.no_grad(): a_, _, _, _ = self.actor_target(s_) l_ = self.critic_target(s_, a_) self.l = self.critic(s, a) if self.approx_value: if self.finite_horizon: if self.soft_predict_horizon: l_target = r - r_n_ + l_ else: l_target = v else: l_target = r + self.gamma * ( 1 - terminal ) * l_ # Lyapunov critic - self.alpha * next_log_pis else: l_target = r mse_loss = nn.MSELoss() l_loss = mse_loss(self.l, l_target) return l_loss def save_result(self, path): if not os.path.exists(path + "/policy/"): os.mkdir(path + "/policy/") self.actor_target.save(path + "/policy/actor_target.pth") self.critic_target.save(path + "/policy/critic_target.pth") self.actor.save(path + "/policy/actor.pth") self.critic.save(path + "/policy/critic.pth") print("Save to path: ", path + "/policy/") def restore(self, path): result_path = path if not os.path.exists(result_path): raise IOError("Results path ", result_path, " does not contain anything to load") self.actor_target.load(result_path + "/actor_target.pth") self.critic_target.load(result_path + "/critic_target.pth") self.actor.load(result_path + "/actor.pth") self.critic.load(result_path + "/critic.pth") success_load = True print("Load successful, model file from ", result_path) print("#########################################################") return success_load def scheduler_step(self): self.alpha_scheduler.step() self.beta_scheduler.step() self.labda_scheduler.step() self.actor_scheduler.step() self.critic_scheduler.step()
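# The CAC class above leans on a few utilities (set_seed, stop_grad, start_grad,
# plus the hard_update/soft_update helpers sketched earlier) that are not part
# of the snippet. Plausible minimal versions consistent with how they are
# called; the real project may differ.
import random
import numpy as np
import torch

def set_seed(seed):
    """Seed the Python, NumPy and torch RNGs for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

def stop_grad(network):
    """Disable gradient tracking for every parameter of `network`."""
    for param in network.parameters():
        param.requires_grad = False

def start_grad(network):
    """Re-enable gradient tracking for every parameter of `network`."""
    for param in network.parameters():
        param.requires_grad = True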
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
if done: # done and print information record(self.g_ep, self.g_ep_r, ep_r, self.res_queue, self.name) break state = next_state total_step += 1 if __name__ == '__main__': # multiple copies of both actor and critic (one pair per worker) # updates sent to global model gnet = {'actor': Actor(state_size, action_size, random_seed).to(device), \ 'critic': Critic(state_size, action_size, random_seed).to(device) } opt = {} # stores both shared optimizers for critic and actor networks LR_ACTOR = 1e-4 LR_CRITIC = 1e-3 print('Networks present are: ') for key, value in gnet.items( ): # Alternatively if gnet is a class, use gnet.__dict__ if isinstance(value, nn.Module): value.share_memory() print('Sharing in memory {}: '.format(key)) if key == 'actor' or key == 'critic': opt[key + '_optimizer'] = SharedAdam( value.parameters(),
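# SharedAdam, used above for the global actor and critic, is an Adam variant
# whose optimizer state lives in shared memory so that multiple worker
# processes can update the same global networks. A minimal sketch of the usual
# pattern from common A3C implementations; newer PyTorch releases manage
# optimizer state slightly differently, so treat this as an assumption.
import torch

class SharedAdam(torch.optim.Adam):
    """Adam whose per-parameter state is placed in shared memory for multiprocessing."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        super().__init__(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        # Pre-build the optimizer state and move it into shared memory so that
        # every worker process updates the same exp_avg / exp_avg_sq buffers.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.tensor(0.0)
                state['exp_avg'] = torch.zeros_like(p.data).share_memory_()
                state['exp_avg_sq'] = torch.zeros_like(p.data).share_memory_()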
class DDPG(): def __init__(self, env, action_dim, state_dim, device, critic_lr=3e-4, actor_lr=3e-4, gamma=0.99, batch_size=100, validate_steps=100, max_episode_length=150): """ param: env: An gym environment param: action_dim: Size of action space param: state_dim: Size of state space param: critic_lr: Learning rate of the critic param: actor_lr: Learning rate of the actor param: gamma: The discount factor param: batch_size: The batch size for training param: device: The device used for training param: validate_steps: Number of iterations after which we evaluate trained policy """ self.gamma = gamma self.batch_size = batch_size self.env = env self.device = device self.eval_env = deepcopy(env) self.validate_steps = validate_steps self.max_episode_length = max_episode_length # actor and actor_target where both networks have the same initial weights self.actor = Actor(state_dim=state_dim, action_dim=action_dim).to(self.device) self.actor_target = deepcopy(self.actor) # critic and critic_target where both networks have the same initial weights self.critic = Critic(state_dim=state_dim, action_dim=action_dim).to(self.device) self.critic_target = deepcopy(self.critic) # Optimizer for the actor and critic self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=actor_lr) self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=critic_lr) # Replay buffer self.ReplayBuffer = ReplayBuffer(buffer_size=10000, init_length=1000, state_dim=state_dim, \ action_dim=action_dim, env=env, device = device) def update_target_networks(self): """ A function to update the target networks """ weighSync(self.actor_target, self.actor) weighSync(self.critic_target, self.critic) def update_network(self, batch): """ A function to update the function just once """ # Sample and parse batch state, action, reward, state_next, done = self.ReplayBuffer.batch_sample( batch) # Predicting the next action and q_value action_next = self.actor_target(state_next) q_next = self.critic_target(state_next, action_next) target_q = reward + (self.gamma * done * q_next) q = self.critic(state, action) # Critic update self.critic.zero_grad() value_loss = F.mse_loss(q, target_q) value_loss.backward() self.optimizer_critic.step() # Actor update self.actor.zero_grad() policy_loss = -self.critic(state, self.actor(state)).mean() policy_loss.backward() self.optimizer_actor.step() # Target update self.update_target_networks() return value_loss.item(), policy_loss.item() def select_action(self, state, isEval): state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) action = self.actor(state).squeeze(0).detach() if isEval: return action.cpu().numpy() action += torch.normal(0, 0.1, size=action.shape).to(self.device) action = torch.clamp(action, -1., 1.).cpu().numpy() return action def train(self, num_steps): """ Train the policy for the given number of iterations :param num_steps:The number of steps to train the policy for """ value_losses, policy_losses, validation_reward, validation_steps = [],[],[],[] step, episode, episode_steps, episode_reward, state = 0, 0, 0, 0., None while step < num_steps: # reset if it is the start of episode if state is None: state = deepcopy(self.env.reset()) action = self.select_action(state, False) # env response with next_state, reward, terminate_info state_next, reward, done, _ = self.env.step(action) state_next = deepcopy(state_next) if self.max_episode_length and episode_steps >= self.max_episode_length - 1: done = True # observe and store in replay buffer self.ReplayBuffer.buffer_add( 
Exp(state=state, action=action, reward=reward, state_next=state_next, done=done)) # update policy based on sampled batch batch = self.ReplayBuffer.buffer_sample(self.batch_size) value_loss, policy_loss = self.update_network(batch) value_losses.append(value_loss) policy_losses.append(policy_loss) # evaluate if step % self.validate_steps == 0: validate_reward, steps = self.evaluate() validation_reward.append(validate_reward) validation_steps.append(steps) print( "[Eval {:06d}/{:06d}] Steps: {:06d}, Episode Reward:{:04f}" .format(step, int(num_steps), steps, validate_reward)) # update step += 1 episode_steps += 1 episode_reward += reward state = deepcopy(state_next) if done: # reset at the end of episode #print("[Train {:06d}/{:06d}] - Episode Reward:{:04f} ".format(step, num_steps, step, episode_reward)) episode_steps, episode_reward, state = 0, 0., None episode += 1 return value_losses, policy_losses, validation_reward, validation_steps def evaluate(self): """ Evaluate the policy trained so far in an evaluation environment """ state, done, total_reward, steps = self.eval_env.reset(), False, 0., 0 while not done: action = self.select_action(state, True) state_next, reward, done, _ = self.eval_env.step(action) total_reward += reward steps += 1 state = state_next return total_reward / steps, steps
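# weighSync, called from update_target_networks in the DDPG class above, is
# assumed to be a Polyak (soft) update of the target network toward the learned
# network. A minimal version consistent with the call weighSync(target, model);
# the original helper may use a different default tau.
def weighSync(target_model, model, tau=0.001):
    """target <- tau * model + (1 - tau) * target, parameter by parameter."""
    for target_param, param in zip(target_model.parameters(), model.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)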
def __init__(self, state_size, action_size, CER=False, num_agents=1, idx=0, random_seed=23, fc1_units=96, fc2_units=96, epsilon=1.0, lr_actor=1e-3, lr_critic=1e-3, weight_decay=0): self.state_size = state_size self.action_size = action_size self.CER = CER self.EXPmemory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) self.CERmem = ReplayBuffer(action_size, CER_SIZE, BATCH_SIZE, random_seed) self.random_seed = random_seed self.fc1_units = fc1_units self.fc2_units = fc2_units self.state_size = state_size self.action_size = action_size if (torch.cuda.is_available()): self.idx = torch.cuda.LongTensor([idx]) else: self.idx = torch.LongTensor([idx]) self.num_agents = num_agents self.epsilon = epsilon self.lr_actor = lr_actor self.lr_critic = lr_critic self.weight_decay = weight_decay self.noise = OUNoise(action_size, random_seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 random.seed(random_seed) #### The actor only sees its own state self.actor_local = Actor(self.state_size, self.action_size, self.random_seed, fc1_units=self.fc1_units, fc2_units=self.fc2_units).to(device) self.actor_target = Actor(self.state_size, self.action_size, self.random_seed, fc1_units=self.fc1_units, fc2_units=self.fc2_units).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(num_agents * state_size, num_agents * action_size, random_seed).to(device) self.critic_target = Critic(num_agents * state_size, num_agents * action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay) # Initialize target and local being the same self.hard_copy(self.actor_target, self.actor_local) self.hard_copy(self.critic_target, self.critic_local)
class DDPG(): def __init__(self, state_size, action_size, CER=False, num_agents=1, idx=0, random_seed=23, fc1_units=96, fc2_units=96, epsilon=1.0, lr_actor=1e-3, lr_critic=1e-3, weight_decay=0): self.state_size = state_size self.action_size = action_size self.CER = CER self.EXPmemory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) self.CERmem = ReplayBuffer(action_size, CER_SIZE, BATCH_SIZE, random_seed) self.random_seed = random_seed self.fc1_units = fc1_units self.fc2_units = fc2_units self.state_size = state_size self.action_size = action_size if (torch.cuda.is_available()): self.idx = torch.cuda.LongTensor([idx]) else: self.idx = torch.LongTensor([idx]) self.num_agents = num_agents self.epsilon = epsilon self.lr_actor = lr_actor self.lr_critic = lr_critic self.weight_decay = weight_decay self.noise = OUNoise(action_size, random_seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 random.seed(random_seed) #### The actor only sees its own state self.actor_local = Actor(self.state_size, self.action_size, self.random_seed, fc1_units=self.fc1_units, fc2_units=self.fc2_units).to(device) self.actor_target = Actor(self.state_size, self.action_size, self.random_seed, fc1_units=self.fc1_units, fc2_units=self.fc2_units).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(num_agents * state_size, num_agents * action_size, random_seed).to(device) self.critic_target = Critic(num_agents * state_size, num_agents * action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay) # Initialize target and local being the same self.hard_copy(self.actor_target, self.actor_local) self.hard_copy(self.critic_target, self.critic_local) def step(self, states, actions, rewards, next_states, dones): # Save experience in replay memory for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.EXPmemory.add(state, action, reward, next_state, done) if (self.CER): self.CERmem.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. 
self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.EXPmemory) > BATCH_SIZE: for _ in range(NUM_UPDATES): experiences = self.EXPmemory.sample() self.learn(experiences, GAMMA) if (self.CER): for _ in range(5): experiences = self.CERmem.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): if (not torch.is_tensor(state)): state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = (self.actor_local(state).cpu().data.numpy()) self.actor_local.train() if add_noise: action += self.noise.sample() * self.epsilon return np.clip(action, -1, 1) def reset(self): self.noise.reset() #DDPG has to learn from experience and would be actions/next-actions from other agents def learn(self, experiences, wouldbe_actions, wouldbe_next_actions, gamma, tau=1e-3, epsilon_decay=1e-6): states, actions, rewards, next_states, dones = experiences # Get predicted next-state actions and Q values from target models next_actions = torch.cat(wouldbe_next_actions, dim=1).to(device) with torch.no_grad(): Q_targets_next = self.critic_target(next_states, next_actions) # Compute Q targets for current states (y_i) Q_targets = rewards.index_select( 1, self.idx) + (gamma * Q_targets_next * (1 - dones.index_select(1, self.idx))) Q_expected = self.critic_local(states, actions) # Critic update critic_loss = F.mse_loss(Q_expected, Q_targets.detach()) self.critic_optimizer.zero_grad() critic_loss.backward() #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # Actor update, actions from other agents must be detached from optimization of local neywork # wouldbe_action is already calculated via local actor from above layer actions_pred = [ a if i == self.idx else a.detach() for i, a in enumerate(wouldbe_actions) ] actions_pred = torch.cat(actions_pred, dim=1).to(device) actor_loss = -self.critic_local(states, actions_pred).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Targets update self.soft_update(self.critic_local, self.critic_target, tau) self.soft_update(self.actor_local, self.actor_target, tau) # Noise update self.epsilon -= epsilon_decay self.noise.reset() def soft_update(self, local_model, target_model, tau): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def hard_copy(self, target, source): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data) def save(self): torch.save(self.actor_local.state_dict(), '_actor' + str(self.idx.item()) + '.pth') torch.save(self.critic_local.state_dict(), '_critic' + str(self.idx.item()) + '.pth') def load(self): self.critic_local.load_state_dict( torch.load('_critic' + str(self.idx.item()) + '.pth')) self.actor_local.load_state_dict( torch.load('_actor' + str(self.idx.item()) + '.pth')) self.critic_target.load_state_dict( torch.load('_critic' + str(self.idx.item()) + '.pth')) self.actor_target.load_state_dict( torch.load('_actor' + str(self.idx.item()) + '.pth'))
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, config, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.config = config self.device = config['device'] self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.noise_epsilon = config['NOISE_EPSILON'] # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(self.device) self.actor_target = Actor(state_size, action_size, random_seed).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=config['LR_ACTOR']) self.hard_update(self.actor_local, self.actor_target) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(self.device) self.critic_target = Critic(state_size, action_size, random_seed).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=config['LR_CRITIC'], weight_decay=config['WEIGHT_DECAY']) self.hard_update(self.critic_local, self.critic_target) # Noise process self.noise = OUNoise((1, action_size), random_seed, 0.0, config['OU_THETA'], config['OU_SIGMA']) self.noise_epsilon = config['NOISE_EPSILON'] # Replay memory self.memory = ReplayBuffer(action_size, self.config, random_seed) def step(self, t, state, action, reward, next_state, done, agent_index): """Save experience in replay memory, and use random sample from buffer to learn.""" self.memory.add(state, action, reward, next_state, done) if t % self.config['DDPG_UPDATE_EVERY'] == 0 and len( self.memory) > self.config['BATCH_SIZE']: for _ in range(self.config['DDPG_LEARN_TIMES']): experiences = self.memory.sample() self.learn(experiences, agent_index) def act(self, states): states = torch.from_numpy(states).float().to(self.device) actions = np.zeros((1, self.action_size)) self.actor_local.eval() with torch.no_grad(): for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() actions += self.noise_epsilon * self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, agent_index): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ gamma = self.config['GAMMA'] states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) if agent_index == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) else: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) if agent_index == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) else: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target) self.soft_update(self.actor_local, self.actor_target) # ---------------------------- update noise ---------------------------- # self.noise_epsilon = max( self.noise_epsilon - self.config['NOISE_EPSILON_DECAY'], self.config['NOISE_EPSILON_MIN']) self.noise.reset() def hard_update(self, local_model, target_model): """Hard update model parameters. θ_target = θ_local Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(local_param.data) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) """ tau = self.config['TAU'] for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
def __init__(self, a_dim, s_dim, variant, action_prior='uniform', max_global_steps=100000): """ a_dim : dimension of action space s_dim: state space dimension variant: dictionary containing parameters for the algorithms """ ############################### Model parameters #################################### set_seed(variant['seed']) self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.actor = Actor(input_dim=s_dim, output_dim=a_dim, n_layers=3, layer_sizes=[256, 256, 256], hidden_activation="leakyrelu").to(self.device) self.actor_target = Actor(input_dim=s_dim, output_dim=a_dim, n_layers=3, layer_sizes=[256, 256, 256], hidden_activation="leakyrelu").to( self.device).eval() self.critic = LyapunovCritic(state_dim=s_dim, action_dim=a_dim, output_dim=None, n_layers=2, layer_sizes=[256, 256], hidden_activation="leakyrelu").to( self.device) self.critic_target = LyapunovCritic(state_dim=s_dim, action_dim=a_dim, output_dim=None, n_layers=2, layer_sizes=[256, 256], hidden_activation="leakyrelu").to( self.device).eval() # copy parameters of the learning network to the target network hard_update(self.critic_target, self.critic) hard_update(self.actor_target, self.actor) # disable gradient calculations of the target network stop_grad(self.critic_target) stop_grad(self.actor_target) # self.memory_capacity = variant['memory_capacity'] ################################ parameters for training ############################### self.batch_size = variant[ 'batch_size'] # batch size for learning the actor self.gamma = variant['gamma'] # discount factor self.tau = variant['tau'] # smoothing parameter for the weight updates self.approx_value = True if 'approx_value' not in variant.keys( ) else variant['approx_value'] self._action_prior = action_prior # prior over action space s_dim = s_dim * (variant['history_horizon'] + 1) self.a_dim, self.s_dim, = a_dim, s_dim self.history_horizon = variant[ 'history_horizon'] # horizon to consider for the history self.working_memory = deque(maxlen=variant['history_horizon'] + 1) # memory to store history target_entropy = variant['target_entropy'] if target_entropy is None: self.target_entropy = -self.a_dim #lower bound of the policy entropy else: self.target_entropy = target_entropy self.target_variance = 0.0 self.finite_horizon = variant['finite_horizon'] self.soft_predict_horizon = variant['soft_predict_horizon'] self.use_lyapunov = variant['use_lyapunov'] self.adaptive_alpha = variant['adaptive_alpha'] self.adaptive_beta = variant[ 'adaptive_beta'] if 'adaptive_beta' in variant.keys() else False self.time_near = variant['Time_near'] self.max_global_steps = max_global_steps self.LR_A = variant['lr_a'] self.LR_L = variant['lr_l'] self.LR_lag = self.LR_A / 10 self.alpha3 = variant['alpha3'] labda = variant['labda'] # formula (12) in the paper alpha = variant['alpha'] # entropy temperature (beta in the paper) beta = variant['beta'] # constraint error weight self.log_labda = torch.log(torch.tensor([labda], device=self.device)) self.log_alpha = torch.log(torch.tensor( [alpha], device=self.device)) # Entropy Temperature self.log_beta = torch.log(torch.tensor([beta], device=self.device)) self.log_alpha.requires_grad = True self.log_beta.requires_grad = True self.log_labda.requires_grad = True # The update is in log space self.labda = torch.clamp(torch.exp(self.log_labda), min=SCALE_lambda_MIN_MAX[0], max=SCALE_lambda_MIN_MAX[1]) self.alpha = torch.exp(self.log_alpha) self.beta = torch.clamp(torch.exp(self.log_beta), min=SCALE_beta_MIN_MAX[0], 
max=SCALE_beta_MIN_MAX[1]) self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=self.LR_A) self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=self.LR_L) self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=self.LR_A) self.labda_optim = torch.optim.Adam([self.log_labda], lr=self.LR_lag) self.beta_optim = torch.optim.Adam([self.log_beta], lr=0.01) # step_fn = lambda i : 1.0 - (i - 1.)/self.max_global_steps # self.actor_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.actor_optim, lr_lambda = step_fn) # self.critic_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.critic_optim, lr_lambda = step_fn) # self.alpha_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.alpha_optim, lr_lambda = step_fn) # self.labda_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.labda_optim, lr_lambda = step_fn) # self.beta_scheduler = torch.optim.lr_scheduler.MultiplicativeLR(self.beta_optim, lr_lambda = step_fn) self.actor.float() self.critic.float()
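# --- Hedged sketch (not from the original file) --------------------------------
# Illustrates why the Lagrange multipliers above are kept in log space:
# optimizing log_labda with Adam and recovering labda = clamp(exp(log_labda), ...)
# keeps the multiplier positive without an explicit projection. The surrogate
# "dual" loss below is a stand-in for illustration, not the agent's real
# objective, and the constraint value is a made-up batch statistic.
import torch

log_labda = torch.zeros(1, requires_grad=True)            # labda starts at exp(0) = 1
labda_optim = torch.optim.Adam([log_labda], lr=3e-4)

constraint_violation = torch.tensor(0.05)                  # assumed averaged violation
dual_loss = -(log_labda * constraint_violation).mean()     # ascend on the violation
labda_optim.zero_grad()
dual_loss.backward()
labda_optim.step()

labda = torch.clamp(log_labda.exp(), min=1e-6, max=1.0)    # always positive after exp()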
def actor(): return Actor.all()
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size * num_agents, action_size * num_agents, random_seed).to(device) self.critic_target = Critic(state_size * num_agents, action_size * num_agents, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) self.soft_update(self.critic_local, self.critic_target, 1) self.soft_update(self.actor_local, self.actor_target, 1) # Noise process self.noise = OUNoise(action_size, random_seed) self.noise_reduction_ratio = NOISE_START self.step_count = 0 def act(self, state, i_episode, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: if i_episode > EPISODES_BEFORE_TRAINING and self.noise_reduction_ratio > NOISE_END: self.noise_reduction_ratio = NOISE_REDUCTION_RATE**( i_episode - EPISODES_BEFORE_TRAINING) # noise_reduction_ratio = 1 action += self.noise_reduction_ratio * self.add_noise2() # action += noise_reduction_ratio * self.noise.sample() return np.clip(action, -1, 1) def add_noise2(self): # noise = 0.5*np.random.randn(1,self.action_size) #sigma of 0.5 as sigma of 1 will have alot of actions just clipped noise = 0.5 * np.random.standard_normal(self.action_size) return noise def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ full_states, actions, actor_local_actions, actor_target_actions, agent_state, agent_action, agent_reward, agent_done, next_states, next_full_states = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models # actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_full_states, actor_target_actions) # Compute Q targets for current states (y_i) Q_targets = agent_reward + (gamma * Q_targets_next * (1 - agent_done)) # Compute critic loss Q_expected = self.critic_local(full_states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss # actions_pred = self.actor_local(agent_state) actor_loss = -self.critic_local(full_states, actor_local_actions).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # # self.soft_update(self.critic_local, self.critic_target, TAU) # self.soft_update(self.actor_local, self.actor_target, TAU) def hard_copy_weights(self, target, source): """ copy weights from source to target network (part of initialization)""" for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, lr_actor, lr_critic, batch_size, buffer_size, noise_decay, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.batch_size = batch_size self.seed = random.seed(random_seed) if torch.cuda.is_available(): print("--- Using GPU ---") else: print("--- Using CPU ---") # Actor Network (w/ Target Network) fc1 = 128 fc2 = 64 fc3 = 32 self.actor_local = Actor(state_size, action_size, random_seed, fc1_units=fc1, fc2_units=fc2, fc3_units=fc3).to(device) self.actor_target = Actor(state_size, action_size, random_seed*2, fc1_units=fc1, fc2_units=fc2, fc3_units=fc3).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network (w/ Target Network) fc1 = 128 fc2 = 64 fc3 = 32 self.critic_local = Critic(state_size, action_size, random_seed*3, fcs1_units=fc1, fc2_units=fc2, fc3_units=fc3).to(device) self.critic_target = Critic(state_size, action_size, random_seed*4, fcs1_units=fc1, fc2_units=fc2, fc3_units=fc3).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic) # Load the networks from previous simulation, if there are any self.load_graphs() # Noise process self.noise = OUNoise(action_size, random_seed) # Noise ampliture self.noise_amplitude = 1.0 # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed) def step(self, states, actions, rewards, next_states, dones, gamma, tau): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, gamma, tau) def act(self, state, add_noise=True, noise_decay=1): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample()*self.noise_amplitude self.noise_amplitude *= noise_decay return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma, tau): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, tau) self.soft_update(self.actor_local, self.actor_target, tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def save_graphs(self): """ Save the graphs in the same directory as the notebook. """ torch.save(self.actor_local.state_dict(), 'checkpoint_actor.pth') torch.save(self.critic_local.state_dict(), 'checkpoint_critic.pth') def load_graphs(self): """ Load the graphs if there are in the same directory as the notebook. """ if (os.path.isfile('checkpoint_actor.pth') and os.path.isfile('checkpoint_critic.pth')): self.actor_local.load_state_dict(torch.load('checkpoint_actor.pth')) self.critic_local.load_state_dict(torch.load('checkpoint_critic.pth'))
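# --- Hedged usage sketch (illustrative only) ------------------------------------
# A minimal episode loop for the agent above. The environment interface
# (reset()/step() returning a 4-tuple), the hyperparameter values, and the step
# limit are assumptions; only act/step/reset/save_graphs come from the class.
import numpy as np

def run_episode(env, agent, gamma=0.99, tau=1e-3, noise_decay=0.999, max_steps=1000):
    states = env.reset()
    agent.reset()                                   # restart the OU noise process
    for _ in range(max_steps):
        actions = agent.act(states, add_noise=True, noise_decay=noise_decay)
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones, gamma, tau)
        states = next_states
        if np.any(dones):
            break
    agent.save_graphs()                             # checkpoint actor/critic weights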
class DDPG(object): def __init__(self, memory, nb_status, nb_actions, action_noise=None, gamma=0.99, tau=0.001, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), actor_lr=1e-4, critic_lr=1e-3): self.nb_status = nb_status self.nb_actions = nb_actions self.action_range = action_range self.observation_range = observation_range self.normalize_observations = normalize_observations self.actor = Actor(self.nb_status, self.nb_actions) self.actor_target = Actor(self.nb_status, self.nb_actions) self.actor_optim = Adam(self.actor.parameters(), lr=actor_lr) self.critic = Critic(self.nb_status, self.nb_actions) self.critic_target = Critic(self.nb_status, self.nb_actions) self.critic_optim = Adam(self.critic.parameters(), lr=critic_lr) # Create replay buffer self.memory = memory # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.action_noise = action_noise # Hyper-parameters self.batch_size = batch_size self.tau = tau self.discount = gamma if self.normalize_observations: self.obs_rms = RunningMeanStd() else: self.obs_rms = None def pi(self, obs, apply_noise=True, compute_Q=True): obs = np.array([obs]) action = to_numpy(self.actor(to_tensor(obs))).squeeze(0) if compute_Q: q = self.critic([to_tensor(obs), to_tensor(action)]).cpu().data else: q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() assert noise.shape == action.shape action += noise action = np.clip(action, self.action_range[0], self.action_range[1]) return action, q[0][0] def store_transition(self, obs0, action, reward, obs1, terminal1): self.memory.append(obs0, action, reward, obs1, terminal1) if self.normalize_observations: self.obs_rms.update(np.array([obs0])) def train(self): # Get a batch. batch = self.memory.sample(batch_size=self.batch_size) next_q_values = self.critic_target([ to_tensor(batch['obs1'], volatile=True), self.actor_target(to_tensor(batch['obs1'], volatile=True))]) next_q_values.volatile = False target_q_batch = to_tensor(batch['rewards']) + \ self.discount * to_tensor(1 - batch['terminals1'].astype('float32')) * next_q_values self.critic.zero_grad() q_batch = self.critic([to_tensor(batch['obs0']), to_tensor(batch['actions'])]) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() self.actor.zero_grad() policy_loss = -self.critic([to_tensor(batch['obs0']), self.actor(to_tensor(batch['obs0']))]).mean() policy_loss.backward() self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return value_loss.cpu().data[0], policy_loss.cpu().data[0] def initialize(self): hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) def update_target_net(self): soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) def reset(self): if self.action_noise is not None: self.action_noise.reset() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda()
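# --- Hedged usage sketch for the DDPG class above -------------------------------
# train() expects memory.sample(batch_size) to return a dict with 'obs0',
# 'actions', 'rewards', 'obs1' and 'terminals1'; the tiny buffer below only
# illustrates that contract and is not the project's actual memory class. The
# state/action sizes are placeholders.
import numpy as np

class TinyMemory:
    def __init__(self):
        self.data = []
    def append(self, obs0, action, reward, obs1, terminal1):
        self.data.append((obs0, action, reward, obs1, terminal1))
    def sample(self, batch_size):
        idx = np.random.randint(len(self.data), size=batch_size)
        cols = list(zip(*[self.data[i] for i in idx]))
        return {'obs0': np.array(cols[0]), 'actions': np.array(cols[1]),
                'rewards': np.array(cols[2], dtype='float32').reshape(-1, 1),
                'obs1': np.array(cols[3]),
                'terminals1': np.array(cols[4]).reshape(-1, 1)}

ddpg = DDPG(TinyMemory(), nb_status=3, nb_actions=1, batch_size=4)
ddpg.initialize()                                    # sync target networks once
obs = np.zeros(3, dtype='float32')
action, q = ddpg.pi(obs, apply_noise=False)          # greedy action and its Q estimate
ddpg.store_transition(obs, action, 0.0, obs, False)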
class Agent(): """Interacts with and learns from the environment""" def __init__(self, state_size, action_size, random_seed=0): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.epsilon = EPSILON # Actor network self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) #self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR, weight_decay=WEIGHT_DECAY) # Critic network self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) #self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, timestep): """Save experience in replay memory, and use random sample from buffer to learn""" # save experience/reward # if updating in batches, then add the last memory of the agents(e.g. 20 agents) to a buffer # and if we've met batch size, push to learn in multiples of LEARN_NUM self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and timestep % LEARN_EVERY == 0: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.epsilon * self.noise.sample() return np.clip(action, -1, 1) def reset(self): """Reset the noise""" self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Update critic # get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # compute Q targets for current states(y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # gradient clipping for critic if GRAD_CLIPPING > 0: torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), GRAD_CLIPPING) self.critic_optimizer.step() # update actor # compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # update target networks self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # update epsilon decay #if EPSILON_DECAY > 0: # self.epsilon -= EPSILON_DECAY # self.noise.reset() self.epsilon -= EPSILON_DECAY self.noise.reset() if EPSILON_DECAY <= 0: self.epsilon = 0 self.noise.reset()
def training(file_name): # Create folders. if not os.path.isdir(SAVE_DIR): os.makedirs(SAVE_DIR) if not os.path.isdir(CSV_DIR): os.makedirs(CSV_DIR) if not os.path.isdir(FIGURE_TRAINING_DIR): os.makedirs(FIGURE_TRAINING_DIR) # Load models. actor = Actor(name="actor") actor_target = Actor(name="actor_target") actor_initial_update_op = target_update_op( actor.trainable_variables, actor_target.trainable_variables, 1.0) actor_target_update_op = target_update_op(actor.trainable_variables, actor_target.trainable_variables, TARGET_UPDATE_RATE) critic = Critic(name="critic") critic.build_training() critic_target = Critic(name="critic_target") critic_initial_update_op = target_update_op( critic.trainable_variables, critic_target.trainable_variables, 1.0) critic_target_update_op = target_update_op( critic.trainable_variables, critic_target.trainable_variables, TARGET_UPDATE_RATE) critic_with_actor = Critic(name="critic", A=actor.pi) actor.build_training(critic_with_actor.actor_loss) env = PendulumEnv() replay_buffer = ReplayBuffer(BUFFER_SIZE) action_noise = OUActionNoise(np.zeros(A_LENGTH)) with tf.Session() as sess: # Initialize actor and critic networks. sess.run(tf.global_variables_initializer()) sess.run([actor_initial_update_op, critic_initial_update_op]) list_final_reward = [] additional_episode = int(np.ceil(MIN_BUFFER_SIZE / MAX_FRAME)) for episode in range(-additional_episode, MAX_EPISODE): list_actor_loss = [] list_critic_loss = [] # Reset the environment and noise. s = env.reset() action_noise.reset() for step in range(MAX_FRAME): env.render() # Get action. a = sess.run(actor.pi, feed_dict={actor.S: np.reshape(s, (1, -1))}) noise = action_noise.get_noise() a = a[0] + ACTION_SCALING * noise a = np.clip(a, -ACTION_SCALING, ACTION_SCALING) # Interact with the game engine. s1, r, _, _ = env.step(a) # Add data to the replay buffer. data = [s, a, [r], s1] replay_buffer.append(data) if episode >= 0: for _ in range(BATCHES_PER_STEP): # Sample data from the replay buffer. batch_data = replay_buffer.sample(BATCH_SIZE) batch_s, batch_a, batch_r, batch_s1 = [ np.array( [batch_data[j][i] for j in range(BATCH_SIZE)]) for i in range(len(batch_data[0])) ] # Compute the next action. a1 = sess.run(actor_target.pi, feed_dict={actor_target.S: batch_s1}) # Compute the target Q. q1 = sess.run(critic_target.q, feed_dict={ critic_target.S: batch_s1, critic_target.A: a1 }) q_target = batch_r + DISCOUNT * q1 # Update actor and critic. _, _, actor_loss, critic_loss = sess.run( [ actor.train_op, critic.train_op, actor.actor_loss, critic.critic_loss ], feed_dict={ actor.S: batch_s, critic_with_actor.S: batch_s, actor.LR: LR_ACTOR, critic.S: batch_s, critic.A: batch_a, critic.QTarget: q_target, critic.LR: LR_CRITIC }) list_actor_loss.append(actor_loss) list_critic_loss.append(critic_loss) # Update target networks. sess.run( [actor_target_update_op, critic_target_update_op]) s = s1 # Postprocessing after each episode. if episode >= 0: list_final_reward.append(r) avg_actor_loss = np.mean(list_actor_loss) avg_critic_loss = np.mean(list_critic_loss) print("Episode ", format(episode, "03d"), ":", sep="") print(" Final Reward = ", format(r, ".6f"), ", Actor Loss = ", format(avg_actor_loss, ".6f"), ", Critic Loss = ", format(avg_critic_loss, ".6f"), sep="") # Testing. avg_reward = 0 for i in range(TEST_EPISODE): # Reset the environment and noise. s = env.reset() action_noise.reset() for step in range(MAX_FRAME): env.render() # Get action. 
a = sess.run(actor.pi, feed_dict={actor.S: np.reshape(s, (1, -1))}) a = a[0] # Interact with the game engine. s, r, _, _ = env.step(a) # Postprocessing after each episode. avg_reward += r avg_reward /= TEST_EPISODE # Save the parameters. saver = tf.train.Saver( [*actor.trainable_variables, *critic.trainable_variables]) saver.save(sess, SAVE_DIR + file_name) tf.contrib.keras.backend.clear_session() env.close() # Store data in the csv file. with open(CSV_DIR + file_name + ".csv", "w") as f: fieldnames = ["Episode", "Final Reward", "Average Reward"] writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator="\n") writer.writeheader() for episode in range(MAX_EPISODE): content = { "Episode": episode, "Final Reward": list_final_reward[episode] } if episode == MAX_EPISODE - 1: content.update({"Average Reward": avg_reward}) writer.writerow(content) # Plot the training process. list_episode = list(range(MAX_EPISODE)) f, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5)) ax.plot(list_episode, list_final_reward, "r-", label="Final Reward") ax.plot([MAX_EPISODE - 1], [avg_reward], "b.", label="Average Reward") ax.set_title("Final Reward") ax.set_xlabel("Episode") ax.set_ylabel("Reward") ax.legend(loc="lower right") ax.grid() f.savefig(FIGURE_TRAINING_DIR + file_name + ".png") plt.close(f)
class Agent: """ The reinforcement learning agent. """ def __init__(self, state_size: int, action_size: int, n_agents: int, seed: int) -> None: """Initializes an Agent object. Args: state_size (int): The dimension of the state vector. action_size (int): The dimension of the action vector. n_agents (int): The number of agents. seed (int): The initialization value for the random number generator. """ self.state_size = state_size self.action_size = action_size self.n_agents = n_agents self.seed = random.seed(seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # An Ornstein Uhlenbeck process is used to generate noise. self.noise = OrnsteinUhlenbeckNoise(action_size, seed) # Replay buffer self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) def step(self, state: torch.Tensor, action: torch.Tensor, reward: torch.Tensor, next_state: torch.Tensor, done: torch.Tensor) -> None: """ Save the experience within the ReplayBuffer. Args: state (torch.Tensor): A state vector. action (torch.Tensor): An action vector. reward (torch.Tensor): A reward vector. next_state (torch.Tensor): A vector containing the states following the given states. done (torch.Tensor): A vector containing done flags. """ for i in range(self.n_agents): self.memory.add(state[i,:], action[i,:], reward[i], next_state[i,:], done[i]) """ In case there are enough experiences within ReplayBuffer, start learning. """ if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state: torch.Tensor, add_noise: bool = True): """ Using the actor network the method return a vector of actions given the state vector using the current policy. Args: state (torch.Tensor): A state vector. add_noise (bool): A flag indicating the use of noise. Returns: An vector of actions. """ state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: # Add Ornstein Uhlenbeck noise. action += self.noise.sample() return np.clip(action, -1, 1) def reset(self) -> None: """ Reset the Ornstein Uhlenbeck process. """ self.noise.reset() def learn(self, experiences: Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], gamma: float) -> None: """ Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Args: experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) q_targets = rewards + (gamma * q_targets_next * (1 - dones)) # Retrieve the predicted q value q_expected = self.critic_local(states, actions) # Compute the loss as the measn square error between expected and computed q value. critic_loss = f.mse_loss(q_expected, q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) @staticmethod def soft_update(local_model, target_model, tau: float) -> None: """ Update the model parameters according to this formula: θ_target = τ*θ_local + (1 - τ)*θ_target Args: local_model (PyTorch model): weights will be copied from this model target_model (PyTorch model): weights will be copied to this model tau (float): interpolation parameter, tau = 1 results in complete overwrite """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
def __init__(self, device, state_size, n_agents, action_size, random_seed, buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay, checkpoint_folder='./'): self.DEVICE = device self.state_size = state_size self.n_agents = n_agents self.action_size = action_size self.seed = random.seed(random_seed) # Hyperparameters self.BUFFER_SIZE = buffer_size self.BATCH_SIZE = batch_size self.GAMMA = gamma self.TAU = TAU self.LR_ACTOR = lr_actor self.LR_CRITIC = lr_critic self.WEIGHT_DECAY = weight_decay self.CHECKPOINT_FOLDER = checkpoint_folder # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(self.DEVICE) self.actor_target = Actor(state_size, action_size, random_seed).to(self.DEVICE) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(self.DEVICE) self.critic_target = Critic(state_size, action_size, random_seed).to(self.DEVICE) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.LR_CRITIC, weight_decay=self.WEIGHT_DECAY) ''' if os.path.isfile(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth') and os.path.isfile(self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth'): self.actor_local.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth')) self.actor_target.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_actor.pth')) self.critic_local.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth')) self.critic_target.load_state_dict(torch.load(self.CHECKPOINT_FOLDER + 'checkpoint_critic.pth')) ''' # Noise process self.noise = OUNoise((n_agents, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(device, action_size, self.BUFFER_SIZE, self.BATCH_SIZE, random_seed)
class Agent(): """Interacts with and learns from the environment.""" def __init__( self, state_size=None, # state space size action_size=None, # action size memory=None, buffer_size=BUFFER_SIZE, # replay buffer size batch_size=BATCH_SIZE, # minibatch size gamma=GAMMA, # discount factor tau=TAU, # for soft update of target parameters lr_actor=LR_ACTOR, # learning rate of the actor lr_critic=LR_CRITIC, # learning rate of the critic weight_decay=WEIGHT_DECAY, # L2 weight decay random_seed=RANDOM_SEED): self.state_size = state_size self.action_size = action_size self.buffer_size = buffer_size # replay buffer size self.batch_size = batch_size # minibatch size self.gamma = gamma # discount factor self.tau = tau # for soft update of target parameters self.lr_actor = lr_actor # learning rate of the actor self.lr_critic = lr_critic # learning rate of the critic self.weight_decay = weight_decay # L2 weight decay self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=self.weight_decay) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory if not isinstance(memory, ReplayBuffer): memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed, device) self.memory = memory def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
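# --- Hedged usage sketch --------------------------------------------------------
# Because the constructor above accepts an already-built ReplayBuffer, several
# agents can learn from one shared buffer. The positional arguments follow the
# ReplayBuffer call inside __init__; the state/action sizes are illustrative only.
shared_memory = ReplayBuffer(2, BUFFER_SIZE, BATCH_SIZE, RANDOM_SEED, device)
agent_a = Agent(state_size=33, action_size=2, memory=shared_memory)
agent_b = Agent(state_size=33, action_size=2, memory=shared_memory)
assert agent_a.memory is agent_b.memory   # both agents sample the same experiences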
class Agent(): """Main DDPG agent that extracts experiences and learns from them""" def __init__(self, state_size=8, action_size=2, random_seed=0): """ Initializes Agent object. @Param: 1. state_size: dimension of each state. 2. action_size: number of actions. """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) #Actor network self.actor_local = Actor(self.state_size, self.action_size, random_seed).to(device) self.actor_target = Actor(self.state_size, self.action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) #Critic network self.critic_local = Critic(self.state_size, self.action_size, random_seed).to(device) self.critic_target = Critic(self.state_size, self.action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) #Noise proccess self.noise = OUNoise(action_size, random_seed) #define Ornstein-Uhlenbeck process #Replay memory self.memory = ReplayBuffer( self.action_size, BUFFER_SIZE, MINI_BATCH, random_seed) #define experience replay buffer object self.time_step = 0 def reset(self): """Resets the noise process to mean""" self.noise.reset() def act(self, state, add_noise=True): """ Returns a deterministic action given current state. @Param: 1. state: current state, S. 2. add_noise: (bool) add bias to agent, default = True (training mode) """ state = torch.from_numpy(state).float().unsqueeze(0).to( device) #typecast to torch.Tensor self.actor_local.eval() #set in evaluation mode with torch.no_grad(): #reset gradients action = self.actor_local(state).cpu().data.numpy( ) #deterministic action based on Actor's forward pass. self.actor_local.train() #set training mode #If training mode, i.e. add_noise = True, add noise to the model to learn a more accurate policy for current state. if (add_noise): action += self.noise.sample() return np.clip(action, -1, 1) def learn(self, experiences, gamma=GAMMA): """ Learn from a set of experiences picked up from a random sampling of even frequency (not prioritized) of experiences when buffer_size = MINI_BATCH. Updates policy and value parameters accordingly @Param: 1. experiences: (Tuple[torch.Tensor]) set of experiences, trajectory, tau. tuple of (s, a, r, s', done) 2. gamma: immediate reward hyper-parameter, 0.99 by default. 
""" #Extrapolate experience into (state, action, reward, next_state, done) tuples states, actions, rewards, next_states, dones = experiences #Update Critic network actions_next = self.actor_target( next_states ) # Get predicted next-state actions and Q values from target models Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # r + γ * Q-values(a,s) # Compute critic loss using MSE Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) #clip gradients self.critic_optimizer.step() #Update Actor Network # Compute actor loss actions_pred = self.actor_local(states) #gets mu(s) actor_loss = -self.critic_local(states, actions_pred).mean() #gets V(s,a) # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. Copies model τ every experience. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, params, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.n_agents = 1 self.state_size = params['state_size'] self.action_size = params['action_size'] self.batch_size = params['batch_size'] self.gamma = params['gamma'] self.tau = params['tau'] self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(self.state_size, self.action_size, random_seed).to(device) self.actor_target = Actor(self.state_size, self.action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=params['lr_actor']) # Critic Network (w/ Target Network) self.critic_local = Critic(self.state_size, self.action_size, random_seed).to(device) self.critic_target = Critic(self.state_size, self.action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=params['lr_critic'], weight_decay=params['weight_decay']) # Noise process self.noise = OUNoise(self.action_size, random_seed) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed, num_agents): self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.num_agents = num_agents # Actor Networks self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Networks self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, timestep, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Add experience s a r s' d from all agents to replay buffer for i in range(self.num_agents): self.memory.add(states[i,:], actions[i,:], rewards[i], next_states[i,:], dones[i]) if timestep % UPDATE_TIMESTEPS ==0: # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: for i in range(MEMORY_SAMPLE_TIMES): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for i in range(self.num_agents): actions[i, :] = self.actor_local(states[i]).cpu().data.numpy() self.actor_local.train() if add_noise: for i in range(len(actions)): actions[i, :] += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) ## use gradient clipping when training self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class SAC(): def __init__(self): self.V=V(n_state).to(device) self.target_V=V(n_state).to(device) self.policy=Actor(n_state,max_action).to(device) self.Q=Q(n_state,n_action).to(device) self.optimV=th.optim.Adam(self.V.parameters(),lr=lr) self.optimQ=th.optim.Adam(self.Q.parameters(),lr=lr) self.optimP=th.optim.Adam(self.policy.parameters(),lr=lr) self.memory=replay_memory(memory_size) def choose_action(self,s): mu,log_std=self.policy(s) dist=Normal(mu,th.exp(log_std)) action=dist.sample() action = th.tanh(action) return action def V_learn(self,batch): b_s=th.FloatTensor(batch[:,0].tolist()).to(device) b_a=th.FloatTensor(batch[:,2].tolist()).to(device) mu,log_std=self.policy(b_s) dist=Normal(mu,th.exp(log_std)) z=dist.sample() b_a=th.tanh(z) prob=dist.log_prob(z) qs=self.Q(b_s,b_a) v=self.V(b_s) target_v=qs-prob loss=(v-target_v.detach())**2 loss=loss.mean() self.optimV.zero_grad() loss.backward() self.optimV.step() def Q_learn(self,batch): b_s=th.FloatTensor(batch[:,0].tolist()).to(device) b_r=th.FloatTensor(batch[:,1].tolist()).to(device) b_a=th.FloatTensor(batch[:,2].tolist()).to(device) b_s_=th.FloatTensor(batch[:,3].tolist()).to(device) b_d=th.FloatTensor(batch[:,4].tolist()).to(device) target_q=b_r+(1-b_d)*gamma*self.target_V(b_s_) eval_q=self.Q(b_s,b_a) loss=(eval_q-target_q.detach())**2 loss=loss.mean() self.optimQ.zero_grad() loss.backward() self.optimQ.step() def P_learn(self,batch): b_s=th.FloatTensor(batch[:,0].tolist()).to(device) norm=Normal(th.zeros((batchsize,1)),th.ones((batchsize,1))) #norm=Normal(0,1) mu,log_std=self.policy(b_s) z=norm.sample() b_a=th.tanh(mu+th.exp(log_std)*z.to(device)) dist=Normal(mu,th.exp(log_std)) log_prob=dist.log_prob(mu+th.exp(log_std)*z.to(device))- th.log(1 - b_a.pow(2) + 1e-7) qs=self.Q(b_s,b_a) loss=alpha*log_prob-qs loss=loss.mean() self.optimP.zero_grad() loss.backward() self.optimP.step() def soft_update(self): for param,target_param in zip(self.V.parameters(),self.target_V.parameters()): target_param.data.copy_(tau*param.data+(1-tau)*target_param.data)
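# --- Hedged, standalone sketch of the tanh correction used in P_learn above ----
# For a squashed action a = tanh(z) with z ~ N(mu, std), the change of variables
# adds a -log(1 - tanh(z)^2) term to the log-density of each action dimension.
# The 1e-7 epsilon mirrors the code above; shapes and values are illustrative.
import torch as th
from torch.distributions import Normal

mu = th.zeros(4, 1)
log_std = th.zeros(4, 1)
dist = Normal(mu, th.exp(log_std))
z = dist.rsample()                                  # reparameterized Gaussian sample
a = th.tanh(z)                                      # squashed action in (-1, 1)
log_prob = dist.log_prob(z) - th.log(1 - a.pow(2) + 1e-7)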
class Agent: """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, params, seed=0): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents params (dict): hyperparameters seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.params = params random.seed(seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.params['lr_actor']) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=self.params['lr_critic'], weight_decay=self.params['weight_decay']) # Noise process self.noise = OUNoise(action_size, seed) # Replay memory self.memory = ReplayBuffer(self.params['buffer_size'], self.params['batch_size'], seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for i in range(self.num_agents): self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i]) # Learn every UPDATE_EVERY time steps self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # Learn, if enough samples are available in memory if len(self.memory) > self.params['batch_size']: for i in range(LEARN_UPDATES): experiences = self.memory.sample() self.learn(experiences, self.params['gamma']) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for i, state in enumerate(states): actions[i, :] = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Update critic self.update_critic(states, actions, rewards, next_states, dones, gamma) # Update actor self.update_actor(states) # Update target networks self.soft_update(self.critic_local, self.critic_target, self.params['tau']) self.soft_update(self.actor_local, self.actor_target, self.params['tau']) def update_actor(self, states): """Update actor parameters using given batch of experience tuples.""" # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() def update_critic(self, states, actions, rewards, next_states, dones, gamma): """Update critic parameters using given batch of experience tuples.""" # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent: def __init__(self, state_size, action_size, num_agents, random_seed): """Initialize an Agent object. Params ===== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.epsilon = EPSILON_MAX # Actor Network self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) self.noise = [ OUNoise(action_size, random_seed) for i in range(self.num_agents) ] self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) # self.hard_update(self.actor_target, self.actor_local) # self.hard_update(self.critic_target, self.critic_local) def step(self, state, action, reward, next_state, done, time_step): """Save experience in memory and use random samples from buffer to learn.""" self.memory.add(state, action, reward, next_state, done, self.num_agents) if len(self.memory) > BATCH_SIZE and time_step % UPDATE_EVERY == 0: # learn LEARN_NUM times every UPDATE_EVERY time steps for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Return actions for given state as per current policy""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: for i in range(self.num_agents): action[i] += self.epsilon * self.noise[i].sample() # add epsilon-scaled OU noise to each agent's action in place return np.clip(action, -1, 1) def reset(self): for i in range(self.num_agents): self.noise[i].reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q_value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s,a,r,s',done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------- update critic ---------------------------- # actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + gamma * Q_targets_next * (1 - dones) # compute the loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------- update actor ----------------------------- # # compute the loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ---------------------- update target network ---------------------- # self.soft_update(self.actor_local, self.actor_target, TAU) self.soft_update(self.critic_local, self.critic_target, TAU) # ---------------------- update noise ------------------------------- # if self.epsilon - EPSILON_DECAY > EPSILON_MIN: self.epsilon -= EPSILON_DECAY else: self.epsilon = EPSILON_MIN def soft_update(self, local_model, target_model, tau): """Soft update model parameters θ_target = τ * θ_local + (1 - τ) * θ_target Params ===== local_model: Network weights to be copied from target_model: Network weights to be copied to tau(float): interpolation parameter """ for local_param, target_param in zip(local_model.parameters(), target_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
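# The Agent above draws exploration noise from one OUNoise process per agent,
# but the class itself is defined elsewhere; this is a conventional
# Ornstein-Uhlenbeck sketch matching the sample()/reset() interface used above.
# The mu/theta/sigma defaults are assumptions, not the project's settings.
import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the long-run mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self.rng.standard_normal(len(x))
        self.state = x + dx
        return self.state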
def main(): env = gym.make(args.env_name) env.seed(args.seed) torch.manual_seed(args.seed) num_inputs = env.observation_space.shape[0] num_actions = env.action_space.shape[0] running_state = ZFilter((num_inputs,), clip=5) print('state size:', num_inputs) print('action size:', num_actions) actor = Actor(num_inputs, num_actions, args) critic = Critic(num_inputs, args) discrim = Discriminator(num_inputs + num_actions, args) actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate) critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, weight_decay=args.l2_rate) discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate) # load demonstrations expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb")) demonstrations = np.array(expert_demo) print("demonstrations.shape", demonstrations.shape) writer = SummaryWriter(args.logdir) if args.load_model is not None: saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) ckpt = torch.load(saved_ckpt_path) actor.load_state_dict(ckpt['actor']) critic.load_state_dict(ckpt['critic']) discrim.load_state_dict(ckpt['discrim']) running_state.rs.n = ckpt['z_filter_n'] running_state.rs.mean = ckpt['z_filter_m'] running_state.rs.sum_square = ckpt['z_filter_s'] print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n)) episodes = 0 train_discrim_flag = True for iter in range(args.max_iter_num): actor.eval(), critic.eval() memory = deque() steps = 0 scores = [] while steps < args.total_sample_size: state = env.reset() score = 0 state = running_state(state) for _ in range(10000): if args.render: env.render() steps += 1 mu, std = actor(torch.Tensor(state).unsqueeze(0)) action = get_action(mu, std)[0] next_state, reward, done, _ = env.step(action) irl_reward = get_reward(discrim, state, action) if done: mask = 0 else: mask = 1 memory.append([state, action, irl_reward, mask]) next_state = running_state(next_state) state = next_state score += reward if done: break episodes += 1 scores.append(score) score_avg = np.mean(scores) print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg)) writer.add_scalar('log/score', float(score_avg), iter) actor.train(), critic.train(), discrim.train() if train_discrim_flag: expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args) print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100)) if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen: train_discrim_flag = False train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args) if iter % 100 == 0: score_avg = int(score_avg) model_path = os.path.join(os.getcwd(),'save_model') if not os.path.isdir(model_path): os.makedirs(model_path) ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar') save_checkpoint({ 'actor': actor.state_dict(), 'critic': critic.state_dict(), 'discrim': discrim.state_dict(), 'z_filter_n':running_state.rs.n, 'z_filter_m': running_state.rs.mean, 'z_filter_s': running_state.rs.sum_square, 'args': args, 'score': score_avg }, filename=ckpt_path)
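# main() above leans on helpers defined in other modules (get_action, get_reward,
# save_checkpoint). The versions below are plausible minimal sketches only, not
# the project's actual code; in particular the GAIL surrogate reward is written
# as -log(1 - D(s, a)), which is one common choice and may differ from the real
# get_reward().
import numpy as np
import torch

def get_action(mu, std):
    # sample an action from the Gaussian policy head and hand it back as numpy
    return torch.normal(mu, std).detach().numpy()

def get_reward(discrim, state, action):
    # discriminator takes the concatenated (state, action) pair, as in the
    # Discriminator(num_inputs + num_actions, args) construction above
    state_action = torch.Tensor(np.concatenate((state, action))).unsqueeze(0)
    with torch.no_grad():
        d = discrim(state_action)
    return -torch.log((1 - d).clamp(min=1e-8)).item()

def save_checkpoint(state, filename='checkpoint.pth.tar'):
    # thin wrapper so the caller can bundle models, ZFilter stats and args
    torch.save(state, filename)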
class DDPG(object): def __init__(self, nb_status, nb_actions, args, writer): self.clip_actor_grad = args.clip_actor_grad self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.discrete = args.discrete self.pic = args.pic self.writer = writer self.select_time = 0 if self.pic: self.nb_status = args.pic_status # Create Actor and Critic Network net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'use_bn':args.bn, 'init_method':args.init_method } if args.pic: self.cnn = CNN(1, args.pic_status) self.cnn_target = CNN(1, args.pic_status) self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate) self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) if args.pic: hard_update(self.cnn_target, self.cnn) #Create replay buffer self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def normalize(self, pic): pic = pic.swapaxes(0, 2).swapaxes(1, 2) return pic def update_policy(self): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch if self.pic: state_batch = np.array([self.normalize(x) for x in state_batch]) state_batch = to_tensor(state_batch, volatile=True) state_batch = self.cnn(state_batch) next_state_batch = np.array([self.normalize(x) for x in next_state_batch]) next_state_batch = to_tensor(next_state_batch, volatile=True) next_state_batch = self.cnn_target(next_state_batch) next_q_values = self.critic_target([ next_state_batch, self.actor_target(next_state_batch) ]) else: next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False q_batch = self.critic([state_batch, to_tensor(action_batch)]) else: q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = criterion(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() if self.pic: self.cnn_optim.step() self.actor.zero_grad() if self.pic: self.cnn.zero_grad() if self.pic: state_batch.volatile = False policy_loss = -self.critic([ state_batch, self.actor(state_batch) ]) else: policy_loss = -self.critic([ to_tensor(state_batch), self.actor(to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if 
self.clip_actor_grad is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad)) if self.writer != None: mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()])) #print(mean_policy_grad) self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time) self.actor_optim.step() if self.pic: self.cnn_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) if self.pic: soft_update(self.cnn_target, self.cnn, self.tau) return -policy_loss, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() if(self.pic): self.cnn.eval() self.cnn_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() if(self.pic): self.cnn.train() self.cnn_target.train() def cuda(self): self.cnn.cuda() self.cnn_target.cuda() self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self, fix=False): action = np.random.uniform(-1.,1.,self.nb_actions) self.a_t = action if self.discrete and fix == False: action = action.argmax() # if self.pic: # action = np.concatenate((softmax(action[:16]), softmax(action[16:]))) return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): self.eval() if self.pic: s_t = self.normalize(s_t) s_t = self.cnn(to_tensor(np.array([s_t]))) if self.pic: action = to_numpy( self.actor_target(s_t) ).squeeze(0) else: action = to_numpy( self.actor(to_tensor(np.array([s_t]))) ).squeeze(0) self.train() noise_level = noise_level * max(self.epsilon, 0) if np.random.uniform(0, 1) < noise_level: action = self.random_action(fix=True) # episilon greedy if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action if return_fix: return action if self.discrete: return action.argmax() else: return action def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=1): if output is None: return self.actor.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.actor_target.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) def save_model(self, output, num): if self.use_cuda: self.cnn.cpu() self.actor.cpu() self.critic.cpu() torch.save( self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num) ) torch.save( self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num) ) if self.use_cuda: self.cnn.cuda() self.actor.cuda() self.critic.cuda()
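# hard_update() and soft_update() are imported by the DDPG class above but not
# shown. These are the conventional definitions (copy weights outright at
# initialisation vs. Polyak averaging with tau each update), matching the
# target-first argument order used at the call sites.
def hard_update(target, source):
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def soft_update(target, source, tau):
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)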
def buscarActores(pkActores): actores = Actor.actores(pkActores) return actores
def actor(): """Returns all actors and their data.""" return Actor.all()
class DDPG(object): def __init__(self, nb_status, nb_actions, args, writer): self.clip_actor_grad = args.clip_actor_grad self.nb_status = nb_status * args.window_length self.nb_actions = nb_actions self.writer = writer self.select_time = 0 # Create Actor and Critic Network net_cfg = { 'hidden1':args.hidden1, 'hidden2':args.hidden2, 'init_method':args.init_method } self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=args.prate) self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=args.rate) hard_update(self.actor_target, self.actor) # Make sure target is with the same weight hard_update(self.critic_target, self.critic) #Create replay buffer self.memory = rpm(args.rmsize) self.random_process = Myrandom(size=nb_actions) # Hyper-parameters self.batch_size = args.batch_size self.tau = args.tau self.discount = args.discount self.depsilon = 1.0 / args.epsilon # self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.use_cuda = args.cuda # if self.use_cuda: self.cuda() def update_policy(self, train_actor = True): # Sample batch state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size) # Prepare for the target q batch next_q_values = self.critic_target([ to_tensor(next_state_batch, volatile=True), self.actor_target(to_tensor(next_state_batch, volatile=True)), ]) # print('batch of picture is ok') next_q_values.volatile = False target_q_batch = to_tensor(reward_batch) + \ self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values # Critic update self.critic.zero_grad() q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)]) # print(reward_batch, next_q_values*self.discount, target_q_batch, terminal_batch.astype(np.float)) value_loss = nn.MSELoss()(q_batch, target_q_batch) value_loss.backward() self.critic_optim.step() self.actor.zero_grad() policy_loss = -self.critic([ to_tensor(state_batch), self.actor(to_tensor(state_batch)) ]) policy_loss = policy_loss.mean() policy_loss.backward() if self.clip_actor_grad is not None: torch.nn.utils.clip_grad_norm(self.actor.parameters(), float(self.clip_actor_grad)) if self.writer != None: mean_policy_grad = np.array(np.mean([np.linalg.norm(p.grad.data.cpu().numpy().ravel()) for p in self.actor.parameters()])) #print(mean_policy_grad) self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time) if train_actor: self.actor_optim.step() # Target update soft_update(self.actor_target, self.actor, self.tau) soft_update(self.critic_target, self.critic, self.tau) return -policy_loss, value_loss def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() def train(self): self.actor.train() self.actor_target.train() self.critic.train() self.critic_target.train() def cuda(self): self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def observe(self, r_t, s_t1, done): self.memory.append([self.s_t, self.a_t, r_t, s_t1, done]) self.s_t = s_t1 def random_action(self): action = np.random.uniform(-1.,1.,self.nb_actions) self.a_t = action return action def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0): self.eval() # print(s_t.shape) action = 
to_numpy( self.actor(to_tensor(np.array([s_t]))) ).squeeze(0) self.train() noise_level = noise_level * max(self.epsilon, 0) action = action * (1 - noise_level) + (self.random_process.sample() * noise_level) action = np.clip(action, -1., 1.) if decay_epsilon: self.epsilon -= self.depsilon self.a_t = action return action def reset(self, obs): self.s_t = obs self.random_process.reset_status() def load_weights(self, output, num=1): if output is None: return self.actor.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.actor_target.load_state_dict( torch.load('{}/actor{}.pkl'.format(output, num)) ) self.critic.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) self.critic_target.load_state_dict( torch.load('{}/critic{}.pkl'.format(output, num)) ) def save_model(self, output, num): if self.use_cuda: self.actor.cpu() self.critic.cpu() torch.save( self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num) ) torch.save( self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num) ) if self.use_cuda: self.actor.cuda() self.critic.cuda()
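# to_tensor() / to_numpy() used by the DDPG class above come from a helper
# module written against the old Variable-based PyTorch API (hence the
# `volatile` flags and `.volatile = False` assignments, which only work on
# PyTorch < 0.4). Rough modern equivalents, offered only as a sketch:
import numpy as np
import torch

def to_tensor(ndarray, volatile=False, dtype=torch.float32):
    # `volatile` is accepted for signature compatibility; with current PyTorch
    # the call sites would instead be wrapped in torch.no_grad().
    return torch.as_tensor(np.asarray(ndarray), dtype=dtype)

def to_numpy(tensor):
    return tensor.detach().cpu().numpy()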
import os import pickle import numpy as np from constants import * from environment import Game from model import Actor, Critic from ppo import PPO from utils import plot_data from running_state import * from replay_memory import * env = Game() n_input = env.state_dim actor = Actor(n_input, N_HIDDEN) critic = Critic(n_input, N_HIDDEN) # retrieve previous saved model if exists if os.path.exists(ACTOR_SAVE_PATH): print("Loading saved actor model...") actor.load_state_dict(torch.load(ACTOR_SAVE_PATH)) if os.path.exists(CRITIC_SAVE_PATH): print("Loading saved critic model...") critic.load_state_dict(torch.load(CRITIC_SAVE_PATH)) ppo_agent = PPO(env, actor, critic) running_state = ZFilter((2, ), clip=5) statistics = {
def main(): env = gym.make(args.env_name) env.seed(args.seed) torch.manual_seed(args.seed) num_inputs = env.observation_space.shape[0] num_actions = env.action_space.shape[0] running_state = ZFilter((num_inputs,), clip=5) print('state size:', num_inputs) print('action size:', num_actions) actor = Actor(num_inputs, num_actions, args) critic = Critic(num_inputs, args) actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate) critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, weight_decay=args.l2_rate) writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num)) if args.load_model is not None: saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) ckpt = torch.load(saved_ckpt_path) actor.load_state_dict(ckpt['actor']) critic.load_state_dict(ckpt['critic']) running_state.rs.n = ckpt['z_filter_n'] running_state.rs.mean = ckpt['z_filter_m'] running_state.rs.sum_square = ckpt['z_filter_s'] print("Loaded OK ex. Zfilter N {}".format(running_state.rs.n)) episodes = 0 for iter in range(args.max_iter_num): actor.eval(), critic.eval() memory = deque() steps = 0 scores = [] while steps < args.total_sample_size: state = env.reset() score = 0 state = running_state(state) for _ in range(10000): if args.render: env.render() steps += 1 mu, std = actor(torch.Tensor(state).unsqueeze(0)) action = get_action(mu, std)[0] next_state, reward, done, _ = env.step(action) if done: mask = 0 else: mask = 1 memory.append([state, action, reward, mask]) next_state = running_state(next_state) state = next_state score += reward if done: break episodes += 1 scores.append(score) score_avg = np.mean(scores) print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg)) writer.add_scalar('log/score', float(score_avg), iter) actor.train(), critic.train() train_model(actor, critic, memory, actor_optim, critic_optim, args) if iter % 100 == 0: score_avg = int(score_avg) model_path = os.path.join(os.getcwd(),'save_model') if not os.path.isdir(model_path): os.makedirs(model_path) ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar') save_checkpoint({ 'actor': actor.state_dict(), 'critic': critic.state_dict(), 'z_filter_n':running_state.rs.n, 'z_filter_m': running_state.rs.mean, 'z_filter_s': running_state.rs.sum_square, 'args': args, 'score': score_avg }, filename=ckpt_path)
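# ZFilter comes from running_state.py and is not shown here. It keeps a running
# mean/std of observations (Welford's algorithm) and returns (x - mean) / std,
# clipped. This simplified sketch exposes the same rs.n / rs.mean / rs.sum_square
# fields that the checkpoint code above reads and writes; the real class may
# differ in detail.
import numpy as np

class RunningStat:
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.sum_square = np.zeros(shape)

    def push(self, x):
        x = np.asarray(x)
        self.n += 1
        old_mean = self.mean.copy()
        self.mean += (x - old_mean) / self.n
        self.sum_square += (x - old_mean) * (x - self.mean)

    @property
    def std(self):
        var = self.sum_square / (self.n - 1) if self.n > 1 else np.ones_like(self.sum_square)
        return np.sqrt(var)

class ZFilter:
    def __init__(self, shape, clip=10.0):
        self.rs = RunningStat(shape)
        self.clip = clip

    def __call__(self, x):
        self.rs.push(x)
        x = (x - self.rs.mean) / (self.rs.std + 1e-8)
        return np.clip(x, -self.clip, self.clip)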
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, env, hyper_params, random_seed=0): """Initialize an Agent object. Params ====== env : the problem environment hyper_params : a dictionary of hyper parameters random_seed (int): random seed """ self.state_size = env.state_size self.action_size = env.action_space_size self.hyper_params = hyper_params self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(self.state_size, self.action_size, random_seed).to(device) self.actor_target = Actor(self.state_size, self.action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=hyper_params['lr_actor']) # Critic Network (w/ Target Network) self.critic_local = Critic(self.state_size, self.action_size, random_seed).to(device) self.critic_target = Critic(self.state_size, self.action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=hyper_params['lr_critic'], weight_decay=hyper_params['weight_decay']) # Noise process self.noise = OUNoise((env.num_agents, self.action_size), random_seed) self.epsilon = hyper_params['epsilon'] # Replay memory self.memory = ReplayBuffer(self.action_size, hyper_params['memory_size'], hyper_params['batch_size'], random_seed) def step(self, states, actions, rewards, next_states, dones): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.hyper_params['batch_size']: experiences = self.memory.sample() self.learn(experiences, self.hyper_params['gamma']) def act(self, states, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.FloatTensor(states) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states) self.actor_local.train() if add_noise: noise = torch.FloatTensor(self.noise.sample()) actions += (self.epsilon * noise) self.epsilon = max(0.01, self.epsilon * self.hyper_params['epsilon_decay']) actions = torch.clamp(actions, -0.8, 0.8).detach() actions = actions.cpu().numpy() return actions def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target) self.soft_update(self.actor_local, self.actor_target) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) """ tau = self.hyper_params['tau'] for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
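# Example of wiring up the Agent above: every key read from hyper_params in the
# class appears here. The values are illustrative defaults, not the project's
# tuned settings, and `env` is assumed to expose state_size, action_space_size
# and num_agents as the constructor expects.
hyper_params = {
    'lr_actor': 1e-4,
    'lr_critic': 1e-3,
    'weight_decay': 0.0,
    'epsilon': 1.0,
    'epsilon_decay': 0.999,
    'memory_size': int(1e6),
    'batch_size': 128,
    'gamma': 0.99,
    'tau': 1e-3,
}
agent = Agent(env, hyper_params, random_seed=0)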
class Agent(): def __init__(self, state_size, action_size, n_agents=1, buffer_size=int(1e7), batch_size=256, gamma=.99, tau=1e-3, lr_a=1e-4, lr_c=1e-3, weight_decay=0, update_local=10, n_updates=5, random_seed=1): """Initialize an Agent object Params ===== state_size (int): Dimension of states action_size (int): Dimension of actions n_agents (int): Number of agents buffer_size (int): size of replay buffer batch_size (int): size of sample gamma (float): discount factor tau (float): (soft) update of target parameters lr_a (float): learning rate of actor lr_c (float): learning rate of critic weight_decay (float): L2 weight decay update_local (int): update local network every x steps n_updates (int): number of updates seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.n_agents = n_agents # Hyperparameters self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr_a = lr_a self.lr_c = lr_c self.weight_decay = weight_decay self.update_local = update_local self.n_updates = n_updates # Actor networks self.actor_local = \ Actor(state_size, action_size, seed=random_seed).to(device) self.actor_target = \ Actor(state_size, action_size, seed=random_seed).to(device) self.actor_optimizer = \ optim.Adam(self.actor_local.parameters(), lr=lr_a) # Critic networks self.critic_local = \ Critic(state_size, action_size, seed=random_seed).to(device) self.critic_target = \ Critic(state_size, action_size, seed=random_seed).to(device) self.critic_optimizer = \ optim.Adam(self.critic_local.parameters(), lr=lr_c, weight_decay=weight_decay) # Replay buffer self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed) # Noise process self.noise = OUNoise(action_size, random_seed) # Time step self.t_step = 0 def step(self, state, action, reward, next_state, done, learn=True): # Save experience in replay buffer #for state, action, reward, next_state, done in zip(state, action, reward, next_state, done): self.memory.add(state, action, reward, next_state, done) self.t_step += 1 # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size and learn: if self.t_step % self.update_local == 0: for _ in range(self.n_updates): sample = self.memory.sample() self.__learn(sample, self.gamma) def act(self, state, add_noise=True): """Returns action given a state according to current policy Params ====== state (array_like): current state add_noise (bool): handles exploration """ state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def __learn(self, sample, gamma): """ Params ====== sample (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = sample #----------------- Critic # Next actions and actions values actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local Critic network Q_expected = self.critic_local(states, actions) # Compute loss critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize loss self.critic_optimizer.zero_grad() critic_loss.backward() 
torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() #----------------- Actor # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() #----------------- update target networks self.__soft_update(self.critic_local, self.critic_target, self.tau) self.__soft_update(self.actor_local, self.actor_target, self.tau) def __soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param \ in zip(target_model.parameters(), local_model.parameters()): target_param.data.\ copy_(tau*local_param.data + (1.0 - tau)*target_param.data)
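# ReplayBuffer is used by the agents above but defined elsewhere; this is the
# conventional uniform-sampling buffer matching the add()/sample()/len()
# interface they rely on: a fixed-size deque of named tuples, sampled into torch
# tensors, with dones cast to float so (1 - dones) works in the TD target.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)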
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.eps = EPS_START self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM ) # set decay rate based on epsilon end target self.timestep = 0 # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise((num_agents, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, agent_number): """Save experience in replay memory, and use random sample from buffer to learn.""" self.timestep += 1 # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory and at learning interval settings if len(self.memory) > BATCH_SIZE and self.timestep % LEARN_EVERY == 0: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_number) def act(self, states, add_noise): """Returns actions for both agents as per current policy, given their respective states.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): # get action for each agent and concatenate them for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() # add noise to actions if add_noise: actions += self.eps * self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_number): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) # Construct next actions vector relative to the agent if agent_number == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) else: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) # Compute Q targets for current states (y_i) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) # Construct action prediction vector relative to each agent if agent_number == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) else: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) # Compute actor loss actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # update noise decay parameter self.eps -= self.eps_decay self.eps = max(self.eps, EPS_FINAL) self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
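# A small illustration of the action-vector surgery done in learn() above: the
# replay buffer stores both agents' actions side by side (columns 0:2 for agent
# 0, 2:4 for agent 1), and each agent substitutes only its own slice with the
# target-actor prediction before querying the critic. The shapes here are
# assumptions implied by that slicing (two agents, two action dimensions each).
import torch

batch_actions = torch.zeros(5, 4)          # stored joint actions for a batch of 5
actions_next = torch.ones(5, 2)            # this agent's target-actor output

agent_0_input = torch.cat((actions_next, batch_actions[:, 2:]), dim=1)
agent_1_input = torch.cat((batch_actions[:, :2], actions_next), dim=1)
print(agent_0_input.shape, agent_1_input.shape)   # both torch.Size([5, 4])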
def __init__(self, state_size, action_size, n_agents=1, buffer_size=int(1e7), batch_size=256, gamma=.99, tau=1e-3, lr_a=1e-4, lr_c=1e-3, weight_decay=0, update_local=10, n_updates=5, random_seed=1): """Initialize an Agent object Params ===== state_size (int): Dimension of states action_size (int): Dimension of actions n_agents (int): Number of agents buffer_size (int): size of replay buffer batch_size (int): size of sample gamma (float): discount factor tau (float): (soft) update of target parameters lr_a (float): learning rate of actor lr_c (float): learning rate of critic weight_decay (float): L2 weight decay update_local (int): update local network every x steps n_updates (int): number of updates seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.n_agents = n_agents # Hyperparameters self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr_a = lr_a self.lr_c = lr_c self.weight_decay = weight_decay self.update_local = update_local self.n_updates = n_updates # Actor networks self.actor_local = \ Actor(state_size, action_size, seed=random_seed).to(device) self.actor_target = \ Actor(state_size, action_size, seed=random_seed).to(device) self.actor_optimizer = \ optim.Adam(self.actor_local.parameters(), lr=lr_a) # Critic networks self.critic_local = \ Critic(state_size, action_size, seed=random_seed).to(device) self.critic_target = \ Critic(state_size, action_size, seed=random_seed).to(device) self.critic_optimizer = \ optim.Adam(self.critic_local.parameters(), lr=lr_c, weight_decay=weight_decay) # Replay buffer self.memory = ReplayBuffer(action_size, buffer_size, batch_size, random_seed) # Noise process self.noise = OUNoise(action_size, random_seed) # Time step self.t_step = 0
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights(self.critic_local.model.get_weights()) self.actor_target.model.set_weights(self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 #0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor - 0.99 self.tau = 0.01 # for soft update of target parameters - 0.01 # Score tracker and learning parameters self.best_w = None self.best_score = -np.inf self.score = -np.inf def reset_episode(self): self.total_reward = 0.0 self.count = 0 self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.total_reward += reward self.count += 1 self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" self.score = self.total_reward / float(self.count) if self.count else 0.0 if self.score > self.best_score: self.best_score = self.score # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
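# A minimal episode loop for the Keras-based DDPG agent above, using the
# reset_episode()/act()/step() interface it exposes. `task` is whatever
# environment object was passed to the constructor; its step() return signature
# (next_state, reward, done) and the episode count are assumptions.
num_episodes = 500
agent = DDPG(task)
for episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
        if done:
            break
    print("Episode {:4d}  score: {:7.3f}  best: {:7.3f}".format(
        episode, agent.score, agent.best_score), end="\r")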