class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed=1,
                 learn_interval=4, learn_num=1, lr_actor=1e-4, lr_critic=1e-3,
                 gamma=0.99, weight_decay=0, tau=0.001, batch_size=128,
                 buffer_size=int(1e5)):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.STATE_SIZE = state_size
        self.ACTION_SIZE = action_size
        self.NUM_AGENTS = num_agents
        self.seed = random_seed
        random.seed(random_seed)

        # Hyperparameters (held constant after construction):
        self.LEARN_INTERVAL = learn_interval
        self.LEARN_NUM = learn_num
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.GAMMA = gamma
        self.WEIGHT_DECAY = weight_decay
        self.TAU = tau
        self.BATCH_SIZE = batch_size
        self.BUFFER_SIZE = buffer_size

        # Actor network with target network
        self.actor_local = Actor(self.STATE_SIZE, self.ACTION_SIZE, random_seed).to(device)
        self.actor_target = Actor(self.STATE_SIZE, self.ACTION_SIZE, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.LR_ACTOR)

        # Critic network with target network
        self.critic_local = Critic(self.STATE_SIZE, self.ACTION_SIZE, random_seed).to(device)
        self.critic_target = Critic(self.STATE_SIZE, self.ACTION_SIZE, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.LR_CRITIC,
                                           weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise((self.NUM_AGENTS, self.ACTION_SIZE), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(self.ACTION_SIZE, self.BUFFER_SIZE, self.BATCH_SIZE, random_seed)

    def reset(self):
        self.noise.reset()

    def step(self, step, states, actions, rewards, next_states, dones):
        """Save experience in replay memory and use random samples from the buffer to learn."""
        for n in range(self.NUM_AGENTS):
            self.memory.add(states[n, :], actions[n, :], rewards[n], next_states[n, :], dones[n])

        # Learn every LEARN_INTERVAL steps
        if step % self.LEARN_INTERVAL == 0:
            # Learn only once enough samples are in the buffer
            if len(self.memory) > self.BATCH_SIZE:
                # LEARN_NUM controls how many update passes run per learning step
                # (independent of the batch size)
                for _ in range(self.LEARN_NUM):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.GAMMA)

    def action(self, states, add_noise=True):
        """Returns actions for the given states as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.NUM_AGENTS, self.ACTION_SIZE))

        # Switch the actor to evaluation mode for inference
        self.actor_local.eval()
        with torch.no_grad():
            # For each agent's state, predict the next action
            for n, state in enumerate(states):
                actions[n, :] = self.actor_local(state).cpu().data.numpy()
        # Switch back to training mode
        self.actor_local.train()

        # Add exploration noise
        if add_noise:
            actions += self.noise.sample()

        # Clip the actions to the valid range
        return np.clip(actions, -1, 1)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ----------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ------------------------ #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    # Save and load checkpoints
    def save_agent(self, checkpoint_name):
        torch.save(self.actor_local.state_dict(), f'./checkpoints/{checkpoint_name}_actor.pth')
        torch.save(self.critic_local.state_dict(), f'./checkpoints/{checkpoint_name}_critic.pth')

    def load_agent(self, checkpoint_name):
        self.actor_local.load_state_dict(
            torch.load(f'./checkpoints/{checkpoint_name}_actor.pth'))
        self.critic_local.load_state_dict(
            torch.load(f'./checkpoints/{checkpoint_name}_critic.pth'))
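
# A minimal training-loop sketch for the PyTorch Agent above. The environment
# interface used here (env.reset() returning a (num_agents, state_size) array and
# env.step(actions) returning next_states, rewards, dones) is an assumption for
# illustration only; it is not defined by the code above.
def run_episode(agent, env, max_steps=1000):
    states = env.reset()                       # assumed: shape (num_agents, state_size)
    agent.reset()                              # reset the OU noise process
    scores = np.zeros(agent.NUM_AGENTS)
    for t in range(max_steps):
        actions = agent.action(states)         # clipped actions in [-1, 1]
        next_states, rewards, dones = env.step(actions)   # assumed env API
        agent.step(t, states, actions, rewards, next_states, dones)
        scores += rewards
        states = next_states
        if np.any(dones):
            break
    return scores.mean()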
class AgentDDPG():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Noise process
        self.mu = 0
        self.theta = 0.15
        self.sigmaStart = 0.5
        self.sigmaEnd = 0.1
        self.decayExponent = 0.01
        self.noise = OUNoise(self.action_size, self.mu, self.theta,
                             self.sigmaStart, self.sigmaEnd, self.decayExponent)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99   # discount factor
        self.tau = 0.0001   # for soft update of target parameters
        self.learningRateActor = 0.00005
        self.learningRateCritic = 0.0005
        self.dropoutActor = 0.1
        self.dropoutCritic = 0.1

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 learningRate=self.learningRateActor,
                                 dropoutRate=self.dropoutActor)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  learningRate=self.learningRateActor,
                                  dropoutRate=self.dropoutActor)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   learningRate=self.learningRateCritic,
                                   dropoutRate=self.dropoutCritic, l2Lambda=1e-2)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    learningRate=self.learningRateCritic,
                                    dropoutRate=self.dropoutCritic, l2Lambda=1e-2)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        self.rewardSum = 0

    def reset_episode(self):
        self.rewardSum = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.rewardSum += reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        noise = self.noise.sample()
        # Add some noise for exploration
        return list(action + noise), noise

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local) using the action-value gradients from the critic
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])   # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
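
# A minimal training-loop sketch for AgentDDPG above. The task interface assumed
# here (task.step(action) returning next_state, reward, done) mirrors the
# task.reset() call already used by reset_episode(); the step() signature is an
# assumption for illustration.
def train_agent(agent, num_episodes=500):
    for episode in range(1, num_episodes + 1):
        state = agent.reset_episode()           # also resets the OU noise process
        done = False
        while not done:
            action, noise = agent.act(state)    # noisy action for exploration
            next_state, reward, done = agent.task.step(action)   # assumed task API
            agent.step(action, reward, next_state, done)
            state = next_state
        print('Episode {:4d} | total reward: {:8.3f}'.format(episode, agent.rewardSum))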
class DDPGController(object):
    """DDPG controller: actor/critic networks with targets, replay buffer, and OU exploration noise."""

    def __init__(self, env):
        self.name = 'DDPG'   # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # together with their target networks
        self.state_dim = env.state_dim
        self.action_dim = env.action_dim

        self.sess = tf.InteractiveSession(config=tf.ConfigProto(
            log_device_placement=True))

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize a random process (Ornstein-Uhlenbeck) for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.model_saver = tf.train.Saver()

    def train(self):
        # print("train step", self.time_step)
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.get_batch(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # For action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy plus exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        return self.actor_network.action(state)

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training once the buffer holds at least REPLAY_START_SIZE transitions
        if self.replay_buffer.count() > REPLAY_START_SIZE:
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

    def initial_train(self, mini_batch):
        state_batch = np.asarray([data[0] for data in mini_batch])
        action_batch = np.asarray([data[1] for data in mini_batch])
        action_label_batch = np.asarray([data[2] for data in mini_batch])
        value_label_batch = np.asarray([data[3] for data in mini_batch])
        done_batch = np.asarray([data[4] for data in mini_batch])

        # For action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])
        action_label_batch = np.resize(action_label_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch
        y_batch = []
        for i in range(len(mini_batch)):
            y_batch.append(value_label_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update critic by minimizing the loss L
        critic_cost = self.critic_network.train(y_batch, state_batch, action_label_batch)

        # Update the actor policy using the sampled gradient:
        # action_batch_for_gradients = self.actor_network.actions(state_batch)
        # q_gradient_batch = self.critic_network.gradients(state_batch, action_batch_for_gradients)
        # self.actor_network.train(q_gradient_batch, state_batch)
        action_cost = self.actor_network.initial_train(
            action_label_batch=action_label_batch, state_batch=state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

        return critic_cost, action_cost

    def save_model(self, path, check_point):
        self.model_saver.save(self.sess, path + 'DDPGControllerModel.ckpt',
                              global_step=check_point)
        print("Model saved at " + path + 'DDPGControllerModel.ckpt')

    def load_model(self, path):
        self.model_saver.restore(self.sess, path)
        print("Model loaded from " + path)
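
# A minimal interaction-loop sketch for DDPGController above. Besides the
# state_dim/action_dim attributes the constructor already relies on, it assumes a
# gym-style environment (env.reset() -> state, env.step(action) ->
# (next_state, reward, done, info)); that reset/step API is an assumption for
# illustration, not part of the original code.
def run_controller(controller, env, num_episodes=1000, max_steps=200):
    for episode in range(num_episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = controller.noise_action(state)            # exploratory action
            next_state, reward, done, _ = env.step(action)      # assumed env API
            # Store the transition; training starts once the buffer is warm
            controller.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break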