class DDPG:
    """DDPG agent for the steering action.

    Inputs: action dimension, state dimension, radar dimension and the type
    of learning (Straight/Left/Right).
    """

    def __init__(self, action_space, state_space, radar_dim, type):
        self.action_space = action_space
        self.state_space = state_space
        self.radar_space = radar_dim
        self.lower_bound = -1  # Steer limits
        self.upper_bound = 1
        self.epsilon = 0.8
        self.gamma = 0.99
        self.batch_size = 64
        self.epsilon_min = 0.2
        self.epsilon_decay = 0.999
        self.critic_lr = 0.004
        self.actor_lr = 0.004

        # Custom TensorBoard object
        now = time.localtime()
        self.tensorboard = ModifiedTensorBoard(
            log_dir=f"logs/{MODEL_NAME}-Feb_{now.tm_mday}_{now.tm_min}_{now.tm_hour}"
            f"_{self.radar_space}_{self.actor_lr}_{self.batch_size}")
        self.type = type

        # Networks.
        # The actor and critic share some preprocessing weights; those are
        # re-matched after every update (see replay()).
        self.actor = FeedForwardNN(self.radar_space, self.state_space,
                                   self.action_space, "actor")
        self.critic = FeedForwardNN(self.radar_space, self.state_space, 1,
                                    "critic")

        # Target networks: these are what we evaluate against every step.
        # They must be independent copies of the online networks (plain
        # assignment would alias them and make the soft update a no-op).
        self.target_update_counter = 0
        self.target_actor = FeedForwardNN(self.radar_space, self.state_space,
                                          self.action_space, "actor")
        self.target_critic = FeedForwardNN(self.radar_space, self.state_space,
                                           1, "critic")
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # Optimizers are created once so Adam's moment estimates persist
        # across updates instead of being reset on every replay() call.
        self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=self.critic_lr)

        # Replay memory: one np.array per element of the (s, a, r, s') tuple.
        self.buffer_capacity = 50_000
        self.buffer_counter = 0
        self.state_buffer = np.zeros((self.buffer_capacity, self.state_space))
        self.action_buffer = np.zeros((self.buffer_capacity, self.action_space))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, self.state_space))
        self.radar_buffer = np.zeros((self.buffer_capacity, self.radar_space))
        self.next_radar_buffer = np.zeros((self.buffer_capacity, self.radar_space))

        self.t_so_far = 0
        self.writer = SummaryWriter(
            log_dir=f"runs/Feb_{now.tm_mday}_{now.tm_min}_{now.tm_hour}"
            f"_{self.radar_space}_{self.actor_lr}_{self.batch_size}")

    # Stores one (s, a, r, s') observation tuple.
    def remember(self, radar_state, radar_state_next, state, action, reward,
                 next_state, done=None):
        # Wrap the index around once buffer_capacity is exceeded, replacing
        # the oldest records first.
        index = self.buffer_counter % self.buffer_capacity

        self.radar_buffer[index] = radar_state
        self.next_radar_buffer[index] = radar_state_next
        self.state_buffer[index] = state
        self.action_buffer[index] = action
        self.reward_buffer[index] = reward
        self.next_state_buffer[index] = next_state

        self.buffer_counter += 1

    # policy() returns an action sampled from the actor network, or a random
    # action for exploration (epsilon-greedy).
    def policy(self, radar_state, physical_state):
        # np.squeeze() at the end removes dimensions of size one from the
        # returned action.
        if np.random.rand() <= self.epsilon:
            # Explore: sample uniformly and rescale from [0, 1) to
            # [lower_bound, upper_bound).
            sampled_actions = torch.rand(1)
            sampled_actions = self.lower_bound + sampled_actions * (
                self.upper_bound - self.lower_bound)
            sampled_actions = sampled_actions.numpy()
        else:
            # Exploit: query the actor network.
            sampled_actions = self.actor(radar_state, physical_state, None)
            sampled_actions = sampled_actions.detach().numpy()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        # Make sure the action is within bounds: clip (limit) the values
        # between the lower and upper bound.
        legal_action = np.clip(sampled_actions, self.lower_bound,
                               self.upper_bound)

        return np.squeeze(legal_action)

    # Computes the losses and updates the parameters (learning step).
    def replay(self):
        # Sampling range: only as many records as have actually been stored.
        record_range = min(self.buffer_counter, self.buffer_capacity)
        # Randomly sample a batch of indices.
        batch_indices = np.random.choice(record_range, self.batch_size)

        # Convert the sampled transitions to tensors.
        state_batch = torch.tensor(self.state_buffer[batch_indices],
                                   dtype=torch.float)
        action_batch = torch.tensor(self.action_buffer[batch_indices],
                                    dtype=torch.float)
        reward_batch = torch.tensor(self.reward_buffer[batch_indices],
                                    dtype=torch.float)
        next_state_batch = torch.tensor(self.next_state_buffer[batch_indices],
                                        dtype=torch.float)
        radar_batch = torch.tensor(self.radar_buffer[batch_indices],
                                   dtype=torch.float)
        next_radar_batch = torch.tensor(self.next_radar_buffer[batch_indices],
                                        dtype=torch.float)

        # The actor and critic share some preprocessing layers, so their
        # weights are matched here by blending them towards each other.
        # TODO: factor this matching step out into its own function.
        tau = 0.01
        new_dict = dict(self.critic.named_parameters())
        for name, param in self.actor.named_parameters():
            if 'layer' in name:
                new_dict[name] = tau * param.data + (1 - tau) * new_dict[name]
        self.critic.load_state_dict(new_dict)

        new_dict = dict(self.actor.named_parameters())
        for name, param in self.critic.named_parameters():
            if 'layer' in name:
                new_dict[name] = tau * param.data + (1 - tau) * new_dict[name]
        self.actor.load_state_dict(new_dict)

        # Critic update: the TD target is built from the target networks and
        # detached so no gradients flow into them.
        with torch.no_grad():
            target_actions = self.target_actor(next_radar_batch,
                                               next_state_batch, None)
            y = reward_batch + self.gamma * self.target_critic(
                next_radar_batch, next_state_batch, target_actions)

        critic_value = self.critic(radar_batch, state_batch, action_batch)
        critic_loss = torch.mean((y - critic_value) ** 2)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor update: evaluate the critic on the actor's own actions and use
        # `-value` as the loss, since we want to maximise the value given by
        # the critic for our actions.
        actions = self.actor(radar_batch, state_batch, None)
        critic_value = self.critic(radar_batch, state_batch, actions)
        actor_loss = -torch.mean(critic_value)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.detach().numpy(), critic_loss.detach().numpy()

    # Updates the target parameters slowly, based on rate `tau`, which is much
    # smaller than one (on the order of 0.001). Also logs weight histograms.
    def update_target(self, tau, val):
        if tau < 1:
            new_dict = dict(self.target_critic.named_parameters())
            for name, param in self.critic.named_parameters():
                new_dict[name].data = (param.data * tau +
                                       new_dict[name].data * (1 - tau))
            self.target_critic.load_state_dict(new_dict)

            new_dict = dict(self.target_actor.named_parameters())
            for name, param in self.actor.named_parameters():
                new_dict[name].data = (param.data * tau +
                                       new_dict[name].data * (1 - tau))
            self.target_actor.load_state_dict(new_dict)
        else:
            # tau >= 1: hard update, copy the online weights verbatim.
            self.target_critic.load_state_dict(self.critic.state_dict())
            self.target_actor.load_state_dict(self.actor.state_dict())

        # Log the weight histograms of the actor/critic networks whenever
        # `val` is a multiple of 10.
        if val % 10 == 0:
            for name, param in self.actor.named_parameters():
                if 'weight' in name:
                    self.writer.add_histogram("actor" + name,
                                              param.detach().numpy(),
                                              self.t_so_far)
            for name, param in self.critic.named_parameters():
                if 'weight' in name:
                    self.writer.add_histogram("critic" + name,
                                              param.detach().numpy(),
                                              self.t_so_far)
            self.t_so_far += 1

    def save_model(self):
        # Serialize the weights to disk as .pth files.
        print("---Saved model weights to disk---")
        torch.save(self.actor.state_dict(),
                   str(self.type) + "_DDPGactor.pth")
        torch.save(self.critic.state_dict(),
                   str(self.type) + "_DDPGcritic.pth")
        torch.save(self.target_actor.state_dict(),
                   str(self.type) + "_target_actor.pth")
        torch.save(self.target_critic.state_dict(),
                   str(self.type) + "_target_critic.pth")
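
# ----------------------------------------------------------------------------
# Usage sketch (not part of the class): a minimal outer loop that ties
# policy() -> remember() -> replay() -> update_target() together for the
# steering agent above. _DummyEnv, the dimensions, the episode count and the
# tensor conversion of the observations are assumptions for illustration only;
# the real simulator environment is expected to expose equivalent reset() and
# step() calls that return the radar readings and the physical state vector.
# ----------------------------------------------------------------------------
class _DummyEnv:
    """Hypothetical stand-in environment used only for this sketch."""

    def __init__(self, radar_dim=9, state_dim=4):
        self.radar_dim, self.state_dim = radar_dim, state_dim

    def reset(self):
        return np.zeros(self.radar_dim), np.zeros(self.state_dim)

    def step(self, action):
        next_radar = np.random.rand(self.radar_dim)
        next_state = np.random.rand(self.state_dim)
        reward = -abs(float(np.squeeze(action)))  # placeholder reward
        done = np.random.rand() < 0.05            # placeholder termination
        return (next_radar, next_state), reward, done


def _train_steering_agent_demo(episodes=10, tau=0.005):
    """Illustrative training loop for the steering DDPG agent."""
    env = _DummyEnv()
    agent = DDPG(action_space=1, state_space=4, radar_dim=9, type="Straight")

    for episode in range(episodes):
        radar, state = env.reset()
        done = False
        while not done:
            action = agent.policy(
                torch.tensor(radar, dtype=torch.float),
                torch.tensor(state, dtype=torch.float))
            (next_radar, next_state), reward, done = env.step(action)
            agent.remember(radar, next_radar, state, action, reward,
                           next_state, done)
            radar, state = next_radar, next_state

            # Learn once the buffer holds at least one batch, then let the
            # target networks track the online networks.
            if agent.buffer_counter > agent.batch_size:
                agent.replay()
                agent.update_target(tau, episode)

    agent.save_model()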
class DDPG:
    """DDPG agent for the throttle action.

    Inputs: action dimension, state dimension, radar dimension and the type
    of learning (Straight/Left/Right).
    """

    def __init__(self, action_space, state_space, radar_dim, type):
        self.action_space = action_space
        self.state_space = state_space
        self.radar_space = radar_dim
        self.lower_bound = 0.0  # Throttle limits
        self.upper_bound = 1.0
        self.epsilon = 0.8
        self.gamma = 0.99
        self.batch_size = 64
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.997
        self.critic_lr = 0.004
        self.actor_lr = 0.004

        # Custom TensorBoard object
        self.tensorboard = ModifiedTensorBoard(
            log_dir="logs/{}-{}".format(MODEL_NAME, int(time.time())))
        self.type = type

        # Networks.
        # The actor and critic share some preprocessing weights; those are
        # re-matched after every update (see replay()).
        self.actor = FeedForwardNN(self.radar_space, self.state_space,
                                   self.action_space, "actor")
        self.critic = FeedForwardNN(self.radar_space, self.state_space, 1,
                                    "critic")

        # Target networks: these are what we evaluate against every step.
        # They must be independent copies of the online networks (plain
        # assignment would alias them and make the soft update a no-op).
        self.target_update_counter = 0
        self.target_actor = FeedForwardNN(self.radar_space, self.state_space,
                                          self.action_space, "actor")
        self.target_critic = FeedForwardNN(self.radar_space, self.state_space,
                                           1, "critic")
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        # Optimizers are created once so Adam's moment estimates persist
        # across updates instead of being reset on every replay() call.
        self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
        self.critic_optimizer = Adam(self.critic.parameters(),
                                     lr=self.critic_lr)

        # Replay memory: one np.array per element of the (s, a, r, s') tuple.
        self.buffer_capacity = 50_000
        self.buffer_counter = 0
        self.state_buffer = np.zeros((self.buffer_capacity, self.state_space))
        self.action_buffer = np.zeros((self.buffer_capacity, self.action_space))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, self.state_space))
        self.radar_buffer = np.zeros((self.buffer_capacity, self.radar_space))
        self.next_radar_buffer = np.zeros((self.buffer_capacity, self.radar_space))

        self.t_so_far = 0
        self.writer = SummaryWriter()

    # Stores one (s, a, r, s') observation tuple.
    def remember(self, radar_state, radar_state_next, state, action, reward,
                 next_state, done=None):
        # Wrap the index around once buffer_capacity is exceeded, replacing
        # the oldest records first.
        index = self.buffer_counter % self.buffer_capacity

        self.radar_buffer[index] = radar_state
        self.next_radar_buffer[index] = radar_state_next
        self.state_buffer[index] = state
        self.action_buffer[index] = action
        self.reward_buffer[index] = reward
        self.next_state_buffer[index] = next_state

        self.buffer_counter += 1

    # policy() returns an action sampled from the actor network, or a random
    # action for exploration (epsilon-greedy).
    def policy(self, radar_state, physical_state):
        # np.squeeze() at the end removes dimensions of size one from the
        # returned action.
        if np.random.rand() <= self.epsilon:
            # Explore: torch.rand already samples uniformly from [0, 1),
            # which matches the throttle bounds.
            sampled_actions = torch.rand(1)
        else:
            # Exploit: query the actor network.
            sampled_actions = self.actor(radar_state, physical_state, None)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        # Make sure the action is within bounds: clip (limit) the values
        # between the lower and upper bound.
        legal_action = np.clip(sampled_actions.detach().numpy(),
                               self.lower_bound, self.upper_bound)

        return np.squeeze(legal_action)

    # Computes the losses and updates the parameters (learning step).
    def replay(self):
        # Sampling range: only as many records as have actually been stored.
        record_range = min(self.buffer_counter, self.buffer_capacity)
        # Randomly sample a batch of indices.
        batch_indices = np.random.choice(record_range, self.batch_size)

        # Convert the sampled transitions to tensors.
        state_batch = torch.tensor(self.state_buffer[batch_indices],
                                   dtype=torch.float)
        action_batch = torch.tensor(self.action_buffer[batch_indices],
                                    dtype=torch.float)
        reward_batch = torch.tensor(self.reward_buffer[batch_indices],
                                    dtype=torch.float)
        next_state_batch = torch.tensor(self.next_state_buffer[batch_indices],
                                        dtype=torch.float)
        radar_batch = torch.tensor(self.radar_buffer[batch_indices],
                                   dtype=torch.float)
        next_radar_batch = torch.tensor(self.next_radar_buffer[batch_indices],
                                        dtype=torch.float)

        '''
        # Reference: TensorFlow (GradientTape) version of the same updates.
        # GradientTape tracks the automatic differentiation in a TF model.
        with tf.GradientTape() as tape:
            target_actions = self.target_actor([next_state_batch, next_radar_batch])
            y = reward_batch + self.gamma * self.target_critic(
                [next_state_batch, next_radar_batch, target_actions])
            critic_value = self.critic([state_batch, radar_batch, action_batch])
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            actions = self.actor([state_batch, radar_batch])
            critic_value = self.critic([state_batch, radar_batch, actions])
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions.
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor.trainable_variables))
        '''

        # Critic update: the TD target is built from the target networks and
        # detached so no gradients flow into them.
        with torch.no_grad():
            target_actions = self.target_actor(next_radar_batch,
                                               next_state_batch, None)
            y = reward_batch + self.gamma * self.target_critic(
                next_radar_batch, next_state_batch, target_actions)

        critic_value = self.critic(radar_batch, state_batch, action_batch)
        critic_loss = torch.mean((y - critic_value) ** 2)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor update: evaluate the critic on the actor's own actions and use
        # `-value` as the loss, since we want to maximise the value given by
        # the critic for our actions.
        actions = self.actor(radar_batch, state_batch, None)
        critic_value = self.critic(radar_batch, state_batch, actions)
        actor_loss = -torch.mean(critic_value)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # TODO: one step is still missing here: the shared preprocessing
        # layers of the actor and critic have to be matched (create a
        # function for it and call it around the updates above as well).
        return actor_loss.detach().numpy(), critic_loss.detach().numpy()

    # Updates the target parameters slowly, based on rate `tau`, which is much
    # smaller than one (on the order of 0.001). Also logs weight histograms.
    def update_target(self, tau, val):
        if tau < 1:
            new_dict = dict(self.target_critic.named_parameters())
            for name, param in self.critic.named_parameters():
                new_dict[name].data = (param.data * tau +
                                       new_dict[name].data * (1 - tau))
            self.target_critic.load_state_dict(new_dict)

            new_dict = dict(self.target_actor.named_parameters())
            for name, param in self.actor.named_parameters():
                new_dict[name].data = (param.data * tau +
                                       new_dict[name].data * (1 - tau))
            self.target_actor.load_state_dict(new_dict)
        else:
            # tau >= 1: hard update, copy the online weights verbatim.
            self.target_critic.load_state_dict(self.critic.state_dict())
            self.target_actor.load_state_dict(self.actor.state_dict())

        # Log the weight histograms of the actor/critic networks whenever
        # `val` is a multiple of 25.
        if val % 25 == 0:
            for name, param in self.actor.named_parameters():
                if 'weight' in name:
                    self.writer.add_histogram("actor" + name,
                                              param.detach().numpy(),
                                              self.t_so_far)
            for name, param in self.critic.named_parameters():
                if 'weight' in name:
                    self.writer.add_histogram("critic" + name,
                                              param.detach().numpy(),
                                              self.t_so_far)
            self.t_so_far += 1

    def save_model(self):
        # Serialize the weights to disk as .pth files.
        print("---Saved model weights to disk---")
        torch.save(self.actor.state_dict(),
                   str(self.type) + "_DDPGactor.pth")
        torch.save(self.critic.state_dict(),
                   str(self.type) + "_DDPGcritic.pth")
        torch.save(self.target_actor.state_dict(),
                   str(self.type) + "_target_actor.pth")
        torch.save(self.target_critic.state_dict(),
                   str(self.type) + "_target_critic.pth")
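
# ----------------------------------------------------------------------------
# Reference sketch (not part of the class): update_target(tau, val) with
# tau < 1 performs the standard DDPG soft ("Polyak") update,
#     theta_target <- tau * theta + (1 - tau) * theta_target.
# The standalone helper below shows the same rule on arbitrary torch modules;
# the function names and the toy Linear layers are illustrative only.
# ----------------------------------------------------------------------------
def _soft_update_reference(target_net, online_net, tau):
    """Blend the online parameters into the target parameters in place."""
    with torch.no_grad():
        for t_param, o_param in zip(target_net.parameters(),
                                    online_net.parameters()):
            t_param.data.mul_(1.0 - tau).add_(tau * o_param.data)


def _soft_update_demo():
    """Tiny self-check of the soft update on two identical linear layers."""
    online = torch.nn.Linear(4, 1)
    target = torch.nn.Linear(4, 1)
    target.load_state_dict(online.state_dict())
    _soft_update_reference(target, online, tau=0.005)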