def step(self, states, actions, rewards, next_states, dones, running_timestep):
    # Store each experience tuple in the replay buffer
    for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
        self.memory.add(state, action, reward, next_state, done)

    # Once the memory holds at least a batch worth of samples, learn every
    # LEARNING_FREQUENCY timesteps: update the weights of the local networks,
    # then hard- or soft-update the weights of the target networks.
    if (running_timestep % self.LEARNING_FREQUENCY) == 0:
        if len(self.memory) > self.BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, self.GAMMA, running_timestep)

    if running_timestep > self.DATA_TO_BUFFER_BEFORE_LEARNING:
        if self.IS_HARD_UPDATE:
            if (running_timestep % self.HARD_UPDATE_FREQUENCY) == 0:
                utils.hard_update(self.actor_local, self.actor_target)
        elif self.IS_SOFT_UPDATE:
            if (running_timestep % self.SOFT_UPDATE_FREQUENCY) == 0:
                utils.soft_update(self.critic_local, self.critic_target, self.TAU)
                utils.soft_update(self.actor_local, self.actor_target, self.TAU)
        else:
            raise ValueError('Only one of HARD_UPDATE and SOFT_UPDATE should be active')
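# The step() above relies on utils.hard_update and utils.soft_update, which are not
# defined in this section. Below is a minimal sketch of what those helpers are assumed
# to do: a direct parameter copy and a Polyak (TAU-weighted) average. The (source,
# target) argument order mirrors the calls above and is an assumption, not the repo's
# documented API.
import torch


def hard_update_sketch(source: torch.nn.Module, target: torch.nn.Module) -> None:
    """Copy every parameter of ``source`` into ``target``."""
    for src_param, tgt_param in zip(source.parameters(), target.parameters()):
        tgt_param.data.copy_(src_param.data)


def soft_update_sketch(source: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    """Blend target towards source: target = tau * source + (1 - tau) * target."""
    for src_param, tgt_param in zip(source.parameters(), target.parameters()):
        tgt_param.data.copy_(tau * src_param.data + (1.0 - tau) * tgt_param.data)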
def update(self, running_time_step):
    # Synchronise the target networks with the local networks, either with a
    # periodic hard copy or with a Polyak (soft) update.
    if self.IS_HARD_UPDATE:
        if (running_time_step % self.HARD_UPDATE_FREQUENCY) == 0:
            utils.hard_update(self.actor_local, self.actor_target)
    elif self.IS_SOFT_UPDATE:
        if (running_time_step % self.SOFT_UPDATE_FREQUENCY) == 0:
            utils.soft_update(self.critic_local, self.critic_target, self.TAU)
            utils.soft_update(self.actor_local, self.actor_target, self.TAU)
    else:
        raise ValueError('Only one of HARD_UPDATE and SOFT_UPDATE should be active')
def __init__(self, args, agent_id, mode):
    """
    :param args:     Config parameters
    :param agent_id: The agent id to run
    :param mode:     train or test
    """
    self.agent_id = agent_id
    self.mode = mode

    self.SEED = args.SEED
    random.seed(self.SEED)

    self.NOISE = args.NOISE_FN()
    self.STATE_SIZE = args.STATE_SIZE
    self.ACTION_SIZE = args.ACTION_SIZE
    self.TAU = args.TAU
    self.ACTOR_LEARNING_RATE = args.ACTOR_LEARNING_RATE
    self.CRITIC_LEARNING_RATE = args.CRITIC_LEARNING_RATE
    self.WEIGHT_DECAY = args.WEIGHT_DECAY
    self.IS_HARD_UPDATE = args.IS_HARD_UPDATE
    self.IS_SOFT_UPDATE = args.IS_SOFT_UPDATE
    self.SOFT_UPDATE_FREQUENCY = args.SOFT_UPDATE_FREQUENCY
    self.HARD_UPDATE_FREQUENCY = args.HARD_UPDATE_FREQUENCY
    self.NOISE_AMPLITUDE_DECAY = args.NOISE_AMPLITUDE_DECAY_FN()
    self.CHECKPOINT_DIR = args.CHECKPOINT_DIR
    self.SUMMARY_LOGGER = args.SUMMARY_LOGGER

    # Actor network (local and target)
    self.actor_local = model.Actor(self.STATE_SIZE, self.ACTION_SIZE, [256, 256, 2], self.SEED).to(device)
    self.actor_target = model.Actor(self.STATE_SIZE, self.ACTION_SIZE, [256, 256, 2], self.SEED).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.ACTOR_LEARNING_RATE)

    # Critic network (local and target)
    self.critic_local = model.Critic(self.STATE_SIZE, self.ACTION_SIZE, [256, 256, 1], self.SEED).to(device)
    self.critic_target = model.Critic(self.STATE_SIZE, self.ACTION_SIZE, [256, 256, 1], self.SEED).to(device)
    self.critic_optimizer = optim.Adam(
        self.critic_local.parameters(), lr=self.CRITIC_LEARNING_RATE, weight_decay=self.WEIGHT_DECAY
    )

    # Initialise the target networks with the same weights as the local networks
    utils.hard_update(self.actor_local, self.actor_target)
    utils.hard_update(self.critic_local, self.critic_target)

    # Noise process for exploration
    self.noise = args.NOISE_FN()
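# The constructor above wires up actor_local and a noise process, but this section does
# not show how actions are produced. The sketch below is a hypothetical act() helper,
# not the repository's own method: it illustrates the usual DDPG pattern of running
# actor_local without gradients, adding exploration noise scaled by an amplitude, and
# clipping to the valid action range. The amplitude argument stands in for whatever
# NOISE_AMPLITUDE_DECAY yields, and noise.sample() is an assumed interface.
import numpy as np
import torch


def act_sketch(agent, state: np.ndarray, amplitude: float = 1.0) -> np.ndarray:
    # Uses the module-level `device` referenced by the constructor above
    state_t = torch.from_numpy(state).float().unsqueeze(0).to(device)
    agent.actor_local.eval()
    with torch.no_grad():
        action = agent.actor_local(state_t).cpu().data.numpy().squeeze(0)
    agent.actor_local.train()
    # Add temporally correlated exploration noise and keep actions in [-1, 1]
    action += amplitude * agent.noise.sample()
    return np.clip(action, -1.0, 1.0)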
def load_models(self, episode):
    """
    Loads the saved actor and target-actor models for every agent and then copies
    the actor weights onto the corresponding target actor.

    :param episode: the count of episodes iterated (used to build the file name)
    :return:
    """
    for i in range(self.num_agents):
        self.target_actors[i].load_state_dict(
            torch.load('./Models/' + str(episode) + '_target_actor' + str(i) + '.pt', map_location=self.device))
        self.actors[i].load_state_dict(
            torch.load('./Models/' + str(episode) + '_actor' + str(i) + '.pt', map_location=self.device))
        # Critic weights are not reloaded here; only the actors are needed for inference.
        utils.hard_update(self.target_actors[i], self.actors[i])
    print('Models loaded successfully')
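# load_models() above expects checkpoints named '<episode>_actor<i>.pt' and
# '<episode>_target_actor<i>.pt' under './Models/'. The companion save routine is not
# shown in this section; the sketch below is a hypothetical counterpart that writes
# state_dicts under that naming scheme so the loader above can find them.
import os
import torch


def save_models_sketch(agent, episode: int, model_dir: str = './Models') -> None:
    os.makedirs(model_dir, exist_ok=True)
    for i in range(agent.num_agents):
        torch.save(agent.actors[i].state_dict(),
                   os.path.join(model_dir, '{}_actor{}.pt'.format(episode, i)))
        torch.save(agent.target_actors[i].state_dict(),
                   os.path.join(model_dir, '{}_target_actor{}.pt'.format(episode, i)))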
def step(self, states, actions, rewards, next_states, dones, running_timestep):
    # Insert the experience tuple into the replay buffer
    self.memory.add(states, actions, rewards, next_states, dones)

    # Learn every LEARNING_FREQUENCY timesteps, once enough samples are buffered
    if (running_timestep % self.LEARNING_FREQUENCY) == 0:
        if len(self.memory) > self.BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, running_timestep)

    # Synchronise the target network with the local network
    if running_timestep > self.DATA_TO_BUFFER_BEFORE_LEARNING:
        if self.IS_HARD_UPDATE:
            if (running_timestep % self.HARD_UPDATE_FREQUENCY) == 0:
                utils.hard_update(self.local_network, self.target_network)
        elif self.IS_SOFT_UPDATE:
            if (running_timestep % self.SOFT_UPDATE_FREQUENCY) == 0:
                utils.soft_update(self.local_network, self.target_network, self.TAU)
        else:
            raise ValueError('Only one of HARD_UPDATE and SOFT_UPDATE should be active')
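# The step() methods above call self.memory.add(...), len(self.memory) and
# self.memory.sample() without the buffer being defined in this section. Below is a
# minimal sketch of such a uniform replay buffer; the field names are assumptions,
# and sample() here simply returns a list of experiences, whereas the repo's version
# presumably stacks them into batched tensors before handing them to learn().
import random
from collections import deque, namedtuple

Experience = namedtuple('Experience', ['state', 'action', 'reward', 'next_state', 'done'])


class ReplayBufferSketch:
    def __init__(self, buffer_size: int, batch_size: int, seed: int = 0):
        self.buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        # Uniformly sample a batch of stored experiences
        return random.sample(self.buffer, k=self.batch_size)

    def __len__(self):
        return len(self.buffer)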
def __init__(self, gamma, lr_a, lr_c, state_dim_actor, state_dim_critic, num_agents, num_agent_lim,
             action_dim, mem_size, batch_size, agent_name, chkpoint, chkpt_dir, env=None):
    self.state_dim_actor = state_dim_actor
    self.state_dim_critic = state_dim_critic
    self.action_dim = action_dim
    self.action_lim = action_dim
    self.iter = 0
    self.lr_a = lr_a
    self.lr_c = lr_c
    self.tau = 0.05
    self.steps_done = 0
    self.nrand_action = 0
    self.gamma = gamma
    self.num_agent_lim = num_agent_lim
    self.max_n_agents = self.num_agent_lim
    self.learn_step_counter = 0
    self.batch_size = batch_size
    self.chkpt_dir = chkpt_dir
    self.env = env
    self.critic_loss_value = 0
    self.actor_loss_value = 0
    self.chkpoint = chkpoint
    self.num_agents = num_agents
    self.agent_name = agent_name
    self.use_cuda = False

    # Exploration noise process
    self.noise = utils.OrnsteinUhlenbeckActionNoise(self.action_dim)

    # One actor/critic pair (plus targets and optimizers) per agent slot
    self.actors = [Actor(self.state_dim_actor, self.action_dim) for i in range(num_agent_lim)]
    self.critics = [Critic(self.state_dim_critic, self.action_dim, num_agent_lim) for i in range(num_agent_lim)]
    self.target_actors = deepcopy(self.actors)
    self.target_critics = deepcopy(self.critics)
    self.actor_optimizers = [torch.optim.Adam(self.actors[i].parameters(), self.lr_a) for i in range(num_agent_lim)]
    self.critic_optimizers = [torch.optim.Adam(self.critics[i].parameters(), self.lr_c) for i in range(num_agent_lim)]

    # Set up the CUDA environment
    self.device = 'cuda' if self.use_cuda else 'cpu'
    if self.use_cuda:
        for i in range(num_agent_lim):
            self.actors[i].to(self.device)
            self.target_actors[i].to(self.device)
            self.critics[i].to(self.device)
            self.target_critics[i].to(self.device)

    # Initialise the target networks with the same weights as their local counterparts
    for i in range(num_agent_lim):
        utils.hard_update(self.target_actors[i], self.actors[i])
        utils.hard_update(self.target_critics[i], self.critics[i])

    # One replay buffer per agent slot
    self.memories = [ReplayBuffer(mem_size) for i in range(num_agent_lim)]
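# The MADDPG constructor above uses utils.OrnsteinUhlenbeckActionNoise(action_dim) for
# exploration, but the class itself does not appear in this section. Below is a minimal
# sketch of a standard Ornstein-Uhlenbeck process with the commonly used theta/sigma
# defaults; the constructor signature and the sample()/reset() method names are
# assumptions rather than the repository's confirmed API.
import numpy as np


class OrnsteinUhlenbeckActionNoiseSketch:
    def __init__(self, action_dim: int, mu: float = 0.0, theta: float = 0.15, sigma: float = 0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Return the process to its mean value (typically at the start of an episode)
        self.state = np.ones(self.action_dim) * self.mu

    def sample(self) -> np.ndarray:
        # dx = theta * (mu - x) + sigma * N(0, 1): temporally correlated noise
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state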