def __init__(self,
             thread_index,
             global_network,
             initial_learning_rate,
             learning_rate_input,
             grad_applier,
             max_global_time_step,
             device,
             network_scope="network",
             scene_scope="scene",
             task_scope="task"):
    """Build one A3C worker thread.

    Creates a thread-local actor-critic network, the ops that accumulate
    and reset its gradients, the op that applies the accumulated gradients
    to the shared global network, and the op that copies the global
    weights back into the local network.
    """
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.network_scope = network_scope
    self.scene_scope = scene_scope
    self.task_scope = task_scope
    self.scopes = [network_scope, scene_scope, task_scope]

    # Thread-local copy of the actor-critic network with its own loss.
    self.local_network = ActorCriticFFNetwork(
        action_size=ACTION_SIZE,
        device=device,
        network_scope=network_scope,
        scene_scopes=[scene_scope])
    self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

    # Gradient accumulation machinery for the local loss.
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(self.local_network.total_loss,
                                  self.local_network.get_vars())
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    # Pair each accumulated gradient with the global variable whose
    # scope-stripped name matches it; only those globals get updated.
    tracked_names = [
        self._local_var_name(grad)
        for grad in self.trainer.get_accum_grad_list()
    ]
    shared_vars = [
        var for var in global_network.get_vars()
        if self._get_accum_grad_name(var) in tracked_names
    ]
    self.apply_gradients = grad_applier.apply_gradients(
        shared_vars, self.trainer.get_accum_grad_list())

    # Op that pulls the (freshly updated) global weights into this worker.
    self.sync = self.local_network.sync_from(global_network)

    self.env = None
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate

    # Per-episode bookkeeping.
    self.episode_reward = 0
    self.episode_length = 0
    self.episode_max_q = -np.inf
def _set_trainer_optimizer(self, device, global_network, grad_applier):
    """(Re)build the gradient-accumulation trainer for the local network
    and the op that applies the accumulated gradients to the matching
    variables of the shared global network."""
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(self.local_network.total_loss,
                                  self.local_network.get_vars())

    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    # Names (scope stripped) of every variable we accumulate a gradient for.
    tracked_names = [
        self._local_var_name(grad)
        for grad in self.trainer.get_accum_grad_list()
    ]
    # Only global variables with a matching accumulated gradient are updated.
    matched_globals = []
    for var in global_network.get_vars():
        if self._get_accum_grad_name(var) in tracked_names:
            matched_globals.append(var)

    self.apply_gradients = grad_applier.apply_gradients(
        matched_globals, self.trainer.get_accum_grad_list())
def __init__(self,
             thread_index,
             global_network,
             global_discriminator,
             initial_learning_rate,
             learning_rate_input,
             grad_applier,
             grad_applier_discriminator,
             max_global_time_step,
             device,
             device2,
             network_scope="network",
             scene_scope="scene",
             task_scope="task"):
    # Build one training thread: a local actor-critic network with PPO-style
    # old/new parameter copies, gradient-accumulation ops, and ops to exchange
    # parameters with the shared global network.
    # NOTE(review): the WGAN-discriminator wiring below is commented out, so
    # `global_discriminator`, `grad_applier_discriminator` and `device2` are
    # currently unused — confirm before removing them from the signature.
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.network_scope = network_scope
    self.network_scope_D = network_scope + "_d"  # separate scope for the discriminator
    self.scene_scope = scene_scope
    self.task_scope = task_scope
    self.scopes = [network_scope, scene_scope, task_scope]
    self.scopes_d = [self.network_scope_D, task_scope]

    # Thread-local actor-critic network and its loss.
    self.local_network = ActorCriticFFNetwork(
        action_size=ACTION_SIZE,
        device=device,
        network_scope=network_scope,
        scene_scopes=[scene_scope])
    self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(
        self.local_network.total_loss,  # gradients of the loss w.r.t. the local network variables
        self.local_network.get_vars())

    # PPO loss bookkeeping: the network keeps both current and "old" parameter
    # sets; at the start of each iteration the old copy is synced from the
    # current one via `old_new_sync`.
    # NOTE(review): the two lists below are never used afterwards.
    new_variable_list = self.local_network.get_vars()
    old_varaible_list = self.local_network.get_vars_old()
    self.old_new_sync = self.local_network.sync_curre_old()

    self.accum_gradients = self.trainer.accumulate_gradients()  # op that adds current grads to the accumulators
    self.reset_gradients = self.trainer.reset_gradients()  # op that zeroes the accumulators after applying them

    # Names (scope stripped) of every accumulated-gradient variable.
    accum_grad_names = [
        self._local_var_name(x) for x in self.trainer.get_accum_grad_list()
    ]
    # Global / local variables that have a matching accumulated gradient.
    global_net_vars = [
        x for x in global_network.get_vars()
        if self._get_accum_grad_name(x) in accum_grad_names
    ]
    local_net_vars = [
        x for x in self.local_network.get_vars()
        if self._get_accum_grad_name(x) in accum_grad_names
    ]

    self.apply_gradients = grad_applier.apply_gradients(
        global_net_vars, self.trainer.get_accum_grad_list())
    # If training is unstable, it may be preferable to first apply the
    # gradients on the local network, clip, and only then push them out.
    self.apply_gradients_local = grad_applier.apply_gradients_local_net(
        local_net_vars, self.trainer.get_accum_grad_list())

    # Sync op: copy the updated global parameters into the local network.
    self.sync = self.local_network.sync_from(global_network)

    # -------------------------------------------------------------------
    # Discriminator (WGAN) wiring — currently disabled.
    # NOTE(review): reconstructed as fully commented out; `self.local_discriminator`
    # is never created, so any live reference to it would raise at init.
    # -------------------------------------------------------------------
    # self.local_discriminator = Discriminator_WGAN(
    #     action_size=ACTION_SIZE,
    #     device=device,
    #     network_scope=network_scope,
    #     scene_scopes=[scene_scope])
    # self.local_discriminator.prepare_loss_D(ENTROPY_BETA, self.scopes_d)
    #
    # self.trainer_D = AccumTrainer_d(device=device, name="AccumTrainer_d")
    # self.trainer_D.prepare_minimize(self.local_discriminator.total_loss_d,
    #                                 self.local_discriminator.get_vars())
    #
    # self.accum_gradients_d = self.trainer_D.accumulate_gradients()
    # self.reset_gradients_d = self.trainer_D.reset_gradients()
    #
    # accum_grad_names_discrimi = [
    #     self._local_var_name(x) for x in self.trainer_D.get_accum_grad_list()
    # ]
    # global_discri_vars = [
    #     x for x in global_discriminator.get_vars()
    #     if self._get_accum_grad_name(x) in accum_grad_names_discrimi
    # ]
    # local_discri_vars = [
    #     x for x in self.local_discriminator.get_vars()
    #     if self._get_accum_grad_name(x) in accum_grad_names_discrimi
    # ]
    # Apply discriminator gradients to the LOCAL network, then clip the
    # weights directly (WGAN weight clipping).
    # self.apply_gradients_discriminator = grad_applier_discriminator.apply_gradients(
    #     local_discri_vars, self.trainer_D.get_accum_grad_list())
    # self.clip_local_d_weights = self.local_discriminator.clip_weights()
    #
    # self.sync_discriminator_l_G = self.local_discriminator.sync_to(global_discriminator)
    # self.sync_discriminator_G_l = self.local_discriminator.sync_from(global_discriminator)
    # self.D_var_G = global_discriminator.get_vars()
    # self.D_var_l = self.local_discriminator.get_vars()
    # -------------------------------------------------------------------

    self.env = None
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate

    # Per-episode statistics.
    self.episode_reward = 0
    self.episode_length = 0
    self.episode_max_q = -np.inf
def __init__(self,
             thread_index,
             global_network,
             initial_learning_rate,
             learning_rate_input,
             grad_applier,
             max_global_time_step,
             device,
             initial_diffidence_rate_seed,
             mode="train",
             network_scope="network",
             scene_scope="scene",
             task_scope="task",
             encourage_symmetry=False):
    """Build one SmashNet worker thread.

    Creates the thread-local SmashNet policy network and, in "train"
    mode, the gradient-accumulation ops and the op that applies the
    accumulated gradients to the shared global network.

    Args:
        thread_index: index of this worker thread.
        global_network: shared network that receives gradient updates.
        initial_learning_rate: starting learning rate for annealing.
        learning_rate_input: placeholder fed with the current rate.
        grad_applier: wrapper that applies accumulated gradients.
        max_global_time_step: total global steps (for LR annealing).
        device: device string for the local network ops.
        initial_diffidence_rate_seed: seed for the diffidence schedule.
        mode: "train" builds training ops; any other value skips them.
        network_scope: scope associated with the thread number.
        scene_scope: scene identifier scope.
        task_scope: navigation-target scope.
        encourage_symmetry: stored flag enabling symmetry encouragement.
    """
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.network_scope = network_scope  # associated with the thread number
    self.scene_scope = scene_scope      # the scene
    self.task_scope = task_scope        # the target
    self.scopes = [network_scope, scene_scope, task_scope]  # ["thread-n", "scene", "target"]

    # Thread-local SmashNet policy network and its loss for this thread.
    self.local_network = SmashNet(
        action_size=ACTION_SIZE,
        device=device,
        network_scope=network_scope,
        scene_scopes=[scene_scope])
    self.local_network.prepare_loss(self.scopes)

    # BUG FIX: original used `mode is "train"` — an identity comparison
    # against a string literal that only works via CPython interning and
    # raises SyntaxWarning on modern interpreters. Use equality instead.
    if mode == "train":
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.loss,
                                      self.local_network.get_vars())
        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        # Apply accumulated gradients to the global variables whose
        # scope-stripped name matches an accumulated gradient.
        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]
        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]
        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())

    # Pull global weights into the local network (needed in eval too).
    # NOTE(review): original indentation was lost — confirm this op was
    # built unconditionally rather than only in "train" mode.
    self.sync = self.local_network.sync_from(global_network)

    self.env = None
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate

    # Per-episode statistics (reward / max-Q tracking disabled upstream).
    # self.episode_reward = 0
    self.episode_length = 0
    # self.episode_max_q = -np.inf
    self.episode_pi_sim = 0
    self.episode_loss = 0

    self.initial_diffidence_rate_seed = initial_diffidence_rate_seed
    self.oracle = None
    self.mode = mode
    self.encourage_symmetry = encourage_symmetry