Example #1
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]

        self.local_network = ActorCriticFFNetwork(action_size=ACTION_SIZE,
                                                  device=device,
                                                  network_scope=network_scope,
                                                  scene_scopes=[scene_scope])

        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)
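        # Note (standard A3C, Mnih et al., 2016; not specific to this repo):
        # ENTROPY_BETA weights an entropy bonus added to the actor-critic loss,
        # which discourages premature collapse to a deterministic policy.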

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]
        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.env = None

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf
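The constructor above only builds the ops; a worker thread then runs them in a fixed order on every iteration. The sketch below shows that order, together with the linear learning-rate anneal implied by `initial_learning_rate` and `max_global_time_step`. It assumes a TF1-style `sess` and a hypothetical `rollout()` helper, neither of which appears in the snippet, so treat it as an illustration rather than the repository's actual loop.

    def _anneal_learning_rate(self, global_time_step):
        # Linear decay from initial_learning_rate down to 0 at max_global_time_step.
        lr = self.initial_learning_rate * (
            self.max_global_time_step - global_time_step) / self.max_global_time_step
        return max(lr, 0.0)

    def process(self, sess, global_t):
        sess.run(self.sync)             # pull the latest global parameters
        sess.run(self.reset_gradients)  # clear the gradient accumulation buffers
        for feed in self.rollout():     # hypothetical helper yielding feed_dicts
            sess.run(self.accum_gradients, feed_dict=feed)
        # Apply the accumulated gradients to the GLOBAL network.
        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input:
                            self._anneal_learning_rate(global_t)})
        self.local_t += 1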
Example #2
    def _set_trainer_optimizer(self, device, global_network, grad_applier):
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        accum_grad_names = [self._local_var_name(x) for x in self.trainer.get_accum_grad_list()]
        global_net_vars = [x for x in global_network.get_vars() if self._get_accum_grad_name(x) in accum_grad_names]

        self.apply_gradients = grad_applier.apply_gradients(global_net_vars, self.trainer.get_accum_grad_list())
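Both snippets pair each accumulated gradient with its global-network counterpart by comparing scope-stripped variable names, via two helpers that are not shown here. A plausible sketch, assuming TF1 variable names such as "net_1/fc1/W:0" and that AccumTrainer suffixes its buffers with "_accum_grad" (both are assumptions, not confirmed by the snippets):

    def _local_var_name(self, var):
        # Drop the leading thread-specific scope so that names are comparable
        # across networks, e.g. "net_1/fc1/W:0" -> "fc1/W:0".
        return '/'.join(var.name.split('/')[1:])

    def _get_accum_grad_name(self, var):
        # Assumed buffer naming convention: "<scope-stripped name>_accum_grad:0".
        return self._local_var_name(var).replace(':0', '') + '_accum_grad:0'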
Example #3
    def __init__(self,
                 thread_index,
                 global_network,
                 global_discriminator,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 grad_applier_discriminator,
                 max_global_time_step,
                 device,
                 device2,
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task"):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope
        self.network_scope_D = network_scope + "_d"
        self.scene_scope = scene_scope
        self.task_scope = task_scope
        self.scopes = [network_scope, scene_scope, task_scope]

        self.scopes_d = [self.network_scope_D, task_scope]

        self.local_network = ActorCriticFFNetwork(action_size=ACTION_SIZE,
                                                  device=device,
                                                  network_scope=network_scope,
                                                  scene_scopes=[scene_scope])

        self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

        self.trainer = AccumTrainer(device)

        # Compute gradients of the loss with respect to the local network variables.
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        # This part is for the newly added PPO loss (we need to keep both the
        # old and the new parameter sets).
        new_variable_list = self.local_network.get_vars()
        old_variable_list = self.local_network.get_vars_old()

        # For the PPO loss, at the beginning of each iteration the old
        # parameters must be synced with the current ones.
        self.old_new_sync = self.local_network.sync_curre_old()
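        # Background (standard PPO, Schulman et al., 2017; not specific to this
        # code): the clipped surrogate objective uses the probability ratio
        #   r_t = pi_new(a_t | s_t) / pi_old(a_t | s_t)
        # and maximizes min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t),
        # which is why a frozen snapshot of the pre-update parameters is kept.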

        self.accum_gradients = self.trainer.accumulate_gradients()  # op that accumulates gradients into buffers
        self.reset_gradients = self.trainer.reset_gradients()  # after applying the gradients we need to reset those buffers

        accum_grad_names = [
            self._local_var_name(x)
            for x in self.trainer.get_accum_grad_list()
        ]  # names of all the accumulated-gradient variables

        global_net_vars = [
            x for x in global_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]  # keep only the global-network variables that have a matching accumulated gradient
        local_net_vars = [
            x for x in self.local_network.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names
        ]

        self.apply_gradients = grad_applier.apply_gradients(
            global_net_vars, self.trainer.get_accum_grad_list())
        self.apply_gradients_local = grad_applier.apply_gradients_local_net(
            local_net_vars, self.trainer.get_accum_grad_list())

        # If training is unstable, it may be preferable to first apply the gradients
        # to the local network, clip them, and only then apply them to the global network.
        self.sync = self.local_network.sync_from(
            global_network)  # copy the updated global parameters into the local network

        # This part is for the discriminator.
        #########################################################################################
        self.local_discriminator = Discriminator_WGAN(
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])

        self.local_discriminator.prepare_loss_D(ENTROPY_BETA, self.scopes_d)

        self.trainer_D = AccumTrainer_d(device=device, name="AccumTrainer_d")

        self.trainer_D.prepare_minimize(
            self.local_discriminator.total_loss_d,
            self.local_discriminator.get_vars())

        self.accum_gradients_d = self.trainer_D.accumulate_gradients()
        self.reset_gradients_d = self.trainer_D.reset_gradients()

        accum_grad_names_discrimi = [
            self._local_var_name(x)
            for x in self.trainer_D.get_accum_grad_list()
        ]

        global_discri_vars = [
            x for x in global_discriminator.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names_discrimi
        ]
        local_discri_vars = [
            x for x in self.local_discriminator.get_vars()
            if self._get_accum_grad_name(x) in accum_grad_names_discrimi
        ]

        self.apply_gradients_discriminator = grad_applier_discriminator.apply_gradients(
            local_discri_vars, self.trainer_D.get_accum_grad_list()
        )  # apply the accumulated gradients to the LOCAL discriminator

        # WGAN weight clipping on the local discriminator's weights, which keeps
        # the critic approximately Lipschitz (Arjovsky et al., 2017).
        self.clip_local_d_weights = self.local_discriminator.clip_weights()

        self.sync_discriminator_l_G = self.local_discriminator.sync_to(
            global_discriminator)  # push local discriminator weights to the global one
        self.sync_discriminator_G_l = self.local_discriminator.sync_from(
            global_discriminator)  # pull global discriminator weights into the local one

        self.D_var_G = global_discriminator.get_vars()
        self.D_var_l = self.local_discriminator.get_vars()
        #########################################################################################

        self.env = None
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
        self.episode_length = 0
        self.episode_max_q = -np.inf
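In WGAN training, the critic is typically updated several times per policy step, with weight clipping after each update. Below is a minimal sketch of how the discriminator ops above might be sequenced; it assumes a TF1-style `sess` and a hypothetical `discriminator_feed()` helper, and the n_critic = 5 default follows Arjovsky et al., 2017.

    def update_discriminator(self, sess, lr, n_critic=5):
        for _ in range(n_critic):
            sess.run(self.reset_gradients_d)
            # discriminator_feed() is a hypothetical helper returning a feed_dict.
            sess.run(self.accum_gradients_d, feed_dict=self.discriminator_feed())
            sess.run(self.apply_gradients_discriminator,
                     feed_dict={self.learning_rate_input: lr})  # update the LOCAL critic
            sess.run(self.clip_local_d_weights)  # keep the critic approximately Lipschitz
        sess.run(self.sync_discriminator_l_G)  # push the trained weights to the global critic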
Example #4
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 initial_diffidence_rate_seed,
                 mode="train",
                 network_scope="network",
                 scene_scope="scene",
                 task_scope="task",
                 encourage_symmetry=False):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.network_scope = network_scope  # associated with the thread number
        self.scene_scope = scene_scope  # the scene
        self.task_scope = task_scope  # this is the target
        self.scopes = [network_scope, scene_scope,
                       task_scope]  # ["thread-n", "scene", "target"]

        self.local_network = SmashNet(  # local SmashNet policy network
            action_size=ACTION_SIZE,
            device=device,
            network_scope=network_scope,
            scene_scopes=[scene_scope])

        # Build the loss for this thread.
        self.local_network.prepare_loss(self.scopes)

        if mode is "train":
            self.trainer = AccumTrainer(device)
            self.trainer.prepare_minimize(self.local_network.loss,
                                          self.local_network.get_vars())

            self.accum_gradients = self.trainer.accumulate_gradients()
            self.reset_gradients = self.trainer.reset_gradients()

            accum_grad_names = [
                self._local_var_name(x)
                for x in self.trainer.get_accum_grad_list()
            ]  # these accumulated gradients are applied to the global network
            global_net_vars = [
                x for x in global_network.get_vars()
                if self._get_accum_grad_name(x) in accum_grad_names
            ]

            self.apply_gradients = grad_applier.apply_gradients(
                global_net_vars, self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.env = None

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        # self.episode_reward = 0
        self.episode_length = 0
        # self.episode_max_q = -np.inf
        self.episode_pi_sim = 0
        self.episode_loss = 0

        self.initial_diffidence_rate_seed = initial_diffidence_rate_seed

        self.oracle = None
        self.mode = mode
        self.encourage_symmetry = encourage_symmetry
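Since the trainer ops are only built in "train" mode, an evaluation thread can do no more than sync and query the policy. A minimal sketch under that assumption; `choose_action` is a hypothetical SmashNet helper, not part of the snippet:

    def step(self, sess, state):
        sess.run(self.sync)  # always refresh the local network from the global one
        if self.mode == "train":
            sess.run(self.reset_gradients)
            # ... accumulate gradients over a rollout, then run self.apply_gradients ...
        # choose_action is a hypothetical helper that samples from the policy.
        return self.local_network.choose_action(sess, state, self.scopes)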