def __init__(self, thread_index, global_network, initial_learning_rate,
                 max_global_time_step):

        self.thread_index = thread_index
        self.learning_rate_input = tf.placeholder("float")
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE)
        self.local_network.prepare_loss(ENTROPY_BETA)

        # policy
        self.policy_trainer = AccumTrainer()
        self.policy_trainer.prepare_minimize(
            self.local_network.policy_loss,
            self.local_network.get_policy_vars())
        self.policy_accum_gradients = self.policy_trainer.accumulate_gradients()
        self.policy_reset_gradients = self.policy_trainer.reset_gradients()

        self.policy_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.policy_apply_gradients = self.policy_applier.apply_gradients(
            global_network.get_policy_vars(),
            self.policy_trainer.get_accum_grad_list())

        # value
        self.value_trainer = AccumTrainer()
        self.value_trainer.prepare_minimize(
            self.local_network.value_loss, self.local_network.get_value_vars())
        self.value_accum_gradients = self.value_trainer.accumulate_gradients()
        self.value_reset_gradients = self.value_trainer.reset_gradients()

        self.value_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.value_apply_gradients = self.value_applier.apply_gradients(
            global_network.get_value_vars(),
            self.value_trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # thread0 will record score for TensorBoard
        if self.thread_index == 0:
            self.score_input = tf.placeholder(tf.int32)
            tf.scalar_summary("score", self.score_input)
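# Every constructor on this page receives initial_learning_rate together with
# max_global_time_step and later feeds a decayed value into the
# learning_rate_input placeholder. A minimal sketch of that annealing step,
# assuming the linear decay to zero that A3C implementations commonly use
# (the helper below is illustrative, not code from this page):

def anneal_learning_rate(initial_learning_rate, global_time_step,
                         max_global_time_step):
    """Linearly decay the learning rate to zero over max_global_time_step."""
    learning_rate = initial_learning_rate * (
        max_global_time_step - global_time_step) / max_global_time_step
    return max(learning_rate, 0.0)  # clamp once the step budget is exhausted

# The result would be fed when applying gradients, e.g.
# sess.run(apply_gradients, feed_dict={learning_rate_input: lr}):
lr = anneal_learning_rate(7e-4, global_time_step=1000000,
                          max_global_time_step=10000000)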
Example #2
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE, device)
        self.local_network.prepare_loss(ENTROPY_BETA)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
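# These constructors all build the same core ops: accumulate_gradients,
# reset_gradients, apply_gradients (targeting the global network's variables)
# and sync. A framework-free sketch of the accumulate/apply/reset idea behind
# AccumTrainer, using illustrative names and a plain SGD update rather than
# this repository's RMSProp applier:

class AccumulatingTrainer:
    """Sum gradients locally over a rollout, apply them to shared params once."""

    def __init__(self, num_params):
        self.accum = [0.0] * num_params

    def accumulate(self, grads):
        self.accum = [a + g for a, g in zip(self.accum, grads)]

    def reset(self):
        self.accum = [0.0] * len(self.accum)

    def apply(self, global_params, lr):
        # plain SGD step on the shared parameters
        return [p - lr * a for p, a in zip(global_params, self.accum)]

trainer = AccumulatingTrainer(num_params=2)
trainer.accumulate([0.1, -0.2])
trainer.accumulate([0.3, 0.1])
new_global = trainer.apply([1.0, 1.0], lr=0.01)  # -> [0.996, 1.001]
trainer.reset()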
Example #3
    def __init__(self, thread_id, env_name, global_model, init_lr, lr_ph,
                 grad_applier, max_time_steps, model_dim, gamma):
        self.thread_id = thread_id
        self.global_model = global_model
        self.init_lr = init_lr
        self.grad_applier = grad_applier
        self.lr_ph = lr_ph
        self.max_time_steps = max_time_steps
        self.gamma = gamma

        height, width, num_frames, num_actions = model_dim
        self.local_model = ConvNetA3C(height, width, num_frames, num_actions)
        self.num_actions = num_actions

        trainer = AccumTrainer("/cpu:0")
        trainer.prepare_minimize(self.local_model.loss,
                                 self.local_model.params)
        self.accum_grads = trainer.accumulate_gradients()
        self.reset_grads = trainer.reset_gradients()
        self.apply_grads = grad_applier.apply_gradients(
            global_model.params, trainer.get_accum_grad_list())

        self.sync = self.local_model.sync_from(global_model)
        self.env = AtariAleEnvironment(env_name)
        self.s_t = self.env.reset()

        self.start_time = None
        self.ep_rwd, self.num_ep = 0, 0
        self.avg_rwd = None
        self.t = 0
        self.prev_t = 0
Example #4
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 sess,
                 name="agent"):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        #if USE_LSTM:
        #    self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        #else:

        self.local_network = Network(name=name)

        self.local_network.prepare_loss(FLAGS.entropy_beta)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.local_network.vars = self.trainer.prepare_minimize(
            self.local_network.total_loss, self.local_network.get_train_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_train_vars(),
            self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        #if USE_ALE:
        #    self.game_state = GameState(113 * thread_index)
        #else:
        self.game = gym.make('Lis-v2')
        self.game.configure(str(5000 + thread_index))
        # game initialization
        # observation = env.reset()
        self.observation, reward, end_episode, _ = self.game.step(1)
        #self.observation = self.preprocess([self.observation])
        self.history = [self.rgb2gray(self.observation)
                        for _ in range(4)]  #FLAGS.history_frames
        self.observation = np.dstack(self.history)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0
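# The four-frame history kept above is typically rolled forward after every
# environment step. A small NumPy sketch of that update (the 84x84 grayscale
# frame shape is an assumption, not taken from this code):

import numpy as np

history = [np.zeros((84, 84), dtype=np.float32) for _ in range(4)]
new_frame = np.ones((84, 84), dtype=np.float32)  # e.g. rgb2gray(observation)
history = history[1:] + [new_frame]              # drop oldest, append newest
observation = np.dstack(history)                 # stacked shape: (84, 84, 4)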
Example #5
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if NETWORK_TYPE == 'LSTM':
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        elif NETWORK_TYPE == 'DILATED':
            self.local_network = GameACDilatedNetwork(ACTION_SIZE, device)
        elif NETWORK_TYPE == 'CONV':
            self.local_network = GameACFFNetwork(ACTION_SIZE, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize( self.local_network.total_loss,
                                       self.local_network.get_vars() )

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
          global_network.get_vars(),
          self.trainer.get_accum_grad_list() )

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
Example #6
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, optimizer, max_global_time_step, device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN,
                                                ACTION_DIM, device,
                                                thread_index)
        else:
            self.local_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM,
                                              device)
        self.local_network.create_loss(ENTROPY_BETA)
        self.trainer = AccumTrainer(device)
        self.trainer.create_minimize(self.local_network.total_loss,
                                     self.local_network.get_vars())
        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        clip_accum_grads = [
            tf.clip_by_norm(accum_grad, 40.0)
            for accum_grad in self.trainer.get_accum_grad_list()
        ]
        self.apply_gradients = optimizer.apply_gradients(
            zip(clip_accum_grads, global_network.get_vars()))

        self.sync = self.local_network.sync_from(global_network)

        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate

        # for log
        self.episode_reward = 0.0
        self.episode_start_time = 0.0
        self.prev_local_t = 0

        # for pull mode, like a browser-based game
        self.states = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.start_lstm_state = None
        return
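# The clipping above uses tf.clip_by_norm, which rescales each accumulated
# gradient tensor so that its L2 norm does not exceed 40.0. A NumPy sketch of
# that per-tensor operation, for illustration only:

import numpy as np

def clip_by_norm(grad, clip_norm=40.0):
    """Rescale grad so its L2 norm is at most clip_norm; otherwise leave it unchanged."""
    norm = np.sqrt(np.sum(np.square(grad)))
    if norm > clip_norm:
        grad = grad * (clip_norm / norm)
    return grad

clipped = clip_by_norm(np.array([30.0, 40.0]))  # norm 50 -> rescaled to norm 40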
Example #7
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients( # note: this updates global_network
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
Example #8
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,
               game_function=ale_game_state, 
               local_network=None):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    # local_network must be a callable that builds the network; the default of
    # None would fail here, so callers are expected to pass a factory.
    self.local_network = local_network()

    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = game_function(thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
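# This variant injects both the environment and the network as callables
# instead of hard-coding them. A hypothetical construction, reusing identifiers
# that appear elsewhere on this page (the class name A3CTrainingThread and the
# exact call site are assumptions):
#
#   thread = A3CTrainingThread(
#       thread_index, global_network, initial_learning_rate,
#       learning_rate_input, grad_applier, max_global_time_step,
#       device="/cpu:0",
#       game_function=ale_game_state,
#       local_network=lambda: GameACFFNetwork(ACTION_SIZE, "/cpu:0"))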
Example #9
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, environment):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        # self.local_network = GameACNetwork(ACTION_SIZE, device)

        self.local_network = global_network.structural_clone(
            network_name="thread-net-%s" % self.thread_index)

        self.local_network.prepare_loss(ENTROPY_BETA)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients, self.grad_summary_op = grad_applier.apply_gradients(
            global_network.get_vars(), self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        # self.game_state = GameState(113 * thread_index)
        self.game_state = GymGameState(113 * thread_index, env=environment)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        self.lstm_last_output_state = None  # cache last lstm hidden states here
Example #10
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,
               options):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.options = options

    if options.use_lstm:
      self.local_network = GameACLSTMNetwork(options.action_size, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(options.action_size, device)

    self.local_network.prepare_loss(options.entropy_beta)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(random.randint(0, 2**16), options, thread_index = thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    self.indent = "         |" * self.thread_index
    self.steps = 0
    self.no_reward_steps = 0
    self.terminate_on_lives_lost = options.terminate_on_lives_lost and (self.thread_index != 0)

    if self.options.train_episode_steps > 0:
      self.max_reward = 0.0
      self.max_episode_reward = 0.0
      self.episode_states = []
      self.episode_actions = []
      self.episode_rewards = []
      self.episode_values = []
      self.episode_liveses = []
      self.episode_scores = Episode_scores(options)
      self.tes = self.options.train_episode_steps
      if self.options.tes_list is not None:
        self.tes = self.options.tes_list[thread_index]
        print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes))
    self.initial_lives = self.game_state.initial_lives
    self.max_history = int(self.tes * self.options.tes_extend_ratio * 2.1)

    if self.options.record_new_record_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_record_dir):
          os.makedirs(self.options.record_new_record_dir)
      self.episode_screens = []

    if self.options.record_new_room_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_room_dir):
          os.makedirs(self.options.record_new_room_dir)
      self.episode_screens = []

    self.greediness = options.greediness
    self.repeat_action_ratio = options.repeat_action_ratio
    self.prev_action = 0
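# How greediness, repeat_action_ratio and prev_action interact is not shown on
# this page. A hypothetical action-selection sketch, assuming greediness is the
# probability of acting greedily w.r.t. the policy and repeat_action_ratio is
# the chance of repeating the previous action; the fork's real semantics may
# differ:

import random

def choose_action(pi_values, prev_action, greediness, repeat_action_ratio):
    if random.random() < repeat_action_ratio:
        return prev_action                    # sticky action repeat
    if random.random() < greediness:
        # act greedily with respect to the current policy output
        return max(range(len(pi_values)), key=lambda i: pi_values[i])
    # otherwise sample an action from the policy distribution
    return random.choices(range(len(pi_values)), weights=pi_values)[0]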