Example #1
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE, device)
        self.local_network.prepare_loss(ENTROPY_BETA)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 max_global_time_step):

        self.thread_index = thread_index
        self.learning_rate_input = tf.placeholder("float")
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE)
        self.local_network.prepare_loss(ENTROPY_BETA)

        # policy
        self.policy_trainer = AccumTrainer()
        self.policy_trainer.prepare_minimize(
            self.local_network.policy_loss,
            self.local_network.get_policy_vars())
        self.policy_accum_gradients = self.policy_trainer.accumulate_gradients(
        )
        self.policy_reset_gradients = self.policy_trainer.reset_gradients()

        self.policy_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.policy_apply_gradients = self.policy_applier.apply_gradients(
            global_network.get_policy_vars(),
            self.policy_trainer.get_accum_grad_list())

        # value
        self.value_trainer = AccumTrainer()
        self.value_trainer.prepare_minimize(
            self.local_network.value_loss, self.local_network.get_value_vars())
        self.value_accum_gradients = self.value_trainer.accumulate_gradients()
        self.value_reset_gradients = self.value_trainer.reset_gradients()

        self.value_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.value_apply_gradients = self.value_applier.apply_gradients(
            global_network.get_value_vars(),
            self.value_trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # thread0 will record score for TensorBoard
        if self.thread_index == 0:
            self.score_input = tf.placeholder(tf.int32)
            tf.scalar_summary("score", self.score_input)
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 sess,
                 name="agent"):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        #if USE_LSTM:
        #    self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        #else:

        self.local_network = Network(name=name)

        self.local_network.prepare_loss(FLAGS.entropy_beta)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.local_network.vars = self.trainer.prepare_minimize(
            self.local_network.total_loss, self.local_network.get_train_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_train_vars(),
            self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        #if USE_ALE:
        #    self.game_state = GameState(113 * thread_index)
        #else:
        self.game = gym.make('Lis-v2')
        self.game.configure(str(5000 + thread_index))
        # game initialization
        # observation = env.reset()
        self.observation, reward, end_episode, _ = self.game.step(1)
        #self.observation = self.preprocess([self.observation])
        self.history = [self.rgb2gray(self.observation)
                        for _ in range(4)]  #FLAGS.history_frames
        self.observation = np.dstack(self.history)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0
  def __init__(self, thread_index, global_network, initial_learning_rate, max_global_time_step):

    self.thread_index = thread_index
    self.learning_rate_input = tf.placeholder("float")
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # policy
    self.policy_trainer = AccumTrainer()
    self.policy_trainer.prepare_minimize( self.local_network.policy_loss,
                                          self.local_network.get_policy_vars() )
    self.policy_accum_gradients = self.policy_trainer.accumulate_gradients()
    self.policy_reset_gradients = self.policy_trainer.reset_gradients()
  
    self.policy_applier = RMSPropApplier(learning_rate = self.learning_rate_input,
                                         decay = 0.99,
                                         momentum = 0.0,
                                         epsilon = RMSP_EPSILON )
    self.policy_apply_gradients = self.policy_applier.apply_gradients(
        global_network.get_policy_vars(),
        self.policy_trainer.get_accum_grad_list() )

    # value
    self.value_trainer = AccumTrainer()
    self.value_trainer.prepare_minimize( self.local_network.value_loss,
                                         self.local_network.get_value_vars() )
    self.value_accum_gradients = self.value_trainer.accumulate_gradients()
    self.value_reset_gradients = self.value_trainer.reset_gradients()
  
    self.value_applier = RMSPropApplier(learning_rate = self.learning_rate_input,
                                        decay = 0.99,
                                        momentum = 0.0,
                                        epsilon = RMSP_EPSILON )
    self.value_apply_gradients = self.value_applier.apply_gradients(
        global_network.get_value_vars(),
        self.value_trainer.get_accum_grad_list() )
    
    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # thread0 will record score for TensorBoard
    if self.thread_index == 0:
      self.score_input = tf.placeholder(tf.int32)
      tf.scalar_summary("score", self.score_input)
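
# Editor's sketch (not part of the original snippets): the two RMSPropApplier
# instances above presumably perform an RMSProp-style update on the global
# policy/value variables using the accumulated gradients. A minimal NumPy
# version of that update rule; the epsilon placement varies between
# implementations and the RMSP_EPSILON value is assumed here.
import numpy as np

def rmsprop_apply(param, accum_grad, ms, lr, decay=0.99, epsilon=0.1):
    # ms: running average of squared gradients, one array per variable
    ms = decay * ms + (1.0 - decay) * accum_grad * accum_grad
    param = param - lr * accum_grad / (np.sqrt(ms) + epsilon)
    return param, ms
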
Example #5
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if NETWORK_TYPE == 'LSTM':
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        elif NETWORK_TYPE == 'DILATED':
            self.local_network = GameACDilatedNetwork(ACTION_SIZE, device)
        elif NETWORK_TYPE == 'CONV':
            self.local_network = GameACFFNetwork(ACTION_SIZE, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize( self.local_network.total_loss,
                                       self.local_network.get_vars() )

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
          global_network.get_vars(),
          self.trainer.get_accum_grad_list() )

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, optimizer, max_global_time_step, device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN,
                                                ACTION_DIM, device,
                                                thread_index)
        else:
            self.local_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM,
                                              device)
        self.local_network.create_loss(ENTROPY_BETA)
        self.trainer = AccumTrainer(device)
        self.trainer.create_minimize(self.local_network.total_loss,
                                     self.local_network.get_vars())
        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        clip_accum_grads = [
            tf.clip_by_norm(accum_grad, 40.0)
            for accum_grad in self.trainer.get_accum_grad_list()
        ]
        self.apply_gradients = optimizer.apply_gradients(
            zip(clip_accum_grads, global_network.get_vars()))

        self.sync = self.local_network.sync_from(global_network)

        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate

        # for log
        self.episode_reward = 0.0
        self.episode_start_time = 0.0
        self.prev_local_t = 0

        # for pull mode, like a browser-based game
        self.states = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.start_lstm_state = None
        return
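
# Editor's sketch (not part of the original snippets): what the
# tf.clip_by_norm(accum_grad, 40.0) calls above do, written out in NumPy --
# each accumulated gradient tensor is rescaled so that its L2 norm does not
# exceed the clip threshold.
import numpy as np

def clip_by_norm(grad, clip_norm=40.0):
    norm = np.sqrt(np.sum(grad * grad))
    if norm > clip_norm:
        grad = grad * (clip_norm / norm)
    return grad
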
Example #7
    def __init__(self, thread_id, env_name, global_model, init_lr, lr_ph,
                 grad_applier, max_time_steps, model_dim, gamma):
        self.thread_id = thread_id
        self.global_model = global_model
        self.init_lr = init_lr
        self.grad_applier = grad_applier
        self.lr_ph = lr_ph
        self.max_time_steps = max_time_steps
        self.gamma = gamma

        height, width, num_frames, num_actions = model_dim
        self.local_model = ConvNetA3C(height, width, num_frames, num_actions)
        self.num_actions = num_actions

        trainer = AccumTrainer("/cpu:0")
        trainer.prepare_minimize(self.local_model.loss,
                                 self.local_model.params)
        self.accum_grads = trainer.accumulate_gradients()
        self.reset_grads = trainer.reset_gradients()
        self.apply_grads = grad_applier.apply_gradients(
            global_model.params, trainer.get_accum_grad_list())

        self.sync = self.local_model.sync_from(global_model)
        self.env = AtariAleEnvironment(env_name)
        self.s_t = self.env.reset()

        self.start_time = None
        self.ep_rwd, self.num_ep = 0, 0
        self.avg_rwd = None
        self.t = 0
        self.prev_t = 0
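
# Editor's sketch (not part of the original snippets): the AccumTrainer pattern
# shared by these snippets, reduced to NumPy -- reset the accumulators, sum the
# gradients over a rollout, then apply the accumulated gradients to the *global*
# parameters in one step (plain SGD here; the snippets use an RMSProp applier).
import numpy as np

def rollout_update(global_params, per_step_grads, lr):
    accum = [np.zeros_like(p) for p in global_params]          # reset_gradients
    for step_grads in per_step_grads:                          # accumulate_gradients
        accum = [a + g for a, g in zip(accum, step_grads)]
    return [p - lr * a for p, a in zip(global_params, accum)]  # apply_gradients
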
Example #8
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients( # watch out: update global_network
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

  # variable controlling log output
    self.prev_local_t = 0
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,
               game_function=ale_game_state, 
               local_network=None):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.local_network = local_network()

    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = game_function(thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

  # variable controlling log output
    self.prev_local_t = 0
Example #10
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, environment):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        # self.local_network = GameACNetwork(ACTION_SIZE, device)

        self.local_network = global_network.structural_clone(
            network_name="thread-net-%s" % self.thread_index)

        self.local_network.prepare_loss(ENTROPY_BETA)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients, self.grad_summary_op = grad_applier.apply_gradients(
            global_network.get_vars(), self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        # self.game_state = GameState(113 * thread_index)
        self.game_state = GymGameState(113 * thread_index, env=environment)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        self.lstm_last_output_state = None  # cache last lstm hidden states here
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 environment):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        # self.local_network = GameACNetwork(ACTION_SIZE, device)

        self.local_network = global_network.structural_clone(network_name="thread-net-%s" % self.thread_index)

        self.local_network.prepare_loss(ENTROPY_BETA)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients, self.grad_summary_op = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        # self.game_state = GameState(113 * thread_index)
        self.game_state = GymGameState(113 * thread_index, env=environment)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        self.lstm_last_output_state = None          # cache last lstm hidden states here
	def __init__(self,
			 sess,
			 thread_index,
			 global_network,
			 initial_learning_rate,
			 learning_rate_input,
			 grad_applier,
			 max_global_time_step,
			 num_trainable_vars):

		self.thread_index = thread_index
		self.learning_rate_input = learning_rate_input
		self.max_global_time_step = max_global_time_step	
		
		if LSTM:
			initializer = tf.random_uniform_initializer(-0.1, 0.1)		
			with tf.variable_scope("model"+str(thread_index), reuse=None, initializer=initializer):
				self.local_network = AC3LSTM(num_actions, num_states, num_trainable_vars)
		else:
			self.local_network = AC3FF(num_actions, num_states, num_trainable_vars)
			
		self.local_network.prepare_loss(entropy_beta)

		self.trainer = AccumTrainer()
		self.trainer.prepare_minimize(self.local_network.total_loss, self.local_network.trainable_vars)
		
		self.accum_gradients = self.trainer.accumulate_gradients()
		self.reset_gradients = self.trainer.reset_gradients()
	
		self.apply_gradients = grad_applier.apply_gradients(
			global_network.trainable_vars,
			self.trainer.get_accum_grad_list() )

		self.sync = self.local_network.sync_from(global_network)
		self.game_state = ChainMDP()
		self.local_t = 0
		self.initial_learning_rate = initial_learning_rate
		self.episode_reward = 0
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0
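
# Editor's sketch (not part of the original snippets): sync_from above copies
# the shared (global) network weights into this thread's local network before
# each rollout. Conceptually, with plain NumPy arrays:
import numpy as np

def sync_from(global_params, local_params):
    for i, p in enumerate(global_params):
        local_params[i] = np.array(p, copy=True)
    return local_params
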
Example #14
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,
               options):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.options = options

    if options.use_lstm:
      self.local_network = GameACLSTMNetwork(options.action_size, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(options.action_size, device)

    self.local_network.prepare_loss(options.entropy_beta)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(random.randint(0, 2**16), options, thread_index = thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    self.indent = "         |" * self.thread_index
    self.steps = 0
    self.no_reward_steps = 0
    self.terminate_on_lives_lost = options.terminate_on_lives_lost and (self.thread_index != 0)

    if self.options.train_episode_steps > 0:
      self.max_reward = 0.0
      self.max_episode_reward = 0.0
      self.episode_states = []
      self.episode_actions = []
      self.episode_rewards = []
      self.episode_values = []
      self.episode_liveses = []
      self.episode_scores = Episode_scores(options)
      self.tes = self.options.train_episode_steps
      if self.options.tes_list is not None:
        self.tes = self.options.tes_list[thread_index]
        print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes))
    self.initial_lives = self.game_state.initial_lives
    self.max_history = int(self.tes * self.options.tes_extend_ratio * 2.1)

    if self.options.record_new_record_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_record_dir):
          os.makedirs(self.options.record_new_record_dir)
      self.episode_screens = []

    if self.options.record_new_room_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_room_dir):
          os.makedirs(self.options.record_new_room_dir)
      self.episode_screens = []

    self.greediness = options.greediness
    self.repeat_action_ratio = options.repeat_action_ratio
    self.prev_action = 0
Example #15
class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,
               options):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.options = options

    if options.use_lstm:
      self.local_network = GameACLSTMNetwork(options.action_size, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(options.action_size, device)

    self.local_network.prepare_loss(options.entropy_beta)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(random.randint(0, 2**16), options, thread_index = thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    self.indent = "         |" * self.thread_index
    self.steps = 0
    self.no_reward_steps = 0
    self.terminate_on_lives_lost = options.terminate_on_lives_lost and (self.thread_index != 0)

    if self.options.train_episode_steps > 0:
      self.max_reward = 0.0
      self.max_episode_reward = 0.0
      self.episode_states = []
      self.episode_actions = []
      self.episode_rewards = []
      self.episode_values = []
      self.episode_liveses = []
      self.episode_scores = Episode_scores(options)
      self.tes = self.options.train_episode_steps
      if self.options.tes_list is not None:
        self.tes = self.options.tes_list[thread_index]
        print("[DIVERSITY]th={}:tes={}".format(thread_index, self.tes))
    self.initial_lives = self.game_state.initial_lives
    self.max_history = int(self.tes * self.options.tes_extend_ratio * 2.1)

    if self.options.record_new_record_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_record_dir):
          os.makedirs(self.options.record_new_record_dir)
      self.episode_screens = []

    if self.options.record_new_room_dir is not None:
      if self.thread_index == 0:
        if not os.path.exists(self.options.record_new_room_dir):
          os.makedirs(self.options.record_new_room_dir)
      self.episode_screens = []

    self.greediness = options.greediness
    self.repeat_action_ratio = options.repeat_action_ratio
    self.prev_action = 0

  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate
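  # Editor's note (worked example, not in the original): the schedule above is a
  # linear decay. With, say, initial_learning_rate = 7e-4 and
  # max_global_time_step = 1e8 (illustrative values), the rate at
  # global_time_step = 5e7 is 7e-4 * (1e8 - 5e7) / 1e8 = 3.5e-4, and it is
  # clamped to 0.0 once the global step passes max_global_time_step.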

  def choose_action(self, pi_values, global_t):
    # Add greediness for broader exploration
    r = random.random()
    if r < self.greediness:
      action = int(r * len(pi_values))
    elif r < self.repeat_action_ratio:
      action = self.prev_action
    else:
      # Increase the randomness of the choice if the no-reward stretch is too long
      if self.no_reward_steps > self.options.no_reward_steps:
        randomness = (self.no_reward_steps - self.options.no_reward_steps) * self.options.randomness
        pi_values += randomness
        pi_values /= sum(pi_values)
        if self.local_t % self.options.randomness_log_interval == 0:
          elapsed_time = time.time() - self.start_time
          print("t={:6.0f},s={:9d},th={}:{}randomness={:.8f}".format(
                elapsed_time, global_t, self.thread_index, self.indent, randomness))

      pi_values -= np.finfo(np.float32).epsneg
      action_samples = np.random.multinomial(self.options.num_experiments, pi_values)
      action = action_samples.argmax(0)

    self.prev_action = action
    return action
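
  # Editor's note (summary, not in the original): choose_action mixes three
  # behaviours -- with probability `greediness` the action index is derived
  # directly from the random draw, with probability of roughly
  # (repeat_action_ratio - greediness) the previous action is repeated, and
  # otherwise an action is sampled from pi_values (optionally flattened by the
  # `randomness` bonus) by drawing num_experiments samples with
  # np.random.multinomial and taking the most frequently sampled index.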

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def set_start_time(self, start_time):
    self.start_time = start_time

  #@profile
  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []
    liveses = [self.game_state.lives]
    if self.tes > 0:
      if self.episode_liveses == []:
        self.episode_liveses.append(self.game_state.lives)

    terminal_end = False

    # reset accumulated gradients
    sess.run( self.reset_gradients )

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t

    if self.options.use_lstm:
      start_lstm_state = self.local_network.lstm_state_out
    
    # t_max times loop
    for i in range(self.options.local_t_max):
      pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
      action = self.choose_action(pi_, global_t)

      states.append(self.game_state.s_t)
      actions.append(action)
      values.append(value_)
      liveses.append(self.game_state.lives)

      if (self.thread_index == 0) and (self.local_t % self.options.log_interval == 0):
        print("pi={} (thread{})".format(pi_, self.thread_index))
        print(" V={} (thread{})".format(value_, self.thread_index))

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward
      if reward > 0 and \
         (self.options.rom == "montezuma_revenge.bin" or self.options.gym_env == "MontezumaRevenge-v0"):
        elapsed_time = time.time() - self.start_time
        print("t={:6.0f},s={:4.0f},th={}:{}r={:3.0f}RM{:02d}| NEW-SCORE".format(
              elapsed_time, global_t, self.thread_index, self.indent, self.episode_reward,
              self.game_state.room_no))

      # pseudo-count reward
      if self.options.psc_use:
        reward += self.game_state.psc_reward

      # add basic income after a stretch of no-reward steps
      if self.no_reward_steps > self.options.no_reward_steps:
        reward += self.options.basic_income

      # clip reward
      if self.options.reward_clip > 0.0:
        reward = np.clip(reward, -self.options.reward_clip, self.options.reward_clip)
      rewards.append( reward )

      # collect episode log
      if self.tes > 0:
        self.episode_states.append(self.game_state.s_t)
        self.episode_actions.append(action)
        self.episode_rewards.append(reward)
        self.episode_values.append(value_)
        self.episode_liveses.append(self.game_state.lives)
        if len(self.episode_states) > self.max_history * 2:
          self.episode_states = self.episode_states[-self.max_history:]
          self.episode_actions = self.episode_actions[-self.max_history:]
          self.episode_rewards = self.episode_rewards[-self.max_history:]
          self.episode_values = self.episode_values[-self.max_history:]
          self.episode_liveses = self.episode_liveses[-self.max_history-1:]
        # requirement for OpenAI Gym: --clear-history-on-death=False
        if self.options.clear_history_on_death and (liveses[-2] > liveses[-1]):
          self.episode_states = []
          self.episode_actions = []
          self.episode_rewards = []
          self.episode_values = []
          self.episode_liveses = self.episode_liveses[-2:]
 
      self.local_t += 1

      if self.options.record_new_record_dir is not None \
         or self.options.record_new_room_dir is not None:
        screen = self.game_state.uncropped_screen
        if self.options.compress_frame:
          screen = lzma.compress(screen.tobytes(), preset=0)
        self.episode_screens.append(screen)

      # terminate if the play time is too long
      self.steps += 1
      if self.steps > self.options.max_play_steps:
        terminal = True

      # requirement for OpenAI Gym: --terminate-on-lives-lost=False
      # terminate if lives lost
      if self.terminate_on_lives_lost and (liveses[-2] > liveses[-1]):
        terminal = True

      # count no reward steps
      if self.game_state.reward == 0.0:
        self.no_reward_steps += 1
      else:
        self.no_reward_steps = 0

      # s_t1 -> s_t
      self.game_state.update()
      
      if self.local_t % self.options.score_log_interval == 0:
        elapsed_time = time.time() - self.start_time
        print("t={:6.0f},s={:9d},th={}:{}r={:3.0f}RM{:02d}| l={:.0f},v={:.5f},pr={:.5f}".format(
              elapsed_time, global_t, self.thread_index, self.indent,
              self.episode_reward, self.game_state.room_no,
              self.game_state.lives, value_, self.game_state.psc_reward))

      # if self.game_state.room_no != self.game_state.prev_room_no:
      #   elapsed_time = time.time() - self.start_time
      #   print("t={:6.0f},s={:9d},th={}:{}RM{:02d}>RM{:02d}| l={:.0f},v={:.5f},pr={:.5f}".format(
      #         elapsed_time, global_t, self.thread_index, self.indent, 
      #         self.game_state.prev_room_no, self.game_state.room_no,
      #         self.game_state.lives, value_, self.game_state.psc_reward))

      if self.tes > 0:
        if self.game_state.lives < self.episode_liveses[-2]:
          elapsed_time = time.time() - self.start_time
          print("t={:6.0f},s={:9d},th={}:{}l={:.0f}>{:.0f}RM{:02d}|".format(
                elapsed_time, global_t, self.thread_index, self.indent, 
                self.episode_liveses[-2], self.game_state.lives, self.game_state.room_no))

      # separate steps after getting a reward
      if self.game_state.reward > 0:
        if not terminal:
          break

      if terminal:
        terminal_end = True
        elapsed_time = time.time() - self.start_time
        end_mark = "end" if self.terminate_on_lives_lost else "END"
        print("t={:6.0f},s={:9d},th={}:{}r={:3.0f}@{}|".format(
              elapsed_time, global_t, self.thread_index, self.indent, self.episode_reward, end_mark))

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        if self.tes > 0:
          if self.options.record_new_room_dir is not None \
             and self.game_state.new_room >= 0:
            dirname = "s{:09d}-th{}-r{:03.0f}-RM{:02d}".format(global_t,  self.thread_index,\
                       self.episode_reward, self.game_state.new_room)
            dirname = os.path.join(self.options.record_new_room_dir, dirname)
            os.makedirs(dirname)
            for index, screen in enumerate(self.episode_screens):
              filename = "{:06d}.png".format(index)
              filename = os.path.join(dirname, filename)
              screen_image = screen
              if self.options.compress_frame:
                screen_image = np.frombuffer(lzma.decompress(screen), dtype=np.uint8).reshape((210, 160))
              cv2.imwrite(filename, screen_image)
            print("@@@ New Room record screens saved to {}".format(dirname))

          if self.episode_reward > self.max_episode_reward:
            if self.options.record_new_record_dir is not None:
              dirname = "s{:09d}-th{}-r{:03.0f}-RM{:02d}".format(global_t,  self.thread_index,\
                         self.episode_reward, self.game_state.room_no)
              dirname = os.path.join(self.options.record_new_record_dir, dirname)
              os.makedirs(dirname)
              for index, screen in enumerate(self.episode_screens):
                filename = "{:06d}.png".format(index)
                filename = os.path.join(dirname, filename)
                screen_image = screen
                if self.options.compress_frame:
                  screen_image = np.frombuffer(lzma.decompress(screen), dtype=np.uint8).reshape((210, 160))
                cv2.imwrite(filename, screen_image)
              print("@@@ New Record screens saved to {}".format(dirname))
            self.max_episode_reward = self.episode_reward
            if self.options.record_all_non0_record:
              self.max_episode_reward = 0

          self.max_reward = 0.0
          self.episode_states = []
          self.episode_actions = []
          self.episode_rewards = []
          self.episode_values = []
          self.episode_liveses = []
          self.episode_scores.add(self.episode_reward, global_t, self.thread_index)
          if self.options.record_new_record_dir is not None \
             or self.options.record_new_room_dir is not None:
            self.episode_screens= []

        self.episode_reward = 0
        self.steps = 0
        self.no_reward_steps = 0
        self.game_state.reset()
        if self.options.use_lstm:
          self.local_network.reset_state()
        break

    if self.thread_index == 0 and self.local_t % self.options.performance_log_interval < self.options.local_t_max:
      elapsed_time = time.time() - self.start_time
      steps_per_sec = global_t / elapsed_time
      print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
            global_t,  elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

    if self.options.gym_eval:
      diff_local_t = self.local_t - start_local_t
      return diff_local_t, terminal_end

    # don't train if following condition
    # requirement for OpenAI Gym: --terminate-on-lives-lost=False
    if self.options.terminate_on_lives_lost and (self.thread_index == 0) and (not self.options.train_in_eval):
      return 0, terminal_end
    else:
      if self.tes > 0:
        _ = self.episode_scores.is_highscore(self.episode_reward)
        if self.episode_reward > self.max_reward:
          self.max_reward = self.episode_reward
          if True:
            tes = self.tes
            # requirement for OpenAI Gym: --test-extend=False
            if self.options.tes_extend and self.initial_lives != 0:
              tes *= self.options.tes_extend_ratio * (self.game_state.lives / self.initial_lives)
              if self.game_state.lives == self.initial_lives:
                tes *= 2
              tes = int(tes)
            tes = min(tes, len(self.episode_states))
            print("[OHL]SCORE={:3.0f},s={:9d},th={},lives={},steps={},tes={},RM{:02d}".format(self.episode_reward,  global_t, self.thread_index, self.game_state.lives, self.steps, tes, self.game_state.room_no))
            if tes == 0:
              states = []
              actions = []
              rewards = []
              values = []
              liveses = self.episode_liveses[-1:]
            else:
              states = self.episode_states[-tes:]
              actions = self.episode_actions[-tes:]
              rewards = self.episode_rewards[-tes:]
              values = self.episode_values[-tes:]
              liveses = self.episode_liveses[-tes-1:]
            if self.options.clear_history_after_ohl:
              self.episode_states = []
              self.episode_actions = []
              self.episode_rewards = []
              self.episode_values = []
              self.episode_liveses = self.episode_liveses[-2:]

      if len(states) > 0:
        R = 0.0
        if not terminal_end:
          R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        lives = liveses.pop()
        # compute and accumulate gradients
        for(ai, ri, si, Vi) in zip(actions, rewards, states, values):
          # Consider the number of lives
          if (not self.options.use_gym) and self.initial_lives != 0.0 and not self.terminate_on_lives_lost:
            prev_lives = liveses.pop()
            if prev_lives > lives:
              weight = self.options.lives_lost_weight
              rratio = self.options.lives_lost_rratio
              R *= rratio * ( (1.0 - weight) + weight * (lives / prev_lives) )
              ri = self.options.lives_lost_reward
              lives = prev_lives

          R = ri + self.options.gamma * R
          td = R - Vi
          a = np.zeros([self.options.action_size])
          a[ai] = 1

          batch_si.append(si)
          batch_a.append(a)
          batch_td.append(td)
          batch_R.append(R)

        if self.options.use_lstm:
          batch_si.reverse()
          batch_a.reverse()
          batch_td.reverse()
          batch_R.reverse()

          sess.run( self.accum_gradients,
                    feed_dict = {
                      self.local_network.s: batch_si,
                      self.local_network.a: batch_a,
                      self.local_network.td: batch_td,
                      self.local_network.r: batch_R,
                      self.local_network.initial_lstm_state: start_lstm_state,
                      self.local_network.step_size : [len(batch_a)] } )
        else:
          sess.run( self.accum_gradients,
                    feed_dict = {
                      self.local_network.s: batch_si,
                      self.local_network.a: batch_a,
                      self.local_network.td: batch_td,
                      self.local_network.r: batch_R} )
          
        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run( self.apply_gradients,
                  feed_dict = { self.learning_rate_input: cur_learning_rate } )

      # return advanced local step size
      diff_local_t = self.local_t - start_local_t
      return diff_local_t, terminal_end
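
# Editor's sketch (not part of the original snippets): stripped of the
# lives/LSTM bookkeeping, the backward loop above computes n-step returns and
# advantages with the recurrence R = r_i + gamma * R, td_i = R - V_i:
import numpy as np

def n_step_targets(rewards, values, bootstrap_R, gamma=0.99):
    # rewards/values are in forward (time) order; bootstrap_R is the value
    # estimate of the last state, or 0.0 if the episode terminated.
    R = bootstrap_R
    batch_R, batch_td = [], []
    for ri, Vi in zip(reversed(rewards), reversed(values)):
        R = ri + gamma * R
        batch_R.append(R)
        batch_td.append(R - Vi)   # advantage fed to the policy-gradient loss
    return list(reversed(batch_R)), list(reversed(batch_td))
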
class A3CTrainingThread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 sess,
                 name="agent"):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        #if USE_LSTM:
        #    self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        #else:

        self.local_network = Network(name=name)

        self.local_network.prepare_loss(FLAGS.entropy_beta)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.local_network.vars = self.trainer.prepare_minimize(
            self.local_network.total_loss, self.local_network.get_train_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_train_vars(),
            self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        #if USE_ALE:
        #    self.game_state = GameState(113 * thread_index)
        #else:
        self.game = gym.make('Lis-v2')
        self.game.configure(str(5000 + thread_index))
        # game initialization
        # observation = env.reset()
        self.observation, reward, end_episode, _ = self.game.step(1)
        #self.observation = self.preprocess([self.observation])
        self.history = [self.rgb2gray(self.observation)
                        for _ in range(4)]  #FLAGS.history_frames
        self.observation = np.dstack(self.history)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # variable controlling log output
        self.prev_local_t = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1
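
    # Editor's note (not in the original): choose_action above is inverse-CDF
    # sampling over pi_values; with NumPy it is roughly equivalent to
    # np.random.choice(len(pi_values), p=pi_values / np.sum(pi_values)).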

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)

    def set_start_time(self, start_time):
        self.start_time = start_time

    def rgb2gray(self, rgb, i=0):
        if FLAGS.save_frames:
            if self.thread_index == 0 and len(
                    os.listdir(os.path.join(FLAGS.model_dir,
                                            "images"))) < 1000:
                scipy.misc.imsave(
                    "%s/%i.png" % (os.path.join(FLAGS.model_dir, "images"), i),
                    rgb["image"][0])

        img = np.asarray(rgb["image"][0])[..., :3]
        img = np.dot(img, [0.299, 0.587, 0.114])
        img = scipy.misc.imresize(img, (84, 84)) / 255.0
        #flip H
        #
        #img = np.fliplr(img)

        return img
        #return -np.dot(img, [0.299, 0.587, 0.114]) / 255.0 + 1.0

    def preprocess(self, frames, name=0):
        if len(frames) == 1:
            gray = self.rgb2gray(frames[0])
            return np.dstack([gray, gray, gray, gray])

        return np.dstack([self.rgb2gray(frame) for frame in frames])

    def action2string(self, action):
        moveX, moveZ, turn = 0, 0, 0
        """if action == 0:
            moveX = -10
        elif action == 1:
            moveX = 10
        elif action == 2:
            moveZ = -10
        elif action == 3:
            moveZ = 10
        elif action == 4:
            turn = 10
        elif action == 5:
            turn = -10
        elif action == 6:
            pass"""
        if action == 0:
            turn = -10
        elif action == 1:
            turn = 10
        elif action == 2:
            moveZ = 10
        elif action == 3:
            pass

        return "%s %s %s" % (moveX, moveZ, turn)

    def get_frame(self, index):
        if index > len(self.history):
            return self.history[-1]
        else:
            return self.history[-index]

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        #if USE_LSTM:
        #    start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(FLAGS.local_t_max):
            #if USE_ALE:
            #    pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
            #else:
            pi_, value_ = self.local_network.run_policy_and_value(
                sess, self.observation)

            #if self.thread_index == 0:
            #print(pi_)
            #cv2.namedWindow("img", cv2.WINDOW_NORMAL)
            #cv2.imshow("img", self.observation)
            #cv2.waitKey(1)
            """if self.thread_index == 0 and len(os.listdir(os.path.join(FLAGS.model_dir, "images"))) < 1000:
                ft = sess.run(self.local_network.col_hiddens[0][0], feed_dict={self.local_network.s: [self.observation]})
                print(ft.shape)

                scipy.misc.imsave("%s/%i-obs.png" % (os.path.join(FLAGS.model_dir, "images"), global_t + i),
                                  self.observation[:, :, 3])

                for m in range(8):
                    img = ft[0, :, :, m]
                    img = img - np.amin(img)
                    img /= np.amax(img)
                    img *= 255.0
                    scipy.misc.imsave("%s/%i-feature-%i.png" % (os.path.join(FLAGS.model_dir, "images"), global_t + i, m),
                                      img)
"""

            action = self.choose_action(pi_)

            states.append(self.observation)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            #if USE_ALE:
            #self.game_state.process(action)
            #reward = self.game_state.reward
            #end_episode = self.game_state.terminal
            #else:

            #for i in range(FLAGS.skip_frames):
            new_obs, reward, end_episode, _ = self.game.step(
                self.action2string(action))

            if len(self.history) > 10:
                del self.history[0]

            self.history.append(self.rgb2gray(
                new_obs,
                global_t + self.local_t))  #, "%i-a%i" % (global_t, action)

            def create_history():
                return np.dstack([
                    self.get_frame(1),
                    self.get_frame(2),
                    self.get_frame(3),
                    self.get_frame(4)
                ])

            new_observation = create_history()

            # process game
            #self.game_state.process(action)

            # receive game result
            #reward = self.game_state.reward
            terminal = end_episode  #self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            #if USE_ALE:
            # s_t1 -> s_t
            #    self.game_state.update()
            #else:

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                self.episode_reward = 0

                #if USE_ALE:
                self.game.reset()
                #else:
                #self.history = [self.rgb2gray(self.game.step(0))]
                #self.observation = create_history()
                #if USE_LSTM:
                #    self.local_network.reset_state()
                break
            else:
                self.observation = new_observation

        R = 0.0
        if not terminal_end:
            #if USE_ALE:
            #    R = self.local_network.run_value(sess, self.game_state.s_t)
            #else:
            R = self.local_network.run_value(sess, self.observation)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + FLAGS.gamma * R
            td = R - Vi
            a = np.zeros([FLAGS.action_size])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        sess.run(self.accum_gradients,
                 feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R
                 })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print(
                "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
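
# Editor's sketch (not part of the original snippets): the rgb2gray/preprocess
# methods above build the network input by converting each RGB frame to
# grayscale and stacking the last four frames (resizing to 84x84 omitted here).
# A minimal NumPy version:
import numpy as np

def to_gray(frame_rgb):
    # frame_rgb: H x W x 3 array; returns H x W grayscale scaled to [0, 1]
    return np.dot(frame_rgb[..., :3], [0.299, 0.587, 0.114]) / 255.0

def stack_history(history):
    # history: list of grayscale frames; the state is the last four, depth-stacked
    return np.dstack(history[-4:])
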
class A3CTrainingThread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device,
                 environment):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        # self.local_network = GameACNetwork(ACTION_SIZE, device)

        self.local_network = global_network.structural_clone(network_name="thread-net-%s" % self.thread_index)

        self.local_network.prepare_loss(ENTROPY_BETA)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients, self.grad_summary_op = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        # self.game_state = GameState(113 * thread_index)
        self.game_state = GymGameState(113 * thread_index, env=environment)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        self.lstm_last_output_state = None          # cache last lstm hidden states here

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
        self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    # def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    #     summary_str = sess.run(summary_op, feed_dict={
    #         score_input: score,
    #     })
    #     summary_writer.add_summary(summary_str, global_t)

    """next steps
      x init the lstm state before process is called, somewhere

      x reinit lstm state after terminal episodes

      ?!? allow lstm state to persist even after global weights are copied (i guess)

      x feed state in to lstm during policy evals
      how does state work in gradient backups?



        Tests:

            - inspect lstm state inputs, outputs, and episode stored values
            -

    """

    def reset(self):

        # TODO: any other state to clean up?
        # could have been absorbed into a check for validity of the game state,
        # but that's kind of magic-y and icky for this scenario
        self.game_state.reset()

    def process(self, sess, global_t, summary_writer, record_score_fn):  #summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []
        lstm_states = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        # write weight summaries...only need them from one thread really
        if (self.thread_index == 0):
            param_summary = sess.run(self.local_network.param_summary_op)
            summary_writer.add_summary(param_summary, global_step=global_t)

        start_local_t = self.local_t

        # resume wherever we left off last time through the action loop
        # TODO: no reason the network itself currently should care about this

        if (self.lstm_last_output_state is None):
            self.lstm_last_output_state = self.local_network.lstm_initial_state_value

        lstm_state = self.lstm_last_output_state

        # lstm_state = self.local_network.lstm_last_output_state_value

        # t_max times loop
        for i in range(LOCAL_T_MAX):

            states.append(self.game_state.s_t)
            lstm_states.append(lstm_state)

            pi_, value_, lstm_state = self.local_network.run(sess, self.game_state.s_t,
                                                             lstm_state)

            action = self.local_network.sample_action(pi_)

            # print "a3c train: pi_: ", pi_
            # print "a3c train: action: ", action
            # pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            # action = choose_action(pi_)  # self.choose_action(pi_)

            actions.append(action)
            # value_ = self.local_network.run_value(sess, self.game_state.s_t)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print "pi=", pi_
                print " V=", value_

            # process game
            self.game_state.process(action)
            # s_t1 -> s_t
            self.game_state.update() # not sure why this is separate...

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            rewards.append(np.clip(reward, -1, 1000))

            self.local_t += 1

            if terminal:
                terminal_end = True
                print "terminal score =", self.episode_reward

                # self._record_score(sess, summary_writer, summary_op, score_input,
                #                    self.episode_reward, global_t)

                record_score_fn(sess, summary_writer, self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()

                #  ugh. reset lstm state!
                lstm_state = self.local_network.lstm_initial_state_value

                break

        R = 0.0
        if not terminal_end:
            # R = self.local_network.run_value(sess, self.game_state.s_t)

            _, R, _ = self.local_network.run(sess, self.game_state.s_t, lstm_state)

        # self.local_network.lstm_last_output_state_value = lstm_state # preserve for next time through the loop
        self.lstm_last_output_state = lstm_state

        #  TODO: can't store the lists I pass directly since they'll be destructively reversed by
        # this call... hmm
        # maybe just reverse them here and leave it?
        #  start by copying the lists
        self.backup_and_accum_gradients(sess, global_t, summary_writer,
                                        states=states,
                                        lstm_states=lstm_states,
                                        actions=actions,
                                        values=values,
                                        rewards=rewards,
                                        final_reward_estimate=R)


        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print("TIMESTEP %d GLOBAL %d" % (self.local_t, global_t))

        # return the number of local steps advanced
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
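
    # Editor's note (hypothetical driver code, not part of this example): process()
    # returns the number of local steps taken so that a caller can advance a shared
    # global counter, roughly:
    #
    #     while global_t < MAX_TIME_STEP:
    #         diff_global_t = thread.process(sess, global_t, summary_writer,
    #                                        record_score_fn)
    #         global_t += diff_global_t
    #
    # MAX_TIME_STEP and the surrounding loop are assumptions about the caller, not
    # code taken from this example.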

    def backup_and_accum_gradients(self, sess, global_t, summary_writer,
                                   states, lstm_states, actions, values, rewards,
                                   final_reward_estimate):
        """ inputs are lists reflecting a recorded episode fragment in the order they occured

            a = sample{ pi(a | s, lstm_s ) }
            v = V(s, lstms)
            r = env.step(a)

        :param states: states
        :param actions:
        :param rewards:
        :param lstm_states:
        :return:
        """


        # TODO: copy these and leave the originals alone...
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()
        lstm_states.reverse()

        R = final_reward_estimate

        # compute and accumulate gradients
        for (ai, ri, si, Vi, lstm_si) in zip(actions, rewards, states, values, lstm_states):
            R = ri + GAMMA * R
            td = R - Vi

            a = self.local_network.feedback_action(ai)
            # a = np.zeros([self.local_network.action_size])
            # a[ai] = 1

            # reshape state input
            # no batching for now


            _, loss_summary = sess.run([self.accum_gradients, self.local_network.loss_summary_op],
                                       feed_dict=self.local_network.loss_feed_dictionary(si, a, td, R, lstm_si)
                                       # feed_dict={
                                       #     self.local_network.s: [si],
                                       #     self.local_network.a: [a],
                                       #     self.local_network.td: [td],
                                       #     self.local_network.r: [R],
                                       #     self.local_network.lstm_current_state_tensor: lstm_si
                                       # }
                                       )

            if (self.thread_index == 0):
                summary_writer.add_summary(loss_summary, global_step=global_t)


        """ idea: maybe possible to do n-step TBPTT after having retroactively computed R for each state
        feed in batches of size up to n_max to a set of parallel networks with 
        
        
        idea: set up the lstm with say 5 recursive calls. then the initial inputs would need to be padded...maybe?
        would work if made the inputs in batches and altered iteration logic to cycle inputs through the history...
        

        """

        cur_learning_rate = self._anneal_learning_rate(global_t)

        _, grad_summary = sess.run([self.apply_gradients, self.grad_summary_op],
                                   feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0):
            summary_writer.add_summary(grad_summary, global_step=global_t)


    #  TODO: rename the 'states' variable to 'observations' in the next version just to be crystal clear
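
    # Editor's sketch (not in the original code): the reversed loop in
    # backup_and_accum_gradients computes n-step discounted returns
    # R_t = r_t + GAMMA * R_{t+1}, bootstrapped from final_reward_estimate, and TD
    # errors td_t = R_t - V(s_t).  The same quantities, computed without mutating
    # the input lists:
    def compute_returns_and_td(self, rewards, values, final_reward_estimate):
        R = final_reward_estimate
        returns = [0.0] * len(rewards)
        for t in reversed(range(len(rewards))):
            R = rewards[t] + GAMMA * R           # n-step discounted return
            returns[t] = R
        tds = [Rt - Vt for (Rt, Vt) in zip(returns, values)]
        return returns, tds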


    def process_memory(self, sess, global_t, summary_writer,
                       states, initial_lstm_state, actions, rewards, final_state):
        """
        :param sess:
        :param global_t:
        :param summary_writer:
        :param states:
        :param initial_lstm_state:
        :param actions:
        :param rewards:
        :param final_state:         observation after the last game step...use None to signal terminal, otherwise used
         to compute the final bootstrap Value
        :return:
        """

        # TODO: gotcha initial_lstm_state must be set carefully
        # if the episode reflects t=0, the state is always known
        # otherwise how can we know what the lstm state output of the *current* policy might plausibly have been
        # unless the same policy was executed from the very beginning of the historical episode and propagated
        # we could just record the lstm_state prior to the beginning of the history episode as an approximation
        # we might expect it to converge reasonably after a number of steps to something from the plausible distribution
        # for the current policy...however, over time, the policy will drift away further and further from what
        # created the original lstm_state
        # this suggests the solution that we update the stored initial lstm state in the replay memory after every refresh
        # ...almost like a real memory trace in a human brain might...
        # but how can we update it ????
        # maybe keep one state in reserve just to prime...but then we can only update the lstm state after it, not the one
        # that initial state needs....HMMM. maybe just
        #
        # for certain environments we could just apply the network to s_t+0 repeatedly until the lstm state converges
        # this works if the problem and/or env don't depend on any direct measure of time...perhaps
        #
        # easiest solution might just be to always reference the episodes to t=0,
        # or just ignore the first k states when backing up and computing gradients...since presumably we'll have
        # converged to something reasonable by that point (a small sketch of this burn-in idea appears after this
        # example)



        values = []
        lstm_states = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)
        lstm_state = initial_lstm_state

        for (s_t, a_t, r_t) in zip(states, actions, rewards):

            # accum lstm states
            lstm_states.append(lstm_state)

            pi_, value_, lstm_state = self.local_network.run(sess, s_t, lstm_state)

            # get values
            values.append(value_)
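
    # Editor's sketch (not part of the original example): one way to realize the
    # burn-in idea from the comment at the top of process_memory -- re-prime the
    # LSTM state for a replayed fragment by running the current network over the
    # first `burn_in` stored observations and excluding those steps from the
    # gradient pass.  `states` and `initial_lstm_state` mean the same as in
    # process_memory; `burn_in` is a hypothetical parameter.
    def burn_in_lstm_state(self, sess, states, initial_lstm_state, burn_in):
        lstm_state = initial_lstm_state
        for s_t in states[:burn_in]:
            # forward passes only; nothing is accumulated into the gradients here
            _, _, lstm_state = self.local_network.run(sess, s_t, lstm_state)
        # train on the remaining steps, starting from the re-primed state
        return states[burn_in:], lstm_state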
Example #18
0
class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0


  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate

  def choose_action(self, pi_values):
    values = []
    sum = 0.0
    for rate in pi_values:
      sum = sum + rate
      value = sum
      values.append(value)
    
    r = random.random() * sum
    for i in range(len(values)):
      if values[i] >= r:
        return i
    #fail safe
    return len(values)-1

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # reset accumulated gradients
    sess.run( self.reset_gradients )

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t
    
    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      value_ = self.local_network.run_value(sess, self.game_state.s_t)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % 100) == 0:
        print "pi=", pi_
        print " V=", value_

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      # clip reward
      rewards.append( np.clip(reward, -1, 1) )

      self.local_t += 1

      # s_t1 -> s_t
      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print "score=", self.episode_reward

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        self.episode_reward = 0
        self.game_state.reset()
        break

    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    # compute and accumulate gradients
    for(ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      sess.run( self.accum_gradients,
                feed_dict = {
                  self.local_network.s: [si],
                  self.local_network.a: [a],
                  self.local_network.td: [td],
                  self.local_network.r: [R]} )
      
    cur_learning_rate = self._anneal_learning_rate(global_t)

    sess.run( self.apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate } )

    if (self.thread_index == 0) and (self.local_t % 100) == 0:
      print "TIMESTEP", self.local_t

    # return the number of local steps advanced
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
class A3CTrainingThread(object):
  def __init__(self, thread_index, global_network, initial_learning_rate,
               learning_rate_input,
               policy_applier, value_applier,
               max_global_time_step):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # policy
    self.policy_trainer = AccumTrainer()
    self.policy_trainer.prepare_minimize( self.local_network.policy_loss,
                                          self.local_network.get_policy_vars(),
                                          GRAD_NORM_CLIP )
    
    self.policy_accum_gradients = self.policy_trainer.accumulate_gradients()
    self.policy_reset_gradients = self.policy_trainer.reset_gradients()
  
    self.policy_apply_gradients = policy_applier.apply_gradients(
        global_network.get_policy_vars(),
        self.policy_trainer.get_accum_grad_list() )

    # value
    self.value_trainer = AccumTrainer()
    self.value_trainer.prepare_minimize( self.local_network.value_loss,
                                         self.local_network.get_value_vars(),
                                         GRAD_NORM_CLIP )
    self.value_accum_gradients = self.value_trainer.accumulate_gradients()
    self.value_reset_gradients = self.value_trainer.reset_gradients()
  

    self.value_apply_gradients = value_applier.apply_gradients(
        global_network.get_value_vars(),
        self.value_trainer.get_accum_grad_list() )
    
    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # thread0 will record score for TensorBoard
    if self.thread_index == 0:
      self.score_input = tf.placeholder(tf.int32)
      tf.scalar_summary("score", self.score_input)

  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate

  def choose_action(self, pi_values):
    values = []
    sum = 0.0
    for rate in pi_values:
      sum = sum + rate
      value = sum
      values.append(value)
    
    r = random.random() * sum
    for i in range(len(values)):
      if values[i] >= r:
        return i
    #fail safe
    return len(values)-1

  def _record_score(self, sess, summary_writer, summary_op, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      self.score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def process(self, sess, global_t, summary_writer, summary_op):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # reset accumulated gradients
    sess.run( self.policy_reset_gradients )
    sess.run( self.value_reset_gradients )

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t
    
    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      value_ = self.local_network.run_value(sess, self.game_state.s_t)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % 100) == 0:
        print "pi=", pi_
        print " V=", value_

      # run the game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      rewards.append(reward)

      self.local_t += 1

      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print "score=", self.episode_reward

        if self.thread_index == 0:        
          self._record_score(sess, summary_writer, summary_op, self.episode_reward, global_t)
          
        self.episode_reward = 0
        break

    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    # compute and accumulate gradients
    for(ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      sess.run( self.policy_accum_gradients,
                feed_dict = {
                    self.local_network.s: [si],
                    self.local_network.a: [a],
                    self.local_network.td: [td] } )
      
      sess.run( self.value_accum_gradients,
                feed_dict = {
                    self.local_network.s: [si],
                    self.local_network.r: [R] } )

    cur_learning_rate = self._anneal_learning_rate(global_t)

    sess.run( self.policy_apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate } )
    # Learning rate for Critic is half of Actor's
    sess.run( self.value_apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate * 0.5 } )

    if (self.thread_index == 0) and (self.local_t % 100) == 0:
      print "TIMESTEP", self.local_t

    # return the number of local steps advanced
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
Example #20
0
class A3CTrainingThread(object):
    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if NETWORK_TYPE == 'LSTM':
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        elif NETWORK_TYPE == 'DILATED':
            self.local_network = GameACDilatedNetwork(ACTION_SIZE, device)
        elif NETWORK_TYPE == 'CONV':
            self.local_network = GameACFFNetwork(ACTION_SIZE, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize( self.local_network.total_loss,
                                       self.local_network.get_vars() )

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
          global_network.get_vars(),
          self.trainer.get_accum_grad_list() )

        self.sync = self.local_network.sync_from(global_network)




        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0


    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={
          score_input: score
        })
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset accumulated gradients
        sess.run( self.reset_gradients )

        # copy weights from shared to local
        sess.run( self.sync )

        start_local_t = self.local_t

        if NETWORK_TYPE == 'LSTM':
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
            action = choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print(('local_t = {:10}  pi = ' + '{:7.5f} '*len(pi_) + ' V = {:8.4f} (thread {})').format(self.local_t,
                    *pi_, value_, self.thread_index))

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            rewards.append( np.clip(reward, -1, 1) )

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                print ("score=", self.episode_reward)

                self._record_score(sess, summary_writer, summary_op, score_input,
                                   self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                if NETWORK_TYPE == 'LSTM':
                    self.local_network.reset_state()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        # compute and accumulate gradients
        for(ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        if NETWORK_TYPE == 'LSTM':
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()



            sess.run( self.accum_gradients,
                      feed_dict = {
                        self.local_network.s: batch_si,
                        self.local_network.a: batch_a,
                        self.local_network.td: batch_td,
                        self.local_network.r: batch_R,
                        self.local_network.initial_lstm_state: start_lstm_state,
                        self.local_network.step_size : [len(batch_a)] } )
        else:
            sess.run( self.accum_gradients,
                      feed_dict = {
                        self.local_network.s: batch_si,
                        self.local_network.a: batch_a,
                        self.local_network.td: batch_td,
                        self.local_network.r: batch_R} )

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run( self.apply_gradients,
                  feed_dict = { self.learning_rate_input: cur_learning_rate } )

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print ("TIMESTEP", self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
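
# Editor's sketch (not from the original example): the LSTM branch above builds
# batch_si/batch_a/batch_td/batch_R while walking the rollout in reverse so the
# return R can be accumulated, then reverses the batches back into time order so
# the recurrent network unrolls forward from start_lstm_state with
# step_size = len(batch_a).  A compact restatement of that bookkeeping, where
# `rollout` is a hypothetical list of (s, a, r, V) tuples in time order and R is
# the bootstrap value (0.0 when the episode ended):
import numpy as np

def build_lstm_training_batch(rollout, R, gamma, action_size):
    batch_si, batch_a, batch_td, batch_R = [], [], [], []
    for (si, ai, ri, Vi) in reversed(rollout):   # reverse pass accumulates R
        R = ri + gamma * R
        a = np.zeros([action_size])
        a[ai] = 1
        batch_si.append(si)
        batch_a.append(a)
        batch_td.append(R - Vi)
        batch_R.append(R)
    for b in (batch_si, batch_a, batch_td, batch_R):
        b.reverse()                              # restore chronological order
    return batch_si, batch_a, batch_td, batch_R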
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 max_global_time_step):

        self.thread_index = thread_index
        self.learning_rate_input = tf.placeholder("float")
        self.max_global_time_step = max_global_time_step

        self.local_network = GameACNetwork(ACTION_SIZE)
        self.local_network.prepare_loss(ENTROPY_BETA)

        # policy
        self.policy_trainer = AccumTrainer()
        self.policy_trainer.prepare_minimize(
            self.local_network.policy_loss,
            self.local_network.get_policy_vars())
        self.policy_accum_gradients = self.policy_trainer.accumulate_gradients(
        )
        self.policy_reset_gradients = self.policy_trainer.reset_gradients()

        self.policy_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.policy_apply_gradients = self.policy_applier.apply_gradients(
            global_network.get_policy_vars(),
            self.policy_trainer.get_accum_grad_list())

        # value
        self.value_trainer = AccumTrainer()
        self.value_trainer.prepare_minimize(
            self.local_network.value_loss, self.local_network.get_value_vars())
        self.value_accum_gradients = self.value_trainer.accumulate_gradients()
        self.value_reset_gradients = self.value_trainer.reset_gradients()

        self.value_applier = RMSPropApplier(
            learning_rate=self.learning_rate_input,
            decay=0.99,
            momentum=0.0,
            epsilon=RMSP_EPSILON)
        self.value_apply_gradients = self.value_applier.apply_gradients(
            global_network.get_value_vars(),
            self.value_trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        # thread0 will record score for TensorBoard
        if self.thread_index == 0:
            self.score_input = tf.placeholder(tf.int32)
            tf.scalar_summary("score", self.score_input)

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        #fail safe
        return len(values) - 1

    def _record_score(self, sess, summary_writer, summary_op, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={self.score_input: score})
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.policy_reset_gradients)
        sess.run(self.value_reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        start_local_t = self.local_t

        # loop 5 times (LOCAL_T_MAX)
        for i in range(LOCAL_T_MAX):
            pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            action = self.choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            value_ = self.local_network.run_value(sess, self.game_state.s_t)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print "pi=", pi_
                print " V=", value_

            # run the game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            rewards.append(reward)

            self.local_t += 1

            self.game_state.update()

            if terminal:
                terminal_end = True
                print "score=", self.episode_reward

                if self.thread_index == 0:
                    self._record_score(sess, summary_writer, summary_op,
                                       self.episode_reward, global_t)

                self.episode_reward = 0
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # compute and accumulate gradients
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            sess.run(self.policy_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.a: [a],
                         self.local_network.td: [td]
                     })

            sess.run(self.value_accum_gradients,
                     feed_dict={
                         self.local_network.s: [si],
                         self.local_network.r: [R]
                     })

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.policy_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})
        sess.run(self.value_apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print "TIMESTEP", self.local_t

        # return the number of local steps advanced
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0

  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate

  def choose_action(self, pi_values):
    values = []
    sum = 0.0
    for rate in pi_values:
      sum = sum + rate
      value = sum
      values.append(value)
    
    r = random.random() * sum
    for i in range(len(values)):
      if values[i] >= r:
        return i
    #fail safe
    return len(values)-1

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def set_start_time(self, start_time):
    self.start_time = start_time

  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # reset accumulated gradients
    sess.run( self.reset_gradients )

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t

    if USE_LSTM:
      start_lstm_state = self.local_network.lstm_state_out
    
    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
        print("pi={}".format(pi_))
        print(" V={}".format(value_))

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      # clip reward
      rewards.append( np.clip(reward, -1, 1) )

      self.local_t += 1

      # s_t1 -> s_t
      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print("score={}".format(self.episode_reward))

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        self.episode_reward = 0
        self.game_state.reset()
        if USE_LSTM:
          self.local_network.reset_state()
        break

    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_td = []
    batch_R = []

    # compute and accumulate gradients
    for(ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      batch_si.append(si)
      batch_a.append(a)
      batch_td.append(td)
      batch_R.append(R)

    if USE_LSTM:
      batch_si.reverse()
      batch_a.reverse()
      batch_td.reverse()
      batch_R.reverse()

      sess.run( self.accum_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R,
                  self.local_network.initial_lstm_state: start_lstm_state,
                  self.local_network.step_size : [len(batch_a)] } )
    else:
      sess.run( self.accum_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R} )
      
    cur_learning_rate = self._anneal_learning_rate(global_t)

    sess.run( self.apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate } )

    if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
      self.prev_local_t += PERFORMANCE_LOG_INTERVAL
      elapsed_time = time.time() - self.start_time
      steps_per_sec = global_t / elapsed_time
      print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
        global_t,  elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

    # return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,
               game_function=ale_game_state, 
               local_network=None):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.local_network = local_network()

    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = game_function(thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0

  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate

  def choose_action(self, pi_values):
    values = []
    sum = 0.0
    for rate in pi_values:
      sum = sum + rate
      value = sum
      values.append(value)
    
    r = random.random() * sum
    for i in range(len(values)):
      if values[i] >= r:
        return i
    #fail safe
    return len(values)-1

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def set_start_time(self, start_time):
    self.start_time = start_time

  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # reset accumulated gradients
    sess.run( self.reset_gradients )

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t

    if USE_LSTM:
      start_lstm_state = self.local_network.lstm_state_out
    
    # t_max times loop
    for i in range(LOCAL_T_MAX):
      pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
        print("state={}".format(self.game_state.s_t))
        print("pi={}".format(pi_))
        print(" V={}".format(value_))

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      # clip reward
      rewards.append( np.clip(reward, -1, 1) )

      self.local_t += 1

      # s_t1 -> s_t
      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print("score={}".format(self.episode_reward))

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        self.episode_reward = 0
        self.game_state.reset()
        if USE_LSTM:
          self.local_network.reset_state()
        break

    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_td = []
    batch_R = []

    # compute and accumulate gradients
    for(ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      batch_si.append(si)
      batch_a.append(a)
      batch_td.append(td)
      batch_R.append(R)

    if USE_LSTM:
      batch_si.reverse()
      batch_a.reverse()
      batch_td.reverse()
      batch_R.reverse()

      sess.run( self.accum_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R,
                  self.local_network.initial_lstm_state: start_lstm_state,
                  self.local_network.step_size : [len(batch_a)] } )
    else:
      sess.run( self.accum_gradients,
                feed_dict = {
                  self.local_network.s: batch_si,
                  self.local_network.a: batch_a,
                  self.local_network.td: batch_td,
                  self.local_network.r: batch_R} )
      
    cur_learning_rate = self._anneal_learning_rate(global_t)

    sess.run( self.apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate } )

    if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL):
      self.prev_local_t += PERFORMANCE_LOG_INTERVAL
      elapsed_time = time.time() - self.start_time
      steps_per_sec = global_t / elapsed_time
      print("### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour".format(
        global_t,  elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.))

    # return advanced local step size
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
class A3CTrainingThread(object):

    def __init__(self,
                 thread_index,
                 global_network,
                 initial_learning_rate,
                 learning_rate_input,
                 grad_applier,
                 max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
        else:
            self.local_network = GameACFFNetwork(ACTION_SIZE, device)

        self.local_network.prepare_loss(ENTROPY_BETA)

        # TODO: don't need accum trainer anymore with batch
        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(),
            self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        self.game_state = GameState(113 * thread_index)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * \
            (self.max_global_time_step - global_time_step) / \
             self.max_global_time_step
        assert learning_rate > 0, 'Learning rate {} is not >0'.format(
            learning_rate)
        return learning_rate

    def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
        summary_str = sess.run(summary_op, feed_dict={
                               score_input: score
                               })
        summary_writer.add_summary(summary_str, global_t)

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        if USE_LSTM:
            start_lstm_state = self.local_network.lstm_state_out

        # t_max times loop
        start_local_t = self.local_t
        terminal_end = False
        for i in range(LOCAL_T_MAX):
            pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)
            action = choose_action(pi_)

            states.append(self.game_state.s_t)
            actions.append(action)
            values.append(value_)

            # Debug output for progress
            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print(('local_t = {:10}  pi = ' + '{:7.5f} ' * len(pi_) + ' V = {:8.4f} (thread {})').format(self.local_t,
                                                                                                             *pi_, value_, self.thread_index))

            # process game
            self.game_state.process(action)

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            # TODO: Does this make sense?
            rewards.append(np.clip(reward, -1, 1))

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                print ("score=", self.episode_reward)

                self._record_score(
                    sess, summary_writer, summary_op, score_input,
                    self.episode_reward, global_t)

                self.episode_reward = 0
                self.game_state.reset()
                if USE_LSTM:
                    self.local_network.reset_state()
                break

        # Compute and accumulate gradients

        R = 0.0 if terminal_end else self.local_network.run_value(sess, self.game_state.s_t)

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # What is the meaning of these values?
        batch_si = []
        batch_a = []
        batch_td = []
        batch_R = []

        for(ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            a = np.zeros([ACTION_SIZE])
            a[ai] = 1

            batch_si.append(si)
            batch_a.append(a)
            batch_td.append(td)
            batch_R.append(R)

        if USE_LSTM:
            batch_si.reverse()
            batch_a.reverse()
            batch_td.reverse()
            batch_R.reverse()

            sess.run(self.accum_gradients,
                     feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R,
                     self.local_network.initial_lstm_state: start_lstm_state,
                     self.local_network.step_size: [len(batch_a)]})
        else:
            sess.run(self.accum_gradients,
                     feed_dict={
                     self.local_network.s: batch_si,
                     self.local_network.a: batch_a,
                     self.local_network.td: batch_td,
                     self.local_network.r: batch_R})

        cur_learning_rate = self._anneal_learning_rate(global_t)

        sess.run(self.apply_gradients,
                 feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print ("TIMESTEP", self.local_t)

        # return advanced local step size
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
Example #25
0
class A3CTrainingThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, max_global_time_step,
                 device, environment):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        # self.local_network = GameACNetwork(ACTION_SIZE, device)

        self.local_network = global_network.structural_clone(
            network_name="thread-net-%s" % self.thread_index)

        self.local_network.prepare_loss(ENTROPY_BETA)

        self.trainer = AccumTrainer(device)
        self.trainer.prepare_minimize(self.local_network.total_loss,
                                      self.local_network.get_vars())

        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        self.apply_gradients, self.grad_summary_op = grad_applier.apply_gradients(
            global_network.get_vars(), self.trainer.get_accum_grad_list())

        self.sync = self.local_network.sync_from(global_network)

        # self.game_state = GameState(113 * thread_index)
        self.game_state = GymGameState(113 * thread_index, env=environment)

        self.local_t = 0

        self.initial_learning_rate = initial_learning_rate

        self.episode_reward = 0

        self.lstm_last_output_state = None  # cache last lstm hidden states here

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        values = []
        sum = 0.0
        for rate in pi_values:
            sum = sum + rate
            value = sum
            values.append(value)

        r = random.random() * sum
        for i in range(len(values)):
            if values[i] >= r:
                return i
        # fail safe
        return len(values) - 1

    # def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    #     summary_str = sess.run(summary_op, feed_dict={
    #         score_input: score,
    #     })
    #     summary_writer.add_summary(summary_str, global_t)
    """next steps
      x init the lstm state before process is called, somewhere

      x reinit lstm state after terminal episodes

      ?!? allow lstm state to persist even after global weights are copied (i guess)

      x feed state in to lstm during policy evals
      how does state work in gradient backups?



        Tests:

            - inspect lstm state inputs, outputs, and episode stored values
            -

    """

    def reset(self):

        # todo: any other states to clean up??
        # could have been absorbed into a check for validity of the game state...
        # but that's kind of magic-y and icky for this scenario
        self.game_state.reset()

    def process(self, sess, global_t, summary_writer,
                record_score_fn):  #summary_op, score_input):
        states = []
        actions = []
        rewards = []
        values = []
        lstm_states = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)

        # write weight summaries...only need them from one thread really
        if (self.thread_index == 0):
            param_summary = sess.run(self.local_network.param_summary_op)
            summary_writer.add_summary(param_summary, global_step=global_t)

        start_local_t = self.local_t

        # resume with wherever we left off on last time through the action loop
        # TODO: no reason the network itself should currently care about this

        if (self.lstm_last_output_state is None):
            self.lstm_last_output_state = self.local_network.lstm_initial_state_value

        lstm_state = self.lstm_last_output_state

        # lstm_state = self.local_network.lstm_last_output_state_value

        # t_max times loop
        for i in range(LOCAL_T_MAX):

            states.append(self.game_state.s_t)
            lstm_states.append(lstm_state)

            pi_, value_, lstm_state = self.local_network.run(
                sess, self.game_state.s_t, lstm_state)

            action = self.local_network.sample_action(pi_)

            # print "a3c train: pi_: ", pi_
            # print "a3c train: action: ", action
            # pi_ = self.local_network.run_policy(sess, self.game_state.s_t)
            # action = choose_action(pi_)  # self.choose_action(pi_)

            actions.append(action)
            # value_ = self.local_network.run_value(sess, self.game_state.s_t)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % 100) == 0:
                print "pi=", pi_
                print " V=", value_

            # process game
            self.game_state.process(action)
            # s_t1 -> s_t
            self.game_state.update()  # not sure why this is separate...

            # receive game result
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward

            # clip reward
            rewards.append(np.clip(reward, -1, 1000))

            self.local_t += 1

            if terminal:
                terminal_end = True
                print "terminal score =", self.episode_reward

                # self._record_score(sess, summary_writer, summary_op, score_input,
                #                    self.episode_reward, global_t)

                record_score_fn(sess, summary_writer, self.episode_reward,
                                global_t)

                self.episode_reward = 0
                self.game_state.reset()

                #  ugh. reset lstm state!
                lstm_state = self.local_network.lstm_initial_state_value

                break

        R = 0.0
        if not terminal_end:
            # R = self.local_network.run_value(sess, self.game_state.s_t)

            _, R, _ = self.local_network.run(sess, self.game_state.s_t,
                                             lstm_state)

        # self.local_network.lstm_last_output_state_value = lstm_state # preserve for next time through the loop
        self.lstm_last_output_state = lstm_state

        #  TODO: can't store the lists I pass directly since they'll be destructively reversed by
        # this call... hmm
        # maybe just reverse them here and leave it?
        #  start by copying the lists
        self.backup_and_accum_gradients(sess,
                                        global_t,
                                        summary_writer,
                                        states=states,
                                        lstm_states=lstm_states,
                                        actions=actions,
                                        values=values,
                                        rewards=rewards,
                                        final_reward_estimate=R)

        if (self.thread_index == 0) and (self.local_t % 100) == 0:
            print("TIMESTEP %d GLOBAL %d" % (self.local_t, global_t))

        # return the number of local steps advanced
        diff_local_t = self.local_t - start_local_t
        return diff_local_t

    def backup_and_accum_gradients(self, sess, global_t, summary_writer,
                                   states, lstm_states, actions, values,
                                   rewards, final_reward_estimate):
        """ inputs are lists reflecting a recorded episode fragment in the order they occured

            a = sample{ pi(a | s, lstm_s ) }
            v = V(s, lstms)
            r = env.step(a)

        :param states: states
        :param actions:
        :param rewards:
        :param lstm_states:
        :return:
        """

        # TODO: copy these and leave the originals alone...
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()
        lstm_states.reverse()

        R = final_reward_estimate

        # compute and accumulate gradients
        for (ai, ri, si, Vi, lstm_si) in zip(actions, rewards, states, values,
                                             lstm_states):
            R = ri + GAMMA * R
            td = R - Vi

            a = self.local_network.feedback_action(ai)
            # a = np.zeros([self.local_network.action_size])
            # a[ai] = 1

            # reshape state input
            # no batching for now

            _, loss_summary = sess.run(
                [self.accum_gradients, self.local_network.loss_summary_op],
                feed_dict=self.local_network.loss_feed_dictionary(
                    si, a, td, R, lstm_si)
                # feed_dict={
                #     self.local_network.s: [si],
                #     self.local_network.a: [a],
                #     self.local_network.td: [td],
                #     self.local_network.r: [R],
                #     self.local_network.lstm_current_state_tensor: lstm_si
                # }
            )

            if (self.thread_index == 0):
                summary_writer.add_summary(loss_summary, global_step=global_t)
        """ idea: maybe possible to do n-step TBPTT after having retroactively computed R for each state
        feed in batches of size up to n_max to a set of parallel networks with 
        
        
        idea: set up the lstm with say 5 recursive calls. then the initial inputs would need to be padded...maybe?
        would work if made the inputs in batches and altered iteration logic to cycle inputs through the history...
        

        """

        cur_learning_rate = self._anneal_learning_rate(global_t)

        _, grad_summary = sess.run(
            [self.apply_gradients, self.grad_summary_op],
            feed_dict={self.learning_rate_input: cur_learning_rate})

        if (self.thread_index == 0):
            summary_writer.add_summary(grad_summary, global_step=global_t)

    # TODO: rename the 'states' variable to 'observations' in the next version, just to be crystal clear

    def process_memory(self, sess, global_t, summary_writer, states,
                       initial_lstm_state, actions, rewards, final_state):
        """
        :param sess:
        :param global_t:
        :param summary_writer:
        :param states:
        :param initial_lstm_state:
        :param actions:
        :param rewards:
        :param final_state:         observation after the last game step; use None to signal terminal,
         otherwise it is used to compute the final bootstrap value
        :return:
        """

        # NOTE: initial_lstm_state must be chosen carefully.
        # If the stored episode starts at t=0, the initial LSTM state is known exactly.
        # Otherwise we cannot know what LSTM state the *current* policy would plausibly have produced,
        # unless that same policy had been executed from the very beginning of the historical episode
        # and propagated forward.
        # We could record the lstm_state observed just before the start of the stored fragment as an
        # approximation; after a number of steps the state should converge toward something plausible
        # for the current policy. Over time, however, the policy drifts further and further away from
        # the one that produced the recorded state.
        # That suggests updating the stored initial LSTM state in replay memory after every refresh,
        # almost like a real memory trace being reconsolidated. But how to update it? One option is to
        # keep one extra state in reserve just to prime the LSTM, though then we can only update the
        # state *after* it, not the one the initial state itself needs.
        # For certain environments we could apply the network to s_t+0 repeatedly until the LSTM state
        # converges; that works if the problem/environment does not depend on any direct measure of time.
        # The easiest solutions are probably to always reference episodes to t=0, or to ignore the first
        # k states when backing up and computing gradients, since the LSTM state should have converged
        # to something reasonable by that point.
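
        # Illustrative sketch (editorial, not called anywhere in this example): one way to
        # approximate a usable initial LSTM state is to "burn in" by re-running the current
        # local network over the first k stored observations before accumulating any gradients.
        # `burn_in_obs` is a hypothetical list holding those first k observations; only the
        # propagated LSTM state is kept, pi and V are discarded.
        def _burn_in_lstm_state(burn_in_obs, lstm_state):
            for obs in burn_in_obs:
                _, _, lstm_state = self.local_network.run(sess, obs, lstm_state)
            return lstm_state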

        values = []
        lstm_states = []

        terminal_end = False

        # reset accumulated gradients
        sess.run(self.reset_gradients)

        # copy weights from shared to local
        sess.run(self.sync)
        lstm_state = initial_lstm_state

        for (s_t, a_t, r_t) in zip(states, actions, rewards):

            # accum lstm states
            lstm_states.append(lstm_state)

            pi_, value_, lstm_state = self.local_network.run(
                sess, s_t, lstm_state)

            # get values
            values.append(value_)
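

# Illustrative sketch (editorial, not part of the examples above or below): every variant in
# this file ultimately reduces its recorded fragment to n-step discounted returns and
# advantages. A minimal standalone version of that backward recursion, assuming `rewards` and
# `values` are the per-step lists and `final_reward_estimate` is the bootstrap value R
# (0.0 when the fragment ended on a terminal state):
def compute_returns_and_advantages(rewards, values, final_reward_estimate, gamma):
    """Walk the fragment backwards: R_t = r_t + gamma * R_{t+1}, advantage_t = R_t - V_t."""
    R = final_reward_estimate
    returns = []
    advantages = []
    for r_t, v_t in zip(reversed(rewards), reversed(values)):
        R = r_t + gamma * R
        returns.append(R)
        advantages.append(R - v_t)
    # restore time order
    returns.reverse()
    advantages.reverse()
    return returns, advantages
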
class A3CActorThread(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, optimizer, max_global_time_step, device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.max_global_time_step = max_global_time_step

        if USE_LSTM:
            self.local_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN,
                                                ACTION_DIM, device,
                                                thread_index)
        else:
            self.local_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM,
                                              device)
        self.local_network.create_loss(ENTROPY_BETA)
        self.trainer = AccumTrainer(device)
        self.trainer.create_minimize(self.local_network.total_loss,
                                     self.local_network.get_vars())
        self.accum_gradients = self.trainer.accumulate_gradients()
        self.reset_gradients = self.trainer.reset_gradients()

        clip_accum_grads = [
            tf.clip_by_norm(accum_grad, 40.0)
            for accum_grad in self.trainer.get_accum_grad_list()
        ]
        self.apply_gradients = optimizer.apply_gradients(
            zip(clip_accum_grads, global_network.get_vars()))

        self.sync = self.local_network.sync_from(global_network)

        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate

        # for log
        self.episode_reward = 0.0
        self.episode_start_time = 0.0
        self.prev_local_t = 0

        # for pull mode, e.g. a browser-based game
        self.states = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.start_lstm_state = None
        return

    def set_log_parmas(self, summary_writer, summary_op, reward_input,
                       time_input):
        '''
        Note: must be called after the class is initialized.
        '''
        self.summary_writer = summary_writer
        self.summary_op = summary_op
        self.reward_input = reward_input
        self.time_input = time_input
        return

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * \
            (self.max_global_time_step - global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, policy_output):
        # cumulative-sum sampling from the policy distribution
        cumulative = []
        total = 0.0
        for rate in policy_output:
            total += rate
            cumulative.append(total)

        r = random.random() * total
        for i in range(len(cumulative)):
            if cumulative[i] >= r:
                return i
        return len(cumulative) - 1

    def _record_log(self, sess, global_t, reward, living_time):
        summary_str = sess.run(self.summary_op,
                               feed_dict={
                                   self.reward_input: reward,
                                   self.time_input: living_time
                               })
        self.summary_writer.add_summary(summary_str, global_t)
        return

    def process(self, sess, global_t, state, reward, terminal):
        # reduce the influence of socket connection time
        if self.episode_start_time == 0.0:
            self.episode_start_time = timestamp()
            # copy weight from global network
            sess.run(self.reset_gradients)
            sess.run(self.sync)
            if USE_LSTM:
                self.start_lstm_state = self.local_network.lstm_state_out

        policy_, value_ = self.local_network.run_policy_and_value(sess, state)
        if self.thread_index == 0 and self.local_t % 1000 == 0:
            print 'policy=', policy_
            print 'value=', value_

        action_id = self.choose_action(policy_)

        self.states.append(state)
        self.actions.append(action_id)
        self.values.append(value_)

        self.episode_reward += reward
        self.rewards.append(np.clip(reward, -1.0, 1.0))

        self.local_t += 1

        if terminal:
            episode_end_time = timestamp()
            living_time = episode_end_time - self.episode_start_time

            self._record_log(sess, global_t, self.episode_reward, living_time)

            print("global_t=%d / reward=%.2f / living_time=%.4f") % (
                global_t, self.episode_reward, living_time)

            # reset variables
            self.episode_reward = 0.0
            self.episode_start_time = episode_end_time
            if USE_LSTM:
                self.local_network.reset_lstm_state()
        elif self.local_t % 2000 == 0:
            # log every 2000 local steps
            living_time = timestamp() - self.episode_start_time
            self._record_log(sess, global_t, self.episode_reward, living_time)
        # -----------end of batch (LOCAL_T_MAX)--------------------

        # do training
        if self.local_t % LOCAL_T_MAX == 0 or terminal:
            R = 0.0
            if not terminal:
                R = self.local_network.run_value(sess, state)

            self.states.reverse()
            self.actions.reverse()
            self.rewards.reverse()
            self.values.reverse()

            batch_state = []
            batch_action = []
            batch_td = []
            batch_R = []

            for (ai, ri, si, Vi) in zip(self.actions, self.rewards,
                                        self.states, self.values):
                R = ri + GAMMA * R
                td = R - Vi
                action = np.zeros([ACTION_DIM])
                action[ai] = 1

                batch_state.append(si)
                batch_action.append(action)
                batch_td.append(td)
                batch_R.append(R)

            if USE_LSTM:
                batch_state.reverse()
                batch_action.reverse()
                batch_td.reverse()
                batch_R.reverse()
                sess.run(self.accum_gradients,
                         feed_dict={
                             self.local_network.state_input:
                             batch_state,
                             self.local_network.action_input:
                             batch_action,
                             self.local_network.td:
                             batch_td,
                             self.local_network.R:
                             batch_R,
                             self.local_network.step_size: [len(batch_state)],
                             self.local_network.initial_lstm_state:
                             self.start_lstm_state
                         })
                self.start_lstm_state = self.local_network.lstm_state_out
            else:
                sess.run(self.accum_gradients,
                         feed_dict={
                             self.local_network.state_input: batch_state,
                             self.local_network.action_input: batch_action,
                             self.local_network.td: batch_td,
                             self.local_network.R: batch_R
                         })

            cur_learning_rate = self._anneal_learning_rate(global_t)
            sess.run(self.apply_gradients,
                     feed_dict={self.learning_rate_input: cur_learning_rate})

            # print len(self.states), len(self.actions), len(self.values)
            # reset temporal buffers
            self.states = []
            self.actions = []
            self.rewards = []
            self.values = []

            sess.run(self.reset_gradients)
            sess.run(self.sync)

        return action_id
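

# Illustrative sketch (editorial): `choose_action` above samples an action index by walking a
# running cumulative sum of the policy output. The equivalent categorical sampling with NumPy
# (np is already available in these examples; the import is repeated here so the sketch is
# self-contained):
import numpy as np

def sample_action_np(policy_output):
    """Sample index i with probability proportional to policy_output[i]."""
    p = np.asarray(policy_output, dtype=np.float64)
    p = p / p.sum()  # renormalize in case of rounding drift
    return int(np.random.choice(len(p), p=p))
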
class A3CTrainingThread(object):
  def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.local_network = GameACNetwork(ACTION_SIZE, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    # TODO: don't need accum trainer anymore with batch
    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize( self.local_network.total_loss,
                                   self.local_network.get_vars() )
    
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()
  
    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0


  def _anneal_learning_rate(self, global_time_step):
    learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
    if learning_rate < 0.0:
      learning_rate = 0.0
    return learning_rate

  def choose_action(self, pi_values):
    # cumulative-sum sampling from the policy distribution
    cumulative = []
    total = 0.0
    for rate in pi_values:
      total = total + rate
      cumulative.append(total)

    r = random.random() * total
    for i in range(len(cumulative)):
      if cumulative[i] >= r:
        return i
    # fail safe
    return len(cumulative) - 1

  def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t):
    summary_str = sess.run(summary_op, feed_dict={
      score_input: score
    })
    summary_writer.add_summary(summary_str, global_t)
    
  def process(self, sess, global_t, summary_writer, summary_op, score_input):
    states = []
    actions = []
    rewards = []
    values = []

    terminal_end = False

    # reset accumulated gradients
    sess.run( self.reset_gradients )

    # copy weights from shared to local
    sess.run( self.sync )

    start_local_t = self.local_t
    
    # loop for up to t_max steps
    for i in range(LOCAL_T_MAX):
      pi_, value_ = self.local_network.run_policy_and_value(sess, self.game_state.s_t)      
      action = self.choose_action(pi_)

      states.append(self.game_state.s_t)
      actions.append(action)
      values.append(value_)

      if (self.thread_index == 0) and (self.local_t % 100) == 0:
        print "pi=", pi_
        print " V=", value_

      # process game
      self.game_state.process(action)

      # receive game result
      reward = self.game_state.reward
      terminal = self.game_state.terminal

      self.episode_reward += reward

      # clip reward
      rewards.append( np.clip(reward, -1, 1) )

      self.local_t += 1

      # s_t1 -> s_t
      self.game_state.update()
      
      if terminal:
        terminal_end = True
        print "score=", self.episode_reward

        self._record_score(sess, summary_writer, summary_op, score_input,
                           self.episode_reward, global_t)
          
        self.episode_reward = 0
        self.game_state.reset()
        break

    R = 0.0
    if not terminal_end:
      R = self.local_network.run_value(sess, self.game_state.s_t)

    actions.reverse()
    states.reverse()
    rewards.reverse()
    values.reverse()

    batch_si = []
    batch_a = []
    batch_td = []
    batch_R = []

    # compute and accumulate gradients
    for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
      R = ri + GAMMA * R
      td = R - Vi
      a = np.zeros([ACTION_SIZE])
      a[ai] = 1

      batch_si.append(si)
      batch_a.append(a)
      batch_td.append(td)
      batch_R.append(R)

    sess.run( self.accum_gradients,
              feed_dict = {
                self.local_network.s: batch_si,
                self.local_network.a: batch_a,
                self.local_network.td: batch_td,
                self.local_network.r: batch_R } )
      
    cur_learning_rate = self._anneal_learning_rate(global_t)

    sess.run( self.apply_gradients,
              feed_dict = { self.learning_rate_input: cur_learning_rate } )

    if (self.thread_index == 0) and (self.local_t % 100) == 0:
      print "TIMESTEP", self.local_t

    # return the number of local steps advanced
    diff_local_t = self.local_t - start_local_t
    return diff_local_t
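

# Illustrative sketch (editorial): each `_anneal_learning_rate` above implements the same
# schedule, a linear decay from `initial_learning_rate` at global step 0 down to 0 at
# `max_global_time_step`, clamped at 0.0 afterwards:
def linear_annealed_lr(initial_learning_rate, global_time_step, max_global_time_step):
    lr = initial_learning_rate * (max_global_time_step - global_time_step) / max_global_time_step
    return max(lr, 0.0)

# e.g. linear_annealed_lr(7e-4, 5000000, 10000000) -> 0.00035 (half the initial rate at the halfway point)
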
class A3CTrainingthread(object):
	def __init__(self,
			 sess,
			 thread_index,
			 global_network,
			 initial_learning_rate,
			 learning_rate_input,
			 grad_applier,
			 max_global_time_step,
			 num_trainable_vars):

		self.thread_index = thread_index
		self.learning_rate_input = learning_rate_input
		self.max_global_time_step = max_global_time_step	
		
		if LSTM:
			initializer = tf.random_uniform_initializer(-0.1, 0.1)		
			with tf.variable_scope("model"+str(thread_index), reuse=None, initializer=initializer):
				self.local_network = AC3LSTM(num_actions, num_states, num_trainable_vars)
		else:
			self.local_network = AC3FF(num_actions, num_states, num_trainable_vars)
			
		self.local_network.prepare_loss(entropy_beta)

		self.trainer = AccumTrainer()
		self.trainer.prepare_minimize(self.local_network.total_loss, self.local_network.trainable_vars)
		
		self.accum_gradients = self.trainer.accumulate_gradients()
		self.reset_gradients = self.trainer.reset_gradients()
	
		self.apply_gradients = grad_applier.apply_gradients(
			global_network.trainable_vars,
			self.trainer.get_accum_grad_list() )

		self.sync = self.local_network.sync_from(global_network)
		self.game_state = ChainMDP()
		self.local_t = 0
		self.initial_learning_rate = initial_learning_rate
		self.episode_reward = 0


	def _anneal_learning_rate(self, global_time_step):
		learning_rate = self.initial_learning_rate * (self.max_global_time_step - global_time_step) / self.max_global_time_step
		if learning_rate < 0:
			learning_rate = 0
		return learning_rate

	def choose_action(self, pi_values):
		# cumulative-sum sampling from the policy distribution
		cumulative = []
		total = 0.0
		for rate in pi_values:
			total = total + rate
			cumulative.append(total)

		r = random.random() * total
		for i in range(len(cumulative)):
			if cumulative[i] >= r:
				return i
		# fail safe
		return len(cumulative) - 1

	# Run one episode fragment (up to local_t_max steps, or until terminal)
	def thread(self, sess, global_t):
		states = []
		actions = []
		rewards = []
		values = []

		terminal_end = False
		
		if LSTM:
			self.local_network.reset_state()
			
		# reset accumulated gradients
		sess.run(self.reset_gradients)
		# copy weights from shared to local
		sess.run(self.sync)
		start_local_t = self.local_t
	
		mdp = ChainMDP()
		state = mdp.states[np.random.randint(0, mdp.num_states-1)]
		discounted_reward = 0
		
		for i in range(local_t_max):
			if LSTM:
				action_probs = self.local_network.run_policy(sess, state, update_rnn_state=True)
			else:
				action_probs = self.local_network.run_policy(sess, state)
			
			action = self.choose_action(action_probs)
			states.append(state)
			actions.append(action)
			
			if LSTM:
				# Do not update the state again
				value_ = self.local_network.run_value(sess, state, update_rnn_state=False)
			else:
				value_ = self.local_network.run_value(sess, state)
				
			values.append(value_)

			reward, next_state, terminal = mdp.act(state, action)
			self.episode_reward += reward

			rewards.append(reward)

			self.local_t += 1
			state = next_state
			
			if terminal:
				terminal_end = True
				discounted_reward = (discount_rate**i)*self.episode_reward
				self.episode_reward = 0
				state = mdp.states[np.random.randint(0, mdp.num_states-1)]
				if LSTM:
					self.local_network.reset_state()
				break

		R = 0.0
		if not terminal_end:
			if LSTM:
				# Do not update the state again
				R = self.local_network.run_value(sess, state, update_rnn_state=False) 
			else:
				R = self.local_network.run_value(sess, state) 

		# Order from the final time point to the first, so the return can be computed
		# recursively backwards from the bootstrap value: R_t = r_t + discount_rate * R_{t+1}
		actions.reverse()
		states.reverse()
		rewards.reverse()
		values.reverse()

		# compute and accumulate gradients
		for (action, r, state, V) in zip(actions, rewards, states, values):
			R = r[0][0] + discount_rate * R
			td = R - V # temporal difference
			a = np.zeros([num_actions])
			a[action] = 1
			#a = np.reshape(a,[1,num_actions]) ### Should be done when the variable is created - or change something on the other end
			sess.run(self.accum_gradients,
								feed_dict = {
									#self.local_network.state: [state],
									self.local_network.state: np.reshape([float(i) for i in state],[1,mdp.num_states]), ### use np.array( ,dtype=...) instead
									self.local_network.a: [a],
									self.local_network.td: [td],
									self.local_network.r: [R]})
			
		cur_learning_rate = self._anneal_learning_rate(global_t)

		sess.run(self.apply_gradients, feed_dict = {self.learning_rate_input: cur_learning_rate})

		# local step
		diff_local_t = self.local_t - start_local_t
		return diff_local_t, discounted_reward
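

# Illustrative sketch (editorial): A3CActorThread above clips each accumulated gradient to a
# per-tensor norm of 40.0 before applying it to the global network. The same pattern as a
# standalone helper, assuming `grads` and `global_vars` are matching lists and `optimizer` is
# any tf.train optimizer (TF1-style API, as used throughout these examples):
import tensorflow as tf

def build_clipped_apply_op(optimizer, grads, global_vars, clip_norm=40.0):
    clipped = [tf.clip_by_norm(g, clip_norm) for g in grads]
    return optimizer.apply_gradients(zip(clipped, global_vars))

# A common alternative is tf.clip_by_global_norm(grads, clip_norm), which rescales the whole
# gradient list jointly instead of clipping each tensor independently.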