def train(self):
    '''
    Call this function when the episode ends.
    '''
    # Assumes module-level imports of copy and the shared Settings module.
    if not self.is_training:
        self.logger.info("Not in training mode")
        return
    else:
        self.logger.info("Update a2c policy parameters.")

    self.episodecount += 1
    self.logger.info("Sample Num so far: %s" % self.samplecount)
    self.logger.info("Episode Num so far: %s" % self.episodecount)

    # Bump the global episode counter shared across worker threads.
    Settings.add_count()
    globalEpisodeCount = copy.deepcopy(Settings.get_count())

    self.loadLastestPolicy()

    # Train once every training_frequency global episodes, provided at
    # least one minibatch of samples has been collected.
    if self.samplecount >= self.minibatch_size and globalEpisodeCount % self.training_frequency == 0:
        self.logger.info('start training...')
        # Every worker thread must have registered its policy copy.
        assert len(Settings.global_policysaver) == Settings.global_threadsnum

        total_batch_size = 0
        for k, thread_policy in Settings.global_policysaver.items():
            # Each thread samples a batch, computes V-trace value targets and
            # advantages, and returns its local gradient.
            s_batch, a_batch_one_hot, V_trace, advantage = \
                Settings.global_hackpolicysaver[k]._sample_and_updateV()
            grad, batch_size = thread_policy.train(s_batch, a_batch_one_hot, V_trace, advantage)
            total_batch_size += batch_size
            Settings.load_grad(grad, k)

        assert len(Settings.global_gradsaver) == Settings.global_threadsnum
        # Sum per-thread gradients and apply a single global update.
        grads_list = Settings.grad_sum()
        self._load_and_update(grads_list, total_batch_size)
        self.savePolicyInc()
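# The names returned by _sample_and_updateV suggest V-trace targets
# (Espeholt et al., 2018). Below is a minimal sketch of that recursion under
# standard truncated importance weights; the function name and its arguments
# (rewards, values, bootstrap_value, rhos) are hypothetical, not this repo's API.
import numpy as np

def vtrace_targets(rewards, values, bootstrap_value, rhos,
                   gamma=0.99, rho_bar=1.0, c_bar=1.0):
    """V-trace value targets vs and policy-gradient advantages for one
    trajectory of length T (all inputs are 1-D float arrays of length T)."""
    rhos_clipped = np.minimum(rho_bar, rhos)           # truncated IS weights
    cs = np.minimum(c_bar, rhos)
    values_tp1 = np.append(values[1:], bootstrap_value)
    deltas = rhos_clipped * (rewards + gamma * values_tp1 - values)
    vs_minus_v = np.zeros_like(values)
    acc = 0.0
    for t in reversed(range(len(rewards))):            # backward recursion
        acc = deltas[t] + gamma * cs[t] * acc          # v_t - V(x_t)
        vs_minus_v[t] = acc
    vs = values + vs_minus_v
    vs_tp1 = np.append(vs[1:], bootstrap_value)
    advantages = rhos_clipped * (rewards + gamma * vs_tp1 - values)
    return vs, advantages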
def train(self):
    '''
    Call this function when the episode ends.
    '''
    if not self.is_training:
        self.logger.info("Not in training mode")
        return
    else:
        self.logger.info("Update dqn policy parameters.")

    self.episodecount += 1
    # Bump the global episode counter shared across worker threads.
    Settings.add_count()
    globalEpisodeCount = copy.deepcopy(Settings.get_count())

    self.logger.info("Sample Num so far: %s" % self.samplecount)
    self.logger.info("Episode Num so far: %s" % self.episodecount)

    # Train once every training_frequency global episodes, provided at
    # least ten minibatches of samples have been collected.
    if self.samplecount >= self.minibatch_size * 10 and globalEpisodeCount % self.training_frequency == 0:
        self.logger.info('start training...')
        # Every worker thread must have registered its policy copy.
        assert len(Settings.global_policysaver) == Settings.global_threadsnum

        total_batch_size = 0
        for k, thread_policy in Settings.global_policysaver.items():
            # Each thread samples a batch, computes the reshaped target
            # distribution for the Q update, and returns its local gradient.
            s_batch, a_batch_one_hot, reshaped_targetdis = \
                Settings.global_hackpolicysaver[k]._sample_and_updateQ()
            grad, batch_size = thread_policy.train(s_batch, a_batch_one_hot, reshaped_targetdis)
            total_batch_size += batch_size
            Settings.load_grad(grad, k)

        assert len(Settings.global_gradsaver) == Settings.global_threadsnum
        # Sum per-thread gradients and apply a single global update.
        # The original passed self.minibatch_size here while total_batch_size
        # went unused; total_batch_size is used instead so the update is
        # scaled by the samples actually drawn, matching the a2c variant.
        grads_list = Settings.grad_sum()
        self._load_and_update(grads_list, total_batch_size)
        self._savePolicyInc()
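# A minimal sketch (assumptions, not this repo's API) of what
# Settings.load_grad / Settings.grad_sum / _load_and_update are expected to
# do in both methods above: collect one gradient list per worker thread, sum
# them element-wise, and apply a single update averaged over all samples.
# All names below are hypothetical stand-ins.
import numpy as np

_grad_saver = {}

def load_grad(grad, thread_id):
    """Store one thread's gradient list under its thread id."""
    _grad_saver[thread_id] = grad

def grad_sum():
    """Element-wise sum of the gradient lists stored by every thread."""
    grads = list(_grad_saver.values())
    summed = [np.zeros_like(g) for g in grads[0]]
    for thread_grads in grads:
        for i, g in enumerate(thread_grads):
            summed[i] += g
    return summed

def load_and_update(params, grads_list, total_batch_size, lr=1e-3):
    """Average the summed gradients over all samples drawn across threads
    and take one SGD step on the shared parameters in place."""
    for p, g in zip(params, grads_list):
        p -= lr * g / total_batch_size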