def train(self):
    """Update the A2C policy parameters; call this when an episode ends.

    Aggregates per-thread gradients held in the shared ``Settings`` module,
    applies the summed gradient, and checkpoints the policy.
    """
    # Guard clause: nothing to do outside training mode.
    if not self.is_training:
        self.logger.info("Not in training mode")
        return
    self.logger.info("Update a2c policy parameters.")
    self.episodecount += 1
    self.logger.info("Sample Num so far: %s" % self.samplecount)
    self.logger.info("Episode Num so far: %s" % self.episodecount)
    # Bump and snapshot the global (cross-thread) episode counter.
    Settings.add_count()
    global_episode_count = copy.deepcopy(Settings.get_count())
    self.loadLastestPolicy()
    # Train once enough samples exist and the shared counter hits the
    # configured training frequency.
    ready = (self.samplecount >= self.minibatch_size
             and global_episode_count % self.training_frequency == 0)
    if ready:
        self.logger.info('start training...')
        assert len(Settings.global_policysaver) == Settings.global_threadsnum
        total_batch_size = 0
        # One gradient per worker thread, accumulated into the shared store.
        for thread_id, policy in Settings.global_policysaver.items():
            s_batch, a_batch_one_hot, v_trace, advantage = \
                Settings.global_hackpolicysaver[thread_id]._sample_and_updateV()
            grad, batch_size = policy.train(
                s_batch, a_batch_one_hot, v_trace, advantage)
            total_batch_size += batch_size
            Settings.load_grad(grad, thread_id)
        assert len(Settings.global_gradsaver) == Settings.global_threadsnum
        grads_list = Settings.grad_sum()
        self._load_and_update(grads_list, total_batch_size)
        self.savePolicyInc()
def train(self):
    """Update the DQN policy parameters; call this when an episode ends.

    Aggregates per-thread gradients held in the shared ``Settings`` module,
    applies the summed gradient, and checkpoints the policy.
    """
    if not self.is_training:
        self.logger.info("Not in training mode")
        return
    self.logger.info("Update dqn policy parameters.")
    self.episodecount += 1
    # Bump and snapshot the global (cross-thread) episode counter.
    Settings.add_count()
    globalEpisodeCount = copy.deepcopy(Settings.get_count())
    self.logger.info("Sample Num so far: %s" % self.samplecount)
    self.logger.info("Episode Num so far: %s" % self.episodecount)
    # Train only after a warm-up of 10 minibatches of samples and at the
    # shared episode-count frequency.
    if (self.samplecount >= self.minibatch_size * 10
            and globalEpisodeCount % self.training_frequency == 0):
        self.logger.info('start training...')
        assert len(Settings.global_policysaver) == Settings.global_threadsnum
        total_batch_size = 0
        # Use the value bound by .items() directly instead of re-indexing
        # Settings.global_policysaver[k] each iteration (matches the A2C
        # variant of this method).
        for thread_id, policy in Settings.global_policysaver.items():
            s_batch, a_batch_one_hot, reshaped_targetdis = \
                Settings.global_hackpolicysaver[thread_id]._sample_and_updateQ()
            grad, batch_size = policy.train(
                s_batch, a_batch_one_hot, reshaped_targetdis)
            total_batch_size += batch_size
            Settings.load_grad(grad, thread_id)
        assert len(Settings.global_gradsaver) == Settings.global_threadsnum
        grads_list = Settings.grad_sum()
        # NOTE(review): this passes minibatch_size while the A2C variant
        # passes total_batch_size -- confirm which normalization is intended.
        self._load_and_update(grads_list, self.minibatch_size)
        self._savePolicyInc()
def loadPolicy(self, filename):
    """Load model weights and, when available, the replay buffer.

    Fixes over the previous version: the episode file is opened with a
    ``with`` block so it is closed even when unpickling fails (the old code
    leaked the handle on error); the bare ``except:`` no longer swallows
    ``KeyboardInterrupt``/``SystemExit``; the stray ``print`` now goes
    through ``self.logger`` like the rest of the method.
    """
    # Build the shared '-shared-lastest' checkpoint path from out_policy_file.
    parts = self.out_policy_file.split('/')
    shared_name = parts[-1].split('-')[1:-1]
    shared_name.append('shared-lastest')
    parts[-1] = '-'.join(shared_name)
    shared_path = '/'.join(parts)
    # Drop the final '-<suffix>' component of filename (keep up to tmp[-2]).
    tmp = filename.split('-')
    str_tmp = ''
    for i in range(len(tmp) - 2):
        str_tmp = str_tmp + tmp[i] + '-'
    str_tmp = str_tmp + tmp[-2]
    globalEpisodeCount = copy.deepcopy(Settings.get_count())
    if tmp[-1].split('.')[-1] == '0' or globalEpisodeCount == 0:
        # Very first episode: start from the seed checkpoint.
        self.dqn.load_network(str_tmp + '-00.21.dqn.ckpt')
    else:
        self.dqn.load_test_network(shared_path + '.dqn.ckpt')
        self.dqn.load_trainer(filename + '.dqn.ckpt')
    # Load the replay buffer; best-effort -- fall back to model-only on any
    # failure (missing file, truncated pickle, schema mismatch).
    try:
        self.logger.info('load from: %s' % filename)
        with open(filename + '-' + self.domainString + '.episode', 'rb') as f:
            # Two pickled objects: sample count, then the episode buffer.
            loaded_objects = [pickle.load(f) for _ in range(2)]
        self.samplecount = int(loaded_objects[0])
        self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1])
        self.logger.info(
            "Loading both model from %s and replay buffer..." % filename)
    except Exception:
        self.logger.info("Loading only models...")