Example #1
    def train(self):
        '''
        Called at the end of each episode: aggregate per-thread gradients
        and update the shared A2C policy parameters.
        '''

        if not self.is_training:
            self.logger.info("Not in training mode")
            return
        self.logger.info("Update a2c policy parameters.")

        self.episodecount += 1
        self.logger.info("Sample Num so far: %s" % self.samplecount)
        self.logger.info("Episode Num so far: %s" % self.episodecount)

        Settings.add_count()
        globalEpisodeCount = copy.deepcopy(Settings.get_count())
        self.loadLastestPolicy()

        if self.samplecount >= self.minibatch_size and globalEpisodeCount % self.training_frequency == 0:
            self.logger.info('start training...')

            assert len(Settings.global_policysaver) == Settings.global_threadsnum
            total_batch_size = 0
            for k, thread_policy in Settings.global_policysaver.items():
                # Each worker samples its own batch and computes V-trace targets.
                s_batch, a_batch_one_hot, V_trace, advantage = Settings.global_hackpolicysaver[k]._sample_and_updateV()
                grad, batch_size = thread_policy.train(s_batch, a_batch_one_hot, V_trace, advantage)
                total_batch_size += batch_size
                Settings.load_grad(grad, k)  # stash this thread's gradients for summation

            assert len(Settings.global_gradsaver) == Settings.global_threadsnum
            grads_list = Settings.grad_sum()  # element-wise sum over all threads
            self._load_and_update(grads_list, total_batch_size)
            self.savePolicyInc()
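
The gradient plumbing above goes through Settings.load_grad and Settings.grad_sum, which are not shown in this listing. Below is a minimal sketch of what such an aggregator could look like, assuming each worker's gradient is a list of numpy arrays; the names mirror the calls above, but the real Settings module in strac may differ.

import numpy as np

class GradAggregator:
    # Hypothetical stand-in for the gradient side of Settings.
    def __init__(self):
        self.global_gradsaver = {}

    def load_grad(self, grad, thread_id):
        # Store the gradient list computed by one worker thread.
        self.global_gradsaver[thread_id] = grad

    def grad_sum(self):
        # Element-wise sum of the per-thread gradient lists.
        per_thread = list(self.global_gradsaver.values())
        return [np.sum(layer, axis=0) for layer in zip(*per_thread)]

# Two workers, each contributing a two-layer gradient list:
agg = GradAggregator()
agg.load_grad([np.ones((2, 2)), np.zeros(3)], thread_id=0)
agg.load_grad([np.ones((2, 2)), np.ones(3)], thread_id=1)
summed = agg.grad_sum()  # [2 * ones((2, 2)), ones(3)]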
Example #2
File: HackRBPolicy.py Project: WowCZ/strac
    def train(self):
        '''
        Called at the end of each episode: aggregate per-thread gradients
        and update the shared DQN policy parameters.
        '''


        if not self.is_training:
            self.logger.info("Not in training mode")
            return
        self.logger.info("Update dqn policy parameters.")

        self.episodecount += 1

        Settings.add_count()
        globalEpisodeCount = copy.deepcopy(Settings.get_count())


        self.logger.info("Sample Num so far: %s" % (self.samplecount))
        self.logger.info("Episode Num so far: %s" % (self.episodecount))

        if self.samplecount >= self.minibatch_size * 10 and globalEpisodeCount % self.training_frequency == 0:
            self.logger.info('start training...')


            assert len(Settings.global_policysaver) == Settings.global_threadsnum
            total_batch_size = 0
            for k, thread_policy in Settings.global_policysaver.items():
                # Each worker samples its own batch and computes its distributional Q targets.
                s_batch, a_batch_one_hot, reshaped_targetdis = Settings.global_hackpolicysaver[k]._sample_and_updateQ()
                grad, batch_size = thread_policy.train(s_batch, a_batch_one_hot, reshaped_targetdis)
                total_batch_size += batch_size
                Settings.load_grad(grad, k)  # stash this thread's gradients for summation

            assert len(Settings.global_gradsaver) == Settings.global_threadsnum
            grads_list = Settings.grad_sum()  # element-wise sum over all threads
            self._load_and_update(grads_list, self.minibatch_size)
            self._savePolicyInc()
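
Both train() variants finish by handing the summed gradients to self._load_and_update together with a batch size (total_batch_size in Example #1, self.minibatch_size here). Below is a minimal sketch of that step under the same assumptions as above (plain numpy parameters, vanilla SGD); the real method presumably feeds the averaged gradients back into the shared network graph instead.

import numpy as np

def load_and_update(params, summed_grads, batch_size, lr=1e-3):
    # Hypothetical sketch of _load_and_update: average the summed
    # per-thread gradients over the batch size, then take one SGD
    # step on the shared parameters in place.
    for p, g in zip(params, summed_grads):
        p -= lr * (g / batch_size)
    return params

params = [np.zeros((2, 2)), np.zeros(3)]
grads = [np.ones((2, 2)), np.ones(3)]
load_and_update(params, grads, batch_size=64)  # each weight moves by -lr/64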
Example #3
File: HackPolicy.py Project: WowCZ/strac
    def loadPolicy(self, filename):
        """
        load model and replay buffer
        """
        # Derive the shared '...-shared-lastest' checkpoint name from the
        # configured output policy file.
        tmp = self.out_policy_file.split('/')
        nfilename = tmp[-1].split('-')[1:-1]
        nfilename.append('shared-lastest')
        nfilename = '-'.join(nfilename)
        tmp[-1] = nfilename
        nfilename = '/'.join(tmp)

        # Drop the last dash-separated field to get the base checkpoint name.
        tmp = filename.split('-')
        str_tmp = '-'.join(tmp[:-1])

        globalEpisodeCount = copy.deepcopy(Settings.get_count())

        if tmp[-1].split('.')[-1] == '0' or globalEpisodeCount == 0:
            # Fresh run: start from the initial checkpoint.
            self.dqn.load_network(str_tmp + '-00.21.dqn.ckpt')
        else:
            # Otherwise load the shared 'lastest' network plus this
            # thread's own trainer state.
            self.dqn.load_test_network(nfilename + '.dqn.ckpt')
            self.dqn.load_trainer(filename + '.dqn.ckpt')

        # load replay buffer

        try:
            print('load from:', filename)
            with open(filename + '-' + self.domainString + '.episode', 'rb') as f:
                loaded_objects = []
                for i in range(2):  # sample count, then collected episodes
                    loaded_objects.append(pickle.load(f))
            self.samplecount = int(loaded_objects[0])
            self.episodes[self.domainString] = copy.deepcopy(loaded_objects[1])
            self.logger.info(
                "Loading both model from %s and replay buffer..." % filename)
        except Exception:
            self.logger.info("Loading only models...")