def _get_all_tf(self):
    """Build and persist the all-word term-frequency cache.

    Walks every training document under ``self.parent_dir``, sums per-word
    term frequencies across the whole training set, then pickles two files:

    * ``self._all_word_tf_cache`` — dict mapping word -> summed frequency
    * ``self._all_word_num_cache`` — dict ``{"all": total word occurrences}``

    Does nothing when the frequency cache file already exists, so repeated
    calls are cheap. Failures are logged and swallowed (best-effort cache
    build, matching the original behavior).
    """
    if os.path.exists(self._all_word_tf_cache) and os.path.isfile(self._all_word_tf_cache):
        default_logger.debug("[CLS] Full Word Cache has been Built.")
        return
    try:
        # Collect the full path of every document in the training set.
        # pathwalker() returns {'train': [{'classpath': ..., ...}, ...], ...}
        # (assumed from usage here — TODO confirm against pathWalker module).
        _trainset = pathWalker.pathwalker(self.parent_dir)['train']
        _train_set_files = [
            os.path.join(cls['classpath'], _fname)
            for cls in _trainset
            for _fname in os.listdir(cls['classpath'])
        ]

        # Accumulate term frequencies for every word across all documents.
        _all_tf_cache = {}
        for train_file in _train_set_files:
            _temp_tf_dic = tfidf.getTF(self._get_file_content(train_file), self.chcut)
            for _word, _doc_tf in _temp_tf_dic.items():
                # Single-lookup accumulation replaces the original's
                # double get()/update() branching.
                _all_tf_cache[_word] = _all_tf_cache.get(_word, 0) + _doc_tf

        # Persist the word->frequency dict; ``with`` guarantees the handle
        # is closed (the original leaked three handles and never closed
        # the word-count file, risking an unflushed pickle).
        with open(self._all_word_tf_cache, 'wb') as _cache_file:
            pickle.dump(_all_tf_cache, _cache_file)

        # Persist the total number of word occurrences.
        _word_count_dic = {"all": sum(_all_tf_cache.values())}
        with open(self._all_word_num_cache, 'wb') as _cache_word_number_file:
            pickle.dump(_word_count_dic, _cache_word_number_file)

        default_logger.debug("[CLS] Cache has been Built Successfully.")
    except Exception:
        # Narrowed from the original bare ``except:`` so KeyboardInterrupt/
        # SystemExit propagate; cache build stays best-effort.
        default_logger.debug("[CLS] Cache has been Built Failed!!")
if __name__ == '__main__': ''' 本文件是主要的测试文件,用于测试贝叶斯分类器的准确率和召回率 Precision(c) = 所有被正确归为c类的页面/所有被归为c类的页面(错的也算) Recall(c) = 所有被正确归为c类的页面/所有本应被归为c类的页面(不算错的) ''' # 每个分类测试的文章数目 MAXITEM =200 # 初始化贝叶斯分类器 classifitor = classifier.Classifier() # 开始计时 starttime = time.time() # 取得测试集路径 ALLSet = pathWalker.pathwalker("Parent") TestSet = ALLSet['test'] classficiation_list = [] classficiation_count = {} for cls in TestSet: clsName = cls.get("classname") clsSet = os.listdir(cls.get("classpath")) # 取得每个分类的计数 classficiation_count.update({clsName:len(clsSet)}) print(clsSet) # 测试分类 test_list1 = ['it', 'education'] # 存储某一篇测试结果的字典 for index, clsfilename in enumerate(clsSet):