def _get_class_tf(self, class_dir_path, class_name, reload_cache=False):
    """Build the per-class word frequency cache.

    Builds and pickles the term-frequency dictionary over every word in one
    class of the training set.

    Keyword arguments:
    class_dir_path -- the train set class dir path
    class_name -- the class name
    reload_cache -- whether to rebuild the class cache (default False)
    """
    class_tf = {}
    if not os.path.exists(class_dir_path):
        print("Invalid class directory path!")
        return None
    _cache_file_path = os.path.join(self._cache_dir_path, class_name + "-tf.cache")
    _cache_word_number_file_path = os.path.join(self._cache_dir_path, class_name + "-word-num.cache")
    if os.path.exists(_cache_file_path) and os.path.exists(_cache_word_number_file_path) and not reload_cache:
        default_logger.debug("[CLS] The class cache already exists, skipping the cache build...")
        return None
    _base_dir_path = os.path.abspath(class_dir_path)
    # Walk every document in the class directory and accumulate term frequencies
    for file in os.listdir(class_dir_path):
        full_file_path = os.path.join(_base_dir_path, file)
        _temp_tf_dic = tfidf.getTF(self._get_file_content(full_file_path), self.chcut)
        for _tmp_word, _tmp_document_tf in _temp_tf_dic.items():
            class_tf[_tmp_word] = class_tf.get(_tmp_word, 0) + _tmp_document_tf
    default_logger.debug("[CLS] Class cache path: %s" % _cache_file_path)
    # Pickle the per-class term frequencies
    with open(_cache_file_path, 'wb') as _cache_file:
        pickle.dump(class_tf, _cache_file)
    # Pickle the total word count of the class
    _word_count_dic = {class_name: sum(class_tf.values())}
    with open(_cache_word_number_file_path, 'wb') as _cache_word_number_file:
        pickle.dump(_word_count_dic, _cache_word_number_file)
    print(_word_count_dic)
    print("Cache built successfully...")
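# A minimal sketch, not part of the original class: shows how the two pickled
# per-class caches written by _get_class_tf above might be read back. The
# method name _load_class_tf and its return shape are assumptions for
# illustration only.
def _load_class_tf(self, class_name):
    """Load the cached (term-frequency dict, total word count) for a class."""
    _tf_path = os.path.join(self._cache_dir_path, class_name + "-tf.cache")
    _num_path = os.path.join(self._cache_dir_path, class_name + "-word-num.cache")
    if not (os.path.exists(_tf_path) and os.path.exists(_num_path)):
        return None
    with open(_tf_path, 'rb') as _tf_file:
        class_tf = pickle.load(_tf_file)
    with open(_num_path, 'rb') as _num_file:
        word_count_dic = pickle.load(_num_file)
    return class_tf, word_count_dic.get(class_name, 0)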
def _get_all_tf(self):
    """Build the full-vocabulary word frequency cache.

    Caches term frequencies over the whole training set so later
    computations can reuse them.
    """
    if os.path.exists(self._all_word_tf_cache) and os.path.isfile(self._all_word_tf_cache):
        default_logger.debug("[CLS] Full word cache has already been built.")
        return
    try:
        # Collect the paths of every training document
        _train_paths = pathWalker.pathwalker(self.parent_dir)
        _trainset = _train_paths['train']
        _train_set_files = []
        for cls in _trainset:
            for _tmp_class_file in os.listdir(cls['classpath']):
                _train_set_files.append(os.path.join(cls['classpath'], _tmp_class_file))
        default_logger.debug("[CLS] Training set files: %s" % _train_set_files)
        # Accumulate term frequencies over the whole training set
        _all_tf_cache = {}
        for train_file in _train_set_files:
            _temp_tf_dic = tfidf.getTF(self._get_file_content(train_file), self.chcut)
            for _tmp_word, _tmp_document_tf in _temp_tf_dic.items():
                _all_tf_cache[_tmp_word] = _all_tf_cache.get(_tmp_word, 0) + _tmp_document_tf
        default_logger.debug("[CLS] Full word cache path: %s" % self._all_word_tf_cache)
        # Pickle the full-vocabulary term frequencies
        with open(self._all_word_tf_cache, 'wb') as _cache_file:
            pickle.dump(_all_tf_cache, _cache_file)
        # Pickle the total word count over the whole training set
        _word_count_dic = {"all": sum(_all_tf_cache.values())}
        with open(self._all_word_num_cache, 'wb') as _cache_word_number_file:
            pickle.dump(_word_count_dic, _cache_word_number_file)
        print(_word_count_dic)
        default_logger.debug("[CLS] Cache has been built successfully.")
    except Exception:
        # Building the full word cache failed
        default_logger.debug("[CLS] Cache build failed!")
        return
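# A minimal sketch, not part of the original class: shows how the
# full-vocabulary caches written by _get_all_tf above might be read back,
# e.g. to estimate word probabilities over the whole training set. The
# method name _load_all_tf is hypothetical.
def _load_all_tf(self):
    """Load the cached (all-word term-frequency dict, total word count)."""
    if not (os.path.exists(self._all_word_tf_cache) and os.path.exists(self._all_word_num_cache)):
        return None
    with open(self._all_word_tf_cache, 'rb') as _tf_file:
        all_tf = pickle.load(_tf_file)
    with open(self._all_word_num_cache, 'rb') as _num_file:
        word_count_dic = pickle.load(_num_file)
    return all_tf, word_count_dic.get("all", 0)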