def __init__(self, documents_or_func, cache_dir):
    """Store the constructor inputs and wrap the IDF cache with defaults.

    Data contract, as described by the original author:

    1. Input is ``{item_id: {feature1: count1, feature2: count2, ...}, ...}``.
    2. Output is ``{item_id: {feature1: rank1, feature2: rank2, ...}, ...}``.

    Both behave like a dict, whether they are persistent or not.

    Args:
        documents_or_func: Kept unchanged on the instance. Presumably the
            documents mapping itself or a callable producing it -- TODO
            confirm against callers.
        cache_dir: Kept unchanged on the instance. Presumably the
            directory where cache files live -- TODO confirm.
    """
    self.documents_or_func, self.cache_dir = documents_or_func, cache_dir

    # The idf cache is always loaded up front because it is really small.
    # NOTE(review): ``self.idf_cache`` is read here before this method
    # assigns it, so it has to be resolvable already (e.g. a class-level
    # property or cached attribute defined outside this view) -- confirm.
    idf_with_defaults = DictUtils.add_default_value(self.idf_cache)
    self.idf_cache = IdfResult(
        idf_with_defaults.default_factory,
        idf_with_defaults,
    )
def entropy_cache(self):
    """Build the entropy mapping and remember it on ``self.entropy_file``.

    According to the original author, the training and test corpora are
    both covered by this data.

    The result of ``EntropyFunc.process`` for
    ``self.documents_with_segments`` and ``self.cache_dir`` is passed
    through ``DictUtils.add_default_value``; the wrapped object is stored
    on ``self.entropy_file`` and returned.
    """
    # Imported at call time, as in the original implementation.
    from .lib.entropy import EntropyFunc

    raw_entropy = EntropyFunc.process(
        self.documents_with_segments,
        self.cache_dir,
    )
    # Use our own entropy; it is consumed by FeaturesWeight.
    # NOTE(review): this first store is overwritten a few lines below; it
    # is kept so the attribute is still set if the wrapping step raises.
    self.entropy_file = raw_entropy

    entropy_with_defaults = DictUtils.add_default_value(raw_entropy)

    # Disabled filter, kept for reference. It would drop every key that is
    # neither a regular word nor Chinese:
    #
    #     from etl_utils import is_regular_word, Unicode
    #     for k1 in result.keys():
    #         if not (is_regular_word(k1) or Unicode.is_chinese(k1)):
    #             del result[k1]

    self.entropy_file = entropy_with_defaults
    return entropy_with_defaults