예제 #1
0
    def __init__(self, documents_or_func, cache_dir):
        """Remember the document source and cache directory, then wrap the IDF cache.

        Data shapes, as described by the original author:

        * input:  ``{item_id: {feature1: count1, feature2: count2, ...}, ...}``
        * output: ``{item_id: {feature1: rank1, feature2: rank2, ...}, ...}``

        Both behave like a dict, whether or not they are persistent.

        Args:
            documents_or_func: Stored unchanged on ``self.documents_or_func``.
            cache_dir: Stored unchanged on ``self.cache_dir``.
        """
        self.documents_or_func = documents_or_func
        self.cache_dir = cache_dir

        # The IDF cache is really small, so it is always loaded up front.
        # NOTE(review): ``self.idf_cache`` is read here before this method
        # assigns it, so it is presumably supplied by the class itself
        # (e.g. a property or class attribute) -- confirm.
        idf_with_default = DictUtils.add_default_value(self.idf_cache)
        self.idf_cache = IdfResult(
            idf_with_default.default_factory,
            idf_with_default,
        )
예제 #2
0
    def entropy_cache(self):
        """Build the entropy table and publish it on ``self.entropy_file``.

        Per the original author's note, the data processed here already
        contains both the training and the test corpus.

        Returns:
            The result of ``EntropyFunc.process`` after it has been passed
            through ``DictUtils.add_default_value``.
        """
        from .lib.entropy import EntropyFunc

        raw_entropy = EntropyFunc.process(
            self.documents_with_segments,
            self.cache_dir,
        )
        # Use our own entropy; it is meant for FeaturesWeight.
        self.entropy_file = raw_entropy
        entropy_with_default = DictUtils.add_default_value(raw_entropy)

        # Disabled filter, kept for reference. It removed every key that was
        # neither a regular word nor Chinese:
        #     from etl_utils import is_regular_word, Unicode
        #     for k1 in result.keys():
        #         if not (is_regular_word(k1) or Unicode.is_chinese(k1)):
        #             del result[k1]

        # Replace the raw value stored above with the default-valued one.
        self.entropy_file = entropy_with_default
        return entropy_with_default
예제 #3
0
    def entropy_cache(self):
        """Compute the entropy mapping and store it on ``self.entropy_file``.

        According to the original author's note, both the training and the
        test corpus are included in the processed data.

        Returns:
            Whatever ``DictUtils.add_default_value`` returns for the output of
            ``EntropyFunc.process``.
        """
        from .lib.entropy import EntropyFunc

        segments = self.documents_with_segments
        computed = EntropyFunc.process(segments, self.cache_dir)

        # Use our own entropy; it is meant for FeaturesWeight.
        self.entropy_file = computed
        defaulted = DictUtils.add_default_value(computed)

        # Disabled filter, kept for reference. It removed every key that was
        # neither a regular word nor Chinese:
        #     from etl_utils import is_regular_word, Unicode
        #     for k1 in result.keys():
        #         if not (is_regular_word(k1) or Unicode.is_chinese(k1)):
        #             del result[k1]

        # The default-valued mapping supersedes the raw one stored above.
        self.entropy_file = defaulted
        return defaulted