Пример #1
0
 def inspect_global_freq(self):
     """Print the global feature-frequency table.

     Aggregates the per-tag feature counts stored in
     ``self.total_tag_to_features__dict`` into one global counter, then
     prints the ranked (feature, count) pairs followed by all features
     joined into a single unicode string.
     """
     global_freq = defaultdict(int)
     for tag, features in self.total_tag_to_features__dict.iteritems():
         for feature, count in features.iteritems():
             global_freq[feature] += count
     # Rank once and reuse: most_common() sorts the whole table, so the
     # original code paid that cost twice for the same result.
     ranked = Counter(global_freq).most_common()
     uprint(ranked)
     uprint(u''.join([pair[0] for pair in ranked]))
Пример #2
0
            def func(counts, is_precision=False):
                """Count tag matches between original and recommended tags.

                For recall (the default) every original tag is searched for
                among the recommended tags; with ``is_precision=True`` the
                roles are swapped.  Matching is attempted per relation in
                order: exact, peer, child, parent; per-method counts are
                accumulated into ``counts`` via ``update`` and unmatched
                tags are added to ``counts.unmatch``.
                NOTE(review): relies on enclosing-scope names
                original_tags, recommend_tags, verbose, tags_tree, update,
                uprint and item1 — confirm against the outer function.
                """
                if not is_precision:
                    for1, for2 = original_tags, recommend_tags
                else:
                    for2, for1 = original_tags, recommend_tags

                processed = set([])

                for method in ["exact", "peer", "child", "parent"]:
                    match_count = 0

                    for t1 in (for1 - processed):  # the core loop: only tags not yet matched
                        matched_t1 = None
                        for t2 in for2:  # no subtraction here: the other recommend_tags still need relation checks
                            n_t1, n_t2 = t2, t1
                            if verbose:
                                print method, "[n_t1]", n_t1, "[n_t2]", n_t2
                            if getattr(tags_tree, "is_" + method)(n_t1, n_t2):
                                if verbose:
                                    print "√"
                                matched_t1 = n_t2
                                break
                        if matched_t1:
                            # mark t1 as processed so the outer method loop
                            # cannot count it again
                            processed.add(n_t2)
                            match_count += 1
                    update(counts, method, match_count)
                if verbose:
                    uprint("[processed]", processed)
                counts.unmatch += len(for1 - processed)

                # record whether there was no recall / no precision at all
                if len(processed) == 0:
                    text = "no_precision" if is_precision else "no_recall"
                    item1['eval_result'].append(text)
Пример #3
0
 def inspect_global_freq(self):
     """Print the global feature-frequency table.

     Aggregates the per-tag feature counts stored in
     ``self.total_tag_to_features__dict`` into one global counter, then
     prints the ranked (feature, count) pairs followed by all features
     joined into a single unicode string.
     """
     global_freq = defaultdict(int)
     for tag, features in self.total_tag_to_features__dict.iteritems():
         for feature, count in features.iteritems():
             global_freq[feature] += count
     # Rank once and reuse: most_common() sorts the whole table, so the
     # original code paid that cost twice for the same result.
     ranked = Counter(global_freq).most_common()
     uprint(ranked)
     uprint(u''.join([pair[0] for pair in ranked]))
Пример #4
0
    def test_test_data(self):
        """Smoke-test recommend_tags against data_list_test.

        Currently disabled: the early ``return True`` below makes the
        remainder of the body unreachable dead code, kept for reference.
        """
        return True
        match_count = 0
        for record1 in data_list_test:  # 4 test items in total
            item1 = FoobarModel(record1)
            # an item counts as matched when at least one tag overlaps
            common_tags1 = set(recommend_tags(item1)) & set(item1.tags)
            uprint('[common_tags]', common_tags1)
            if common_tags1:
                match_count += 1

        match_rate = match_count / float(len(data_list_test))
        print "[match_rate]", match_rate
Пример #5
0
    def test_test_data(self):
        """Smoke-test recommend_tags against data_list_test.

        Currently disabled: the early ``return True`` below makes the
        remainder of the body unreachable dead code, kept for reference.
        """
        return True
        match_count = 0
        for record1 in data_list_test:  # 4 test items in total
            item1 = FoobarModel(record1)
            # an item counts as matched when at least one tag overlaps
            common_tags1 = set(recommend_tags(item1)) & set(item1.tags)
            uprint('[common_tags]', common_tags1)
            if common_tags1:
                match_count += 1

        match_rate = match_count / float(len(data_list_test))
        print "[match_rate]", match_rate
Пример #6
0
    def inspect(self, name=None):
        """Debug helper: dump the internal state selected by *name*.

        'name_to_nodes' and 'feature_to_nodes' are placeholders (no-ops);
        'features_weight' prints every node's ranked feature weights;
        ``None`` prints the object itself.
        """
        if name in ('name_to_nodes', 'feature_to_nodes'):
            pass  # not implemented yet

        if name == 'features_weight':
            for node_name, node_group in self.name_to_nodes.iteritems():
                for node in node_group:
                    ranked = Counter(node.features_weight).most_common()
                    uprint(node.name, ranked, "\n")

        if name is None:
            uprint(self)
Пример #7
0
    def inspect(self, name=None):
        """Debug helper: dump the internal state selected by *name*.

        'name_to_nodes' and 'feature_to_nodes' are placeholders (no-ops);
        'features_weight' prints every node's ranked feature weights;
        ``None`` prints the object itself.
        """
        if name in ('name_to_nodes', 'feature_to_nodes'):
            pass  # not implemented yet

        if name == 'features_weight':
            for node_name, node_group in self.name_to_nodes.iteritems():
                for node in node_group:
                    ranked = Counter(node.features_weight).most_common()
                    uprint(node.name, ranked, "\n")

        if name is None:
            uprint(self)
Пример #8
0
    def extract_features_weight(self, item1):
        """Return a Counter of feature weights for *item1*.

        Every unicode character of ``item1.item_content`` becomes a
        feature; the stop list is applied in place before returning.
        """
        assert isinstance(item1.item_content, unicode)

        # Segmentation-based features were tried and had no effect:
        #segment_features = self.classify.documents_with_segments.get(item1.item_id, None)
        #if not segment_features: segment_features = Counter(jieba_parse(item1.item_content))
        seg_counts = Counter({})

        char_counts = Counter(list(item1.item_content))

        combined = seg_counts + char_counts

        self.filter_by_stop_list(combined)
        if self.debug:
            uprint("[mix_features]", combined)
        return combined
Пример #9
0
    def extract_features_weight(self, item1):
        """Return a Counter of feature weights for *item1*.

        Every unicode character of ``item1.item_content`` becomes a
        feature; the stop list is applied in place before returning.
        """
        assert isinstance(item1.item_content, unicode)

        # Segmentation-based features were tried and had no effect:
        #segment_features = self.classify.documents_with_segments.get(item1.item_id, None)
        #if not segment_features: segment_features = Counter(jieba_parse(item1.item_content))
        seg_counts = Counter({})

        char_counts = Counter(list(item1.item_content))

        combined = seg_counts + char_counts

        self.filter_by_stop_list(combined)
        if self.debug:
            uprint("[mix_features]", combined)
        return combined
Пример #10
0
        def inspect_result(result, filter_item_ids=set([])):
            for idx1, two_parts in enumerate(result):
                print "第", idx1 + 1, "个"
                original_tags, recommend_data = two_parts
                if recommend_data['item_id'] not in filter_item_ids:
                    continue

                print "试题ID", recommend_data['item_id']
                print "试题内容", recommend_data['item_content']
                uprint(u"关键词列表 => 熵", recommend_data['features_weight'])
                uprint(u"原始标签:", original_tags)
                uprint(u"推荐标签:", recommend_data['recommend_tags'])
                uprint(u"推荐细节:", recommend_data['recommend_tags_detail'])
                print "\n" * 3
Пример #11
0
        def inspect_result(result, filter_item_ids=set([])):
            for idx1, two_parts in enumerate(result):
                print "第", idx1 + 1, "个"
                original_tags, recommend_data = two_parts
                if recommend_data['item_id'] not in filter_item_ids:
                    continue

                print "试题ID", recommend_data['item_id']
                print "试题内容", recommend_data['item_content']
                uprint(u"关键词列表 => 熵", recommend_data['features_weight'])
                uprint(u"原始标签:", original_tags)
                uprint(u"推荐标签:", recommend_data['recommend_tags'])
                uprint(u"推荐细节:", recommend_data['recommend_tags_detail'])
                print "\n" * 3
Пример #12
0
        def recommend_tags(item1):
            """Recommend tags for *item1* by mixing two similarity engines.

            A feature (association-rule) score and a unicode text score
            are combined per tag; the top ``default_guess_count``
            candidates are kept and then trimmed to those within
            ``mix_score_percent`` of the best score.
            Note: item1 is generally expected to be persisted.
            """
            # TODO possibly optimize node1.features_weight into a flat array

            # A. feature similarity
            result_rule = self.association_rule_learning_engine(item1)

            # B. text similarity
            result_text = self.text_similarity_engine(item1, result_rule['data'])

            # Combined score per tag: feature score plus weighted text
            # score, sorted descending.
            count_feature, count_text = result_rule['counter'], result_text['counter']
            result_mix = sorted([[t1[0],
                                  count_feature[t1[0]] + count_text[t1[0]] * self.mix_unicode_coefficient]
                                 for t1 in result_rule['data']],
                                key=lambda i1: -i1[1])

            print "=" * 60
            uprint(u"题目ID", item1.item_id)
            uprint(u"题目content", item1.item_content)
            print
            uprint(u"original 知识点", self.model.tags_model__extract_tags(item1))
            uprint(u"features 相似度", result_rule['data'])
            uprint(u"unicode  相似度", result_text['data'])
            uprint(u"mix      相似度", result_mix)

            if result_mix:
                candidate_tags  = result_mix[0:self.default_guess_count]

                # Keep only candidates close to the best score -- improves
                # precision when more than two tags are recommended.
                max_score = max([i1[1] for i1 in result_mix])
                candidate_tags  = filter(lambda i1: i1[1] >= (max_score * self.mix_score_percent), candidate_tags)
            else:
                candidate_tags  = []
            uprint(u"[final]", candidate_tags)
            print "\n" * 3

            candidate_tags = [{"name": name1, "ids": self.tags_tree.fetch_name_ids(name1),
                               "weight": weight1} for name1, weight1 in candidate_tags]

            return {
                "item_id": item1.item_id,
                "item_content": item1.item_content,
                "recommend_tags": candidate_tags,
                "original_tags": self.model.tags_model__extract_tags(item1),
            }
Пример #13
0
        def recommend_tags(item1):
            """Recommend tags for *item1* by mixing two similarity engines.

            A feature (association-rule) score and a unicode text score
            are combined per tag; the top ``default_guess_count``
            candidates are kept and then trimmed to those within
            ``mix_score_percent`` of the best score.
            Note: item1 is generally expected to be persisted.
            """
            # TODO possibly optimize node1.features_weight into a flat array

            # A. feature similarity
            result_rule = self.association_rule_learning_engine(item1)

            # B. text similarity
            result_text = self.text_similarity_engine(item1,
                                                      result_rule['data'])

            # Combined score per tag: feature score plus weighted text
            # score, sorted descending.
            count_feature, count_text = result_rule['counter'], result_text[
                'counter']
            result_mix = sorted([[
                t1[0], count_feature[t1[0]] +
                count_text[t1[0]] * self.mix_unicode_coefficient
            ] for t1 in result_rule['data']],
                                key=lambda i1: -i1[1])

            print "=" * 60
            uprint(u"题目ID", item1.item_id)
            uprint(u"题目content", item1.item_content)
            print
            uprint(u"original 知识点", self.model.tags_model__extract_tags(item1))
            uprint(u"features 相似度", result_rule['data'])
            uprint(u"unicode  相似度", result_text['data'])
            uprint(u"mix      相似度", result_mix)

            if result_mix:
                candidate_tags = result_mix[0:self.default_guess_count]

                # Keep only candidates close to the best score -- improves
                # precision when more than two tags are recommended.
                max_score = max([i1[1] for i1 in result_mix])
                candidate_tags = filter(
                    lambda i1: i1[1] >= (max_score * self.mix_score_percent),
                    candidate_tags)
            else:
                candidate_tags = []
            uprint(u"[final]", candidate_tags)
            print "\n" * 3

            candidate_tags = [{
                "name": name1,
                "ids": self.tags_tree.fetch_name_ids(name1),
                "weight": weight1
            } for name1, weight1 in candidate_tags]

            return {
                "item_id": item1.item_id,
                "item_content": item1.item_content,
                "recommend_tags": candidate_tags,
                "original_tags": self.model.tags_model__extract_tags(item1),
            }