Example #1
0
    def combine_features(self, news):
        """Build the per-keyword feature matrix for one news item.

        Args:
            news: dict with at least 'newsId', 'title' and 'content' keys.

        Returns:
            A list of ``[[keyword], [feature, ...]]`` pairs, one per
            candidate keyword.

        Raises:
            ValueError: via ``str.index``/``str.rindex`` if a candidate
                keyword does not occur in title + content.
        """
        features = []
        # Hoist loop invariants: the original recomputed
        # news['title'] + news['content'] several times per keyword.
        full_text = news['title'] + news['content']
        title_len = len(news['title'])
        content_len = len(news['content'])

        if self.load_from_file is True:
            # Per-keyword features were pre-computed offline; lazily load
            # them the first time this path is taken.
            if self.feature_data_dict is None:
                self.feature_data_dict = fenci.get_fenci_feature_func(
                    '../jieba_fenci_model/result/result_jieba_fenci.txt')
            for ner in self.feature_data_dict[news['newsId']]:
                features.append([
                    [ner],
                    self.feature_data_dict[news['newsId']][ner] + [
                        len(ner),                    # keyword length
                        self.num_of_not_word(ner),   # count of non-word chars
                        news['content'].count(ner),  # frequency in body
                        news['title'].count(ner),    # frequency in title
                        full_text.count(ner),        # total frequency
                        full_text.index(ner),        # first occurrence offset
                        full_text.rindex(ner),       # last occurrence offset
                        title_len,                   # title length
                        content_len                  # body length
                    ]
                ])
            return features

        # Online path: score candidates with tf-idf and TextRank, then take
        # the union of the candidate sets produced by all four scorers.
        content_words_tfidf, title_words_tfidf = self.get_tfidf_Score(news)
        content_words_textRank, title_words_textRank = self.get_textRank_Score(
            news)
        keys = (content_words_tfidf.keys() | title_words_tfidf.keys()
                | content_words_textRank.keys()
                | title_words_textRank.keys())
        for ner in keys:
            features.append([
                [ner],
                [
                    content_words_tfidf.get(ner, 0),     # tf-idf in body
                    title_words_tfidf.get(ner, 0),       # tf-idf in title
                    content_words_textRank.get(ner, 0),  # TextRank in body
                    title_words_textRank.get(ner, 0),    # TextRank in title
                    len(ner),                            # keyword length
                    self.num_of_not_word(ner),           # count of non-word chars
                    news['content'].count(ner),          # frequency in body
                    news['title'].count(ner),            # frequency in title
                    full_text.count(ner),                # total frequency
                    full_text.index(ner),                # first occurrence offset
                    full_text.rindex(ner),               # last occurrence offset
                    title_len,                           # title length
                    content_len                          # body length
                    # probability of the keyword in the training set was
                    # tried here and dropped (hurt accuracy)
                ]
            ])
        return features
Example #2
0
    def combine_features(self, news):
        """Build the per-keyword feature matrix for one news item.

        This variant extends the base features with a BERT membership flag
        and LTP-derived labels.

        Args:
            news: dict with at least 'newsId', 'title' and 'content' keys.

        Returns:
            A list of ``[[keyword], [feature, ...]]`` pairs, one per
            candidate keyword.

        Raises:
            ValueError: via ``str.index``/``str.rindex`` if a candidate
                keyword does not occur in title + content.
        """
        features = []
        # Hoist loop invariants: the original recomputed
        # news['title'] + news['content'] several times per keyword.
        full_text = news['title'] + news['content']
        title_len = len(news['title'])
        content_len = len(news['content'])

        # NOTE(review): `if True:` force-enables the pre-segmented path,
        # overriding self.load_from_file (original condition kept below for
        # reference). Left unchanged to preserve behavior — confirm intent.
        if True:
            # if self.load_from_file is True:
            if self.feature_data_dict is None:
                print('加载预分词')
                self.feature_data_dict = fenci.get_fenci_feature_func(
                    '../jieba_fenci_model/result/result_jieba_fenci.txt')
            for ner in self.feature_data_dict[news['newsId']]:
                features.append([
                    [ner],
                    self.feature_data_dict[news['newsId']][ner] + [
                        len(ner),                    # keyword length
                        self.num_of_not_word(ner),   # count of non-word chars
                        news['content'].count(ner),  # frequency in body
                        news['title'].count(ner),    # frequency in title
                        full_text.count(ner),        # total frequency
                        full_text.index(ner),        # first occurrence offset
                        full_text.rindex(ner),       # last occurrence offset
                        title_len,                   # title length
                        content_len,                 # body length
                        self.bert_obj.is_in_bert(news['newsId'], ner),
                        self.ltp_obj.get_label(ner, news['newsId'], model=3)
                        # probability of the keyword in the training set was
                        # tried here and dropped (hurt accuracy)
                    ] + self.ltp_obj.get_label(ner, news['newsId'], model=1)
                ])
            return features

        # Online path (currently unreachable because of `if True:` above):
        # score candidates with tf-idf and TextRank, union the candidates.
        content_words_tfidf, title_words_tfidf = self.get_tfidf_Score(news)
        content_words_textRank, title_words_textRank = self.get_textRank_Score(
            news)
        keys = (content_words_tfidf.keys() | title_words_tfidf.keys()
                | content_words_textRank.keys()
                | title_words_textRank.keys())
        for ner in keys:
            # POS feature: index of the keyword's POS tag within the
            # key-word POS list, defaulting to the noun tag ('n'), scaled
            # by 0.1. Hoisted from the original inline conditional for
            # readability; behavior is identical.
            if ner in self.word_pos and self.word_pos[ner] in self.key_word_pos:
                pos_index = self.key_word_pos.index(self.word_pos[ner])
            else:
                pos_index = self.key_word_pos.index('n')
            features.append([
                [ner],
                [
                    content_words_tfidf.get(ner, 0),     # tf-idf in body
                    title_words_tfidf.get(ner, 0),       # tf-idf in title
                    content_words_textRank.get(ner, 0),  # TextRank in body
                    title_words_textRank.get(ner, 0),    # TextRank in title
                    len(ner),                            # keyword length
                    self.num_of_not_word(ner),           # count of non-word chars
                    news['content'].count(ner),          # frequency in body
                    news['title'].count(ner),            # frequency in title
                    full_text.count(ner),                # total frequency
                    full_text.index(ner),                # first occurrence offset
                    full_text.rindex(ner),               # last occurrence offset
                    title_len,                           # title length
                    content_len,                         # body length
                    pos_index * 0.1,                     # scaled POS feature
                    self.bert_obj.is_in_bert(news['newsId'], ner),
                    self.ltp_obj.get_label(ner, news['newsId'], model=3)
                    # probability of the keyword in the training set was
                    # tried here and dropped (hurt accuracy)
                ] + self.ltp_obj.get_label(ner, news['newsId'], model=1)
            ])
        return features
Example #3
0
    def combine_features(self, news):
        """Build the per-keyword feature matrix for one news item.

        Args:
            news: dict with at least 'newsId', 'title' and 'content' keys.

        Returns:
            A list of ``[[keyword], [feature, ...]]`` pairs, one per
            candidate keyword.

        Raises:
            ValueError: via ``str.index``/``str.rindex`` if a candidate
                keyword does not occur in title + content.
        """
        features = []
        # Hoist loop invariants: the original recomputed
        # news['title'] + news['content'] several times per keyword.
        full_text = news['title'] + news['content']
        title_len = len(news['title'])
        content_len = len(news['content'])

        if self.load_from_file is True:
            # Per-keyword features were pre-computed offline; lazily load
            # them the first time this path is taken.
            if self.feature_data_dict is None:
                self.feature_data_dict = fenci.get_fenci_feature_func(
                    '../jieba_fenci_model/result/result_jieba_fenci.txt')
            for ner in self.feature_data_dict[news['newsId']]:
                features.append([
                    [ner],
                    self.feature_data_dict[news['newsId']][ner] + [
                        len(ner),                    # keyword length
                        self.num_of_not_word(ner),   # count of non-word chars
                        news['content'].count(ner),  # frequency in body
                        news['title'].count(ner),    # frequency in title
                        full_text.count(ner),        # total frequency
                        full_text.index(ner),        # first occurrence offset
                        full_text.rindex(ner),       # last occurrence offset
                        title_len,                   # title length
                        content_len                  # body length
                    ]
                ])
            return features

        # Online path: score candidates with tf-idf and TextRank, then take
        # the union of the candidate sets produced by all four scorers.
        # (The original re-assigned `features = []` here; removed — the
        # list is provably still empty since the branch above returns.)
        content_words_tfidf, title_words_tfidf = self.get_tfidf_Score(news)
        content_words_textRank, title_words_textRank = self.get_textRank_Score(
            news)
        keys = (content_words_tfidf.keys() | title_words_tfidf.keys()
                | content_words_textRank.keys()
                | title_words_textRank.keys())
        for ner in keys:
            features.append([
                [ner],
                [
                    content_words_tfidf.get(ner, 0),     # tf-idf in body
                    title_words_tfidf.get(ner, 0),       # tf-idf in title
                    content_words_textRank.get(ner, 0),  # TextRank in body
                    title_words_textRank.get(ner, 0),    # TextRank in title
                    len(ner),                            # keyword length
                    self.num_of_not_word(ner),           # count of non-word chars
                    news['content'].count(ner),          # frequency in body
                    news['title'].count(ner),            # frequency in title
                    full_text.count(ner),                # total frequency
                    full_text.index(ner),                # first occurrence offset
                    full_text.rindex(ner),               # last occurrence offset
                    title_len,                           # title length
                    content_len                          # body length
                ]
            ])
        # Feature-matrix normalization was tried here (sklearn normalize)
        # and dropped because it hurt accuracy.
        return features