Example #1
import jieba
import numpy as np
import pandas as pd
# WordEmbedding and TFIDF are project-local helpers assumed importable here.


class SentenceVec:
    def __init__(self,
                 sentence_vec_path='D:\\data\\videos\\title_vec.csv',
                 model=None):
        self.sentence_vec_path = sentence_vec_path
        # Defer building the default WordEmbedding rather than constructing
        # it in the argument list, where it would run once at import time.
        self.model = model if model is not None else WordEmbedding()
        self.idf, self.median = TFIDF().idf_loader.get_idf()

    def get_tfidf(self, sentence):
        """Compute a TF-IDF weight for every word in the sentence."""
        words = jieba.cut(sentence)
        freq = {}
        for w in words:
            freq[w] = freq.get(w, 0.0) + 1.0
        total = sum(freq.values())
        tfidf_dict = {}
        for w in freq.keys():
            weight = self.idf.get(w, 0.) * freq[w] / total
            tfidf_dict[w] = weight

        return tfidf_dict

    def get_sentence_vec(self, sentence):
        """Unweighted average of the word vectors in the sentence."""
        seg_list = self.model.__jieba_cut__(sentence)
        sen_vec = np.zeros(200)
        for word in seg_list:
            sen_vec += self.model.get_word_vec(word)
        # Guard against an empty segmentation to avoid division by zero.
        return sen_vec / len(seg_list) if seg_list else sen_vec

    def get_weight_sent_vec(self, sentence):
        """
        TF-IDF-weighted average of the word vectors.
        :param sentence: raw sentence string
        :return: 200-dimensional numpy vector
        """
        tfidf_dict = self.get_tfidf(sentence)
        weight_total = sum(tfidf_dict.values())

        sen_vec = np.zeros(200)
        for word in jieba.cut(sentence):
            sen_vec += self.model.get_word_vec(word) * tfidf_dict.get(word, 0.0)

        if weight_total > 0.0:
            return sen_vec / weight_total
        return sen_vec

    def save_sentence_vec(self, sentence):
        """Append the sentence and its vector to the CSV on disk."""
        sen_vec = pd.read_csv(self.sentence_vec_path, header=0,
                              encoding='utf-8-sig')
        # Series.append never persisted anything here; build a new row and
        # write the frame back instead.
        new_row = pd.DataFrame({'title': [sentence],
                                'title_vec': [self.get_sentence_vec(sentence)]})
        sen_vec = pd.concat([sen_vec, new_row], ignore_index=True)
        sen_vec.to_csv(self.sentence_vec_path, index=False,
                       encoding='utf-8-sig')
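
A minimal usage sketch (hypothetical: WordEmbedding and TFIDF are project-local helpers whose interfaces are assumed from the calls above, and the path and sentences are placeholders):

sv = SentenceVec(sentence_vec_path='titles.csv')
weighted = sv.get_weight_sent_vec('...')  # TF-IDF-weighted 200-d vector
plain = sv.get_sentence_vec('...')        # unweighted average vector
sv.save_sentence_vec('...')               # append the title and its vector to the CSV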
Example #2
import jieba
import jieba.posseg
from gensim.models import Word2Vec
# TFIDF is a project-local helper assumed to be importable here.


class GenerateKeyWords(object):
    def __init__(self,
                 PATH_DICT='dict.txt',
                 PATH_STOPWORDS='stopwords.txt',
                 PATH_WORD2VEC_MODEL='wiki.zh.model',
                 TOPK_TITLE=5,
                 TOPK_VOICE=10,
                 ENCODING_STOPWORDS='utf-8-sig',
                 ONLY_NV=True,
                 HMM=False):
        """
        Initialization: load the models and parameters.
        :param PATH_DICT: path to the user dictionary (the curated points of interest)
        :param PATH_STOPWORDS: path to the stopword list (the earlier stopwords plus the newly curated ones)
        :param PATH_WORD2VEC_MODEL: word-vector model (four files on disk)
        :param TOPK_TITLE: number of keywords to extract from the title, default 5
        :param TOPK_VOICE: number of keywords to extract from the voice text, default 10
        :param ENCODING_STOPWORDS: encoding of the stopword file, default utf-8-sig (drops a BOM if present)
        :param ONLY_NV: whether to filter by part of speech; defaults to True, since
            keeping only nouns and verbs gives the best results. 'x' marks an unknown
            part of speech; some user-dictionary words are tagged 'x' and must be kept.
        :param HMM: whether to use the HMM during segmentation; results are better
            without it. In jieba the HMM mainly serves new-word discovery.
        """
        print('loading model ...')
        self._path_dict = PATH_DICT
        self._path_stopwords = PATH_STOPWORDS
        self._path_words2vec = PATH_WORD2VEC_MODEL

        jieba.load_userdict(self._path_dict)
        with open(self._path_stopwords, 'r',
                  encoding=ENCODING_STOPWORDS) as sw:
            self._stop_words = [line.strip() for line in sw.readlines()]
        self._idf, _ = TFIDF().idf_loader.get_idf()
        self._model = Word2Vec.load(self._path_words2vec)
        self._topk_title = TOPK_TITLE
        self._topk_voice = TOPK_VOICE
        self._only_nv = ONLY_NV
        self._hmm = HMM
        self.predicted_poi = []
        print('model loaded successfully!')

    def __simi(self, word1, word2):
        """
        Word-to-word similarity.
        :param word1:
        :param word2:
        :return: similarity as float64
        """
        try:
            value = 0.0
            # gensim >= 4 exposes vector lookups on model.wv rather than the model.
            if word1 in self._model.wv and word2 in self._model.wv:
                value = self._model.wv.similarity(word1, word2)
            return value
        except KeyError:
            return 0.0

    def __compute_tfidf(self, sentence, topk=10):
        """
        Extract keywords from a sentence via TF-IDF.
        :param sentence: segmented sentence as a comma-separated string
        :param topk: number of keywords to extract
        :return: list of keywords, sorted by weight, highest first
        """
        words = sentence.split(',')
        freq = {}
        for w in words:
            freq[w] = freq.get(w, 0.0) + 1.0

        total = sum(freq.values())
        dict_weights = {}
        for w in freq.keys():
            weight = self._idf.get(w, 0.) * freq[w] / total
            dict_weights[w] = weight

        if len(dict_weights) < 1:
            return []

        sorted_pair = sorted(dict_weights.items(),
                             key=lambda x: x[1],
                             reverse=True)
        new_words, _ = zip(*sorted_pair)

        topk = min(len(new_words), topk)
        return list(new_words[:topk])

    def __jieba_cut(self, sentence, n_v=True, HMM=True):
        """
        Segment with jieba and drop stopwords.
        :param sentence: input sentence
        :param n_v: whether to filter by part of speech, keeping only nouns and verbs
        :param HMM: whether to enable jieba's HMM
        :return: string, e.g. 'ac,adsa,qwe'
        """
        if n_v:
            voice_tok = jieba.posseg.cut(sentence, HMM=HMM)
            tok_list = []
            for w in voice_tok:
                # posseg yields (word, flag) pairs, so compare w.word,
                # not the pair itself, against the stopword list.
                if w.word not in self._stop_words:
                    # Keep only nouns and verbs; some user-dictionary words
                    # are tagged 'x' (unknown) and must also be kept.
                    if w.flag[0] in ('n', 'v', 'x'):
                        tok_list.append(w.word)
            return ','.join(tok_list)
        else:
            # jieba.cut(sentence, HMM) would bind HMM to cut_all positionally,
            # so the keyword form is required; it also yields plain strings.
            sentence_tok = jieba.cut(sentence, HMM=HMM)
            tok_list = [w for w in sentence_tok
                        if w not in self._stop_words]  # drop stopwords
            return ','.join(tok_list)

    def __list_clean(self, sentence_list):
        """
        Deduplicate and drop single-character words, preserving original order.
        :param sentence_list: [...]
        :return: [...]
        """
        words_dict = {}
        iterator = 0
        for w in sentence_list:
            w = w.strip()
            if w not in words_dict and len(w) > 1:
                words_dict[w] = iterator
                iterator += 1

        # Guard: zip(*...) on an empty list would raise ValueError.
        if not words_dict:
            return []

        sorted_w = sorted(words_dict.items(),
                          key=lambda x: x[1],
                          reverse=False)
        words, _ = zip(*sorted_w)
        return list(words)

    def generate(self, title, voice):
        """
        Generate the final keywords from the title and voice strings.
        :param title: title text, string
        :param voice: voice text, string
        :return: nothing; the result is stored on the instance as self.predicted_poi
        """
        # Segmentation
        title_cut = self.__jieba_cut(title, self._only_nv, self._hmm)
        voice_cut = self.__jieba_cut(voice, self._only_nv, self._hmm)
        # Keyword extraction
        important_title_words = self.__compute_tfidf(title_cut,
                                                     self._topk_title)
        important_voice_words = self.__compute_tfidf(voice_cut,
                                                     self._topk_voice)
        # Fallbacks when a keyword list is empty. The both-empty case must
        # come first, otherwise it is unreachable behind the single checks.
        if len(important_title_words) < 1 and len(important_voice_words) < 1:
            self.predicted_poi = []
        elif len(important_title_words) < 1:
            self.predicted_poi = important_voice_words
        elif len(important_voice_words) < 1:
            self.predicted_poi = important_title_words
        else:
            # Normal path
            simi_dict = {}
            # Pairwise similarity between title and voice keywords
            for w1 in important_title_words:
                for w2 in important_voice_words:
                    simi_value = self.__simi(w1, w2)
                    simi_dict[(w1, w2)] = simi_value

            # Sort pairs by similarity
            sorted_pairs = sorted(simi_dict.items(),
                                  key=lambda x: x[1],
                                  reverse=True)
            big_pairs, _ = zip(*sorted_pairs)

            # Deduplicate to obtain the final result
            important_words = []
            for tuple_p in big_pairs:
                important_words.append(tuple_p[0])
                important_words.append(tuple_p[1])
            self.predicted_poi = self.__list_clean(important_words)
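
A minimal usage sketch (hypothetical inputs; it assumes dict.txt, stopwords.txt, the gensim files for wiki.zh.model, and the project-local TFIDF helper are all in place):

kw = GenerateKeyWords(TOPK_TITLE=5, TOPK_VOICE=10)
kw.generate(title='...', voice='...')  # placeholders for real title/voice text
print(kw.predicted_poi)                # final deduplicated keyword list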