def count_word(self, text, use_type="text"):
    """Count n-gram frequencies of a sentence/paragraph/document.

    :param text: str, raw text like "大漠帝国。" (use_type="text"),
                 or a path like "/home/data/doc.txt" (use_type="file", utf-8 txt)
    :param use_type: str, "text" or "file"
    :return: None -- results are stored on the instance:
             self.words_count (Counter, ngram -> freq) and
             self.total_words (int, sum of all frequencies)
    :raises RuntimeError: if use_type is invalid, or the file path does not exist
    """
    self.words_count = Counter()
    if use_type == "text":  # input is a plain string
        self._count_ngrams_of_text(text)
    elif use_type == "file":  # input is a path to a utf-8 txt file
        if not os.path.exists(text):
            raise RuntimeError("path of text must exist!")
        # "with" guarantees the handle is closed even if parsing raises
        with open(text, "r", encoding="utf-8") as fr8:
            for line in fr8:
                if line.strip():  # skip blank lines
                    self._count_ngrams_of_text(line)
    else:
        raise RuntimeError("use_type must be 'text' or 'file'")
    self.total_words = sum(self.words_count.values())

def _count_ngrams_of_text(self, text):
    """Split *text* into sentences and accumulate every n-gram into self.words_count."""
    # cut_sentence splits on Chinese/English sentence delimiters (comma/period/exclamation)
    for sentence in cut_sentence(use_type=self.algorithm, text=text):
        # all n-grams (up to self.len_max) of one sentence
        n_grams = get_ngrams(use_type=self.algorithm, len_max=self.len_max, text=sentence)
        self.words_count.update(n_grams)
def deal_corpus(self):
    """Build the token vocabularies from self.corpus_path.

    Two corpus formats are supported:
      * a "term" dictionary file (path contains "term"): one token per line;
      * a json-lines corpus: each line is a dict like {"question": ...},
        tokenized according to self.level_type ("char" | "word" | "ngram").

    :return: None -- fills self.token2idx (token -> index) and
             self.idx2token (index -> token)
    :raises RuntimeError: unknown level_type, or corpus_path neither a
             "term" dict nor an existing file
    """
    import json
    # start indexing after the reserved/special tokens from self.ot_dict
    token2idx = self.ot_dict.copy()
    if "term" in self.corpus_path:
        # one term per line; readline() keeps the trailing "\n", so strip it,
        # otherwise every stored key carries a newline and never matches lookups
        with open(file=self.corpus_path, mode="r", encoding="utf-8") as fd:
            while True:
                term_one = fd.readline()
                if not term_one:  # EOF
                    break
                term_one = term_one.strip()
                # skip blank lines; first occurrence of a term wins
                if term_one and term_one not in token2idx:
                    token2idx[term_one] = len(token2idx)
    elif os.path.exists(self.corpus_path):
        with open(file=self.corpus_path, mode="r", encoding="utf-8") as fd:
            for line in fd.readlines():
                ques_label = json.loads(line.strip())
                # "question" may be a str or a list of fragments; join to one str
                term_one = "".join(ques_label["question"])
                if self.level_type == "char":
                    text = list(term_one.replace(" ", "").strip())
                elif self.level_type == "word":
                    text = macropodus_cut(term_one)
                elif self.level_type == "ngram":
                    text = get_ngrams(term_one, ns=self.ngram_ns)
                else:
                    raise RuntimeError(
                        "your input level_type is wrong, it must be 'word', 'char', 'ngram'"
                    )
                for text_one in text:
                    if text_one not in token2idx:
                        token2idx[text_one] = len(token2idx)
    else:
        raise RuntimeError(
            "your input corpus_path is wrong, it must be 'dict' or 'corpus'"
        )
    self.token2idx = token2idx
    # inverse mapping: index -> token
    self.idx2token = {value: key for key, value in self.token2idx.items()}
def summarize(self, text, num=320, title=None):
    """Rank the sentences of a text by a combined feature score.

    :param text: str (raw text, split into sentences here) or list (pre-split sentences)
    :param num: int, max number of ranked sentences to return
    :param title: str or None; when given, title-word overlap adds a score term
    :return: list of (score, sentence) tuples, highest score first
    :raises RuntimeError: if text is neither str nor list
    """
    # split into sentences
    if type(text) == str:
        self.sentences = cut_sentence(text)
    elif type(text) == list:
        self.sentences = text
    else:
        raise RuntimeError("text type must be list or str")
    self.title = title
    if self.title:
        # title is tokenized for the overlap score used by self.score_title
        self.title = macropodus_cut(title)
    # tokenize each sentence with POS tags (Chinese chars only via extract_chinese)
    self.sentences_tag_cut = [jieba_tag_cut(extract_chinese(sentence)) for sentence in self.sentences]
    # words only, POS tags dropped
    sentences_cut = [[jc for jc in jtc.keys()
                      ] for jtc in self.sentences_tag_cut]
    # remove stopwords etc.
    self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut]
    # word-frequency statistics
    self.words = []
    for sen in self.sentences_cut:
        self.words = self.words + sen
    self.word_count = dict(Counter(self.words))
    # per-word frequency score
    # NOTE(review): if every word is filtered out, len_words is 0 and this
    # divides by zero -- confirm callers never pass stopword-only text
    self.word_freqs = {}
    self.len_words = len(self.words)
    for k, v in self.word_count.items():
        self.word_freqs[k] = v * 0.5 / self.len_words
    # uni/bi/tri-gram features over the whole document
    [gram_uni, gram_bi, gram_tri] = get_ngrams("".join(self.sentences), ns=[1, 2, 3])
    ngrams = gram_uni + gram_bi + gram_tri
    self.ngrams_count = dict(Counter(ngrams))
    # sentence-position score
    scores_posi = self.score_position()
    # sentence-length score
    scores_length = self.score_length()
    # POS score: noun(1.2) - pronoun(0.8) - verb(1.0)
    scores_tag = self.score_tag()
    res_rank = {}
    self.res_score = []
    for i in range(len(sentences_cut)):
        sen_cut = self.sentences_cut[i]  # words of this sentence
        # n-gram score: how much this sentence's n-grams overlap the document's
        [gram_uni_, gram_bi_, gram_tri_] = get_ngrams(self.sentences[i], ns=[1, 2, 3])  # gram_uni_bi_tri(self.sentences[i])
        n_gram_s = gram_uni_ + gram_bi_ + gram_tri_
        # +1 in the denominator guards against empty n-gram lists
        score_ngram = sum([self.ngrams_count[ngs] if ngs in self.ngrams_count
                           else 0 for ngs in n_gram_s]) / (len(n_gram_s) + 1)
        # average word length within the sentence
        score_word_length_avg = sum([len(sc) for sc in sen_cut])/(len(sen_cut)+1)
        score_posi = scores_posi[i]
        score_length = scores_length[i]
        score_tag = scores_tag[i]
        if self.title:
            # weighted combination including the title-overlap term
            score_title = self.score_title(sen_cut)
            score_total = (score_title * 0.5 + score_ngram * 2.0 + score_word_length_avg * 0.5 +
                           score_length * 0.5 + score_posi * 1.0 + score_tag * 0.6) / 6.0
            # per-feature breakdown kept for inspection (header row + value row)
            self.res_score.append(["score_title", "score_ngram", "score_word_length_avg",
                                   "score_length", "score_posi", "score_tag"])
            self.res_score.append([score_title, score_ngram, score_word_length_avg,
                                   score_length, score_posi, score_tag, self.sentences[i]])
        else:
            # weighted combination without a title term
            score_total = (score_ngram * 2.0 + score_word_length_avg * 0.5 + score_length * 0.5 +
                           score_posi * 1.0 + score_tag * 0.6) / 5.0
            # per-feature breakdown kept for inspection (header row + value row)
            self.res_score.append(["score_ngram", "score_word_length_avg",
                                   "score_length", "score_posi", "score_tag"])
            self.res_score.append([score_ngram, score_word_length_avg,
                                   score_length, score_posi, score_tag, self.sentences[i]])
        # NOTE(review): keyed by stripped sentence text, so duplicate sentences
        # collapse to one entry (last score wins) -- confirm this is intended
        res_rank[self.sentences[i].strip()] = score_total
    # cap the result count: at most num, at most 60% of distinct words
    num_min = min(num, int(len(self.word_count) * 0.6))
    res_rank_sort = sorted(res_rank.items(), key=lambda rr: rr[1], reverse=True)
    res_rank_sort_reverse = [(rrs[1], rrs[0]) for rrs in res_rank_sort][0:num_min]
    return res_rank_sort_reverse