示例#1
0
文件: summ.py 项目: jannson/Similar
def summarize4(sents, docs=None):
    """Return a summary string built from the top-ranked sentences.

    Args:
        sents: Indexable sequence of sentence strings.
        docs: Optional per-sentence token lists. When falsy, it is built
            by running ``Tokenize`` over every sentence in ``sents``.

    Returns:
        The sentences picked by ``top_index(3)``, in ascending index
        order, joined with ``"。 "``, with every CR and LF removed and a
        trailing ``"。"`` appended.
    """
    if not docs:
        docs = [list(Tokenize(sent)) for sent in sents]

    # bm25_weights' output is the input TextRank ranks over.
    ranker = TextRank(bm25_weights(docs))
    ranker.solve()

    # Sorting the indices keeps the chosen sentences in their order
    # within ``sents`` rather than in rank order.
    picked = [sents[position] for position in sorted(ranker.top_index(3))]

    joined = u"。 ".join(picked)
    flattened = joined.replace("\r", "").replace("\n", "")
    return flattened + u"。"
示例#2
0
文件: summ.py 项目: jannson/Similar
def summarize4(sents, docs=None):
    """Summarize ``sents`` as one string of its top-ranked sentences.

    ``docs`` may supply pre-tokenized sentences; when it is falsy the
    tokens are produced with ``Tokenize``. The result of
    ``bm25_weights(docs)`` is ranked by ``TextRank``, the indices from
    ``top_index(3)`` are sorted ascending, and the matching sentences are
    joined with '。 ', stripped of CR/LF characters, and closed with '。'.
    """
    # Fall back to tokenizing here whenever the caller gave nothing usable.
    docs = docs or [list(Tokenize(sentence)) for sentence in sents]

    text_rank = TextRank(bm25_weights(docs))
    text_rank.solve()

    # Ascending index order == the order the sentences have in ``sents``.
    chosen = sorted(text_rank.top_index(3))
    body = u'。 '.join(sents[i] for i in chosen)
    for line_break in ('\r', '\n'):
        body = body.replace(line_break, '')
    return body + u'。'
示例#3
0
class Order:
    """Keyword and summary extraction over a single piece of text.

    The text is split into sentences, each sentence is segmented with
    ``seg`` and filtered against ``stop_words``, and the resulting token
    lists are handed to ``KeywordRank`` or ``TextRank``.
    """

    # Compiled once for the class instead of on every get_sentences() call.
    _LINE_BREAK_RE = re.compile('[\r\n]')
    _DELIMITER_RE = re.compile('[,。?!;]')

    def __init__(self, text, seg=None, tagger=None):
        """Store ``text`` and resolve the segmenter and tagger.

        Args:
            text: The text to analyse.
            seg: Segmenter exposing ``seg(sentence)``; when ``None`` one is
                loaded through ``get_seg()``.
            tagger: Tagger object; when ``None`` one is loaded through
                ``get_tagger()``.
        """
        self.text = text
        self.tagger = tagger if tagger is not None else self.get_tagger()
        self.seg = seg if seg is not None else self.get_seg()
        # NOTE(review): nothing in this class ever assigns a real merger
        # here, so get_keywords(merge=True) raises AttributeError unless the
        # caller sets ``words_merge`` first -- confirm the intended merger.
        self.words_merge = None

    def _segmented_sentences(self):
        """Return ``(sentences, doc)``: sentences and their filtered tokens."""
        sentences = self.get_sentences()
        doc = [
            self.filter_stop(list(self.seg.seg(sentence)))
            for sentence in sentences
        ]
        return sentences, doc

    def get_keywords(self, limit=5, merge=False):
        """Return the items ``KeywordRank.top_index(limit)`` yields.

        Args:
            limit: Passed straight to ``top_index``.
            merge: When true, the items are passed with the text to
                ``self.words_merge.merge(...)`` and the result of calling
                ``merge()`` on what that returns is returned instead.

        Returns:
            A list of the top items, or the merged result when ``merge``
            is true.
        """
        _, doc = self._segmented_sentences()

        # The solved ranker is kept on the instance, as before.
        self.keywordrank = KeywordRank(doc)
        self.keywordrank.solve()
        result = list(self.keywordrank.top_index(limit))

        if merge:
            wm = self.words_merge.merge(self.text, result)
            return wm.merge()
        return result

    def get_summaries(self, limit=5):
        """Return the sentences at the indices ``TextRank.top_index(limit)`` yields.

        The sentences are returned in the order ``top_index`` produces the
        indices; they are not re-sorted.
        """
        sentences, doc = self._segmented_sentences()

        # The solved ranker is kept on the instance, as before.
        self.textrank = TextRank(doc)
        self.textrank.solve()
        return [sentences[index] for index in self.textrank.top_index(limit)]

    def get_sentences(self):
        """Split ``self.text`` into non-empty, whitespace-stripped sentences.

        The text is first cut on CR/LF, then each non-blank line is cut on
        the delimiter characters; empty fragments are dropped.
        """
        sentences = []
        for line in self._LINE_BREAK_RE.split(self.text):
            line = line.strip()
            if not line:
                continue
            for fragment in self._DELIMITER_RE.split(line):
                fragment = fragment.strip()
                if fragment:
                    sentences.append(fragment)
        return sentences

    def get_seg(self, fname='seg.pickle'):
        """Create a ``Seg``, call its ``load(fname)`` and return it."""
        seg = Seg()
        seg.load(fname)
        return seg

    def get_tagger(self, fname='tag.pickle'):
        """Create a ``Tag``, call its ``load(fname)`` and return it."""
        tagger = Tag()
        tagger.load(fname)
        return tagger

    def filter_stop(self, words):
        """Return a list of the entries of ``words`` not in ``stop_words``."""
        return [word for word in words if word not in stop_words]