Example #1
    def document(self):
        # a abbr acronym b big blink blockquote cite code
        # dd del dfn dir dl dt em h h1 h2 h3 h4
        # h5 h6 i ins kbd li marquee menu ol pre q
        # s samp strike strong sub sup tt u ul var
        headers = 'h1', 'h2', 'h3'
        annotated_text = self._article.main_text
        paragraphs = []

        for paragraph in annotated_text:
            sentences, current_text = [], ''

            for (text, annotations) in paragraph:

                if annotations and any(h_tag in annotations
                                       for h_tag in headers):
                    sentences.append(
                        Sentence(text, self._tokenizer, is_heading=True))

                elif not (annotations and 'pre' in annotations):
                    # skip <pre> nodes
                    current_text += ' ' + text

            new_sentences = self.tokenize_sentences(current_text)
            sentences.extend(
                Sentence(s, self._tokenizer) for s in new_sentences)
            paragraphs.append(Paragraph(sentences))

        return ObjectDocumentModel(paragraphs)
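A minimal usage sketch (not part of the original example; the URL and language are placeholders): this method is normally reached through sumy's HtmlParser, whose document property returns the ObjectDocumentModel assembled above, and heading sentences and body sentences can then be read back per paragraph.

from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.html import HtmlParser

# placeholder URL and language; substitute your own
parser = HtmlParser.from_url("https://example.com/article", Tokenizer("english"))
document = parser.document  # built by the method shown above

for paragraph in document.paragraphs:
    for heading in paragraph.headings:    # sentences created with is_heading=True
        print("HEADING:", heading)
    for sentence in paragraph.sentences:  # regular body sentences
        print(sentence)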
Example #2
def create_sumy_dom(text, tokenizer):
    """Creates a sumy style document from the sentences.
    **TODO: Assumes that paragraphs are specified by lines starting with a space """
    from sumy.models.dom import Sentence, Paragraph, ObjectDocumentModel

    paragraphs = []
    paragraph = []
    for ii, line in enumerate(text):
        if not line.startswith(' ') and ii > 0:  # previous line closed its paragraph
            paragraphs.append(Paragraph(paragraph))  # Dump paragraph
            paragraph = []  # start new paragraph going forward
        # Process current line
        paragraph.append(Sentence(line, tokenizer))
        if ii + 1 == len(text):  # if last line just dump
            paragraphs.append(Paragraph(paragraph))

    return ObjectDocumentModel(tuple(paragraphs))
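A short usage sketch for the helper above, under its stated assumption that a paragraph's continuation lines start with a space (the input lines below are illustrative):

from sumy.nlp.tokenizers import Tokenizer

lines = [
    "First sentence of the first paragraph.",
    " Second sentence, same paragraph.",
    "A line without a leading space starts the second paragraph.",
]
document = create_sumy_dom(lines, Tokenizer("english"))
print(len(document.paragraphs))  # 2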
Example #3
    def test_sentences_equal(self):
        sentence1 = Sentence("", Tokenizer("czech"))
        sentence2 = Sentence("", Tokenizer("czech"))
        self.assertEqual(sentence1, sentence2)

        sentence1 = Sentence("word another.", Tokenizer("czech"))
        sentence2 = Sentence("word another.", Tokenizer("czech"))
        self.assertEqual(sentence1, sentence2)

        sentence1 = Sentence("word another", Tokenizer("czech"))
        sentence2 = Sentence("another word", Tokenizer("czech"))
        self.assertNotEqual(sentence1, sentence2)
Example #4
def build_sentence(sentence_as_string, is_heading=False):
    return Sentence(sentence_as_string, _TOKENIZER, is_heading)
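The module-level _TOKENIZER used above is not shown in this example; a plausible definition (an assumption, not taken from the source) would be:

from sumy.models.dom import Sentence
from sumy.nlp.tokenizers import Tokenizer

_TOKENIZER = Tokenizer("english")  # assumed language; the original module may differ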
Example #5
def test_sentences_with_same_words_in_different_order_are_different():
    sentence1 = Sentence("word another", Tokenizer("czech"))
    sentence2 = Sentence("another word", Tokenizer("czech"))

    assert sentence1 != sentence2
Example #6
def test_same_sentences_equal():
    sentence1 = Sentence("word another.", Tokenizer("czech"))
    sentence2 = Sentence("word another.", Tokenizer("czech"))

    assert sentence1 == sentence2
Example #7
def test_empty_sentences_equal():
    sentence1 = Sentence("", Tokenizer("czech"))
    sentence2 = Sentence("", Tokenizer("czech"))

    assert sentence1 == sentence2
Example #8
    # create a list of reference sentences to calculate ROUGE_N scores
    ref_sentences = []
    trim_ref_sentences = []
    for paragraph in parser._article.main_text:
        for sections in paragraph:
            for sentences in sections:
                try:
                    if len(sentences) > 35:
                        # trim off super-short, likely few-word sentences
                        ref_sentences.append(sentences)
                except TypeError:
                    # catch TypeErrors caused by annotated text, e.g. h1, b, etc.
                    print("typeError")
                    continue
    trim_ref_sentences.extend(
        Sentence(s, Tokenizer(LANGUAGE)) for s in ref_sentences)

    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    # define summarizers for the summarizing methods being used
    summarizer_Lsa = Lsa(stemmer)
    summarizer_Lsa.stop_words = get_stop_words(LANGUAGE)
    summary_Lsa = summarizer_Lsa(parser.document, SENTENCES_COUNT)

    summarizer_LexRank = LexRank()
    summary_LexRank = summarizer_LexRank(parser.document, SENTENCES_COUNT)

    summarizer_Edmundson = Edmundson(stemmer)
    summarizer_Edmundson.null_words = get_stop_words(LANGUAGE)
Example #9
def summary(article_url):
    url = article_url
    #url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/potato"
    # url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/cabbage"
    # url = "http://www.encyclopedia.com/medicine/diseases-and-conditions/pathology/accident"
    # url = "http://www.encyclopedia.com/earth-and-environment/atmosphere-and-weather/atmospheric-and-space-sciences-atmosphere/air"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # create a list of reference sentences to calculate ROUGE_N scores
    ref_sentences = []
    trim_ref_sentences = []
    for paragraph in parser._article.main_text:
        for sections in paragraph:
            for sentences in sections:
                try:
                    if len(sentences) > 35:
                        # trim off super-short, likely few-word sentences
                        ref_sentences.append(sentences)
                except TypeError:
                    # catch TypeErrors caused by annotated text, e.g. h1, b, etc.
                    print("typeError")
                    continue
    trim_ref_sentences.extend(
        Sentence(s, Tokenizer(LANGUAGE)) for s in ref_sentences)

    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    # define summarizers for the summarizing methods being used
    summarizer_Lsa = Lsa(stemmer)
    summarizer_Lsa.stop_words = get_stop_words(LANGUAGE)
    summary_Lsa = summarizer_Lsa(parser.document, SENTENCES_COUNT)

    summarizer_LexRank = LexRank()
    summary_LexRank = summarizer_LexRank(parser.document, SENTENCES_COUNT)

    summarizer_Edmundson = Edmundson(stemmer)
    summarizer_Edmundson.null_words = get_stop_words(LANGUAGE)
    summarizer_Edmundson.bonus_words = parser.significant_words
    summarizer_Edmundson.stigma_words = parser.stigma_words
    summary_Edmundson = summarizer_Edmundson(parser.document, SENTENCES_COUNT)

    # print summaries
    summary_Lsa_trim = []
    for sentence in summary_Lsa:
        # trim off super-short, likely few-word sentences
        if len(sentence._text) > 20:
            print(sentence)
            summary_Lsa_trim.append(sentence)

    # calc rouge_n scores
    calc_value(summary_Lsa_trim, trim_ref_sentences)

    print('\n')
    summary_LexRank_trim = []
    for sentence in summary_LexRank:
        # trim off super-short, likely few-word sentences
        if len(sentence._text) > 20:
            print(sentence)
            summary_LexRank_trim.append(sentence)

    # calc rouge_n scores
    calc_value(summary_LexRank_trim, trim_ref_sentences)

    print('\n')
    summary_Edmundson_trim = []
    for sentence in summary_Edmundson:
        # trim off super-short, likely few-word sentences
        if len(sentence._text) > 20:
            print(sentence)
            summary_Edmundson_trim.append(sentence)

    # calc rouge_n scores
    calc_value(summary_Edmundson_trim, trim_ref_sentences)

    # max_r_value returns the index of the best model: 0=Edmundson, 1=Lsa, 2=LexRank
    models = {0: "Edmundson Model", 1: "Lsa Model", 2: "LexRank Model"}
    best_summary = max_r_value(summary_Lsa_trim, summary_LexRank_trim,
                               summary_Edmundson_trim, trim_ref_sentences)
    print(
        models.get(best_summary) +
        ' is the best model according to an average of the Rouge_3, 2 and 1 tests'
    )

    # return the summary of the best model
    if best_summary == 0:
        return summary_Edmundson_trim
    elif best_summary == 1:
        return summary_Lsa_trim
    elif best_summary == 2:
        return summary_LexRank_trim
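A hedged usage sketch for the function above: it assumes LANGUAGE, SENTENCES_COUNT and the helpers calc_value and max_r_value are defined elsewhere in the same module, as the code implies, and it reuses one of the URLs from the commented-out examples.

if __name__ == "__main__":
    best = summary("http://www.encyclopedia.com/plants-and-animals/plants/plants/potato")
    for sentence in best:
        print(sentence)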
Example #10
File: Quality.py  Project: mahrab/DPP
def to_sentence(text):
    text = to_unicode(text).strip()
    return Sentence(text, Tokenizer("english"))
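An illustrative call (the input string is made up); to_unicode(...).strip() removes the surrounding whitespace before the Sentence is built:

print(to_sentence("  An example sentence for the quality checks.  "))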