Пример #1
0
def summarize(document, all=True):
    """Build an extractive summary of `document`, save it and score it.

    The sentences are ranked with TextRank. The best-ranked ones are taken
    in rank order, at most `num` of them, stopping as soon as the next
    sentence would push the summary past MAXLEN space-separated words. The
    selection is logged, written one sentence per line to
    DIR['BASE'] + "data/Summary.txt", and evaluated with unigram ROUGE
    against the P10-1024 reference summary.

    Args:
        document: passed unchanged to Document().
        all: when true, rank doc.all_sentences(); otherwise rank
            doc.filtered_sentences(). The name shadows the builtin but is
            part of the public signature, so it is kept.

    Returns:
        None. All results are logged and written to disk.

    NOTE(review): concatenating `.encode('utf-8')` results with plain string
    literals and writing them to a text-mode file relies on Python 2 string
    semantics -- confirm before running under Python 3.
    """
    doc = Document(document)
    if all:
        sentences, offset = doc.all_sentences()
    else:
        sentences, offset = doc.filtered_sentences()

    # Ranker
    ranker = TextRank(sentences)
    ranker.rank()
    scores = ranker.scores

    # Selector
    summary = []
    sum_len = 0
    # Never index past the end of the ranking: a short document may yield
    # fewer than `num` ranked sentences.
    for rank in range(min(num, len(scores))):
        # The ranker's indices are relative to the sentence list it was
        # given; `offset` maps them back to positions in the document.
        idx = scores[rank][0] + offset
        sent = doc[idx].sentence
        word_count = len(sent.split(' '))
        if sum_len + word_count > MAXLEN:
            break
        summary.append((sent, doc.get_section_name(idx)))
        sum_len += word_count

    logit("\nP10-1024")
    logit("\nAll Sentences" if all else "\nFiltered Sentences")
    logit("Length of summary : " + str(sum_len))
    logit(''.join(['\n' + "[" + section.encode('utf-8') + "] " +
                   sent.encode('utf-8')
                   for sent, section in summary]))

    # Printer
    # TODO: the output location is hard-coded and should be automated.
    summary_path = DIR['BASE'] + "data/Summary.txt"
    with open(summary_path, 'w') as sfile:
        sfile.write('\n'.join([sent for sent, _ in summary]).encode('utf-8'))

    # Evaluator
    guess_summary_list = [summary_path]
    ref_summary_list = [[DIR['BASE'] + "data/P10-1024-Ref1.txt"]]
    recall, precision, F_measure = PythonROUGE(guess_summary_list,
                                               ref_summary_list,
                                               ngram_order=1)
    logit("Recall:{0} ; Precision:{1} ; F:{2}".format(recall, precision,
                                                      F_measure))
Пример #2
0
def summarize(document, all=True):
    """Rank, select, log, save and evaluate a summary of `document`.

    Steps:
      1. Rank the document's sentences with TextRank.
      2. Take sentences in rank order, at most `num` of them, and stop
         before the total number of space-separated words exceeds MAXLEN.
      3. Log the selection, each sentence prefixed with its section name.
      4. Write the sentences, one per line, to
         DIR['BASE'] + "data/Summary.txt".
      5. Log unigram ROUGE recall/precision/F against the P10-1024
         reference summary.

    Args:
        document: passed unchanged to Document().
        all: true to rank doc.all_sentences(), false to rank
            doc.filtered_sentences(). It shadows the builtin, but renaming
            it would break keyword callers.

    Returns:
        None.

    NOTE(review): the `.encode('utf-8')` concatenations and the text-mode
    write of an encoded string rely on Python 2 string semantics -- confirm
    before porting.
    """
    doc = Document(document)
    sentences, offset = (doc.all_sentences() if all
                         else doc.filtered_sentences())

    # Ranker
    ranker = TextRank(sentences)
    ranker.rank()
    scores = ranker.scores

    # Selector
    summary = []
    sum_len = 0
    # Clamp to the number of ranked sentences so a short document cannot
    # raise IndexError.
    available = min(num, len(scores))
    for position in range(available):
        # Ranker indices are relative to `sentences`; add `offset` to get
        # the index into the document.
        idx = scores[position][0] + offset
        sent = doc[idx].sentence
        n_words = len(sent.split(' '))
        if sum_len + n_words > MAXLEN:
            break
        summary.append((sent, doc.get_section_name(idx)))
        sum_len += n_words

    logit("\nP10-1024")
    logit("\nAll Sentences" if all else "\nFiltered Sentences")
    logit("Length of summary : " + str(sum_len))
    log_lines = ['\n' + "[" + section.encode('utf-8') + "] " +
                 sent.encode('utf-8')
                 for sent, section in summary]
    logit(''.join(log_lines))

    # Printer
    # TODO: the output location is hard-coded and should be automated.
    out_path = DIR['BASE'] + "data/Summary.txt"
    chosen = [sent for sent, _ in summary]
    with open(out_path, 'w') as sfile:
        sfile.write('\n'.join(chosen).encode('utf-8'))

    # Evaluator
    guess_summary_list = [out_path]
    ref_summary_list = [[DIR['BASE'] + "data/P10-1024-Ref1.txt"]]
    recall, precision, F_measure = PythonROUGE(guess_summary_list,
                                               ref_summary_list,
                                               ngram_order=1)
    logit("Recall:{0} ; Precision:{1} ; F:{2}".format(recall, precision,
                                                      F_measure))
Пример #3
0
def summarize_secitons(document, sections, coef=0.8):
    """Summarize `document` one section at a time and save the result.

    For each name in `sections`, a SectionMMR ranker is built from all
    sentences of the document and ranked with that section's offset and
    sentence count. The best-ranked sentences, at most `num` per section,
    are logged and collected. The collected sentences are finally written,
    one per line, to DIR['BASE'] + "data/Summary.txt".

    Args:
        document: logged, then passed unchanged to Document().
        sections: iterable of section names accepted by
            doc.section_sentences().
        coef: forwarded to SectionMMR.rank() as `coef`.

    Returns:
        None.

    NOTE(review): concatenating `.encode('utf-8')` results with string
    literals relies on Python 2 string semantics -- confirm before porting.
    """
    logit(document)
    doc = Document(document)
    all_sentences, _ = doc.all_sentences()
    summ = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)
        limit = len(sec_sentences)

        # Ranker
        ranker = SectionMMR(all_sentences)
        ranker.rank(sec_offset=sec_offset, limit=limit, coef=coef)
        ranked = ranker.scores

        # Clamp to the number of ranked sentences so a short or empty
        # section cannot raise IndexError.
        picked = [doc[ranked[pos][0] + sec_offset].sentence
                  for pos in range(min(num, len(ranked)))]
        summ.extend(picked)

        logit("\nSection : " + section_name)
        logit(''.join(['\n' + sent.encode('utf-8') for sent in picked]))

    summary_path = DIR['BASE'] + "data/Summary.txt"
    with open(summary_path, 'w') as sfile:
        sfile.write('\n'.join(summ).encode('utf-8'))
Пример #4
0
def summarize_secitons(document, sections, coef=0.8):
    """Produce a per-section summary of `document` and write it to disk.

    Every requested section is ranked with a SectionMMR instance built from
    all sentences of the document. The ranker is given the section's offset
    and its number of sentences as `limit`. The best-ranked sentences, at
    most `num` per section, are logged under a "Section : <name>" heading
    and appended to the overall summary, which is written one sentence per
    line to DIR["BASE"] + "data/Summary.txt".

    Args:
        document: logged, then passed unchanged to Document().
        sections: iterable of section names accepted by
            doc.section_sentences().
        coef: forwarded to SectionMMR.rank() as `coef`.

    Returns:
        None.

    NOTE(review): concatenating `.encode("utf-8")` results with string
    literals relies on Python 2 string semantics -- confirm before porting.
    """
    logit(document)
    doc = Document(document)
    all_sentences, _ = doc.all_sentences()
    summ = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)
        limit = len(sec_sentences)

        # Ranker
        ranker = SectionMMR(all_sentences)
        ranker.rank(sec_offset=sec_offset, limit=limit, coef=coef)
        ranked = ranker.scores

        # A section may yield fewer than `num` ranked sentences; only take
        # what is actually there instead of raising IndexError.
        section_summary = []
        for pos in range(min(num, len(ranked))):
            # Ranker indices are section-relative; shift into the document.
            idx = ranked[pos][0] + sec_offset
            section_summary.append(doc[idx].sentence)
        summ.extend(section_summary)

        logit("\nSection : " + section_name)
        logit("".join(["\n" + sent.encode("utf-8")
                       for sent in section_summary]))

    summary_path = DIR["BASE"] + "data/Summary.txt"
    with open(summary_path, "w") as sfile:
        sfile.write("\n".join(summ).encode("utf-8"))
Пример #5
0
def summarize_secitons(document, sections):
    """Summarize `document` section by section, save and evaluate it.

    For each name in `sections`, that section's sentences are ranked with
    TextRank. The best-ranked sentences, at most `num` per section, are
    logged and collected. The collected sentences are written, one per
    line, to DIR['BASE'] + "data/Summary.txt", and unigram ROUGE
    recall/precision/F against the P10-1024 reference summary is logged.

    Args:
        document: logged, then passed unchanged to Document().
        sections: iterable of section names accepted by
            doc.section_sentences().

    Returns:
        None.

    NOTE(review): concatenating `.encode('utf-8')` results with string
    literals relies on Python 2 string semantics -- confirm before porting.
    """
    logit(document)
    doc = Document(document)
    # NOTE(review): the result is unused here; the call is kept in case
    # Document relies on it having been made -- confirm and remove.
    doc.all_sentences()
    summ = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)

        # Ranker
        ranker = TextRank(sec_sentences)
        ranker.rank()
        ranked = ranker.scores

        # Clamp to the number of ranked sentences so a short or empty
        # section cannot raise IndexError.
        picked = [doc[ranked[pos][0] + sec_offset].sentence
                  for pos in range(min(num, len(ranked)))]
        summ.extend(picked)

        logit("\nSection : " + section_name)
        logit(''.join(['\n' + sent.encode('utf-8') for sent in picked]))

    summary_path = DIR['BASE'] + "data/Summary.txt"
    with open(summary_path, 'w') as sfile:
        sfile.write('\n'.join(summ).encode('utf-8'))

    # Evaluator
    guess_summary_list = [summary_path]
    ref_summary_list = [[DIR['BASE'] + "data/P10-1024-Ref1.txt"]]
    recall, precision, F_measure = PythonROUGE(guess_summary_list,
                                               ref_summary_list,
                                               ngram_order=1)
    logit("Recall:{0} ; Precision:{1} ; F:{2}".format(recall, precision,
                                                      F_measure))
Пример #6
0
from Document import Document
from Document import logit
from datetime import datetime
from Ranker import SectionMMR
from Config import DIR

# Start this run's log output with a blank line and the current timestamp.
logit('\n' + str(datetime.now()))

# Number of top-ranked sentences taken from each section of the document.
num = 1
# Maximum allowed length of the summary.
MAXLEN = 200


def summarize_secitons(document, sections, coef=0.8):
    logit(document)
    doc = Document(document)
    all_sentences, all_offset = doc.all_sentences()
    summ = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)
        limit = len(sec_sentences)

        # Ranker
        ranker = SectionMMR(all_sentences)
        ranker.rank(sec_offset=sec_offset, limit=limit, coef=coef)
        sentencs = ranker.scores

        summary = []
        for x in range(num):
            idx = sentencs[x][0] + sec_offset
Пример #7
0
from Document import Document
from Document import logit
from datetime import datetime
from Ranker import TextRank
from Config import DIR
from PythonROUGE import PythonROUGE

# Start this run's log output with a blank line and the current timestamp.
logit('\n' + str(datetime.now()))

# Number of top-ranked sentences taken from each section of the document.
num = 1
# Maximum allowed length of the summary.
MAXLEN = 200


def summarize_secitons(document, sections):
    logit(document)
    doc = Document(document)
    all_sentences, all_offset = doc.all_sentences()
    summ = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)

        # Ranker
        ranker = TextRank(sec_sentences)
        ranker.rank()
        sentencs = ranker.scores

        summary = []
        for x in range(num):
            idx = sentencs[x][0] + sec_offset
Пример #8
0
from Document import Document
from Document import logit
from datetime import datetime
from Ranker import SectionMMR
from Config import DIR

# Start this run's log output with a blank line and the current timestamp.
logit("\n" + str(datetime.now()))

# Number of top-ranked sentences taken from each section of the document.
num = 1
# Maximum allowed length of the summary.
MAXLEN = 200


def summarize_secitons(document, sections, coef=0.8):
    logit(document)
    doc = Document(document)
    all_sentences, all_offset = doc.all_sentences()
    summ = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)
        limit = len(sec_sentences)

        # Ranker
        ranker = SectionMMR(all_sentences)
        ranker.rank(sec_offset=sec_offset, limit=limit, coef=coef)
        sentencs = ranker.scores

        summary = []
        for x in range(num):
            idx = sentencs[x][0] + sec_offset