def summarize(text, language, n=5):
    """Return whichever candidate summary is most similar to ``text``.

    Two candidates are built: one from the first ``n`` entries returned by
    ``text_rank`` and one from ``LSASummarizer.summarize``. Each candidate is
    compared with the full text via ``get_cosine`` over ``text_to_vector``
    representations, and the candidate with the higher score is returned.

    Args:
        text: The text to summarize.
        language: Language identifier forwarded to both summarizers.
        n: Number of ranked entries / sentences requested (default 5).

    Returns:
        The LSA summary when its cosine score is strictly greater than the
        TextRank score; otherwise the TextRank summary.
    """
    ranked = text_rank(text, language)
    # Take the first ``n`` ranked entries and put them in their natural sort
    # order (presumably restoring document order -- confirm against
    # ``text_rank``'s tuple layout).
    selected = sorted(ranked[:n])
    # Index 2 of each entry is what gets emitted (presumably the sentence).
    text_rank_summary = ' '.join([entry[2] for entry in selected])

    lsa_summary = LSASummarizer.summarize(text, language, n)

    text_rank_score = get_cosine(text_to_vector(text),
                                 text_to_vector(text_rank_summary))
    lsa_score = get_cosine(text_to_vector(text),
                           text_to_vector(lsa_summary))

    # Ties are resolved in favour of the TextRank summary.
    return lsa_summary if lsa_score > text_rank_score else text_rank_summary
Exemplo n.º 2
0
def abstract():
    """Render the abstract form, attaching a summary once it is submitted.

    On submission the text in ``form.body`` is summarized using the language
    from ``form.language`` and the size from ``form.rows`` (converted with
    ``int``). The summary is stored on the form as ``form.result`` and its
    cosine similarity to the submitted text as ``form.cosine``. The same
    template is rendered whether or not the form was submitted.

    Returns:
        The result of ``render_template('abstract.html', ...)``.

    Raises:
        ValueError: If ``form.rows.data`` cannot be converted by ``int``.
    """
    form = AbstractForm()
    if form.is_submitted():
        source_text = form.body.data
        form.result = summarize(source_text, form.language.data,
                                int(form.rows.data))
        # ``form.result`` now holds the summary value itself, so it is
        # vectorized directly; reading ``.data`` from it would fail for a
        # plain string summary.
        form.cosine = get_cosine(text_to_vector(source_text),
                                 text_to_vector(form.result))
    # Both the submitted and the initial view use the same template call.
    return render_template('abstract.html',
                           form=form,
                           template='form-template')
Exemplo n.º 3
0
def _summary_sizes(sentence_count):
    """Return the (short, long) summary sizes for a text of that many sentences."""
    if sentence_count < 5:
        return 2, 2
    if sentence_count < 10:
        return 2, 4
    if sentence_count < 20:
        return 4, 8
    # NOTE(review): the long summary feeds the "Summary40" column but uses
    # one third of the sentences here -- confirm whether 40% was intended.
    return sentence_count // 5, sentence_count // 3


def test(language):
    """Summarize every row of a dataset CSV and write the results to a new CSV.

    Reads ``UA_dataset_full.csv`` when ``language`` is ``'ukrainian'`` and
    ``RU_dataset.csv`` otherwise. The first three columns of each row are
    used as id, title and text. For every text two summaries are produced
    with ``GeneralSummarizer.summarize``; their sizes depend on the number
    of sentences found by the language's sentence tokenizer. Each summary's
    cosine similarity to the full text is recorded next to it.

    The output is written to ``'result' + <input filename>`` with every field
    quoted, preceded by a header row.

    Args:
        language: Language identifier; also forwarded to the summarizer.

    Raises:
        IndexError: If a non-empty input row has fewer than three columns.
    """
    filename = 'UA_dataset_full.csv' if language == 'ukrainian' \
        else 'RU_dataset.csv'
    tokenize = sent_tokenizer_ua if language == 'ukrainian' \
        else sent_tokenizer_ru

    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain line breaks.
    with io.open(filename, 'r', encoding="utf-8", newline='') as source:
        # Empty rows (e.g. a trailing blank line) carry no data; skip them.
        records = [row for row in csv.reader(source) if row]

    output_rows = [[
        "Id", "Title", "Body", "Summary20", "Cosine20", "Summary40", "Cosine40"
    ]]
    for record in records:
        record_id, title, body = record[0], record[1], record[2]

        short_size, long_size = _summary_sizes(len(tokenize(body)))
        summary20 = GeneralSummarizer.summarize(body, language, short_size)
        summary40 = GeneralSummarizer.summarize(body, language, long_size)

        cosine20 = get_cosine(text_to_vector(summary20), text_to_vector(body))
        cosine40 = get_cosine(text_to_vector(summary40), text_to_vector(body))

        output_rows.append([
            record_id, title, body,
            summary20, str(cosine20),
            summary40, str(cosine40),
        ])

    with io.open('result' + filename, 'w', newline='',
                 encoding="utf-8") as target:
        writer = csv.writer(target,
                            delimiter=',',
                            quotechar='"',
                            quoting=csv.QUOTE_ALL)
        writer.writerows(output_rows)