Example #1
def summary_highlight(text, coref, ratio):
    # Coreference resolution is not wired up yet, so both branches currently
    # fall back to the raw text.
    if coref:
        # coref_text = find_coreference_replace_pronominal(text)
        coref_text = text
    else:
        coref_text = text

    sum_text = summarize(text, coref_text, ratio)
    # extracted_sentences_number = get_extracted_number(sum_text, text)

    original_sentence_list = _format_results(_clean_text_by_sentences(text),
                                             True)
    extracted_sentence_list = _format_results(
        _clean_text_by_sentences(sum_text), True)

    # Walk the original sentences in order and wrap each one that matches
    # the next extracted sentence in highlight markup. enumerate avoids
    # list.index(), which would mis-highlight the first copy of a
    # duplicated sentence.
    index = 0
    for original_index, sentence in enumerate(original_sentence_list):
        if index < len(extracted_sentence_list) and \
                sentence == extracted_sentence_list[index]:
            original_sentence_list[original_index] = (
                '<mark><em>' + sentence + '</em></mark>')
            index += 1

    return " ".join(original_sentence_list)
Example #2
def get_extracted_number(sum_sentences, original_text):
    # Maps each summary sentence back to its index in the original text;
    # list.index() raises ValueError if a sentence is not found verbatim.
    sum_number = []
    sum_sentences = _format_results(_clean_text_by_sentences(sum_sentences),
                                    True)
    sentences = _format_results(_clean_text_by_sentences(original_text), True)
    for ss in sum_sentences:
        sum_number.append(sentences.index(ss))
    return sum_number
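# Round-trip sketch: the indices from get_extracted_number feed
# get_sentence_from_number (Example #7). Assumes every summary sentence
# appears verbatim in the original; otherwise list.index raises ValueError.
original = "First point here. Second point follows. Third point closes."
summary = "Second point follows."
numbers = get_extracted_number(summary, original)
# numbers == [1]: the summary sentence is the second original sentence.
recovered = get_sentence_from_number(numbers, original)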
Example #3
def get_title(text):
    sentences = _clean_text_by_sentences(text)

    if len(sentences) == 0:
        return ""

    if len(sentences) == 1:
        return _format_results(sentences, False)

    sentence_tokenized = _tokenize_sentence(_format_results(sentences, True))
    # Keep only short sentences (at most 10 words) as title candidates;
    # each entry is a (sentence, word_count, tagged_words) tuple.
    sentence_tokenized = [s for s in sentence_tokenized if s[1] <= 10]

    if len(sentence_tokenized) == 0:
        return ""

    if len(sentence_tokenized) == 1:
        return sentence_tokenized[0][0]

    keyword_list = keywords(_format_results(sentences, False),
                            words=4,
                            split=True)
    # title_score holds [best_sentence_index, best_keyword_density].
    title_score = [0, 0]

    # Score each candidate by the fraction of its words that are keywords;
    # enumerate avoids list.index(), which would return the wrong position
    # for duplicate sentences.
    for index, sen_tuple in enumerate(sentence_tokenized):
        count = 0
        for word_pos in sen_tuple[2]:
            if word_pos[0] in keyword_list:
                count += 1
        temp_score = count / sen_tuple[1]
        if temp_score > title_score[1]:
            title_score = [index, temp_score]

    return sentence_tokenized[title_score[0]][0]
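# Usage sketch for get_title (toy input; gensim's keywords() generally
# needs longer text in practice, so treat this as illustrative only):
abstract = ("TextRank picks titles well. The algorithm ranks every short "
            "sentence by keyword density. Sentences longer than ten words "
            "are discarded before scoring, since title candidates are short.")
print(get_title(abstract))  # the candidate densest in extracted keywords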
Example #4
def gensim_summarize(text):
    """
    Slightly modified version of the gensim.summarization.summarizer.summarize function
    :param text: article to summarize
    :return:
    """
    sentences = summarizer._clean_text_by_sentences(text)
    corpus = summarizer._build_corpus(sentences)

    most_important_docs = summarizer.summarize_corpus(corpus, ratio=1)
    extracted_sentences = summarizer._extract_important_sentences(
        sentences, corpus, most_important_docs, None)

    return summarizer._format_results(extracted_sentences, True)
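# Usage sketch: because gensim_summarize ranks with ratio=1 and skips the
# index sort, slicing its result yields a top-k summary (variable names
# below are illustrative):
ranked_sentences = gensim_summarize(article_text)
top_three = ranked_sentences[:3]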
Example #5
def summary_highlight(text, ratio, omit_placeholders):
    sum_text = summarize(text,
                         ratio=ratio,
                         omit_placeholders=omit_placeholders)
    extracted_sentences_number = get_extracted_number(sum_text, text)  # result currently unused

    original_sentence_list = _format_results(_clean_text_by_sentences(text),
                                             True)
    extracted_sentence_list = _format_results(
        _clean_text_by_sentences(sum_text), True)

    # Walk the original sentences in order and wrap each one that matches
    # the next extracted sentence in highlight markup. enumerate avoids
    # list.index(), which would mis-highlight the first copy of a
    # duplicated sentence.
    index = 0
    for original_index, sentence in enumerate(original_sentence_list):
        if index < len(extracted_sentence_list) and \
                sentence == extracted_sentence_list[index]:
            original_sentence_list[original_index] = (
                '<mark><em>' + sentence + '</em></mark>')
            index += 1

    return " ".join(original_sentence_list)
Example #6
def summerize(
    text: str,
    ratio=0.2,
    word_count: int = None,
    split=False,
    token_filters=DEFAULT_FILTERS,
):
    """
    Reimplementation for the textrank algorithm from gensim.

    TODO:
        - language specific stopwords
            - Maybe SpaCy?? e.g. from spacy.lang.X import STOP_WORDS
        - language specific stemmers
        - language specific filters. currently abbrivation merging in split sentences only works on a-zA-Z.
          maybe we can use \\p{L} via https://stackoverflow.com/a/24245331
    
    WARNING: This implementation strategy applies super poorly to Danish abbreviations
             eg. ca.

    DONE:
        - 
    """

    # NOTE: this is the divergence from gensim preprocessing step
    # we replace
    #   > sentences = _clean_text_by_sentences(text)
    # with
    sentences = split_and_preprocess(text, token_filters)

    ###############################
    # everything under this is the same as in the gensim implementation
    ###############################

    # If no sentence could be identified, the function ends.
    if len(sentences) == 0:
        logger.warning("Input text is empty.")
        return [] if split else u""

    # If only one sentence is present, the function raises an error (Avoids ZeroDivisionError).
    if len(sentences) == 1:
        raise ValueError("input must have more than one sentence")

    # Warns if the text is too short.
    if len(sentences) < INPUT_MIN_LENGTH:
        logger.warning("Input text is expected to have at least %d sentences.",
                       INPUT_MIN_LENGTH)

    corpus = _build_corpus(sentences)

    most_important_docs = summarize_corpus(
        corpus, ratio=ratio if word_count is None else 1)

    # If couldn't get important docs, the algorithm ends.
    if not most_important_docs:
        logger.warning("Couldn't get relevant sentences.")
        return [] if split else u""

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_important_sentences(sentences, corpus,
                                                       most_important_docs,
                                                       word_count)

    # Sorts the extracted sentences by apparition order in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split)
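# Usage sketch for summerize (assumes split_and_preprocess, DEFAULT_FILTERS
# and the gensim-style helpers are defined in this module; the input text is
# a placeholder):
text = " ".join("This is filler sentence number %d." % i for i in range(12))
top_sentences = summerize(text, ratio=0.25, split=True)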
Example #7
def get_sentence_from_number(sum_number, text):
    # Inverse of get_extracted_number: turns sentence indices back into the
    # corresponding sentences of the original text.
    sum_sentences = []
    sentences = _format_results(_clean_text_by_sentences(text), True)
    for i in sum_number:
        sum_sentences.append(sentences[i])
    return sum_sentences
Example #8
def summarize(text,
              ratio=0.2,
              word_count=None,
              split=False,
              omit_placeholders=False):
    """
    This is a improved version of gensim's summarization library. It includes 
    the option to exclude placeholders from summary generation & implemented 
    Stanford's CoreNLP sentence splitter for better sentence splitting 
    performance
    """
    """
    Returns a summarized version of the given text using a variation of
    the TextRank algorithm.
    The input must be longer than INPUT_MIN_LENGTH sentences for the
    summary to make sense and must be given as a string.

    The output summary will consist of the most representative sentences
    and will also be returned as a string, divided by newlines. If the
    split parameter is set to True, a list of sentences will be
    returned.

    The length of the output can be specified using the ratio and
    word_count parameters:
        ratio should be a number between 0 and 1 that determines the
    percentage of the number of sentences of the original text to be
    chosen for the summary (defaults at 0.2).
        word_count determines how many words will the output contain.
    If both parameters are provided, the ratio will be ignored.
    
    split must be set to true if you want the result in list of sentences,
    else the result will be returned in chunk of text

    omit_placeholders is set to true if you want the system to not compute the 
    placeholders: text descriptions in square bracket, i.e. [FORMULA].
    """
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text)

    # If placeholders must be omitted, strip the [bracketed] spans before
    # ranking; sentences_original keeps the untouched sentences for later.
    if omit_placeholders:
        sentences_list = _format_results(sentences, True)
        sentences_original = list(sentences_list)
        for i in range(len(sentences_list)):
            sentences_list[i] = re.sub(r'\[.*?\]', '', sentences_list[i])
        temp_sentences = ' '.join(sentences_list)
        sentences = _clean_text_by_sentences(temp_sentences)

    # If no sentence could be identified, the function ends.
    if len(sentences) == 0:
        logger.warning("Input text is empty.")
        return ""

    # If only one sentence is present, the function ends (avoids a
    # ZeroDivisionError in the ranking step).
    if len(sentences) == 1:
        logger.warning("input must have more than one sentence")
        return ""

    # Warns if the text is too short.
    if len(sentences) < INPUT_MIN_LENGTH:
        logger.warning("Input text is expected to have at least %d sentences.",
                       INPUT_MIN_LENGTH)

    corpus = _build_corpus(sentences)

    most_important_docs = summarize_corpus(
        corpus, ratio=ratio if word_count is None else 1)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_important_sentences(sentences, corpus,
                                                       most_important_docs,
                                                       word_count)

    # Sorts the extracted sentences by apparition order in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    # If omit_placeholders was set, map the extracted sentences (ranked on
    # the placeholder-stripped text) back to the original sentences so the
    # [placeholders] are preserved in the output.
    if omit_placeholders:
        extracted_sentences = _format_results(extracted_sentences, False)
        sentences_list = '\n'.join(sentences_list)
        sentences_original = '\n'.join(sentences_original)
        extracted_sentences_number = get_extracted_number(
            extracted_sentences, sentences_list)
        extracted_sentences = get_sentence_from_number(
            extracted_sentences_number, sentences_original)
        if split:
            return extracted_sentences
        else:
            return '\n'.join(extracted_sentences)
    else:
        return _format_results(extracted_sentences, split)
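# Usage sketch for the placeholder-aware summarize (text is illustrative):
paper = ("We prove the main bound [FORMULA]. The proof follows from "
         "convexity. Experiments on [DATASET] confirm the analysis. "
         "Several open questions remain for future work.")
# With omit_placeholders=True the bracketed spans are ignored during
# ranking, but the returned sentences still contain them verbatim.
summary = summarize(paper, ratio=0.5, omit_placeholders=True)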