Exemplos de _clean_text_by_sentences em Python, exemplos de gensim.summarization.textcleaner._clean_text_by_sentences em Python

Exemplo n.º 1

0

Exibir arquivo

def summary_highlight(text, coref, ratio):
    if coref:
        #  coref_text = find_coreference_replace_pronominal(text)
        coref_text = text
    else:
        coref_text = text

    sum_text = summarize(text, coref_text, ratio)
    # extracted_sentences_number = get_extracted_number(sum_text, text)

    original_sentence_list = _format_results(_clean_text_by_sentences(text),
                                             True)
    extracted_sentence_list = _format_results(
        _clean_text_by_sentences(sum_text), True)

    index = 0
    for i in original_sentence_list:
        try:
            if i == extracted_sentence_list[index]:
                original_index = original_sentence_list.index(i)
                i = '<mark><em>' + i + '</em></mark>'
                original_sentence_list[original_index] = i
                if index < len(extracted_sentence_list) - 1:
                    index += 1
        except IndexError:
            pass

    return " ".join(original_sentence_list)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: improved_keywords.py Projeto: AsalJalilvand/extractive-text-summarization

def get_sentence_score_per_word(text):
    global sentence_score_per_word
    sentence_score_per_word = {}
    # form Gensim summarizer, I moved some code here because I wanted to access pagerank results
    sentences = _clean_text_by_sentences(text)
    corpus = _build_corpus(sentences)
    hashable_corpus = _build_hasheable_corpus(corpus)
    sentences_by_corpus = dict(zip(hashable_corpus, sentences))
    graph = _build_sentence_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)
    pagerank_scores = _pagerank(graph)
    for sentence_id, score in pagerank_scores.items():
        sentence = sentences_by_corpus[sentence_id]
        for token in sentence.token.split():
            if token in sentence_score_per_word:
                score_dict = sentence_score_per_word[token]
                sentence_score_per_word[token][
                    'n_sents'] = score_dict['n_sents'] + 1
                sentence_score_per_word[token][
                    'cumulative_score'] = score_dict['cumulative_score'] + score
            else:
                sentence_score_per_word[token] = {
                    'n_sents': 1,
                    'cumulative_score': score
                }

Exemplo n.º 3

0

Exibir arquivo

Arquivo: 48540_summarizer.py Projeto: chitang1990/Gensim-3.1.0

def summarize(text, ratio=0.2, word_count=None, split=False):
    """
    Returns a summarized version of the given text using a variation of
    the TextRank algorithm.
    The input must be longer than INPUT_MIN_LENGTH sentences for the
    summary to make sense and must be given as a string.

    The output summary will consist of the most representative sentences
    and will also be returned as a string, divided by newlines. If the
    split parameter is set to True, a list of sentences will be
    returned.

    The length of the output can be specified using the ratio and
    word_count parameters:

        ratio should be a number between 0 and 1 that determines the
        percentage of the number of sentences of the original text to be
        chosen for the summary (defaults at 0.2).
        word_count determines how many words will the output contain.
        If both parameters are provided, the ratio will be ignored.

    """
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text)

    # If no sentence could be identified, the function ends.
    if len(sentences) == 0:
        logger.warning("Input text is empty.")
        return [] if split else u""

    # If only one sentence is present, the function raises an error (Avoids ZeroDivisionError).
    if len(sentences) == 1:
        raise ValueError("input must have more than one sentence")

    # Warns if the text is too short.
    if len(sentences) < INPUT_MIN_LENGTH:
        logger.warning("Input text is expected to have at least %d sentences.",
                       INPUT_MIN_LENGTH)

    corpus = _build_corpus(sentences)

    most_important_docs = summarize_corpus(
        corpus, ratio=ratio if word_count is None else 1)

    # If couldn't get important docs, the algorithm ends.
    if not most_important_docs:
        logger.warning("Couldn't get relevant sentences.")
        return [] if split else u""

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_important_sentences(sentences, corpus,
                                                       most_important_docs,
                                                       word_count)

    # Sorts the extracted sentences by apparition order in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split)

Exemplo n.º 4

0

Exibir arquivo

Arquivo: summarizer.py Projeto: rmalouf/gensim

def summarize(text, ratio=0.2, word_count=None, split=False):
    """
    Returns a summarized version of the given text using a variation of
    the TextRank algorithm.
    The input must be longer than INPUT_MIN_LENGTH sentences for the
    summary to make sense and must be given as a string.

    The output summary will consist of the most representative sentences
    and will also be returned as a string, divided by newlines. If the
    split parameter is set to True, a list of sentences will be
    returned.

    The length of the output can be specified using the ratio and
    word_count parameters:

        ratio should be a number between 0 and 1 that determines the
        percentage of the number of sentences of the original text to be
        chosen for the summary (defaults at 0.2).
        word_count determines how many words will the output contain.
        If both parameters are provided, the ratio will be ignored.

    """
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text)

    # If no sentence could be identified, the function ends.
    if len(sentences) == 0:
        logger.warning("Input text is empty.")
        return [] if split else u""

    # If only one sentence is present, the function raises an error (Avoids ZeroDivisionError).
    if len(sentences) == 1:
        raise ValueError("input must have more than one sentence")

    # Warns if the text is too short.
    if len(sentences) < INPUT_MIN_LENGTH:
        logger.warning("Input text is expected to have at least %d sentences.", INPUT_MIN_LENGTH)

    corpus = _build_corpus(sentences)

    most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1)

    # If couldn't get important docs, the algorithm ends.
    if not most_important_docs:
        logger.warning("Couldn't get relevant sentences.")
        return [] if split else u""

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count)

    # Sorts the extracted sentences by apparition order in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split)

Exemplo n.º 5

0

Exibir arquivo

Arquivo: summarizer.py Projeto: polcar/gensim

def summarize(text, ratio=0.2, word_count=None, split=False):
    """
    Returns a summarized version of the given text using a variation of
    the TextRank algorithm.
    The input must be longer than INPUT_MIN_LENGTH sentences for the
    summary to make sense and must be given as a string.

    The output summary will consist of the most representative sentences
    and will also be returned as a string, divided by newlines. If the
    split parameter is set to True, a list of sentences will be
    returned.

    The length of the output can be specified using the ratio and
    word_count parameters:
        ratio should be a number between 0 and 1 that determines the
    percentage of the number of sentences of the original text to be
    chosen for the summary (defaults at 0.2).
        word_count determines how many words will the output contain.
    If both parameters are provided, the ratio will be ignored.
    """
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text)

    if len(sentences) < INPUT_MIN_LENGTH:
        raise RuntimeError("Input text must have at least " +
                           str(INPUT_MIN_LENGTH) + " sentences.")

    corpus = _build_corpus(sentences)

    most_important_docs = summarize_corpus(
        corpus, ratio=ratio if word_count is None else 1)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_important_sentences(sentences, corpus,
                                                       most_important_docs,
                                                       word_count)

    # Sorts the extracted sentences by apparition order in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split)

Exemplo n.º 6

0

Exibir arquivo

Arquivo: summarizer.py Projeto: nonva/gensim

def summarize(text, ratio=0.2, word_count=None, split=False):
    """
    Returns a summarized version of the given text using a variation of
    the TextRank algorithm.
    The input must be longer than INPUT_MIN_LENGTH sentences for the
    summary to make sense and must be given as a string.

    The output summary will consist of the most representative sentences
    and will also be returned as a string, divided by newlines. If the
    split parameter is set to True, a list of sentences will be
    returned.

    The length of the output can be specified using the ratio and
    word_count parameters:
        ratio should be a number between 0 and 1 that determines the
    percentage of the number of sentences of the original text to be
    chosen for the summary (defaults at 0.2).
        word_count determines how many words will the output contain.
    If both parameters are provided, the ratio will be ignored.
    """
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text)

    if len(sentences) < INPUT_MIN_LENGTH:
        raise RuntimeError("Input text must have at least " + str(INPUT_MIN_LENGTH) + " sentences.")

    corpus = _build_corpus(sentences)

    most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count)

    # Sorts the extracted sentences by apparition order in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split)

Exemplo n.º 7

0

Exibir arquivo

Arquivo: summarizer.py Projeto: ShirleyHan6/TextSumDemo

def summarize(text, coref, ratio, word_count=None, split=False):
    """Get a summarized version of the given text.

    The output summary will consist of the most representative sentences
    and will be returned as a string, divided by newlines.

    Note
    ----
    The input should be a string, and must be longer than :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH`
    sentences for the summary to make sense.
    The text will be split into sentences using the split_sentences method in the :mod:`gensim.summarization.texcleaner`
    module. Note that newlines divide sentences.


    Parameters
    ----------
    text : str
        Given text.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number of
        sentences of the original text to be chosen for the summary.
    word_count : int or None, optional
        Determines how many words will the output contain.
        If both parameters are provided, the ratio will be ignored.
    split : bool, optional
        If True, list of sentences will be returned. Otherwise joined
        strings will bwe returned.

    Returns
    -------
    list of str
        If `split` **OR**
    str
        Most representative sentences of given the text.

    """
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(coref)
    original = _clean_text_by_sentences(text)

    # If no sentence could be identified, the function ends.
    if len(sentences) == 0:
        logger.warning("Input text is empty.")
        return [] if split else u""

    # If only one sentence is present, the function raises an error (Avoids ZeroDivisionError).
    if len(sentences) == 1:
        raise ValueError("input must have more than one sentence")

    # Warns if the text is too short.
    if len(sentences) < INPUT_MIN_LENGTH:
        logger.warning("Input text is expected to have at least %d sentences.",
                       INPUT_MIN_LENGTH)

    corpus = _build_corpus(sentences)

    most_important_docs = summarize_corpus(
        corpus, ratio=ratio if word_count is None else 1)

    # If couldn't get important docs, the algorithm ends.
    if not most_important_docs:
        logger.warning("Couldn't get relevant sentences.")
        return [] if split else u""

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_important_sentences(original, corpus,
                                                       most_important_docs,
                                                       word_count)

    # Sorts the extracted sentences by apparition order in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: summarizer.py Projeto: RaRe-Technologies/gensim

def summarize(text, ratio=0.2, word_count=None, split=False):
    """Get a summarized version of the given text.

    The output summary will consist of the most representative sentences
    and will be returned as a string, divided by newlines.

    Note
    ----
    The input should be a string, and must be longer than :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH`
    sentences for the summary to make sense.
    The text will be split into sentences using the split_sentences method in the :mod:`gensim.summarization.texcleaner`
    module. Note that newlines divide sentences.


    Parameters
    ----------
    text : str
        Given text.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number of
        sentences of the original text to be chosen for the summary.
    word_count : int or None, optional
        Determines how many words will the output contain.
        If both parameters are provided, the ratio will be ignored.
    split : bool, optional
        If True, list of sentences will be returned. Otherwise joined
        strings will bwe returned.

    Returns
    -------
    list of str
        If `split` **OR**
    str
        Most representative sentences of given the text.

    """
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text)

    # If no sentence could be identified, the function ends.
    if len(sentences) == 0:
        logger.warning("Input text is empty.")
        return [] if split else u""

    # If only one sentence is present, the function raises an error (Avoids ZeroDivisionError).
    if len(sentences) == 1:
        raise ValueError("input must have more than one sentence")

    # Warns if the text is too short.
    if len(sentences) < INPUT_MIN_LENGTH:
        logger.warning("Input text is expected to have at least %d sentences.", INPUT_MIN_LENGTH)

    corpus = _build_corpus(sentences)

    most_important_docs = summarize_corpus(corpus, ratio=ratio if word_count is None else 1)

    # If couldn't get important docs, the algorithm ends.
    if not most_important_docs:
        logger.warning("Couldn't get relevant sentences.")
        return [] if split else u""

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_important_sentences(sentences, corpus, most_important_docs, word_count)

    # Sorts the extracted sentences by apparition order in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split)