Example #1
def summarize(sentences,
              ratio=0.2,
              words=None,
              language="english",
              split=False,
              scores=False):
    # Gets a list of processed sentences.
    # sentences = _clean_text_by_sentences(text, language)

    # Flattens each tokenized sentence back into a single string (in place).
    for i, sentence in enumerate(sentences):
        sentences[i] = ''.join(str(x) for x in sentence)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = summarizer._build_graph(sentences)
    summarizer._set_graph_edge_weights(graph)

    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores = _pagerank(graph)

    # One score per input sentence; sentences dropped from the graph get 0.
    return [pagerank_scores.get(sentence, 0) for sentence in sentences]
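
A minimal usage sketch for this variant, assuming the summa-style helpers it calls (summarizer._build_graph, _set_graph_edge_weights, _remove_unreachable_nodes, _pagerank) are available in the surrounding module; the input data is purely illustrative:

# Hypothetical input: each sentence is an iterable of tokens, which the join step expects.
tokenized = [["graph", "based", "ranking"],
             ["edges", "carry", "similarity", "weights"],
             ["pagerank", "orders", "the", "nodes"]]
sentence_scores = summarize(tokenized)
# The function joins each sentence in place, so `tokenized` now holds plain strings.
for sentence, score in zip(tokenized, sentence_scores):
    print(round(score, 4), sentence)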
Example #2
def summarize(sentences, similarity_matrix, ratio=0.2, split=False):
    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph(list(range(len(sentences))))

    _set_graph_edge_weights(graph, similarity_func=partial(
        cosine_similarity, similarity_matrix))

    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(
        sentences, ratio, words=None)

    # Sorts the extracted sentences by order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)
    # print([(x.index, x.score) for x in extracted_sentences[:2]])
    return _format_results(extracted_sentences, split, score=None)
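
A sketch of how the similarity-matrix variant might be called; `parsed` is assumed to be a list of sentence objects from the library's cleaning step (with the .index and .score attributes that _add_scores_to_sentences and the final sort require), and the numpy matrix is illustrative only:

import numpy as np

# `parsed` is assumed: sentence objects produced by the library's cleaning step.
sim = np.array([[1.0, 0.3, 0.1],
                [0.3, 1.0, 0.5],
                [0.1, 0.5, 1.0]])
summary_text = summarize(parsed, sim, ratio=0.5)               # single string (assumed)
summary_list = summarize(parsed, sim, ratio=0.5, split=True)   # list of sentences (assumed)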
Example #3
def summarize_with_model(text, session, model, model_name, additional_stopwords):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")
    lang = detect(text)[:2]
    if lang == "en":
        paragraphs = text.split("\n")
        sentences = []
        paragraph_index = 0
        for paragraph in paragraphs:
            # Gets a list of processed sentences.
            if paragraph:
                tmp = _clean_text_by_sentences(
                    paragraph, additional_stopwords)
                if tmp:
                    for j, sent in enumerate(tmp):
                        sent.paragraph = paragraph_index
                        # Hacky way to overwrite token
                        sent.token = len(sentences) + j
                    sentences += tmp
                    paragraph_index += 1
    elif lang == "zh" or lang == "ko":  # zh-Hant sometimes got misclassified into ko
        if model_name != "xling":
            raise ValueError("Only 'xling' model supports zh.")
        sentences = cut_sentences_by_rule(text)
    elif lang == "ja":
        if model_name != "xling":
            raise ValueError("Only 'xling' model supports ja.")
        if not JA_SUPPORT:
            raise ImportError("Missing dependencies for Japanese support.")
        sentences = ja_clean_and_cut_sentences(text)
        for i, sent in enumerate(sentences):
            # Hacky way to overwrite token
            sent.token = i
    else:
        return ["Language not suppored! (supported languages: en, zh, ja)"], None, lang

    # print([sentence.token for sentence in sentences if sentence.token])
    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    similarities = attach_sentence_embeddings(
        session, sentences, model, batch_size=32)
    graph = _build_graph([x.token for x in sentences])
    _set_graph_edge_weights(graph, partial(cosine_similarity, similarities))

    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [], graph, lang  # keep the same return shape as the other branches

    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Sorts the sentences by score, most important first.
    sentences.sort(key=lambda s: s.score, reverse=True)
    return sentences, graph, lang
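
A hedged usage sketch for the model-backed variant; `article_text`, `sess`, and `use_model` are placeholders for the caller's text, TensorFlow session, and sentence-encoder module expected by attach_sentence_embeddings, and none of them come from the example itself:

# Hypothetical setup: `sess` is an open session, `use_model` a loaded encoder module.
ranked, graph, lang = summarize_with_model(
    article_text, sess, use_model, model_name="xling", additional_stopwords=None)
if graph is not None:
    # Sentences come back sorted by descending score, so the head is the summary.
    print(lang, [round(s.score, 4) for s in ranked[:3]])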
Example #4
def summarize_custom(text, ratio=0.2, split=False, scores=False, words=None, stopwords=None):
    # Gets a list of processed sentences.
    cleaned_sentences = clean_sentences(text, stopwords=stopwords)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = build_graph([sentence.token for sentence in cleaned_sentences])
    _set_graph_edge_weights(graph)

    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # Ranks the sentences using the PageRank algorithm and attaches the scores.
    pagerank_scores = _pagerank(graph)
    _add_scores_to_sentences(cleaned_sentences, pagerank_scores)

    # Extracts the most important sentences and restores their original order.
    extracted_sentences = _extract_most_important_sentences(
        cleaned_sentences, ratio, words)
    extracted_sentences.sort(key=lambda s: s.index)
    return _format_results(extracted_sentences, split, scores)
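
A short usage sketch under the assumption that clean_sentences and the other helpers behave like their summa counterparts (so scores=True yields per-sentence scores); the text and stopword list are illustrative:

text = ("Automatic summarization picks the most representative sentences. "
        "Graph-based methods score sentences with PageRank. "
        "The highest-ranked sentences then form the summary.")
print(summarize_custom(text, ratio=0.5))                       # plain summary (assumed)
print(summarize_custom(text, ratio=0.5, split=True, scores=True,
                       stopwords=["the", "a", "an"]))          # per-sentence scores (assumed)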
Example #5
def keywords(
        text: str, deaccent: bool = False,
        additional_stopwords: Optional[List[str]] = None) -> Tuple[
            List[Tuple[float, str]], Optional[Dict[str, List[str]]],
            Optional[summa.graph.Graph], Dict[str, float]]:
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    lang = detect(text)[:2]
    if lang == "en":
        # Gets a dict of word -> lemma
        tokens = _clean_text_by_word(
            text, "english", deacc=deaccent,
            additional_stopwords=additional_stopwords)
        split_text = list(_tokenize_by_word(text))
    elif lang == "zh" or lang == "ko":  # zh-Hant sometimes got misclassified into ko
        if not ZH_SUPPORT:
            raise ImportError("Missing dependencies for Chinese support.")
        tokens = zh_clean_and_cut_words(text)
        split_text = [x.text for x in tokens]
        tokens = {x.text: x for x in tokens}
    elif lang == "ja":
        if not JA_SUPPORT:
            raise ImportError("Missing dependencies for Japanese support.")
        tokens = ja_clean_and_cut_words(text)
        split_text = [x.text for x in tokens]
        tokens = {x.text: x for x in tokens}
    else:
        print("Language not suppored! (supported languages: en zh)")
        return [], {}, None, {}

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if not graph.nodes():
        return [], {}, None, {}

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores)

    lemmas_to_word = None
    if lang == "en":
        lemmas_to_word = _lemmas_to_words(tokens)

    return extracted_lemmas, lemmas_to_word, graph, pagerank_scores
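
A usage sketch for the keywords variant, exercising only the English path (the zh/ja paths require the optional dependencies); the unpacking below relies only on the return type declared in the signature above:

text = ("TextRank extracts keywords by running PageRank over a graph of "
        "candidate words built from the input text.")
lemmas, lemma_to_words, graph, ranks = keywords(text, additional_stopwords=["input"])
for score, lemma in lemmas[:5]:
    # lemma_to_words is only populated for English, hence the fallback.
    print(round(score, 3), lemma, (lemma_to_words or {}).get(lemma, [lemma]))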
Example #6
def summarize(text, additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    lang = detect(text)[:2]
    if lang == "en":
        paragraphs = text.split("\n")
        sentences = []
        paragraph_index = 0
        for paragraph in paragraphs:
            # Gets a list of processed sentences.
            if paragraph:
                tmp = _clean_text_by_sentences(
                    paragraph, additional_stopwords)
                if tmp:
                    for j, sent in enumerate(tmp):
                        sent.paragraph = paragraph_index
                        # Hacky way to overwrite token
                        sent.token = len(sentences) + j
                    sentences += tmp
                    paragraph_index += 1
    elif lang == "zh" or lang == "ko":  # zh-Hant sometimes got misclassified into ko
        sentences = cut_sentences_by_rule(text)
    elif lang == "ja":
        raise NotImplementedError("No ja support yet.")
    else:
        return ["Language not suppored! (supported languages: en, zh)"], None, lang
    similarities = attach_sentence_embeddings(
        lang, sentences, batch_size=32)
    graph = _build_graph([x.token for x in sentences])
    _set_graph_edge_weights(graph, partial(cosine_similarity, similarities))

    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [], graph, lang  # keep the same return shape as the other branches

    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Sorts the sentences by score, most important first.
    sentences.sort(key=lambda s: s.score, reverse=True)
    return sentences, graph, lang
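
Finally, a sketch of calling the self-contained variant, which detects the language itself and returns the sentences sorted by score; the text is illustrative, and only the .score attribute that the function itself sets is assumed on the returned objects:

text = ("Extractive summarization selects sentences from the source document. "
        "A similarity graph connects the sentences to each other. "
        "PageRank scores decide which sentences make the cut.")
ranked_sentences, graph, lang = summarize(text)
print(lang, [round(s.score, 4) for s in ranked_sentences[:2]])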