Example #1
def keywords(text,
             ratio=0.2,
             words=None,
             language="english",
             split=False,
             scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio,
                                       words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
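
A minimal usage sketch, assuming keywords() is called from the module where these private helpers live (e.g. a gensim-style summarization module) and that _format_results, which is not shown above, returns (keyword, score) pairs when scores=True; the sample text is illustrative:

# Usage sketch (assumption: run where keywords() and its private
# helpers are defined, e.g. a gensim-style summarization module).
sample = ("Graph-based ranking models such as PageRank can be applied "
          "to natural language text to extract its most central words.")

# Default: a newline-separated string of keywords.
print(keywords(sample))

# Assumption: with scores=True, _format_results yields (keyword, score) pairs.
for word, score in keywords(sample, scores=True):
    print("%s\t%.3f" % (word, score))
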
Example #2
def get_graph(text, language="english", deaccent=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language, deacc=deaccent)
    split_text = list(_tokenize_by_word(text, deacc=deaccent))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)

    return graph
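
Only graph.nodes() is exercised by the code in these examples, so a usage sketch is limited to that method; the deaccent flag is what distinguishes this version, and the accented sample text below is illustrative:

# Usage sketch for the deaccent variant; sample text is illustrative.
sample = "Visitors to the café praised the café's crêpes."

# With deaccent=True, accented tokens are normalised during cleaning,
# so "café" and "cafe" should collapse onto a single node.
graph = get_graph(sample, deaccent=True)
print(len(graph.nodes()), "nodes:", graph.nodes())
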
Example #3
def get_graph(text, language="english"):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)

    return graph
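
Example #3 is the same builder without the deaccent option. Since keywords() in Example #1 performs exactly these steps before ranking, get_graph() is effectively the first half of that pipeline; a hedged sketch of finishing it with the private helpers named above, assuming they are all importable from one module:

# Hedged sketch: complete the pipeline from Example #1 by hand.
# Every call below appears verbatim in the snippets above.
graph = get_graph("Systems process text. Text systems rank words.")

_remove_unreachable_nodes(graph)
if len(graph.nodes()) > 0:
    # dict of lemma -> score, per the comment in Example #1
    pagerank_scores = _pagerank(graph)
    print(sorted(pagerank_scores.items(), key=lambda kv: -kv[1]))
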
Example #4
def keywords(text, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
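
The words= parameter is forwarded to _extract_tokens alongside ratio; assuming, as in gensim's implementation, that it caps the number of extracted lemmas when given (overriding ratio), a short sketch:

# Hedged sketch: assumption is that words=, when not None, overrides
# ratio and caps how many lemmas _extract_tokens keeps.
sample = "Deep learning models learn layered representations of text."
print(keywords(sample, words=2, split=True))  # e.g. a 2-item list
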
Example #5
def _get_labels(text, language, by_sentence):
    # Maps each processed unit's token to its original surface text.
    syntactic_units = _clean_text_by_sentences(text, language) if by_sentence \
        else _clean_text_by_word(text, language).values()
    return {unit.token: unit.text for unit in syntactic_units}
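
_get_labels maps each processed token back to its surface text, which pairs naturally with get_graph() for labelling nodes. A hedged sketch, assuming graph nodes are the same tokens the cleaning step produces:

# Hedged sketch: label co-occurrence graph nodes with surface text.
sample = "Ranking algorithms rank words by importance."
labels = _get_labels(sample, "english", by_sentence=False)

graph = get_graph(sample)
for node in graph.nodes():
    # Assumption: nodes are tokens; fall back to the token itself.
    print(node, "->", labels.get(node, node))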