def keywords(text, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() keeps numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
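# Usage sketch (illustrative, not part of the original module). The extracted
# keywords and scores depend on the stopword list, tokenizer, and PageRank
# convergence, so the values in the inline comments are only indicative:
#
#     text = (
#         "Challenges in natural language processing frequently involve "
#         "speech recognition, natural language understanding, and "
#         "natural language generation."
#     )
#     keywords(text, split=True)            # e.g. ['natural language', 'challenges']
#     keywords(text, words=2, scores=True)  # e.g. [('natural language', 0.3), ('challenges', 0.2)]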
def get_graph(text, language="english", deaccent=False): tokens = _clean_text_by_word(text, language, deacc=deaccent) split_text = list(_tokenize_by_word(text, deacc=deaccent)) graph = _build_graph(_get_words_for_graph(tokens)) _set_graph_edges(graph, tokens, split_text) return graph
def get_graph(text, language="english"): tokens = _clean_text_by_word(text, language) split_text = list(_tokenize_by_word(text)) graph = _build_graph(_get_words_for_graph(tokens)) _set_graph_edges(graph, tokens, split_text) return graph
def _strip_word(word):
    # Tokenizes the word and returns the first token, or "" if nothing remains
    stripped_word_list = list(_tokenize_by_word(word))
    return stripped_word_list[0] if stripped_word_list else ""
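# Usage sketch (illustrative). The exact result depends on how
# _tokenize_by_word normalizes tokens (e.g. lowercasing, punctuation removal):
#
#     _strip_word("Cats,")  # e.g. 'cats'
#     _strip_word("!!!")    # '' (no tokens survive)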