def keywords(text, ratio=0.2, words=None, split=False, scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'],
             lemmatize=False, deacc=True):
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
def get_graph(text):
    tokens = _clean_text_by_word(text)
    split_text = list(_tokenize_by_word(text))

    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)

    return graph
def _strip_word(word): """Get cleaned `word`. Parameters ---------- word : str Given word. Returns ------- str Cleaned word. """ stripped_word_list = list(_tokenize_by_word(word)) return stripped_word_list[0] if stripped_word_list else ""
def get_graph(text): """Creates and returns graph from given text, cleans and tokenize text before building graph. Parameters ---------- text : str Sequence of values. Returns ------- :class:`~gensim.summarization.graph.Graph` Created graph. """ tokens = _clean_text_by_word(text) split_text = list(_tokenize_by_word(text)) graph = _build_graph(_get_words_for_graph(tokens)) _set_graph_edges(graph, tokens, split_text) return graph
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'),
             lemmatize=False, deacc=True):
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0):
    """Extract keywords from text using the Montemurro and Zanette entropy algorithm. [1]_

    Parameters
    ----------
    text: str
        Document for summarization.
    blocksize: int, optional
        Size of blocks to use in analysis.
    scores: bool, optional
        Whether to return scores with keywords.
    split: bool, optional
        Whether to return results as a list.
    weighted: bool, optional
        Whether to weight scores by word frequency.
        False can be useful for shorter texts, and allows automatic thresholding.
    threshold: float or 'auto', optional
        Minimum score for returned keywords, 'auto' calculates the threshold as
        n_blocks / (n_blocks + 1.0) + 1e-8, use 'auto' with `weighted=False`.

    Returns
    -------
    results: str
        newline separated keywords if `split` == False **OR**
    results: list(str)
        list of keywords if `scores` == False **OR**
    results: list(tuple(str, float))
        list of (keyword, score) tuples if `scores` == True

    Results are returned in descending order of score regardless of the format.

    Note
    ----
    This algorithm looks for keywords that contribute to the structure of the
    text on scales of `blocksize` words or larger. It is suitable for extracting
    keywords representing the major themes of long texts.

    References
    ----------
    .. [1] Marcello A Montemurro, Damian Zanette,
       "Towards the quantification of the semantic information encoded in written language".
       Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153,
       DOI: 10.1142/S0219525910002530, https://arxiv.org/abs/0907.1558

    """
    text = to_unicode(text)
    words = [word for word in _tokenize_by_word(text)]
    vocab = sorted(set(words))
    word_counts = numpy.array(
        [
            [words[i:i + blocksize].count(word) for word in vocab]
            for i in range(0, len(words), blocksize)
        ]
    ).astype('d')
    n_blocks = word_counts.shape[0]
    totals = word_counts.sum(axis=0)
    n_words = totals.sum()
    p = word_counts / totals
    log_p = numpy.log2(p)
    h = numpy.nan_to_num(p * log_p).sum(axis=0)
    analytic = __analytic_entropy(blocksize, n_blocks, n_words)
    h += analytic(totals).astype('d')
    if weighted:
        h *= totals / n_words
    if threshold == 'auto':
        threshold = n_blocks / (n_blocks + 1.0) + 1.0e-8
    weights = [(word, score) for (word, score) in zip(vocab, h) if score > threshold]
    weights.sort(key=lambda x: -x[1])
    result = weights if scores else [word for (word, score) in weights]
    if not (scores or split):
        result = '\n'.join(result)
    return result
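# Illustrative sketch (not from the original source) of the quantity mz_keywords
# accumulates per word: with p[j] the fraction of a word's occurrences that fall
# in block j, the code sums p[j] * log2(p[j]) over blocks and then adds an
# analytic baseline, so words concentrated in a few blocks stand out.
import numpy
counts = numpy.array([4.0, 0.0, 0.0, 4.0])  # toy per-block counts for one word
p = counts / counts.sum()
neg_h = numpy.nan_to_num(p * numpy.log2(p)).sum()  # nan_to_num maps 0*log2(0) to 0
print(neg_h)  # -1.0: the word occupies 2 of 4 blocks evenly (block entropy of 1 bit)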
def _strip_word(word):
    stripped_word_list = list(_tokenize_by_word(word))
    return stripped_word_list[0] if stripped_word_list else ""
def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0):
    """Extract keywords from text using the Montemurro and Zanette entropy algorithm. [1]_

    Parameters
    ----------
    text: str
        Document for summarization.
    blocksize: int, optional
        Size of blocks to use in analysis.
    scores: bool, optional
        Whether to return scores with keywords.
    split: bool, optional
        Whether to return results as a list.
    weighted: bool, optional
        Whether to weight scores by word frequency.
        False can be useful for shorter texts, and allows automatic thresholding.
    threshold: float or 'auto', optional
        Minimum score for returned keywords, 'auto' calculates the threshold as
        n_blocks / (n_blocks + 1.0) + 1e-8, use 'auto' with `weighted=False`.

    Returns
    -------
    results: str
        newline separated keywords if `split` == False **OR**
    results: list(str)
        list of keywords if `scores` == False **OR**
    results: list(tuple(str, float))
        list of (keyword, score) tuples if `scores` == True

    Results are returned in descending order of score regardless of the format.

    Note
    ----
    This algorithm looks for keywords that contribute to the structure of the
    text on scales of `blocksize` words or larger. It is suitable for extracting
    keywords representing the major themes of long texts.

    References
    ----------
    .. [1] Marcello A Montemurro, Damian Zanette,
       "Towards the quantification of the semantic information encoded in written language".
       Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153,
       DOI: 10.1142/S0219525910002530, https://arxiv.org/abs/0907.1558

    """
    text = to_unicode(text)
    words = [word for word in _tokenize_by_word(text)]
    vocab = sorted(set(words))
    word_counts = count_freqs_by_blocks(words, vocab, blocksize)
    n_blocks = word_counts.shape[0]
    totals = word_counts.sum(axis=0)
    n_words = totals.sum()
    p = word_counts / totals
    log_p = np.log2(p)
    h = np.nan_to_num(p * log_p).sum(axis=0)
    analytic = __analytic_entropy(blocksize, n_blocks, n_words)
    h += analytic(totals).astype('d', copy=False)
    if weighted:
        h *= totals / n_words
    if threshold == 'auto':
        threshold = n_blocks / (n_blocks + 1.0) + 1.0e-8
    weights = [(word, score) for (word, score) in zip(vocab, h) if score > threshold]
    weights.sort(key=lambda x: -x[1])
    result = weights if scores else [word for (word, score) in weights]
    if not (scores or split):
        result = '\n'.join(result)
    return result
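# A minimal usage sketch for mz_keywords, assuming the function above is
# importable (e.g. from gensim.summarization.mz_entropy); the file name is
# hypothetical, and long texts work best since scoring happens per 1024-word block.
with open("large_document.txt") as fin:
    doc = fin.read()
top = mz_keywords(doc, blocksize=1024, scores=True, weighted=False, threshold="auto")
for word, score in top[:10]:
    print(word, score)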
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'),
             lemmatize=False, deacc=True):
    """Get most ranked words of provided text and/or its combinations.

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        If the `words` option is not set, the number of returned keywords is this
        fraction of the text's distinct words; otherwise, the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        Whether to return the keywords as a list.
    scores : bool, optional
        Whether to return the score with each keyword.
    pos_filter : tuple, optional
        Part of speech tags to filter words by.
    lemmatize : bool, optional
        If True - lemmatize words.
    deacc : bool, optional
        If True - remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by newlines.

    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    if not any(True for _ in graph.iter_edges()):
        return _format_results([], [], split, scores)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'),
             lemmatize=False, deacc=True):
    """Get most ranked words of provided text and/or its combinations.

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        If the `words` option is not set, the number of returned keywords is this
        fraction of the text's distinct words; otherwise, the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        Whether to return the keywords as a list.
    scores : bool, optional
        Whether to return the score with each keyword.
    pos_filter : tuple, optional
        Part of speech tags to filter words by.
    lemmatize : bool, optional
        If True - lemmatize words.
    deacc : bool, optional
        If True - remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by newlines.

    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    if not any(True for _ in graph.iter_edges()):
        return _format_results([], [], split, scores)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
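# A minimal usage sketch for keywords, assuming it is importable as in gensim
# (e.g. from gensim.summarization import keywords); the sample text is illustrative.
sample = (
    "Challenges in natural language processing frequently involve speech recognition, "
    "natural language understanding, and natural language generation."
)
print(keywords(sample, words=3, split=True, lemmatize=True))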