def _tokenize_entities_document(text, entities, min_length=1, stopwords=None): """ A text tokenizer that passes only terms (a.k.a. 'entities') explicitly contained in the entities argument. Parameters ---------- text : str A single text document to be tokenized entities : iterable of str Collection of noun phrases, obtained from collect_entities function min_length : int Minimum length of any single word stopwords : None or iterable of str Collection of words to ignore as tokens Examples -------- >>> ents = _collect_entities(sample_corpus) >>> text = sample_corpus[0][1] >>> tokenized_text = _tokenize_entities_document(text,ents) >>> tokenized_text == [ ... u'frank', u'swank_tank', u'prancercise', u'sassy_unicorns'] True """ result = [] for np in TextBlob(text).noun_phrases: if np in entities: # filter out stop words tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords)) # if we end up with nothing, don't append an empty string if tmp: result.append(tmp) return result
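# _collect_entities is referenced by the doctests above but is not shown in this
# section. The sketch below is only a guess at its shape: gather TextBlob noun
# phrases across the corpus and keep those that recur. The parameter name
# min_occur and the exact filtering rule are assumptions, not the project's
# confirmed implementation.
def _collect_entities(raw_corpus, min_occur=2):
    from collections import Counter

    # count noun-phrase occurrences over the whole corpus
    counts = Counter()
    for doc_id, doc_text in raw_corpus:
        counts.update(TextBlob(doc_text).noun_phrases)
    # keep only phrases frequent enough to be treated as entities
    return {np for np, count in counts.items() if count >= min_occur}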
def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1,
                                  min_freqs=None, stopwords=None):
    """Collects bigrams and trigrams from a collection of documents; the output
    is the input to the collocation tokenizer.

    Bigrams are pairs of words that recur in the collection; trigrams are
    triplets.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        Body of documents to examine
    top_n : int
        Limit results to this many entries
    min_length : int
        Minimum length of any single word
    min_freqs : iterable of int
        Frequency thresholds for accepting a word sequence as a recognized
        n-gram; the first entry applies to bigrams, the second to trigrams.
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> patterns = _collect_bigrams_and_trigrams(sample_corpus, min_freqs=[2, 2])
    >>> patterns[0].pattern
    u'(frank swank|swank tank|sassy unicorns)'
    >>> patterns[1].pattern
    u'(frank swank tank)'
    """
    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

    # generator of documents: turn each element into its list of words
    doc_texts = (_simple_document(doc_text, min_length=min_length,
                                  stopwords=stopwords)
                 for doc_id, doc_text in raw_corpus)
    # generator: concatenate (chain) all words into a single sequence, lazily
    words = itertools.chain.from_iterable(doc_texts)
    tcf = TrigramCollocationFinder.from_words(iter(words))

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freqs[0])
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]

    tcf.apply_freq_filter(min_freqs[1])
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]

    bigrams_patterns = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)
    trigrams_patterns = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)
    return bigrams_patterns, trigrams_patterns
def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
    """A text tokenizer that retrieves entities ('noun phrases') first and
    falls back to simple words for the rest of the text.

    Parameters
    ----------
    text : str
        A single text document to be tokenized
    entities : iterable of str
        Collection of noun phrases, obtained from the _collect_entities function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> ents = _collect_entities(sample_corpus)
    >>> text = sample_corpus[0][1]
    >>> tokenized_text = _tokenize_mixed_document(text, ents)
    >>> tokenized_text == [u'frank', u'swank_tank', u'sassy', u'unicorn',
    ...     u'brony', u'prancercise', u'class', u'prancercise', u'popular',
    ...     u'pastime', u'sassy_unicorns']
    True
    """
    result = []
    for np in TextBlob(text).noun_phrases:
        if ' ' in np and np not in entities:
            # break apart the noun phrase: it does not occur often enough in
            # the collection of text to be considered an entity
            result.extend(_simple_document(np, min_length=min_length,
                                           stopwords=stopwords))
        else:
            # filter out stop words
            tmp = "_".join(_simple_document(np, min_length=min_length,
                                            stopwords=stopwords))
            # if we end up with nothing, don't append an empty string
            if tmp:
                result.append(tmp)
    return result
def _collect_ngrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None,
                    stopwords=None):
    """Collects bigrams, trigrams, and quadgrams from a collection of documents;
    the output is the input to the collocation tokenizer.

    Bigrams are pairs of words that recur in the collection; trigrams/quadgrams
    are triplets/quadruplets.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        Body of documents to examine
    top_n : int
        Limit results to this many entries
    min_length : int
        Minimum length of any single word
    min_freqs : iterable of int
        Frequency thresholds for accepting a word sequence as a recognized
        n-gram; the entries apply to bigrams, trigrams, and quadgrams, in
        that order.
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> patterns = _collect_ngrams(sample_corpus, min_freqs=[2, 2, 2])
    >>> patterns[0].pattern
    u'(frank swank|swank tank|sassy unicorns)'
    >>> patterns[1].pattern
    u'(frank swank tank)'
    """
    # generator of documents: turn each element into its list of words
    doc_texts = (_simple_document(doc_text, min_length=min_length,
                                  stopwords=stopwords)
                 for doc_id, doc_text in raw_corpus)
    # generator: concatenate (chain) all words into a single sequence, lazily
    words = itertools.chain.from_iterable(doc_texts)
    # tee the word stream so each n-gram collector consumes its own copy
    words_iterators = itertools.tee(words, 3)

    bigrams_patterns = _get_bigrams(words_iterators[0], top_n, min_freqs[0])
    trigrams_patterns = _get_trigrams(words_iterators[1], top_n, min_freqs[1])
    quadgrams_patterns = _get_quadgrams(words_iterators[2], top_n, min_freqs[2])
    return (bigrams_patterns, trigrams_patterns, quadgrams_patterns)
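# _collect_ngrams delegates to _get_bigrams, _get_trigrams, and _get_quadgrams,
# which are not shown in this section. The sketches below are a guess at their
# shape, mirroring _collect_bigrams_and_trigrams above: rank collocations with
# nltk's finders and return a compiled regex of the top_n surviving n-grams.
# The scoring measures (pmi for bigrams, chi_sq for tri/quadgrams) and return
# types are assumptions, not the project's confirmed implementation.
def _get_bigrams(words, top_n, min_freq):
    from nltk.collocations import BigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures

    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_freq_filter(min_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    return re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)


def _get_trigrams(words, top_n, min_freq):
    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import TrigramAssocMeasures

    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
    return re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)


def _get_quadgrams(words, top_n, min_freq):
    from nltk.collocations import QuadgramCollocationFinder
    from nltk.metrics.association import QuadgramAssocMeasures

    qcf = QuadgramCollocationFinder.from_words(words)
    qcf.apply_freq_filter(min_freq)
    quadgrams = [' '.join(w) for w in qcf.nbest(QuadgramAssocMeasures.chi_sq, top_n)]
    return re.compile('(%s)' % '|'.join(quadgrams), re.UNICODE)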
def _collocation_document(text, patterns, min_length=1, stopwords=None):
    """A text tokenizer that includes collocations (bigrams and trigrams).

    A collocation is a sequence of words or terms that co-occur more often
    than would be expected by chance. This function breaks a raw document up
    into tokens based on a pre-established collection of bigrams and trigrams.
    This collection is derived from a body of many documents, and must be
    obtained in a prior step using the _collect_bigrams_and_trigrams function,
    which uses nltk.collocations.TrigramCollocationFinder to find trigrams
    and bigrams.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    patterns : tuple of compiled regex objects used to find n-grams
        Obtained from the _collect_bigrams_and_trigrams function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> patterns = _collect_bigrams_and_trigrams(sample_corpus, min_freqs=[2, 2])
    >>> text = sample_corpus[0][1]
    >>> tokenized_text = _collocation_document(text, patterns)
    >>> tokenized_text == [
    ...     u'frank_swank', u'tank', u'walked', u'sassy', u'unicorn', u'brony',
    ...     u'prancercise', u'class', u'daily', u'prancercise', u'tremendously',
    ...     u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike']
    True
    """
    text = ' '.join(_simple_document(text, min_length=min_length,
                                     stopwords=stopwords))
    for pattern in patterns:
        text = re.sub(pattern, lambda match: match.group(0).replace(' ', '_'),
                      text)
    return text.split()
def _collocation_document(text, patterns, min_length=1, stopwords=None):
    """A text tokenizer that includes collocations (bigrams, trigrams, and
    quadgrams).

    A collocation is a sequence of words or terms that co-occur more often
    than would be expected by chance. This function breaks a raw document up
    into tokens based on a pre-established collection of bigrams, trigrams,
    and quadgrams. This collection is derived from a body of many documents,
    and must be obtained in a prior step using the _collect_ngrams function,
    which uses nltk.collocations.(Bi/Tri/Quad)gramCollocationFinder to find
    bigrams/trigrams/quadgrams.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    patterns : tuple of compiled regex objects used to find n-grams
        Obtained from the _collect_ngrams function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> patterns = _collect_ngrams(sample_corpus, min_freqs=[2, 2, 2])
    >>> text = sample_corpus[0][1]
    >>> tokenized_text = _collocation_document(text, patterns)
    >>> tokenized_text == [
    ...     u'frank_swank', u'tank', u'walked', u'sassy', u'unicorn', u'brony',
    ...     u'prancercise', u'class', u'daily', u'prancercise', u'tremendously',
    ...     u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike']
    True
    """
    text = ' '.join(_simple_document(text, min_length=min_length,
                                     stopwords=stopwords))
    for pattern in patterns:
        text = re.sub(pattern, lambda match: match.group(0).replace(' ', '_'),
                      text)
    return text.split()
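# End-to-end usage sketch for the collocation pipeline, illustrative only: the
# helper name below is hypothetical and not part of the module. It collects the
# n-gram patterns once over the whole corpus, then tokenizes each document with
# them. Note that the patterns are applied in order, so an earlier bigram
# replacement can keep a longer n-gram from matching (the doctest output above
# shows u'frank_swank', u'tank' rather than u'frank_swank_tank').
def _tokenize_corpus_with_collocations(raw_corpus, min_freqs=(2, 2, 2)):
    # raw_corpus must be re-iterable (e.g. a list), since it is consumed twice
    patterns = _collect_ngrams(raw_corpus, min_freqs=list(min_freqs))
    return {doc_id: _collocation_document(doc_text, patterns)
            for doc_id, doc_text in raw_corpus}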
def test__simple_document():
    assert _simple_document(sample_data[0][1]) == ["frank", "frank", "frank",
                                                   "dog", "cat"]
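# An illustrative companion test, not part of the original suite: it checks a
# structural property of _collect_bigrams_and_trigrams (a phrase repeated
# across documents survives the frequency filter and lands in the bigram
# pattern). It assumes _simple_document lowercases its input and does not drop
# the words "red" or "panda".
def test__collect_bigrams_and_trigrams_keeps_repeated_pairs():
    tiny_corpus = [("doc1", "The red panda ate bamboo shoots."),
                   ("doc2", "A red panda slept in the tree.")]
    bigram_pattern, trigram_pattern = _collect_bigrams_and_trigrams(
        tiny_corpus, min_freqs=[2, 2])
    assert "red panda" in bigram_pattern.pattern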