def demo_similar(self, word, num=20):
    """
    Distributional similarity: find other words which appear in the
    same contexts as the specified word; list most similar words first.

    @param word: The word used to seed the similarity search
    @type word: C{str}
    @param num: The number of words to generate (default=20)
    @type num: C{int}
    @seealso: L{ContextIndex.similar_words()}
    """
    if '_word_context_index' not in self.__dict__:
        print 'Building word-context index...'
        self._word_context_index = nltk.text.ContextIndex(
            self.tokens,
            filter=lambda x: x.isalpha(),
            key=lambda s: s.lower())

    # words = self._word_context_index.similar_words(word, num)

    while 1:
        word = raw_input('Enter a Chinese word such as "開心"(type 0 to exit):')
        print "word='" + word + "'"
        if word == '0':
            break
        word = word.decode('utf-8')
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions()
                          for c in wci[w]
                          if c in contexts and not w == word)
            words = fd.keys()[:num]
            print tokenwrap(words)
        else:
            print "No matches"
def demo_collocations(self, num=40, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    @seealso: L{find_collocations}
    @param num: The maximum number of collocations to print.
    @type num: C{int}
    @param window_size: The number of tokens spanned by a collocation (default=2)
    @type window_size: C{int}
    """
    if not ('_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        print "Building collocations list"
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        from nltk.collocations import BigramCollocationFinder
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        from nltk.metrics import BigramAssocMeasures
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + u' ' + w2 for w1, w2 in self._collocations]
    print "List {0} collocations".format(num)
    print tokenwrap(colloc_strings, separator=u'; ')
def demo_common_context(self, num=20):
    """
    Find contexts where the specified words appear; list
    most frequent common contexts first.

    @seealso: L{ContextIndex.common_contexts()}
    """
    if '_word_context_index' not in self.__dict__:
        print 'Building word-context index...'
        self._word_context_index = nltk.text.ContextIndex(
            self.tokens, key=lambda s: s.lower())

    while 1:
        inp = raw_input('Enter two Chinese words such as "我 你"(type 0 to exit):')
        print "inp='" + inp + "'"
        if inp == '0':
            break
        inp = inp.decode('utf-8')
        words = inp.split(u' ')
        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print "No common contexts were found"
            else:
                ranked_contexts = fd.keys()[:num]
                print tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts)
        except ValueError, e:
            print e
def sandwich(cls, word):
    """
    Return the sandwich results for ``word`` from the corpus with the
    highest health score, joined for display.
    """
    ind = cls.corpora_health.index(max(cls.corpora_health))
    results = cls.corpora[ind].sandwich(word)
    # results = [corpus.sandwich(word) for corpus in cls.corpora]
    return tokenwrap(results)
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not ('_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        # print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
def common_contexts(self, words, num=20):
    """
    Find contexts where the specified words appear; list
    most frequent common contexts first.

    :param words: The words used to seed the similarity search
    :type words: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.common_contexts()
    """
    if '_word_context_index' not in self.__dict__:
        # print('Building word-context index...')
        self._word_context_index = ContextIndex(
            self.tokens, key=lambda s: s.lower()
        )

    try:
        fd = self._word_context_index.common_contexts(words, True)
        if not fd:
            print("No common contexts were found")
        else:
            ranked_contexts = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))
    except ValueError as e:
        print(e)
def findall(self, regexp):
    """
    Find instances of the regular expression in the text.
    The text is a list of tokens, and a regexp pattern to match
    a single token must be surrounded by angle brackets.  E.g.

    >>> print('hack'); from nltk.book import text1, text5, text9
    hack...
    >>> text5.findall("<.*><.*><bro>")
    you rule bro; telling you bro; u twizted bro
    >>> text1.findall("<a>(<.*>)<man>")
    monied; nervous; dangerous; white; white; white; pious; queer; good;
    mature; white; Cape; great; wise; wise; butterless; white; fiendish;
    pale; furious; better; certain; complete; dismasted; younger; brave;
    brave; brave; brave
    >>> text9.findall("<th.*>{3,}")
    thread through those; the thought that; that the thing; the thing
    that; that that thing; through these than through; them that the;
    through the thick; them that they; thought that the

    :param regexp: A regular expression
    :type regexp: str
    """
    if "_token_searcher" not in self.__dict__:
        self._token_searcher = TokenSearcher(self)

    hits = self._token_searcher.findall(regexp)
    hits = [' '.join(h) for h in hits]
    print(tokenwrap(hits, "; "))
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not (
        '_collocations' in self.__dict__
        and self._num == num
        and self._window_size == window_size
    ):
        self._num = num
        self._window_size = window_size
        # print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
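# A minimal standalone sketch of the same bigram-collocation pipeline, for
# trying the finder outside of a Text subclass. The corpus choice
# (nltk.corpus.genesis) is only an illustrative assumption and requires
# nltk.download('genesis') and nltk.download('stopwords').
from nltk.corpus import genesis, stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.util import tokenwrap

ignored_words = set(stopwords.words('english'))
finder = BigramCollocationFinder.from_words(genesis.words('english-web.txt'), 2)
finder.apply_freq_filter(2)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
bigram_measures = BigramAssocMeasures()
best = finder.nbest(bigram_measures.likelihood_ratio, 10)
print(tokenwrap(w1 + ' ' + w2 for w1, w2 in best))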
def similar(self, word, num=20):
    """
    Distributional similarity: find other words which appear in the
    same contexts as the specified word; list most similar words first.

    :param word: The word used to seed the similarity search
    :type word: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.similar_words()
    """
    if '_word_context_index' not in self.__dict__:
        # print('Building word-context index...')
        self._word_context_index = ContextIndex(
            self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
        )

    # words = self._word_context_index.similar_words(word, num)
    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word in wci.conditions():
        contexts = set(wci[word])
        fd = Counter(
            w
            for w in wci.conditions()
            for c in wci[w]
            if c in contexts and not w == word
        )
        words = [w for w, _ in fd.most_common(num)]
        print(tokenwrap(words))
    else:
        print("No matches")
def common_contexts(self, words, num=20):
    """
    Find contexts where the specified words appear; list
    most frequent common contexts first.

    :param words: The words used to seed the similarity search
    :type words: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.common_contexts()
    """
    if '_word_context_index' not in self.__dict__:
        # print('Building word-context index...')
        self._word_context_index = ContextIndex(self.tokens, key=lambda s: s.lower())

    try:
        fd = self._word_context_index.common_contexts(words, True)
        if not fd:
            print("No common contexts were found")
        else:
            ranked_contexts = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))
    except ValueError as e:
        print(e)
def similar(self, word, num=20):
    """
    Distributional similarity: find other words which appear in the
    same contexts as the specified word; list most similar words first.

    :param word: The word used to seed the similarity search
    :type word: str
    :param num: The number of words to generate (default=20)
    :type num: int
    :seealso: ContextIndex.similar_words()
    """
    if '_word_context_index' not in self.__dict__:
        # print('Building word-context index...')
        self._word_context_index = ContextIndex(
            self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower())

    # words = self._word_context_index.similar_words(word, num)
    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word in wci.conditions():
        contexts = set(wci[word])
        fd = Counter(w for w in wci.conditions()
                     for c in wci[w]
                     if c in contexts and not w == word)
        words = [w for w, _ in fd.most_common(num)]
        print(tokenwrap(words))
    else:
        print("No matches")
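# Hedged usage sketch for similar(): with the NLTK book corpora downloaded,
# nltk.book.text1 (Moby Dick) is a convenient Text to experiment with.
# from nltk.book import text1
# text1.similar("monstrous", num=10)   # prints up to 10 distributionally similar words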
def gen(context='', hashtag='', tries=30):
    tokens = nltk.word_tokenize(corpus)
    text = nltk.Text(tokens)
    text.generate(0)  # generate model
    n = 10
    r = tokenwrap(text._trigram_model.generate(n, context))
    return r[:140 - len(hashtag)] + ' ' + hashtag
def main():
    # Parse Book into Array
    parsed_book = open(str(sys.argv[1])).read().split()

    # Default Values and Parsing Input Values
    # Graph Values
    num_points = 30
    if "--numPoints" in sys.argv:
        num_points = sys.argv[sys.argv.index("--numPoints") + 1]
    y_label = "Frequencies"
    line_width = 3
    title = "Top " + str(num_points) + " Useful Words For " + str(sys.argv[1])
    if "--title" in sys.argv:
        title = sys.argv[sys.argv.index("--title") + 1]
    if "--yLabel" in sys.argv:
        y_label = sys.argv[sys.argv.index("--yLabel") + 1]
    if "--lineWidth" in sys.argv:
        line_width = sys.argv[sys.argv.index("--lineWidth") + 1]

    # Stop Words Values
    blacklist = []
    if "--blacklist" in sys.argv:
        blacklist = sys.argv[sys.argv.index("--blacklist") + 1].replace(" ", "").split(',')

    # Collocations Values
    num_collocations = 20
    if "--numCollocations" in sys.argv:
        num_collocations = sys.argv[sys.argv.index("--numCollocations") + 1]
    window_size = 4
    if "--windowSize" in sys.argv:
        window_size = sys.argv[sys.argv.index("--windowSize") + 1]

    # Collocations
    # Paper Explaining The Math
    # https://nlp.stanford.edu/fsnlp/promo/colloc.pdf
    print("Words Commonly Used Together:")
    print(tokenwrap(
        find_collocations(parsed_book, blacklist,
                          num=int(num_collocations),
                          window_size=int(window_size)),
        separator=" ; "))

    # Filter out Stop Words
    filtered_freq_dist = word_filter(FreqDist(parsed_book), blacklist)

    # Plot
    plot_most_common(filtered_freq_dist, int(num_points), title, y_label, int(line_width))
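# Hypothetical invocation of the script above (the filename book_stats.py is
# an assumption; the flags mirror the sys.argv handling in main()):
#   python book_stats.py moby_dick.txt --numPoints 25 --windowSize 3 --blacklist "the,and,of"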
def preprocessing(comment):
    """
    Function to clean the comment.
    Lower all words and remove stop words.
    """
    words = nltk.word_tokenize(comment)
    clean_words = [word.lower() for word in words
                   if word.lower() not in stopwords.words('danish')]
    cleaned_comment = tokenwrap(clean_words)
    return cleaned_comment
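# Hypothetical call, assuming nltk.download('punkt') and nltk.download('stopwords')
# have been run so the tokenizer and the Danish stopword list are available:
# cleaned = preprocessing("Nogle ord i en dansk kommentar")
# print(cleaned)   # lowercased tokens with Danish stopwords removed, joined by tokenwrap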
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    collocation_strings = [w1 + ' ' + w2
                           for w1, w2 in self.collocation_list(num, window_size)]
    print(tokenwrap(collocation_strings, separator="; "))
def demo_findall(text):
    while 1:
        inp = raw_input('Enter two Chinese words such as "我:2 手:4"(type 0 to exit):')
        print "inp='" + inp + "'"
        if inp == '0':
            break
        inp = inp.decode('big5')
        reg = "<1> <2> <3> <4> <5>"
        if len(inp) == 0:
            print 'no input words'
        else:
            for wp in inp.split(' '):
                (w, p) = wp.split(':')
                # reg = re.sub(p, w, reg)
                reg = re.sub(p, ''.join(['.*', w, '.*']), reg)
            reg = re.sub('\d', '.*', reg)
            print "reg=", reg
            # text.findall(reg)
            if "_token_searcher" not in text.__dict__:
                text._token_searcher = nltk.text.TokenSearcher(text)
            hits = text._token_searcher.findall(reg)
            hits = [' '.join(h) for h in hits]
            print tokenwrap(hits, u"; ")
def generate(self, length=100, context=()):
    """
    Return random text, generated using a trigram language model.

    :param length: The length of text to generate (default=100)
    :type length: int
    :seealso: NgramModel
    """
    if '_trigram_model' not in self.__dict__:
        print "Building ngram index..."
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self._trigram_model = NgramModel(3, self, estimator=estimator)
    text = self._trigram_model.generate(length, context=context)
    return tokenwrap(text)
def generate(self, length=100):
    """
    Print random text, generated using a trigram language model.

    :param length: The length of text to generate (default=100)
    :type length: int
    :seealso: NgramModel
    """
    if '_trigram_model' not in self.__dict__:
        print("Building ngram index...")
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self._trigram_model = NgramModel(3, self, estimator=estimator)
    text = self._trigram_model.generate(length)
    print(tokenwrap(text))
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    collocation_strings = [
        w1 + " " + w2 for w1, w2 in self.collocation_list(num, window_size)
    ]
    print(tokenwrap(collocation_strings, separator="; "))
def synonyms(word):
    ## todo: this should move because we want to cache the results so we can calculate health!!
    results = []
    for synset in wn.synsets(word):
        results.extend(synset.lemma_names)
    result_set = set(results)
    if word in result_set:
        result_set.remove(word)
    ### todo: stopped here... should filter these down to some reasonable thing
    ############ todo: check if the above needs to be cached somewhere (maybe it is cached by wn.synsets?)
    results = list(result_set)
    results = results[:MAX_SYNONYMS_TO_RETURN]
    return tokenwrap(results)
def generate(self, length=100, text_seed=None, random_seed=42):
    """
    Print random text, generated using a trigram language model.
    See also `help(nltk.lm)`.

    :param length: The length of text to generate (default=100)
    :type length: int
    :param text_seed: Generation can be conditioned on preceding context.
    :type text_seed: list(str)
    :param random_seed: A random seed or an instance of `random.Random`. If provided,
        makes the random sampling part of generation reproducible. (default=42)
    :type random_seed: int
    """
    # Create the model when using it the first time.
    self._tokenized_sents = [
        sent.split(" ") for sent in sent_tokenize(" ".join(self.tokens))
    ]
    if not hasattr(self, "_trigram_model"):
        print("Building ngram index...", file=sys.stderr)
        self._trigram_model = self._train_default_ngram_lm(
            self._tokenized_sents, n=3
        )

    generated_tokens = []

    assert length > 0, "The `length` must be more than 0."
    while len(generated_tokens) < length:
        for idx, token in enumerate(
            self._trigram_model.generate(
                length, text_seed=text_seed, random_seed=random_seed
            )
        ):
            if token == "<s>":
                continue
            if token == "</s>":
                break
            generated_tokens.append(token)
        random_seed += 1

    prefix = " ".join(text_seed) + " " if text_seed else ""
    output_str = prefix + tokenwrap(generated_tokens[:length])
    print(output_str)
    return output_str
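# Minimal usage sketch for this generate() (the modern nltk.Text API);
# text1 from nltk.book is only a convenient example corpus:
# from nltk.book import text1
# text1.generate(length=25, text_seed=["The", "whale"], random_seed=3)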
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    >>> from nltk.book import text4
    >>> text4.collocations() # doctest: +ELLIPSIS
    United States; fellow citizens; four years; ...

    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    collocation_strings = [
        w1 + " " + w2 for w1, w2 in self.collocation_list(num, window_size)
    ]
    print(tokenwrap(collocation_strings, separator="; "))
def similar(self, word, num=20):
    """
    Returns as a string similar words
    """
    if '_word_context_index' not in self.__dict__:
        print 'Building word-context index...'
        self._word_context_index = nltk.ContextIndex(
            self.tokens,
            filter=lambda x: x.isalpha(),
            key=lambda s: s.lower())

    # words = self._word_context_index.similar_words(word, num)
    word = word.lower()
    wci = self._word_context_index._word_to_contexts
    if word in wci.conditions():
        contexts = set(wci[word])
        fd = FreqDist(w for w in wci.conditions()
                      for c in wci[w]
                      if c in contexts and not w == word)
        words = fd.keys()[:num]
        return tokenwrap(words)
    else:
        print "No matches"
# NLTK processing
words = [w for t in status_texts for w in t.split()]
nltk_text = nltk.Text(words)
nltk_text.collocations()

ignored_words = stopwords.words('english')
finder = BigramCollocationFinder.from_words(words, 2)
finder.apply_freq_filter(2)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
bigram_measures = nltk.collocations.BigramAssocMeasures()
collocations = finder.nbest(bigram_measures.likelihood_ratio, 20)
colloc_strings = [w1 + ' ' + w2 for w1, w2 in collocations]
# finder = BigramCollocationFinder(word_fd, bigram_fd)
print tokenwrap(colloc_strings, separator="; ")

# create unstylized HTML
summarizedLinks = Counter(urls)
html_file = open('{0}_{1}_statuses.html'.format(data_file, file_time), 'w')
html_file.write('<!DOCTYPE html><html><head></head><body><h1>Analysis of past tweets: "{0}"</h1><h2>{1}</h2>'.format(q, now_time.strftime(fmt)))
html_file.write('<br /><br /><h2>Collocations of commonly occurring pairs of words</h2>')
html_file.write('<ul>')
for collocation in colloc_strings:
    html_file.write('<li>{0}</li>'.format(collocation))
html_file.write('</ul>')
html_file.write('<h2>Most common referenced URLs, unshortened and sorted</h2>')
def wrap(iterable):
    return tokenwrap(iterable)
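# For reference, a quick sketch of what tokenwrap itself does: it joins the
# tokens with the separator and wraps the resulting string at roughly 70
# characters per line (see nltk.util.tokenwrap).
from nltk.util import tokenwrap

print(tokenwrap(["just", "a", "few", "tokens"]))    # -> just a few tokens
print(tokenwrap(["a", "b", "c"], separator="; "))   # -> a; b; c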
def sandwich(cls, word):
    """
    Collect sandwich results for ``word`` from every corpus and return
    them joined for display.
    """
    results = [corpus.sandwich(word) for corpus in cls.corpora]
    return tokenwrap(results)