def collocation_list(self, num=20, window_size=2):
    """
    Return collocations derived from the text, ignoring stopwords.

    :param num: The maximum number of collocations to return.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not (
        "_collocations" in self.__dict__
        and self._num == num
        and self._window_size == window_size
    ):
        self._num = num
        self._window_size = window_size

        # print("Building collocations list")
        from nltk.corpus import stopwords

        ignored_words = stopwords.words("english")
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    return [w1 + " " + w2 for w1, w2 in self._collocations]

def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations

    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not (
        '_collocations' in self.__dict__
        and self._num == num
        and self._window_size == window_size
    ):
        self._num = num
        self._window_size = window_size

        # print("Building collocations list")
        from nltk.corpus import stopwords

        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))

def collocation_list(self, num=20, window_size=2):
    """
    Return collocations derived from the text, ignoring stopwords.

        >>> from nltk.book import text4
        >>> text4.collocation_list()[:2]
        [('United', 'States'), ('fellow', 'citizens')]

    :param num: The maximum number of collocations to return.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    :rtype: list(tuple(str, str))
    """
    if not (
        "_collocations" in self.__dict__
        and self._num == num
        and self._window_size == window_size
    ):
        self._num = num
        self._window_size = window_size

        # print("Building collocations list")
        from nltk.corpus import stopwords

        ignored_words = stopwords.words("english")
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = list(finder.nbest(bigram_measures.likelihood_ratio, num))
    return self._collocations

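# Usage sketch for the Text methods above (an assumption, not part of the original
# module): they are meant to live on nltk.text.Text, so any Text instance, such as
# the ones shipped with nltk.book, can exercise them. Exact output depends on the
# NLTK version and on which variant (string-joining or tuple-returning) is installed.
from nltk.book import text4

text4.collocations(num=10, window_size=2)   # prints e.g. "United States; fellow citizens; ..."
print(text4.collocation_list(num=5))        # top five collocations as a list
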
def collocations(self, duanyu_num=20, window_size=2):
    # Find the top likelihood-ratio collocations in self.tokens, ignoring short
    # words and stopwords.
    finder = BigramCollocationFinder.from_words(self.tokens, window_size)
    finder.apply_freq_filter(2)
    finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in self.ignored_words)
    bigram_measures = BigramAssocMeasures()
    self._collocations = finder.nbest(bigram_measures.likelihood_ratio, duanyu_num)
    cizus = [w1 + ' ' + w2 for w1, w2 in self._collocations]

    # POS-tag the tokens and keep only collocations whose words are all nouns,
    # contain no periods, and are not stopwords.
    tag_word = pos_tag(self.tokens)
    tag_word_map = dict(tag_word)
    cizu_NN = []
    for cizu in cizus:
        flag = True
        for word in cizu.split():
            if tag_word_map[word] not in ['NN', 'NNS', 'NNP', 'NNPS'] \
                    or word.find('.') != -1 or word in self.ignored_words:
                flag = False
        if flag:
            cizu_NN.append(re.sub(r'\.|\?|!|…', '', cizu))

    # Also collect individual noun tokens longer than four characters, with
    # sentence-ending punctuation stripped.
    text = list(tag_word)
    text_n_list = [
        re.sub(r'\.|\?|!|…', '', word_[0]) for word_ in text
        if len(word_[0]) > 4 and word_[0] not in self.ignored_words
        and word_[1] in ['NN', 'NNS', 'NNP', 'NNPS'] and word_[0].find('.') == -1
    ]
    text_n_list = text_n_list + cizu_NN
    return text_n_list

def get_collocations(self):
    ignored_words = stopwords.words('english')
    finder = BigramCollocationFinder.from_words(self.text_array, 2)
    finder.apply_freq_filter(3)
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    bigram_measures = BigramAssocMeasures()
    return finder.nbest(bigram_measures.likelihood_ratio, 40)

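# Standalone equivalent of get_collocations (a sketch under assumptions: the
# enclosing class is not shown, so self.text_array is taken to be a flat list of
# word tokens, and the imports below are the ones the method relies on).
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


def top_likelihood_bigrams(tokens, n=40):
    ignored_words = set(stopwords.words('english'))
    finder = BigramCollocationFinder.from_words(tokens, 2)
    finder.apply_freq_filter(3)     # drop bigrams seen fewer than 3 times
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    return finder.nbest(BigramAssocMeasures().likelihood_ratio, n)
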
def ShowCollocations():
    text.insert(END, "If this doesn't work, please check you have NLTK, PyYAML and "
                     "the NLTK stopword list loaded. See Help for details \n\n\n")
    import nltk
    from nltk.collocations import BigramCollocationFinder
    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures
    from nltk.metrics import TrigramAssocMeasures

    pattern = r'''(?x)([A-Z]\.)+|\w+([-']\w+)*|\$?\d+(\.\d+)?%?|\.\.\.|[][.,;"'?():-_']'''
    data = resultsbox.get(1.0, END)
    rawtext = nltk.regexp_tokenize(data, pattern)
    # Lowercase, drop stopwords and non-alphabetic tokens before finding collocations.
    prepcolloc = [word.lower() for word in rawtext
                  if word not in stopwords and word.isalpha()]

    text.delete(1.0, END)
    text.insert(END, "Collocations (occurring at least 3 times; top 10 by PMI)\n")
    text.insert(END, "\nBigram Collocations:\n")
    bigram = BigramAssocMeasures()
    bigramfinder = BigramCollocationFinder.from_words(prepcolloc)
    bigramfinder.apply_freq_filter(3)
    bigrams = bigramfinder.nbest(bigram.pmi, 10)
    for item in bigrams:
        first = item[0]
        second = item[1]
        text.insert(END, first)
        text.insert(END, " ")
        text.insert(END, second)
        text.insert(END, "\n")

def best_bigram_word_feats(self, words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    bgm = BigramAssocMeasures()
    bigram_finder = BigramCollocationFinder.from_words(words)
    self.bigrams = bigram_finder.score_ngrams(bgm.likelihood_ratio)
    # self.bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(' '.join(bigram), s) for bigram, s in self.bigrams])
    # d.update(self.best_word_feats(words))
    return d

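# Quick standalone illustration (an assumption, not from the original class) of the
# score_ngrams / nbest distinction used above: score_ngrams keeps every bigram with
# its score, while nbest keeps only the top-n bigrams and discards the scores.
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

_tokens = "the quick brown fox jumps over the lazy brown fox".split()
_finder = BigramCollocationFinder.from_words(_tokens)
print(_finder.score_ngrams(BigramAssocMeasures.likelihood_ratio)[:3])  # [((w1, w2), score), ...]
print(_finder.nbest(BigramAssocMeasures.chi_sq, 3))                    # [(w1, w2), ...]
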
def bi_collocations(tokens, num=20):
    from nltk.corpus import stopwords
    ignored_words = stopwords.words('english')
    word_list = [word for sent in tokens for word in sent]
    finder = BigramCollocationFinder.from_words(word_list, 2)
    finder.apply_freq_filter(3)
    finder.apply_ngram_filter(lambda w1, w2: len(w1) < 3
                              or len(w2) < 3
                              or (len(w1) + len(w2)) < 8
                              or w1.lower() in ignored_words
                              or w2.lower() in ignored_words)
    # length=2 want to keep e.g. rf pulse
    bigram_measures = BigramAssocMeasures()
    collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    return collocations

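# Small usage sketch (an assumption; the original caller is not shown): the function
# expects sentence-segmented input, i.e. a list of token lists, which it flattens
# before scoring. The Brown news category is only a convenient stand-in corpus and
# assumes the corpus data has been downloaded.
from nltk.corpus import brown

print(bi_collocations(brown.sents(categories='news'), num=10))
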
def findtopbigrams(bigrams, word_fd, settings):
    nkey = settings['nkey']
    measure = settings['measure']
    bigram_measures = BigramAssocMeasures()
    bigram_fd = FreqDist(bigrams)
    finder = BigramCollocationFinder(word_fd, bigram_fd)
    warning = ""

    # Map the setting name to the association measure; fall back to simple raw
    # frequency if the measure is unknown or fails to compute.
    measure_fns = {
        "LR": bigram_measures.likelihood_ratio,
        "PMI": bigram_measures.pmi,
        "CHISQ": bigram_measures.chi_sq,
        "STUDT": bigram_measures.student_t,
    }
    if measure in measure_fns:
        try:
            top_bigrams = finder.nbest(measure_fns[measure], nkey)
        except Exception:
            warning = ("Problem with %s measure. Default to simple frequency "
                       "(RAW setting)" % measure)
            print(warning)
            top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)
    else:
        top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)

    # Bigrams are scored with LR or a similar measure, but raw counts are more
    # helpful to the end user (the measure used is explained in a tool tip).
    top_bg_with_count = sorted(
        [(bg, count) for (bg, count) in finder.ngram_fd.items() if bg in top_bigrams],
        key=lambda bgcount: -bgcount[1])
    top_bigrams = [(bg, count) for (bg, count) in top_bg_with_count
                   if count > 1 and bg[0] != bg[1]]
    return top_bigrams, bigram_fd, warning

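# Input-construction sketch (an assumption about the caller, which is not shown):
# the finder is rebuilt from a word FreqDist plus a bigram FreqDist, so the caller
# supplies raw token counts and a settings dict naming the measure and list length.
from nltk import FreqDist
from nltk.util import bigrams as make_bigrams

_tokens = "to be or not to be that is the question".split()
_word_fd = FreqDist(_tokens)
_bigram_list = list(make_bigrams(_tokens))
top, bg_fd, warn = findtopbigrams(_bigram_list, _word_fd,
                                  {'nkey': 5, 'measure': 'PMI'})
print(top)   # e.g. [(('to', 'be'), 2)] -- only repeated, non-self-paired bigrams survive
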
def collocRecursively(corp, interp, constructor, threshhold, addUnrelated, addBigram, filters=None):
    bgFinder = constructor(corp)
    if filters:
        bgFinder = applyFilters(bgFinder, filters)
    bgScores = {bg: score for bg, score
                in bgFinder.score_ngrams(BigramAssocMeasures().likelihood_ratio)}
    print(sorted(list(bgScores.items()), key=lambda tup: tup[1])[-6:])

    idx = 0
    N = len(corp)
    newCorp = list()
    flag = False
    while idx < N - 1:
        bg = (corp[idx], corp[idx + 1])
        if bgScores.get((interp(bg[0]), interp(bg[1])), 0) > threshhold:
            # Merge the adjacent pair and skip past both tokens.
            addBigram(newCorp, bg)
            idx += 2
            flag = True
        else:
            addUnrelated(newCorp, bg[0])
            idx += 1
    if idx == N - 1:
        addUnrelated(newCorp, corp[idx])
    if flag:
        # At least one pair was merged, so rescore the rewritten corpus.
        return collocRecursively(newCorp, interp, constructor, threshhold,
                                 addUnrelated, addBigram, filters)
    return newCorp

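# Plausible callables for collocRecursively (assumptions only; the original call
# sites are not shown). This configuration joins a detected pair with an underscore
# and recurses until no adjacent pair scores above the (illustrative) threshold.
from nltk.collocations import BigramCollocationFinder

merged = collocRecursively(
    corp="new york is not old york and new york is large".split(),
    interp=lambda w: w,                                  # look scores up by the tokens as-is
    constructor=BigramCollocationFinder.from_words,
    threshhold=3.0,                                      # illustrative cut-off on the LR score
    addUnrelated=lambda acc, w: acc.append(w),
    addBigram=lambda acc, bg: acc.append('_'.join(bg)),
)
print(merged)
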
from nltk.util import ngrams
from nltk.corpus import alpino

print(alpino.words())
quadgrams = ngrams(alpino.words(), 4)  # 4-grams
print(quadgrams)
# for i in quadgrams:
#     print(i)

from nltk.collocations import BigramCollocationFinder
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
stops_filter = lambda w: len(w) < 3 or w in stop_words  # word shorter than 3 chars, or a stopword
tokens = [t.lower() for t in webtext.words('grail.txt')]
words = BigramCollocationFinder.from_words(tokens)  # create the finder instance
print(words)
words.apply_word_filter(stops_filter)
res = words.nbest(BigramAssocMeasures.likelihood_ratio, 5)  # top 5 bigrams
print(res)

# Use the collocation finder to generate bigrams for a short text.
import nltk

text1 = "Hardwork is the key to success. Never give up!"
word = nltk.wordpunct_tokenize(text1)
finder = BigramCollocationFinder.from_words(word)
bigram_measures = BigramAssocMeasures()
value = finder.score_ngrams(bigram_measures.raw_freq)
print(sorted(bigram for bigram, score in value))