def find(self, docs, top_n, strip_tags=True):
    """
    Parameter:
    ---------------
    docs: list of tokenized documents
    top_n: int
        how many labels to return
    strip_tags: bool
        whether to return the bigrams without their POS tags

    Return:
    ---------------
    list of tuple of str: the bigrams
    """
    score_func = self.score_func
    finder = BigramCollocationFinder.from_documents(docs)
    finder.apply_freq_filter(self._min_freq)
    bigrams = finder.nbest(score_func, top_n)
    return bigrams
def create_bigram_finder(tokenized_docs, should_filter=False):
    if should_filter:
        bigrams_data_samples = [bigram_prep(doc) for doc in tokenized_docs]
    else:
        bigrams_data_samples = tokenized_docs
    bigrams_finder = BigramCollocationFinder.from_documents(bigrams_data_samples)
    return bigrams_finder
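# Usage sketch for create_bigram_finder, assuming the imports below; the
# sample documents are made up for illustration, and bigram_prep is the
# project's own preprocessing helper (only used when should_filter=True):
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

sample_docs = [["new", "york", "is", "large"],
               ["she", "moved", "to", "new", "york"]]
finder = create_bigram_finder(sample_docs)  # should_filter=False: tokens used as-is
print(finder.nbest(BigramAssocMeasures.raw_freq, 3))  # e.g. [('new', 'york'), ...]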
def collocs(text):
    bigrams = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_documents(
        [nltk.word_tokenize(" ".join(text))])
    finder.apply_freq_filter(2)
    topk = finder.nbest(bigrams.pmi, 15)
    for tk in topk:
        print(tk)
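# Usage sketch for collocs, assuming nltk and the collocation classes are
# in scope as in the snippet above (and nltk's 'punkt' tokenizer data is
# installed). The word list is made up; note that collocs joins the input
# back into one string before tokenizing:
words = ["the", "big", "apple", "the", "big", "apple", "is", "new", "york"]
collocs(words)  # prints up to 15 PMI-ranked bigrams occurring at least twice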
def fit(self, X, **fit_params):
    """
    Procedure to iteratively contract bigrams (up to
    max_collocation_iterations times) that score higher on the
    collocation_function than the min_collocation_score (and satisfy the
    other criteria set out by the optional parameters).
    """
    self.tokenization_ = X
    n_tokens = sum(len(x) for x in X)
    for i in range(self.max_iterations):
        bigramer = BigramCollocationFinder.from_documents(self.tokenization_)

        if self.ignored_tokens is not None:
            ignore_fn = lambda w: w in self.ignored_tokens
            bigramer.apply_word_filter(ignore_fn)

        if self.excluded_token_regex is not None:
            exclude_fn = (
                lambda w: re.fullmatch(self.excluded_token_regex, w) is not None
            )
            bigramer.apply_word_filter(exclude_fn)

        if self.min_token_occurrences is not None:
            minocc_fn = lambda w: bigramer.word_fd[w] < self.min_token_occurrences
            bigramer.apply_word_filter(minocc_fn)

        if self.max_token_occurrences is not None:
            maxocc_fn = lambda w: bigramer.word_fd[w] > self.max_token_occurrences
            bigramer.apply_word_filter(maxocc_fn)

        if self.min_token_frequency is not None:
            minfreq_fn = (
                lambda w: bigramer.word_fd[w] < self.min_token_frequency * n_tokens
            )
            bigramer.apply_word_filter(minfreq_fn)

        if self.max_token_frequency is not None:
            maxfreq_fn = (
                lambda w: bigramer.word_fd[w] > self.max_token_frequency * n_tokens
            )
            bigramer.apply_word_filter(maxfreq_fn)

        if self.min_ngram_occurrences is not None:
            bigramer.apply_freq_filter(self.min_ngram_occurrences)

        new_grams = list(bigramer.above_score(self.score_function, self.min_score))

        if len(new_grams) == 0:
            break

        self.mtes_.append(new_grams)

        contracter = MWETokenizer(new_grams)
        self.tokenization_ = tuple(
            tuple(contracter.tokenize(doc)) for doc in self.tokenization_
        )

    return self
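# Minimal standalone sketch of the contraction step used in fit above:
# find bigrams scoring above a threshold, then merge them into single
# tokens with MWETokenizer. The corpus and threshold are made up:
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.tokenize import MWETokenizer

docs = [["machine", "learning", "is", "fun"],
        ["machine", "learning", "needs", "data"]]
finder = BigramCollocationFinder.from_documents(docs)
grams = list(finder.above_score(BigramAssocMeasures.likelihood_ratio, 1.0))
contracter = MWETokenizer(grams)
print([contracter.tokenize(doc) for doc in docs])
# e.g. [['machine_learning', 'is', 'fun'], ['machine_learning', 'needs', 'data']]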
def get_top_bigrams(corpus, top_n=100):
    '''
    Most frequent bigram detection
    '''
    finder = BigramCollocationFinder.from_documents(
        [item.split() for item in corpus])
    bigram_measures = BigramAssocMeasures()
    return finder.nbest(bigram_measures.raw_freq, top_n)
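# Usage sketch for get_top_bigrams; it expects whitespace-delimited
# strings, one per document (the corpus below is made up):
corpus = ["new york is large", "she moved to new york"]
print(get_top_bigrams(corpus, top_n=5))  # e.g. [('new', 'york'), ...]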
def find(self, docs, top_n, strip_tags=True):
    """
    Parameter:
    ---------------
    docs: list of tokenized documents
    top_n: int
        how many labels to return
    strip_tags: bool
        whether to return the bigrams without their POS tags

    Return:
    ---------------
    list of tuple of str: the bigrams
    """
    # if POS constraints are given, check that they are well-formed
    if self._pos:
        assert isinstance(self._pos, list)
        for pair in self._pos:
            assert isinstance(pair, (tuple, list))
            assert len(pair) == 2  # because it's a bigram

    score_func = getattr(self.bigram_measures, self._measure_method)
    finder = BigramCollocationFinder.from_documents(docs)
    finder.apply_freq_filter(self._min_freq)
    finder.apply_word_filter(lambda w: len(w) < 3)

    if self._pos:
        # pairs may be lists, so convert to tuples before hashing
        valid_pos_tags = set(tuple(pair) for pair in self._pos)
        valid_bigrams = []
        bigrams = map(
            partial(get, 0),  # get the bigram
            finder.score_ngrams(score_func))
        cnt = 0
        for bigram in bigrams:
            if tuple(map(partial(get, 1), bigram)) in valid_pos_tags:
                valid_bigrams.append(bigram)
                cnt += 1
                if cnt == top_n:  # enough
                    break
        if strip_tags:
            valid_bigrams = [
                tuple(map(partial(get, 0), bigram))
                for bigram in valid_bigrams
            ]
        return valid_bigrams
    else:
        bigrams = finder.nbest(score_func, top_n)
        return bigrams
def retrieve_top_bigrams_collocations(corpus, top=5, measure='pmi'):
    finder = BigramCollocationFinder.from_documents(
        [item.split() for item in corpus])
    bigram_measures = BigramAssocMeasures()
    if measure == 'pmi':
        top_bigrams = finder.nbest(bigram_measures.pmi, top)
    elif measure == 'frequency':
        top_bigrams = finder.nbest(bigram_measures.raw_freq, top)
    else:
        raise ValueError('Type of measure is unknown!')
    return top_bigrams
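# Usage sketch for retrieve_top_bigrams_collocations showing both
# supported measures (the corpus is made up):
corpus = ["the quick brown fox", "the quick brown dog"]
print(retrieve_top_bigrams_collocations(corpus, top=3, measure='pmi'))
print(retrieve_top_bigrams_collocations(corpus, top=3, measure='frequency'))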
def compute_ngrams_count(text_corpus, out_p, n=20):
    print("Compute ngrams count...")
    list_of_tokens = []
    for document in text_corpus:
        for sentence in document:
            list_of_tokens.append(word_tokenize(sentence))

    # Unigram
    tokens = util.flatten_one_level(list_of_tokens)
    custom_sw = [".", "[", "]", ","]
    sw = stopwords.words("english") + custom_sw
    tokens = [w for w in tokens if w not in sw]
    word_fd = FreqDist(tokens)
    uni_mc = word_fd.most_common(n)

    # Bigram
    bi = BigramCollocationFinder.from_documents(list_of_tokens)
    #bi.apply_freq_filter(2)
    #print(bi.ngram_fd.items())
    bi_mc = bi.ngram_fd.most_common(n)

    # Trigram
    tri = TrigramCollocationFinder.from_documents(list_of_tokens)
    tri_mc = tri.ngram_fd.most_common(n)

    # Quadgram
    quad = QuadgramCollocationFinder.from_documents(list_of_tokens)
    quad_mc = quad.ngram_fd.most_common(n)

    # Plot
    data = [uni_mc, bi_mc, tri_mc, quad_mc]
    x = []
    y = []
    for i in range(len(data)):
        x_ng = []
        y_ng = []
        for d in data[i]:
            if i == 0:
                x_ng.append(d[0])
            else:
                x_ng.append(" ".join(d[0]))
            y_ng.append(d[1])
        x.append(x_ng[::-1])
        y.append(y_ng[::-1])
    title = ["Unigram", "Bigram", "Trigram", "Quadgram"]
    sup_title = "ngrams count"
    util.plot_bar_chart_grid(x, y, 1, len(data), title, sup_title, out_p,
                             sup_title_font_size=16, tick_font_size=14,
                             title_font_size=14, h_size=5, w_size=5,
                             rotate=True)
def DEPRECATED_save_bigrams(tokenized_docs, shouldWriteToFile=False):
    bigrams_data_samples = [bigram_prep(doc) for doc in tokenized_docs]
    bigram_measures = BigramAssocMeasures()
    bigrams_finder = BigramCollocationFinder.from_documents(bigrams_data_samples)
    bigrams_scores = bigrams_finder.score_ngrams(bigram_measures.likelihood_ratio)
    bigrams_counts = ['%s_%s,%d\n' % (most_common[0][0], most_common[0][1],
                                      most_common[1])
                      for most_common in bigrams_finder.ngram_fd.most_common()]
    # likelihood-ratio scores are floats, so format with %f rather than %d
    bigrams_scores_as_str = ['%s_%s,%f\n' % (scored[0][0], scored[0][1],
                                             scored[1])
                             for scored in bigrams_scores]
    if shouldWriteToFile:
        with open('./output/bigrams_counts.csv', "w", encoding="utf8") as fout:
            lines_to_file(bigrams_counts, fout)
        with open('./output/bigrams_lr_scores.csv', "w", encoding="utf8") as fout:
            lines_to_file(bigrams_scores_as_str, fout)
def iteratively_contract_bigrams(self):
    """
    Procedure to iteratively contract bigrams (up to
    max_collocation_iterations times) that score higher on the
    collocation_score_function than the min_collocation_score.
    """
    for i in range(self.max_collocation_iterations):
        bigramer = BigramCollocationFinder.from_documents(self.tokens_by_sent())
        mwes = list(
            bigramer.above_score(
                self.collocation_score_function, self.min_collocation_score
            )
        )
        if len(mwes) == 0:
            break
        contracter = MWETokenizer(mwes)
        self.tokens_by_sent_by_doc_ = [
            contracter.tokenize_sents(doc)
            for doc in self.tokens_by_sent_by_doc()
        ]
def bigram_collocation_feats(self, documents, top_n=None, min_freq=3,
                             assoc_measure=BigramAssocMeasures.pmi):
    """
    Return `top_n` bigram features (using `assoc_measure`).
    Note that this method is based on bigram collocation measures, and
    not on simple bigram frequency.

    :param documents: a list (or iterable) of tokens.
    :param top_n: number of best words/tokens to use, sorted by
        association measure.
    :param assoc_measure: bigram association measure to use as score
        function.
    :param min_freq: the minimum number of occurrences of bigrams to take
        into consideration.

    :return: `top_n` ngrams scored by the given association measure.
    """
    finder = BigramCollocationFinder.from_documents(documents)
    finder.apply_freq_filter(min_freq)
    return finder.nbest(assoc_measure, top_n)
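# Usage sketch: this snippet matches the bigram_collocation_feats method
# of NLTK's SentimentAnalyzer, so the instance below assumes that class;
# the documents are made up for illustration:
from nltk.sentiment import SentimentAnalyzer

analyzer = SentimentAnalyzer()
docs = [["nice", "movie"], ["nice", "movie"], ["nice", "movie"]]
print(analyzer.bigram_collocation_feats(docs, top_n=5, min_freq=3))
# e.g. [('nice', 'movie')]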
def main() -> None:
    """Application entry point."""
    corpus_root = Path('corpus')

    # Set up result logging
    global _logger
    setup_logger(_logger, corpus_root / 'collocations.log')

    # Load stopwords
    nltk.download('stopwords', '.env/share/nltk_data')
    stop_words = set(stopwords.words('russian'))

    # Import the corpus
    tags_root = corpus_root / 'pos_tagging'
    reader = ConllCorpusReader(
        str(tags_root), [f.name for f in tags_root.glob('*.tags')],
        columntypes=['words', 'ignore', 'ignore', 'ignore', 'pos'],
        separator='\t')
    _logger.info('Documents: %d', len(reader.fileids()))
    _logger.info('Tokens in the first document (%s): %d',
                 reader.fileids()[0],
                 len(reader.words(reader.fileids()[0])))

    _logger.info('Loading sentences')
    sentences = reader.sents()

    # Build contingency tables for all words in the corpus
    _logger.info('Computing the contingency table over all words')
    bigram_finder = BigramCollocationFinder.from_documents(
        [w.lower() for w in sent] for sent in tqdm(sentences))
    _logger.info('Total bigrams: %d', bigram_finder.N)
    print_samples(bigram_finder)

    # Now filter by frequency and drop punctuation and stopwords
    _logger.info(
        'Filtering punctuation and stopwords; applying a frequency threshold')
    bigram_finder.apply_freq_filter(5)
    bigram_finder.apply_word_filter(lambda w: len(w) < 3 or w in stop_words)
    _logger.info('Total bigrams: %d', bigram_finder.N)
    print_samples(bigram_finder)
def fit_bigrams(self, text_data=None, show_top_bigrams=True, top_n=20):
    """If text_data is None, use self.corpus"""
    import pandas as pd
    from nltk.collocations import (BigramAssocMeasures, BigramCollocationFinder,
                                   TrigramAssocMeasures, TrigramCollocationFinder)

    if text_data is None:
        text_data = self.corpus

    ## Instantiate and fit bigram functions
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_documents(text_data)
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    self.bigrams = scored

    if show_top_bigrams:
        from IPython.display import display
        bigrams_to_show = scored[:top_n]
        col_names = ['Bigram', 'Frequency']
        caption = f'Top {top_n} Bigrams'
        df = pd.DataFrame.from_records(bigrams_to_show, columns=col_names)
        dfs = df.style.set_caption(caption)
        display(dfs)
def fit(self, X: Iterable[str]):
    """Fit the ngram model and the vocabulary from the training data.

    :param X: Iterable over strings containing the corpus used to train
        the spellchecker.
    """
    from nltk.collocations import BigramCollocationFinder
    from editdistance import eval as edit_distance

    self.tokenize_func = self._build_tokenizer()
    X_tokenized = [
        self.tokenize_func(self.string_preprocessor_func(x)) for x in X
    ]
    self.unigram_freq_dict = dict(Counter(itertools.chain(*X_tokenized)))
    bigram_finder = BigramCollocationFinder.from_documents(X_tokenized)
    self.bigram_freq_dict = dict(bigram_finder.ngram_fd.items())
    self.vocabulary = set(itertools.chain(*self.bigram_freq_dict.keys()))
    if self.min_freq > 0:
        self._filter_vocabulary(min_freq=self.min_freq)
    if self.use_bktree:
        self.bktree = BKTree(edit_distance, self.vocabulary,
                             sort_candidates=self.sort_candidates)
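# Minimal standalone sketch of the bigram-frequency extraction used in
# fit above (the corpus is made up):
import itertools
from nltk.collocations import BigramCollocationFinder

docs = [["spell", "checking", "is", "hard"], ["spell", "checking", "works"]]
finder = BigramCollocationFinder.from_documents(docs)
bigram_freq = dict(finder.ngram_fd.items())
print(bigram_freq.get(("spell", "checking")))  # 2
vocabulary = set(itertools.chain(*bigram_freq.keys()))
print(sorted(vocabulary))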
def process(text: str,
            num_1_grams: int = 100,
            num_2_grams: int = 100,
            num_3_grams: int = 100,
            num_4_grams: int = 100,
            min_chars: int = 3,
            max_chars: int = 30):
    """
    Extract keywords from text sources
    """
    # Find all sentences in the text
    sents = get_sentences(text)

    # Filter out any sentences which occur identically more than once
    sent_counter = collections.Counter(sents)
    sents = [sent for sent in sents if sent_counter[sent] == 1]

    # Tokenize each sentence
    sents = [RE_TOKEN.split(sent) for sent in sents]  # and len(word) > 1

    # Filter out non-alphabetic tokens and convert to lowercase
    sents = [[token.lower() for token in sent if is_alpha(token)]
             for sent in sents]

    # We look at two variants of the input sentences
    # a. For 1-grams, we remove all stopwords, short tokens, and possessives
    # b. For 2-grams and longer, we want to keep stopwords and short tokens as
    #    these might provide some information in relation to other words
    sents_a = [[RE_POSS.sub('', t) for t in sent if filter_token(t)]
               for sent in sents]
    sents_b = sents

    assert len(sents_a) > 0 and len(sents_b) > 0, 'Not enough words'

    counter = collections.Counter()
    for sent in sents_a:
        for token in sent:
            counter[token] += 1

    res = [[], [], [], []]

    if num_1_grams:
        # Represent tokens using a tuple with only one element to match the
        # format of the other ngrams with n > 1
        tuples = [((token, ), count) for token, count in counter.items()]
        df_1 = df_top(tuples=tuples, num=num_1_grams,
                      token_filter=filter_1_grams,
                      min_char=min_chars, max_char=max_chars)
        if df_1 is not None:
            res[0] = df_1['entry'].tolist()

    if num_2_grams:
        bigrams = BigramCollocationFinder.from_documents(sents_b)
        tuples = bigrams.score_ngrams(BigramAssocMeasures.raw_freq)
        df_2 = df_top(tuples=tuples, num=num_2_grams,
                      token_filter=filter_2_grams,
                      min_char=min_chars, max_char=max_chars)
        if df_2 is not None:
            res[1] = df_2['entry'].tolist()

    if num_3_grams:
        trigrams = TrigramCollocationFinder.from_documents(sents_b)
        tuples = trigrams.score_ngrams(TrigramAssocMeasures.raw_freq)
        df_3 = df_top(tuples=tuples, num=num_3_grams,
                      token_filter=filter_3_grams,
                      min_char=min_chars, max_char=max_chars)
        if df_3 is not None:
            res[2] = df_3['entry'].tolist()

    if num_4_grams:
        quadgrams = QuadgramCollocationFinder.from_documents(sents_b)
        tuples = quadgrams.score_ngrams(QuadgramAssocMeasures.raw_freq)
        df_4 = df_top(tuples=tuples, num=num_4_grams,
                      token_filter=filter_4_grams,
                      min_char=min_chars, max_char=max_chars)
        if df_4 is not None:
            res[3] = df_4['entry'].tolist()

    return res
print(biden_tweets)

####################################################
# Bigram Analysis

dnc_tokens_by_document = list()
rnc_tokens_by_document = list()

for tweet in dnc_doclist:
    dnc_tokens_by_document.append(
        tweet_to_tokens(tweet, ["dnc", "dncconvention"]))
for tweet in rnc_doclist:
    rnc_tokens_by_document.append(
        tweet_to_tokens(tweet, ["rnc", "rncconvention"]))

dnc_finder = BigramCollocationFinder.from_documents(dnc_tokens_by_document)
dnc_finder.nbest(BigramAssocMeasures.raw_freq, 30)  # top 30 DNC bigrams
dnc_finder.score_ngrams(
    BigramAssocMeasures.raw_freq)[:30]  # bigrams with scores

# horizontal bar chart
plot_word_freqs(
    dnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], 'b',
    "Top 30 bigrams in @DNCConvention2020", "Frequency Score")

rnc_finder = BigramCollocationFinder.from_documents(rnc_tokens_by_document)
rnc_finder.nbest(BigramAssocMeasures.raw_freq, 30)  # top 30 RNC bigrams
rnc_finder.score_ngrams(
    BigramAssocMeasures.raw_freq)[:30]  # bigrams with scores

plot_word_freqs(
    rnc_finder.score_ngrams(BigramAssocMeasures.raw_freq)[:30], 'r',
    "Top 30 bigrams in @RNCConvention2020", "Frequency Score")
    sorted_ngrams_fd = sorted(ngram_freq_dist.items(),
                              key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq)
                     for text, freq in sorted_ngrams]
    return sorted_ngrams

get_top_ngrams(corpus=norm_alice, ngram_val=2, limit=10)
get_top_ngrams(corpus=norm_alice, ngram_val=3, limit=10)

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder = BigramCollocationFinder.from_documents(
    [item.split() for item in norm_alice])
bigram_measures = BigramAssocMeasures()
finder.nbest(bigram_measures.raw_freq, 10)
finder.nbest(bigram_measures.pmi, 10)

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents(
    [item.split() for item in norm_alice])
trigram_measures = TrigramAssocMeasures()
finder.nbest(trigram_measures.raw_freq, 10)
finder.nbest(trigram_measures.pmi, 10)

toy_text = """
Elephants are large mammals of the family Elephantidae
def statictic(sentens):
    sw = splitSentence(sentences=sentens)
    return nltk.TextCollection(sw), BigramCollocationFinder.from_documents(sw)
def compute_collocation_bigram(corpus):
    finder = BigramCollocationFinder.from_documents(
        [item.split() for item in corpus])
    bigram_measures = BigramAssocMeasures()
    return finder, bigram_measures
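# Usage sketch for compute_collocation_bigram: the caller picks the
# association measure after the finder is built (the corpus is made up):
corpus = ["strong tea please", "strong tea and biscuits"]
finder, measures = compute_collocation_bigram(corpus)
print(finder.nbest(measures.likelihood_ratio, 5))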
right_wing_train = read_csv('../data/train/right_wing_train.csv')

left_wing_corpus = merge_to_corpus(left_wing_train, 'Tokenizing left-wing:')
right_wing_corpus = merge_to_corpus(right_wing_train, 'Tokenizing right-wing:')

# Ignoring too common or unwanted words
ignored_words = nltk.corpus.stopwords.words('german')
ignored_words.extend(["junge", "freiheit", "www.jungefreiheit.de", "co."])

# Filter bigrams: ignored words and words that are too short are dropped
bigram_measures = BigramAssocMeasures()
word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

# Calculate most common bigrams
print("\nCalculating left-wing bigrams")
left_finder = BigramCollocationFinder.from_documents(left_wing_corpus)
left_finder.apply_freq_filter(MIN_FREQ)
left_finder.apply_word_filter(word_filter)
print(left_finder.nbest(bigram_measures.likelihood_ratio, 10), '\n')

print("Calculating right-wing bigrams")
right_finder = BigramCollocationFinder.from_documents(right_wing_corpus)
right_finder.apply_freq_filter(MIN_FREQ)
right_finder.apply_word_filter(word_filter)
print(right_finder.nbest(bigram_measures.likelihood_ratio, 10), '\n')

# Calculate most common unigrams
print("Calculating left-wing unigrams")
left_wing_tokens = [item for sublist in left_wing_corpus for item in sublist]
left_wing_fdist = nltk.FreqDist(left_wing_tokens)
print(left_wing_fdist.most_common()[:10], '\n')
def find_bigrams(sentences, n_ngrams):
    cf = BigramCollocationFinder.from_documents(sentences)
    fng = cf.nbest(BigramAssocMeasures.likelihood_ratio, n_ngrams)
    return fng
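# Usage sketch for find_bigrams (the sentences are made up; note it
# expects pre-tokenized sentences, not raw strings):
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

sentences = [["hong", "kong", "is", "busy"],
             ["we", "flew", "to", "hong", "kong"]]
print(find_bigrams(sentences, 5))  # e.g. [('hong', 'kong'), ...]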
def bigram_coll_score(text, n=500):
    bigram_measure = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_documents([text])
    finder.apply_freq_filter(2)
    scored = finder.score_ngrams(bigram_measure.likelihood_ratio)
    return scored[:n]
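# Usage sketch for bigram_coll_score; it wraps a single token list in a
# one-document corpus (the tokens are made up):
tokens = ["ice", "cream", "and", "ice", "cream", "again"]
print(bigram_coll_score(tokens, n=3))  # e.g. [(('ice', 'cream'), score), ...]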
def probable_occur(bi_gram):
    bi_gram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_documents(bi_gram)
    return sorted(finder.nbest(bi_gram_measures.pmi, 10))
def score_bi_gram(bi_gram):
    bi_gram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_documents(bi_gram)
    finder.apply_freq_filter(2)
    scored = finder.score_ngrams(bi_gram_measures.pmi)
    return scored
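# Usage sketch for score_bi_gram, assuming nltk is imported as in the
# snippet above; score_ngrams returns ((w1, w2), score) pairs sorted by
# descending PMI (the documents are made up):
import nltk
from nltk.collocations import BigramCollocationFinder

docs = [["data", "science", "rocks"], ["data", "science", "rules"]]
print(score_bi_gram(docs))  # e.g. [(('data', 'science'), pmi_value)]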
    ngram_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngram_freq_dist.items(),
                              key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq)
                     for text, freq in sorted_ngrams]
    return sorted_ngrams

corpus, category = get_data()

from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder = BigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
bigram_measures = BigramAssocMeasures()
print(finder.nbest(bigram_measures.raw_freq, 10))

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents(
    [item.split() for item in corpus])
trigram_measures = TrigramAssocMeasures()
print(finder.nbest(trigram_measures.raw_freq, 10))
print(finder.nbest(trigram_measures.pmi, 10))

# print(get_top_ngrams(corpus, ngram_val=2, limit=10))