def ngram_collocation(words, sents, n, support=10, topK=200):
    # the current collocation measure is PMI
    if n >= 4:
        # n >= 4 is handled by extending trigram collocations with the
        # external NgramCollocationExtender helper
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()
        finder.apply_freq_filter(support)
        pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)
        ext_ngrams = NgramCollocationExtender(pmi_ngrams, sents, support / 3, 0.3)
        print_ngrams(ext_ngrams)
        return ext_ngrams
        #pmi_ngrams = NgramCollocationFinder(words, 2, lowFreq, topK)
    else:
        if n == 2:
            finder = BigramCollocationFinder.from_words(words)
            ngram_measures = BigramAssocMeasures()
        elif n == 3:
            finder = TrigramCollocationFinder.from_words(words)
            ngram_measures = TrigramAssocMeasures()
        finder.apply_freq_filter(support)
        pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)
        print_ngrams(pmi_ngrams)
        return pmi_ngrams
def getNameScore(name, data):
    name = name + '\n'
    trigram_measures = TrigramAssocMeasures()
    name_len = len(name) - 2
    score = 1
    for i in range(0, name_len):
        trigram_score = data.colloc_finder.score_ngram(
            trigram_measures.raw_freq,
            char_with_type(name[i]),
            char_with_type(name[i + 1]),
            char_with_type(name[i + 2]))
        if trigram_score is None:
            score = score * data.base_frequency
        else:
            score = score * trigram_score
    # `dict.has_key` was removed in Python 3; use the `in` operator instead
    if len(name) in data.name_len_probabilities:
        name_len_score = data.name_len_probabilities[len(name)]
    else:
        name_len_score = data.base_name_len_probability
    # last_letter_score = data.base_name_len_probability
    # if name[-1:] in data.last_letter_probabilities:
    #     last_letter_score = data.last_letter_probabilities[name[-1:]]
    return score * name_len_score * data.name_probability
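# A minimal standalone sketch of the `score_ngram` call at the core of
# getNameScore above. The training string is made up for illustration, and
# `char_with_type`/`data` are not needed here. `score_ngram` returns None
# for an unseen trigram, which is why the function falls back to a base
# frequency in that case.
from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder

chars = list('anna\nanne\nanya\n')
finder = TrigramCollocationFinder.from_words(chars)
measures = TrigramAssocMeasures()
# relative frequency of the character trigram ('a', 'n', 'n')
print(finder.score_ngram(measures.raw_freq, 'a', 'n', 'n'))
print(finder.score_ngram(measures.raw_freq, 'z', 'z', 'z'))  # -> None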
def collocation(inp, outp, freq_filter, results, coll_type, pos):
    pos = (pos == 'true')
    with open(inp, 'r') as fd:
        i = fd.read()
    all_words = []
    if pos:
        # input is pre-tagged text of the form word/TAG; strip the tags
        text = i.split(' ')[:-1]
        all_words = [x[0:x.index('/')] if x != '\n' else x for x in text]
        all_words = [x.strip(' ').strip('\n') for x in all_words]
    else:
        sents = nltk.sent_tokenize(i)
        for sent in sents:
            all_words += nltk.word_tokenize(sent)
    if coll_type == 'bigram':
        measures = BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(all_words)
    else:
        measures = TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(all_words)
    finder.apply_freq_filter(int(freq_filter))
    # score the ngrams and keep the first N
    colls = finder.score_ngrams(measures.pmi)[:int(results)]
    with open(outp, 'w') as output:
        for ngram, score in colls:
            # the original unpacked exactly two words, which fails for
            # trigrams; join however many words the finder returned
            output.write('\t'.join(ngram) + '\n')
def extract_trigrams(self, sent):
    sent = self._preprocess_sent(sent)
    trigram_measures = TrigramAssocMeasures()
    tri_finder = TrigramCollocationFinder.from_words(sent)
    trigrams = tri_finder.nbest(trigram_measures.pmi, 10000)
    trigrams = set(' '.join(i) for i in trigrams)
    # keep only the trigrams that are part of the known trigram set
    trigrams = trigrams & self._trigrams_set
    return {i: True for i in trigrams}
def get_top_trigrams(corpus, top_n=100):
    '''Most frequent tri-gram detection.'''
    finder = TrigramCollocationFinder.from_documents(
        [item.split() for item in corpus])
    trigram_measures = TrigramAssocMeasures()
    return finder.nbest(trigram_measures.raw_freq, top_n)
def retrieve_top_trigrams_collocations(corpus, top=5, measure='pmi'):
    finder = TrigramCollocationFinder.from_documents(
        [item.split() for item in corpus])
    trigram_measures = TrigramAssocMeasures()
    if measure == 'pmi':
        top_trigrams = finder.nbest(trigram_measures.pmi, top)
    elif measure == 'frequency':
        top_trigrams = finder.nbest(trigram_measures.raw_freq, top)
    else:
        raise ValueError('Type of measure is unknown!')
    return top_trigrams
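# A minimal usage sketch for the two helpers above; the toy corpus is made
# up for illustration and assumes NLTK is installed.
toy_corpus = [
    'the quick brown fox jumps over the lazy dog',
    'the quick brown fox likes the lazy dog',
]
print(get_top_trigrams(toy_corpus, top_n=3))
print(retrieve_top_trigrams_collocations(toy_corpus, top=3, measure='frequency'))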
def common_collocations(text, occurences=20):
    tokens = word_tokenize(text)
    final_results = []
    for measures, collocation_finder, min_size in [
            (BigramAssocMeasures(), BigramCollocationFinder, 2),
            (TrigramAssocMeasures(), TrigramCollocationFinder, 3)]:
        finder = collocation_finder.from_words(tokens, window_size=min_size)
        finder.apply_word_filter(lambda w: len(w) < 2)
        finder.apply_freq_filter(1)
        results = finder.nbest(measures.student_t, occurences)
        final_results += [" ".join(gram) for gram in results]
    return final_results
def save_trigrams(tokenized_docs, shouldWriteToFile=False):
    trigrams_finder = create_trigram_finder(tokenized_docs)
    trigram_measures = TrigramAssocMeasures()
    trigrams_scores = trigrams_finder.score_ngrams(trigram_measures.likelihood_ratio)
    trigrams_counts = ['%s_%s_%s,%d\n' % (gram[0][0], gram[0][1], gram[0][2], gram[1])
                       for gram in trigrams_finder.ngram_fd.most_common()]
    # likelihood-ratio scores are floats, so format them with %f rather than
    # %d to avoid silently truncating them to integers
    trigrams_scores_as_str = ['%s_%s_%s,%f\n' % (gram[0][0], gram[0][1], gram[0][2], gram[1])
                              for gram in trigrams_scores]
    if shouldWriteToFile:
        with open('./output/trigrams_counts.csv', "w", encoding="utf8") as fout:
            lines_to_file(trigrams_counts, fout)
        with open('./output/trigrams_lr_scores.csv', "w", encoding="utf8") as fout:
            lines_to_file(trigrams_scores_as_str, fout)
def create_wordCloud_dict_trigrams(text_content, bad_trigrams):
    finder = TrigramCollocationFinder.from_words(text_content)
    trigram_measures = TrigramAssocMeasures()
    scored = finder.score_ngrams(trigram_measures.raw_freq)
    # score_ngrams already returns the ngrams sorted from highest to lowest
    # score, so no extra sorting is needed
    #scoredList = sorted(scored, key=itemgetter(1), reverse=True)
    scoredList = scored
    word_dict = {}
    # Set the key to the scored value.
    for ngram, score in scoredList:
        word_dict[' '.join(ngram)] = score
    # the loop variable was originally misnamed `bad_bigram`, so the lookup
    # below never removed anything; fixed to `bad_trigram`
    for bad_trigram in bad_trigrams:
        if bad_trigram in word_dict:
            del word_dict[bad_trigram]
    return word_dict
def compute_collocation(corpora_dir: str, session: int, party: str,
                        num_chunks: int, bigram_out_path: str,
                        trigram_out_path: str, discard_tokens: Set[str],
                        stop_words: Set[str], min_frequency: int) -> None:
    """
    discard_tokens should be a subset of stop_words. This is used for a
    heuristic to filter trigrams, where the second word is permitted to be
    a stop word (e.g. "freedom of speech") but not a discarded token
    (e.g. "I yield to"). The first and third words can never be a stop word.
    """
    tokenized_corpus: List[str] = []
    for chunk_index in range(num_chunks):
        corpus_path = os.path.join(corpora_dir, f'{session}_{party}{chunk_index}.txt')
        with open(corpus_path) as corpus_file:
            raw_text = corpus_file.read()
        tokens: List[str] = nltk.tokenize.word_tokenize(raw_text)
        tokens = [t.lower() for t in tokens
                  if t not in discard_tokens and not t.isdigit()]
        tokenized_corpus.extend(tokens)
        del tokens

    bigram_finder = BigramCollocationFinder.from_words(tokenized_corpus)
    bigram_finder.apply_freq_filter(min_frequency)
    bigram_finder.apply_word_filter(lambda word: word in stop_words)
    bigrams = bigram_finder.score_ngrams(BigramAssocMeasures().raw_freq)

    trigram_finder = TrigramCollocationFinder.from_words(tokenized_corpus)
    trigram_finder.apply_freq_filter(min_frequency)
    trigram_finder.apply_ngram_filter(lambda w1, w2, w3: (
        w1 in stop_words) or (w3 in stop_words) or (w2 in discard_tokens))
    trigrams = trigram_finder.score_ngrams(TrigramAssocMeasures().raw_freq)

    num_tokens = len(tokenized_corpus)
    with open(bigram_out_path, 'w') as bigram_file:
        for bigram, relative_freq in bigrams:
            absolute_freq = relative_freq * num_tokens
            bigram_str = ' '.join(bigram)
            bigram_file.write(f'{absolute_freq:.0f}\t{bigram_str}\n')
    with open(trigram_out_path, 'w') as trigram_file:
        for trigram, relative_freq in trigrams:
            absolute_freq = relative_freq * num_tokens
            trigram_str = ' '.join(trigram)
            trigram_file.write(f'{absolute_freq:.0f}\t{trigram_str}\n')
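# A minimal standalone sketch of the trigram stop-word heuristic described
# in the docstring above; the token stream and word sets are made up for
# illustration.
from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder

demo_tokens = ['freedom', 'of', 'speech', 'is', 'vital', 'freedom', 'of', 'speech',
               'i', 'yield', 'to', 'the', 'gentleman', 'i', 'yield', 'to']
demo_stop_words = {'of', 'is', 'the', 'to', 'i'}
demo_discard_tokens = {'i', 'to'}  # assumed subset of the stop words

demo_finder = TrigramCollocationFinder.from_words(demo_tokens)
# keep trigrams whose outer words are content words and whose middle word is
# at worst an ordinary stop word: "freedom of speech" survives,
# "i yield to" does not
demo_finder.apply_ngram_filter(lambda w1, w2, w3: (
    w1 in demo_stop_words) or (w3 in demo_stop_words) or (w2 in demo_discard_tokens))
print(demo_finder.score_ngrams(TrigramAssocMeasures().raw_freq))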
def ngram_analyze(self, lst, model="student_t"):
    """
    Documentation for the analysis tools:
    http://www.nltk.org/_modules/nltk/metrics/association.html

    Uses the student_t distribution to analyze a list of words by splitting
    them into tuples of 3 elements, e.g. (a, b, c), (b, c, d), ...
    The distribution assigns a score to each tuple. This function returns
    the highest-scoring tuples.

    Args:
    -----
    lst : a list of words
    model : the chosen model for ngram analysis
        (student_t, chi_sq, mi_like, pmi, jaccard)
    """
    lst = self.nlp(lst)
    string = " ".join(map(str, lst))
    words = nltk.word_tokenize(string)
    measures = TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(words)
    scores = []
    if model == "student_t":
        scores = finder.score_ngrams(measures.student_t)
    elif model == "chi_sq":
        scores = finder.score_ngrams(measures.chi_sq)
    elif model == "mi_like":
        scores = finder.score_ngrams(measures.mi_like)
    elif model == "pmi":
        scores = finder.score_ngrams(measures.pmi)
    elif model == "jaccard":
        scores = finder.score_ngrams(measures.jaccard)
    else:
        print("Not a valid model!")
    scores.sort(key=lambda i: i[1], reverse=True)
    top = scores[:3]
    return top
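# A minimal standalone sketch of the same scoring step outside the class;
# the sample sentence is made up, and NLTK's 'punkt' tokenizer data is
# assumed to be available.
import nltk
from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder

sample = "new york city is in new york state and new york city is big"
sample_finder = TrigramCollocationFinder.from_words(nltk.word_tokenize(sample))
scored = sample_finder.score_ngrams(TrigramAssocMeasures().student_t)
print(sorted(scored, key=lambda i: i[1], reverse=True)[:3])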
def Collocation(contents, n):
    from nltk.collocations import (BigramAssocMeasures, BigramCollocationFinder,
                                   TrigramAssocMeasures, TrigramCollocationFinder,
                                   QuadgramAssocMeasures, QuadgramCollocationFinder)
    if n == 2:
        bigram_measures = BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(contents)
        scored = finder.score_ngrams(bigram_measures.raw_freq)
    elif n == 3:
        trigram_measures = TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(contents)
        scored = finder.score_ngrams(trigram_measures.raw_freq)
    elif n == 4:
        quadgram_measures = QuadgramAssocMeasures()
        finder = QuadgramCollocationFinder.from_words(contents)
        scored = finder.score_ngrams(quadgram_measures.raw_freq)
    else:
        # originally `scored` was left undefined here, so the return below
        # raised a NameError for unsupported n
        print("Collocation is only available for n=2, 3, or 4.")
        scored = []
    return scored
def x_trigrams(tokens, x):
    '''
    Find the x best tri-grams given tokens (a list of strings) and x,
    which tells you how many tri-grams to return.

    Parameters
    ----------
    tokens: A list of strings
    x: An integer

    Returns
    -------
    tri_list: A list of tuples, with the tuples being of the
    form (str, str, str).
    '''
    # Find the trigrams, then keep the x best ones by PMI
    trigram_measures = TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(tokens)
    tri_list = finder.nbest(trigram_measures.pmi, x)
    return tri_list
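# A minimal usage sketch for x_trigrams; the token list is made up.
sample_tokens = ['olympic', 'games', 'medal', 'count', 'olympic', 'games',
                 'medal', 'table', 'olympic', 'games', 'medal', 'count']
print(x_trigrams(sample_tokens, 2))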
def trigram_collocation_finder(tokens, window_size=3):
    '''Returns trigram collocations, including their raw frequency, given a
    list of tokens or a list of sentences that are lists of tokens.
    The window size is three by default.

    Parameters
    -----------
    tokens: a list of tokens, or a list of sentences that are lists of tokens
    window_size: the window size of the collocation, by default 3

    Returns
    -------
    result: list of trigram collocations and their raw frequency, as tuples
    '''
    trigram_measures = TrigramAssocMeasures()
    if isinstance(tokens[0], list):
        # TODO: how to apply the window size here?
        # from_documents does not take a window_size argument
        finder = TrigramCollocationFinder.from_documents(tokens)
    else:
        finder = TrigramCollocationFinder.from_words(tokens, window_size=window_size)
    result = finder.score_ngrams(trigram_measures.raw_freq)
    return result
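# A minimal usage sketch showing the effect of a wider window; the tokens
# are made up. With window_size=4, non-adjacent words can also form
# trigrams, so more candidate collocations appear.
demo = ['strong', 'black', 'hot', 'coffee', 'strong', 'hot', 'black', 'coffee']
print(trigram_collocation_finder(demo))                 # adjacent trigrams only
print(trigram_collocation_finder(demo, window_size=4))  # skips allowed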
def top_trigrams(self, tokens):
    tfinder = TrigramCollocationFinder.from_words(tokens)
    trigram_measures = TrigramAssocMeasures()
    tfinder.apply_freq_filter(int(self.trigrams_pct_words * len(tokens)))
    trigrams = tfinder.nbest(trigram_measures.pmi, self.num_trigrams)
    return trigrams
def collocations(self, top, freq=None):
    if freq:
        self._bigram_finder.apply_freq_filter(freq)
        self._trigram_finder.apply_freq_filter(freq)
    return (self._bigram_finder.nbest(BigramAssocMeasures().pmi, top),
            self._trigram_finder.nbest(TrigramAssocMeasures().pmi, top))
file_1 = open('res/task_1-5.tsv', 'a')
file_1.write('3-gram\tfrequency\n')
for trigram, freq in count_frequency(trigrams=tri_grams):
    file_1.write(' '.join(trigram) + '\t' + str(freq) + '\n')
file_1.close()

# Task 6
my_evaluation = evaluate_association(
    trigrams=list(tri_grams),
    num_of_tokens=len(tokens_without_stop_words))[:30]

# Task 7
trigram_measures = TrigramAssocMeasures()
tokens_2 = word_tokenize(open('Text.txt', 'r').read(), 'russian', True)
text_2 = Text(tokens_2)
finder_thr_1 = TrigramCollocationFinder.from_words(text_2)
evaluation_with_punctuation = finder_thr_1.nbest(trigram_measures.student_t, 30)

file_2 = open('res/task_7_with_p.tsv', 'a')
file_2.write('My 3-grams\tNLTK 3-grams\n')
for i in range(30):
    file_2.write(' '.join(my_evaluation[i][0]) + '\t' +
                 ' '.join(evaluation_with_punctuation[i][0]) + '\n')
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder = BigramCollocationFinder.from_documents(
    [item.split() for item in norm_alice])
bigram_measures = BigramAssocMeasures()
finder.nbest(bigram_measures.raw_freq, 10)
finder.nbest(bigram_measures.pmi, 10)

from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures

finder = TrigramCollocationFinder.from_documents(
    [item.split() for item in norm_alice])
trigram_measures = TrigramAssocMeasures()
finder.nbest(trigram_measures.raw_freq, 10)
finder.nbest(trigram_measures.pmi, 10)

# toy_text is truncated in the source; the closing quotes are added here so
# that the snippet parses
toy_text = """
Elephants are large mammals of the family Elephantidae and the order
Proboscidea. Two species are traditionally recognised, the African elephant
and the Asian elephant. Elephants are scattered throughout sub-Saharan
Africa, South Asia, and Southeast Asia. Male African elephants are the
largest extant terrestrial animals. All elephants have a long trunk used for
many purposes, particularly breathing, lifting water and grasping objects.
Their incisors grow into tusks, which can serve as weapons and as tools for
moving objects and digging. Elephants' large ear flaps help to control their
body temperature. Their pillar-like legs can carry their great weight.
African elephants have larger ears
"""
def get_topic_data(product, df, final_results, input_text, load_path, encoding_type):
    data = df[df['ProductId'] == product]

    # prepare the corpus
    texts = data[input_text].str.split()
    dictionary = corpora.Dictionary(texts)
    remove_freq(dictionary, 10)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Load in the tuned LDA model for the product
    t = final_results.loc[product, 'num_topics']
    p = final_results.loc[product, 'passes']
    tn = final_results.loc[product, 'top_n removed'].astype(int)
    na = final_results.loc[product, 'n_above threshold']
    lda = gensim.models.ldamodel.LdaModel.load(
        '../models/{}/final_models/{}_{}_{}_{}_{}'.format(load_path, product, t, p, tn, na))

    topic_data = []
    # be sure to set the appropriate coherence measure
    topics = lda.top_topics(texts=texts, corpus=corpus, coherence='c_v')

    # iterate through the topics to get coherence, top review, key words, and n-grams
    for topic in range(0, t):
        # sub-dataframe where this is the main topic
        main_topic_df = data[data['{} Topic'.format(encoding_type)] == topic]
        # sub-dataframe where this is a subtopic
        sub_topic_df = data[data['{} Subtopic'.format(encoding_type)] == topic]
        # grab the coherence measure
        coherence = topics[topic][-1]
        # Make a list of the top words from the topic
        l = lda.show_topic(topic, topn=10)
        # And then reformat this into a usable list
        top_words = [x[0] for x in l]

        # Get the number of reviews fitting into the topic ...
        # as the main topic, with a fit value of at least 0.7
        as_main = len(main_topic_df.loc[main_topic_df['{} Fit'.format(encoding_type)] >= 0.7])
        # as the primary subtopic
        as_primary_sub = len(main_topic_df.loc[
            (main_topic_df['{} Fit'.format(encoding_type)] < 0.7) &
            (main_topic_df['{} Fit'.format(encoding_type)] >= 0.3)])
        # as a secondary subtopic
        as_secondary_sub = len(sub_topic_df.loc[
            sub_topic_df['{} Subtopic Fit'.format(encoding_type)] >= 0.3])
        #count = len(data[data['{} Topic'.format(encoding_type)]==topic])

        try:
            # Get an index locator for the best-fitting review
            ix = main_topic_df['{} Fit'.format(encoding_type)].idxmax(axis=0)
            # Find the review that best matches the topic
            top_review = main_topic_df.loc[ix, 'clean_review']
            # Get that best review's fit value (probability the review comes from the topic)
            fit = main_topic_df['{} Fit'.format(encoding_type)].max(axis=0)

            # Getting the n-grams
            bigram_measures = BigramAssocMeasures()
            trigram_measures = TrigramAssocMeasures()
            # Build the n-gram distributions over the set of words found in
            # the reviews tagged to this topic (np.concatenate accepts the
            # ragged list directly; wrapping it in np.array first fails on
            # recent numpy versions)
            #words = np.concatenate([word_tokenize(r) for r in sub_df['{}_x'.format(input_text)].values])
            words = np.concatenate(
                [word_tokenize(r) for r in main_topic_df['clean_vanilla_x'].values])
            bigram_fd = FreqDist(bigrams(words))
            trigram_fd = FreqDist(trigrams(words))
            bfinder = BigramCollocationFinder.from_words(words, window_size=3)
            tfinder = TrigramCollocationFinder.from_words(words, window_size=4)
            for finder in [bfinder, tfinder]:
                # Get rid of words we don't want
                finder.apply_word_filter(lambda w: w in (
                    'GOODREVIEW', 'BADREVIEW', 'VGOODREVIEW', 'VBADREVIEW',
                    's', 'b', 'c', 'oz', 'be'))
                # Filter out n-grams that don't appear at least 2 times
                finder.apply_freq_filter(2)
            # Filter out some common n-grams
            bfinder.apply_ngram_filter(lambda w1, w2: (w1, w2) in bigrams_filter)
            tfinder.apply_ngram_filter(lambda w1, w2, w3: (w1, w2, w3) in trigrams_filter)

            # Get the top 10 bigrams and trigrams by raw frequency and by PMI value
            bgrams_pmi = bfinder.nbest(bigram_measures.pmi, 10)
            bgrams_freq = bfinder.nbest(bigram_measures.raw_freq, 10)
            tgrams_pmi = tfinder.nbest(trigram_measures.pmi, 10)
            tgrams_freq = tfinder.nbest(trigram_measures.raw_freq, 10)

            # Format a bit more nicely for readability
            top_bigrams_pmi = [' '.join(a) for a in bgrams_pmi]
            top_bigrams_freq = [' '.join(a) for a in bgrams_freq[2:]]
            top_trigrams_pmi = [' '.join(a) for a in tgrams_pmi]
            top_trigrams_freq = [' '.join(a) for a in tgrams_freq[2:]]
        except ValueError:
            # A ValueError here indicates there were no reviews matched to
            # the topic, so the corresponding results are left blank
            top_review = 'none'
            fit = ''
            top_bigrams_pmi = []
            top_trigrams_pmi = []
            top_bigrams_freq = []
            top_trigrams_freq = []

        topic_data.append([product, topic, as_main, as_primary_sub, as_secondary_sub,
                           coherence, top_words, top_review, fit,
                           top_bigrams_pmi, top_bigrams_freq,
                           top_trigrams_pmi, top_trigrams_freq])

    topic_data = pd.DataFrame(
        data=topic_data,
        columns=['product', 'topic', 'as_main_topic', 'as_primary_subtopic',
                 'as_secondary_subtopic', 'topic_coherence', 'top_words',
                 'best_review', 'best_review_fit', 'top_bigrams_pmi',
                 'top_bigrams_freq', 'top_trigrams_pmi', 'top_trigrams_freq'])
    return topic_data