def create_word_bigram_scores():
    posdata = pickle.load(open('pos_review.pkl', 'rb'))
    negdata = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder_pos = BigramCollocationFinder.from_words(posWords)
    bigram_finder_neg = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder_pos.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder_neg.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # single words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
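# Usage sketch (not from the original source): pick the top-n features by the combined
# chi-square score computed above and test membership when building feature dicts.
# `find_best_words` and `review_words` are hypothetical names.
def find_best_words(word_scores, number):
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(word for word, score in best)

# best_words = find_best_words(create_word_bigram_scores(), 1500)
# features = {w: True for w in review_words if w in best_words}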
def association(self, measure='pmi'):
    if measure in self._association_dict:
        return self._association_dict[measure]

    ngrams = [self.ngram]
    collocs = {}
    for ngram in ngrams:
        self.ngram = ngram
        dist = self._get_freq_distributions()

        if len(self.ngram) == 2:
            finder = BigramCollocationFinder(*dist)
            measures = getattr(bigram_measures, measure)
        else:
            finder = TrigramCollocationFinder(*dist)
            measures = getattr(trigram_measures, measure)

        try:
            collocs = finder.score_ngrams(measures)
            collocs = dict((x[0][self.edit_pos], (i, x[1]))
                           for i, x in enumerate(collocs))
        except Exception as e:
            print('Exception in pmi_preps', e)
            print(self)
            print(dist)
            collocs = {}

        self._association_dict[measure] = collocs
        if collocs:
            return collocs

    return collocs
def N_collocations_in_text(text, N, min_freq):
    # finds <N> most significant two-word collocations which occur at
    # least <min_freq> times
    text_lower = [w.lower() for w in text]
    finder = BigramCollocationFinder.from_words(text_lower)
    finder.apply_freq_filter(min_freq)
    return finder.nbest(BigramAssocMeasures().pmi, N)
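# Minimal usage sketch (assumed `raw_text` variable): tokenize first, then ask for the
# 10 strongest PMI collocations that occur at least 3 times.
# from nltk import word_tokenize
# print(N_collocations_in_text(word_tokenize(raw_text), N=10, min_freq=3))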
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    newwords = []
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    for ngram in itertools.chain(words, bigrams):
        newwords.append(ngram)
    return newwords
def get_types_of_guitar_bigrams(tokens):
    """
    Returns bigrams by co-occurrence for guitar types
    :param tokens:
    :return: defaultdict()
    """
    T = []
    for token in tokens:
        for tok in token:
            if not any(t.isdigit() for t in tok):
                T.append(tok)
    tokens = T

    bgm = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    guitar_filter = lambda *w: 'guitar' not in w
    finder.apply_freq_filter(2)
    finder.apply_ngram_filter(guitar_filter)
    scored = finder.score_ngrams(bgm.pmi)

    prefix_keys = collections.defaultdict(list)
    for key, scores in scored:
        prefix_keys[key[1]].append((key[0], scores))
    for key in prefix_keys:
        prefix_keys[key].sort(key=lambda x: -x[1])
    return prefix_keys
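# Reading the result (assumed input): `sentences` is a list of token lists. The return
# value maps each right-hand word to its left-hand collocates sorted by PMI, so the
# words most strongly paired with "guitar" can be read off directly.
# by_right_word = get_types_of_guitar_bigrams(sentences)
# print(by_right_word['guitar'][:5])  # top 5 (left_word, pmi_score) pairs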
def createCollocationGraph(data, title):
    print('Create Collocation Graph')
    words = extractTokens(data)
    words = removeCommonWords(words)

    # find word pairs
    finder = BigramCollocationFinder.from_words(words, window_size=5)
    pairs = sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))

    collocation = nx.DiGraph()
    for key, value in pairs:
        left = key[0].lower()
        right = key[1].lower()
        if left not in collocation:
            collocation.add_node(left)
        if right not in collocation:
            collocation.add_node(right)
        if collocation.has_edge(left, right):
            collocation[left][right]['weight'] += value
        else:
            collocation.add_weighted_edges_from([(left, right, value)])
    nx.write_gexf(collocation, title)
def BiGramFeature(word_list, method=BigramAssocMeasures.chi_sq, n=2000):
    # Guard against the error nbest raises when word_list contains only one distinct word
    if len(set(word_list)) != 1:
        bigram_list = BigramCollocationFinder.from_words(word_list)
        top_bigram = bigram_list.nbest(method, n)
        return UniGramFeature(top_bigram)
    else:
        return UniGramFeature([])
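# `UniGramFeature` is used throughout these feature helpers but never defined here.
# A minimal, hypothetical stand-in that marks each item (word or n-gram tuple) as present:
def UniGramFeature(items):
    return dict((item, True) for item in items)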
def get_bigrams(words):
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    # Restrict bigrams to those that appear at least four times
    finder.apply_freq_filter(4)
    return sorted(finder.ngram_fd.items())
def most_common_bigrams(all_words, num_bigrams):
    bigram_finder = BigramCollocationFinder.from_words(all_words)
    bigram_freq = dict(bigram_finder.ngram_fd.items())
    # drop bigrams containing irrelevant words; iterate over a copy of the keys
    # so the dict can be modified safely
    for k in list(bigram_freq):
        if not is_feature_relevant(k[0]) or not is_feature_relevant(k[1]):
            del bigram_freq[k]
    fd = FreqDist(bigram_freq)
    return dict(fd.most_common(num_bigrams)).keys()
def bigrams(self) -> List[str]:
    """
    Returns a list of bigrams; the two words inside each bigram are joined
    with an underscore. Returns up to 50 bigrams.
    """
    finder = BigramCollocationFinder.from_words(self.tokens)
    return [
        f"{word_0}_{word_1}"
        for word_0, word_1 in finder.nbest(BigramAssocMeasures().pmi, 50)
    ]
def Mixup2Feature(word_list, bi_method=BigramAssocMeasures.chi_sq, bi_n=2000):
    # Guard against the error nbest raises when word_list contains only one distinct word
    if len(set(word_list)) != 1:
        bigram_list = BigramCollocationFinder.from_words(word_list)
        # print(json.dumps(word_list, ensure_ascii=False))
        top_bigram = bigram_list.nbest(bi_method, bi_n)
        return UniGramFeature(word_list + top_bigram)
    else:
        return UniGramFeature(word_list)
def association(self, measure='pmi'):
    if measure in self._association_dict:
        return self._association_dict[measure]

    dist = self._get_freq_distributions()

    if len(self.ngram) == 2:
        finder = BigramCollocationFinder(*dist)
        measures = getattr(bigram_measures, measure)
    else:
        finder = TrigramCollocationFinder(*dist)
        measures = getattr(trigram_measures, measure)

    try:
        collocs = finder.score_ngrams(measures)
    except Exception as e:
        print('Exception in pmi_preps')
        print(e)
        print(self)
        print(dist)
        collocs = []

    # cache and return, as in the variant of this method shown earlier
    self._association_dict[measure] = collocs
    return collocs
def Mixup3Feature(word_list, bi_method=BigramAssocMeasures.chi_sq, bi_n=2000,
                  tri_method=TrigramAssocMeasures.chi_sq, tri_n=1000):
    # Guard against the error nbest raises when word_list contains only one distinct word
    if len(set(word_list)) != 1:
        bigram_list = BigramCollocationFinder.from_words(word_list)
        top_bigram = bigram_list.nbest(bi_method, bi_n)
        trigram_list = TrigramCollocationFinder.from_words(word_list)
        top_trigram = trigram_list.nbest(tri_method, tri_n)
        return UniGramFeature(word_list + top_bigram + top_trigram)
    else:
        trigram_list = TrigramCollocationFinder.from_words(word_list)
        top_trigram = trigram_list.nbest(tri_method, tri_n)
        return UniGramFeature(word_list + top_trigram)
def find_freq_of_features(local_features, imp_feature_set):
    """ Finds the frequency of the most common features in this data """
    local_bigram_features = dict(
        BigramCollocationFinder.from_words(local_features).ngram_fd.items())
    local_freq_features = dict.fromkeys(imp_feature_set, 0)
    for word in local_features:
        if word in imp_feature_set:
            local_freq_features[word] += 1
    for bigram in local_bigram_features:
        if bigram in imp_feature_set:
            local_freq_features[bigram] = local_bigram_features[bigram]
    return local_freq_features
def remove_non_price_numbers(tokens):
    """ Remove numbers which are not prices """
    finalized_tokens = []
    token_bigrams = BigramCollocationFinder.from_words(tokens)
    num_pattern = r"\d+(?:\.\d+)?"
    # include tokens like ranges of numbers / plain numbers
    pattern = re.compile(f"{num_pattern}(-{num_pattern})?")
    for token in tokens:
        if not pattern.fullmatch(token):
            finalized_tokens.append(token)
            continue
        if (token_bigrams.ngram_fd.get(('$', token)) or token_bigrams.ngram_fd.get((token, '$'))) \
                or (token_bigrams.ngram_fd.get(('usd', token)) or token_bigrams.ngram_fd.get((token, 'usd'))):
            finalized_tokens.append(token)
    return finalized_tokens
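# Quick illustration (assumed input): a number adjacent to "$" or "usd" in the token
# stream is kept, a bare number is dropped.
# remove_non_price_numbers(['room', 'for', '2', 'costs', '$', '150'])
# -> ['room', 'for', 'costs', '$', '150']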
def grams():
    from nltk import (BigramCollocationFinder, BigramAssocMeasures,
                      TrigramAssocMeasures, TrigramCollocationFinder)
    words = get_words()

    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(40)
    bigrams = finder.nbest(bigram_measures.pmi, 500)

    trigram_measures = TrigramAssocMeasures()
    finder3 = TrigramCollocationFinder.from_words(words)
    finder3.apply_freq_filter(100)
    trigrams = finder3.nbest(trigram_measures.pmi, 300)

    combos2 = [combo2[0] + " " + combo2[1] for combo2 in bigrams]
    combos3 = [combo3[0] + " " + combo3[1] + " " + combo3[2] for combo3 in trigrams]
    return combos2, combos3
def strip_and_tokenize(msg):
    msg = msg.strip()
    msg = msg.lower()
    msg = re.sub(b'\xe2\x80\x99'.decode(), "'", msg)  # normalize curly apostrophes

    # remove stop words
    # remove non-ascii characters
    tokens = word_tokenize(msg)
    # tokens_ = tokens.copy()
    ret_tokens = []
    token_bigrams = BigramCollocationFinder.from_words(tokens)
    for token in tokens:
        try:
            token.encode("ascii")
        except UnicodeEncodeError:
            # tokens.remove(token)  # remove non-ascii tokens
            continue
        # if token in punctuations:  # remove punctuations
        #     tokens.remove(token)
        #     continue
        # if is_date(token):
        #     ret_tokens.append("<DATE>")
        #     continue
        if is_phone_number(token):
            # tokens.remove(token)
            ret_tokens.append("<PHONE>")
            continue
        # if bool(is_url(token)):
        #     ret_tokens.append(retrieve_url_tokens(token))
        #     continue
        new_token, is_resolved = resolve_clitics(token)
        if is_resolved:
            ret_tokens.append(new_token)
            continue
        p_token, is_tokenized = tokenize_price_numbers(token, token_bigrams)
        if is_tokenized:
            ret_tokens.append(p_token)
            continue
        ret_tokens.extend(regex_tokenizer.tokenize(token))
    return ret_tokens
def tokenize_age_tokens(tokens) -> List[str]:
    age_pattern = re.compile(r"\d{1,2}")
    finalized_tokens = []
    token_bigrams = BigramCollocationFinder.from_words(tokens)
    for token in tokens:
        if short_age_pattern.fullmatch(token):
            finalized_tokens.append("<AGE>")
            continue
        if not age_pattern.fullmatch(token):
            finalized_tokens.append(token)
            continue
        if token_bigrams.ngram_fd.get((token, 'year')) or token_bigrams.ngram_fd.get((token, 'years')):
            finalized_tokens.append("<AGE>")
        elif token_bigrams.ngram_fd.get((token, 'month')) or token_bigrams.ngram_fd.get((token, 'months')):
            finalized_tokens.append("<AGE>")
        else:
            finalized_tokens.append(token)
    return finalized_tokens
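# Illustration (assumed input; `short_age_pattern` is defined elsewhere and assumed not
# to match bare digits): a small number followed by "years" becomes <AGE>, while a
# number with no age word next to it is left alone.
# tokenize_age_tokens(['she', 'is', '25', 'years', 'old', 'room', '12'])
# -> ['she', 'is', '<AGE>', 'years', 'old', 'room', '12']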
def top_phrases_nltk(revs):
    """
    Find top phrases by finding collocations using nltk
    :param revs:
    :return:
    """
    revs = '.\n'.join(revs)
    bigram_measures = BigramAssocMeasures()
    tokens = word_tokenize(revs)
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(3)
    finder.apply_word_filter(
        lambda w: len(w) < 2 or w in stopwords.words("english"))
    colloc = []
    for tup in finder.nbest(bigram_measures.pmi, 50):
        pos = pos_tag(tup)
        if pos[0][1].startswith(("JJ", "RB")) or pos[1][1].startswith(("JJ", "RB")):
            colloc.append(tup)
    return colloc
def collocations(text, language='english', num=0, window_size=2):  # num=20
    """
    A reimplementation of the basic workings of the collocations method
    of the ``Text`` class of NLTK.

    :param text: raw text
    :param language: language to use to eliminate stopwords
    :param num: number of collocations
    :param window_size: window for collocations
    :return: a list of collocations for the text
    """
    from nltk.corpus import stopwords

    ignored_words = stopwords.words(language)
    finder = BigramCollocationFinder.from_words(word_tokenize(text), window_size)
    finder.apply_freq_filter(2)
    finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in ignored_words)
    bigram_measures = BigramAssocMeasures()
    c = finder.nbest(bigram_measures.likelihood_ratio, num)
    collocation_strings = [w1 + ' ' + w2 for w1, w2 in c]
    return collocation_strings
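# Usage sketch (assumed text): note that `num` defaults to 0, so a positive count has
# to be passed explicitly to get any collocations back.
# collocations("New York is a big city. New York never sleeps.", num=5)
# might return ['New York']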
def collect(self, input_file, limit=None, save=None):
    """
    collect corpus from Jodel JSON data
    :param input_file:
    :param limit:
    :param save:
    :return:
    """
    corpus = []
    words = []
    labels = []
    identities = {
        'Basel-Stadt': 'Basel',
        'Brunswick': 'Braunschweig',
        'Cologne': 'Köln',
        'Frankfurt': 'Frankfurt am Main',
        'Freiburg': 'Freiburg im Breisgau',
        'Fribourg-en-Brisgau': 'Freiburg im Breisgau',
        'Geneva': 'Genf',
        'Genève': 'Genf',
        'Hanover': 'Hannover',
        'Klagenfurt am Wörthersee': 'Klagenfurt',
        'Munich': 'München',
        'Nuremberg': 'Nürnberg',
        'Ouest lausannois': 'Lausanne',
        'Sankt Pölten': 'St. Pölten',
        'Sankt Gallen': 'St. Gallen',
        'Salzburg-Umgebung': 'Salzburg',
        'Vienna': 'Wien',
        'Zurich': 'Zürich'
    }
    self.city_frequency = defaultdict(int)

    # iterate over the data
    with open(input_file, encoding='utf-8', errors='ignore') as f:
        for line_no, line in enumerate(islice(f, None)):
            if line_no > 0:
                if line_no % 10000 == 0:
                    print("%s" % (line_no), file=sys.stderr, flush=True)
                elif line_no % 500 == 0:
                    print('.', file=sys.stderr, end=' ', flush=True)

            try:
                jodel = json.loads(line)
            except ValueError:
                print(line)

            msg = jodel.get('message', None)
            location = jodel.get('location', None)

            # skip empty jodels
            if msg is None or location is None:
                continue

            city = location.get('name', None)
            if city == 'Jodel Team' or city is None:
                continue

            # correct city names
            city = identities.get(city, city)
            self.city_frequency[city] += 1

            # collect all the data and transform it
            data = [self.clean(msg)]
            data.extend([
                self.clean(child.get('message', []))
                for child in jodel.get('children', [])
            ])

            # one instance for each jodel
            # corpus.extend(data)
            # labels.extend([city] * len(data))

            # one instance for each conversation
            corpus.append([word for message in data for word in message])
            labels.append(city)

            words.extend([word for message in data for word in message])

            if limit is not None and line_no == limit:
                break

    assert len(labels) == len(corpus), \
        "umm, the number of labels (%s) and the number of instances (%s) is not the same" % (
            len(labels), len(corpus))

    self.int2word = {
        i: max(self.stems[w].keys(), key=(lambda key: self.stems[w][key]))
        for w, i in self.word2int.items()
    }

    # find collocations
    print('\nlooking for collocations', file=sys.stderr, flush=True)
    finder = BigramCollocationFinder.from_words(words)
    bgm = BigramAssocMeasures()
    collocations = [
        b for b, f in finder.score_ngrams(bgm.mi_like) if f > 1.0
    ]
    self.collocations = set(collocations)

    print('\ncreating corpus', file=sys.stderr, flush=True)
    if save is not None:
        self.corpus = []
        with open('%s.corpus' % save, 'w', encoding='utf-8') as save_corpus:
            for doc, tag in zip(corpus, labels):
                words = self.join_collocations(doc)
                tags = [tag]
                self.corpus.append(TaggedDocument(words, tags=tags))
                save_corpus.write('%s\n' % json.dumps({
                    'words': words,
                    'tags': tags
                }))
        print('\ncorpus saved as %s' % save, file=sys.stderr, flush=True)

        with open('%s.citycounts' % save, 'w', encoding='utf-8') as save_counts:
            json.dump(dict(self.city_frequency), save_counts)
    else:
        self.corpus = [
            TaggedDocument(self.join_collocations(doc), tags=[tag])
            for doc, tag in zip(corpus, labels)
        ]

    print('\n%s instances' % len(self.corpus), file=sys.stderr, flush=True)

    # update mappings
    self.int2word = {
        i: max(self.stems[w].keys(), key=(lambda key: self.stems[w][key]))
        for w, i in self.word2int.items()
    }

    print("Found %s collocations" % (len(collocations)), file=sys.stderr, flush=True)
    for (w1, w2) in collocations[:10]:
        print('\t', self.int2word[w1], self.int2word[w2], file=sys.stderr, flush=True)
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return bigrams
one_grams = calculate_n_grams(words, 1)
one_grams_freq = calculate_n_grams_frequency(one_grams, words, 1)

# Calculate MI
be_mi = calculate_mi(be_grams_freq, 2, one_grams_freq, len(words), 30)
print()
print(be_mi)
three_mi = calculate_mi(three_grams_freq, 3, one_grams_freq, len(words), 30)
print()
print(three_mi)

# Check against NLTK's collocation finders
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
text = nltk.Text(words)
finder_bi = BigramCollocationFinder.from_words(text)
finder_thr = TrigramCollocationFinder.from_words(text)
print()
be_best = finder_bi.nbest(bigram_measures.pmi, 30)
print(be_best)
print()
tri_best = finder_thr.nbest(trigram_measures.pmi, 30)
print(tri_best)
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
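# Sketch of how feature dicts like this are typically fed to an NLTK classifier
# (assumed data: `labeled_reviews` is a list of (token_list, label) pairs; not part
# of the original snippet).
# import nltk
# train_set = [(bigram_word_feats(tokens), label) for tokens, label in labeled_reviews]
# classifier = nltk.NaiveBayesClassifier.train(train_set)
# print(classifier.classify(bigram_word_feats(new_review_tokens)))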
"""计算句子中某种语言模式出现概率的统计模型 把自然语言作为模型进行统计分析""" import nltk from nltk import ngrams, BigramCollocationFinder, BigramAssocMeasures, unique_list, KneserNeyProbDist from nltk.corpus import alpino, webtext, stopwords """单词分组 util.py""" n = 4 grams = ngrams(alpino.words(), n) # for i in grams: # print(i) out = list(ngrams([1, 2, 3, 4, 5], 3)) print(out) # [(1, 2, 3), (2, 3, 4), (3, 4, 5)] set = set(stopwords.words('english')) stops_filter = lambda w: len(w) < 3 or w in set tokens = [t.lower() for t in webtext.words('grail.txt')] words = BigramCollocationFinder.from_words(tokens) words.apply_word_filter(stops_filter) out = words.nbest(BigramAssocMeasures.likelihood_ratio, 10) print(out) """最大似然估计的目的就是:利用已知的样本结果,反推最有可能(最大概率)导致这样结果的参数值。""" """最大似然估计wiki https://zh.wikipedia.org/zh-cn/%E6%9C%80%E5%A4%A7%E4%BC%BC%E7%84%B6%E4%BC%B0%E8%AE%A1""" """隐马尔科夫模型估计 HMM""" corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:700] print(len(corpus)) tag_set = unique_list(tag for sent in corpus for (word, tag) in sent) print(len(tag_set)) """平滑""" # gt = lambda fd, bins:SimpleGoodTuringProbDist(fd, bins=1e5) # train_and_test(gt) corpus = [[((x[0], y[0], z[0]), (x[1], y[1], z[1])) for x, y, z in nltk.trigrams(sent)]
def __init__(self, limit=1000):
    # TODO: Read all of Shakespeare into words?
    fileid = shakespeare.fileids()[0]
    words = remove_punctuation(shakespeare.words(fileid))
    self.finder = BigramCollocationFinder.from_words(words)
    self.bigrams = self.finder.nbest(bigram_measures.raw_freq, limit)
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_feats(words))
    return d
###############################################################################
# Generate a list of word tuples (bigrams) to examine collocations
from nltk import bigrams

bigram_tuples = list(bigrams(brown.words()))
len(bigram_tuples)

# http://www.nltk.org/howto/collocations.html
# Collocations are essentially just frequent bigrams, except that we want to pay more
# attention to the cases that involve rare words. In particular, we want to find bigrams
# that occur more often than we would expect based on the frequency of the individual
# words. The collocations() function does this for us.
from nltk import collocations
from nltk import BigramCollocationFinder

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = BigramCollocationFinder.from_words(brown.words())
finder.nbest(bigram_measures.pmi, 10)

# apply filters to collocations, such as ignoring all bigrams which occur less than
# three times in the corpus
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.pmi, 10)

# find collocations among tagged words
finder = BigramCollocationFinder.from_words(brown.tagged_words('ca01', tagset='universal'))
finder.nbest(bigram_measures.pmi, 5)

# tags alone
finder = BigramCollocationFinder.from_words(t for w, t in brown.tagged_words('ca01', tagset='universal'))
finder.nbest(bigram_measures.pmi, 5)

# Spanning intervening words
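# The snippet above ends at "Spanning intervening words"; a plausible continuation in
# the spirit of the NLTK collocations HOWTO (a sketch, not part of the original) widens
# the window so non-adjacent word pairs are also counted.
finder = BigramCollocationFinder.from_words(brown.words(), window_size=20)
finder.apply_freq_filter(20)
ignored_words = nltk.corpus.stopwords.words('english')
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
finder.nbest(bigram_measures.likelihood_ratio, 10)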
ret_axes.get_legend().remove()
ret_axes.set_axisbelow(True)
ret_axes.grid(True)
plt.show()

ret_axes: Axes = pivot_by_length_frequency[3:].plot(kind='barh')
ret_axes.plot(pivot_by_length_frequency['frequency'].iloc[3:],
              list(ret_axes.get_yticks()))
ret_axes.set_xlabel("Bin wise cumulative frequencies by word length")
ret_axes.set_title("Whole corpus")
ret_axes.get_legend().remove()
ret_axes.set_axisbelow(True)
ret_axes.grid(True)
plt.show()

bigrams = BigramCollocationFinder.from_words(corpus)
trigrams = TrigramCollocationFinder.from_words(corpus)

phone_nums = {}


def is_phone_number(phone_str: str) -> bool:
    phone_regex_1 = r'(?:\+1(?P<delim>(\.|-)?))([0-9]{3})(?P=delim)([0-9]{3})(?P=delim)([0-9]{4})'
    phone_regex_2 = r'([0-9]{3})(?P<delim>(\.|-)?)([0-9]{3})(?P=delim)([0-9]{4})'
    if phone_str.startswith('+1'):
        match = re.fullmatch(phone_regex_1, phone_str)
        if not match:
            return False
        num = '-'.join(match.groups()[2:])
        phone_nums[num] = phone_nums.get(num, 0) + 1
    else:
def extract_keywords(string, tokenizer, sent_tokenizer, tagger, extractor,
                     proper_noun_tag='SUBST_PROP'):
    """
    Implements the KERA keyword extraction algorithm. See:
    https://www.ida.org/~/media/Corporate/Files/Publications/IDA_Documents/ITSD/ida-document-ns-d-4931.pdf

    Basic implementation of the procedure described in the paper. Probably needs some
    refinements in order to be more broadly effective.

    :param string: Document to analyze.
    :type string: str|unicode
    :param tokenizer: Function that returns a token segmentation as an iterable of strings given a string.
    :type tokenizer: (str|unicode) -> list[str|unicode]
    :param sent_tokenizer: Function that returns a sentence segmentation as an iterable of strings given a string.
    :type sent_tokenizer: (str|unicode) -> list[str|unicode]
    :param tagger: TextBlob compatible POS tagger. Must accept untokenized sentences.
    :type tagger: textblob.base.BaseTagger
    :param extractor: TextBlob compatible noun phrase extractor. Must accept untokenized
        sentences and use the same POS tagger which is passed as the tagger parameter.
    :type extractor: textblob.base.BaseNPExtractor
    :param proper_noun_tag: POS tag indicating proper nouns.
    :type proper_noun_tag: str|unicode
    :return: List of keyword/score tuples. Keyword may be a string or tuple of strings.
    :rtype: list[(str|unicode|(str|unicode)), float]
    """
    # find bigram collocations
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokenizer(string))
    collocations = finder.score_ngrams(bigram_measures.likelihood_ratio)[0:50]

    # find noun phrases
    phrases = [extractor.extract(s) for s in sent_tokenizer(string)]
    phrases = [item for sublist in phrases for item in sublist]

    # find proper noun tokens, collect total/frequency for weighting/normalization
    sents = [tagger.tag(s) for s in sent_tokenizer(string)]
    sents = [item for sublist in sents for item in sublist]
    proper_nouns = []
    np_doc_len = 0
    for i, (token, tag) in enumerate(sents):
        np_doc_len += 1
        if tag == proper_noun_tag:
            proper_nouns.append((token, i))

    # find noun phrase/collocation overlap
    phrase_strings = [' '.join(x[0]).lower() for x in phrases if isinstance(x[0], list)]
    collocations = [c for c in collocations if ' '.join(c[0]) in phrase_strings]

    ranks = []

    # calculate combined index score and normalized collocation score for collocations
    coll_score_total = sum([x[1] for x in collocations])
    coll_doc_len = len(tokenizer(string))
    for coll, coll_score in collocations:
        idx = phrases[phrase_strings.index(' '.join(coll))][1]
        alpha = coll_score / coll_score_total
        beta = 1 - (float(idx) / coll_doc_len)
        score = 2 * alpha * beta / (alpha + beta)
        ranks.append((coll, score))

    # calculate combined index score and normalized term frequency score for proper nouns
    np_strings = [x[0] for x in proper_nouns]
    np_counts = Counter(np_strings)
    np_total = len(proper_nouns)

    # only normalize over the same number of proper nouns as collocations in order to
    # keep the scores roughly comparable.
    # TODO There are rarely more proper names than collocations. Handle this too.
    for np, count in sorted(np_counts.items(), key=itemgetter(1), reverse=True)[0:len(collocations)]:
        idx = proper_nouns[np_strings.index(np)][1]
        alpha = float(count) / np_total
        beta = 1 - (float(idx) / np_doc_len)
        score = 2 * alpha * beta / (alpha + beta)
        ranks.append((np, score))

    # return list of keywords and scores sorted by score
    return sorted(ranks, key=itemgetter(1), reverse=True)
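# Call sketch using TextBlob's stock components (assumed substitutions; the original
# docstring implies a tagger that emits 'SUBST_PROP', here PatternTagger's 'NNP' is
# used instead, so results may differ).
# from nltk.tokenize import word_tokenize, sent_tokenize
# from textblob.taggers import PatternTagger
# from textblob.np_extractors import ConllExtractor
# keywords = extract_keywords(document_text, word_tokenize, sent_tokenize,
#                             PatternTagger(), ConllExtractor(), proper_noun_tag='NNP')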
    ' ', '?', '!', ',', ';', ':', '-', '--', '---', '(', ')', '{', '}',
    '[', ']', "'", '"', '.', '`', '·', '``', '~', "''"
]

# remove punctuations
BNC_filtered_words = [w for w in BNC_words if w not in filter_words]
CS_filtered_words = [w for w in CS_words if w not in filter_words]

# remove stop words
BNC_Stop_filtered_words = [
    w for w in BNC_filtered_words if w not in stop_words
]
CS_Stop_filtered_words = [w for w in CS_filtered_words if w not in stop_words]

bigram_measures = collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(BNC_Stop_filtered_words)

f = open('G_bnc_output.txt', 'w')

text = '\nChi-square Measure - '
s1 = ''.join(str(text))
f.write(str(s1))
bigrams = finder.nbest(bigram_measures.chi_sq, 20)
write_file(bigrams)

text = ' \nPMI Measure - '
s1 = ''.join(str(text))
f.write(str(s1))
bigrams = finder.nbest(bigram_measures.pmi, 20)
write_file(bigrams)
tokens = tokens + lineTokens
# print(tokens, file=log_file)

# Exclude non-english words (%%%), punctuation & stopwords
for token in tokens:
    # Excludes non-word tokens
    if re.search(r'\W', token) is None:
        # Excludes stopwords
        if token not in stopwords.words('english'):
            # This will include all tokens except for stopwords and punctuation
            words.append(token)

# Find bigram collocations
# http://www.nltk.org/howto/collocations.html
finder = BigramCollocationFinder.from_words(words)
scored = finder.score_ngrams(BigramAssocMeasures.raw_freq)

# Write two-word expressions to log file
for item in scored:
    log_file.write(str(item[1]) + "\t" + str(item[0]) + "\n")
# sorted(bigram for bigram, score in scored)

mostFrequentBigramsList = finder.nbest(BigramAssocMeasures.raw_freq, 150)
# To sort the list alphabetically, just use the sorted() function in Python.

# Write to file
for item in mostFrequentBigramsList:
    output_file1.write(str(item) + "\n")

# Find trigram collocations
def run_wordcloud_model(entry_id, mode):
    # this will extract paragraph and header text from the given json file and
    # extract the topics from that
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("wordcloud model started", str(data[0]['_id']), data[0]['link'])
    try:
        # do topic extraction on paragraph and header text
        h_p_data = data[0]["paragraph_text"] + data[0]["header_text"]
        wordcloud = WordCloud(background_color="white", max_words=100,
                              contour_width=3, contour_color='steelblue')
        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        # data_words_nostops = remove_stopwords(data_words)
        data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ'])
        # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
        # print(data_lemmatized)
        all_tokens = [j for i in data_lemmatized for j in i]
        # print('all', all_tokens)
        all_tokens = [
            value for value in all_tokens
            if (value != 'other' and value != 'day' and value != 'thing'
                and value != 'last')
        ]
        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            if mode == 'bi':
                # setup and score the bigrams using the raw frequency.
                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode == 'tri':
                # setup and score the trigrams using the raw frequency.
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)
            # print(scored)

            # By default finder.score_ngrams is sorted, however don't rely on this
            # default behavior. Sort highest to lowest based on the score.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)
            # print('sclist', scoredList)

            # word_dict is the dictionary we'll use for the word cloud.
            # Load dictionary with the FOR loop below.
            # The dictionary will look like this with the n-gram and the score from above:
            # word_dict = {'bigram A': 0.000697411,
            #              'bigram B': 0.000524882}
            word_dict = {}
            listLen = len(scoredList)
            # Get the n-gram and make a contiguous string for the dictionary key.
            # Set the key to the scored value.
            for i in range(listLen - 1):
                word_dict['_'.join(scoredList[i][0])] = scoredList[i][1]
            # print('dic', word_dict)
        if mode == 'single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        print("cannot make word cloud for empty text")
        mycol.update_one({'_id': entry_id},
                         {'$set': {'wordcloud_results_' + mode: []}})
        print("vocabulary is empty")
        return "Vocabulary is empty"

    # Visualize the word cloud
    wordcloud.to_image()
    wordcloud_words = []
    word_cloud_results = []
    for each_res in wordcloud.words_:
        word_cloud_results.append([each_res, wordcloud.words_[each_res]])
        wordcloud_words.append(each_res)
    # print('words', wordcloud_words)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.savefig(name_j[:-4] + "png")
    # plt.show()
    print(word_cloud_results)
    # return wordcloud_words
    mycol.update_one({'_id': entry_id},
                     {'$set': {'wordcloud_results_' + mode: word_cloud_results}})
    print("Successfully extended the data entry with wordcloud results", entry_id)

# run_wordcloud_model("F://Armitage_project//crawl_n_depth//extracted_json_files//3_www.hydroterra.com.au_data.json")
def get_wc_results(text, mode):
    try:
        # do topic extraction on paragraph and header text
        h_p_data = text
        wordcloud = WordCloud(background_color="white", max_words=100,
                              contour_width=3, contour_color='steelblue')
        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        # data_words_nostops = remove_stopwords(data_words)
        data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ'])
        # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
        # print(data_lemmatized)
        all_tokens = [j for i in data_lemmatized for j in i]
        # print('all', all_tokens)
        all_tokens = [
            value for value in all_tokens
            if (value != 'other' and value != 'day' and value != 'thing'
                and value != 'last')
        ]
        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            if mode == 'bi':
                # setup and score the bigrams using the raw frequency.
                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode == 'tri':
                # setup and score the trigrams using the raw frequency.
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)
            # print(scored)

            # By default finder.score_ngrams is sorted, however don't rely on this
            # default behavior. Sort highest to lowest based on the score.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)
            # print('sclist', scoredList)

            # word_dict is the dictionary we'll use for the word cloud.
            # Load dictionary with the FOR loop below.
            # The dictionary will look like this with the n-gram and the score from above:
            # word_dict = {'bigram A': 0.000697411,
            #              'bigram B': 0.000524882}
            word_dict = {}
            listLen = len(scoredList)
            # Get the n-gram and make a contiguous string for the dictionary key.
            # Set the key to the scored value.
            for i in range(listLen - 1):
                word_dict['_'.join(scoredList[i][0])] = scoredList[i][1]
            print('dic', word_dict)
        if mode == 'single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        print("cannot make word cloud for empty text")
        return []

    # Visualize the word cloud
    wordcloud.to_image()
    wordcloud_words = []
    word_cloud_results = []
    for each_res in wordcloud.words_:
        word_cloud_results.append([each_res, wordcloud.words_[each_res]])
        wordcloud_words.append(each_res)
    # print('words', wordcloud_words)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.savefig(name_j[:-4] + "png")
    # plt.show()
    print(word_cloud_results)
    return word_cloud_results