def create_word_scores():
    posWords = pickle.load(open('pos_review.pkl', 'rb'))
    negWords = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posWords))  # flatten the nested lists into one flat list of words
    negWords = list(itertools.chain(*negWords))  # likewise for the negative reviews

    word_fd = FreqDist()  # frequency distribution over all words
    cond_word_fd = ConditionalFreqDist()  # word frequencies conditioned on the positive/negative label
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of word tokens in positive reviews
    neg_word_count = cond_word_fd['neg'].N()  # number of word tokens in negative reviews
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)  # chi-square score on the positive side; other measures such as mutual information could be used instead
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)  # likewise for the negative side
        word_scores[word] = pos_score + neg_score  # a word's informativeness is its positive plus its negative chi-square score

    return word_scores  # maps each word to its informativeness score
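# Hedged usage sketch (not part of the original snippet): the chi-square scores returned by
# create_word_scores() are typically used to keep only the N most informative words and to turn
# each document into an NLTK-style feature dict. The helper names find_best_words and
# best_word_features are illustrative assumptions, not names from the original source.

def find_best_words(word_scores, number):
    # keep the `number` words with the highest combined chi-square score
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(word for word, score in best)

def best_word_features(words, best_words):
    # NLTK classifiers expect a mapping of feature name -> value
    return {word: True for word in words if word in best_words}

# Example:
# word_scores = create_word_scores()
# best_words = find_best_words(word_scores, 1500)
# feats = best_word_features(some_tokenized_review, best_words)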
def create_word_bigram_scores():
    posdata = pickle.load(open('pos_review.pkl', 'rb'))
    negdata = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder_pos = BigramCollocationFinder.from_words(posWords)
    bigram_finder_neg = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder_pos.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder_neg.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # single words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def bestWords():
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    reviews = product_reviews_1.reviews()
    reviewlines = []
    for review in reviews:
        for line in review.review_lines:
            reviewlines.append(line)

    featlines = [line for line in reviewlines if len(line.features) > 0]

    pluswords = []
    minuswords = []
    for line in featlines:
        plus = False
        minus = False
        for feat in line.features:
            if feat[1][0] == "+":
                plus = True
            elif feat[1][0] == "-":
                minus = True
        if plus:
            for word in line.sent:
                pluswords.append(word)
        if minus:
            for word in line.sent:
                minuswords.append(word)

    for word in pluswords:
        word_fd[word.lower()] += 1
        label_word_fd['+'][word.lower()] += 1
    for word in minuswords:
        word_fd[word.lower()] += 1
        label_word_fd['-'][word.lower()] += 1

    pos_word_count = label_word_fd['+'].N()
    neg_word_count = label_word_fd['-'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['+'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['-'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=(lambda s: s[1]), reverse=True)[:515]
    return set([w for w, s in best])
def find_1000_best_words(pos_tweet_words, neg_tweet_words, stop_words_filter, bigram_collocation_check):
    # word_fd and label_word_fd are assumed to be module-level FreqDist / ConditionalFreqDist instances.
    for tweet in pos_tweet_words:
        tweet_words = tweet[0]
        all_words = []
        if bigram_collocation_check:
            bigrams = best_bigram_word_feats(tweet_words)
        if stop_words_filter:
            words = SentiUtil.stopword_filtered_word_feats(tweet_words)
            all_words.extend(words)
        for word in all_words:
            word_fd[word.lower()] += 1
            label_word_fd['positive'][word.lower()] += 1

    for tweet in neg_tweet_words:
        tweet_words = tweet[0]
        all_words = []
        if bigram_collocation_check:
            bigrams = best_bigram_word_feats(tweet_words)
        if stop_words_filter:
            words = SentiUtil.stopword_filtered_word_feats(tweet_words)
            all_words.extend(words)
        for word in all_words:
            word_fd[word.lower()] += 1
            label_word_fd['negative'][word.lower()] += 1

    pos_word_count = label_word_fd['positive'].N()
    neg_word_count = label_word_fd['negative'].N()
    total_word_count = pos_word_count + neg_word_count
    # print("Pos word count : ", pos_word_count)
    # print("Neg word count : ", neg_word_count)
    # print("Total word count : ", total_word_count)

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['positive'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['negative'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return bestwords
def bigrams(self) -> List[str]:
    """
    Returns a list of bigrams; the words inside each bigram are joined with an underscore.

    Returns up to 50 bigrams.
    """
    finder = BigramCollocationFinder.from_words(self.tokens)
    return [
        f"{word_0}_{word_1}"
        for word_0, word_1 in finder.nbest(BigramAssocMeasures().pmi, 50)
    ]
def create_word_scores():
    tweets = get_tweets_from_db()
    postweets = tweets[800001:]
    negtweets = tweets[:800001]

    posWords = []
    negWords = []
    for tweet in postweets:
        posWords.append(tweet[0])
    for tweet in negtweets:
        negWords.append(tweet[0])

    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def grams():
    from nltk import BigramCollocationFinder, BigramAssocMeasures, TrigramAssocMeasures, TrigramCollocationFinder

    words = get_words()

    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(40)
    bigrams = finder.nbest(bigram_measures.pmi, 500)

    trigram_measures = TrigramAssocMeasures()
    finder3 = TrigramCollocationFinder.from_words(words)
    finder3.apply_freq_filter(100)
    trigrams = finder3.nbest(trigram_measures.pmi, 300)

    combos2 = [combo2[0] + " " + combo2[1] for combo2 in bigrams]
    combos3 = [combo3[0] + " " + combo3[1] + " " + combo3[2] for combo3 in trigrams]
    return combos2, combos3
def top_phrases_nltk(revs):
    """
    Find top phrases by finding collocations using nltk
    :param revs: iterable of review strings
    :return: list of (word, word) collocation tuples
    """
    revs = '.\n'.join(revs)
    bigram_measures = BigramAssocMeasures()
    tokens = word_tokenize(revs)
    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_freq_filter(3)
    finder.apply_word_filter(
        lambda w: len(w) < 2 or w in stopwords.words("english"))
    colloc = []
    for tup in finder.nbest(bigram_measures.pmi, 50):
        pos = pos_tag(tup)
        if pos[0][1].startswith(("JJ", "RB")) or pos[1][1].startswith(("JJ", "RB")):
            colloc.append(tup)
    return colloc
def select_most_informative_features(feature_set, top_informative_features_percentile):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for feature_list, groupKey in feature_set:
        for feature in feature_list:
            word_fd[feature] += 1
            label_word_fd[groupKey][feature] += 1

    label_feature_count = dict((label, label_word_fd[label].N()) for label in label_word_fd)
    total_feature_count = sum([label_word_fd[label].N() for label in label_word_fd])

    feature_scores = {}
    for feature in word_fd:
        label_scores = {(label, BigramAssocMeasures.chi_sq(label_word_fd[label][feature],
                                                           (word_fd[feature], label_feature_count[label]),
                                                           total_feature_count))
                        for label in label_word_fd}
        feature_scores[feature] = sum([score for label, score in label_scores])

    top_features_count = int(round(top_informative_features_percentile * total_feature_count))
    best = sorted(feature_scores, key=lambda x: feature_scores[x], reverse=True)[:top_features_count]
    bestwords = set([w for w in best])

    filtered_feature_set = []
    for feature_list, groupKey in feature_set:
        filtered_feature_set.append((dict([(x, True) for x in feature_list if x in bestwords]), groupKey))
    return filtered_feature_set
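# Hedged usage sketch for select_most_informative_features() above; the toy feature_set and the
# 0.5 percentile are made-up values, shown only to illustrate the expected input/output shape.
# Assumes the NLTK imports used by the function are in scope, e.g.:
#   from nltk.probability import FreqDist, ConditionalFreqDist
#   from nltk.collocations import BigramAssocMeasures
toy_feature_set = [
    (['good', 'acting', 'plot'], 'pos'),
    (['bad', 'boring', 'plot'], 'neg'),
]
filtered = select_most_informative_features(toy_feature_set, 0.5)
# filtered is a list of (feature_dict, label) pairs, e.g. [({'good': True, ...}, 'pos'), ...]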
def collocations(text, language='english', num=0, window_size=2):  # num=20
    """
    A reimplementation of the basic workings of the collocations method of the
    ``Text`` class of NLTK.

    :param text: raw text
    :param language: language to use to eliminate stopwords
    :param num: number of collocations
    :param window_size: window for collocations
    :return: a list of collocations for the text
    """
    from nltk.corpus import stopwords

    ignored_words = stopwords.words(language)
    finder = BigramCollocationFinder.from_words(word_tokenize(text), window_size)
    finder.apply_freq_filter(2)
    finder.apply_word_filter(
        lambda w: len(w) < 3 or w.lower() in ignored_words)
    bigram_measures = BigramAssocMeasures()
    c = finder.nbest(bigram_measures.likelihood_ratio, num)

    collocation_strings = [w1 + ' ' + w2 for w1, w2 in c]
    return collocation_strings
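# Hedged usage sketch for collocations() above (not from the original source): the sample text
# and num=10 are illustrative choices. Assumes word_tokenize, BigramCollocationFinder and
# BigramAssocMeasures are imported at module level as in the snippets above, and that the
# NLTK 'punkt' and 'stopwords' data are installed.
sample_text = (
    "New York is a big city. New York has many tall buildings, "
    "and tall buildings define the New York skyline."
)
print(collocations(sample_text, num=10))  # expected to include 'New York' and 'tall buildings'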
def get_wc_results(text, mode):
    try:
        h_p_data = text  # do topic extraction on paragraph and header text
        wordcloud = WordCloud(background_color="white", max_words=100,
                              contour_width=3, contour_color='steelblue')

        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        # data_words_nostops = remove_stopwords(data_words)
        data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ'])
        # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
        # print(data_lemmatized)
        all_tokens = [j for i in data_lemmatized for j in i]
        # print('all', all_tokens)
        all_tokens = [value for value in all_tokens
                      if (value != 'other' and value != 'day' and value != 'thing' and value != 'last')]

        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            if mode == 'bi':
                # set up and score the bigrams using the raw frequency.
                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode == 'tri':
                # set up and score the trigrams using the raw frequency.
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)
            # print(scored)

            # By default finder.score_ngrams is sorted, however don't rely on this default behavior.
            # Sort highest to lowest based on the score.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)
            # print('sclist', scoredList)

            # word_dict is the dictionary we'll use for the word cloud.
            # Load dictionary with the FOR loop below.
            # The dictionary will look like this with the n-gram and the score from above:
            # word_dict = {'bigram A': 0.000697411,
            #              'bigram B': 0.000524882}
            word_dict = {}
            listLen = len(scoredList)
            # Get the n-gram and make a contiguous string for the dictionary key.
            # Set the key to the scored value.
            for i in range(listLen - 1):
                word_dict['_'.join(scoredList[i][0])] = scoredList[i][1]
            print('dic', word_dict)

        if mode == 'single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        print("cannot make word cloud for empty text")
        return []

    # Visualize the word cloud
    wordcloud.to_image()

    wordcloud_words = []
    word_cloud_results = []
    for each_res in wordcloud.words_:
        word_cloud_results.append([each_res, wordcloud.words_[each_res]])
        wordcloud_words.append(each_res)
    # print('words', wordcloud_words)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.savefig(name_j[:-4]+"png")
    # plt.show()
    print(word_cloud_results)
    return word_cloud_results
def collect(self, input_file, limit=None, save=None):
    """
    collect corpus from Jodel JSON data
    :param input_file:
    :param limit:
    :param save:
    :return:
    """
    corpus = []
    words = []
    labels = []

    identities = {
        'Basel-Stadt': 'Basel',
        'Brunswick': 'Braunschweig',
        'Cologne': 'Köln',
        'Frankfurt': 'Frankfurt am Main',
        'Freiburg': 'Freiburg im Breisgau',
        'Fribourg-en-Brisgau': 'Freiburg im Breisgau',
        'Geneva': 'Genf',
        'Genève': 'Genf',
        'Hanover': 'Hannover',
        'Klagenfurt am Wörthersee': 'Klagenfurt',
        'Munich': 'München',
        'Nuremberg': 'Nürnberg',
        'Ouest lausannois': 'Lausanne',
        'Sankt Pölten': 'St. Pölten',
        'Sankt Gallen': 'St. Gallen',
        'Salzburg-Umgebung': 'Salzburg',
        'Vienna': 'Wien',
        'Zurich': 'Zürich'
    }

    self.city_frequency = defaultdict(int)

    # iterate over the data
    with open(input_file, encoding='utf-8', errors='ignore') as f:
        for line_no, line in enumerate(islice(f, None)):
            if line_no > 0:
                if line_no % 10000 == 0:
                    print("%s" % (line_no), file=sys.stderr, flush=True)
                elif line_no % 500 == 0:
                    print('.', file=sys.stderr, end=' ', flush=True)

            try:
                jodel = json.loads(line)
            except ValueError:
                print(line)
                continue  # skip lines that are not valid JSON

            msg = jodel.get('message', None)
            location = jodel.get('location', None)

            # skip empty jodels
            if msg is None or location is None:
                continue

            city = location.get('name', None)
            if city == 'Jodel Team' or city is None:
                continue

            # correct city names
            city = identities.get(city, city)

            self.city_frequency[city] += 1

            # collect all the data and transform it
            data = [self.clean(msg)]
            data.extend([
                self.clean(child.get('message', []))
                for child in jodel.get('children', [])
            ])

            # one instance for each jodel
            # corpus.extend(data)
            # labels.extend([city] * len(data))

            # one instance for each conversation
            corpus.append([word for message in data for word in message])
            labels.append(city)

            words.extend([word for message in data for word in message])

            if limit is not None and line_no == limit:
                break

    assert len(labels) == len(corpus), \
        "umm, the number of labels (%s) and the number of instances (%s) is not the same" % (
            len(labels), len(corpus))

    self.int2word = {
        i: max(self.stems[w].keys(), key=(lambda key: self.stems[w][key]))
        for w, i in self.word2int.items()
    }

    # find collocations
    print('\nlooking for collocations', file=sys.stderr, flush=True)
    finder = BigramCollocationFinder.from_words(words)
    bgm = BigramAssocMeasures()
    collocations = [b for b, f in finder.score_ngrams(bgm.mi_like) if f > 1.0]
    self.collocations = set(collocations)

    print('\ncreating corpus', file=sys.stderr, flush=True)
    if save is not None:
        self.corpus = []
        with open('%s.corpus' % save, 'w', encoding='utf-8') as save_corpus:
            for doc, tag in zip(corpus, labels):
                words = self.join_collocations(doc)
                tags = [tag]
                self.corpus.append(TaggedDocument(words, tags=tags))
                save_corpus.write('%s\n' % json.dumps({'words': words, 'tags': tags}))
        print('\ncorpus saved as %s' % save, file=sys.stderr, flush=True)

        with open('%s.citycounts' % save, 'w', encoding='utf-8') as save_counts:
            json.dump(dict(self.city_frequency), save_counts)
    else:
        self.corpus = [
            TaggedDocument(self.join_collocations(doc), tags=[tag])
            for doc, tag in zip(corpus, labels)
        ]

    print('\n%s instances' % len(self.corpus), file=sys.stderr, flush=True)

    # update mappings
    self.int2word = {
        i: max(self.stems[w].keys(), key=(lambda key: self.stems[w][key]))
        for w, i in self.word2int.items()
    }

    print("Found %s collocations" % (len(collocations)), file=sys.stderr, flush=True)
    for (w1, w2) in collocations[:10]:
        print('\t', self.int2word[w1], self.int2word[w2], file=sys.stderr, flush=True)
def FeatureChoose(pos_wordlist, neg_wordlist, method=BigramAssocMeasures.chi_sq, featuregram='one', n=6000):
    pos_feature = list()
    neg_feature = list()
    pos_all_words = list()
    neg_all_words = list()
    # pos_all_feature = dict()
    # neg_all_feature = dict()
    if featuregram == 'one':
        for each in pos_wordlist:
            cur = UniGramFeature(each)
            pos_feature.append(cur)
            pos_all_words.extend(cur)
        for each in neg_wordlist:
            cur = UniGramFeature(each)
            neg_feature.append(cur)
            neg_all_words.extend(cur)
    elif featuregram == 'two':
        for each in pos_wordlist:
            cur = Mixup2Feature(each)
            pos_feature.append(cur)
            pos_all_words.extend(cur)
        for each in neg_wordlist:
            cur = Mixup2Feature(each)
            neg_feature.append(cur)
            neg_all_words.extend(cur)
    elif featuregram == 'three':
        for each in pos_wordlist:
            cur = Mixup3Feature(each)
            pos_feature.append(cur)
            pos_all_words.extend(cur)
        for each in neg_wordlist:
            cur = Mixup3Feature(each)
            neg_feature.append(cur)
            neg_all_words.extend(cur)
    else:
        return []

    fd = FreqDist()
    cfd = ConditionalFreqDist()
    for word in pos_all_words:
        fd[word] += 1
        cfd['pos'][word] += 1
    for word in neg_all_words:
        fd[word] += 1
        cfd['neg'][word] += 1

    pos_N = cfd['pos'].N()
    neg_N = cfd['neg'].N()
    N = fd.N()

    score_list = dict()
    for word, freq in fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cfd['pos'][word], (freq, pos_N), N)
        neg_score = BigramAssocMeasures.chi_sq(cfd['neg'][word], (freq, neg_N), N)
        score_list[word] = pos_score + neg_score

    best_topwords = sorted(score_list.items(), key=lambda kk: kk[1], reverse=True)
    # print(json.dumps(best_topwords[-100:-1], ensure_ascii=False))
    best_topwords = best_topwords[:n]
    # print(json.dumps(best_topwords[:100], ensure_ascii=False))
    best_topwords = set(word for word, freq in best_topwords)
    return pos_feature, neg_feature, best_topwords
def extract_keywords(string, tokenizer, sent_tokenizer, tagger, extractor, proper_noun_tag='SUBST_PROP'):
    """
    Implements KERA keyword extraction algorithm. See:
    https://www.ida.org/~/media/Corporate/Files/Publications/IDA_Documents/ITSD/ida-document-ns-d-4931.pdf

    Basic implementation of the procedure described in the paper. Probably needs some refinements
    in order to be more broadly effective.

    :param string: Document to analyze.
    :type string: str|unicode
    :param tokenizer: Function that returns a token segmentation as an iterable of strings given a string.
    :type tokenizer: (str|unicode) -> list[str|unicode]
    :param sent_tokenizer: Function that returns a sentence segmentation as an iterable of strings given a string.
    :type sent_tokenizer: (str|unicode) -> list[str|unicode]
    :param tagger: TextBlob compatible POS tagger. Must accept untokenized sentences.
    :type tagger: textblob.base.BaseTagger
    :param extractor: TextBlob compatible noun phrase extractor. Must accept untokenized sentences and
        use the same POS tagger which is passed as the tagger parameter.
    :type extractor: textblob.base.BaseNPExtractor
    :param proper_noun_tag: POS tag indicating proper nouns.
    :type proper_noun_tag: str|unicode
    :return: List of keyword/score tuples. Keyword may be a string or tuple of strings.
    :rtype: list[(str|unicode|(str|unicode)), float]
    """
    # find bigram collocations
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokenizer(string))
    collocations = finder.score_ngrams(bigram_measures.likelihood_ratio)[0:50]

    # find noun phrases
    phrases = [extractor.extract(s) for s in sent_tokenizer(string)]
    phrases = [item for sublist in phrases for item in sublist]

    # find proper noun tokens, collect total/frequency for weighting/normalization
    sents = [tagger.tag(s) for s in sent_tokenizer(string)]
    sents = [item for sublist in sents for item in sublist]
    proper_nouns = []
    np_doc_len = 0
    for i, (token, tag) in enumerate(sents):
        np_doc_len += 1
        if tag == proper_noun_tag:
            proper_nouns.append((token, i))

    # find noun phrase/collocation overlap
    phrase_strings = [' '.join(x[0]).lower() for x in phrases if isinstance(x[0], list)]
    collocations = [c for c in collocations if ' '.join(c[0]) in phrase_strings]

    ranks = []

    # calculate combined index score and normalized collocation score for collocations
    coll_score_total = sum([x[1] for x in collocations])
    coll_doc_len = len(tokenizer(string))
    for coll, coll_score in collocations:
        idx = phrases[phrase_strings.index(' '.join(coll))][1]
        alpha = coll_score / coll_score_total
        beta = 1 - (float(idx) / coll_doc_len)
        score = 2 * alpha * beta / (alpha + beta)
        ranks.append((coll, score))

    # calculate combined index score and normalized term frequency score for proper nouns
    np_strings = [x[0] for x in proper_nouns]
    np_counts = Counter(np_strings)
    np_total = len(proper_nouns)

    # only normalize over the same number of proper nouns as collocations in order to keep
    # the scores roughly comparable.
    # TODO There are rarely more proper names than collocations. Handle this too.
    for np, count in sorted(np_counts.items(), key=itemgetter(1), reverse=True)[0:len(collocations)]:
        idx = proper_nouns[np_strings.index(np)][1]
        alpha = float(count) / np_total
        beta = 1 - (float(idx) / np_doc_len)
        score = 2 * alpha * beta / (alpha + beta)
        ranks.append((np, score))

    # return list of keywords and scores sorted by score
    return sorted(ranks, key=itemgetter(1), reverse=True)
K_FOLDS = 10  # 10-fold crossvalidation
CLF = LinearSVC()  # the default, non-parameter optimized linear-kernel SVM

# Loading dataset and featurised simple Tfidf-BoW model
corpus, y = parse_dataset(DATASET_FP)
X, vectorizer = featurize(corpus)

class_counts = np.asarray(np.unique(y, return_counts=True)).T.tolist()
print(class_counts)
print(corpus)

tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True).tokenize
tokens = tokenizer('\n'.join(corpus))
finder = BigramCollocationFinder.from_words(tokens)
bigram_measures = BigramAssocMeasures()
scored = finder.score_ngrams(bigram_measures.student_t)
sorted(bigram for bigram, score in scored)
# print the ten highest-scoring bigrams (map() is lazy in Python 3, so use an explicit loop)
for bigram, score in scored[:10]:
    print(' '.join(bigram), score)

CLF.fit(X, y)
# Returns an array of the same size as 'y' where each entry is a prediction obtained by cross validation
predicted = cross_val_predict(CLF, X, y, cv=K_FOLDS)

most_informative_feature_for_binary_classification(vectorizer, CLF, n=10)

# Modify F1-score calculation depending on the task
if TASK.lower() == 'a':
    score = metrics.f1_score(y, predicted, pos_label=1)
elif TASK.lower() == 'b':
for word in pluswords:
    word_fd[word.lower()] += 1
    label_word_fd['+'][word.lower()] += 1
for word in minuswords:
    word_fd[word.lower()] += 1
    label_word_fd['-'][word.lower()] += 1

pos_word_count = label_word_fd['+'].N()
neg_word_count = label_word_fd['-'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['+'][word], (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['-'][word], (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

# accs = []
# for x in range(0, 2000, 5):
#     print(x)
best = sorted(word_scores.items(), key=(lambda s: s[1]), reverse=True)[:515]
bestwords = set([w for w, s in best])

# 1500: accuracy: 0.8091397849462365
#       pos precision: 0.891156462585034
#       pos recall: 0.7043010752688172
#       neg precision: 0.7555555555555555
#       neg recall: 0.9139784946236559
# 2000: accuracy: 0.803763440860215
def run_wordcloud_model(entry_id, mode):
    # extract paragraph and header text from the given json file and extract the topics from that
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("wordcloud model started", str(data[0]['_id']), data[0]['link'])
    try:
        # do topic extraction on paragraph and header text
        h_p_data = data[0]["paragraph_text"] + data[0]["header_text"]
        wordcloud = WordCloud(background_color="white", max_words=100,
                              contour_width=3, contour_color='steelblue')

        # Generate a word cloud
        data_words = list(sent_to_words(h_p_data))
        # data_words_nostops = remove_stopwords(data_words)
        data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'ADJ'])
        # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
        # print(data_lemmatized)
        all_tokens = [j for i in data_lemmatized for j in i]
        # print('all', all_tokens)
        all_tokens = [value for value in all_tokens
                      if (value != 'other' and value != 'day' and value != 'thing' and value != 'last')]

        if mode == 'single':
            combined_text = " ".join(all_tokens)
        else:
            if mode == 'bi':
                # set up and score the bigrams using the raw frequency.
                finder = BigramCollocationFinder.from_words(all_tokens)
                bigram_measures = BigramAssocMeasures()
                scored = finder.score_ngrams(bigram_measures.raw_freq)
            if mode == 'tri':
                # set up and score the trigrams using the raw frequency.
                finder = TrigramCollocationFinder.from_words(all_tokens)
                trigram_measures = TrigramAssocMeasures()
                scored = finder.score_ngrams(trigram_measures.raw_freq)
            # print(scored)

            # By default finder.score_ngrams is sorted, however don't rely on this default behavior.
            # Sort highest to lowest based on the score.
            scoredList = sorted(scored, key=itemgetter(1), reverse=True)
            # print('sclist', scoredList)

            # word_dict is the dictionary we'll use for the word cloud.
            # Load dictionary with the FOR loop below.
            # The dictionary will look like this with the n-gram and the score from above:
            # word_dict = {'bigram A': 0.000697411,
            #              'bigram B': 0.000524882}
            word_dict = {}
            listLen = len(scoredList)
            # Get the n-gram and make a contiguous string for the dictionary key.
            # Set the key to the scored value.
            for i in range(listLen - 1):
                word_dict['_'.join(scoredList[i][0])] = scoredList[i][1]
            # print('dic', word_dict)

        if mode == 'single':
            wordcloud.generate(combined_text)
        else:
            wordcloud.generate_from_frequencies(word_dict)
    except Exception:
        print("cannot make word cloud for empty text")
        mycol.update_one({'_id': entry_id},
                         {'$set': {'wordcloud_results_' + mode: []}})
        print("vocabulary is empty")
        return "Vocabulary is empty"

    # Visualize the word cloud
    wordcloud.to_image()

    wordcloud_words = []
    word_cloud_results = []
    for each_res in wordcloud.words_:
        word_cloud_results.append([each_res, wordcloud.words_[each_res]])
        wordcloud_words.append(each_res)
    # print('words', wordcloud_words)
    # plt.imshow(wordcloud, interpolation='bilinear')
    # plt.axis("off")
    # plt.savefig(name_j[:-4]+"png")
    # plt.show()
    print(word_cloud_results)
    # return wordcloud_words
    mycol.update_one({'_id': entry_id},
                     {'$set': {'wordcloud_results_' + mode: word_cloud_results}})
    print("Successfully extended the data entry with wordcloud results", entry_id)

# run_wordcloud_model("F://Armitage_project//crawl_n_depth//extracted_json_files//3_www.hydroterra.com.au_data.json")