def setup():
    global bestwords
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    # Normalize each token once so the per-label counts line up with word_fd
    # (the original stripped punctuation for word_fd but not for label_word_fd).
    for word in movie_reviews.words(categories=['pos']):
        w = word.strip('\'"?,.').lower()
        word_fd[w] += 1
        label_word_fd['pos'][w] += 1
    for word in movie_reviews.words(categories=['neg']):
        w = word.strip('\'"?,.').lower()
        word_fd[w] += 1
        label_word_fd['neg'][w] += 1
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return train(best_bigram_word_features)
def create_word_scores():
    posWords = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb'))
    negWords = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb'))
    posWords = list(itertools.chain(*posWords))  # flatten the nested lists into one flat list
    negWords = list(itertools.chain(*negWords))  # likewise
    word_fd = FreqDist()  # frequency distribution over all words
    cond_word_fd = ConditionalFreqDist()  # per-label (pos/neg) frequency distributions
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()  # number of positive tokens
    neg_word_count = cond_word_fd['neg'].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        # chi-square statistic for the positive label; other measures
        # (e.g. mutual information) would work here too
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        # a word's informativeness is its positive plus negative chi-square score
        word_scores[word] = pos_score + neg_score
    return word_scores  # maps each word to its informativeness
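# Usage sketch (my own illustration, not taken from any one of these repos):
# the score dicts returned by these create_word_scores variants are typically
# consumed the same way: sort by score, keep the top N, and use membership in
# that set as a boolean feature. The cutoff of 10000 mirrors the cutoffs used
# elsewhere in these snippets; `best_word_features` is a hypothetical name.
word_scores = create_word_scores()
best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:10000]
best_words = set(w for w, s in best)

def best_word_features(words):
    return {word: True for word in words if word in best_words}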
def create_word_bigram_scores():
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # Use separate finders: the original rebound a single `bigram_finder`,
    # so the "positive" bigrams were actually scored on the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1
    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score
    return word_scores
def store_feature_scores(self):
    """
    Determine the scores of words based on chi-sq and store word:score to Redis.
    """
    try:
        word_fd = self.pickle_load('word_fd')
        label_word_freqdist = self.pickle_load('label_fd')
    except TypeError:
        print('Requires frequency distributions to be built.')
        return
    word_scores = {}
    pos_word_count = label_word_freqdist['positive'].N()
    neg_word_count = label_word_freqdist['negative'].N()
    total_word_count = pos_word_count + neg_word_count
    # A single pass over word_fd suffices; the original also looped over
    # label_word_freqdist.conditions(), which just recomputed the same
    # scores once per label.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_freqdist['positive'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_freqdist['negative'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    self.pickle_store('word_scores', word_scores)
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # Separate finders: rebinding one `bigram_finder` would score the
    # positive bigrams against the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def best_word_feats(self, words):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in movie_reviews.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in movie_reviews.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1
    # n_ii = label_word_fd[label][word]
    # n_ix = word_fd[word]
    # n_xi = label_word_fd[label].N()
    # n_xx = label_word_fd.N()
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return {word: True for word in words if word in bestwords}
def __init__(self):
    ## Best words feature extraction
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in movie_reviews.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in movie_reviews.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:10000]
    self.bestwords = set(w for w, s in best)
    self.train_classifier()
def computeFreqDistribution():
    if DEBUG:
        print(word_fd)
    pos_word_count = label_word_fd['positive'].N()
    neg_word_count = label_word_fd['negative'].N()
    neu_word_count = label_word_fd['neutral'].N()
    total_word_count = pos_word_count + neg_word_count + neu_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['positive'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['negative'][word], (freq, neg_word_count), total_word_count)
        neu_score = BigramAssocMeasures.chi_sq(label_word_fd['neutral'][word], (freq, neu_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + neu_score
    if DEBUG:
        print(json.dumps(word_scores, indent=4))
    threshold = 2
    temp = []
    for item in word_scores:
        if word_scores[item] > threshold:
            temp.append(item)
    if DEBUG:
        print(temp)
    return temp
def create_word_bigram_scores():
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # was `bigram_finderr` / `bigram_finder`; renamed for clarity
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 350000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_bigram_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # Separate finders: the original rebound one `bigram_finder`, leaving the
    # positive bigrams scored against the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    pos = posBigrams
    neg = negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def _computeInstanceInformativeWords(self, cf_dist=None, f_dist=None):
    '''Using the chi-square measure, computes and returns the words that
    contribute the most significant information, i.e. words that are mostly
    unique to one set (positive or negative).'''
    buff = self._loadData('informative_words.bin')
    if buff:
        self.informative_words = buff
        return
    elif cf_dist is None or f_dist is None:
        self.informative_words = dict()
        return
    total_num_words = f_dist.N()
    total_positive_words = cf_dist["positive"].N()
    total_negative_words = cf_dist["negative"].N()
    words_score = dict()
    for word in f_dist.keys():
        pos_score = BigramAssocMeasures.chi_sq(cf_dist["positive"][word], (f_dist[word], total_positive_words), total_num_words)
        neg_score = BigramAssocMeasures.chi_sq(cf_dist["negative"][word], (f_dist[word], total_negative_words), total_num_words)
        words_score[word] = pos_score + neg_score
    # Keep the 1% most useful words
    self.informative_words = dict(sorted(words_score.items(), key=lambda ws: ws[1], reverse=True)[:int(0.01 * len(words_score))])
    self._saveData('informative_words.bin', self.informative_words)
def create_word_bigram_scores(posWords, negWords):
    # Separate finders: rebinding one `bigram_finder` would score the
    # positive bigrams against the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def _get_bigram_scores(self, posdata, negdata):
    pos_words = list(itertools.chain(*posdata))
    neg_words = list(itertools.chain(*negdata))
    pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
    neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
    pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = pos_words + pos_bigrams
    neg = neg_words + neg_bigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
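# Note on the chi_sq arguments used throughout these snippets: in NLTK's
# contingency-table terms, BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx)
# treats the pair (word, label) like a bigram. n_ii is the word's count under
# the label, n_ix the word's total count, n_xi the label's token count, and
# n_xx the grand total. A small self-contained illustration (the numbers are
# made up):
from nltk.metrics import BigramAssocMeasures

n_ii = 30     # "great" occurred 30 times in positive reviews
n_ix = 40     # "great" occurred 40 times overall
n_xi = 5000   # positive reviews contain 5000 tokens
n_xx = 10000  # 10000 tokens in total
print(BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx))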
def get_bestwords(contents, labels, limit=10000, n=None, cache=True):
    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            if os.path.exists(cache_path):
                bestwords = pickle.load(open(cache_path, 'rb'))
                print('Loaded from cache')
                print('bestwords count = %d' % len(bestwords))
                return bestwords
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    pos_contents = contents[labels == 1]
    # complement of the positive rows; the original `labels != 0` also matched
    # the positive rows
    neg_contents = contents[labels != 1]
    pos_words = set()
    neg_words = set()
    for pos_content in pos_contents:
        pos_words = pos_words.union(word_tokenize(pos_content))
    for neg_content in neg_contents:
        neg_words = neg_words.union(word_tokenize(neg_content))
    for word in pos_words:
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in neg_words:
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:limit]
    bestwords = set(w for w, s in best)
    print('all words count = %d' % len(word_scores))
    print('bestwords count = %d' % len(bestwords))
    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            with open(cache_path, 'wb') as f:
                pickle.dump(bestwords, f)
            print('Dumped to cache')
    return bestwords
def __setTermsCHISQUARE__(self, size):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in self.reader.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in self.reader.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    wordScores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        wordScores[word] = pos_score + neg_score
    termScore = sorted(wordScores.items(), key=lambda ws: ws[1], reverse=True)[:size]
    self.terms = [w for (w, s) in termScore]
def create_word_scores():
    posdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx", 1, 1
    )
    negdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx", 1, 1
    )
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd["pos"][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd["neg"][word] += 1
    pos_word_count = cond_word_fd["pos"].N()
    neg_word_count = cond_word_fd["neg"].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd["pos"][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd["neg"][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores(sentences):
    # logging.info(sentences)
    words = list(itertools.chain(*sentences))
    # logging.info(words)
    # build frequency distribution of all words and then frequency
    # distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in words:
        word_fd[word.lower()] += 1
        # NOTE: this variant counts every word under both labels, since the
        # input sentences carry no pos/neg labels here.
        cond_word_fd['pos'][word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1
    # finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    # builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_bigram_scores():
    posdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb'))
    negdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb'))
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # Separate finders: the original rebound one `bigram_finder`, so the
    # positive bigrams were scored on the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams  # words plus bigram collocations
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def get_best_words(words_list, num_best_words):
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for pair in words_list:
        line, sent = pair
        for word in nltk.word_tokenize(line):
            word_fd[word.lower()] += 1
            label_word_fd[sent][word.lower()] += 1
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:num_best_words]
    bestwords = set(w for w, s in best)
    return bestwords
def getBestWords(posWords, negWords):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        label_word_fd["pos"][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        label_word_fd["neg"][word.lower()] += 1
    pos_word_count = label_word_fd["pos"].N()
    neg_word_count = label_word_fd["neg"].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd["pos"][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd["neg"][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    # best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:10000]
    sorted_x = sorted(word_scores.items(), key=operator.itemgetter(1), reverse=True)
    bestwords = set(w for w, s in sorted_x)
    return bestwords
def create_word_scores():
    posWords = list(itertools.chain(*datap))  # flatten the nested lists into one flat list
    negWords = list(itertools.chain(*datan))  # likewise
    word_fd = nltk.FreqDist()
    cond_word_fd = ConditionalFreqDist()  # per-label (pos/neg) frequency distributions
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()  # number of positive tokens
    neg_word_count = cond_word_fd['neg'].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        # chi-square statistic for the positive label; other measures
        # (e.g. mutual information) would work here too
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        # a word's informativeness is its positive plus negative chi-square score
        word_scores[word] = pos_score + neg_score
    return word_scores  # maps each word to its informativeness
def create_word_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", 1, 1)
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def getWordScores():
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    pos = 0
    neg = 0
    for review in posids:
        pos += 1
        if pos != cutoff:
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['pos'].update(token_helpers.tokenize_simple(word))
    for review in negids:
        neg += 1
        if neg != cutoff:
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['neg'].update(token_helpers.tokenize_simple(word))
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return bestwords
def create_word_scores(posWords, negWords):
    file_scores = open("cn_sample_data/scores.txt", "w")
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[str(word)] += 1
        cond_word_fd['pos'][str(word)] += 1
    for word in negWords:
        word_fd[str(word)] += 1
        cond_word_fd['neg'][str(word)] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][str(word)], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][str(word)], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    # Write the scores out from most to least informative; the original called
    # sorted() with a Python 2 cmp function and discarded the result.
    for key, score in sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True):
        file_scores.write(str(key) + " : " + str(score) + "\n")
    file_scores.close()
    return word_scores
def create_word_scores(self):
    [posWords, negWords] = self.getAllWords()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    log("Total number of words: %d" % total_word_count)
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def get_ranked_ngrams(self, wlist="all", pos=True):
    """ Turn each ngram into a term: chi_sq association metric. """
    word_fd = nltk.FreqDist()
    tag_fd = nltk.ConditionalFreqDist()
    for key, tweet in self.tweets.items():
        word_list = self.get_selected_text(tweet)
        label = self.instances[key].label
        for ngram in word_list:
            # do we want the tag here
            word_fd[ngram] += 1
            tag_fd[label][ngram] += 1
    num_pos = tag_fd["positive"].N()
    num_neg = tag_fd["negative"].N()
    # num_neu = tag_fd["neutral"].N()  # ignore neutral tweets
    ngram_dict = {}
    total = num_pos + num_neg  # + num_neu
    for ngram, frequency in word_fd.items():
        try:
            # build chi_sq metrics for both positive and negative tags
            pos_metric = BigramAssocMeasures.chi_sq(tag_fd['positive'][ngram], (frequency, num_pos), total)
            neg_metric = BigramAssocMeasures.chi_sq(tag_fd['negative'][ngram], (frequency, num_neg), total)
            # neu_metric = BigramAssocMeasures.chi_sq(tag_fd['neutral'][ngram], (frequency, num_neu), total)
            score = pos_metric + neg_metric
            ngram_dict[ngram] = score  # append score
        except Exception:
            continue
    return ngram_dict
def GetHighInformationWordsChi(num_bestwords):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in movie_reviews.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in movie_reviews.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:num_bestwords]
    bestwords = set(w for w, s in best)
    return bestwords
def create_word_scores(posWords, negWords, posTag, negTag):
    from nltk.probability import FreqDist, ConditionalFreqDist
    import itertools
    posWords = list(itertools.chain(*posWords))  # flatten the nested lists into one flat list
    negWords = list(itertools.chain(*negWords))  # likewise
    word_fd = FreqDist()  # frequency distribution over all words
    cond_word_fd = ConditionalFreqDist()  # per-label frequency distributions
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd[posTag][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd[negTag][word] += 1
    pos_word_count = cond_word_fd[posTag].N()  # number of positive tokens
    neg_word_count = cond_word_fd[negTag].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        # chi-square statistic per label; other measures would work here too
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)
        # a word's informativeness is its positive plus negative chi-square score
        word_scores[word] = pos_score + neg_score
    return word_scores  # maps each word to its informativeness
def store_word_scores(self):
    """
    Stores 'word scores' into Redis.
    """
    try:
        word_freqdist = pickle.loads(self.r.get('word_fd'))
        label_word_freqdist = pickle.loads(self.r.get('label_fd'))
    except TypeError:
        print('Requires frequency distributions to be built.')
        return
    word_scores = {}
    pos_word_count = label_word_freqdist['pos'].N()
    neg_word_count = label_word_freqdist['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    for word, freq in word_freqdist.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_freqdist['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_freqdist['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    # Redis stores bytes/strings, so pickle the dict to match the loads above.
    self.r.set('word_scores', pickle.dumps(word_scores))
def create_word_bigram_scores():
    posNegDir = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature\SenimentReviewSet'
    posdata = tp.seg_fil_senti_excel(posNegDir + '/pos_review.xlsx', 1, 1, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    negdata = tp.seg_fil_senti_excel(posNegDir + '/neg_review.xlsx', 1, 1, 'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    # Separate finders: rebinding one `bigram_finder` would score the
    # positive bigrams against the negative corpus.
    pos_bigram_finder = BigramCollocationFinder.from_words(posWords)
    neg_bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def scores():
    posWords = []
    negWords = []
    with open('pos.txt', 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWord = bigram_words(posWord, score_fn=BigramAssocMeasures.chi_sq, n=1000)
            posWords.append(posWord)
    with open('neg.txt', 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWord = bigram_words(negWord, score_fn=BigramAssocMeasures.chi_sq, n=1000)
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    # finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    # builds dictionary of word scores based on chi-squared test
    featureScore = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        featureScore[word] = pos_score + neg_score
    return featureScore
def find_best_words(positiveWords, negativWords, dimention_num):
    # positiveWords = word_tokenize(positiveWords)
    # negativWords = word_tokenize(negativWords)
    space = ' '
    positiveWords = word_tokenize(space.join(positiveWords))
    negativWords = word_tokenize(space.join(negativWords))
    scoreF = BigramAssocMeasures.chi_sq
    posBigrams = BCF.from_words(positiveWords).nbest(scoreF, 5000)
    negBigrams = BCF.from_words(negativWords).nbest(scoreF, 5000)
    pos = positiveWords + posBigrams
    neg = negativWords + negBigrams
    all_words = pos + neg
    word_fd = FreqDist(all_words)
    pos_word_fd = FreqDist(pos)
    neg_word_fd = FreqDist(neg)
    pos_word_count = pos_word_fd.N()
    neg_word_count = neg_word_fd.N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(pos_word_fd[word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(neg_word_fd[word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best_vals = sorted(word_scores, key=lambda k: word_scores[k], reverse=True)[:dimention_num]
    return best_vals
def create_word_scores():
    # creates lists of all positive and negative words
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    # build frequency distribution of all words and then frequency
    # distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1
    # finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    # builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def getWordScores():
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores():
    # build frequency distribution of all words and then frequency
    # distributions of words within positive and negative labels
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in app_reviews.words(categories=['pos']):
        if word not in stopset and not word.isnumeric() and word.isalpha():
            word_fd[lemmatizer.lemmatize(word)] += 1
            label_word_fd['pos'][lemmatizer.lemmatize(word)] += 1
    for word in app_reviews.words(categories=['neg']):
        if word not in stopset and not word.isnumeric() and word.isalpha():
            word_fd[lemmatizer.lemmatize(word)] += 1
            label_word_fd['neg'][lemmatizer.lemmatize(word)] += 1
    # n_ii = label_word_fd[label][word]
    # n_ix = word_fd[word]
    # n_xi = label_word_fd[label].N()
    # n_xx = label_word_fd.N()
    # finds the number of positive and negative words, as well as the total number of words
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    # builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_bestwords(self):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    score_fn = BigramAssocMeasures.chi_sq
    for index, row in self.df.iterrows():
        # bigram_finder = BigramCollocationFinder.from_words(row['filtered'])
        for word in row['filtered']:
            word_fd[word] += 1
            label_word_fd[row['obltrans_pz']][word] += 1
        # for bigram in bigrams:
        #     word_fd[bigram] += 1
        #     label_word_fd['pos'][bigram] += 1
    word_count = {}
    total_word_count = 0
    for label in self.label_list:
        word_count[label] = label_word_fd[label].N()
        total_word_count += label_word_fd[label].N()
    word_total_scores = {}
    for word, freq in word_fd.items():
        word_total_scores[word] = 0
        word_label_scores = {}
        for label in self.label_list:
            if label_word_fd[label][word] == 0:
                continue
            word_label_scores[label] = BigramAssocMeasures.chi_sq(
                label_word_fd[label][word], (freq, word_count[label]), total_word_count)
            word_total_scores[word] += word_label_scores[label]
    best = sorted(word_total_scores.items(), key=lambda tup: tup[1], reverse=True)[:1000]
    print(best)
    bestwords = set(w for w, s in best)
    self.bestwords = bestwords
    print(self.bestwords)
    print(total_word_count)
    print(word_fd['cz0035'])
    for label in self.label_list:
        print(label_word_fd[label]['cz0035'])
        print(word_count[label])
def create_bigram_scores():
    posdata = pickle.load(open('pos_review.pkl', 'rb'))
    negdata = pickle.load(open('neg_review.pkl', 'rb'))
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 10000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 10000)
    word_fd = FreqDist()  # frequency distribution over all bigrams
    cond_word_fd = ConditionalFreqDist()  # per-label (pos/neg) bigram frequencies
    for word in posBigrams:
        word_fd[word] += 1
        cond_word_fd["pos"][word] += 1
    for word in negBigrams:
        word_fd[word] += 1
        cond_word_fd["neg"][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def best_word_feats(tweets, labels):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    tokenizer = TweetTokenizer()
    tweets = [tokenizer.tokenize(tweet) for tweet in tweets]
    for tweet, label in zip(tweets, labels):
        for word in tweet:
            word_fd[word.lower()] += 1
            if label == 0:
                label_word_fd['0'][word.lower()] += 1
            else:
                label_word_fd['4'][word.lower()] += 1
    total_word_count = word_fd.N()
    pos_word_count = label_word_fd['4'].N()
    neg_word_count = label_word_fd['0'].N()
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['4'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['0'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best_words = [word for (word, score) in sorted(word_scores.items(), key=itemgetter(1), reverse=True)][:50000]
    return best_words
def create_word_bigram_scores():
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    bigram_pos_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    bigram_neg_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    pos = posWords + posBigrams
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def jieba_feature(number):
    posWords = []
    negWords = []
    for items in str1:
        for item in items:
            posWords.append(item)
    for items in str2:
        for item in items:
            negWords.append(item)
    word_fd = FreqDist()  # frequency distribution over all words
    con_word_fd = ConditionalFreqDist()  # per-label (pos/neg) frequency distributions
    for word in posWords:
        word_fd[word] += 1
        con_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        con_word_fd['neg'][word] += 1
    pos_word_count = con_word_fd['pos'].N()  # number of positive tokens
    neg_word_count = con_word_fd['neg'].N()  # number of negative tokens
    # a word's informativeness is its positive plus negative chi-square score
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(con_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(con_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    best_words = set(w for w, s in best_vals)
    return {word: True for word in best_words}
def create_word_scores(posWords, negWords, objWords):
    word_fd = FreqDist()  # frequency distribution over all words
    print(type(word_fd))
    cond_word_fd = ConditionalFreqDist()  # per-label frequency distributions
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    for word in objWords:
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()  # number of positive tokens
    neg_word_count = cond_word_fd['neg'].N()  # number of negative tokens
    obj_word_count = cond_word_fd['obj'].N()  # number of neutral/objective tokens
    total_word_count = pos_word_count + neg_word_count + obj_word_count
    word_scores = {}
    for word, freq in word_fd.items():
        # chi-square statistic per label; other measures such as mutual
        # information would work here too
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word], (freq, obj_word_count), total_word_count)
        # a word's informativeness is the sum of its per-label chi-square scores
        word_scores[word] = pos_score + neg_score + obj_score
    return word_scores  # maps each word to its informativeness
def compute_word_scores(self):
    # Core module which assigns scores to features; top features are selected
    # based on this score.
    freq_dist_obj = FreqDist()
    cond_freq_dist_obj = ConditionalFreqDist()
    # Iterate over pos reviews to calculate scores for pos feats
    for review in self.pos_reviews_list:
        review_words = self.apply_preprocessing(review)
        for word in review_words:
            freq_dist_obj[word] += 1
            cond_freq_dist_obj['pos'][word] += 1
    # Iterate over neg reviews to calculate scores for neg feats
    for review in self.neg_reviews_list:
        review_words = self.apply_preprocessing(review)
        for word in review_words:
            freq_dist_obj[word] += 1
            cond_freq_dist_obj['neg'][word] += 1
    pos_word_count = cond_freq_dist_obj['pos'].N()
    neg_word_count = cond_freq_dist_obj['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_score_dict = {}
    # Find the scores using chi-square
    for word, freq in freq_dist_obj.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_freq_dist_obj['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_freq_dist_obj['neg'][word], (freq, neg_word_count), total_word_count)
        word_score_dict[word] = pos_score + neg_score
    self.best = sorted(word_score_dict.items(), key=operator.itemgetter(1), reverse=True)
def __init__(self, pos, neg):
    self.posFeatures = list(itertools.chain(*pos))
    self.negFeatures = list(itertools.chain(*neg))
    # build frequency distribution of all words and then frequency
    # distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in tqdm(self.posFeatures):
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in tqdm(self.negFeatures):
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    # finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    self.word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        self.word_scores[word] = pos_score + neg_score
def findtopbigrams(bigrams, word_fd, settings):
    nkey = settings['nkey']
    measure = settings['measure']
    bigram_measures = BigramAssocMeasures()
    bigram_fd = FreqDist(bigrams)
    finder = BigramCollocationFinder(word_fd, bigram_fd)
    warning = ""
    if measure == "LR":
        try:
            top_bigrams = finder.nbest(bigram_measures.likelihood_ratio, nkey)
        except Exception:
            warning = "Problem with LR measure. Default to simple frequency (RAW setting)"
            print(warning)
            top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)
    elif measure == "PMI":
        try:
            top_bigrams = finder.nbest(bigram_measures.pmi, nkey)
        except Exception:
            warning = "Problem with PMI measure. Default to simple frequency (RAW setting)"
            print(warning)
            top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)
    elif measure == "CHISQ":
        try:
            top_bigrams = finder.nbest(bigram_measures.chi_sq, nkey)
        except Exception:
            warning = "Problem with CHISQ measure. Default to simple frequency (RAW setting)"
            print(warning)
            top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)
    elif measure == "STUDT":
        try:
            top_bigrams = finder.nbest(bigram_measures.student_t, nkey)
        except Exception:
            warning = "Problem with STUDT measure. Default to simple frequency (RAW setting)"
            print(warning)
            top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)
    else:
        top_bigrams = finder.nbest(bigram_measures.raw_freq, nkey)
    # score bigrams using LR or a similar measure, but it is more helpful to the
    # end user to see raw counts and explain the measure used in a tooltip
    top_bg_with_count = sorted([(bg, count) for (bg, count) in finder.ngram_fd.items() if bg in top_bigrams],
                               key=lambda bgcount: -bgcount[1])
    top_bigrams = [(bg, count) for (bg, count) in top_bg_with_count if count > 1 and bg[0] != bg[1]]
    return top_bigrams, bigram_fd, warning
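# The four near-identical try/except branches above can be collapsed with a
# lookup table. A hypothetical refactor sketch (not from the original repo):
# the dict keys mirror the setting names used above, and the fallback mirrors
# the RAW default.
from nltk.metrics import BigramAssocMeasures

bigram_measures = BigramAssocMeasures()
MEASURES = {
    "LR": bigram_measures.likelihood_ratio,
    "PMI": bigram_measures.pmi,
    "CHISQ": bigram_measures.chi_sq,
    "STUDT": bigram_measures.student_t,
}

def top_bigrams_for(finder, measure, nkey):
    # Unknown measure names fall straight through to raw frequency.
    score_fn = MEASURES.get(measure, bigram_measures.raw_freq)
    try:
        return finder.nbest(score_fn, nkey), ""
    except Exception:
        warning = "Problem with %s measure. Default to simple frequency (RAW setting)" % measure
        print(warning)
        return finder.nbest(bigram_measures.raw_freq, nkey), warning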
def bi_collocations(tokens, num=20):
    from nltk.corpus import stopwords
    ignored_words = stopwords.words('english')
    word_list = [word for sent in tokens for word in sent]
    finder = BigramCollocationFinder.from_words(word_list, 2)
    finder.apply_freq_filter(3)
    # length=2 want to keep e.g. rf pulse
    finder.apply_ngram_filter(lambda w1, w2: len(w1) < 3
                              or len(w2) < 3
                              or (len(w1) + len(w2)) < 8
                              or w1.lower() in ignored_words
                              or w2.lower() in ignored_words)
    bigram_measures = BigramAssocMeasures()
    collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    return collocations
def get_most_common_ngrams(self, n, nb_ngrams=None):
    """
    Compute and return the set of the most common ngrams in the documents.
    This set is cached inside the object.

    Args:
        n: The number of grams. Must be a positive integer.
        nb_ngrams: The number of ngrams to return, i.e. quantifying the 'most'.

    Returns:
        A list of the most common ngrams.
    """
    try:
        # return cached value
        return self._most_common_ngrams[n]
    except KeyError:
        pass
    # compute all ngrams
    all_ngrams = []
    for document in self.training_set["hits"]["hits"]:
        if document["_source"]["external_review_report"] is not None:
            all_ngrams.extend(self.compute_ngrams(document["_source"]["external_review_report"], n))
        if document["_source"]["external_review_form"] is not None:
            all_ngrams.extend(self.compute_ngrams(document["_source"]["external_review_form"], n))
    # get the frequency or return all ngrams
    freq = FreqDist(ngram for ngram in all_ngrams)
    # store and return the nb_ngrams most common ngrams
    word_scores = {}
    if nb_ngrams:
        # most_common preserves the frequency ordering that NLTK 2's
        # freq.keys() slice relied on
        self._most_common_ngrams[n] = [w for w, _ in freq.most_common(nb_ngrams)]
        for word, freqs in freq.items():
            score = BigramAssocMeasures.chi_sq(freq[word], (freqs, freq.N()), freq.N() + freq.N())
            word_scores[word] = score
        self.best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:n]
        self.bestwords = set(w for w, s in self.best)
    else:
        self._most_common_ngrams[n] = list(freq.keys())
    return self.bestwords  # self._most_common_ngrams[n]
def process_bigrams(conn, polarity, total_word_count, best_words):
    cursor = conn.cursor()
    sql = Statements.GRAM_SQL % polarity
    cursor.execute(sql)
    rows = list(cursor.fetchall())
    l = [x[0] for x in rows]
    words_split = [s.split() for s in l]
    raw_words = [item for sublist in words_split for item in sublist]
    words = []
    for w in raw_words:
        if not (w.startswith("http://") or w.startswith("@")):
            words.append(w)
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in words:
        word_fd[word.lower()] += 1
        label_word_fd[polarity][word.lower()] += 1
    pos_word_count = label_word_fd[polarity].N()
    word_scores = {}
    for word, freq in word_fd.items():
        score = BigramAssocMeasures.chi_sq(label_word_fd[polarity][word], (freq, pos_word_count), total_word_count)
        word_scores[word] = score
    best_raw = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:600]
    best = [x[0] for x in best_raw if x[0] not in STOPWORDS and len(x[0]) > 1]
    best_words.update(best)
    best_features = features(best, polarity)
    cursor.close()  # was unreachable after `return` in the original
    return best_features
def chiSQ(priors, likelihood, keep):
    """
    Extract the `keep` most informative features using chi-square.
    """
    words = {}
    # Total word count
    twc = sum(priors.values())
    # All words in the counters
    words_unique = [list(likelihood[section].keys()) for section in likelihood.keys()]
    words_unique = set(sum(words_unique, []))
    for word in words_unique:
        # Go past each class
        scores = []
        for c in priors.keys():
            # Class word count
            cwc = priors[c]
            # Get word occurrence over all classes
            totalFreq = sum(likelihood[section][word] for section in priors.keys())
            # Word count within class
            wc = likelihood[c][word]
            # Get chi-sq
            score = BigramAssocMeasures.chi_sq(wc, (totalFreq, cwc), twc)
            # Append
            scores.append(score)
        # Add to dict
        words[word] = sum(scores)
    # Select best words
    bestWords = sorted(words.items(), key=lambda ws: ws[1], reverse=True)[:keep]
    # Save
    with open("chiSQ.txt", 'w') as f:
        print(bestWords, file=f)
    # Get names
    bestWords = [b[0] for b in bestWords]
    # Filter likelihood
    for c in priors.keys():
        for key in list(likelihood[c]):
            if key not in bestWords:
                del likelihood[c][key]
    # Return
    return likelihood
def collocations(self, num=20, window_size=2):
    """
    Print collocations derived from the text, ignoring stopwords.

    :seealso: find_collocations
    :param num: The maximum number of collocations to print.
    :type num: int
    :param window_size: The number of tokens spanned by a collocation (default=2)
    :type window_size: int
    """
    if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
        self._num = num
        self._window_size = window_size
        # print("Building collocations list")
        from nltk.corpus import stopwords
        ignored_words = stopwords.words('english')
        finder = BigramCollocationFinder.from_words(self.tokens, window_size)
        finder.apply_freq_filter(2)
        finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
        bigram_measures = BigramAssocMeasures()
        self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
    colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
    print(tokenwrap(colloc_strings, separator="; "))
def collocRecursively(corp, interp, constructor, threshold, addUnrelated, addBigram, filters=None):
    bgFinder = constructor(corp)
    if filters:
        bgFinder = applyFilters(bgFinder, filters)
    bgScores = {bg: score for bg, score in bgFinder.score_ngrams(BigramAssocMeasures().likelihood_ratio)}
    print(sorted(bgScores.items(), key=lambda tup: tup[1])[-6:])
    idx = 0
    N = len(corp)
    newCorp = []
    flag = False
    while idx < N - 1:
        bg = (corp[idx], corp[idx + 1])
        if bgScores.get((interp(bg[0]), interp(bg[1])), 0) > threshold:
            addBigram(newCorp, bg)
            idx += 2
            flag = True
        else:
            addUnrelated(newCorp, bg[0])
            idx += 1
    if idx == N - 1:
        addUnrelated(newCorp, corp[idx])
    # keep merging until a pass produces no new bigrams
    if flag:
        return collocRecursively(newCorp, interp, constructor, threshold, addUnrelated, addBigram, filters)
    return newCorp
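A minimal driver for collocRecursively on a toy corpus. interp, addUnrelated, and addBigram are caller-supplied hooks, so the identity/underscore-join versions below are just one plausible choice; applyFilters is only needed when filters is given, and the threshold is corpus-dependent.

from nltk.collocations import BigramCollocationFinder

corp = "new york is a big city and new york never sleeps".split()
merged = collocRecursively(
    corp,
    interp=lambda tok: tok,                       # tokens are already plain strings
    constructor=BigramCollocationFinder.from_words,
    threshold=5.0,
    addUnrelated=lambda out, tok: out.append(tok),
    addBigram=lambda out, bg: out.append('_'.join(bg)),
)
print(merged)  # strongly associated pairs such as ('new', 'york') come back joined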
        label_word_fd['neg'][word.lower()] += 1

# Contingency convention used by BigramAssocMeasures.chi_sq:
# n_ii = label_word_fd[label][word]
# n_ix = word_fd[word]
# n_xi = label_word_fd[label].N()
# n_xx = label_word_fd.N()

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                           (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                           (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:10000]
bestwords = set(w for w, s in best)

def best_word_feats(words):
    return dict((word, True) for word in words if word in bestwords)
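A self-contained worked example of the n_ii/n_ix/n_xi/n_xx convention commented above, on toy counts: n_ii is the word's count within the label, n_ix its count over all labels, n_xi the label's total token count, and n_xx the grand total.

from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for w in ["good", "good", "fine", "bad"]:
    word_fd[w] += 1
    label_word_fd['pos'][w] += 1
for w in ["bad", "bad", "awful", "good"]:
    word_fd[w] += 1
    label_word_fd['neg'][w] += 1

# chi_sq(n_ii, (n_ix, n_xi), n_xx)
score = BigramAssocMeasures.chi_sq(
    label_word_fd['pos']['good'],   # n_ii = 2: 'good' within 'pos'
    (word_fd['good'],               # n_ix = 3: 'good' over all labels
     label_word_fd['pos'].N()),     # n_xi = 4: all tokens labelled 'pos'
    label_word_fd.N())              # n_xx = 8: all tokens
print(score)  # higher means 'good' is more strongly associated with 'pos'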
def create_word_scores():
    # tokenize each labelled corpus file into words, keeping basic punctuation
    token_re = r"[\w']+|[.,!?;]"
    sources = [
        ('anger', ANGER_FILE),
        ('disgust', DISGUST_FILE),
        ('fear', FEAR_FILE),
        ('joy', JOY_FILE),
        ('surprise', SURPRISE_FILE),  # now opened with utf-8 like the others (the original omitted it here)
    ]
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for label, path in sources:
        with open(path, 'r', errors="ignore", encoding="utf-8") as fin:
            for line in fin:
                for word in re.findall(token_re, line.rstrip()):
                    word_fd[word.lower()] += 1
                    cond_word_fd[label][word.lower()] += 1

    total_word_count = sum(cond_word_fd[label].N() for label in cond_word_fd.conditions())
    word_scores = {}
    for word, freq in word_fd.items():
        # a word's informativeness is the sum of its chi-square scores across the five emotions
        word_scores[word] = sum(
            BigramAssocMeasures.chi_sq(cond_word_fd[label][word],
                                       (freq, cond_word_fd[label].N()),
                                       total_word_count)
            for label in cond_word_fd.conditions())
    return word_scores
def train_classifier(self, dataset, feature_fn_name='word', train_ratio=0.8,
                     verbose=False, token_column='text', target_column='category',
                     best_ratio=0.8, pos_target_val=1, neg_target_val=-1):

    def word_feats(words):
        return dict((word, True) for word in words)

    def best_word_feats(words):
        return dict((word, True) for word in words if word in bestwords)

    def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
        bigram_finder = BigramCollocationFinder.from_words(words)
        bigrams = bigram_finder.nbest(score_fn, n)
        d = dict((bigram, True) for bigram in bigrams)
        d.update(best_word_feats(words))
        return d

    def best_trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq, n=200):
        tcf = TrigramCollocationFinder.from_words(words)
        trigrams = tcf.nbest(score_fn, n)
        d = dict((trigram, True) for trigram in trigrams)
        d.update(best_bigram_word_feats(words))
        d.update(best_word_feats(words))
        return d

    if verbose:
        print('\nSelected feature function: {}, token column: {}, train ratio: {}'
              .format(feature_fn_name, token_column, train_ratio))

    df = dataset.sample(frac=1).reset_index(drop=True)
    negids = df[df[target_column] == neg_target_val].index
    posids = df[df[target_column] == pos_target_val].index
    feats = df[token_column]

    if feature_fn_name in ['best_word', 'best_bigram', 'best_trigram']:
        word_fd = FreqDist()
        label_word_fd = ConditionalFreqDist()
        for tokens in df[df[target_column] == pos_target_val][token_column]:
            for word in tokens.split():
                word_fd[word] += 1
                label_word_fd[self._positive_label][word] += 1
        for tokens in df[df[target_column] == neg_target_val][token_column]:
            for word in tokens.split():
                word_fd[word] += 1
                label_word_fd[self._negative_label][word] += 1

        pos_word_count = label_word_fd[self._positive_label].N()
        neg_word_count = label_word_fd[self._negative_label].N()
        total_word_count = pos_word_count + neg_word_count

        word_scores = {}
        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(
                label_word_fd[self._positive_label][word],
                (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(
                label_word_fd[self._negative_label][word],
                (freq, neg_word_count), total_word_count)
            word_scores[word] = pos_score + neg_score

        best_cnt = int(len(word_scores) * best_ratio)
        best = sorted(word_scores.items(), key=lambda item: item[1],
                      reverse=True)[:best_cnt]
        bestwords = set(w for w, s in best)

        if feature_fn_name == 'best_trigram':  # was 'best_trigram_word_feats', which never matched
            feat_fn = best_trigram_word_feats
        elif feature_fn_name == 'best_bigram':
            feat_fn = best_bigram_word_feats
        else:
            feat_fn = best_word_feats
    else:
        feat_fn = word_feats

    negfeats = [(feat_fn(feats[i].split()), self._negative_label) for i in negids]
    posfeats = [(feat_fn(feats[i].split()), self._positive_label) for i in posids]
    if verbose:
        print('No. of samples: {}, Pos: {}, Neg: {}'.format(
            len(feats), len(posfeats), len(negfeats)))

    negcutoff = int(len(negfeats) * train_ratio)
    poscutoff = int(len(posfeats) * train_ratio)
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = defaultdict(set)
    testsets = defaultdict(set)
    for i, (featset, label) in enumerate(testfeats):  # renamed from `feats` to avoid shadowing
        refsets[label].add(i)
        observed = classifier.classify(featset)
        testsets[observed].add(i)

    metrics = {
        'Accuracy': nltk.classify.util.accuracy(classifier, testfeats),
        'Pos precision': precision(refsets[self._positive_label], testsets[self._positive_label]),
        'Pos recall': recall(refsets[self._positive_label], testsets[self._positive_label]),
        'Neg precision': precision(refsets[self._negative_label], testsets[self._negative_label]),
        'Neg recall': recall(refsets[self._negative_label], testsets[self._negative_label]),
    }
    if verbose:
        print(metrics)
    return classifier, metrics
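A minimal harness for train_classifier above. The host class and its label attributes are assumed from the method body, the DataFrame is a toy just to exercise the code path, and the nltk/pandas imports used by the method are assumed to be in scope; with data this small some precision/recall entries may come back as None.

import pandas as pd

class SentimentTrainer:
    # minimal host for the method; attribute names assumed from its body
    _positive_label = 'pos'
    _negative_label = 'neg'

SentimentTrainer.train_classifier = train_classifier  # attach the function as a method

df = pd.DataFrame({
    'text': ['great movie loved it', 'awful plot terrible acting',
             'wonderful cast great fun', 'boring plot terrible waste'],
    'category': [1, -1, 1, -1],
})
clf, metrics = SentimentTrainer().train_classifier(df, feature_fn_name='word', train_ratio=0.5)
print(metrics)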
        # expand contractions
        for k in punctuation:
            l = l.replace(k, " ")
        l = Contractions.expandContractions(l)
        sentenceWords = nltk.word_tokenize(l)
        for word in sentenceWords:
            word_fd[word.lower()] += 1
            category_fd['neg'][word.lower()] += 1
        negatives.append(l)

pos_wordCnt = category_fd['pos'].N()
neg_wordCnt = category_fd['neg'].N()
total_wordCnt = pos_wordCnt + neg_wordCnt

word_scores = {}
for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(category_fd['pos'][word], (freq, pos_wordCnt), total_wordCnt)
    neg_score = BigramAssocMeasures.chi_sq(category_fd['neg'][word], (freq, neg_wordCnt), total_wordCnt)
    word_scores[word] = pos_score + neg_score

best = sorted(word_scores.items(), key=operator.itemgetter(1), reverse=True)[:10000]
bestWords = set(w for w, s in best)

posfeats = []
#positives = movie_reviews.fileids('pos')
for line in positives:
    lineWiki = TextBlob(line.lower())
    words = list(lineWiki.words)
    featset = word_features(words)
    tag = 'pos'
def cal_word_count():
    global train_word_id
    global pos_info
    global neg_info
    pos_info = []
    neg_info = []
    train_word_id = []
    word_fd = FreqDist()                  # frequency of every term
    cond_word_fd = ConditionalFreqDist()  # term frequencies within the positive and negative texts

    print('Loading POS>>>')
    line_num = 0
    with open(pos_file, 'r') as fin:
        for line in fin:
            line_num += 1
            if not line_num % 10000:
                print('LINE:%d' % line_num)
            items = line.split()
            tmp_col = []
            for item in items:
                item_id = term_to_id(item)
                word_fd[item_id] += 1
                cond_word_fd['pos'][item_id] += 1
                tmp_col.append(item_id)
            pos_info.append(tmp_col)

    print('Loading NEG>>>')
    line_num = 0
    with open(neg_file, 'r') as fin:
        for line in fin:
            line_num += 1
            if not line_num % 10000:
                print('LINE:%d' % line_num)
            items = line.split()
            tmp_col = []
            for item in items:
                item_id = term_to_id(item)
                word_fd[item_id] += 1
                cond_word_fd['neg'][item_id] += 1
                tmp_col.append(item_id)
            neg_info.append(tmp_col)

    print('Randomize>>>')
    shuffle(pos_info)
    shuffle(neg_info)

    pos_w_count = cond_word_fd['pos'].N()
    neg_w_count = cond_word_fd['neg'].N()
    total_w_count = pos_w_count + neg_w_count
    #print('pos_w_count=%d, neg_w_count=%d, total_w_count=%d' % (pos_w_count, neg_w_count, total_w_count))
    #print('word_fd_count=%d' % (word_fd.N()))

    # compute the chi-square statistic for every term id
    global word_scores
    word_scores = {}
    print("CALC CHI-SQUARE...")
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_w_count),
            total_w_count)  # chi-square on the positive side; other measures such as mutual information also work
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_w_count),
                                               total_w_count)  # likewise for the negative side
        word_scores[word] = pos_score + neg_score  # a term's informativeness = positive + negative chi-square
    del word_fd
    del cond_word_fd
    return
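cal_word_count relies on a term_to_id helper that is not shown here. A minimal sketch, under the assumption that it interns each distinct token as a stable integer id and mirrors the vocabulary in the global train_word_id list; both the dict name and this behavior are hypothetical.

term_dict = {}  # hypothetical intern table: token -> integer id

def term_to_id(term):
    # assign ids in order of first appearance; train_word_id mirrors the vocabulary
    if term not in term_dict:
        term_dict[term] = len(train_word_id)
        train_word_id.append(term)
    return term_dict[term]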
from nltk.util import ngrams
from nltk.corpus import alpino

print(alpino.words())
quadgrams = ngrams(alpino.words(), 4)  # 4-grams (the original misleadingly named this `unigrams`)
print(quadgrams)
# for i in quadgrams:
#     print(i)

from nltk.collocations import BigramCollocationFinder
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
stops_filter = lambda w: len(w) < 3 or w in stop_words  # words shorter than 3 characters, or stopwords
tokens = [t.lower() for t in webtext.words('grail.txt')]
words = BigramCollocationFinder.from_words(tokens)  # create the finder
print(words)
words.apply_word_filter(stops_filter)
res = words.nbest(BigramAssocMeasures.likelihood_ratio, 5)  # top 5 bigrams
print(res)

# generate bigrams with a collocation finder
import nltk
text1 = "Hardwork is the key to success. Never give up!"
word = nltk.wordpunct_tokenize(text1)
finder = BigramCollocationFinder.from_words(word)
bigram_measures = BigramAssocMeasures()
value = finder.score_ngrams(bigram_measures.raw_freq)
print(sorted(bigram for bigram, score in value))
def information_gain(dataset):
    frequenciaUnigrama = nltk.FreqDist()
    condicionalUnigrama = nltk.ConditionalFreqDist()
    frequenciaBigrama = nltk.FreqDist()
    condicionalBigrama = nltk.ConditionalFreqDist()

    data = dataset.data
    for frase in data[data.Classificacao == 'no']['Unigrama']:
        for word in frase:
            frequenciaUnigrama[word.lower()] += 1
            condicionalUnigrama['pos'][word.lower()] += 1
    for frase in data[data.Classificacao == 'no']['Bigrama']:
        for word in frase:
            frequenciaBigrama[word.lower()] += 1
            condicionalBigrama['pos'][word.lower()] += 1
    for frase in data[data.Classificacao == 'yes']['Unigrama']:
        for word in frase:
            frequenciaUnigrama[word.lower()] += 1
            condicionalUnigrama['neg'][word.lower()] += 1
    for frase in data[data.Classificacao == 'yes']['Bigrama']:
        for word in frase:
            frequenciaBigrama[word.lower()] += 1
            condicionalBigrama['neg'][word.lower()] += 1

    pos_word_count_unigrama = condicionalUnigrama['pos'].N()
    pos_word_count_bigrama = condicionalBigrama['pos'].N()
    neg_word_count_unigrama = condicionalUnigrama['neg'].N()
    neg_word_count_bigrama = condicionalBigrama['neg'].N()
    total_word_count_unigrama = pos_word_count_unigrama + neg_word_count_unigrama
    total_word_count_bigrama = pos_word_count_bigrama + neg_word_count_bigrama

    word_scores_unigrama = {}
    word_scores_bigrama = {}
    for word, freq in frequenciaUnigrama.items():
        pos_score = BigramAssocMeasures.chi_sq(condicionalUnigrama['pos'][word],
                                               (freq, pos_word_count_unigrama),
                                               total_word_count_unigrama)
        neg_score = BigramAssocMeasures.chi_sq(condicionalUnigrama['neg'][word],
                                               (freq, neg_word_count_unigrama),
                                               total_word_count_unigrama)
        word_scores_unigrama[word] = pos_score + neg_score
    for word, freq in frequenciaBigrama.items():
        pos_score = BigramAssocMeasures.chi_sq(condicionalBigrama['pos'][word],
                                               (freq, pos_word_count_bigrama),
                                               total_word_count_bigrama)
        neg_score = BigramAssocMeasures.chi_sq(condicionalBigrama['neg'][word],
                                               (freq, neg_word_count_bigrama),
                                               total_word_count_bigrama)
        word_scores_bigrama[word] = pos_score + neg_score

    if dataset.name == 'OffComBR3':
        tamUni = 122
        tamBig = 103
    elif dataset.name == 'OffComBR2':
        tamUni = 250
        tamBig = 426

    bestUnigrama = sorted(word_scores_unigrama.items(), key=lambda ws: ws[1], reverse=True)[:tamUni]
    bestBigrama = sorted(word_scores_bigrama.items(), key=lambda ws: ws[1], reverse=True)[:tamBig]
    dataset.dicUnigrama = [w for w, s in bestUnigrama]
    dataset.dicBigrama = [w for w, s in bestBigrama]
    dataset = extraiFeatures(dataset)
    return dataset
def train_and_test(reviews_pos, reviews_neg):
    """
    Train and test.
    :param reviews_pos: list of positive reviews
    :param reviews_neg: list of negative reviews
    :return:
    """
    # count frequencies over all words in the positive and in the negative reviews
    tot_poswords = [val for l in [r.words for r in reviews_pos] for val in l]
    tot_negwords = [val for l in [r.words for r in reviews_neg] for val in l]
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for word in tot_poswords:
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in tot_negwords:
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1
    pos_words = len(tot_poswords)
    neg_words = len(tot_negwords)
    tot_words = pos_words + neg_words

    # score per word
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_words), tot_words)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_words), tot_words)
        word_scores[word] = pos_score + neg_score
    print('total: ', len(word_scores))

    # keep only the 10000 highest-scoring words
    best = sorted(word_scores.items(), key=lambda args: args[1], reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    negfeatures = [(best_words_features(r.words, bestwords), 'neg') for r in reviews_neg]
    posfeatures = [(best_words_features(r.words, bestwords), 'pos') for r in reviews_pos]

    # split into 80% training and 20% test sets
    portionpos = int(len(posfeatures) * 0.8)
    portionneg = int(len(negfeatures) * 0.8)
    print(portionpos, '-', portionneg)
    # the original sliced negfeatures by portionpos and posfeatures by portionneg; the cutoffs belong the other way around
    trainfeatures = negfeatures[:portionneg] + posfeatures[:portionpos]
    print(len(trainfeatures))

    # train
    classifier = NaiveBayesClassifier.train(trainfeatures)

    # test
    testfeatures = negfeatures[portionneg:] + posfeatures[portionpos:]
    shuffle(testfeatures)
    err = 0
    print('test on: ', len(testfeatures))
    for r in testfeatures:
        sent = classifier.classify(r[0])
        # print(r[1], '-pred: ', sent)
        if sent != r[1]:
            err += 1.
    print('error rate: ', err / float(len(testfeatures)))
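A toy driver for train_and_test above. The Review namedtuple and the best_words_features helper are hypothetical stand-ins: the function only requires objects with a .words attribute and a helper it references but does not define, which is assumed here to mirror the best_word_feats pattern seen elsewhere in this collection. The nltk imports and `from random import shuffle` used by the function are assumed to be in scope.

from collections import namedtuple

Review = namedtuple('Review', 'words')  # toy stand-in for the review objects used above

def best_words_features(words, bestwords):
    # assumed to mirror the usual best-words feature dict
    return {word: True for word in words if word in bestwords}

pos = [Review('a truly great and moving film'.split()) for _ in range(10)]
neg = [Review('a dull and terrible waste of time'.split()) for _ in range(10)]
train_and_test(pos, neg)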