def build_topn_best_words(self):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    positivecount = 0
    negativecount = 0
    with open(r'..\polarityData\TweetCorpus\training.1600000.processed.noemoticon.csv', 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            # Positive sentiment tweets
            if row[0] == '4' and positivecount < self.corpuslength:
                tweet = row[5]
                tokens = WhitespaceTokenizer().tokenize(tweet)
                #print tweet
                for token in tokens:
                    word_fd.inc(token.lower())
                    label_word_fd['pos'].inc(token.lower())
                positivecount += 1
            # Negative sentiment tweets
            if row[0] == '0' and negativecount < self.corpuslength:
                tweet = row[5]
                tokens = WhitespaceTokenizer().tokenize(tweet)
                #print tweet
                for token in tokens:
                    word_fd.inc(token.lower())
                    label_word_fd['neg'].inc(token.lower())
                negativecount += 1

    #print word_fd
    #print label_word_fd
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print "Positive Word Count:", pos_word_count, "Negative Word Count:", neg_word_count, "Total Word Count:", total_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:10000]
    self.bestwords = set([w for w, s in best])
    print 'Best Words Count:', len(self.bestwords)  #, 'Best Words Set:', self.bestwords
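# In the example above, chi_sq is repurposed for feature selection: n_ii is the
# word's frequency under one label, the middle pair is (overall word frequency,
# total words under that label), and n_xx is the total word count. A minimal
# standalone sketch of that call, with made-up toy counts (the word and all
# numbers are illustrative, not from the source):
from nltk.metrics import BigramAssocMeasures

# Hypothetical: 'great' occurs 8 times in positive tweets and 10 times overall;
# the positive half of the corpus holds 50 of the 100 total tokens.
score = BigramAssocMeasures.chi_sq(8, (10, 50), 100)
print(score)  # ~4.0 here; a higher score means a stronger label association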
def create_word_scores():
    # Build lists of all positive and negative words
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    # Flatten the nested lists into flat lists
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    # Build a frequency distribution over all words, then frequency
    # distributions within the positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    # Find the number of positive and negative words, and the total word count
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # Build a dictionary of word scores based on the chi-squared test
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
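# create_word_scores only returns the score dictionary; callers typically pair
# it with a helper that keeps the top-scoring words as the feature vocabulary,
# mirroring the sorted(...)[:10000] pattern in the other examples here. A
# possible companion (the name find_best_words and the default cutoff are
# illustrative, not from the source):
def find_best_words(word_scores, number=10000):
    # Keep the `number` highest-scoring words as the feature set
    best_vals = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:number]
    return set([w for w, s in best_vals])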
            # (tail of the update_wordcount helper; the snippet starts mid-function)
            label_word_fd[label].inc(word)
    handle.seek(0)

word_fd = nltk.probability.FreqDist()
label_word_fd = nltk.probability.ConditionalFreqDist()
update_wordcount(word_fd, label_word_fd, smilefile, POSITIVE)
update_wordcount(word_fd, label_word_fd, frownfile, NEGATIVE)

pos_word_count = label_word_fd[POSITIVE].N()
neg_word_count = label_word_fd[NEGATIVE].N()
total_word_count = pos_word_count + neg_word_count

print "Finding top words"
word_scores = {}
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd[POSITIVE][word],
                                           (freq, pos_word_count),
                                           total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd[NEGATIVE][word],
                                           (freq, neg_word_count),
                                           total_word_count)
    word_scores[word] = pos_score + neg_score

best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:10000]
bestwords = set([w for w, s in best])
print "Best words"
#print bestwords

def best_word_feats(words):
    return dict([(word, True) for word in words if word in bestwords])

posfeats = features(best_word_feats, smilefile, POSITIVE)
negfeats = features(best_word_feats, frownfile, NEGATIVE)
classifier = nltk.NaiveBayesClassifier.train(posfeats + negfeats)
save_classifier(classifier)
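# Once trained, the classifier is applied by running new, tokenized text
# through the same feature extractor. A minimal usage sketch (the sample
# sentence and its whitespace tokenization are assumptions; the `features`
# helper used above is not shown in this example):
tokens = "what a great happy day".lower().split()
print(classifier.classify(best_word_feats(tokens)))  # -> POSITIVE or NEGATIVE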
def count_statistics(candidates, bigram_corpus_size, trigram_corpus_size):
    """Build contingency tables and compute association measures
    for the bigram and trigram candidates."""
    print('=== Counting association measures ===')

    # Getting word frequencies
    word_counts = {}
    for word in candidates:
        i = 0
        for linkage in candidates[word]:
            for obj in candidates[word][linkage]:
                i += obj.abs_freq
                if not obj.third_word:
                    word_counts[obj.first_word + '_' + obj.second_word] = obj.abs_freq
        word_counts[word] = i

    # Getting frequencies for a contingency table
    for word in candidates:
        for linkage in candidates[word]:
            for obj in candidates[word][linkage]:
                # Contingency tables for trigrams
                if obj.third_word:
                    n_iii = obj.abs_freq                 # counts (w1, w2, w3)
                    n_ixx = word_counts[obj.first_word]  # counts (w1,  ,  )
                    n_xix = word_counts[obj.second_word] # counts ( , w2,  )
                    n_xxi = word_counts[obj.third_word]  # counts ( ,  , w3)
                    # Pairwise counts, defaulting to 0 when the pair was not seen
                    n_iix = word_counts.get(obj.first_word + '_' + obj.second_word, 0)
                    n_ixi = word_counts.get(obj.first_word + '_' + obj.third_word, 0)
                    n_xii = word_counts.get(obj.second_word + '_' + obj.third_word, 0)
                    n_xxx = trigram_corpus_size          # counts any trigram

                    # Computing association measures for trigrams
                    obj.dice = 3 * float(n_iii) / float(n_ixx + n_xix + n_xxi)
                    obj.chi = TrigramAssocMeasures.chi_sq(
                        n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_xxx)
                    obj.jaccard = TrigramAssocMeasures.jaccard(
                        n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_xxx)
                    # obj.likelihood_ratio = TrigramAssocMeasures.likelihood_ratio(
                    #     n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_xxx)
                    obj.mi = TrigramAssocMeasures.mi_like(
                        n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_xxx)
                    obj.pmi = TrigramAssocMeasures.pmi(
                        n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_xxx)
                    obj.poisson_stirling = TrigramAssocMeasures.poisson_stirling(
                        n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_xxx)
                    obj.t_score = TrigramAssocMeasures.student_t(
                        n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_xxx)

                # Contingency tables for bigrams
                else:
                    n_ii = obj.abs_freq                  # counts (w1, w2)
                    n_ix = word_counts[obj.first_word]   # counts (w1,  )
                    n_xi = word_counts[obj.second_word]  # counts ( , w2)
                    n_xx = bigram_corpus_size            # counts any bigram

                    # Computing association measures for bigrams
                    obj.dice = BigramAssocMeasures.dice(n_ii, (n_ix, n_xi), n_xx)
                    obj.chi = BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx)
                    obj.t_score = BigramAssocMeasures.student_t(n_ii, (n_ix, n_xi), n_xx)
                    obj.poisson_stirling = BigramAssocMeasures.poisson_stirling(n_ii, (n_ix, n_xi), n_xx)
                    obj.pmi = BigramAssocMeasures.pmi(n_ii, (n_ix, n_xi), n_xx)
                    obj.mi = BigramAssocMeasures.mi_like(n_ii, (n_ix, n_xi), n_xx)
                    obj.likelihood_ratio = BigramAssocMeasures.likelihood_ratio(n_ii, (n_ix, n_xi), n_xx)
                    obj.jaccard = BigramAssocMeasures.jaccard(n_ii, (n_ix, n_xi), n_xx)
                    obj.fisher = BigramAssocMeasures.fisher(n_ii, (n_ix, n_xi), n_xx)
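# The trigram calls above follow NLTK's marginals convention: the joint count
# n_iii, then the three pairwise counts, then the three single-word counts,
# then the corpus size. A toy call with hypothetical counts (all values are
# illustrative only):
from nltk.metrics import TrigramAssocMeasures

n_iii = 5                         # ('new', 'york', 'city') seen together
n_iix, n_ixi, n_xii = 7, 5, 6     # pair counts: (w1 w2), (w1 w3), (w2 w3)
n_ixx, n_xix, n_xxi = 20, 15, 10  # single-word counts for w1, w2, w3
n_xxx = 1000                      # total trigrams in the corpus
print(TrigramAssocMeasures.chi_sq(n_iii, (n_iix, n_ixi, n_xii),
                                  (n_ixx, n_xix, n_xxi), n_xxx))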