def create_word_bigram_scores(pos_path="/home/hadoop/goodnew.txt",
                              neg_path="/home/hadoop/badnew.txt",
                              bigram_count=350000):
    """Score unigrams and top bigrams by chi-square association with a class.

    Both corpora are read with ``tp.seg_fil_txt`` and flattened into one
    token list per class. For each class, the ``bigram_count`` best bigrams
    (ranked by chi-square) are appended to that class's token list, so the
    resulting feature lists mix single tokens with bigram tuples.

    Args:
        pos_path: File passed to ``tp.seg_fil_txt`` for the positive corpus.
        neg_path: File passed to ``tp.seg_fil_txt`` for the negative corpus.
        bigram_count: Maximum number of bigrams kept per class.

    Returns:
        A dict mapping every feature (token or bigram tuple) to the sum of
        its chi-square score against the "pos" class and against the "neg"
        class.
    """
    # seg_fil_txt yields an iterable of token sequences (presumably one per
    # document -- confirm against tp); from_iterable flattens it without
    # unpacking the whole corpus into call arguments.
    pos_words = list(itertools.chain.from_iterable(tp.seg_fil_txt(pos_path)))
    neg_words = list(itertools.chain.from_iterable(tp.seg_fil_txt(neg_path)))

    pos_finder = BigramCollocationFinder.from_words(pos_words)
    neg_finder = BigramCollocationFinder.from_words(neg_words)
    pos_bigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, bigram_count)
    neg_bigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, bigram_count)

    pos_features = pos_words + pos_bigrams
    neg_features = neg_words + neg_bigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # ``fd[x] += 1`` instead of ``fd.inc(x)``: FreqDist.inc() no longer
    # exists in NLTK 3, while item assignment works on old and new versions.
    for label, features in (("pos", pos_features), ("neg", neg_features)):
        label_fd = cond_word_fd[label]
        for feature in features:
            word_fd[feature] += 1
            label_fd[feature] += 1

    pos_total = cond_word_fd["pos"].N()
    neg_total = cond_word_fd["neg"].N()
    grand_total = pos_total + neg_total

    word_scores = {}
    # .items() (not .iteritems()) so the loop also runs on Python 3.
    for feature, freq in word_fd.items():
        # chi_sq(n_ii, (n_ix, n_xi), n_xx): count inside the class, overall
        # count of the feature, size of the class, size of both classes.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd["pos"][feature], (freq, pos_total), grand_total)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd["neg"][feature], (freq, neg_total), grand_total)
        word_scores[feature] = pos_score + neg_score

    return word_scores
def create_word_scores(pos_path="/home/hadoop/goodnew.txt",
                       neg_path="/home/hadoop/badnew.txt"):
    """Score every token by its chi-square association with a class.

    Both corpora are read with ``tp.seg_fil_txt`` and flattened into one
    token list per class; token frequencies are then counted overall and
    per class ("pos" / "neg").

    Args:
        pos_path: File passed to ``tp.seg_fil_txt`` for the positive corpus.
        neg_path: File passed to ``tp.seg_fil_txt`` for the negative corpus.

    Returns:
        A dict mapping each token to the sum of its chi-square score against
        the "pos" class and against the "neg" class.
    """
    # seg_fil_txt yields an iterable of token sequences (presumably one per
    # document -- confirm against tp); from_iterable flattens it without
    # unpacking the whole corpus into call arguments.
    pos_words = list(itertools.chain.from_iterable(tp.seg_fil_txt(pos_path)))
    neg_words = list(itertools.chain.from_iterable(tp.seg_fil_txt(neg_path)))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # ``fd[x] += 1`` instead of ``fd.inc(x)``: FreqDist.inc() no longer
    # exists in NLTK 3, while item assignment works on old and new versions.
    for label, words in (("pos", pos_words), ("neg", neg_words)):
        label_fd = cond_word_fd[label]
        for word in words:
            word_fd[word] += 1
            label_fd[word] += 1

    pos_total = cond_word_fd["pos"].N()
    neg_total = cond_word_fd["neg"].N()
    grand_total = pos_total + neg_total

    word_scores = {}
    # .items() (not .iteritems()) so the loop also runs on Python 3.
    for word, freq in word_fd.items():
        # chi_sq(n_ii, (n_ix, n_xi), n_xx): count inside the class, overall
        # count of the token, size of the class, size of both classes.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd["pos"][word], (freq, pos_total), grand_total)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd["neg"][word], (freq, neg_total), grand_total)
        word_scores[word] = pos_score + neg_score

    return word_scores