예제 #1
0
def create_word_scores(sentences):
    # logging.info(sentences)
    words = list(itertools.chain(*sentences))
    # logging.info(words)

    #build frequency distibution of all words and then frequency distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in words:
        word_fd.inc(word.lower())
        cond_word_fd['pos'].inc(word.lower())
        cond_word_fd['neg'].inc(word.lower())
        
    #finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    #builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
예제 #2
0
def create_word_bigram_scores():
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    for word in pos:
        word_fd.inc(word)
        last_word['pos'].inc(word)
    for word in neg:
        word_fd.inc(word)
        last_word['neg'].inc(word)

    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_scores():
    # posdata是list类型,长度1084,表中每个元素都是一个list,如元素:[u'\u7535\u6c60', u'\u4e0d\u7ed9\u529b', u'\u90fd'],
    # 是每条评论的分词,如[电池 不给力 都 很 好 老婆 买 带 16G 卡 一张]
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    objWords = list(itertools.chain(*objdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()

    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    for word in objWords:
        word_fd[word] += 1
        cond_word_fd['obj'][word] += 1

    pos_word_count = cond_word_fd['pos'].N() # N()计算出现过的次数总和,可以理解为所有pos类型的词出现的次数总和
    neg_word_count = cond_word_fd['neg'].N()
    obj_word_count = cond_word_fd['obj'].N()
    total_word_count = pos_word_count + neg_word_count + obj_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word], (freq, obj_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + obj_score

    return word_scores
def create_word_scores(posWords, negWords, score_method = BigramAssocMeasures.chi_sq):
    '''
    以单独一个词来统计词的信息量
    '''
    word_fd = FreqDist() #可统计所有词的词频
    cond_word_fd = ConditionalFreqDist() #可统计积极文本中的词频和消极文本中的词频
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N() #积极评论中词的数量
    neg_word_count = cond_word_fd['neg'].N() #消极评论中词的数量
    total_word_count = pos_word_count + neg_word_count
    print("IN_POSWORD_NUMS : %d\tIN_NEGWORD_NUMS : %d" % (pos_word_count, neg_word_count))

    #默认使用卡方统计量,这里也可以计算互信息等其它统计量
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = score_method(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score #一个词的信息量等于积极卡方统计量加上消极卡方统计量
    return word_scores #键值对-词:词的信息量
예제 #5
0
def create_word_scores():
    posWords = pickle.load(open(POSFILE, 'rb'))
    negWords = pickle.load(open(NEGFILE, 'rb'))

    posWords = list(itertools.chain(*posWords))  # 把多维数组解链成一维数组
    negWords = list(itertools.chain(*negWords))  # 同理

    word_fd = FreqDist()  # 可统计所有词的词频
    cond_word_fd = ConditionalFreqDist()  # 可统计积极文本中的词频和消极文本中的词频
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # 积极词的数量
    neg_word_count = cond_word_fd['neg'].N()  # 消极词的数量
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count),
            total_word_count)  # 计算积极词的卡方统计量,这里也可以计算互信息等其它统计量
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)  # 同理
        word_scores[word] = pos_score + neg_score  # 一个词的信息量等于积极卡方统计量加上消极卡方统计量

    return word_scores  # 包括了每个词和这个词的信息量
def create_word_scores():
    posWords = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl','r'))
    negWords = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl','r'))
    
    posWords = list(itertools.chain(*posWords)) #把多维数组解链成一维数组
    negWords = list(itertools.chain(*negWords)) #同理

    word_fd = FreqDist() #可统计所有词的词频
    cond_word_fd = ConditionalFreqDist() #可统计积极文本中的词频和消极文本中的词频
    for word in posWords:
        #help(FreqDist)
        word_fd[word] += 1#word_fd.inc(word)
        cond_word_fd['pos'][word]+= 1#cond_word_fd['pos'].inc(word)
    for word in negWords:
        word_fd[word] += 1#word_fd.inc(word)
        cond_word_fd['neg'][word]+= 1#cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd['pos'].N() #积极词的数量
    neg_word_count = cond_word_fd['neg'].N() #消极词的数量
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count) #计算积极词的卡方统计量,这里也可以计算互信息等其它统计量
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count) #同理
        word_scores[word] = pos_score + neg_score #一个词的信息量等于积极卡方统计量加上消极卡方统计量

    return word_scores #包括了每个词和这个词的信息量
예제 #7
0
    def getBestwords(self):
        sentences = self.getTrainedData(self.trainingData)
        word_fd = FreqDist()
        label_word_fd = ConditionalFreqDist()

        for words in sentences:
            for word in words:
                word_fd[word.lower()] += 1
                label_word_fd['best'][word.lower()] += 1

        word_count = label_word_fd['best'].N()
        total_word_count = word_count + word_count

        word_scores = {}

        for word, freq in word_fd.iteritems():
            score = BigramAssocMeasures.chi_sq(label_word_fd['best'][word],
                                               (freq, word_count),
                                               total_word_count)
            word_scores[word] = score

        self.best = []
        self.best = sorted(word_scores.iteritems(),
                           key=lambda (w, s): s,
                           reverse=True)[:5000]
        self.bestwords = set([w for w, s in self.best])
        return self.best
예제 #8
0
def create_word_scores(posWords, negWords, posTag, negTag):
    from nltk.probability import FreqDist, ConditionalFreqDist
    import itertools
    posWords = list(itertools.chain(*posWords))  #把多维数组解链成一维数组
    negWords = list(itertools.chain(*negWords))  #同理

    word_fd = FreqDist()  #可统计所有词的词频
    cond_word_fd = ConditionalFreqDist()  #可统计积极文本中的词频和消极文本中的词频
    for word in posWords:
        #help(FreqDist)
        word_fd[word] += 1  #word_fd.inc(word)
        cond_word_fd[posTag][word] += 1  #cond_word_fd['pos'].inc(word)
    for word in negWords:
        word_fd[word] += 1  #word_fd.inc(word)
        cond_word_fd[negTag][word] += 1  #cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd[posTag].N()  #积极词的数量
    neg_word_count = cond_word_fd[negTag].N()  #消极词的数量
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd[posTag][word], (freq, pos_word_count),
            total_word_count)  #计算积极词的卡方统计量,这里也可以计算互信息等其它统计量
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[negTag][word],
                                               (freq, neg_word_count),
                                               total_word_count)  #同理
        word_scores[word] = pos_score + neg_score  #一个词的信息量等于积极卡方统计量加上消极卡方统计量

    return word_scores  #包括了每个词和这个词的信息量
def create_word_scores(posWords,negWords,posTag,negTag):
    from nltk.probability import FreqDist, ConditionalFreqDist
    import itertools 
    posWords = list(itertools.chain(*posWords)) #把多维数组解链成一维数组
    negWords = list(itertools.chain(*negWords)) #同理

    word_fd = FreqDist() #可统计所有词的词频
    cond_word_fd = ConditionalFreqDist() #可统计积极文本中的词频和消极文本中的词频
    for word in posWords:
        #help(FreqDist)
        word_fd[word] += 1#word_fd.inc(word)
        cond_word_fd[posTag][word]+= 1#cond_word_fd['pos'].inc(word)
    for word in negWords:
        word_fd[word] += 1#word_fd.inc(word)
        cond_word_fd[negTag][word]+= 1#cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd[posTag].N() #积极词的数量
    neg_word_count = cond_word_fd[negTag].N() #消极词的数量
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count) #计算积极词的卡方统计量,这里也可以计算互信息等其它统计量
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count) #同理
        word_scores[word] = pos_score + neg_score #一个词的信息量等于积极卡方统计量加上消极卡方统计量

    return word_scores #包括了每个词和这个词的信息量
def create_word_bigram_scores(posWords, negWords):
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    pos = posWords + posBigrams #词和双词搭配
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[str(word)] += 1 
        cond_word_fd['pos'][str(word)] += 1
    for word in neg:
	    word_fd[str(word)] += 1
	    cond_word_fd['neg'][str(word)] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores(posWords, negWords, score_method = BigramAssocMeasures.chi_sq):
    '''
    以双词来统计词的信息量
    '''
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(score_method, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(score_method, 5000)
    pos = posWords + posBigrams #词和双词搭配
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("BIGRAM_IN_POSWORD_NUMS : %d\tBIGRAM_IN_NEGWORD_NUMS : %d" % (pos_word_count, neg_word_count))

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = score_method(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
예제 #12
0
def setup():
    global bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in movie_reviews.words(categories=['pos']):
        word_fd.inc(word.strip('\'"?,.').lower())
        label_word_fd['pos'].inc(word.lower())

    for word in movie_reviews.words(categories=['neg']):
        word_fd.inc(word.strip('\'"?,.').lower())
        label_word_fd['neg'].inc(word.lower())

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.iteritems(), key=lambda (w, s): s,
                  reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return train(best_bigram_word_features)
	def create_word_scores(self):
		[posWords, negWords] = self.getAllWords()
		
		posWords = list(itertools.chain(*posWords))
		negWords = list(itertools.chain(*negWords))

		word_fd = FreqDist()
		cond_word_fd = ConditionalFreqDist()
		for word in posWords:
			word_fd.inc(word)
			cond_word_fd['pos'].inc(word)
		for word in negWords:
			word_fd.inc(word)
			cond_word_fd['neg'].inc(word)

		pos_word_count = cond_word_fd['pos'].N()
		neg_word_count = cond_word_fd['neg'].N()
		total_word_count = pos_word_count + neg_word_count

		log("Total number of words: %d" % total_word_count)

		word_scores = {}
		for word, freq in word_fd.iteritems():
			pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
			neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
			word_scores[word] = pos_score + neg_score

		return word_scores
def create_word_bigram_scores():
    posdata = tp.seg_fil_txt("/home/hadoop/goodnew.txt")
    negdata = tp.seg_fil_txt("/home/hadoop/badnew.txt")
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finderr = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finderr.nbest(BigramAssocMeasures.chi_sq,350000)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq,350000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd.inc(word)
        cond_word_fd['pos'].inc(word)
    for word in neg:
        word_fd.inc(word)
        cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def get_bestwords(contents, labels, limit = 10000, n = None, cache = True):
    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            if os.path.exists(cache_path):
                bestwords = pickle.load(open(cache_path, 'r'))
                print 'Loaded from cache'
                print 'bestwords count = %d' % (len(bestwords))
                return bestwords
    
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    
    pos_contents = contents[labels == 1]
    neg_contents = contents[labels != 0]
    
    pos_words = set()
    neg_words = set()
    
    for pos_content in pos_contents:
        pos_words = pos_words.union(word_tokenize(pos_content))
    
    for neg_content in neg_contents:
        neg_words = neg_words.union(word_tokenize(neg_content))
    
    for word in pos_words:
        word_fd.inc(word.lower())
        label_word_fd['pos'].inc(word.lower())
    
    for word in neg_words:
        word_fd.inc(word.lower())
        label_word_fd['neg'].inc(word.lower())
    
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    
    word_scores = {}
    
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
            (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
            (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    
    best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:limit]
    bestwords = set([w for w, s in best])
    
    print 'all words count = %d' % (len(word_scores))
    print 'bestwords count = %d' % (len(bestwords))
    
    if cache:
        if n:
            cache_path = 'cache/%s_%s.pkl' % (limit, n)
            f = open(cache_path, 'w')
            pickle.dump(bestwords, f)
            print 'Dumped to cache'
    
    return bestwords
예제 #16
0
def getWordScores():
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
예제 #17
0
파일: ranking.py 프로젝트: zzx88991/mocs
def tfidf(phrase_lists, corpus=nltk.corpus.brown.words(), ngram_range=(1, 6)):
    ranker = CorpusRanker(corpus, ngram_range)
    phrase_frequencies = FreqDist(tuple(p) for p in phrase_lists)
    phrase_scores = {}
    for phrase, freq in phrase_frequencies.iteritems():
        phrase_scores[phrase] = ranker.score(phrase, freq)
    return phrase_scores, phrase_frequencies
예제 #18
0
파일: bill_bayes.py 프로젝트: PamelaM/billy
def most_informative_words(corpus, categories=['dem', 'rep'], count=2500):
    fd = FreqDist()
    cond_fd = ConditionalFreqDist()
    word_counts = {}

    for cat in categories:
        for word in corpus.words(categories=[cat]):
            word = word.lower().strip(".!?:,/ ")
            if not word.isalpha() or word in stopset:
                continue
            fd.inc(word)
            cond_fd[cat].inc(word)

        word_counts[cat] = cond_fd[cat].N()

    total_word_count = sum(word_counts.values())

    word_scores = collections.defaultdict(int)
    for word, freq in fd.iteritems():
        for cat in categories:
            cat_word_score = BigramAssocMeasures.chi_sq(
                cond_fd[cat][word],
                (freq, word_counts[cat]),
                total_word_count)
            word_scores[word] += cat_word_score

    informative_words = sorted(word_scores.iteritems(),
                               key=lambda (w, s): s,
                               reverse=True)[:count]
    return set([w for w, s in informative_words])
예제 #19
0
def create_word_scores():
    posNegDir = 'D:/ReviewHelpfulnessPrediction\FeatureExtractionModule\SentimentFeature\MachineLearningFeature\SenimentReviewSet'
    posdata = tp.seg_fil_senti_excel(posNegDir + '/pos_review.xlsx', 1, 1,
                                        'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')
    negdata = tp.seg_fil_senti_excel(posNegDir + '/neg_review.xlsx', 1, 1,
                                        'D:/ReviewHelpfulnessPrediction/PreprocessingModule/sentiment_stopword.txt')

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
예제 #20
0
파일: ranking.py 프로젝트: zzx88991/mocs
def tfidf(phrase_lists, corpus=nltk.corpus.brown.words(), ngram_range=(1, 6)):
    ranker = CorpusRanker(corpus, ngram_range)
    phrase_frequencies = FreqDist(tuple(p) for p in phrase_lists)
    phrase_scores = {}
    for phrase, freq in phrase_frequencies.iteritems():
        phrase_scores[phrase] = ranker.score(phrase, freq)
    return phrase_scores, phrase_frequencies
예제 #21
0
def create_word_scores(sentences):
    # logging.info(sentences)
    words = list(itertools.chain(*sentences))
    # logging.info(words)

    #build frequency distibution of all words and then frequency distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in words:
        word_fd.inc(word.lower())
        cond_word_fd['pos'].inc(word.lower())
        cond_word_fd['neg'].inc(word.lower())

    #finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    #builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("./Machine-learning-features/seniment review set/neg_review.xlsx", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word]+=1
        cond_word_fd['pos'][word]+=1

    for word in neg:
        word_fd[word]+=1
        cond_word_fd['neg'][word]+=1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
예제 #23
0
    def __setTermsCHISQUARE__(self, size):
        word_fd = FreqDist()
        label_word_fd = ConditionalFreqDist()

        for word in self.reader.words(categories=['pos']):
            word_fd.inc(word.lower())
            label_word_fd['pos'].inc(word.lower())

        for word in self.reader.words(categories=['neg']):
            word_fd.inc(word.lower())
            label_word_fd['neg'].inc(word.lower())

        pos_word_count = label_word_fd['pos'].N()
        neg_word_count = label_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        wordScores = {}

        for word, freq in word_fd.iteritems():
            pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                                   (freq, pos_word_count),
                                                   total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                                   (freq, neg_word_count),
                                                   total_word_count)
            wordScores[word] = pos_score + neg_score

        termScore = sorted(wordScores.items(),
                           key=lambda (w, s): s,
                           reverse=True)[:size]
        self.terms = [w for (w, s) in termScore]
예제 #24
0
    def _get_bigram_scores(self, posdata, negdata):
        pos_words = list(itertools.chain(*posdata))
        neg_words = list(itertools.chain(*negdata))

        pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
        neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
        pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
        neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

        pos = pos_words + pos_bigrams
        neg = neg_words + neg_bigrams

        word_fd = FreqDist()
        cond_word_fd = ConditionalFreqDist()
        for word in pos:
            word_fd[word] += 1
            cond_word_fd['pos'][word] += 1
        for word in neg:
            word_fd[word] += 1
            cond_word_fd['neg'][word] += 1

        pos_word_count = cond_word_fd['pos'].N()
        neg_word_count = cond_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        word_scores = {}
        for word, freq in word_fd.iteritems():
            pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
            word_scores[word] = pos_score + neg_score

        return word_scores
예제 #25
0
def create_word_bigram_scores():
    posdata = get_word('static/pos.txt')
    negdata = get_word('static/neg.txt')
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))
    posbigram_finder = BigramCollocationFinder.from_words(posWords)
    negbigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = posbigram_finder.nbest(BigramAssocMeasures.chi_sq, number)
    negBigrams = negbigram_finder.nbest(BigramAssocMeasures.chi_sq, number)
    pos = posWords + posBigrams  #词和双词搭配
    neg = negWords + negBigrams
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores():
    posdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx", 1, 1
    )
    negdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx", 1, 1
    )

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd["pos"][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd["neg"][word] += 1

    pos_word_count = cond_word_fd["pos"].N()
    neg_word_count = cond_word_fd["neg"].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd["pos"][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd["neg"][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
    def clean_train_data_and_find_best_features(self):
        #Top n best unigram features are selected
        freq_dist_obj = FreqDist()
        cond_freq_dist_obj = ConditionalFreqDist()
        self.book_category_set = set() 

        for instance in self.book_instances:
            try:
                raw_data = instance and instance.strip() and instance.strip().split("\t") 
                if not raw_data or len(raw_data) != 4 : continue  
                bookid  = raw_data[0]
                self.book_category_set.add(bookid)
                features = []
                features.extend(self.clean_book_title(raw_data[2]))
                features.extend(self.clean_author_name(raw_data[3]))
                features.extend(self.bookid_to_toc_dict.get(raw_data[1], []))
                for feat in features:
                    freq_dist_obj.inc(feat)
                    cond_freq_dist_obj[bookid].inc(feat)
            except:
                self.logging.info("Exception while running this instance %s \n" % instance)
                
        total_word_count = 0    
        for bookid in self.book_category_set:
            total_word_count += cond_freq_dist_obj[bookid].N()

        word_score_dict = {}
        for word, freq in freq_dist_obj.iteritems():
            score = 0
            if word and word.lower() in self.stopwords_set:continue
            for bookid in self.book_category_set:
                score += BigramAssocMeasures.chi_sq(cond_freq_dist_obj[bookid][word], (freq, cond_freq_dist_obj[bookid].N()), total_word_count)
            word_score_dict[word] = score
        self.select_top_n_best_features(word_score_dict)
def setup():
    global bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in movie_reviews.words(categories=['pos']):
        word_fd.inc(word.strip('\'"?,.').lower())
        label_word_fd['pos'].inc(word.lower())

    for word in movie_reviews.words(categories=['neg']):
        word_fd.inc(word.strip('\'"?,.').lower())
        label_word_fd['neg'].inc(word.lower())

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
            (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
            (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return train(best_bigram_word_features)
def create_bigram_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    pos = posBigrams
    neg = negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd.inc(word)
        cond_word_fd['pos'].inc(word)
    for word in neg:
        word_fd.inc(word)
        cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
예제 #30
0
def get_best_words(words_list, num_best_words):
	from nltk.probability import FreqDist, ConditionalFreqDist
	from nltk.metrics import BigramAssocMeasures


	word_fd = FreqDist()
	label_word_fd = ConditionalFreqDist()

	for pair in words_list:
		line,sent = pair
		for word in nltk.word_tokenize(line):
			word_fd.inc(word.lower())
			label_word_fd[sent].inc(word.lower())

	pos_word_count = label_word_fd['pos'].N()
	neg_word_count = label_word_fd['neg'].N()
	total_word_count = pos_word_count + neg_word_count


	word_scores = {}
	for word, freq in word_fd.iteritems():
		pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],(freq, pos_word_count),total_word_count)
		neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],(freq, neg_word_count),total_word_count)
		word_scores[word] = pos_score + neg_score
 
	best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:num_best_words]
	bestwords = set([w for w, s in best])

	return bestwords
예제 #31
0
def create_word_bigram_scores(pos_corpus, neg_corpus):

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for corpus in pos_corpus:
        for word in corpus:
            word_fd[word] += 1
            cond_word_fd['pos'][word] += 1
    for corpus in neg_corpus:
        for word in corpus:
            word_fd[word] += 1
            cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
예제 #32
0
 def best_word_feats(self, words):
     word_fd = FreqDist()
     label_word_fd = ConditionalFreqDist()
      
     for word in movie_reviews.words(categories=['pos']):
         word_fd.inc(word.lower())
         label_word_fd['pos'].inc(word.lower())
      
     for word in movie_reviews.words(categories=['neg']):
         word_fd.inc(word.lower())
         label_word_fd['neg'].inc(word.lower())
      
     # n_ii = label_word_fd[label][word]
     # n_ix = word_fd[word]
     # n_xi = label_word_fd[label].N()
     # n_xx = label_word_fd.N()
      
     pos_word_count = label_word_fd['pos'].N()
     neg_word_count = label_word_fd['neg'].N()
     total_word_count = pos_word_count + neg_word_count
      
     word_scores = {}
      
     for word, freq in word_fd.iteritems():
         pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
             (freq, pos_word_count), total_word_count)
         neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
             (freq, neg_word_count), total_word_count)
         word_scores[word] = pos_score + neg_score
      
     best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
     bestwords = set([w for w, s in best])
     return dict([(word, True) for word in words if word in bestwords])
예제 #33
0
    def Do_alpha(self):
        """The observed disagreement for the alpha coefficient.

        The alpha coefficient, unlike the other metrics, uses this rather than
        observed agreement.
        """
        total = 0.0
        for i, itemdata in self._grouped_data('item'):
            label_freqs = FreqDist(x['labels'] for x in itemdata)

            for j, nj in label_freqs.iteritems():
                for l, nl in label_freqs.iteritems():
                    total += float(nj * nl) * self.distance(l, j)
        ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
        log.debug("Observed disagreement: %f", ret)
        return ret
def create_word_scores(posWords, negWords):
    file_scores = file("cn_sample_data/scores.txt", "w")
    #迭代,将多个序列合并
    
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[str(word)] += 1 
        cond_word_fd['pos'][str(word)] += 1
    for word in negWords:
	    word_fd[str(word)] += 1
	    cond_word_fd['neg'][str(word)] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][str(word)], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][str(word)], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    sorted(word_scores.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
    for key in word_scores:
        file_scores.write(str(key)+" : " + str(word_scores[str(key)])+ "\n")
    file_scores.close()
    return word_scores 
예제 #35
0
    def __setTermsCHISQUARE__(self,size):
        word_fd = FreqDist()
        label_word_fd = ConditionalFreqDist()
        
        for word in self.reader.words(categories=['pos']):
            word_fd.inc(word.lower())
            label_word_fd['pos'].inc(word.lower())

        for word in self.reader.words(categories=['neg']):
            word_fd.inc(word.lower())
            label_word_fd['neg'].inc(word.lower())
            
        pos_word_count = label_word_fd['pos'].N()
        neg_word_count = label_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        wordScores = {}
        
        for word, freq in word_fd.iteritems():
            pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                                   (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                                   (freq, neg_word_count), total_word_count)
            wordScores[word] = pos_score + neg_score

        termScore = sorted(wordScores.items(),key=lambda(w,s):s,reverse=True)[:size]
        self.terms = [w for (w,s) in termScore];
예제 #36
0
  def __init__(self):
    ## Best words feature extraction
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
     
    for word in movie_reviews.words(categories=['pos']):
      word_fd.inc(word.lower())
      label_word_fd['pos'].inc(word.lower())
     
    for word in movie_reviews.words(categories=['neg']):
      word_fd.inc(word.lower())
      label_word_fd['neg'].inc(word.lower())

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
     
    word_scores = {}
     
    for word, freq in word_fd.iteritems():
      pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
        (freq, pos_word_count), total_word_count)
      neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
        (freq, neg_word_count), total_word_count)
      word_scores[word] = pos_score + neg_score
     
    best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
    self.bestwords = set([w for w, s in best])
    self.train_classifier()
예제 #37
0
파일: tverse.py 프로젝트: strago13/gtverse
def get_best_words(corpora):
    # Technically, this stuff should all be restricted to the current train set
    # so it doesn't look at labels in the text set
 
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for text in corpora['positive']:
        for word in text.tokens:
            word_fd.inc(word.lower())
            label_word_fd['positive'].inc(word.lower())
 
    for text in corpora['negative']:
        for word in text.tokens:
            word_fd.inc(word.lower())
            label_word_fd['negative'].inc(word.lower())
  
    pos_word_count = label_word_fd['positive'].N()
    neg_word_count = label_word_fd['negative'].N()
    total_word_count = pos_word_count + neg_word_count
 
    word_scores = {}
 
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['positive'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['negative'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
 
    best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return bestwords
예제 #38
0
    def Do_alpha(self):
        """The observed disagreement for the alpha coefficient.

        The alpha coefficient, unlike the other metrics, uses this rather than
        observed agreement.
        """
        total = 0.0
        for i, itemdata in self._grouped_data("item"):
            label_freqs = FreqDist(x["labels"] for x in itemdata)

            for j, nj in label_freqs.iteritems():
                for l, nl in label_freqs.iteritems():
                    total += float(nj * nl) * self.distance(l, j)
        ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total
        log.debug("Observed disagreement: %f", ret)
        return ret
def create_word_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", 1, 1)
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", 1, 1)
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd.inc(word)
        cond_word_fd['pos'].inc(word)
    for word in negWords:
        word_fd.inc(word)
        cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores():
    posdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl','r'))
    negdata = pickle.load(open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl','r'))
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams #词和双词搭配
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1#word_fd.inc(word)
        cond_word_fd['pos'][word]+= 1 #cond_word_fd['pos'].inc(word)
    for word in neg:
        word_fd[word] += 1#word_fd.inc(word)
        cond_word_fd['neg'][word]+= 1#cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
예제 #41
0
        def GetHighInformationWordsChi(num_bestwords):
            word_fd = FreqDist()
            label_word_fd = ConditionalFreqDist()
 
            for word in movie_reviews.words(categories=['pos']):
                word_fd[word.lower()] +=1
                label_word_fd['pos'][word.lower()] +=1
 
            for word in movie_reviews.words(categories=['neg']):
                word_fd[word.lower()] +=1
                label_word_fd['neg'][word.lower()] +=1
 
            pos_word_count = label_word_fd['pos'].N()
            neg_word_count = label_word_fd['neg'].N()
            total_word_count = pos_word_count + neg_word_count
 
            word_scores = {}
 
            for word, freq in word_fd.iteritems():
                pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                    (freq, pos_word_count), total_word_count)
                neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                    (freq, neg_word_count), total_word_count)
                word_scores[word] = pos_score + neg_score
 
            best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:num_bestwords]
            bestwords = set([w for w, s in best])
            return bestwords
예제 #42
0
def most_informative_words(corpus, categories=['dem', 'rep'], count=2500):
    fd = FreqDist()
    cond_fd = ConditionalFreqDist()
    word_counts = {}

    for cat in categories:
        for word in corpus.words(categories=[cat]):
            word = word.lower().strip(".!?:,/ ")
            if not word.isalpha() or word in stopset:
                continue
            fd.inc(word)
            cond_fd[cat].inc(word)

        word_counts[cat] = cond_fd[cat].N()

    total_word_count = sum(word_counts.values())

    word_scores = collections.defaultdict(int)
    for word, freq in fd.iteritems():
        for cat in categories:
            cat_word_score = BigramAssocMeasures.chi_sq(
                cond_fd[cat][word], (freq, word_counts[cat]), total_word_count)
            word_scores[word] += cat_word_score

    informative_words = sorted(word_scores.iteritems(),
                               key=lambda (w, s): s,
                               reverse=True)[:count]
    return set([w for w, s in informative_words])
def create_word_scores():
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx",
                                     1, 1)
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx",
                                     1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd.inc(word)
        cond_word_fd['pos'].inc(word)
    for word in negWords:
        word_fd.inc(word)
        cond_word_fd['neg'].inc(word)

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores(posdata,negdata):
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder_pos = BigramCollocationFinder.from_words(posWords)
    bigram_finder_neg = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder_pos.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder_neg.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
        

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_scores(posdata,negdata):
    
    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores(posWords,
                              negWords,
                              score_method=BigramAssocMeasures.chi_sq):
    '''
    以双词来统计词的信息量
    '''
    bigram_finder = BigramCollocationFinder.from_words(posWords)
    posBigrams = bigram_finder.nbest(score_method, 5000)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    negBigrams = bigram_finder.nbest(score_method, 5000)
    pos = posWords + posBigrams  #词和双词搭配
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("BIGRAM_IN_POSWORD_NUMS : %d\tBIGRAM_IN_NEGWORD_NUMS : %d" %
          (pos_word_count, neg_word_count))

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = score_method(cond_word_fd['pos'][word],
                                 (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word],
                                 (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
def create_word_scores(posWords,
                       negWords,
                       score_method=BigramAssocMeasures.chi_sq):
    '''
    以单独一个词来统计词的信息量
    '''
    word_fd = FreqDist()  #可统计所有词的词频
    cond_word_fd = ConditionalFreqDist()  #可统计积极文本中的词频和消极文本中的词频
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  #积极评论中词的数量
    neg_word_count = cond_word_fd['neg'].N()  #消极评论中词的数量
    total_word_count = pos_word_count + neg_word_count
    print("IN_POSWORD_NUMS : %d\tIN_NEGWORD_NUMS : %d" %
          (pos_word_count, neg_word_count))

    #默认使用卡方统计量,这里也可以计算互信息等其它统计量
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = score_method(cond_word_fd['pos'][word],
                                 (freq, pos_word_count), total_word_count)
        neg_score = score_method(cond_word_fd['neg'][word],
                                 (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score  #一个词的信息量等于积极卡方统计量加上消极卡方统计量
    return word_scores  #键值对-词:词的信息量
예제 #48
0
def create_word_scores():

    global POS_Words
    global NEG_Words
    global MIX_Words
    global NEUTRAL_Words

    POS_Words = list(itertools.chain(*POS_Words))
    NEG_Words = list(itertools.chain(*NEG_Words))
    MIX_Words = list(itertools.chain(*MIX_Words))
    NEUTRAL_Words = list(itertools.chain(*NEUTRAL_Words))

    #build frequency distibution of all words and then frequency distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()

    for word in POS_Words:
        for key in word:
            word_fd.inc(key.lower())
            cond_word_fd['+'].inc(key.lower())
    for word in NEG_Words:
        for key in word:
            word_fd.inc(key.lower())
            cond_word_fd['-'].inc(key.lower())
    for word in MIX_Words:
        for key in word:
            word_fd.inc(key.lower())
            cond_word_fd['*'].inc(key.lower())
    for word in NEUTRAL_Words:
        for key in word:
            word_fd.inc(key.lower())
            cond_word_fd['='].inc(key.lower())

    #finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['+'].N()
    neg_word_count = cond_word_fd['-'].N()
    #mix_word_count = cond_word_fd['*'].N()
    neutral_word_count = cond_word_fd['='].N()
    # total_word_count = pos_word_count + neg_word_count + mix_word_count+ neutral_word_count
    total_word_count = pos_word_count + neg_word_count + neutral_word_count

    #builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['+'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['-'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        #mix_score = BigramAssocMeasures.chi_sq(cond_word_fd['*'][word], (freq, mix_word_count), total_word_count)
        neutral_score = BigramAssocMeasures.chi_sq(cond_word_fd['='][word],
                                                   (freq, neutral_word_count),
                                                   total_word_count)

        #word_scores[word] = pos_score + neg_score + mix_score+neutral_score
        word_scores[word] = pos_score + neg_score + neutral_score

    return word_scores
예제 #49
0
def create_word_scores():
    #creates lists of all positive and negative words
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
    for i in posSentences:
posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
posWords.append(posWord)
with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
for i in negSentences:
    negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
    negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    #build frequency distibution of all words and then frequency distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
    word_fd.inc(word.lower())
cond_word_fd['pos'].inc(word.lower())
for word in negWords:
word_fd.inc(word.lower())
cond_word_fd['neg'].inc(word.lower())

#finds the number of positive and negative words, as well as the total number of words
pos_word_count = cond_word_fd['pos'].N()
neg_word_count = cond_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

#builds dictionary of word scores based on chi-squared test
word_scores = {}
for word, freq in word_fd.iteritems():
pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
word_scores[word] = pos_score + neg_score

return word_scores

#finds word scores
word_scores = create_word_scores()

#finds the best 'number' words based on word scores
def find_best_words(word_scores, number):
    best_vals = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:number]
    best_words = set([w for w, s in best_vals])
    return best_words

#creates feature selection mechanism that only uses best words
def best_word_features(words):
    return dict([(word, True) for word in words if word in best_words])

#numbers of features to select
numbers_to_test = [10, 100, 1000, 10000, 15000]
#tries the best_word_features mechanism with each of the numbers_to_test of features
for num in numbers_to_test:
    print 'evaluating best %d word features' % (num)
    best_words = find_best_words(word_scores, num)
    evaluate_features(best_word_features)
    def bigramAnalysis(self):

        label_word_fd = ConditionalFreqDist()
        word_fd = FreqDist()

        datafiles = [
            {
                'emo': "Sad",
                'name': "/negative.csv"
            }, {
                'emo': "Happy",
                'name': "/positive.csv"
            }
            # , {'emo': 'Happy', 'name': "/trust.csv"}, {'emo': 'Sad', 'name': "/anger.csv"}
        ]

        for value in datafiles:
            emo = value['emo']
            name = value['name']
            read = self.readFile(name)
            normalized_sentences = [s.lower() for s in read['tweets']]

            for statement in normalized_sentences:
                for word in statement.split():
                    wor = word.lower()
                    if word not in stopset:
                        word_fd[word] += 1
                        label_word_fd[emo][word] += 1
                        # word_fd.inc(word.lower())

        word_scores = {}
        pos_word_count = label_word_fd['Happy'].N()
        neg_word_count = label_word_fd['Sad'].N()

        total_word_count = word_fd.N()

        for word, freq in word_fd.iteritems():
            pos_score = BigramAssocMeasures.chi_sq(
                label_word_fd['Happy'][word], (freq, pos_word_count),
                total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(label_word_fd['Sad'][word],
                                                   (freq, neg_word_count),
                                                   total_word_count)
            word_scores[word] = pos_score + neg_score

        best = sorted(word_scores.iteritems(),
                      key=lambda (w, s): s,
                      reverse=True)[:500]
        self.bestwords = set([w for w, s in best])

        print("\n\nevaluating best word features")
        self.unigramAnalysis(self.best_word_feats)

        print("\n\nBigram + bigram chi_sq word ")
        self.unigramAnalysis(self.best_bigram_word_feats)
예제 #51
0
    def pi(self):
        """Scott 1955; here, multi-pi.
        Equivalent to K from Siegel and Castellan (1988).

        """
        total = 0.0
        label_freqs = FreqDist(x['labels'] for x in self.data)
        for k, f in label_freqs.iteritems():
            total += f**2
        Ae = total / float((len(self.I) * len(self.C))**2)
        return (self.avg_Ao() - Ae) / (1 - Ae)
def create_word_scores():
    # creates lists of all positive and negative words
    posWords = []
    negWords = []
      
    sentences = read_in_tweets(twitter_data)
    random.shuffle(sentences)
    sentences = sentences[:100000]
    
    posSentences = []
    negSentences = []
    for tup in sentences:
        if tup[0]=='0':
            negSentences.append(tup[1])
        if tup[0]=='4':
            posSentences.append(tup[1])
    
   
    for i in posSentences:
        posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        posWords.append(posWord)

    for i in negSentences:
        negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        negWords.append(negWord)
        
    
    
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    # build frequency distibution of all words and then frequency distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd.inc(word.lower())
        cond_word_fd['pos'].inc(word.lower())
    for word in negWords:
        word_fd.inc(word.lower())
        cond_word_fd['neg'].inc(word.lower())

    # finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def process_bigrams(conn, polarity, total_word_count, best_words):
    cursor = conn.cursor()
    sql = Statements.GRAM_SQL % polarity
    cursor.execute(sql)

    rows = list(cursor.fetchall())
    l = [x[0] for x in rows]
    words_split = map(string.split, l)
    raw_words = [item for sublist in words_split for item in sublist]

    words = []
    for w in raw_words:
        if not (w.startswith("http://") or w.startswith("@")):
            words.append(w)

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in words:
        word_fd.inc(word.lower())
        label_word_fd[polarity].inc(word.lower())

    pos_word_count = label_word_fd[polarity].N()

    word_scores = {}

    for word, freq in word_fd.iteritems():
        score = BigramAssocMeasures.chi_sq(label_word_fd[polarity][word],
                                           (freq, pos_word_count),
                                           total_word_count)
        word_scores[word] = score

    best_raw = sorted(word_scores.iteritems(),
                      key=lambda (w, s): s,
                      reverse=True)[:600]
    best = [x[0] for x in best_raw if x[0] not in STOPWORDS and len(x[0]) > 1]
    best_words.update(best)
    best_features = features(best, polarity)

    bigram_finder = BigramCollocationFinder.from_words(words)
    bigram_finder.apply_freq_filter(4)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.pmi, 10)

    bigram_list = []
    for bt in bigrams:
        x = "%s %s" % (bt[0].lower(), bt[1].lower())
        bigram_list.append(x)

    bigram_features = features(bigram_list, polarity)
    best_features += bigram_features
    return best_features
    cursor.close()
예제 #54
0
def create_word_scores():
    #creates lists of all positive and negative words
    posWords = []
    negWords = []
    posRev, negRev = load_file()

    #print len(negfeats)
    #posWords = [(make_full_dict(f), 'pos') for f in word_split(posWords)]

    for f in word_split(negRev):
        posWords.append(f)

    for f in word_split(posRev):
        negWords.append(f)
#build frequency distibution of all words and then frequency distributions of words within positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    #print posWords
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    for word in posWords:
        word_fd[word.lower()] += 1
        #print word
        #raw_input('>')
        cond_word_fd['pos'][word.lower()] += 1
#count = count + 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    #finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    #print count
    total_word_count = pos_word_count + neg_word_count
    #print total_word_count
    #raw_input('>')

    #builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_bigram_scores():
    posdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/pos_review.xlsx",
        1, 1)
    negdata = tp.seg_fil_senti_excel(
        "/home/sooda/nlp/Review-Helpfulness-Prediction/data/sentiment_test/neg_review.xlsx",
        1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    #for word in pos:
    #    word_fd.inc(word)
    #    cond_word_fd['pos'].inc(word)
    #for word in neg:
    #    word_fd.inc(word)
    #    cond_word_fd['neg'].inc(word)
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
def create_word_scores():
    #creates lists of all positive and negative words
    all_positive_words = []
    all_negative_words = []

    with open(RT_POLARITY_POS_FILE, 'r') as positive_sentences:
        for i in positive_sentences:
            #stripping down all the words from extra characters (tabs, punctuation, etc.)
            positive_word = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            all_positive_words.append(positive_word)

    with open(RT_POLARITY_NEG_FILE, 'r') as negative_sentences:
        for i in negative_sentences:
            #stripping down all the words from extra characters (tabs, punctuation, etc.)
            negative_word = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            all_negative_words.append(negative_word)

    all_positive_words = list(itertools.chain(*all_positive_words))
    all_negative_words = list(itertools.chain(*all_negative_words))

    #build frequency distibution of all words and then frequency distributions of words within positive and negative labels
    #how often do words appear?
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in negative_words:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1

    for word in negative_word:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    #finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    #builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
예제 #57
0
def create_words_bigrams_scores():
    posdata = tp.seg_fil_senti_excel(
        "E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/pos_review.xlsx",
        1, 1)
    negdata = tp.seg_fil_senti_excel(
        "E:/GraduationProject/pythoncode/project/Prediction/main/FeatureExtractionModule/SentimentFeatures/MachineLearningFeatures/SenimentReviewSet/neg_review.xlsx",
        1, 1)

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder = BigramCollocationFinder.from_words(posWords)
    bigram_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        #word_fd.inc(word)
        word_fd[word] += 1
        #cond_word_fd['pos'].inc(word)
        cond_word_fd['pos'][word] += 1
    for word in neg:
        #word_fd.inc(word)
        word_fd[word] += 1
        #cond_word_fd['neg'].inc(word)
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores