Пример #1
0
def _countSentimentWords(tokens, goodWords, badWords):
    """Return (goodCount, badCount) for the given tokens in a single pass.

    A token that appears in both word collections is counted on both sides.
    """
    goodCount = 0
    badCount = 0
    for token in tokens:
        if token in goodWords:
            goodCount += 1
        if token in badWords:
            badCount += 1
    return goodCount, badCount


def conclusionWeight(review, weight, goodWords, badWords, numSent):
    """Classify a review, giving extra weight to its closing sentences.

    The review is split into sentences on '.', and sentiment words found in
    the last ``numSent`` sentences are multiplied by ``weight`` before the
    negative total is subtracted from the positive total.

    Args:
        review: Review text.
        weight: Multiplier applied to word counts from the closing sentences.
        goodWords: Collection of positive words (anything supporting ``in``).
        badWords: Collection of negative words (anything supporting ``in``).
        numSent: Number of trailing sentences treated as the conclusion.
            Zero or a negative value means no sentence gets extra weight.

    Returns:
        True when the weighted score is >= 0 (positive review), else False.
    """
    # Drop blank fragments: a review ending in "." (or containing "...")
    # would otherwise contribute empty "sentences", and the conclusion
    # window would be spent on them instead of on real text.
    sentences = [sent for sent in review.split('.') if sent.strip()]

    # Compute the boundary explicitly instead of slicing with -numSent:
    # for numSent == 0, seq[-0:] is the whole list and seq[:-0] is empty,
    # which would weight the entire review as the conclusion.
    splitAt = max(len(sentences) - numSent, 0)

    # Join with a space so words on either side of a removed period are not
    # fused into one token when the author left no space after the period.
    firstPart = ' '.join(sentences[:splitAt])
    lastPart = ' '.join(sentences[splitAt:])

    # utils.customtokenize is defined outside this block; it is assumed to
    # return an iterable of word tokens -- TODO confirm.
    numLastGoodWords, numLastBadWords = _countSentimentWords(
        utils.customtokenize(lastPart), goodWords, badWords)
    numFirstGoodWords, numFirstBadWords = _countSentimentWords(
        utils.customtokenize(firstPart), goodWords, badWords)

    score = (numFirstGoodWords + (numLastGoodWords * weight)) - \
            (numFirstBadWords + (numLastBadWords * weight))

    # A tie (score == 0) is reported as positive.
    return score >= 0
Пример #2
0
def posminusneg(review, goodWords, badWords):
    """Classify a review by its net count of positive versus negative words.

    Each token found in ``goodWords`` adds one to the score and each token
    found in ``badWords`` subtracts one; a token present in both cancels out.

    Args:
        review: Review text, passed to ``utils.customtokenize``.
        goodWords: Collection of positive words (anything supporting ``in``).
        badWords: Collection of negative words (anything supporting ``in``).

    Returns:
        True when the net score is >= 0 (ties count as positive), else False.
    """
    # Accuracy figures recorded in the original author's inline notes:
    #   utils.customtokenize(review)            69.90%  (used here)
    #   tokenizeAndRemovePOS(review, 'JJ')      64.35%
    #   tokenizeAndRemovePOS(review, 'NN')      69.45%  ("was ~81% after positive reviews")
    #   tokenizeAndRemovePOS(review, 'VB')      69.85%
    #   tokenizeAndRemovePOS(review, 'RB')      68.75%
    netScore = sum(
        (token in goodWords) - (token in badWords)
        for token in utils.customtokenize(review)
    )
    return netScore >= 0
Пример #3
0
def _takeTopCandidate(freqDist, seenWeights, lexicon, ownWords, rivalWords):
    """Pop the most frequent word from freqDist and file it into the top lists.

    The word's count is always recorded in ``seenWeights``. If the word is in
    ``lexicon`` it is appended to ``ownWords``, unless the rival list already
    holds it, in which case it is removed from the rival list and kept in
    neither (a word frequent on both sides is not discriminative).
    """
    word, weight = freqDist.most_common(1)[0]
    seenWeights[word] = weight
    del freqDist[word]

    if word in lexicon:
        if word in rivalWords:
            rivalWords.remove(word)
        else:
            ownWords.append(word)


def _scoreReview(review, goodWeights, badWeights, byWeight):
    """Return the sentiment score of one review text.

    Tokens found in ``goodWeights`` add to the score and tokens found in
    ``badWeights`` subtract from it, by their stored weight when ``byWeight``
    is true and by 1 otherwise.
    """
    score = 0
    for token in utils.customtokenize(review.lower()):
        if token in goodWeights:
            score += goodWeights[token] if byWeight else 1
        if token in badWeights:
            score -= badWeights[token] if byWeight else 1
    return score


def _safeRatio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def getSuperGoodBad(topNum, posTuples, negTuples, dataSetGoodWords, dataSetBadWords,
                    percenttrain=0.8, byWeight=True, printing=False):
    """Train a top-word sentiment scorer on a random split and return its accuracy.

    The reviews are shuffled and split into a training and a test portion.
    From the training portion the ``topNum`` most frequent lexicon words of
    each polarity are selected (frequency = number of training reviews that
    contain the word), and the held-out reviews are then scored with them.

    Args:
        topNum: Number of top words to select per polarity.
        posTuples: List of positive reviews; ``tup[0]`` is the review text and
            ``tup[1]`` an iterable of that review's (hashable) words.
            NOTE: shuffled in place.
        negTuples: Same structure for negative reviews. NOTE: shuffled in
            place. The split index is derived from ``len(posTuples)``.
        dataSetGoodWords: Lexicon of acceptable positive words.
        dataSetBadWords: Lexicon of acceptable negative words.
        percenttrain: Fraction of the reviews used for training.
        byWeight: If true, words are weighted by frequency; otherwise every
            matched word counts as 1.
        printing: If true, print per-review results and summary statistics.

    Returns:
        Fraction of test reviews classified correctly (a positive review is
        correct when its score is > 0, a negative one when its score is <= 0).
        Returns 0.0 when there are no test reviews.
    """
    trainCount = int(percenttrain * len(posTuples))

    random.shuffle(posTuples)
    random.shuffle(negTuples)

    trainPosTuples = posTuples[:trainCount]
    trainNegTuples = negTuples[:trainCount]

    testPosReviews = [tup[0] for tup in posTuples[trainCount:]]
    testNegReviews = [tup[0] for tup in negTuples[trainCount:]]

    # set() makes each review contribute a word at most once, so the counts
    # are document frequencies rather than raw occurrence counts.
    goodFreqDist = FreqDist(
        [word for tup in trainPosTuples for word in set(tup[1])])
    badFreqDist = FreqDist(
        [word for tup in trainNegTuples for word in set(tup[1])])

    topGoodWords = []
    topBadWords = []
    goodCounts = {}
    badCounts = {}

    def needsMore(words, freqDist):
        # Also stop when the vocabulary is exhausted; otherwise
        # most_common(1)[0] would raise IndexError on an empty distribution.
        return len(words) < topNum and bool(freqDist)

    # Build the custom top good and bad word lists, alternating sides. The
    # bad-side check is re-evaluated after the good step on purpose: the good
    # step may have just removed a word from topBadWords.
    while needsMore(topGoodWords, goodFreqDist) or needsMore(topBadWords, badFreqDist):
        if needsMore(topGoodWords, goodFreqDist):
            _takeTopCandidate(goodFreqDist, goodCounts, dataSetGoodWords,
                              topGoodWords, topBadWords)
        if needsMore(topBadWords, badFreqDist):
            _takeTopCandidate(badFreqDist, badCounts, dataSetBadWords,
                              topBadWords, topGoodWords)

    topGoodCheck = [(word, goodCounts[word]) for word in topGoodWords]
    topBadCheck = [(word, badCounts[word]) for word in topBadWords]

    # Swap weights between same-ranked good and bad words to equalize versus
    # appearance frequency. zip() pairs only as many words as both lists
    # have; an unpaired word (possible only when one vocabulary ran out)
    # gets no weight and is therefore ignored during scoring.
    goodWeights = {}
    badWeights = {}
    for (goodWord, goodCount), (badWord, badCount) in zip(topGoodCheck, topBadCheck):
        goodWeights[goodWord] = badCount
        badWeights[badWord] = goodCount
        # alternate strategy from the original author:
        #   goodWeights[goodWord] = trainCount - goodCount
        #   badWeights[badWord] = trainCount - badCount

    posCorrect = 0
    posScoreSum = 0
    for posReview in testPosReviews:
        score = _scoreReview(posReview, goodWeights, badWeights, byWeight)
        posScoreSum += score
        if score > 0:
            if printing: print("correct! " + str(score))
            posCorrect += 1
        else:
            if printing: print("wrong " + str(score))

    negCorrect = 0
    negScoreSum = 0
    for negReview in testNegReviews:
        score = _scoreReview(negReview, goodWeights, badWeights, byWeight)
        negScoreSum += score
        # A zero score is treated as negative.
        if score <= 0:
            if printing: print("correct! " + str(score))
            negCorrect += 1
        else:
            if printing: print("wrong " + str(score))

    posCount = len(testPosReviews)
    negCount = len(testNegReviews)
    finalacc = _safeRatio(posCorrect + negCorrect, posCount + negCount)

    if printing:
        # Each statistic uses its own test-set size, so the figures stay
        # correct when the positive and negative sets differ in length.
        print("avg positive score : " + "{:.4f}".format(_safeRatio(posScoreSum, posCount)))
        print("avg negative score : " + "{:.4f}".format(_safeRatio(negScoreSum, negCount)))
        print("positive accuracy  : " + "{:.4f}".format(_safeRatio(posCorrect, posCount)))
        print("negative accuracy  : " + "{:.4f}".format(_safeRatio(negCorrect, negCount)))
        print("final accuracy     : " + "{:.4f}".format(finalacc))

    return finalacc