def conclusionWeight(review, weight, goodWords, badWords, numSent):
    """Classify a review, weighting sentiment words in its closing sentences.

    The review is split into sentences on '.', the last ``numSent`` sentences
    are treated as the conclusion, and good/bad word hits found there are
    multiplied by ``weight`` before being combined with the hits from the
    rest of the review.

    Args:
        review: Review text.
        weight: Multiplier applied to word hits found in the conclusion.
        goodWords: Container of positive words (tested with ``in``).
        badWords: Container of negative words (tested with ``in``).
        numSent: Number of trailing sentences that form the conclusion.
            Values <= 0 mean "no conclusion": every sentence is unweighted.

    Returns:
        True when the weighted score is >= 0 (ties count as positive),
        False otherwise.
    """
    # Drop blank fragments: a review ending in '.' otherwise produces a
    # trailing empty "sentence" that silently uses up one conclusion slot.
    sentences = [s for s in review.split('.') if s.strip()]

    # `seq[-0:]` selects the whole list, so a zero (or negative) numSent must
    # be handled explicitly instead of being fed to the slice.
    if numSent > 0:
        bodySentences = sentences[:-numSent]
        closingSentences = sentences[-numSent:]
    else:
        bodySentences = sentences
        closingSentences = []

    def countHits(sentenceGroup):
        """Return (good, bad) word counts for a group of sentences."""
        # Join with a space so words on either side of a period that had no
        # following space ("great.Terrible") are not fused into one token.
        tokens = utils.customtokenize(' '.join(sentenceGroup))
        good = 0
        bad = 0
        for tok in tokens:
            good += tok in goodWords
            bad += tok in badWords
        return good, bad

    closingGood, closingBad = countHits(closingSentences)
    bodyGood, bodyBad = countHits(bodySentences)

    score = (bodyGood + closingGood * weight) - (bodyBad + closingBad * weight)
    return score >= 0
def posminusneg(review, goodWords, badWords):
    """Classify a review by its count of good words minus bad words.

    Every token produced by ``utils.customtokenize(review)`` adds 1 to the
    balance if it is in ``goodWords`` and subtracts 1 if it is in
    ``badWords`` (a token present in both nets to zero).

    Returns:
        True when the balance is >= 0 (ties count as positive), else False.
    """
    # Accuracy figures recorded by the original author; they appear to be per
    # tokenization variant (TODO confirm):
    #   utils.customtokenize(review)        69.90%
    #   tokenizeAndRemovePOS(review, 'JJ')  64.35%
    #   tokenizeAndRemovePOS(review, 'NN')  69.45% ("was ~81% after positive reviews")
    #   tokenizeAndRemovePOS(review, 'VB')  69.85%
    #   tokenizeAndRemovePOS(review, 'RB')  68.75%
    tokens = utils.customtokenize(review)
    balance = sum((tok in goodWords) - (tok in badWords) for tok in tokens)
    return balance >= 0
def _rankTopSentimentWords(topNum, goodCounts, badCounts,
                           dataSetGoodWords, dataSetBadWords):
    """Pick up to ``topNum`` lexicon words per class, most frequent first.

    Candidates are taken alternately from the good and bad frequency tables.
    A candidate is kept only if it is in the matching lexicon; if it was
    already chosen for the opposite class it is removed from that class
    instead of being added (the word is treated as ambiguous).
    Stops early, without raising, when a table runs out of candidates.
    """
    # most_common() with no argument returns the full ranking, with ties in
    # first-seen order -- the same sequence as repeatedly taking
    # most_common(1) and deleting the result, but sorted only once.
    goodQueue = iter(goodCounts.most_common())
    badQueue = iter(badCounts.most_common())
    goodLeft = True
    badLeft = True
    topGoodWords = []
    topBadWords = []

    while ((goodLeft and len(topGoodWords) < topNum)
           or (badLeft and len(topBadWords) < topNum)):
        if goodLeft and len(topGoodWords) < topNum:
            entry = next(goodQueue, None)
            if entry is None:
                goodLeft = False
            elif entry[0] in dataSetGoodWords:
                if entry[0] in topBadWords:
                    topBadWords.remove(entry[0])
                else:
                    topGoodWords.append(entry[0])
        if badLeft and len(topBadWords) < topNum:
            entry = next(badQueue, None)
            if entry is None:
                badLeft = False
            elif entry[0] in dataSetBadWords:
                if entry[0] in topGoodWords:
                    topGoodWords.remove(entry[0])
                else:
                    topBadWords.append(entry[0])

    return topGoodWords, topBadWords


def _weightedReviewScore(review, goodWeights, badWeights, byWeight):
    """Return the good-minus-bad score of one review (text is lower-cased)."""
    score = 0
    for token in utils.customtokenize(review.lower()):
        if token in goodWeights:
            score += goodWeights[token] if byWeight else 1
        if token in badWeights:
            score -= badWeights[token] if byWeight else 1
    return score


def _safeRatio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def getSuperGoodBad(topNum, posTuples, negTuples, dataSetGoodWords,
                    dataSetBadWords, *, percenttrain=0.8, byWeight=True,
                    printing=False):
    """Train a top-word sentiment scorer on a random split and return accuracy.

    Both tuple lists are shuffled IN PLACE (the caller's lists are reordered)
    and split into train/test. The split index is derived from
    ``len(posTuples)`` and applied to both lists. From the training part, the
    ``topNum`` most frequent lexicon words of each class are selected
    (frequency = number of training reviews containing the word), and the
    held-out reviews are scored with them.

    Args:
        topNum: Number of words to select for each class.
        posTuples: Positive samples; element 0 of each tuple is the review
            text, element 1 an iterable of words from that review.
        negTuples: Negative samples, same layout as ``posTuples``.
        dataSetGoodWords: Lexicon of acceptable positive words.
        dataSetBadWords: Lexicon of acceptable negative words.
        percenttrain: Fraction of ``posTuples`` used for training.
        byWeight: If True, each hit counts by its word weight; if False,
            every hit counts as 1.
        printing: If True, print per-review results and summary statistics.

    Returns:
        Fraction of test reviews classified correctly (positive reviews need
        score > 0, negative reviews need score <= 0). Returns 0.0 when there
        are no test reviews.
    """
    from collections import Counter

    trainCount = int(percenttrain * len(posTuples))

    random.shuffle(posTuples)
    random.shuffle(negTuples)

    testPosReviews = [tup[0] for tup in posTuples[trainCount:]]
    testNegReviews = [tup[0] for tup in negTuples[trainCount:]]

    # set() de-duplicates within a review, so each count is the number of
    # training reviews a word appears in.
    goodCounts = Counter(word
                         for tup in posTuples[:trainCount]
                         for word in set(tup[1]))
    badCounts = Counter(word
                        for tup in negTuples[:trainCount]
                        for word in set(tup[1]))

    topGoodWords, topBadWords = _rankTopSentimentWords(
        topNum, goodCounts, badCounts, dataSetGoodWords, dataSetBadWords)

    # Swap weights between the i-th good and i-th bad word to equalize the
    # two classes versus appearance frequency. zip() keeps the lists paired
    # even if one class ran out of candidates early.
    # (Alternate strategy noted by the original author:
    #  weight = trainCount - own count.)
    goodWeights = {}
    badWeights = {}
    for goodWord, badWord in zip(topGoodWords, topBadWords):
        goodWeights[goodWord] = badCounts[badWord]
        badWeights[badWord] = goodCounts[goodWord]

    def evaluate(reviews, isCorrect):
        """Score reviews; return (sum of scores, number judged correct)."""
        scoreSum = 0
        hits = 0
        for review in reviews:
            score = _weightedReviewScore(review, goodWeights, badWeights,
                                         byWeight)
            scoreSum += score
            ok = isCorrect(score)
            hits += ok
            if printing:
                print(("correct! " if ok else "wrong ") + str(score))
        return scoreSum, hits

    posScoreSum, posCorrect = evaluate(testPosReviews, lambda s: s > 0)
    negScoreSum, negCorrect = evaluate(testNegReviews, lambda s: s <= 0)

    numPos = len(testPosReviews)
    numNeg = len(testNegReviews)
    finalacc = _safeRatio(posCorrect + negCorrect, numPos + numNeg)

    if printing:
        # Per-class figures use each class's own test size, so they stay
        # correct when the two lists have different lengths.
        print("avg positive score : " + "{:.4f}".format(_safeRatio(posScoreSum, numPos)))
        print("avg negative score : " + "{:.4f}".format(_safeRatio(negScoreSum, numNeg)))
        print("positive accuracy : " + "{:.4f}".format(_safeRatio(posCorrect, numPos)))
        print("negative accuracy : " + "{:.4f}".format(_safeRatio(negCorrect, numNeg)))
        print("final accuracy : " + "{:.4f}".format(finalacc))

    return finalacc