# Imports assumed by these excerpts; in the repository each function lives in
# its own module, so this shared header is an editorial convenience.
import copy
import json
import logging
import math
import os
import pickle
import random

import numpy as np


def applyAggregationModel(testReviews, featureAvgSent, model, busImportantFeatures, userImportantFeatures):
    logger = logging.getLogger('signature.aAM.applyAggregationModel')
    logger.info('starting applyAggregationModel from %d reviews' % len(testReviews))
    fsw = featureStructureWorker()
    for r, review in enumerate(testReviews):
        reviewFeatures = review['predSentiments']
        #features = encodeAspects2features(fsw, reviewFeatures)
        features = encodeAspects1features(fsw, reviewFeatures, featureAvgSent)
        #aggregation = model.predict(features)
        #print aggregation, review['stars']
        #predicted features / predicted sentiments by BUSINESS
        predictedFeatures = review['exPredFeatures']  # [*,1]
        busID = review['business_id']
        if busID in busImportantFeatures:
            busSents = busImportantFeatures[busID]['sentiment']
        else:
            busSents = {}
        testData = {a: busSents[a][0] for a in busSents
                    if a in predictedFeatures and predictedFeatures[a][1] == 1 and busSents[a][1] > 1}
        features = encodeAspects1features(fsw, testData, featureAvgSent)
        aggregationBUS = model.predict(features)
        review['rating_prediction'] = review.get('rating_prediction', {})
        review['rating_prediction']['aggregBUSavg'] = aggregationBUS
        if not r % 10000:
            logger.debug('%d reviews processed' % r)
    return testReviews
def __init__(self):
    self.logger = logging.getLogger('signature.aspectDependence')
    self.logger.info('aspectDependence created')
    self.fsw = featureStructureWorker()
    self.aspectList = [x for x in self.fsw.featureIdicator if self.fsw.featureIdicator[x]]
    self.aspectList.sort()
    self.aspectStat = dict()
    for i, x in enumerate(self.aspectList):
        self.aspectStat[x] = pd.DataFrame(np.zeros((4, 5)),
                                          index=['n', '-1', '0', '1'],
                                          columns=[1, 2, 3, 4, 5])
    self.aspectPairStat = dict()
    for i, x in enumerate(self.aspectList):
        for y in self.aspectList[i+1:]:
            self.aspectPairStat[(x, y)] = pd.DataFrame(np.zeros((4, 4)),
                                                       index=['n', '-1', '0', '1'],
                                                       columns=['n', '-1', '0', '1'])
    #resulting aspect-stars stat
    self.aspectStars = dict()
    #resulting dependence stat
    self.resultingStat = dict()
def predictAll(path, modelfile):
    logger = logging.getLogger('signature.pairCompare')
    logger.info('starting pairCompare')
    #get data
    b_file = path + '/businessProfile.json'
    u_file = path + '/userProfile.json'
    r_file = path + '/specific_reviews_test.json'
    fsw = featureStructureWorker()
    #load model
    modelDict = pickle.load(open(modelfile, 'rb'))
    logger.info('Model loaded from %s' % modelfile)
    busImportantFeatures = json.loads(open(b_file, 'r').readline())
    logger.info('Important BUSINESS Features loaded')
    userImportantFeatures = json.loads(open(u_file, 'r').readline())
    logger.info('Important USER Features loaded')
    testReviewsByUser = dict()
    for counter, line in enumerate(open(r_file, 'r')):
        if not counter % 1000:
            logger.debug('%d reviews loaded' % counter)
        review = json.loads(line.strip())
        userID = review['user_id']
        for aspect in modelDict:
            if not fsw.featureIdicator[aspect]:
                continue
            featureSet = calculateFeatures(logger, review, aspect,
                                           busImportantFeatures, userImportantFeatures)
            if not featureSet:
                continue
            review['pairComp'] = review.get('pairComp', {})
            predProb = modelDict[aspect][1].predict_proba(np.array([featureSet]))[0][1]
            if predProb > 0.5:
                predSent = modelDict[aspect][3].predict_proba(np.array([featureSet]))[0][1]
                review['pairComp'][aspect] = predSent
        #print(review['pairComp'])
        testReviewsByUser[userID] = testReviewsByUser.get(userID, [])
        testReviewsByUser[userID].append(review)
    logger.info('Reviews loaded')
    #save result
    outfile = open(path + '/test_predictions.json', 'w')
    for user in testReviewsByUser:
        outfile.write(json.dumps(testReviewsByUser[user]) + '\n')
    outfile.close()
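# Editorial sketch (hypothetical helper, not part of the repo): the two-stage
# gating in predictAll restated with stand-in sklearn models on synthetic data.
# modelDict[aspect][1] corresponds to m_exist and modelDict[aspect][3] to m_sent.
def _demo_two_stage_gating():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    X = np.random.rand(100, 4)
    m_exist = LogisticRegression().fit(X, (X[:, 0] > 0.5).astype(int))
    m_sent = LogisticRegression().fit(X, (X[:, 1] > 0.5).astype(int))
    x = np.random.rand(1, 4)
    #stage 1: predict whether the aspect is mentioned at all
    if m_exist.predict_proba(x)[0][1] > 0.5:
        #stage 2: only then predict the positive-sentiment probability
        return m_sent.predict_proba(x)[0][1]
    return None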
def learnTopicModel(infileName, dictFile, modelFile, descriptionFile, topic_num=10):
    logger = logging.getLogger('signature.learnTopicModel')
    logger.info('starting learnTopicModel from %s' % infileName)
    fsw = featureStructureWorker()
    texts = list()
    #build corpus
    review_file = open(infileName, "r")
    for counter, line in enumerate(review_file):
        if not counter % 10000:
            logger.debug('%d reviews loaded' % counter)
        #load review information
        review = json.loads(line.strip())
        reviewFeatures = fsw.getReviewFeaturesExistence(review['features'])
        text_plus = list()
        text_minus = list()
        for aspect in reviewFeatures:
            sent = np.average(reviewFeatures[aspect])
            if sent > 0:
                text_plus.append(aspect)
            elif sent < 0:
                text_minus.append(aspect)
        if len(text_plus):
            texts.append(text_plus)
        if len(text_minus):
            texts.append(text_minus)
    #build dictionary
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=100, no_above=0.8)
    logger.info(dictionary)
    corpus_int = [dictionary.doc2bow(text) for text in texts]
    logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m-%d,%H:%M:%S', level=logging.INFO)
    lda_model = models.ldamodel.LdaModel(corpus=corpus_int, id2word=dictionary,
                                         num_topics=topic_num, update_every=1,
                                         chunksize=10000, passes=30)
    lda_model.print_topics(20)
    output = open(descriptionFile, "w")
    for i, topic in enumerate(lda_model.show_topics(num_topics=100, num_words=15,
                                                    log=False, formatted=True)):
        #print str(i)+"\t"+topic.encode("utf8")
        try:
            output.write(str(i) + "\t" + topic.decode('utf8', 'ignore') + "\n\n")
        except:
            try:
                output.write(str(i) + "\t" + topic[:30].decode('utf8', 'ignore') + "\n\n")
            except:
                output.write(str(i) + "\t" + "\n\n")
    output.close()
    dictionary.save(dictFile)
    lda_model.save(modelFile)
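# Editorial usage sketch: the dictionary/model file names follow the convention
# that applyTopicModel later loads (dictionary_%d.lda, model_%d.lda); the
# description file name here is illustrative only.
#learnTopicModel(path + 'yelp_reviews_features_train.json',
#                path + 'modelLDA/dictionary_10.lda',
#                path + 'modelLDA/model_10.lda',
#                path + 'modelLDA/topics_10.txt',
#                topic_num=10)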
def learnAggregationModelsCV(trainReviews, featureAvgSent, busImportantFeatures, path):
    logger = logging.getLogger("signature.lAMCV.learnAggregationModelsCV")
    logger.info("starting learnAggregationModel from %d reviews" % len(trainReviews))
    fsw = featureStructureWorker()
    learnData = list()
    learnLabels = list()
    for j, review in enumerate(trainReviews):
        reviewFeatures = fsw.getReviewFeaturesSentiment(review["features"])
        rating = review["stars"]
        for aspect in reviewFeatures:
            if (review["business_id"] in busImportantFeatures
                    and aspect in busImportantFeatures[review["business_id"]]["sentiment"]
                    and busImportantFeatures[review["business_id"]]["sentiment"][aspect][1] > 5):
                sentiment = busImportantFeatures[review["business_id"]]["sentiment"][aspect][0]
                reviewFeatures[aspect] = sentiment
            else:
                reviewFeatures[aspect] = featureAvgSent[aspect]
        #features = encodeAspects2features(fsw, reviewFeatures)
        features = encodeAspects1features(fsw, reviewFeatures, featureAvgSent)
        learnData.append(features)
        learnLabels.append(rating)
    learnData = np.array(learnData)
    learnLabels = np.array(learnLabels)
    bestRes = 0.0
    bestReg = 0.0
    for reg in [0.01, 0.05, 0.1, 0.2, 0.5, 1.0, 5.0, 10, 15, 50, 100, 200, 500]:
        kf = cross_validation.KFold(len(learnLabels), n_folds=10)
        results = list()
        for train_index, test_index in kf:
            X_train, X_test = learnData[train_index], learnData[test_index]
            y_train, y_test = learnLabels[train_index], learnLabels[test_index]
            clf = linear_model.Ridge(alpha=reg)
            clf.fit(X_train, y_train)
            results.append(clf.score(X_test, y_test))
        if np.average(results) > bestRes:
            bestRes = np.average(results)
            bestReg = reg
        #print reg, np.average(results)
    logger.info("Best score %f with regularization = %.2f" % (bestRes, bestReg))
    clf = linear_model.Ridge(alpha=bestReg)
    clf.fit(learnData, learnLabels)
    return clf
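# Editorial sketch (hypothetical helper): the same alpha sweep on synthetic
# data, using the current sklearn.model_selection API instead of the
# deprecated cross_validation module used above.
def _demo_ridge_alpha_sweep():
    import numpy as np
    from sklearn import linear_model
    from sklearn.model_selection import KFold
    X = np.random.rand(200, 6)  # stand-in for encoded aspect vectors
    y = X.dot(np.array([1, 2, 0, 0, 3, 1])) + np.random.randn(200) * 0.1
    bestRes, bestReg = 0.0, 0.0
    for reg in [0.01, 0.1, 1.0, 10, 100]:
        results = []
        for train_index, test_index in KFold(n_splits=10).split(X):
            clf = linear_model.Ridge(alpha=reg)
            clf.fit(X[train_index], y[train_index])
            results.append(clf.score(X[test_index], y[test_index]))  # held-out R^2
        if np.average(results) > bestRes:
            bestRes, bestReg = np.average(results), reg
    return bestReg, bestRes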
def matchProfiles(path, limit=np.Inf):
    logger = logging.getLogger('signature.matchProfiles')
    logger.info('starting matchProfiles')
    #get data
    b_file = path + 'businessFeaturesAggregation_train.json'
    u_file = path + 'userFeaturesAggregation_train.json'
    r_file = path + 'yelp_reviews_test_predictions.json'
    busImportantFeatures = json.loads(open(b_file, 'r').readline())
    logger.info('Important BUSINESS Features loaded')
    userImportantFeatures = json.loads(open(u_file, 'r').readline())
    logger.info('Important USER Features loaded')
    fsw = featureStructureWorker()
    #load featureWeights
    infile = open(path + 'featureWeights.json', 'r')
    featureWeights = json.loads(infile.readline().strip())
    infile.close()
    nums = list()
    reviews = list()
    for counter, line in enumerate(open(r_file, 'r')):
        if not counter % 1000:
            logger.debug('%d reviews loaded' % counter)
        if counter > limit:
            break
        review = json.loads(line.strip())
        userID = review['user_id']
        busID = review['business_id']
        if busID in busImportantFeatures and userID in userImportantFeatures:
            num, score = matchBUprofiles(fsw, featureWeights,
                                         busImportantFeatures[busID], userImportantFeatures[userID])
            nums.append(num)
        else:
            score = -1000000
        review['rating_prediction'] = review.get('rating_prediction', {})
        review['rating_prediction']['match'] = score
        reviews.append(review)
    outfile = open(path + 'yelp_reviews_test_predictions.json', 'w')
    for review in reviews:
        outfile.write(json.dumps(review).encode('utf8', 'ignore') + '\n')
    outfile.close()
    #print nums
    #print len(nums)
    print 'AVERAGE NUMBER OF FEATURES = %f' % np.average(nums)
def featureAggregation(review_list, ignore_neutral=True):
    fsw = featureStructureWorker()
    aggregation_dict = dict()
    for review in review_list:
        reviewFeatures = fsw.getReviewFeatures(review)
        for feature in reviewFeatures:
            aggregation_dict[feature] = aggregation_dict.get(feature, [])
            if ignore_neutral:
                aggregation_dict[feature].append(np.average([x for x in reviewFeatures[feature] if x]))
            else:
                aggregation_dict[feature].append(np.average(reviewFeatures[feature]))
    for feature in aggregation_dict:
        aggregation_dict[feature] = [round(np.average(aggregation_dict[feature]), 3),
                                     len(aggregation_dict[feature])]
    return copy.deepcopy(aggregation_dict)
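# Editorial worked example (hypothetical helper) of the aggregation above with
# toy sentiment lists; zeros are dropped when ignore_neutral is on. Per review,
# FOOD averages to 1.0 and -1.0, so it aggregates to [0.0, 2]; STAFF to [1.0, 1].
def _demo_feature_aggregation():
    import numpy as np
    sent_lists = {'FOOD': [[1, 0, 1], [-1]], 'STAFF': [[1]]}
    agg = {}
    for feat in sent_lists:
        per_review = [np.average([x for x in l if x]) for l in sent_lists[feat]]
        agg[feat] = [round(np.average(per_review), 3), len(per_review)]
    return agg  # {'FOOD': [0.0, 2], 'STAFF': [1.0, 1]}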
def featureStat(infileName, outfileName, limit=1000):
    #load reviews
    review_file = open(infileName, "r")
    stat = [[], [], []]
    feature_stat = dict()
    fsw = featureStructureWorker()
    for counter, line in enumerate(review_file):
        if counter > limit:
            break
        review = json.loads(line)
        #features = fsw.getFeatureAverage(review['features'])
        features = fsw.getReviewFeaturesExistence(review['features'])
        stat[0].append(review['ID'])
        stat[1].append(review['business_id'])
        stat[2].append(review['user_id'])
        for feature in features:
            feature_stat[feature] = feature_stat.get(feature, [[], [], []])
            feature_stat[feature][0].append(review['ID'])
            feature_stat[feature][1].append(review['business_id'])
            feature_stat[feature][2].append(review['user_id'])
        if not counter % 1000:
            print '%d reviews processed' % counter
    out_file = open(outfileName, "w")
    out_file.write('%30s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ('FeatureName', 'rewFreq', 'busFreq', 'usFreq',
                                                       'rewPer', 'busPer', 'userPer'))
    result = []
    for i in range(len(stat)):
        stat[i] = float(len(set(stat[i]))) / 100
    for feature in feature_stat:
        for i in range(len(feature_stat[feature])):
            feature_stat[feature][i] = len(set(feature_stat[feature][i]))
        f = feature_stat[feature]
        s = '%30s\t%5d\t%5d\t%5d\t%5.1f\t%5.1f\t%5.1f\n' % (feature, f[0], f[1], f[2],
                                                            f[0]/stat[0], f[1]/stat[1], f[2]/stat[2])
        result.append([f[0]/stat[0], s])
    result.sort(reverse=True)
    for r in result:
        out_file.write(r[1])
    review_file.close()
    out_file.close()
def matchBUprofiles(fsw, featureWeights, busProfile, userProfile):
    matchScore = 0.0
    num = 0
    for feature in fsw.featureIdicator:
        if not fsw.featureIdicator[feature]:
            continue
        if feature not in userProfile['featureFreq'] and feature not in busProfile['featureFreq']:
            continue
        #require enough user and business history for this feature
        if not (userProfile['featureFreq'].get(feature, 0) > 10
                and userProfile['sentiment'].get(feature, [0, 0])[1] > 1):
            continue
        if not (busProfile['featureFreq'].get(feature, 0) > 10
                and busProfile['sentiment'].get(feature, [0, 0])[1] > 5):
            continue
        sentiment = busProfile['sentiment'][feature][0]
        #userImp = userProfile['tfidfDict'].get(feature, 0.0)
        #busImp = busProfile['tfidfDict'].get(feature, 0.0)
        userImp = userProfile['featureFreq'].get(feature, 0.0)
        busImp = busProfile['featureFreq'].get(feature, 0.0)
        weight = featureWeights[feature]
        coeff = 1.0
        if sentiment < weight:
            coeff = 2.0
        #if busImp > 80:
        #    userImp = max(userImp, busImp)
        matchScore += userImp * busImp * sentiment * weight * coeff
        num += 1.0
    return num, matchScore
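# Editorial worked example of the score above, with hypothetical numbers that
# pass both gates for one feature: userImp=20, busImp=40, sentiment=0.5,
# weight=0.6. Since sentiment < weight, coeff doubles the term:
# 20 * 40 * 0.5 * 0.6 * 2.0 = 480.0, and num = 1.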
def learnFeatureExistance(busImportantFeatures, userImportantFeatures, trainReviews, path):
    logger = logging.getLogger('signature.lFE.learnFE')
    logger.info('starting learnFeatureExistance from %d reviews' % len(trainReviews))
    fsw = featureStructureWorker()
    modelDict = dict()
    trainAveragesDict = dict()
    for f, feature in enumerate(fsw.featureIdicator):
        if not fsw.featureIdicator[feature]:
            continue
        logger.debug('Start working with %s' % feature)
        #get data
        X, Y, trainAveragesDict[feature] = getFeatures(logger, feature, trainReviews,
                                                       busImportantFeatures, userImportantFeatures,
                                                       trainAverages={}, is_train=True)
        logger.debug('Got features for %d reviews' % len(X))
        ##cross validation
        #indicator = range(len(X))
        #random.shuffle(indicator)
        #thres = int(len(indicator)*0.8)
        #trainX = np.array([X[i] for i in indicator[:thres]])
        #trainY = np.array([Y[i] for i in indicator[:thres]])
        #testX = np.array([X[i] for i in indicator[thres:]])
        #testY = np.array([Y[i] for i in indicator[thres:]])
        #Logistic Regression
        #bestThres,bestF1,logmodel = getBestLogModel(logger, feature, trainX, trainY, testX, testY, X, Y, path)
        bestThres, bestF1, logmodel = getBestLogModel(logger, feature, X, Y, path)
        #bestThresSVM,bestF1SVM,svmmodel = getBestSVMModel(logger, feature, X, Y, path)
        #crossValidation(logger, np.array(X), np.array(Y))
        modelDict[feature] = [bestThres, bestF1, logmodel]
        #print f
        #if f > 6:
        #    break
    return trainAveragesDict, modelDict
def learnTopicExistence(busImportantFeatures, userImportantFeatures, trainReviews, path):
    logger = logging.getLogger('signature.lTE.learnTopicExistence')
    logger.info('starting learnTopicExistence from %d reviews' % len(trainReviews))
    fsw = featureStructureWorker()
    modelDict = dict()
    for f, topic in enumerate(fsw.featureIdicator):
        if not fsw.featureIdicator[topic]:
            continue
        logger.debug('Start working with %s' % topic)
        #get data
        X, Y = getFeatures(logger, topic, trainReviews, busImportantFeatures, userImportantFeatures,
                           trainAverages={}, is_train=True)
        logger.debug('Got features for %d reviews' % len(X))
        modelDict[topic] = getBestModel(logger, topic, X, Y, path)
    #print modelDict
    return modelDict
def learnSentimentMatrixFactorization(trainReviews, path):
    logger = logging.getLogger('signature.lSMF.learnSentimentMF')
    logger.info('starting learnSentimentMatrixFactorization from %d reviews' % len(trainReviews))
    fsw = featureStructureWorker()
    modelDict = dict()
    featureThres = dict()
    for i, feature in enumerate(fsw.featureIdicator):
        #if feature != 'STAFF':
        #    continue
        if not fsw.featureIdicator[feature]:
            continue
        logger.debug('Start working with %s' % feature)
        learnData = {'user': [], 'item': [], 'rating': []}
        for j, review in enumerate(trainReviews):
            reviewFeatures = fsw.getReviewFeaturesSentiment(review['features'])
            if feature not in reviewFeatures:
                continue
            busID = review['business_id']
            userID = review['user_id']
            rating = np.average(reviewFeatures[feature])
            #if rating == 0.0:
            #    continue
            #rating = 1.0 if rating > 0 else -1.0
            learnData['user'].append(userID)
            learnData['item'].append(busID)
            learnData['rating'].append(rating)
        #CROSS VALIDATION
        data = graphlab.SFrame(learnData)
        featureThres[feature], modelDict[feature] = getBestMFThres(logger, feature, data, path)
    return modelDict, featureThres
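# Editorial note: shape of the per-feature training table handed to graphlab,
# with hypothetical IDs. Each row is one (user, business) sentiment observation
# for the current feature; getBestMFThres then cross-validates a threshold.
#learnData = {'user':   ['u1', 'u2', 'u1'],
#             'item':   ['b1', 'b1', 'b2'],
#             'rating': [0.5, -1.0, 1.0]}
#data = graphlab.SFrame(learnData)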
def learnAggregationModelsCV(trainReviews, featureAvgSent, path):
    logger = logging.getLogger('signature.lAMCV.learnAggregationModelsCV')
    logger.info('starting learnAggregationModel from %d reviews' % len(trainReviews))
    fsw = featureStructureWorker()
    learnData = list()
    learnLabels = list()
    for j, review in enumerate(trainReviews):
        reviewFeatures = fsw.getReviewFeaturesSentiment(review['features'])
        rating = review['stars']
        #features = encodeAspects2features(fsw, reviewFeatures)
        features = encodeAspects1features(fsw, reviewFeatures, featureAvgSent)
        learnData.append(features)
        learnLabels.append(rating)
    learnData = np.array(learnData)
    learnLabels = np.array(learnLabels)
    bestRes = 0.0
    bestReg = 0.0
    for reg in [0.01, 0.05, 0.1, 0.2, 0.5, 1.0, 5.0, 10, 15, 50, 100, 200, 500]:
        kf = cross_validation.KFold(len(learnLabels), n_folds=10)
        results = list()
        for train_index, test_index in kf:
            X_train, X_test = learnData[train_index], learnData[test_index]
            y_train, y_test = learnLabels[train_index], learnLabels[test_index]
            clf = linear_model.Ridge(alpha=reg)
            clf.fit(X_train, y_train)
            results.append(clf.score(X_test, y_test))
        if np.average(results) > bestRes:
            bestRes = np.average(results)
            bestReg = reg
        #print reg, np.average(results)
    logger.info('Best score %f with regularization = %.2f' % (bestRes, bestReg))
    clf = linear_model.Ridge(alpha=bestReg)
    clf.fit(learnData, learnLabels)
    return clf
def applySMF(path, limit=np.Inf):
    logger = logging.getLogger('signature.applySentimentMF')
    logger.info('starting applySentimentMF')
    #get data
    r_file = path + 'specific_reviews_test.json'
    testReviews = list()
    for counter, line in enumerate(open(r_file, 'r')):
        if not counter % 5000:
            logger.debug('%d reviews loaded' % counter)
        if counter > limit:
            break
        testReviews.append(json.loads(line.strip()))
    logger.info('Test Reviews loaded from %s' % r_file)
    #load existence models
    modelDict_ex = dict()
    featureThres_ex = dict()
    fsw = featureStructureWorker()
    for feature in fsw.featureIdicator:
        if not fsw.featureIdicator[feature]:
            continue
        try:
            modelPath = path + 'sentimentModels/%s_sentiment_ex.model' % feature
            modelDict_ex[feature] = graphlab.load_model(modelPath)
            #load threshold
            thres_path = path + 'sentimentModels/%s_sentiment_ex.threshold' % feature
            infile = open(thres_path, 'r')
            featureThres_ex[feature] = float(infile.readline().strip())
            infile.close()
        except:
            logger.error('There is no model for feature: %s' % feature)
            continue
    logger.info('Existence Models loaded')
    #load sentiment models
    modelDict = dict()
    featureThres = dict()
    for feature in fsw.featureIdicator:
        if not fsw.featureIdicator[feature]:
            continue
        try:
            modelPath = path + 'sentimentModels/%s_sentiment.model' % feature
            print modelPath
            modelDict[feature] = graphlab.load_model(modelPath)
            #load threshold
            thres_path = path + 'sentimentModels/%s_sentiment.threshold' % feature
            infile = open(thres_path, 'r')
            featureThres[feature] = float(infile.readline().strip())
            infile.close()
        except:
            logger.error('There is no model for feature: %s' % feature)
            continue
    logger.info('Sentiment Models loaded')
    #run function
    results_ex, results = applySentimentMF(testReviews, modelDict_ex, featureThres_ex,
                                           modelDict, featureThres)
    #save result
    json.dump(results_ex, open(path + 'reviews_test_exMFpred.json', 'w'))
    json.dump(results, open(path + 'reviews_test_MFpred.json', 'w'))
def getFeatures(logger, feature, reviewsSet, busImportantFeatures, userImportantFeatures):
    #business_dict, user_dict = loadData(logger)
    fsw = featureStructureWorker()
    X1 = list()
    Y1 = list()
    X2 = list()
    Y2 = list()
    missed = 0
    for review in reviewsSet:
        feature_set = calculateFeatures(logger, review, feature,
                                        busImportantFeatures, userImportantFeatures)
        reviewFeatures = fsw.getReviewFeaturesExistence(review['features'])
        #busID = review['business_id']
        #userID = review['user_id']
        #if busID not in busImportantFeatures or userID not in userImportantFeatures:
        #    missed += 1
        #    continue
        #
        #bus_tfidf = busImportantFeatures[busID]['tfidfDict'].get(feature,0.0)
        #bus_freq = busImportantFeatures[busID]['featureFreq'].get(feature,0.0)/100.0
        #bus_reviews = busImportantFeatures[busID]['reviewsNumber']
        #bus_sentiment = (busImportantFeatures[review['business_id']]['sentiment'].get(feature,[0.0,0])[0]+1)/2.0
        #
        #user_tfidf = userImportantFeatures[userID]['tfidfDict'].get(feature,0.0)
        #user_freq = userImportantFeatures[userID]['featureFreq'].get(feature,0.0)/100.0
        #user_reviews = userImportantFeatures[userID]['reviewsNumber']
        #user_sentiment = (userImportantFeatures[review['user_id']]['sentiment'].get(feature,[0.0,0])[0]+1)/2.0
        #user_text = userImportantFeatures[userID]['textFeatures']
        #
        #CHECK IF WE HAVE ENOUGH INFORMATION
        #if bus_reviews > 5 and bus_freq > 0.1 and user_reviews > 5:  # 5 1 5
        #    feature_set = [bus_tfidf, bus_freq, bus_sentiment,
        #                   user_tfidf, user_freq, user_sentiment]
        #    #feature_set = [bus_freq]
        #    feature_set += getCriticalFeatures(feature, busID, busImportantFeatures)
        #    feature_set += getCriticalFeatures(feature, userID, userImportantFeatures)
        #    #feature_set += user_text
        if feature_set:
            if feature in reviewFeatures:
                Y1.append(1)
                X1.append(feature_set)
                sent = np.average(reviewFeatures[feature])
                if sent > 0:
                    Y2.append(1)
                    X2.append(feature_set)
                elif sent < 0:
                    Y2.append(0)
                    X2.append(feature_set)
            else:
                Y1.append(0)
                X1.append(feature_set)
        else:
            missed += 1
    return X1, Y1, X2, Y2, missed
def applyTopicModel(logger, path, topic_num):
    stat_file = path + 'yelp_reviews_features_stat.json'
    train_file = path + 'yelp_reviews_features_train.json'
    extrain_file = path + 'yelp_reviews_features_extrain.json'
    test_file = path + 'yelp_reviews_features_test.json'
    #load model
    model_path = path + 'modelLDA/'
    dictionary = corpora.Dictionary.load(model_path + 'dictionary_%d.lda' % topic_num)
    logger.info("Dictionary loaded from: " + model_path + 'dictionary_%d.lda' % topic_num)
    lda_model = models.ldamodel.LdaModel.load(model_path + 'model_%d.lda' % topic_num)
    logger.info("Model loaded from: " + model_path + 'model_%d.lda' % topic_num)
    files = [stat_file, train_file, extrain_file, test_file]
    fsw = featureStructureWorker()
    for infile in files:
        reviews = list()
        for counter, line in enumerate(open(infile, 'r')):
            if not counter % 10000:
                logger.debug('%d reviews loaded' % counter)
            #print infile, line
            #load review information
            review = json.loads(line.strip())
            reviews.append(review)
        #outfile = open(infile.replace('.json','_old.json'),'w')
        #for review in reviews:
        #    outfile.write(json.dumps(review).encode('utf8', 'ignore')+'\n')
        #outfile.close()
        #outfile = open(infile,'w')
        #outname = infile.replace('.json','_old.json')
        outname = infile
        print outname
        outfile = open(outname, 'w')
        for counter, review in enumerate(reviews):
            if not counter % 1000:
                logger.debug('%d reviews processed' % counter)
            if 'features_sent' in review:
                reviewFeatures = fsw.getReviewFeaturesExistence(review['features_sent'])
            else:
                reviewFeatures = fsw.getReviewFeaturesExistence(review['features'])
            text_plus = list()
            text_minus = list()
            for aspect in reviewFeatures:
                sent = np.average(reviewFeatures[aspect])
                if sent > 0:
                    text_plus.append(aspect)
                elif sent < 0:
                    text_minus.append(aspect)
            topics_plus = lda_model[dictionary.doc2bow(text_plus)]
            topics_minus = lda_model[dictionary.doc2bow(text_minus)]
            res = dict()
            if len(topics_plus):
                res['1'] = topTopics(topics_plus)
                #print topics_plus, res['1']
            if len(topics_minus):
                res['0'] = topTopics(topics_minus, sign=-1)
            if 'features_sent' not in review:
                review['features_sent'] = review['features'].copy()
            review['features'] = res.copy()
            outfile.write(json.dumps(review).encode('utf8', 'ignore') + '\n')
            #if counter > 10:
            #    break
        outfile.close()
def learnAndApplyMatching(path, limit=np.Inf):
    logger = logging.getLogger('signature.learnAndApplyMatching')
    logger.info('starting learnAndApplyMatching')
    #get data
    b_file = path + 'businessFeaturesAggregation_stat.json'
    u_file = path + 'userFeaturesAggregation_stat.json'
    train_file = path + 'yelp_reviews_features_extrain.json'
    test_file = path + 'yelp_reviews_test_predictions.json'
    busImportantFeatures = json.loads(open(b_file, 'r').readline())
    logger.info('Important BUSINESS Features loaded')
    userImportantFeatures = json.loads(open(u_file, 'r').readline())
    logger.info('Important USER Features loaded')
    fsw = featureStructureWorker()
    ##load featureWeights
    #infile = open(path+'/featureWeights.json','r')
    #featureWeights = json.loads(infile.readline().strip())
    #infile.close()
    #learn model
    learnData = list()
    learnLabels = list()
    for counter, line in enumerate(open(train_file, 'r')):
        if not counter % 1000:
            logger.debug('%d reviews loaded' % counter)
        if counter > limit:
            break
        review = json.loads(line.strip())
        userID = review['user_id']
        busID = review['business_id']
        features = getFeatures(busID, userID, busImportantFeatures, userImportantFeatures, fsw)
        if features:
            learnData.append(features)
            learnLabels.append(review['stars'])
    model = learnMatchModel(logger, learnData, learnLabels)
    print model.coef_
    for aspect in fsw.featureIdicator:
        if not fsw.featureIdicator[aspect]:
            continue
        print aspect
    #print model
    #exit()
    #apply model
    testReviews = []
    for counter, line in enumerate(open(test_file, 'r')):
        if not counter % 1000:
            logger.debug('%d reviews loaded' % counter)
        if counter > limit:
            break
        review = json.loads(line.strip())
        testReviews.append(review)
    outfile = open(path + 'yelp_reviews_test_predictions.json', 'w')
    for counter, review in enumerate(testReviews):
        if not counter % 1000:
            logger.debug('%d reviews processed' % counter)
        userID = review['user_id']
        busID = review['business_id']
        test_features = getFeatures(busID, userID, busImportantFeatures, userImportantFeatures, fsw)
        if test_features:
            prediction = model.predict(test_features)
        else:
            prediction = None
        review['rating_prediction'] = review.get('rating_prediction', {})
        review['rating_prediction']['match_prediction'] = prediction
        outfile.write(json.dumps(review).encode('utf8', 'ignore') + '\n')
    outfile.close()
def computeStatWorker(testReviews, predType, path, modelDict, classes=[0, 1]):
    logger = logging.getLogger('signature.computeStat.cSW')
    logger.info('start computing Statistic from %d reviews for %s' % (len(testReviews), predType))
    fsw = featureStructureWorker()
    try:
        os.stat(path + 'results/')
    except:
        os.mkdir(path + 'results/')
    output = open(path + 'results/example_%s_%d.txt' % (predType, classes[1]), 'w')
    Jaccard = {i: [] for i in range(1, 7)}
    Jaccard_int = {i: [] for i in range(1, 7)}
    Jaccard_vector = dict()   # thres -> values
    Accuracy_vector = dict()
    Presision = {i: [] for i in range(1, 7)}
    Recall = {i: [] for i in range(1, 7)}
    F1 = {i: [] for i in range(1, 7)}
    Presision_o = {i: 0.0 for i in range(1, 7)}
    Recall_o = {i: 0.0 for i in range(1, 7)}
    F1_o = {i: 0.0 for i in range(1, 7)}
    TP_o = {i: 0.0 for i in range(1, 7)}
    FP_o = {i: 0.0 for i in range(1, 7)}
    FN_o = {i: 0.0 for i in range(1, 7)}
    #RMSE = {i: [] for i in range(1, 7)}
    #RMSE_o = {i: [] for i in range(1, 7)}
    aspectNumAvg = {'good': [], 0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: []}
    y_true = list()
    y_pred_list = [[], [], [], [], [], []]
    for r, review in enumerate(testReviews):
        Jaccard_intersection = {i: 0.0 for i in range(1, 7)}
        Jaccard_union = {i: 0.0 for i in range(1, 7)}
        Jaccard_vector_review = dict()
        Accuracy_vector_review = dict()
        for thres in np.arange(-0.05, 1.05, 0.05):
            Jaccard_vector_review[thres] = {i: [0, 0] for i in range(1, 7)}
            Accuracy_vector_review[thres] = {i: [0, 0] for i in range(1, 7)}
        TP = {i: 0.0 for i in range(1, 7)}
        FP = {i: 0.0 for i in range(1, 7)}
        FN = {i: 0.0 for i in range(1, 7)}
        #RMSE_review = {i: [] for i in range(1, 7)}
        aspectNum = {'good': 0.0, 0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0}
        if predType not in review:
            continue
        aspectNum['good'] = len([aspect for aspect in review[predType] if fsw.featureIdicator[aspect]])
        #print(review['exPredFeatures'])
        for feature in review[predType]:
            if not fsw.featureIdicator[feature]:
                continue
            for i in range(0, 7):
                if abs(classes[1] - review[predType][feature][i]) < 0.5:
                    aspectNum[i] += 1
            #for plots
            y_true.append(int(review[predType][feature][0] == classes[1]))
            for i in range(1, 7):
                y_pred_list[i-1].append(abs(classes[0] - review[predType][feature][i]))
            #for computing quality
            for i in range(1, 7):
                realClass = review[predType][feature][0]
                if review[predType][feature][i] > 0.5:
                    predictedClass = 1
                else:
                    predictedClass = 0
                if realClass == classes[1]:
                    if predictedClass == classes[1]:
                        TP[i] += 1
                        TP_o[i] += 1
                    elif predictedClass == classes[0]:
                        FN[i] += 1
                        FN_o[i] += 1
                elif realClass == classes[0]:
                    if predictedClass == classes[1]:
                        FP[i] += 1
                        FP_o[i] += 1
                if realClass == classes[1] and predictedClass == classes[1]:
                    Jaccard_intersection[i] += 1
                if realClass == classes[1] or predictedClass == classes[1]:
                    Jaccard_union[i] += 1
                #dif = pow(realClass - review[predType][feature][i], 2)
                #RMSE[i].append(dif)
                #RMSE_review[i].append(dif)
            #Jaccard_vector
            for thres in np.arange(-0.05, 1.05, 0.05):
                for i in range(1, 7):
                    if review[predType][feature][i] > thres:
                        predictedClass = 1
                    else:
                        predictedClass = 0
                    if realClass == classes[1] and predictedClass == classes[1]:
                        Jaccard_vector_review[thres][i][0] += 1.0
                    if realClass == classes[1] or predictedClass == classes[1]:
                        Jaccard_vector_review[thres][i][1] += 1.0
                    Accuracy_vector_review[thres][i][1] += 1.0
                    if realClass == predictedClass:
                        Accuracy_vector_review[thres][i][0] += 1.0
        for i in range(1, 7):
            if Jaccard_union[i]:
                Jaccard[i].append(Jaccard_intersection[i] / Jaccard_union[i])
                Jaccard_int[i].append(Jaccard_intersection[i])
                if i == 1:
                    if Jaccard[1][-1] > 0.8:
                        if 'sentPredFeatures' in review:
                            output.write(str(review['sentences']) + '\n--\n' + str(review[predType]) +
                                         '\n--\n' + str(review['sentPredFeatures']) +
                                         '\n====================\n\n')
            pre = 0.0
            rec = 0.0
            f1 = 0.0
            if (TP[i] + FN[i]):
                if (TP[i] + FP[i]):
                    pre = float(TP[i]) / (TP[i] + FP[i])
                else:
                    pre = 0.0
                rec = float(TP[i]) / (TP[i] + FN[i])
                if pre + rec:
                    f1 = 2 * pre * rec / (pre + rec)
                else:
                    f1 = 0.0
                Presision[i].append(pre)
                Recall[i].append(rec)
                F1[i].append(f1)
        #Jaccard_vector
        for thres in np.arange(-0.05, 1.05, 0.05):
            Jaccard_vector[thres] = Jaccard_vector.get(thres, {1: [], 2: [], 3: [], 4: [], 5: [], 6: []})
            Accuracy_vector[thres] = Accuracy_vector.get(thres, {1: [], 2: [], 3: [], 4: [], 5: [], 6: []})
            for i in range(1, 7):
                if Jaccard_vector_review[thres][i][1]:
                    Jaccard_vector[thres][i].append(Jaccard_vector_review[thres][i][0] /
                                                    Jaccard_vector_review[thres][i][1])
                if Accuracy_vector_review[thres][i][1]:
                    Accuracy_vector[thres][i].append(Accuracy_vector_review[thres][i][0] /
                                                     Accuracy_vector_review[thres][i][1])
        #print(aspectNum)
        for key in aspectNum:
            aspectNumAvg[key].append(aspectNum[key])
        #for i in range(1,4):
        #    print(i, Jaccard_vector_review[0.5][i][0], Jaccard_vector_review[0.5][i][1],
        #          Jaccard_vector_review[0.5][i][0]/Jaccard_vector_review[0.5][i][1],
        #          Jaccard_intersection[i], Jaccard_union[i], Jaccard_intersection[i]/Jaccard_union[i])
        #    print(len(Jaccard_vector[0.5][i]), len(Jaccard[i]), np.average(Jaccard_vector[0.5][i]), np.average(Jaccard[i]))
        #for key in RMSE_review:
        #    if len(RMSE_review[key]):
        #        RMSE_o[i].append(np.average(RMSE_review[key]))
    #print(TP_o)
    for i in range(1, 7):
        Presision[i] = np.average(Presision[i])
        Recall[i] = np.average(Recall[i])
        F1[i] = np.average(F1[i])
        if (TP_o[i] + FP_o[i]):
            Presision_o[i] = float(TP_o[i]) / (TP_o[i] + FP_o[i])
        if (TP_o[i] + FN_o[i]):
            Recall_o[i] = float(TP_o[i]) / (TP_o[i] + FN_o[i])
        if Presision_o[i] + Recall_o[i]:
            F1_o[i] = 2 * Presision_o[i] * Recall_o[i] / (Presision_o[i] + Recall_o[i])
        #RMSE_o[i] = np.average(RMSE_o[i])
    PreRec = json.dumps([Presision, Recall, F1, Presision_o, Recall_o, F1_o])
    drawPR(y_true, y_pred_list, predType + ' %d' % classes[1], path, classes)
    drawROC(y_true, y_pred_list, predType + ' %d' % classes[1], path)
    J_v = {'th': [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: []}
    A_v = {'th': [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: []}
    for thres in np.arange(-0.05, 1.05, 0.05):
        J_v['th'].append(thres)
        A_v['th'].append(thres)
        for i in range(1, 7):
            J_v[i].append(np.average(Jaccard_vector[thres][i]))
            A_v[i].append(np.average(Accuracy_vector[thres][i]))
            #if thres == 0.5:
            #    print(i, thres, np.average(Jaccard_vector[thres][i]))
            #    print(i, np.average(Jaccard[i]))
    drawJacAcc(J_v, A_v, predType + ' %d' % classes[1], path)
    drawJaccDist(Jaccard, Jaccard_int, predType + ' %d' % classes[1], path)
    for key in aspectNumAvg:
        aspectNumAvg[key] = np.average(aspectNumAvg[key])
    for i1 in range(1, 7):
        for i2 in range(i1 + 1, 7):
            print(i1, i2, stats.ttest_ind(Jaccard[i1], Jaccard[i2]))
    J = [np.average(Jaccard[i]) for i in range(1, 7)]
    J_int = [np.average(Jaccard_int[i]) for i in range(1, 7)]
    #RMSE_final = [np.average(RMSE[i]) for i in range(1, 7)]
    #RMSE_o_final = [np.average(RMSE_o[i]) for i in range(1, 7)]
    #RMSE = [RMSE_final, RMSE_o_final]
    #print(RMSE)
    return J, J_int, PreRec, aspectNumAvg  #, RMSE
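# Editorial restatement (hypothetical helper) of the per-review Jaccard used
# above for one predictor column: the intersection counts features where both
# the real and predicted class are positive; the union, where either is.
def _demo_review_jaccard():
    real = [1, 0, 1, 1]
    pred = [1, 1, 0, 1]
    inter = sum(1 for a, b in zip(real, pred) if a == 1 and b == 1)
    union = sum(1 for a, b in zip(real, pred) if a == 1 or b == 1)
    return inter / float(union)  # 2 / 4 = 0.5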
def featureImportance(review_dict, ignore_neutral=True):
    logger = logging.getLogger('signature.importantFeatureIdentification.featureImportance')
    logger.info('starting featureImportance')
    fsw = featureStructureWorker()
    N = len(review_dict)
    featuresDF = dict()    # dictionary for counting Document Frequency
    itemFeatures = dict()  # main dictionary with statistics (item: stat)
    for it, item in enumerate(review_dict):
        itemFeatures[item] = {'tfidfDict': {}, 'featureFreq': {}, 'sentiment': {},
                              'reviewsNumber': 0, 'maxFreq': 0, 'textFeatures': [],
                              'critical': [], 'texts': {}}
        itemFeatures[item]['reviewsNumber'] = len(review_dict[item])
        critical = {}
        for r, review in enumerate(review_dict[item]):
            reviewFeatures = fsw.getReviewFeaturesExistence(review['features'])
            #fill in texts
            for sentId in review['features']:
                for feat in review['features'][sentId]:
                    itemFeatures[item]['texts'][feat] = itemFeatures[item]['texts'].get(feat, [])
                    itemFeatures[item]['texts'][feat].append([review['features'][sentId][feat],
                                                              review['sentences'][int(sentId)]])
            #print reviewFeatures
            for feature in fsw.featureIdicator:
                if not fsw.featureIdicator[feature]:
                    continue
                critical[feature] = critical.get(feature, {'+': [], '-': [], '0': [], 'n': [], '1': []})
                if feature not in reviewFeatures:
                    critical[feature]['n'].append(review['stars'])
                else:
                    critical[feature]['1'].append(review['stars'])
                    sent = np.average(reviewFeatures[feature])
                    if sent > 0:
                        critical[feature]['+'].append(review['stars'])
                    elif sent < 0:
                        critical[feature]['-'].append(review['stars'])
                    else:
                        critical[feature]['0'].append(review['stars'])
            for feature in reviewFeatures:
                #work with frequency
                itemFeatures[item]['featureFreq'][feature] = itemFeatures[item]['featureFreq'].get(feature, 0)
                itemFeatures[item]['featureFreq'][feature] += 1
                #work with sentiment
                itemFeatures[item]['sentiment'][feature] = itemFeatures[item]['sentiment'].get(feature, [])
                if len(reviewFeatures[feature]):
                    if ignore_neutral:
                        arr = [x for x in reviewFeatures[feature] if x]
                        if len(arr):
                            itemFeatures[item]['sentiment'][feature].append(np.average(arr))
                        else:
                            itemFeatures[item]['sentiment'][feature].append(0.0)
                    else:
                        itemFeatures[item]['sentiment'][feature].append(np.average(reviewFeatures[feature]))
                else:
                    logger.error('WHY???')
                    itemFeatures[item]['sentiment'][feature].append(0.0)
            #print review.keys()
            if not len(itemFeatures[item]['textFeatures']):
                for tf in review['textFeatures']:
                    itemFeatures[item]['textFeatures'].append(tf)
            else:
                for i, tf in enumerate(review['textFeatures']):
                    itemFeatures[item]['textFeatures'][i] += tf
            #if not r%10:
            #    logger.debug('%d reviews'%r)
        for feature in itemFeatures[item]['featureFreq']:
            #work with frequency
            if itemFeatures[item]['featureFreq'][feature] > itemFeatures[item]['maxFreq']:
                itemFeatures[item]['maxFreq'] = itemFeatures[item]['featureFreq'][feature]
            #work with sentiment
            itemFeatures[item]['sentiment'][feature] = [round(np.average(itemFeatures[item]['sentiment'][feature]), 3),
                                                        len(itemFeatures[item]['sentiment'][feature])]
            #work with 'Document' Frequency (DF)
            featuresDF[feature] = featuresDF.get(feature, 0)
            featuresDF[feature] += 1
        for tf in range(len(itemFeatures[item]['textFeatures'])):
            itemFeatures[item]['textFeatures'][tf] /= itemFeatures[item]['reviewsNumber']
        ##critical
        #for feature in critical:
        #    crit = False
        #    for i in range(4):
        #        for j in range(i,4):
        #            if len(critical[feature][i]) > 2:
        #                if len(critical[feature][j]) > 2:
        #                    if sig_dif(critical[feature][i],critical[feature][j]) < 0.0501:
        #                        crit = True
        #                        print(feature,critical[feature][i],critical[feature][j])
        #                        print(i,j,sig_dif(critical[feature][i],critical[feature][j]))
        #                        print(np.average(critical[feature][i]),np.average(critical[feature][j]))
        #    if crit:
        #        itemFeatures[item]['critical'].append(feature)
        itemFeatures[item]['critical'] = critical.copy()
        if not it % 1000:
            logger.debug('%d items complete' % it)
    #prepare IDF
    for feature in featuresDF:
        featuresDF[feature] = math.log(float(N) / featuresDF[feature])
    logger.debug('IDF prepared for %d items' % it)
    for it, item in enumerate(itemFeatures):
        for feature in itemFeatures[item]['featureFreq']:
            tf = float(itemFeatures[item]['featureFreq'][feature]) / itemFeatures[item]['maxFreq']
            #print feature, tf
            idf = featuresDF[feature]
            itemFeatures[item]['tfidfDict'][feature] = round(tf * idf, 3)
            Ni = len(review_dict[item])
            t = round(100. * itemFeatures[item]['featureFreq'][feature] / Ni, 2)
            itemFeatures[item]['featureFreq'][feature] = t
        itemFeatures[item]['tfidfList'] = [[itemFeatures[item]['tfidfDict'][feature], feature]
                                           for feature in itemFeatures[item]['tfidfDict']]
        itemFeatures[item]['tfidfList'].sort(reverse=True)
        itemFeatures[item]['featureFreqList'] = [[itemFeatures[item]['featureFreq'][feature], feature]
                                                 for feature in itemFeatures[item]['featureFreq']]
        itemFeatures[item]['featureFreqList'].sort(reverse=True)
        if not it % 1000:
            logger.debug('%d items completed' % it)
    return copy.deepcopy(itemFeatures)
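# Editorial restatement (hypothetical helper) of the tf-idf weighting above
# with toy numbers: tf is the within-item count normalized by the item's most
# frequent feature, and idf = log(N / DF) over all items.
def _demo_feature_tfidf():
    import math
    N, DF = 100, 25         # 100 items; the feature occurs in 25 of them
    count, maxFreq = 6, 10  # counts inside one item's reviews
    return round((float(count) / maxFreq) * math.log(float(N) / DF), 3)  # 0.832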
def applyFeatureExistance(busImportantFeatures, userImportantFeatures, testReviews, modelDict, path):
    logger = logging.getLogger('signature.applyFE.aFE')
    logger.info('starting applyFeatureExistance from %d reviews' % len(testReviews))
    fsw = featureStructureWorker()
    featureWeights = dict()
    featureSWeights = dict()
    featureQuality = dict()
    for k, feature in enumerate(fsw.featureIdicator):
        #print(k, feature)
        #if k > 15:
        #    break
        if not fsw.featureIdicator[feature]:
            continue
        if feature not in modelDict:
            continue
        logger.debug('Start working with (%d) %s' % (k, feature))
        #get data
        X1, Y1, X2, Y2, missed = getFeatures(logger, feature, testReviews,
                                             busImportantFeatures, userImportantFeatures)
        #weight = frequency
        featureWeights[feature] = float(sum(Y1)) / len(Y1)
        #weight = sentiment
        featureSWeights[feature] = float(sum(Y2)) / len(Y2)
        #Existence
        #Ypred = [int(x[1] > modelDict[feature][0]) for x in modelDict[feature][1].predict_proba(np.array(X1))]
        Ypred = modelDict[feature][1].predict(np.array(X1))
        Yreal = np.array(Y1)
        quality = list(f1_score(Yreal, Ypred, average=None))
        quality += list(precision_score(Yreal, Ypred, average=None))
        quality += list(recall_score(Yreal, Ypred, average=None))
        #Sentiment
        #YSpred = [int(x[1] > modelDict[feature][2]) for x in modelDict[feature][3].predict_proba(np.array(X2))]
        YSpred = modelDict[feature][3].predict(np.array(X2))
        YSreal = np.array(Y2)
        qualityS = list(f1_score(YSreal, YSpred, average=None))
        qualityS += list(precision_score(YSreal, YSpred, average=None))
        qualityS += list(recall_score(YSreal, YSpred, average=None))
        featureQuality[feature] = [round(featureWeights[feature], 2), len(Y1)]
        featureQuality[feature] += [round(x, 2) for x in quality]
        featureQuality[feature] += [round(featureSWeights[feature], 2), len(Y2)]
        featureQuality[feature] += [round(x, 2) for x in qualityS]
        #print(feature, featureQuality[feature])
        for r, review in enumerate(testReviews):
            existence = 0
            predictedExistence = 0
            X1, Y1, X2, Y2, missed = getFeatures(logger, feature, [review],
                                                 busImportantFeatures, userImportantFeatures)
            if len(Y1):
                #check if the review has enough history
                review['exPredFeatures'] = review.get('exPredFeatures', {})
                existence = Y1[0]
                #print Yreal[r], Ypred[r], modelDict[feature][0]
                prediction = modelDict[feature][1].predict_proba(np.array(X1))[0][1]  # probability of the second class!
                #prediction = float(modelDict[feature][1].predict(np.array(X1))[0])
                #prediction = busImportantFeatures[review['business_id']]['featureFreq'][feature]/100.0
                if prediction >= modelDict[feature][0]:
                    predictedExistence = 1
                else:
                    predictedExistence = 0
                predictedExistence = prediction  # keep the raw probability, overriding the thresholded label
                #print(X1[0], prediction, busImportantFeatures[review['business_id']]['featureFreq'][feature]/100.0)
                randomPrediction = random.random()  # int(random.random() > 0.5)
                simplePrediction = busImportantFeatures[review['business_id']]['featureFreq'][feature] / 100.0
                #simplePrediction = int(busImportantFeatures[review['business_id']]['featureFreq'][feature] > 40)
                basePredictionPos = 1
                basePredictionNeg = 0
                #print(existence, predictedExistence, randomPrediction, simplePrediction, basePredictionPos, basePredictionNeg)
                review['exPredFeatures'][feature] = [existence, predictedExistence, randomPrediction,
                                                     simplePrediction, basePredictionPos, basePredictionNeg]
                #print(feature, review['exPredFeatures'][feature])
            #Sentiment
            if len(Y2):
                review['sentPredFeatures'] = review.get('sentPredFeatures', {})
                sentiment = Y2[0]
                #print Yreal[r], Ypred[r], modelDict[feature][0]
                prediction = modelDict[feature][3].predict_proba(np.array(X2))[0][1]
                #prediction = float(modelDict[feature][3].predict(np.array(X2))[0])
                if prediction >= modelDict[feature][2]:
                    predictedSentiment = 1
                else:
                    predictedSentiment = 0
                predictedSentiment = prediction  # keep the raw probability here as well
                randomSPrediction = random.random()  # int(random.random() > 0.5)
                simpleSPrediction = (busImportantFeatures[review['business_id']]['sentiment'].get(feature, [0.0, 0])[0] + 1) / 2.0
                #simpleSPrediction = int(busImportantFeatures[review['business_id']]['sentiment'].get(feature,[0.0,0])[0] >= -0.5)
                baseSPredictionPos = 1
                baseSPredictionNeg = 0
                review['sentPredFeatures'][feature] = [sentiment, predictedSentiment, randomSPrediction,
                                                       simpleSPrediction, baseSPredictionPos, baseSPredictionNeg]
            if not r % 5000:
                logger.debug('%d reviews processed' % r)
    return testReviews, featureWeights, featureQuality
def applyFeatureExistance(busImportantFeatures, userImportantFeatures, testReviews,
                          modelDict, trainAveragesDict, path):
    logger = logging.getLogger('signature.aFE.applyFE')
    logger.info('starting applyFeatureExistance from %d reviews' % len(testReviews))
    fsw = featureStructureWorker()
    featureWeights = dict()
    featureF1 = dict()
    for i, feature in enumerate(fsw.featureIdicator):
        if not fsw.featureIdicator[feature]:
            continue
        logger.debug('Start working with %s' % feature)
        #get data
        X, Y = getFeatures(logger, feature, testReviews, busImportantFeatures, userImportantFeatures,
                           trainAverages=trainAveragesDict[feature], is_train=False)
        #weight = frequency
        featureWeights[feature] = float(list(Y).count(1)) / len(Y)
        Ypred = [x[1] for x in modelDict[feature][2].predict_proba(np.array(X))]
        Yreal = np.array(Y)
        Ybus = []
        for review in testReviews:
            busID = review['business_id']
            if busID in busImportantFeatures:
                pfreq = busImportantFeatures[busID]['featureFreq'].get(feature, 0.0)
            else:
                pfreq = featureWeights[feature]
            Ybus.append(pfreq)
        featureF1[feature] = drawPR(feature, Yreal, Ypred, Ybus, modelDict[feature][0], path)
        for r, review in enumerate(testReviews):
            #reviewFeatures = fsw.getReviewFeaturesExistence(review['features'])
            review['exPredFeatures'] = review.get('exPredFeatures', {})
            existence = Yreal[r]
            #print Yreal[r], Ypred[r], modelDict[feature][0]
            if Ypred[r] >= modelDict[feature][0]:
                predictedExistence = 1
            else:
                predictedExistence = 0
            #check if feature important
            if existence + predictedExistence > 0.5:
                review['exPredFeatures'][feature] = [existence, predictedExistence]
            #print review['exPredFeatures']
            if not r % 10000:
                logger.debug('%d reviews processed' % r)
    Jaccard = list()
    Jaccard_weighted = list()
    Jaccard_baseline = list()
    Jaccard_baseline_weighted = list()
    TP = 0
    FP = 0
    FN = 0
    TP_all = 0
    FP_all = 0
    FN_all = 0
    TP_bus = 0
    FP_bus = 0
    FN_bus = 0
    TP_int = 0
    FP_int = 0
    FN_int = 0
    for r, review in enumerate(testReviews):
        Jaccard_intersection = 0.0
        Jaccard_union = 0.0
        Jaccard_intersection_weighted = 0.0
        Jaccard_union_weighted = 0.0
        Jaccard_intersection_baseline = 0.0
        Jaccard_union_baseline = 0.0
        Jaccard_intersection_baseline_weighted = 0.0
        Jaccard_union_baseline_weighted = 0.0
        busID = review['business_id']
        if busID in busImportantFeatures:
            busAspects = set([f for f in busImportantFeatures[busID]['featureFreq']
                              if busImportantFeatures[busID]['featureFreq'][f] > 10 and
                              busImportantFeatures[busID]['sentiment'][f][1] > 1])
        else:
            busAspects = set([f for f in fsw.featureIdicator if fsw.featureIdicator[f]])
        #userID = review['user_id']
        #if userID in userImportantFeatures:
        #    userAspects = set([f for f in userImportantFeatures[userID]['featureFreq']
        #                       if userImportantFeatures[userID]['featureFreq'][f] > 10 and
        #                       userImportantFeatures[userID]['sentiment'][f][1] > 1])
        #else:
        #    userAspects = set([f for f in fsw.featureIdicator if fsw.featureIdicator[f]])
        #interBU = userAspects.intersection(busAspects)
        #building INTERSECTION
        busID = review['business_id']
        if busID in busImportantFeatures:
            busImpAspects = set([f for f in busImportantFeatures[busID]['featureFreq']
                                 if busImportantFeatures[busID]['featureFreq'][f] > 50 and
                                 busImportantFeatures[busID]['sentiment'][f][1] > 1])
            busIntAspects = set([f for f in busImportantFeatures[busID]['featureFreq']
                                 if busImportantFeatures[busID]['featureFreq'][f] > 10 and
                                 busImportantFeatures[busID]['sentiment'][f][1] > 1])
        else:
            busImpAspects = set([f for f in fsw.featureIdicator if fsw.featureIdicator[f]])
            busIntAspects = set([f for f in fsw.featureIdicator if fsw.featureIdicator[f]])
        userID = review['user_id']
        if userID in userImportantFeatures:
            userAspects = set([f for f in userImportantFeatures[userID]['featureFreq']
                               if userImportantFeatures[userID]['featureFreq'][f] > 10 and
                               userImportantFeatures[userID]['sentiment'][f][1] > 1])
        else:
            userAspects = set([f for f in fsw.featureIdicator if fsw.featureIdicator[f]])
        interBU = busImpAspects.union(userAspects.intersection(busIntAspects))
        for feature in review['exPredFeatures']:
            if review['exPredFeatures'][feature] == [1, 1]:
                TP += 1
            elif review['exPredFeatures'][feature] == [0, 1]:
                FP += 1
            if review['exPredFeatures'][feature] == [1, 0]:
                FN += 1
            #baseline all
            if review['exPredFeatures'][feature][0] == 1:
                TP_all += 1
            #baseline business
            if feature in busAspects and review['exPredFeatures'][feature][0] == 1:
                TP_bus += 1
            elif feature in busAspects and review['exPredFeatures'][feature][0] == 0:
                FP_bus += 1
            elif feature not in busAspects and review['exPredFeatures'][feature][0] == 1:
                FN_bus += 1
            #baseline intersection
            if feature in interBU and review['exPredFeatures'][feature][0] == 1:
                TP_int += 1
            elif feature in interBU and review['exPredFeatures'][feature][0] == 0:
                FP_int += 1
            elif feature not in interBU and review['exPredFeatures'][feature][0] == 1:
                FN_int += 1
            #print TP_int, FP_int, FN_int
            if review['exPredFeatures'][feature] == [1, 1]:
                Jaccard_intersection += 1.0
                Jaccard_intersection_weighted += featureWeights[feature]
            Jaccard_union += 1.0
            Jaccard_union_weighted += featureWeights[feature]
            if review['exPredFeatures'][feature][0] == 1:
                Jaccard_intersection_baseline += 1.0
                Jaccard_intersection_baseline_weighted += featureWeights[feature]
        for feature in fsw.featureIdicator:
            if fsw.featureIdicator[feature]:
                FP_all += 1
                Jaccard_union_baseline += 1
                Jaccard_union_baseline_weighted += featureWeights[feature]
        if Jaccard_union:
            Jaccard.append(Jaccard_intersection / Jaccard_union)
        if Jaccard_union_weighted:
            Jaccard_weighted.append(Jaccard_intersection_weighted / Jaccard_union_weighted)
        if Jaccard_union_baseline:
            Jaccard_baseline.append(Jaccard_intersection_baseline / Jaccard_union_baseline)
        if Jaccard_union_baseline_weighted:
            Jaccard_baseline_weighted.append(Jaccard_intersection_baseline_weighted /
                                             Jaccard_union_baseline_weighted)
    #SIGNATURE METHOD
    Presision = float(TP) / (TP + FP)
    Recall = float(TP) / (TP + FN)
    F1 = 2 * Presision * Recall / (Presision + Recall)
    PreRec = [Presision, Recall, F1]
    #baseline ALL
    Presision_all = float(TP_all) / (TP_all + FP_all)
    Recall_all = float(TP_all) / (TP_all + FN_all)
    F1_all = 2 * Presision_all * Recall_all / (Presision_all + Recall_all)
    PreRec_all = [Presision_all, Recall_all, F1_all]
    #baseline BUSINESS
    Presision_bus = float(TP_bus) / (TP_bus + FP_bus)
    Recall_bus = float(TP_bus) / (TP_bus + FN_bus)
    F1_bus = 2 * Presision_bus * Recall_bus / (Presision_bus + Recall_bus)
    PreRec_bus = [Presision_bus, Recall_bus, F1_bus]
    #print TP_int, FP_int
    #baseline INTERSECTION
    Presision_int = float(TP_int) / (TP_int + FP_int)
    Recall_int = float(TP_int) / (TP_int + FN_int)
    F1_int = 2 * Presision_int * Recall_int / (Presision_int + Recall_int)
    PreRec_int = [Presision_int, Recall_int, F1_int]
    return (testReviews, featureWeights,
            [[np.average(Jaccard), np.average(Jaccard_weighted)],
             [np.average(Jaccard_baseline), np.average(Jaccard_baseline_weighted)]],
            featureF1,
            [PreRec, PreRec_all, PreRec_bus, PreRec_int])
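# Editorial restatement (hypothetical helper) of the precision/recall/F1
# arithmetic applied to each baseline above, with toy counts:
# P = TP/(TP+FP), R = TP/(TP+FN), F1 = 2PR/(P+R).
def _demo_pre_rec_f1():
    TP, FP, FN = 80.0, 20.0, 40.0
    P = TP / (TP + FP)           # 0.8
    R = TP / (TP + FN)           # ~0.667
    return 2 * P * R / (P + R)   # ~0.727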
def applySentimentMF(testReviews, modelDict, featureThres, featureWeights):
    logger = logging.getLogger('signature.aSMF.applySentimentMF')
    logger.info('starting applySentimentMatrixFactorization from %d reviews' % len(testReviews))
    fsw = featureStructureWorker()
    feature_data = dict()
    reviewDict = dict()
    for r, review in enumerate(testReviews):
        review['predSentiments'] = dict()
        busID = review['business_id']
        userID = review['user_id']
        sentiments = fsw.getReviewFeaturesSentiment(review['features'])
        #print sentiments
        ID = busID + '###' + userID
        reviewDict[ID] = review
        for feature in review['exPredFeatures']:
            if not fsw.featureIdicator.get(feature, None):
                continue
            sentiment = np.average(sentiments.get(feature, [0.0]))
            if feature in feature_data:
                feature_data[feature]['id'].append(ID)
                feature_data[feature]['user'].append(userID)
                feature_data[feature]['item'].append(busID)
                feature_data[feature]['rating'].append(sentiment)
            else:
                feature_data[feature] = {'id': [ID], 'user': [userID],
                                         'item': [busID], 'rating': [sentiment]}
        if not r % 1000:
            logger.debug('%d reviews processed' % r)
    rmse = list()
    rmse_weighted = list()
    rmse_baseline = list()
    rmse_baseline_weighted = list()
    accuracy = list()
    accuracy_weighted = list()
    accuracy_baseline = list()
    accuracy_baseline_weighted = list()
    weighted_sum = list()
    for f, feature in enumerate(feature_data):
        #if f > 0:
        #    break
        #print feature, feature_data[feature]
        testData = graphlab.SFrame(feature_data[feature])
        prediction = modelDict[feature].predict(testData)
        testData['prediction'] = prediction
        for i, ID in enumerate(testData['id']):
            #if testData['prediction'][i] == featureThres[feature]:
            #    sent_pred = 0.0
            #sent_pred = 1.0 if testData['prediction'][i] > featureThres[feature] else -1.0
            sent_pred = testData['prediction'][i]  # - featureThres[feature]
            reviewDict[ID]['predSentiments'][feature] = sent_pred
            #print reviewDict[ID]['features']
            real_sent = feature_data[feature]['rating'][i]
            #print real_sent, sent_pred, accuracy
            if real_sent * sent_pred > 0.0:
                accuracy.append(1.0)
                accuracy_weighted.append(featureWeights[feature])
            elif real_sent * sent_pred < 0.0:
                accuracy.append(0.0)
                accuracy_weighted.append(0.0)
            #print real_sent, sent_pred, accuracy
            if real_sent * featureThres[feature] > 0:
                accuracy_baseline.append(1.0)
                accuracy_baseline_weighted.append(featureWeights[feature])
            elif real_sent * featureThres[feature] < 0:
                accuracy_baseline.append(0.0)
                accuracy_baseline_weighted.append(0.0)
            rmse.append(pow((real_sent - sent_pred), 2))
            rmse_weighted.append(pow((real_sent - sent_pred), 2) * featureWeights[feature])
            rmse_baseline.append(pow((real_sent - featureThres[feature]), 2))
            rmse_baseline_weighted.append(pow((real_sent - featureThres[feature]), 2) * featureWeights[feature])
            weighted_sum.append(featureWeights[feature])
        if not f % 1:
            logger.debug('%d features sentiments predicted' % f)
    #RMSE
    rmse = np.average(rmse)
    #weighted RMSE
    rmse_weighted = np.sum(rmse_weighted) / np.sum(weighted_sum)
    #RMSE baseline
    rmse_baseline = np.average(rmse_baseline)
    #RMSE baseline weighted
    rmse_baseline_weighted = np.sum(rmse_baseline_weighted) / np.sum(weighted_sum)
    #ACCURACY
    accuracy = np.average(accuracy)
    #weighted accuracy
    accuracy_weighted = np.sum(accuracy_weighted) / np.sum(weighted_sum)
    #accuracy baseline
    accuracy_baseline = np.average(accuracy_baseline)
    #accuracy baseline weighted
    accuracy_baseline_weighted = np.sum(accuracy_baseline_weighted) / np.sum(weighted_sum)
    return ([reviewDict[i] for i in reviewDict],
            [rmse, rmse_weighted, rmse_baseline, rmse_baseline_weighted,
             accuracy, accuracy_weighted, accuracy_baseline, accuracy_baseline_weighted])
def applySentimentMF(testReviews, modelDict_ex, featureThres_ex, modelDict, featureThres):
    logger = logging.getLogger('signature.aSMF.applySentimentMF')
    logger.info('starting applySentimentMatrixFactorization from %d reviews' % len(testReviews))
    fsw = featureStructureWorker()
    aspect_data = dict()
    reviewDict_ex = dict()
    reviewDict = dict()
    for r, review in enumerate(testReviews):
        busID = review['business_id']
        userID = review['user_id']
        reviewID = review['review_id']
        for aspect in fsw.featureIdicator:
            if not fsw.featureIdicator.get(aspect, None):
                continue
            if aspect in aspect_data:
                aspect_data[aspect]['id'].append(reviewID)
                aspect_data[aspect]['user'].append(userID)
                aspect_data[aspect]['item'].append(busID)
            else:
                aspect_data[aspect] = {'id': [reviewID], 'user': [userID], 'item': [busID]}
        if not r % 5000:
            logger.debug('%d reviews processed' % r)
    for f, aspect in enumerate(aspect_data):
        logger.info('Processing (%d) %s' % (f, aspect))
        if aspect not in modelDict_ex or aspect not in modelDict:
            continue
        testData = graphlab.SFrame(aspect_data[aspect])
        #print('test prepared')
        prediction_ex = modelDict_ex[aspect].predict(testData)
        prediction = modelDict[aspect].predict(testData)
        #print('sentiment predicted')
        testData['prediction_ex'] = prediction_ex
        testData['prediction'] = prediction
        #existence
        testData_prediction_ex = list(testData['prediction_ex'])
        for i, prediction_ex in enumerate(testData_prediction_ex):
            reviewID = aspect_data[aspect]['id'][i]
            reviewDict_ex[reviewID] = reviewDict_ex.get(reviewID, {})
            ex_pred_adjust = (prediction_ex * 0.5 / featureThres_ex[aspect])
            if ex_pred_adjust < 0:
                ex_pred_adjust = 0
            if ex_pred_adjust > 1:
                ex_pred_adjust = 1
            reviewDict_ex[reviewID][aspect] = ex_pred_adjust
        #sentiment
        testData_prediction = list(testData['prediction'])
        for i, sent_prediction in enumerate(testData_prediction):
            reviewID = aspect_data[aspect]['id'][i]
            reviewDict[reviewID] = reviewDict.get(reviewID, {})
            sent_pred_adjust = (sent_prediction * 0.5 / featureThres[aspect])
            if sent_pred_adjust < 0:
                sent_pred_adjust = 0
            if sent_pred_adjust > 1:
                sent_pred_adjust = 1
            reviewDict[reviewID][aspect] = sent_pred_adjust
        if not f % 1:
            logger.debug('%d features sentiments predicted' % f)
    return reviewDict_ex, reviewDict
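# Editorial restatement (hypothetical helper) of the rescaling above: the raw
# MF score is scaled so the learned threshold maps to 0.5, then clipped to [0, 1].
def _demo_threshold_rescale(prediction, threshold):
    p = prediction * 0.5 / threshold
    return min(max(p, 0.0), 1.0)
#_demo_threshold_rescale(0.8, 0.4) -> 1.0; _demo_threshold_rescale(0.2, 0.4) -> 0.25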
def applySMF(path, limit=np.Inf):
    logger = logging.getLogger('signature.aSMF')
    logger.info('starting applySentimentMF')
    #get data
    r_file = path + 'yelp_reviews_test_predictions.json'
    testReviews = list()
    for counter, line in enumerate(open(r_file, 'r')):
        if not counter % 1000:
            logger.debug('%d reviews loaded' % counter)
        if counter > limit:
            break
        testReviews.append(json.loads(line.strip()))
    logger.info('Test Reviews loaded from %s' % r_file)
    #load model
    modelDict = dict()
    featureThres = dict()
    fsw = featureStructureWorker()
    for feature in fsw.featureIdicator:
        if not fsw.featureIdicator[feature]:
            continue
        try:
            modelPath = path + '/sentimentModels/%s_sentiment.model' % feature
            print modelPath
            modelDict[feature] = graphlab.load_model(modelPath)
            #load threshold
            thres_path = path + '/sentimentModels/%s_sentiment.threshold' % feature
            infile = open(thres_path, 'r')
            featureThres[feature] = float(infile.readline().strip())
            infile.close()
        except:
            logger.error('There is no model for feature: %s' % feature)
            continue
    logger.info('Models loaded')
    #load featureWeights
    infile = open(path + '/featureWeights.json', 'r')
    featureWeights = json.loads(infile.readline().strip())
    infile.close()
    #run function
    reviewsPrediction, results = applySentimentMF(testReviews, modelDict, featureThres, featureWeights)
    #save result
    outfile = open(path + 'yelp_reviews_test_predictions.json', 'w')
    for review in reviewsPrediction:
        outfile.write(json.dumps(review).encode('utf8', 'ignore') + '\n')
    outfile.close()
    try:
        os.stat(path + '/results/')
    except:
        os.mkdir(path + '/results/')
    outfile = open(path + '/results/Sentiment_prediction.txt', 'w')
    outfile.write('RMSE = %f\nRMSE_weighted = %f' % (results[0], results[1]))
    outfile.write('\n\nRMSE_baseline = %f\nRMSE_baseline_weighted = %f' % (results[2], results[3]))
    outfile.write('\n===============\n\nAccuracy = %f\nAccuracy_weighted = %f' % (results[4], results[5]))
    outfile.write('\n\nAccuracy_baseline = %f\nAccuracy_baseline_weighted = %f' % (results[6], results[7]))
    outfile.close()
def featureImportance(review_dict, ignore_neutral=True):
    logger = logging.getLogger('signature.IFI.fI')
    logger.info('starting featureImportance')
    fsw = featureStructureWorker()
    N = len(review_dict)
    featuresDF = dict()    # dictionary for counting Document Frequency
    itemFeatures = dict()  # main dictionary with statistics (item: stat)
    for it, item in enumerate(review_dict):
        itemFeatures[item] = {'tfidfDict': {}, 'featureFreq': {}, 'sentiment': {},
                              'reviewsNumber': 0, 'maxFreq': 0, 'textFeatures': []}
        itemFeatures[item]['reviewsNumber'] = len(review_dict[item])
        for r, review in enumerate(review_dict[item]):
            reviewFeatures = fsw.getReviewFeaturesExistence(review['features'])
            #print reviewFeatures
            for feature in reviewFeatures:
                #work with frequency
                itemFeatures[item]['featureFreq'][feature] = itemFeatures[item]['featureFreq'].get(feature, 0)
                itemFeatures[item]['featureFreq'][feature] += 1
                #work with sentiment
                itemFeatures[item]['sentiment'][feature] = itemFeatures[item]['sentiment'].get(feature, [])
                if len(reviewFeatures[feature]):
                    if ignore_neutral:
                        arr = [x for x in reviewFeatures[feature] if x]
                        if len(arr):
                            itemFeatures[item]['sentiment'][feature].append(np.average(arr))
                        else:
                            itemFeatures[item]['sentiment'][feature].append(0.0)
                    else:
                        itemFeatures[item]['sentiment'][feature].append(np.average(reviewFeatures[feature]))
                else:
                    itemFeatures[item]['sentiment'][feature].append(0.0)
            #print review.keys()
            if not len(itemFeatures[item]['textFeatures']):
                for tf in review['textFeatures']:
                    itemFeatures[item]['textFeatures'].append(tf)
            else:
                for i, tf in enumerate(review['textFeatures']):
                    itemFeatures[item]['textFeatures'][i] += tf
            #if not r%10:
            #    logger.debug('%d reviews'%r)
        for feature in itemFeatures[item]['featureFreq']:
            #work with frequency
            if itemFeatures[item]['featureFreq'][feature] > itemFeatures[item]['maxFreq']:
                itemFeatures[item]['maxFreq'] = itemFeatures[item]['featureFreq'][feature]
            #work with sentiment
            itemFeatures[item]['sentiment'][feature] = [round(np.average(itemFeatures[item]['sentiment'][feature]), 3),
                                                        len(itemFeatures[item]['sentiment'][feature])]
            #work with 'Document' Frequency (DF)
            featuresDF[feature] = featuresDF.get(feature, 0)
            featuresDF[feature] += 1
        for tf in range(len(itemFeatures[item]['textFeatures'])):
            itemFeatures[item]['textFeatures'][tf] /= itemFeatures[item]['reviewsNumber']
        if not it % 1000:
            logger.debug('%d items' % it)
    #prepare IDF
    for feature in featuresDF:
        featuresDF[feature] = math.log(float(N) / featuresDF[feature])
    logger.debug('IDF prepared for %d items' % it)
    for it, item in enumerate(itemFeatures):
        for feature in itemFeatures[item]['featureFreq']:
            tf = float(itemFeatures[item]['featureFreq'][feature]) / itemFeatures[item]['maxFreq']
            #print feature, tf
            idf = featuresDF[feature]
            itemFeatures[item]['tfidfDict'][feature] = round(tf * idf, 3)
            Ni = len(review_dict[item])
            t = round(100. * itemFeatures[item]['featureFreq'][feature] / Ni, 2)
            itemFeatures[item]['featureFreq'][feature] = t
        itemFeatures[item]['tfidfList'] = [[itemFeatures[item]['tfidfDict'][feature], feature]
                                           for feature in itemFeatures[item]['tfidfDict']]
        itemFeatures[item]['tfidfList'].sort(reverse=True)
        itemFeatures[item]['featureFreqList'] = [[itemFeatures[item]['featureFreq'][feature], feature]
                                                 for feature in itemFeatures[item]['featureFreq']]
        itemFeatures[item]['featureFreqList'].sort(reverse=True)
        if not it % 1000:
            logger.debug('%d items completed' % it)
    return copy.deepcopy(itemFeatures)
def learnSentimentMatrixFactorization(trainReviews, path):
    logger = logging.getLogger('signature.learnSentimentMF.Worker')
    logger.info('starting learnSentimentMatrixFactorization from %d reviews' % len(trainReviews))
    fsw = featureStructureWorker()
    modelDict_ex = dict()    # existence models: does the review mention the feature?
    featureThres_ex = dict()
    modelDict = dict()       # sentiment models: is the mention positive?
    featureThres = dict()
    for i, feature in enumerate(fsw.featureIdicator):
        if not fsw.featureIdicator[feature]:
            continue
        logger.debug('Start working with (%d) %s' % (i, feature))
        learnData_ex = {'user': [], 'item': [], 'rating': []}
        learnData = {'user': [], 'item': [], 'rating': []}
        for j, review in enumerate(trainReviews):
            reviewFeatures = fsw.getReviewFeaturesSentiment(review['features'])
            busID = review['business_id']
            userID = review['user_id']
            # existence data: every review contributes a 0/1 label
            learnData_ex['user'].append(userID)
            learnData_ex['item'].append(busID)
            if feature in reviewFeatures:
                learnData_ex['rating'].append(1)
            else:
                learnData_ex['rating'].append(0)
            if feature not in reviewFeatures:
                continue
            # sentiment data: only non-neutral mentions contribute
            sent = np.average(reviewFeatures[feature])
            if sent:
                learnData['user'].append(userID)
                learnData['item'].append(busID)
                if sent > 0:
                    learnData['rating'].append(1)
                elif sent < 0:
                    learnData['rating'].append(0)
        # cross-validated threshold search for each task
        if len(learnData_ex['rating']):
            data_ex = graphlab.SFrame(learnData_ex)
            featureThres_ex[feature], modelDict_ex[feature] = getBestMFThres(logger, feature, data_ex, path)
        if len(learnData['rating']):
            data = graphlab.SFrame(learnData)
            featureThres[feature], modelDict[feature] = getBestMFThres(logger, feature, data, path)
    return modelDict_ex, featureThres_ex, modelDict, featureThres
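# getBestMFThres is defined elsewhere in the repository. Purely as an
# illustration of the shape of such a helper -- the split ratio, the recommender
# settings, and the accuracy criterion below are all assumptions, not the
# repository's actual implementation:
def getBestMFThres_sketch(logger, feature, data, path):
    train, valid = data.random_split(0.8, seed=1)
    model = graphlab.factorization_recommender.create(
        train, user_id='user', item_id='item', target='rating', verbose=False)
    preds = model.predict(valid)
    bestThres, bestAcc = 0.5, 0.0
    for t in [x / 100.0 for x in range(10, 91, 5)]:
        # fraction of validation rows where thresholded prediction matches the label
        acc = float(((preds > t) == (valid['rating'] == 1)).sum()) / len(valid)
        if acc > bestAcc:
            bestThres, bestAcc = t, acc
    logger.debug('%s: best threshold %.2f (accuracy %.3f)' % (feature, bestThres, bestAcc))
    return bestThres, model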
def getFeatures(logger, feature, reviewsSet, busImportantFeatures, userImportantFeatures,
                trainAverages=None, is_train=True):
    business_dict, user_dict = loadData(logger)
    gP = genderPredictor()
    gP.load()
    cW = categoryWorker()
    cW.load()
    if is_train:
        trainAverages = {"mean": [], "std": []}
    # at test time the caller passes the trainAverages learned on the train set
    fsw = featureStructureWorker()
    X = list()
    Y = list()
    for review in reviewsSet:
        reviewFeatures = fsw.getReviewFeaturesExistence(review["features"])
        if feature in reviewFeatures:
            existence = 1
        else:
            existence = 0
        busID = review["business_id"]
        userID = review["user_id"]
        bus_basic_features = getBasicFeatures(feature, busID, busImportantFeatures, is_train)
        user_basic_features = getBasicFeatures(feature, userID, userImportantFeatures, is_train)
        bus_additional_features = getBusinessFeatures(busID, business_dict, cW)
        user_additional_features = getUserFeatures(userID, user_dict, gP)
        Y.append(existence)
        X.append(bus_basic_features + user_basic_features +
                 bus_additional_features + user_additional_features)
        if is_train:
            # lazily create one accumulator per feature dimension
            if not len(trainAverages["mean"]):
                for i in range(len(X[0])):
                    trainAverages["mean"].append([])
                    trainAverages["std"].append([])
            for i, value in enumerate(X[-1]):
                if value is not None:
                    trainAverages["mean"][i].append(value)
    if is_train:
        # collapse the accumulated values into per-dimension mean and std
        for i in range(len(trainAverages["mean"])):
            trainAverages["std"][i] = np.std(trainAverages["mean"][i])
            trainAverages["mean"][i] = np.average(trainAverages["mean"][i])
    # z-score normalization; missing values become 0.0 (the mean after scaling)
    for vector in X:
        for i in range(len(vector)):
            if vector[i] is None:
                vector[i] = 0.0
            elif trainAverages["std"][i]:
                vector[i] = (vector[i] - trainAverages["mean"][i]) / trainAverages["std"][i]
            else:
                vector[i] = vector[i] - trainAverages["mean"][i]
    if is_train:
        return X, Y, trainAverages
    return X, Y
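# The normalization block above is a plain z-score with missing values mapped to
# 0.0, i.e. the mean after scaling. A toy check of that behavior with invented
# numbers:
def _normalization_toy_example():
    values = [3.0, 5.0, None, 7.0]                     # one missing entry
    observed = [v for v in values if v is not None]
    mean, std = np.average(observed), np.std(observed)
    # the None lands exactly on the (scaled) mean
    return [0.0 if v is None else (v - mean) / std for v in values]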
def learnFeatureExistance(busImportantFeatures, userImportantFeatures, trainReviews, path):
    logger = logging.getLogger('signature.learnFE')
    logger.info('starting learnFeatureExistance from %d reviews' % len(trainReviews))
    fsw = featureStructureWorker()
    modelDict = dict()
    missed_prediction = dict()  # collected for diagnostics
    for f, feature in enumerate(fsw.featureIdicator):
        if not fsw.featureIdicator[feature]:
            continue
        logger.info('Start working with (%d) %s' % (f, feature))
        # get data: X1/Y1 is the existence task, X2/Y2 the sentiment task
        X1, Y1, X2, Y2, missed = getFeatures(logger, feature, trainReviews,
                                             busImportantFeatures, userImportantFeatures)
        missed_prediction[feature] = [missed, len(Y1)]
        logger.debug('Got features for %d (%d/%d) reviews (%d of them pos(%d)/neg(%d))' %
                     (len(Y1), sum(Y1), len(Y1) - sum(Y1), len(Y2), sum(Y2), len(Y2) - sum(Y2)))
        # skip features without enough examples of both classes for either task
        if len(Y1) < 100 or sum(Y1) < 50 or len(Y1) - sum(Y1) < 50:
            continue
        if len(Y2) < 100 or sum(Y2) < 50 or len(Y2) - sum(Y2) < 50:
            continue
        # existence prediction: Logistic Regression with threshold tuning
        bestThres, bestQ, logmodel = getLogModel(logger, feature, X1, Y1, path)
        logger.info('Sentiment prediction for (%d) %s' % (f, feature))
        # sentiment prediction: Logistic Regression with threshold tuning
        bestThres_2, bestQ_2, logmodel_2 = getLogModel(logger, feature, X2, Y2, path)
        feat_info = ([len(Y1), sum(Y1), len(Y1) - sum(Y1)] + bestQ +
                     [len(Y2), sum(Y2), len(Y2) - sum(Y2)] + bestQ_2)
        modelDict[feature] = [bestThres, logmodel, bestThres_2, logmodel_2, feat_info]
    return modelDict
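# getLogModel is also defined elsewhere in the repository. A plausible shape for
# it, assuming scikit-learn's LogisticRegression and an F1-driven threshold
# search -- every detail here is an assumption, not the actual implementation:
def getLogModel_sketch(logger, feature, X, Y, path):
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import f1_score
    from sklearn.model_selection import train_test_split
    trainX, testX, trainY, testY = train_test_split(
        np.array(X), np.array(Y), test_size=0.2, random_state=0)
    model = LogisticRegression()
    model.fit(trainX, trainY)
    probs = model.predict_proba(testX)[:, 1]
    bestThres, bestQ = 0.5, 0.0
    for t in np.arange(0.05, 0.95, 0.05):
        q = f1_score(testY, (probs > t).astype(int))
        if q > bestQ:
            bestThres, bestQ = t, q
    logger.debug('%s: threshold %.2f, F1 %.3f' % (feature, bestThres, bestQ))
    # the quality score is returned as a list so callers can splice it into feat_info
    return bestThres, [bestQ], model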
def aspectStat(path):
    logger = logging.getLogger('signature.aspectStat')
    logger.info('start computing aspect Stat')
    # get data
    b_file = path + '/businessProfile.json'
    u_file = path + '/userProfile.json'
    busImportantFeatures = json.loads(open(b_file, 'r').readline())
    logger.info('Important BUSINESS Features loaded')
    userImportantFeatures = json.loads(open(u_file, 'r').readline())
    logger.info('Important USER Features loaded')
    aspectStat = dict()
    fsw = featureStructureWorker()
    for f, aspect in enumerate(fsw.featureIdicator):
        aspectStat[aspect] = {'total': 0, 'bus10': 0, 'user10': 0, 'posNum': 0, 'negNum': 0,
                              'busDiff+-': 0, 'userDiff+-': 0, 'busDiff01': 0, 'userDiff01': 0}
        for busID in busImportantFeatures:
            bus_reviews = busImportantFeatures[busID]['reviewsNumber']
            bus_freq = busImportantFeatures[busID]['featureFreq'].get(aspect, 0.0)
            # featureFreq is stored as a percentage, so convert back to a count
            aspectStat[aspect]['total'] += bus_freq / 100.0 * bus_reviews
            if aspect in busImportantFeatures[busID]['critical']:
                aspectStat[aspect]['posNum'] += len(busImportantFeatures[busID]['critical'][aspect]['+'])
                aspectStat[aspect]['negNum'] += len(busImportantFeatures[busID]['critical'][aspect]['-'])
            if bus_freq > 10:
                aspectStat[aspect]['bus10'] += 1
                if aspect in busImportantFeatures[busID]['critical']:
                    exist = busImportantFeatures[busID]['critical'][aspect]['1']
                    pos = busImportantFeatures[busID]['critical'][aspect]['+']
                    neg = busImportantFeatures[busID]['critical'][aspect]['-']
                    none = busImportantFeatures[busID]['critical'][aspect]['n']
                    if sig_dif(pos, neg) < 0.10501:
                        aspectStat[aspect]['busDiff+-'] += 1
                    if sig_dif(exist, none) < 0.10501:
                        aspectStat[aspect]['busDiff01'] += 1
        for userID in userImportantFeatures:
            user_freq = userImportantFeatures[userID]['featureFreq'].get(aspect, 0.0)
            if user_freq > 1:
                aspectStat[aspect]['user10'] += 1
                if aspect in userImportantFeatures[userID]['critical']:
                    exist = userImportantFeatures[userID]['critical'][aspect]['1']
                    pos = userImportantFeatures[userID]['critical'][aspect]['+']
                    neg = userImportantFeatures[userID]['critical'][aspect]['-']
                    none = userImportantFeatures[userID]['critical'][aspect]['n']
                    if sig_dif(pos, neg) < 0.10501:
                        aspectStat[aspect]['userDiff+-'] += 1
                    if sig_dif(exist, none) < 0.10501:
                        aspectStat[aspect]['userDiff01'] += 1
        logger.debug('done with (%d) %s' % (f, aspect))
    # write the summary table (one row per aspect)
    try:
        os.stat(path + '/results/')
    except:
        os.mkdir(path + '/results/')
    outfile = open(path + '/results/aspectStatistics.txt', 'w')
    outfile.write('aspect\ttotal\tbus10\tuser10\tposNum\tnegNum\t'
                  'busDiff+-\tuserDiff+-\tbusDiff01\tuserDiff01\n')
    aspects = list(aspectStat.keys())
    aspects.sort()
    for aspect in aspects:
        r = aspectStat[aspect]
        outfile.write('%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n' %
                      (aspect, r['total'], r['bus10'], r['user10'], r['posNum'], r['negNum'],
                       r['busDiff+-'], r['userDiff+-'], r['busDiff01'], r['userDiff01']))
    outfile.close()
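# sig_dif is not shown in this file. Since its return value is compared against
# a p-value-like cutoff (0.10501), it plausibly runs a two-sample significance
# test on the star ratings behind each group. A hypothetical stand-in using
# scipy (an assumption, not the repository's definition):
def sig_dif_sketch(sample_a, sample_b):
    from scipy import stats
    if len(sample_a) < 2 or len(sample_b) < 2:
        return 1.0  # not enough data to claim a difference
    _, p_value = stats.ttest_ind(sample_a, sample_b, equal_var=False)
    return p_value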
def applyTopicModel(logger, path, topic_num):
    stat_file = path + "yelp_reviews_features_stat.json"
    train_file = path + "yelp_reviews_features_train.json"
    extrain_file = path + "yelp_reviews_features_extrain.json"
    test_file = path + "yelp_reviews_features_test.json"
    # load the LDA model and its dictionary
    model_path = path + "modelLDA/"
    dictionary = corpora.Dictionary.load(model_path + "dictionary_%d.lda" % topic_num)
    logger.info("Dictionary loaded from: " + model_path + "dictionary_%d.lda" % topic_num)
    lda_model = models.ldamodel.LdaModel.load(model_path + "model_%d.lda" % topic_num)
    logger.info("Model loaded from: " + model_path + "model_%d.lda" % topic_num)
    files = [stat_file, train_file, extrain_file, test_file]
    fsw = featureStructureWorker()
    for infile in files:
        reviews = list()
        for counter, line in enumerate(open(infile, "r")):
            if not counter % 10000:
                logger.debug("%d reviews loaded" % counter)
            reviews.append(json.loads(line.strip()))
        # the file is rewritten in place with topic-based features
        outname = infile
        logger.info("writing %s" % outname)
        outfile = open(outname, "w")
        for counter, review in enumerate(reviews):
            if not counter % 1000:
                logger.debug("%d reviews processed" % counter)
            # turn each aspect-sentiment pair into an LDA token, e.g. 'FOOD_+1'
            text = list()
            for sentence in review["features"]:
                for aspect in review["features"][sentence]:
                    text.append(aspect + "_%s" % review["features"][sentence][aspect].strip())
            topics = lda_model[dictionary.doc2bow(text)]
            res = dict()
            if len(topics):
                res["1"] = topTopics(topics)
            # keep the original features under 'features_sent', replace them with topics
            if "features_sent" not in review:
                review["features_sent"] = review["features"].copy()
            review["features"] = res.copy()
            outfile.write(json.dumps(review).encode("utf8", "ignore") + "\n")
        outfile.close()
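# topTopics is defined elsewhere. gensim's lda_model[bow] returns a list of
# (topic_id, probability) pairs, so a hypothetical stand-in that keeps the
# dominant topics in the same {key: value} shape as the original per-sentence
# feature dicts might look like this (the threshold is an assumption):
def topTopics_sketch(topics, threshold=0.1):
    return {"topic_%d" % tid: round(prob, 3)
            for tid, prob in topics if prob > threshold}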